예제 #1
0
 def hasAllele(subject, allele, locus, i):
     """
     Test whether one of the subject's admissible alleles matches `allele`.

     The field looked up for (locus, i) holds one or more ';'-separated
     allele strings. Each of them is turned into an MhcObject and tested
     with `admissible in allele`; the result is True if any test succeeds.

     NB: alleleFieldNames is not a parameter, it must be provided by a
     surrounding scope (locus -> sequence of field names, indexed by i).
     """
     raw_field = subject[alleleFieldNames[locus][i]]
     candidates = list(map(mhctools.MhcObject, raw_field.split(';')))
     ## evaluate all membership tests before reducing them
     matches = [candidate in allele for candidate in candidates]
     return any(matches)
예제 #2
0
def parseHLAstring(hlastring, censor_code, locus=None):
    """
    Build an MhcObject from an HLA string, guided by the censor code.

    - missing_code: only the passed locus is used, hlastring is ignored
    - one_field_code: locus = hlastring[0], field1 = hlastring[1:3]
    - uncensored_code: as above, and field2 = hlastring[3:5]

    Raises an Exception if the locus read from the string disagrees with
    the passed locus, or if the censor code is none of the above.
    """
    if censor_code == missing_code:
        return mhctools.MhcObject(locus=locus)
    ## the first character of the string encodes the locus
    inferred_locus = hlastring[0]
    if locus is not None and inferred_locus != locus:
        raise Exception("the passed locus does not correspond to locus inferred from the HLA string")
    if censor_code == one_field_code:
        return mhctools.MhcObject(locus=inferred_locus, field1=hlastring[1:3])
    if censor_code == uncensored_code:
        first, second = hlastring[1:3], hlastring[3:5]
        return mhctools.MhcObject(locus=inferred_locus, field1=first, field2=second)
    raise Exception("invalid censor code.")
예제 #3
0
def addLeafBars(tree, hlaAlleles, counts, represDict=None):
    """
    Attach a black bar to every leaf of the tree, scaled by the allele count.

    input:
    - tree -- an ete object; every leaf must have an `hla` attribute
    - hlaAlleles -- ordered HLA alleles, indexed by locus
    - counts -- counts in the same order as hlaAlleles, indexed by locus
    - represDict -- if given, a leaf stands for all alleles it represents
      (via aux.invertRepresDict), and their counts are added up.
      Only alleles listed in hlaAlleles contribute. (default: None)
    output: None, the tree is modified
    """
    use_representatives = represDict is not None
    if use_representatives:
        equivClassDict = aux.invertRepresDict(represDict)
    for leaf in tree:
        leaf_mhc = mhctools.MhcObject(leaf.hla)
        if use_representatives:
            ## the equivalence class may contain alleles that are not in hlaAlleles
            members = [
                x for x in equivClassDict[leaf_mhc]
                if x in hlaAlleles[leaf_mhc.locus]
            ]
        else:
            members = [leaf_mhc]
        ## sum the counts of all alleles represented by this leaf
        total_count = 0
        for member in members:
            member_idx = hlaAlleles[member.locus].index(member)
            total_count += counts[member.locus][member_idx]
        ## the bar width is proportional to the summed count
        bar = ete3.RectFace(width=total_count * 2.5,
                            height=10,
                            fgcolor='k',
                            bgcolor='k')
        leaf.add_face(bar, 0)
예제 #4
0
def funFitPMM(subjectFileName,
              hlaFileName,
              pSeqFileName,
              aaCovFileName,
              summaryFileName,
              parDict,
              chain_len=1000,
              chain_thin=10,
              num_chains=4,
              dry_run=False,
              use_cache=False,
              name_base="anon",
              sampler="jags",
              cross_val=None,
              parallel=True):
    """
    Fit the PMM with an HLA tree derived from pseudo sequences.

    Steps: import the subject data (the trait is log10-transformed),
    restrict the pseudo sequences to the alleles in the data, build a
    Newick tree with collapsed equivalence classes, convert the tree into
    a covariance matrix, and hand everything to fittrees.fitPMMweights.
    The result of fitPMMweights is returned unchanged.

    NB: summaryFileName, sampler and cross_val are accepted but not used.

    Todo: implement cross_val, PMM could also be used for many other situations...
    """
    subjectData = fetcher.importSubjectData(subjectFileName,
                                            parDict["traitFieldName"],
                                            parDict["alleleFieldNames"],
                                            verbose=True,
                                            traitTransform=np.log10)
    allelesByLocus = subjectData["Alleles"]
    ## pseudo sequences, restricted to the alleles in the dataset
    allPseqs = mhcclus.mkPseqDict(pSeqFileName)
    restrictedPseqs = {
        hla: allPseqs[hla]
        for locus in sorted(allelesByLocus.keys())
        for hla in allelesByLocus[locus]
    }
    ## distances between amino-acids
    aaDistances = sequences.mkAaDistDict(aaCovFileName,
                                         rescale=True,
                                         ignore_aas=sequences.ambiguous_aas)
    ## tree in Newick format, with one representative per equivalence class
    newick_tree, representatives = mhcclus.mkNewickTree(restrictedPseqs,
                                                        aaDistDict=aaDistances,
                                                        collapse=True)
    covMat, treeAlleles = mhcclus.treeToCovmat(newick_tree)
    covMatHeader = [
        mhctools.MhcObject(name, fmt="Newick") for name in treeAlleles
    ]
    ## run the model
    return fittrees.fitPMMweights(subjectData,
                                  parDict,
                                  hlaFileName,
                                  covMat,
                                  covMatHeader,
                                  represDict=representatives,
                                  chain_len=chain_len,
                                  chain_thin=chain_thin,
                                  num_chains=num_chains,
                                  modelName=name_base,
                                  verbose=True,
                                  dry_run=dry_run,
                                  use_cache=use_cache,
                                  parallel=parallel)
예제 #5
0
def importNetMhcOutput(fileName, version="3.0"):
    """
    Import binding predictions produced by NetMHCpan and
    store them in a dictionary.

    Args:
        fileName (str): name of the tab-separated NetMHCpan output file.
            The first row lists the MHC names (empty cells are ignored),
            the second row is skipped, all further rows are predictions.
        version (str): NetMHCpan version that wrote the file. It selects
            the column layout; "2.8" and "3.0" are implemented.

    Returns:
        A dictionary MhcObject -> (peptide id -> prediction), where the
        peptide id is "<prot>_<pos>_<pep>" and the prediction is a dict
        with keys "pos", "pep", "prot", "aff", "nM", "rank" and, for
        version "3.0", "core".

    Raises:
        AssertionError: if version is not in supportedNetMHCpanVersions
        ValueError: if no column layout is known for the version
        IndexError: if the file has fewer than two header rows
    """
    assert (version in supportedNetMHCpanVersions)
    ## the context manager closes the file, also when parsing fails
    with open(fileName, 'r') as fileHandle:
        csvReader = csv.reader(fileHandle, delimiter='\t', quotechar='\n')
        predictionTable = list(csvReader)

    if len(predictionTable) < 2:
        raise IndexError("NetMHCpan output must start with two header rows")
    header1PredictionTable = predictionTable[0]
    predictionRows = predictionTable[2:]

    ## columns shared by all rows
    posCol = 0
    peptideCol = 1
    proteinCol = 2
    fstMhcCol = 3
    ## columns within the group belonging to one MHC
    ## TODO: infer from header
    if version == "2.8":
        numMhcCols = 3  ## aff, nM, rank
        coreCol = None
        affCol, nMCol, rankCol = 0, 1, 2
    elif version == "3.0":
        numMhcCols = 4  ## core, aff, nM, rank
        coreCol, affCol, nMCol, rankCol = 0, 1, 2, 3
    else:
        raise ValueError(
            "no column layout defined for NetMHCpan version {0}".format(version))

    ## put the MHC names in the proper format once, not once per row
    mhcs = [
        mhctools.MhcObject(mhctools.fromNetMhcFormat(mhc_name))
        for mhc_name in header1PredictionTable if mhc_name != ''
    ]
    predPerMhcDict = {}
    for row in predictionRows:
        ## position, peptide and protein are the same for every MHC in the row
        pos = int(row[posCol])
        pep = row[peptideCol]
        prot = row[proteinCol]
        pepid = "{0:s}_{1:d}_{2:s}".format(prot, pos, pep)
        for i, mhc in enumerate(mhcs):
            offset = fstMhcCol + i * numMhcCols
            predDict = {
                "pos": pos,
                "pep": pep,
                "prot": prot,
                "aff": float(row[offset + affCol]),
                "nM": float(row[offset + nMCol]),
                "rank": float(row[offset + rankCol]),
            }
            if coreCol is not None:
                predDict["core"] = row[offset + coreCol]
            ## hashed lookup instead of scanning a list of all keys
            predPerMhcDict.setdefault(mhc, {})[pepid] = predDict
    return predPerMhcDict
예제 #6
0
def mkPseqDict(pSeqFileName): ## MHC_pseudo.dat
    """
    Read pseudo sequences for the classical human loci (HLA-A, -B, -C).

    input:
    - pSeqFileName -- name of file with pseudo sequences; every line holds
      an MHC name and a pseudo sequence, separated by a single space
    output:
    - pSeqDict -- dictionary with HLA alleles (MhcObject) -> pseudo sequences
    """
    classic_human_loci = ["HLA-A", 'HLA-B', "HLA-C"]
    with open(pSeqFileName, 'r') as pSeqFileHandle:
        lines = pSeqFileHandle.read().split('\n')
    rows = [line.split(' ') for line in lines]
    pSeqDict = {}
    for row in rows:
        name = row[0]
        ## skip everything that is not a classical human locus
        if not any([locus in name for locus in classic_human_loci]):
            continue
        mhc = mhctools.MhcObject(mhctools.fromNetMhcFormat(name))
        pSeqDict[mhc] = row[1]
    return pSeqDict
예제 #7
0
def compareModels(modelname1, modelname2, deltalpd_threshold=0.1):
    """
    Compare two cross-validated models.

    For both models the file data/cv_alleles_summary.<modelname>.tsv
    (relative to the current working directory) is read, using the columns
    "allele" and "lpd". Per locus, the lpd differences (model 2 minus
    model 1) with absolute value >= deltalpd_threshold are drawn as a
    horizontal bar chart, sorted by decreasing absolute difference, and a
    binomial test on the number of positive differences is printed.
    The figure is saved as compare.<modelname1>.<modelname2>.png.

    NB: the alleles are taken from the first model; an allele missing
    from the second model results in a KeyError.
    """
    modelnames = [modelname1, modelname2]
    filenames = ["cv_alleles_summary.{}.tsv".format(modelname) for modelname in modelnames]
    work_folder = os.getcwd()
    fullfilenames = [os.path.join(work_folder, "data", filename) for filename in filenames]
    lpdDicts = [{}, {}]
    for filename, lpdDict in zip(fullfilenames, lpdDicts):
        with open(filename, 'r') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for record in reader:
                allele = mhctools.MhcObject(record["allele"])
                lpd = float(record["lpd"]) ## NB: using weighted lpd
                lpdDict[allele] = lpd
    allAlleles = sorted(lpdDicts[0].keys())
    loci = aux.unique([allele.locus for allele in allAlleles])
    alleles = {X: [allele for allele in allAlleles if allele.locus == X] for X in loci}
    ## squeeze=False always returns a 2D array of axes, so that a single
    ## locus (one panel) can be handled in the same way as multiple loci
    fig, axs = plt.subplots(1, len(loci), figsize=(15,10), squeeze=False)
    axs = axs[0]
    maxabsdiff = 0 ## used for x-limits of the axes
    for locus, ax in zip(loci, axs):
        selected_diffs = []
        selected_alleles = []
        for allele in alleles[locus]:
            vals = [lpdDict[allele] for lpdDict in lpdDicts]
            diff = vals[1] - vals[0]
            if np.abs(diff) >= deltalpd_threshold:
                selected_diffs.append(diff)
                selected_alleles.append(allele)
            ## the x-limits are based on all differences, not only the selected ones
            maxabsdiff = max(np.abs(diff), maxabsdiff)
        pos = np.argsort(np.abs(selected_diffs))[::-1]
        ax.grid(axis='y')
        ax.barh(range(len(pos)), [selected_diffs[p] for p in pos], 0.3,
                color=defn.locusColorDict[locus])
        ax.set_yticks(range(len(pos)))
        ax.set_yticklabels([selected_alleles[p].short_str() for p in pos])
        ax.axvline(x=0, color='k')
        ax.set_xlabel("$\\Delta lpd$ HLA-{}".format(locus))
        ## do some basic statistics
        ## builtin sum: passing a generator to np.sum is deprecated
        positive = sum(1 for x in selected_diffs if x > 0)
        total = len(selected_diffs)
        ## NOTE(review): sts is presumably scipy.stats; newer SciPy releases
        ## replace binom_test by binomtest -- confirm the supported version
        pval = sts.binom_test(positive, total)
        print("HLA-{0}: positive = {1}, total = {2}, P = {3}".format(locus, positive, total, pval))
    ## additional formatting...
    for ax in axs:
        ax.set_xlim((-maxabsdiff, maxabsdiff))
    fig.tight_layout()
    fig.savefig("compare.{0}.{1}.png".format(modelname1, modelname2), dpi=300, bbox_inches='tight')
예제 #8
0
def importAlleleFrequencyData(fileName):  ## "mhc-top0.99counts-ncbi-Sub-Saharan-Africa.tsv"
    """
    Read HLA counts from a tab-separated file with one header line.

    input: name of the file with frequency data. The first column holds
        the allele, the second column the (integer) count.
    output: a dictionary with HLA counts, of the form
        MhcObject -> int
    """
    with open(fileName, 'r') as fileHandle:
        lines = fileHandle.read().split('\n')
    ## empty lines are ignored
    rows = [line.split('\t') for line in lines if line != '']
    ## the first row is the header, it is not used further
    _header, records = rows[0], rows[1:]
    return {mhctools.MhcObject(rec[0]): int(rec[1]) for rec in records}
예제 #9
0
def mkEteHlaTree(newick_str, midpoint=False, colorfun=(lambda x: 'black')):
    """
    Transform a Newick tree into the ETE format.

    Every node is renamed to its index in the traversal (as a string).
    Nodes that had a name get a feature 'hla' with the string form of the
    MhcObject parsed from that name. Leaves get a text face showing 'hla'.

    Args:
        newick_str (str): a string representing the tree in the Newick format

    Kwargs:
        midpoint (bool): if True, find the midpoint of the tree and re-root.
        colorfun (function): maps the hla string of a leaf to the color
            of its label (default: always 'black')

    Returns:
        A tuple consisting of an ete3.Tree object and the list of
        traversal indices (int) that were used to name the nodes
    """
    tree = ete3.Tree(newick_str)
    if midpoint:
        ## first make an arbitrary leaf the outgroup...
        arbitrary_leaf = tree.get_leaves()[0]
        tree.set_outgroup(arbitrary_leaf)
        ## ...then let ete find the midpoint and re-root there
        tree.set_outgroup(tree.get_midpoint_outgroup())
    line_width = 10  ## width of the edges
    nodeNames = []
    ## give nodes a name to be used in Bayesian model
    for idx, node in enumerate(tree.traverse()):
        if node.name != '':
            parsed = mhctools.MhcObject(node.name, fmt="Newick")
            node.add_feature('hla', str(parsed))  ## FIXME: use MhcObject
        node.name = str(idx)
        nodeNames.append(idx)
        ## every node gets its own style object
        nstyle = ete3.treeview.NodeStyle()
        nstyle["vt_line_width"] = line_width
        ## FIXME make sure that ultrametric trees are rendered properly
        nstyle["hz_line_width"] = line_width
        nstyle["size"] = 0  ## do not render nodes
        node.set_style(nstyle)
        ## NB: colorEteTree overrides set_style
    ## add the colored HLA tags to the leaves
    for leaf in tree:
        tag = ete3.AttrFace("hla", fsize=50, fgcolor=colorfun(leaf.hla))
        leaf.add_face(tag, 0)
    return (tree, nodeNames)
예제 #10
0
def mkSubSeqNewickTree(hlaAlleles, positions, start=0, aaDistDict=None, verbose=False):
    """
    make a tree using particular positions in the MHC sequence.
    example: re-create netMHCpan pseudo sequences.
    The aligned protein sequences are read, per locus, from
    data/hla/HLA-<locus>-IMGT-protein-alignment.fasta
    relative to the current working directory.
    input:
    - hlaAlleles -- the alleles at the leafs of the tree, indexed by locus
    - positions -- the positions of interest
    - start -- the alignment could start at a different position
      (see the function sequences.mkPseudoSequence)
    - aaDistDict -- passed on to mkNewickTree (default: None)
    - verbose -- print the representative alleles and their
      sub-sequences (default: False)
    output:
    - newick_str -- the resulting tree in Newick format
    - represDict -- a dictionary mapping the alleles to the chosen representative
    raises an Exception if no sequence is found for an allele
    """
    work_folder = os.getcwd() ## FIXME pass path and name of fasta files?
    loci = sorted(hlaAlleles.keys())
    ## import IMGT sequences of all loci into a single dictionary
    imgtSeqDict = {}
    for locus in loci:
        relative_name = "data/hla/HLA-{}-IMGT-protein-alignment.fasta".format(locus)
        imgtSeqDict.update(
            sequences.readFastaFile(os.path.join(work_folder, relative_name)))
    imgtKeys = sorted(imgtSeqDict.keys())
    ## get sub-sequences
    subSeqDict = {}
    for locus in loci:
        for hla in hlaAlleles[locus]:
            ## take the first key (in sorted order) that is contained in the allele
            ## NB: changed behavior: previously hla.subtype_str() in key
            key = next(
                (k for k in imgtKeys if mhctools.MhcObject(k) in hla), None)
            if key is None:
                raise Exception("HLA allele {} not present in IMGT database".format(hla))
            subSeqDict[hla] = sequences.mkPseudoSequence(
                imgtSeqDict[key], positions, start)
    ## make a newick tree, collapsing equivalence classes into one representative in the tree
    newick_str, represDict = mkNewickTree(subSeqDict, aaDistDict=aaDistDict, collapse=True)
    if verbose:
        print("representative alleles:")
        for repres in aux.unique(list(represDict.values())):
            print(repres, subSeqDict[repres])
    return (newick_str, represDict)
예제 #11
0
## collect, per locus, the unique uncensored HLA alleles of the cohort
cohortHlaAlleles = {}
for X in "ABC":
    ## one HLA string per (field key of locus X, subject) combination
    uniques = [mustGetUnique(recordsByID[ID][key]) for key in hlaLocusKeyDict[X] for ID in subjectIDs]
    censors = [getHLAcensor(hlastring) for hlastring in uniques]
    ## only use uncensored values
    ## NOTE(review): every element of censors is unpacked into (x, c), so
    ## getHLAcensor presumably returns a (string, censor code) pair.
    ## If it returns only the code, this should iterate over
    ## zip(uniques, censors) -- confirm against getHLAcensor
    alleles = [parseHLAstring(x, c, X) for x, c in censors if c == uncensored_code]
    cohortHlaAlleles[X] = aux.unique(alleles)

## import HLA alleles from another cohort

## tab-separated table: the first row is a header, then the allele in the
## first column and an integer count in the second column
with open(rootFolder + "hla/mhc-top0.99counts-ncbi-Sub-Saharan-Africa.tsv", 'r') as f:
    hla_table = [row.split('\t') for row in f.read().split('\n') if row != '']
    hla_header = hla_table[0]
    hla_table = hla_table[1:]
hlaCountDict = dict((mhctools.MhcObject(row[0]), int(row[1])) for row in hla_table)

## per locus, the alleles of the other cohort, sorted by decreasing count
popHlaAlleles = {
    X : sorted(filter(lambda x: x.locus == X, hlaCountDict.keys()),
               key=lambda x: hlaCountDict[x], reverse=True)
    for X in "ABC"
}

## make a combined list of admissible alleles
## (cohort alleles first, then the alleles from the other cohort)
admHlaAlleles = {
    X : aux.unique(cohortHlaAlleles[X] + popHlaAlleles[X])
    for X in "ABC"
}

## import alleles belonging to serotypes
with open(os.path.join(data_folder, "HLA_serotypes.tsv"), 'r') as f:
예제 #12
0
def compareModelsSEM(modelname1, modelname2, deltalpd_threshold=0.1):
    """
    Compare two cross-validated models, also calculate the (scaled) SEMs

    For both models the file data/cv_alleles_summary.<modelname>.tsv
    (relative to the current working directory) is read, using the columns
    "allele" and "subjects". The "subjects" value consists of
    ';'-separated entries "<int id>,<float>,<float>"; the two floats are
    called clpd and ppc in the code below.

    Per allele, subjects with a non-zero ppc in both models contribute the
    difference clpd (model 2) - clpd (model 1), weighted by the product of
    both ppc values. Per locus, the weighted sums with absolute value
    >= deltalpd_threshold are drawn as horizontal bars with error bars,
    and a binomial and a Wilcoxon test are printed.
    The figure is saved as compare.<modelname1>.<modelname2>.png.

    NB: alleles and subject ids are taken from the first model; if they
    are missing from the second model, a KeyError is raised.
    """
    modelnames = [modelname1, modelname2]
    filenames = ["cv_alleles_summary.{}.tsv".format(modelname) for modelname in modelnames]
    work_folder = os.getcwd()
    fullfilenames = [os.path.join(work_folder, "data", filename) for filename in filenames]
    subjectDicts = [{}, {}]
    for filename, subjDict in zip(fullfilenames, subjectDicts):
        with open(filename, 'r') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for record in reader:
                allele = mhctools.MhcObject(record["allele"])
                subject_line = record["subjects"]
                subject_table = [row.split(",") for row in subject_line.split(";")]
                subject_dict = {int(row[0]) : (float(row[1]), float(row[2])) for row in subject_table}
                subjDict[allele] = subject_dict
    allAlleles = sorted(subjectDicts[0].keys())
    ## compute Delta lpd and scaled sems
    deltaLpdDict = {}
    semDict = {}
    for allele in allAlleles:
        subject_dicts = [subjDict[allele] for subjDict in subjectDicts]
        diffs = []
        weights = []
        for idx in subject_dicts[0]:
            clpd0, ppc0 = subject_dicts[0][idx]
            clpd1, ppc1 = subject_dicts[1][idx]
            ## only use subjects with a non-zero ppc in both models
            if ppc0 != 0 and ppc1 != 0:
                diffs.append(clpd1 - clpd0)
                weights.append(ppc1 * ppc0)
        if len(diffs) > 0:
            deltaLpdDict[allele] = np.sum([d*w for d, w in zip(diffs, weights)])
            if len(diffs) > 1:
                mu = deltaLpdDict[allele] / np.sum(weights)
                semDict[allele] = np.sqrt(np.sum([w*(d-mu)**2 for d, w in zip(diffs, weights)]))
            else:
                ## no spread can be computed from a single subject
                semDict[allele] = np.nan
        else:
            deltaLpdDict[allele] = np.nan
            semDict[allele] = np.nan
    ## make figure
    loci = aux.unique([allele.locus for allele in allAlleles])
    alleles = {X: [allele for allele in allAlleles if allele.locus == X] for X in loci}
    ## squeeze=False always returns a 2D array of axes, so that a single
    ## locus (one panel) can be handled in the same way as multiple loci
    fig, axs = plt.subplots(1, len(loci), figsize=(10,6.67), squeeze=False)
    axs = axs[0]
    for locus, ax in zip(loci, axs):
        maxabsdiff = 0 ## used for x-limits of the axes
        selected_diffs = []
        selected_alleles = []
        selected_errs = []
        for allele in alleles[locus]:
            diff = deltaLpdDict[allele]
            ## alleles without usable subjects have diff = NaN;
            ## the comparison is then False and the allele is skipped
            if np.abs(diff) >= deltalpd_threshold:
                selected_diffs.append(diff)
                selected_alleles.append(allele)
                selected_errs.append(semDict[allele])
                maxabsdiff = max(np.abs(diff), maxabsdiff)
        ## revert order to get a "christmas tree plot"
        pos = np.argsort(np.abs(selected_diffs))[::-1]
        ax.grid(axis='y')
        ax.barh(range(len(pos)), [selected_diffs[p] for p in pos], 0.3,
                color=defn.locusColorDict[locus],
                xerr=np.array([selected_errs[p] for p in pos]),
                capsize=5)
        ax.set_yticks(range(len(pos)))
        ax.set_yticklabels([selected_alleles[p].short_str() for p in pos])
        ax.axvline(x=0, color='k')
        ax.set_xlabel("$\\Delta {{\\rm lpd}}$ HLA-{}".format(locus))
        ## do some basic statistics
        ## builtin sum: passing a generator to np.sum is deprecated
        positive = sum(1 for x in selected_diffs if x > 0)
        total = len(selected_diffs)
        ## NOTE(review): sts is presumably scipy.stats; newer SciPy releases
        ## replace binom_test by binomtest -- confirm the supported version
        pval_binom = sts.binom_test(positive, total)
        print("HLA-{0}: positive = {1}, total = {2}, P = {3} (binomial)".format(locus, positive, total, pval_binom))
        W, pval_wilcox = sts.wilcoxon(selected_diffs)
        print("HLA-{0}: W = {1}, total = {2}, P = {3} (wilcoxon)".format(locus, W, total, pval_wilcox))
        ## x-limits are set per locus, with a small margin
        ax.set_xlim((-1.05*maxabsdiff, 1.05*maxabsdiff))
    ## additional formatting...
    fig.tight_layout()
    fig.savefig("compare.{0}.{1}.png".format(modelname1, modelname2), dpi=300, bbox_inches='tight')
예제 #13
0
def importSubjectData(fileName,
                      traitFieldName,
                      alleleFieldNames,
                      covariateFieldNames=None,
                      traitCensFieldName=None,
                      subset=None,
                      verbose=False,
                      traitTransform=(lambda x: x),
                      categorical=False):
    """
    Import MHC data and disease traits.

    Args:
        fileName (str): name of the tsv/csv file with allele and trait data
            per subject. The delimiter is chosen by the file extension.
        traitFieldName (str): the name of the trait of interest in the header
            of the file
        alleleFieldNames (dict): a dictionary locus -> field names for the
            alleles (one field name per chromosome copy, i.e. two per locus).
            A field may list multiple admissible alleles, separated by ';'.

    Kwargs:
        covariateFieldNames (list of str): list with covariate names
            to be imported from the data file (default: None, no covariates).
            Censoring is read from the column "<name>_censoring".
        traitCensFieldName (str, bool or None): title of the column with
            censoring values.
            If None (default) or False, all data is assumed to be uncensored.
            If True the key is assumed to be X_censoring,
            where X is the traitFieldName.
            Subjects that lack the column are treated as uncensored.
        subset (int): take a random sample (without replacement) of this
            size from the complete data. If None (default), use all data.
        verbose (bool): print some basic statistics on the imported data.
        traitTransform (function): a function to transform the trait value.
            For instance numpy.log, or the identity (default)
        categorical (bool): True if the trait is categorical, False otherwise (default).
            Use traitTransform to map string to required values (e.g. 0/1)

    Returns:
        A dictionary containing
            - TraitValues: the trait values (NaN if not uncensored)
            - CensCodes: censoring codes for the TraitValues
            - CensBounds: upper (or lower) bounds for left (or right) censored Trait
                Only included if not categorical
            - Categories: a list of possible traitValues
                Only included if categorical
            - AlleleVecs: per locus, subject and chromosome copy a list of
                booleans (one per allele) marking the admissible alleles
            - Alleles: the alleles in the dataset, in the right order for the allele vectors
            - Covariates: dictionary covariate name ->
                {"Values", "CensCodes", "CensBounds"}

    Raises:
        Exception: if the file extension is not .csv or .tsv

    Todo:
        - Intelligently handle censoring codes
        - Give a good description of the expected file format
    """
    ## detect if the file is tsv or csv
    _, file_extension = os.path.splitext(fileName)
    if file_extension == ".csv":
        delim = ','
    elif file_extension == ".tsv":
        delim = '\t'
    elif file_extension == ".xlsx":
        ## @todo: implement excel files
        raise Exception("xlsx file format not yet implemented")
    else:
        raise Exception("invalid file format (csv or tsv expected)")
    ## read the contents of the file
    with open(fileName, 'r') as fileHandle:
        data = list(csv.DictReader(fileHandle, delimiter=delim))
    ## take a sub-sample
    if subset is not None:
        idxs = np.random.choice(len(data), size=subset, replace=False)
        data = [data[i] for i in idxs]
    ## find data in the table
    Ploidy = 2
    loci = sorted(alleleFieldNames.keys())
    Alleles = {}
    for locus in loci:
        alls = [
            row[alleleFieldNames[locus][i]].split(';') for i in range(Ploidy)
            for row in data
        ]
        ## make MhcObject-s
        alls = [mhctools.MhcObject(a) for a in aux.unique(aux.flatten(alls))]
        ## reduce the alleles to their `protein` attribute
        alls = [a.protein for a in alls]  ## include Null alleles etc.
        ## find unique elements again
        Alleles[locus] = aux.unique(alls)

    ## for each subject, make a vector with booleans
    ## indicating whether the allele is present
    def alleleVector(subject, locus, i):
        ## parse the admissible alleles once per field,
        ## instead of once per allele of the locus
        admissibles = [
            mhctools.MhcObject(a)
            for a in subject[alleleFieldNames[locus][i]].split(';')
        ]
        return [
            any(adm_allele in allele for adm_allele in admissibles)
            for allele in Alleles[locus]
        ]

    AlleleVecs = {
        locus: [[alleleVector(subject, locus, i) for i in range(Ploidy)]
                for subject in data]
        for locus in loci
    }

    ## get the trait values and censoring information
    if traitCensFieldName is True:
        ## use the default key, based on the traitFieldName
        traitCensFieldName = "{0}_censoring".format(traitFieldName)
    ## now, get censoring from data set if traitCensFieldName is str
    if isinstance(traitCensFieldName, str):
        CensCodes = np.array([
            ## FIXME, make sure that the fallback is consistent
            subject.get(traitCensFieldName, defn.uncensored_code)
            for subject in data
        ])
    else:  ## assume everything is uncensored
        CensCodes = np.array([defn.uncensored_code for _ in data])

    ## get trait values
    if not categorical:
        TraitValues = np.array([
            traitTransform(float(subject[traitFieldName]))
            if cens_code == defn.uncensored_code else np.nan
            for cens_code, subject in zip(CensCodes, data)
        ])
        ## find the upper or lower bound for the left and (resp.) right censored values
        ## FIXME For the non-interval-censored data, use an arbitrary (low) Trait value as lower bound
        CensBounds = np.array([
            traitTransform(float(subject[traitFieldName]))
            if cens_code == defn.left_censored_code or cens_code
            == defn.right_censored_code else defn.auxiliaryLowerCensBound
            for cens_code, subject in zip(CensCodes, data)
        ])
    else:  ## the trait is categorical: Censoring can only be missing or uncensored
        TraitValues = np.array([
            traitTransform(subject[traitFieldName])
            if cens_code == defn.uncensored_code else np.nan
            for cens_code, subject in zip(CensCodes, data)
        ])
    ## make a list of possible trait values
    Categories = aux.unique([
        x for x, c in zip(TraitValues, CensCodes) if c == defn.uncensored_code
    ])

    ## import covariates
    if covariateFieldNames is None:
        covariateFieldNames = []  ## empty list
    ## dictionary indexed by covariate name, empty if no covariates
    Covariates = {}
    for cfn in covariateFieldNames:
        ccfn = "{0}_censoring".format(cfn)
        covVals, covCCs, covCBs = fetchContinuousValue(cfn, ccfn, data)
        Covariates[cfn] = {
            "Values": covVals,
            "CensCodes": covCCs,
            "CensBounds": covCBs
        }

    ## do some printing...
    if verbose:
        ## number of subjects
        print("number of subjects: {0}".format(len(data)))
        ## Trait statistics
        if not categorical:
            mtrait = np.nanmedian(TraitValues)
            ltrait = np.nanpercentile(TraitValues, 2.5)
            htrait = np.nanpercentile(TraitValues, 97.5)
            print(
                "median trait value: {0:0.2f}, 2.5 - 97.5 percentiles: {1:0.2f} - {2:0.2f}"
                .format(mtrait, ltrait, htrait))
        else:
            catcounts = {
                cat: len([x for x in TraitValues if x == cat])
                for cat in Categories
            }
            print("category histogram:")
            for cat in Categories:
                print(f"\t{cat}:\t{catcounts[cat]}")
        ## allele statistics
        print("total number of alleles:",
              np.sum([len(Alleles[locus]) for locus in loci]))
        for locus in loci:
            print(f"number of {locus} alleles: {len(Alleles[locus])}")
        numCompleteHaplotypes = 0
        for idx in range(len(data)):
            ## complete: exactly one admissible allele for every locus and copy
            compl = [
                sum(1 for x in alleleVec if x) == 1
                for locus in loci
                for alleleVec in AlleleVecs[locus][idx]
            ]
            if all(compl):
                numCompleteHaplotypes += 1
        print(
            "number of complete haplotypes: {0}".format(numCompleteHaplotypes))
        ## Covariates
        if len(covariateFieldNames) > 0:
            print("covariate statistics:")
        for cfn in covariateFieldNames:
            mcov = np.nanmean(Covariates[cfn]["Values"])
            lcov = np.nanpercentile(Covariates[cfn]["Values"], 2.5)
            hcov = np.nanpercentile(Covariates[cfn]["Values"], 97.5)
            print(
                f"\t'{cfn}' mean value: {mcov:0.2f}, 2.5 - 97.5 percentiles: {lcov:0.2f} - {hcov:0.2f}"
            )

    ## return relevant data in a dictionary
    dataDict = {
        "TraitValues": TraitValues,
        "CensCodes": CensCodes,
        "AlleleVecs": AlleleVecs,
        "Alleles": Alleles,
        "Covariates": Covariates
    }
    ## include censoring bounds if the trait value is a real number
    if not categorical:
        dataDict["CensBounds"] = CensBounds
    else:
        dataDict["Categories"] = Categories
    return dataDict