def hasAllele(subject, allele, locus, i):
    fieldName = alleleFieldNames[locus][i]
    admissibles = [
        mhctools.MhcObject(a) for a in subject[fieldName].split(';')
    ]
    in_adms = [adm_allele in allele for adm_allele in admissibles]
    return any(in_adms)
def parseHLAstring(hlastring, censor_code, locus=None):
    """Parse the HLA string based on the censor code."""
    if censor_code == missing_code:
        return mhctools.MhcObject(locus=locus)
    loc = hlastring[0]
    if locus is not None and loc != locus:
        raise Exception("the passed locus does not correspond to the locus inferred from the HLA string")
    if censor_code == one_field_code:
        field1 = hlastring[1:3]
        return mhctools.MhcObject(locus=loc, field1=field1)
    elif censor_code == uncensored_code:
        field1 = hlastring[1:3]
        field2 = hlastring[3:5]
        return mhctools.MhcObject(locus=loc, field1=field1, field2=field2)
    else:
        raise Exception("invalid censor code.")
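
## A minimal usage sketch of parseHLAstring (illustrative only). The censor
## codes (missing_code, one_field_code, uncensored_code) are assumed to be
## defined at module level; the example strings are hypothetical.
def _example_parseHLAstring():
    full = parseHLAstring("A0101", uncensored_code)             ## two-field allele
    partial = parseHLAstring("B07", one_field_code, locus="B")  ## one-field allele
    absent = parseHLAstring("", missing_code, locus="C")        ## locus-only placeholder
    return full, partial, absent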
def addLeafBars(tree, hlaAlleles, counts, represDict=None):
    """
    Add a bar to each leaf of the tree, indicating the allele count.

    input:
    - tree -- an ete3 object with hlaAlleles in the leaves
    - hlaAlleles -- ordered HLA alleles, indexed by locus
    - counts -- counts in the same order as hlaAlleles
    - represDict -- if some of the hlaAlleles are not in the tree, then
      they must be represented by other alleles. (default: None)

    output:
    None, the tree is modified in place
    """
    if represDict is not None:
        equivClassDict = aux.invertRepresDict(represDict)
    ## loop over leaves of the tree
    for leaf in tree:
        ## find the index of the HLA allele
        mhc = mhctools.MhcObject(leaf.hla)
        if represDict is not None:
            ## NB: the represDict can contain more alleles than listed in hlaAlleles
            mhcs = [x for x in equivClassDict[mhc] if x in hlaAlleles[mhc.locus]]
        else:
            mhcs = [mhc]
        total_count = 0
        for mhc in mhcs:
            hlaIdx = hlaAlleles[mhc.locus].index(mhc)
            total_count += counts[mhc.locus][hlaIdx]
        ## add a bar to the leaf
        F = ete3.RectFace(width=total_count * 2.5, height=10, fgcolor='k', bgcolor='k')
        leaf.add_face(F, 0)
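
## A hedged sketch of addLeafBars on a toy tree. All names and numbers are
## hypothetical, and it is assumed that MhcObject parses the leaf labels used
## here; real trees carry an 'hla' feature set by mkEteHlaTree.
def _example_addLeafBars():
    tree = ete3.Tree("(A0101:1.0,(A0201:0.5,A0301:0.5):0.5);")
    for leaf in tree:
        leaf.add_feature('hla', leaf.name)  ## mimic mkEteHlaTree's 'hla' feature
    hlaAlleles = {'A': [mhctools.MhcObject(x) for x in ("A0101", "A0201", "A0301")]}
    counts = {'A': [10, 4, 7]}  ## one count per allele, same order as hlaAlleles
    addLeafBars(tree, hlaAlleles, counts)
    return tree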
def funFitPMM(subjectFileName, hlaFileName, pSeqFileName, aaCovFileName,
              summaryFileName, parDict, chain_len=1000, chain_thin=10,
              num_chains=4, dry_run=False, use_cache=False, name_base="anon",
              sampler="jags", cross_val=None, parallel=True):
    """
    Fit the PMM using a tree built from NetMHCpan pseudo sequences.

    Todo: implement cross_val. PMM could also be used for many other situations...
    """
    traitFieldName = parDict["traitFieldName"]
    alleleFieldNames = parDict["alleleFieldNames"]
    dataDict = fetcher.importSubjectData(subjectFileName, traitFieldName,
                                         alleleFieldNames, verbose=True,
                                         traitTransform=np.log10)
    hlaAlleles = dataDict["Alleles"]
    ## define models by their HLA tree
    pSeqDict = mhcclus.mkPseqDict(pSeqFileName)
    ## restrict the pSeqDict to alleles in the dataset
    loci = sorted(hlaAlleles.keys())
    pSeqDictRestr = dict((hla, pSeqDict[hla]) for X in loci for hla in hlaAlleles[X])
    ## choose distances between amino acids
    aaDistDict = sequences.mkAaDistDict(aaCovFileName, rescale=True,
                                        ignore_aas=sequences.ambiguous_aas)
    ## use the data to make a tree (in Newick format),
    ## using representatives for NetMHCpan equivalence classes
    newick_str_pseq, represDict = mhcclus.mkNewickTree(pSeqDictRestr,
                                                       aaDistDict=aaDistDict,
                                                       collapse=True)
    hlaCovMat, alleleList = mhcclus.treeToCovmat(newick_str_pseq)
    hlaCovMatHeader = [mhctools.MhcObject(allele, fmt="Newick") for allele in alleleList]
    ## run the model
    result = fittrees.fitPMMweights(dataDict, parDict, hlaFileName, hlaCovMat,
                                    hlaCovMatHeader, represDict=represDict,
                                    chain_len=chain_len, chain_thin=chain_thin,
                                    num_chains=num_chains, modelName=name_base,
                                    verbose=True, dry_run=dry_run,
                                    use_cache=use_cache, parallel=parallel)
    return result
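
## A hypothetical invocation of funFitPMM. File names and field names are
## placeholders; parDict shows only the two keys this function actually reads.
def _example_funFitPMM():
    parDict = {
        "traitFieldName": "spVL",  ## e.g. a set-point viral load column
        "alleleFieldNames": {X: ["HLA_{}1".format(X), "HLA_{}2".format(X)] for X in "ABC"},
    }
    return funFitPMM("data/subjects.tsv", "data/hla.tsv", "data/MHC_pseudo.dat",
                     "data/aa-covariance.tsv", "data/summary.tsv", parDict,
                     chain_len=1000, num_chains=4, name_base="example", dry_run=True)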
def importNetMhcOutput(fileName, version="3.0"):
    """
    Import binding predictions produced by NetMHCpan
    and store them in a dictionary.
    """
    assert version in supportedNetMHCpanVersions
    with open(fileName, 'r') as fileHandle:
        csvReader = csv.reader(fileHandle, delimiter='\t', quotechar='\n')
        predictionTable = [line for line in csvReader]
    header1PredictionTable = predictionTable[0]
    header2PredictionTable = predictionTable[1]
    predictionTable = predictionTable[2:]
    ## TODO: infer from header
    if version == "2.8":
        fstMhcCol = 3
        numMhcCols = 3  ## aff, nM, rank
        peptideCol = 1
        posCol = 0
        proteinCol = 2
        affCol = 0
        nMCol = 1
        rankCol = 2
    elif version == "3.0":
        fstMhcCol = 3
        numMhcCols = 4  ## core, aff, nM, rank
        peptideCol = 1
        posCol = 0
        proteinCol = 2
        coreCol = 0
        affCol = 1
        nMCol = 2
        rankCol = 3
    mhc_names = [x for x in header1PredictionTable if x != '']
    predPerMhcDict = {}
    for row in predictionTable:
        for i, mhc_name in enumerate(mhc_names):
            ## put the HLA in the proper format
            mhc = mhctools.MhcObject(mhctools.fromNetMhcFormat(mhc_name))
            predDict = {}
            pos = predDict["pos"] = int(row[posCol])
            pep = predDict["pep"] = row[peptideCol]
            prot = predDict["prot"] = row[proteinCol]
            pepid = "{0:s}_{1:d}_{2:s}".format(prot, pos, pep)
            predDict["aff"] = float(row[fstMhcCol + i * numMhcCols + affCol])
            predDict["nM"] = float(row[fstMhcCol + i * numMhcCols + nMCol])
            predDict["rank"] = float(row[fstMhcCol + i * numMhcCols + rankCol])
            if version == "3.0":
                predDict["core"] = row[fstMhcCol + i * numMhcCols + coreCol]
            if mhc in predPerMhcDict:
                predPerMhcDict[mhc][pepid] = predDict
            else:
                predPerMhcDict[mhc] = {pepid: predDict}
    return predPerMhcDict
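
## Usage sketch for importNetMhcOutput: select the strong binders per MHC
## molecule. The file name is hypothetical; rank < 0.5 is NetMHCpan's
## conventional strong-binder threshold.
def _example_strongBinders(fileName="data/netmhcpan-output.tsv"):
    predPerMhcDict = importNetMhcOutput(fileName, version="3.0")
    return {
        mhc: [pepid for pepid, pred in peps.items() if pred["rank"] < 0.5]
        for mhc, peps in predPerMhcDict.items()
    }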
def mkPseqDict(pSeqFileName):  ## e.g. MHC_pseudo.dat
    """
    input:
    - pSeqFileName -- name of file with pseudo sequences

    output:
    - pSeqDict -- dictionary with HLA alleles -> pseudo sequences
    """
    classic_human_loci = ["HLA-A", "HLA-B", "HLA-C"]
    with open(pSeqFileName, 'r') as pSeqFileHandle:
        rows = pSeqFileHandle.read().split('\n')
    rows = [row.split(' ') for row in rows]
    pSeqDict = dict(
        (mhctools.MhcObject(mhctools.fromNetMhcFormat(row[0])), row[1])
        for row in rows
        if any(x in row[0] for x in classic_human_loci)
    )
    return pSeqDict
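
## Quick sketch: load the pseudo-sequence dictionary and tally alleles per
## locus. The file name is the one distributed with NetMHCpan; illustrative only.
def _example_mkPseqDict(fileName="data/MHC_pseudo.dat"):
    pSeqDict = mkPseqDict(fileName)
    perLocus = {}
    for mhc in pSeqDict:
        perLocus[mhc.locus] = perLocus.get(mhc.locus, 0) + 1
    return perLocus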
def compareModels(modelname1, modelname2, deltalpd_threshold=0.1):
    """
    Compare two cross-validated models.
    """
    modelnames = [modelname1, modelname2]
    filenames = ["cv_alleles_summary.{}.tsv".format(modelname) for modelname in modelnames]
    work_folder = os.getcwd()
    fullfilenames = [os.path.join(work_folder, "data", filename) for filename in filenames]
    lpdDicts = [{}, {}]
    for filename, lpdDict in zip(fullfilenames, lpdDicts):
        with open(filename, 'r') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for record in reader:
                allele = mhctools.MhcObject(record["allele"])
                lpd = float(record["lpd"])  ## NB: using weighted lpd
                lpdDict[allele] = lpd
    alleles = sorted(lpdDicts[0].keys())
    loci = aux.unique([allele.locus for allele in alleles])
    alleles = dict((X, [allele for allele in alleles if allele.locus == X]) for X in loci)
    fig, axs = plt.subplots(1, len(loci), figsize=(15, 10))
    height = np.max([len(alleles[locus]) for locus in loci])
    maxabsdiff = 0  ## used for x-limits of the axes
    for locus, ax in zip(loci, axs):
        selected_diffs = []
        selected_alleles = []
        for allele in alleles[locus]:
            vals = [lpdDict[allele] for lpdDict in lpdDicts]
            diff = vals[1] - vals[0]
            if np.abs(diff) >= deltalpd_threshold:
                selected_diffs.append(diff)
                selected_alleles.append(allele)
                maxabsdiff = max(np.abs(diff), maxabsdiff)
        ## sort by decreasing absolute difference
        pos = np.argsort(np.abs(selected_diffs))[::-1]
        ax.grid(axis='y')
        ax.barh(range(len(pos)), [selected_diffs[p] for p in pos], 0.3,
                color=defn.locusColorDict[locus])
        ax.set_yticks(range(len(pos)))
        ax.set_yticklabels([selected_alleles[p].short_str() for p in pos])
        ax.axvline(x=0, color='k')
        ax.set_xlabel("$\\Delta lpd$ HLA-{}".format(locus))
        ## do some basic statistics
        positive = sum(1 for x in selected_diffs if x > 0)
        total = len(selected_diffs)
        pval = sts.binom_test(positive, total)
        print("HLA-{0}: positive = {1}, total = {2}, P = {3}".format(locus, positive, total, pval))
    ## additional formatting...
    for ax in axs:
        ax.set_xlim((-maxabsdiff, maxabsdiff))
    fig.tight_layout()
    fig.savefig("compare.{0}.{1}.png".format(modelname1, modelname2),
                dpi=300, bbox_inches='tight')
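
## Usage sketch (hypothetical model names): expects the cross-validation
## summaries ./data/cv_alleles_summary.<name>.tsv produced by the fitting step.
## The same calling convention applies to compareModelsSEM below.
def _example_compareModels():
    compareModels("pmm-tree", "pmm-iid", deltalpd_threshold=0.1)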
def importAlleleFrequencyData(fileName):
    ## e.g. "mhc-top0.99counts-ncbi-Sub-Saharan-Africa.tsv"
    """
    input: name of the file with frequency data.
    output: a dictionary with HLA counts, of the form MhcObject -> int
    """
    with open(fileName, 'r') as fileHandle:
        table = [
            row.split('\t') for row in fileHandle.read().split('\n')
            if row != ''
        ]
    header = table[0]
    table = table[1:]
    countDict = dict((mhctools.MhcObject(row[0]), int(row[1])) for row in table)
    return countDict
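
## Sketch: list the ten most common alleles in a frequency file
## (file name as used elsewhere in this repository).
def _example_topAlleles(fileName="data/hla/mhc-top0.99counts-ncbi-Sub-Saharan-Africa.tsv"):
    countDict = importAlleleFrequencyData(fileName)
    ranked = sorted(countDict, key=lambda mhc: countDict[mhc], reverse=True)
    return ranked[:10]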
def mkEteHlaTree(newick_str, midpoint=False, colorfun=(lambda x: 'black')):
    """
    Transform a Newick tree into the ETE format.

    Args:
        newick_str (str): a string representing the tree in the Newick format

    Kwargs:
        midpoint (bool): if True, find the midpoint of the tree and re-root.
        colorfun (function): a rule for coloring faces (TODO)

    Returns:
        A tuple consisting of an ete3.Tree object and a list of integer node names
    """
    tree = ete3.Tree(newick_str)
    if midpoint:
        tree.set_outgroup(tree.get_leaves()[0])  ## make an arbitrary leaf the outgroup
        outgrp_node = tree.get_midpoint_outgroup()  ## now use the ete3 algorithm
        tree.set_outgroup(outgrp_node)  ## and re-define the outgroup
    ## give nodes a name to be used in the Bayesian model
    nodeNames = []
    for i, node in enumerate(tree.traverse()):
        if node.name != '':
            hla = mhctools.MhcObject(node.name, fmt="Newick")
            node.add_feature('hla', str(hla))  ## FIXME: use MhcObject
        node.name = str(i)
        nodeNames.append(i)
        ## set width and color of edges
        size = 10
        nstyle = ete3.treeview.NodeStyle()
        nstyle["vt_line_width"] = size
        nstyle["hz_line_width"] = size  ## FIXME: make sure that ultrametric trees are rendered correctly
        nstyle["size"] = 0  ## do not render nodes
        node.set_style(nstyle)  ## NB: colorEteTree overrides set_style
    ## set colors of the HLA tags
    for leaf in tree:
        N = ete3.AttrFace("hla", fsize=50, fgcolor=colorfun(leaf.hla))
        leaf.add_face(N, 0)
    return (tree, nodeNames)
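
## Minimal rendering sketch for mkEteHlaTree. The Newick string and its leaf
## labels are hypothetical (real trees come from mkNewickTree), and the color
## rule is a toy that keys on the string representation of the 'hla' feature.
def _example_mkEteHlaTree():
    newick_str = "(B0702:1.0,(A0101:0.5,A0201:0.5):0.5);"
    colorfun = lambda hla: 'red' if 'B' in hla else 'black'
    tree, nodeNames = mkEteHlaTree(newick_str, midpoint=True, colorfun=colorfun)
    tree.render("example-hla-tree.png")
    return nodeNames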
def mkSubSeqNewickTree(hlaAlleles, positions, start=0, aaDistDict=None, verbose=False):
    """
    Make a tree using particular positions in the MHC sequence.
    Example: re-create NetMHCpan pseudo sequences.

    input:
    - hlaAlleles -- the alleles at the leaves of the tree
    - positions -- the positions of interest
    - start -- the alignment could start at a different position
      (see the function sequences.mkPseudoSequence)
    - aaDistDict -- distances between amino acids (default: None)
    - verbose -- print comments (default: False)

    output:
    - newick_str -- the resulting tree in Newick format
    - represDict -- a dictionary mapping the alleles to the chosen representative
    """
    ## import IMGT sequences
    imgtSeqDict = {}
    work_folder = os.getcwd()  ## FIXME: pass path and name of fasta files?
    loci = sorted(hlaAlleles.keys())
    for locus in loci:
        imgtFileName = os.path.join(work_folder,
                                    "data/hla/HLA-{}-IMGT-protein-alignment.fasta".format(locus))
        imgtSeqDict.update(sequences.readFastaFile(imgtFileName))
    ## get sub-sequences
    subSeqDict = {}
    imgtKeys = sorted(imgtSeqDict.keys())
    for locus in loci:
        for hla in hlaAlleles[locus]:
            ## find a key in imgtSeqDict containing the HLA allele
            key = next(filter(lambda k: mhctools.MhcObject(k) in hla, imgtKeys), None)
            ## NB: changed behavior: previously hla.subtype_str() in key
            if key is None:
                raise Exception("HLA allele {} not present in IMGT database".format(hla))
            seq = imgtSeqDict[key]
            subSeq = sequences.mkPseudoSequence(seq, positions, start)
            subSeqDict[hla] = subSeq
    ## make a Newick tree, collapsing equivalence classes into one representative
    newick_str, represDict = mkNewickTree(subSeqDict, aaDistDict=aaDistDict, collapse=True)
    if verbose:
        print("representative alleles:")
        for repres in aux.unique(list(represDict.values())):
            print(repres, subSeqDict[repres])
    return (newick_str, represDict)
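
## Illustrative call: build a tree from a few binding-groove positions. The
## position list is a hypothetical subset, not the actual NetMHCpan
## pseudo-sequence positions; the right `start` depends on the alignment used.
def _example_mkSubSeqNewickTree(hlaAlleles, aaDistDict):
    positions = [9, 45, 62, 97, 114, 152, 156]  ## hypothetical subset
    newick_str, represDict = mkSubSeqNewickTree(hlaAlleles, positions, start=0,
                                                aaDistDict=aaDistDict, verbose=True)
    return newick_str, represDict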
cohortHlaAlleles = {}
for X in "ABC":
    uniques = [mustGetUnique(recordsByID[ID][key])
               for key in hlaLocusKeyDict[X] for ID in subjectIDs]
    censors = [getHLAcensor(hlastring) for hlastring in uniques]
    ## only use uncensored values
    ## NB: pair each HLA string with its censor code
    alleles = [parseHLAstring(x, c, X)
               for x, c in zip(uniques, censors) if c == uncensored_code]
    cohortHlaAlleles[X] = aux.unique(alleles)

## import HLA alleles from another cohort
with open(rootFolder + "hla/mhc-top0.99counts-ncbi-Sub-Saharan-Africa.tsv", 'r') as f:
    hla_table = [row.split('\t') for row in f.read().split('\n') if row != '']
hla_header = hla_table[0]
hla_table = hla_table[1:]
hlaCountDict = dict((mhctools.MhcObject(row[0]), int(row[1])) for row in hla_table)
popHlaAlleles = {
    X: sorted(filter(lambda x: x.locus == X, hlaCountDict.keys()),
              key=lambda x: hlaCountDict[x], reverse=True)
    for X in "ABC"
}

## make a combined list of admissible alleles
admHlaAlleles = {X: aux.unique(cohortHlaAlleles[X] + popHlaAlleles[X]) for X in "ABC"}

## import alleles belonging to serotypes
with open(os.path.join(data_folder, "HLA_serotypes.tsv"), 'r') as f:
def compareModelsSEM(modelname1, modelname2, deltalpd_threshold=0.1):
    """
    Compare two cross-validated models, and also calculate the (scaled) SEMs.
    """
    modelnames = [modelname1, modelname2]
    filenames = ["cv_alleles_summary.{}.tsv".format(modelname) for modelname in modelnames]
    work_folder = os.getcwd()
    fullfilenames = [os.path.join(work_folder, "data", filename) for filename in filenames]
    subjectDicts = [{}, {}]
    for filename, subjDict in zip(fullfilenames, subjectDicts):
        with open(filename, 'r') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for record in reader:
                allele = mhctools.MhcObject(record["allele"])
                subject_line = record["subjects"]
                subject_table = [row.split(",") for row in subject_line.split(";")]
                subject_dict = {int(row[0]): (float(row[1]), float(row[2]))
                                for row in subject_table}
                subjDict[allele] = subject_dict
    alleles = sorted(subjectDicts[0].keys())
    ## compute Delta lpd and scaled SEMs
    deltaLpdDict = {}
    semDict = {}
    for allele in alleles:
        subject_dicts = [subjDict[allele] for subjDict in subjectDicts]
        idxs = list(subject_dicts[0].keys())
        diffs = []
        weights = []
        for idx in idxs:
            clpd0, ppc0 = subject_dicts[0][idx]
            clpd1, ppc1 = subject_dicts[1][idx]
            if ppc0 != 0 and ppc1 != 0:
                diffs.append(clpd1 - clpd0)
                weights.append(ppc1 * ppc0)
        if len(diffs) > 0:
            deltaLpdDict[allele] = np.sum([d * w for d, w in zip(diffs, weights)])
            if len(diffs) > 1:
                mu = deltaLpdDict[allele] / np.sum(weights)
                semDict[allele] = np.sqrt(np.sum([w * (d - mu)**2 for d, w in zip(diffs, weights)]))
            else:
                semDict[allele] = np.nan
        else:
            deltaLpdDict[allele] = np.nan
            semDict[allele] = np.nan
    ## make figure
    loci = aux.unique([allele.locus for allele in alleles])
    alleles = dict((X, [allele for allele in alleles if allele.locus == X]) for X in loci)
    fig, axs = plt.subplots(1, len(loci), figsize=(10, 6.67))
    height = np.max([len(alleles[locus]) for locus in loci])
    for locus, ax in zip(loci, axs):
        maxabsdiff = 0  ## used for x-limits of the axes
        selected_diffs = []
        selected_alleles = []
        selected_errs = []
        for allele in alleles[locus]:
            diff = deltaLpdDict[allele]
            if not np.isnan(diff) and np.abs(diff) >= deltalpd_threshold:
                selected_diffs.append(diff)
                selected_alleles.append(allele)
                selected_errs.append(semDict[allele])
                maxabsdiff = max(np.abs(diff), maxabsdiff)
        ## reverse the order to get a "christmas tree" plot
        pos = np.argsort(np.abs(selected_diffs))[::-1]
        ax.grid(axis='y')
        ax.barh(range(len(pos)), [selected_diffs[p] for p in pos], 0.3,
                color=defn.locusColorDict[locus],
                xerr=np.array([selected_errs[p] for p in pos]),
                #ecolor=defn.locusColorDict[locus],
                capsize=5)
        ax.set_yticks(range(len(pos)))
        ax.set_yticklabels([selected_alleles[p].short_str() for p in pos])
        ax.axvline(x=0, color='k')
        ax.set_xlabel("$\\Delta {{\\rm lpd}}$ HLA-{}".format(locus))
        ## do some basic statistics
        positive = sum(1 for x in selected_diffs if x > 0)
        total = len(selected_diffs)
        pval_binom = sts.binom_test(positive, total)
        print("HLA-{0}: positive = {1}, total = {2}, P = {3} (binomial)".format(locus, positive, total, pval_binom))
        W, pval_wilcox = sts.wilcoxon(selected_diffs)
        print("HLA-{0}: W = {1}, total = {2}, P = {3} (wilcoxon)".format(locus, W, total, pval_wilcox))
        ax.set_xlim((-1.05 * maxabsdiff, 1.05 * maxabsdiff))
    ## additional formatting...
    fig.tight_layout()
    fig.savefig("compare.{0}.{1}.png".format(modelname1, modelname2),
                dpi=300, bbox_inches='tight')
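
## Note on the weighted statistics in compareModelsSEM (added explanation):
## with per-subject differences d_i = clpd1_i - clpd0_i and weights
## w_i = ppc1_i * ppc0_i, the code above computes
##     Delta lpd = sum_i w_i * d_i
##     mu        = (sum_i w_i * d_i) / (sum_i w_i)    ## weighted mean difference
##     sem       = sqrt(sum_i w_i * (d_i - mu)**2)    ## weight-scaled spread
## so the reported "SEM" is a weighted dispersion measure, consistent with the
## "(scaled) SEMs" wording in the docstring.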
def importSubjectData(fileName, traitFieldName, alleleFieldNames,
                      covariateFieldNames=None, traitCensFieldName=None,
                      subset=None, verbose=False, traitTransform=(lambda x: x),
                      categorical=False):
    """
    Import MHC data and continuous disease traits.

    Args:
        fileName (str): name of the tsv/csv file with allele and trait data per subject
        traitFieldName (str): the name of the trait of interest in the header of the tsv file
        alleleFieldNames (dict): a dictionary with the field names for the alleles.

    Kwargs:
        covariateFieldNames (list of str): list with covariate names to be
            imported from the data file.
        traitCensFieldName (str, bool, or None): title of the column with censoring
            values. If None (default) or False, all data is assumed to be uncensored.
            If True, the key is assumed to be X_censoring, where X is the traitFieldName.
        subset (int): take a sample from the complete data.
            If None (default), use all data.
        verbose (bool): print some basic statistics on the imported data.
        traitTransform (function): a function to transform the trait value.
            For instance numpy.log, or the identity (default).
        categorical (bool): True if the trait is categorical, False otherwise (default).
            Use traitTransform to map strings to the required values (e.g. 0/1).

    Returns:
        A dictionary containing
        - TraitValues: the trait values
        - CensCodes: censoring codes for the TraitValues
        - CensBounds: upper (or lower) bounds for left (or right) censored traits.
          Only included if not categorical
        - Categories: a list of possible trait values.
          Only included if categorical
        - AlleleVecs: admissible alleles
        - Alleles: the alleles in the dataset, in the right order for the allele vectors

    Todo:
        - Intelligently handle censoring codes
        - Give a good description of the expected file format
        - Allow for covariates
    """
    ## detect whether the file is tsv or csv
    file_base, file_extension = os.path.splitext(fileName)
    if file_extension == ".csv":
        delim = ','
    elif file_extension == ".tsv":
        delim = '\t'
    elif file_extension == ".xlsx":  ## @todo: implement excel files
        raise Exception("xlsx file format not yet implemented")
    else:
        raise Exception("invalid file format (csv or tsv expected)")
    ## read the contents of the file
    with open(fileName, 'r') as fileHandle:
        reader = csv.DictReader(fileHandle, delimiter=delim)
        data = [row for row in reader]
    ## take a sub-sample
    if subset is not None:
        idxs = np.random.choice(len(data), size=subset, replace=False)
        data = [data[i] for i in idxs]
    ## find data in the table
    Ploidy = 2
    loci = sorted(alleleFieldNames.keys())
    Alleles = {}
    for locus in loci:
        alls = [row[alleleFieldNames[locus][i]].split(';')
                for i in range(Ploidy) for row in data]
        ## make MhcObject-s
        alls = [mhctools.MhcObject(a) for a in aux.unique(aux.flatten(alls))]
        ## remove non-expressed alleles and reduce to 2-field
        alls = [a.protein for a in alls]  ## include Null alleles etc.
        ## find unique elements again
        Alleles[locus] = aux.unique(alls)

    ## for each subject, make a vector with zeros and ones
    ## indicating whether the allele is present
    def hasAllele(subject, allele, locus, i):
        fieldName = alleleFieldNames[locus][i]
        admissibles = [mhctools.MhcObject(a) for a in subject[fieldName].split(';')]
        in_adms = [adm_allele in allele for adm_allele in admissibles]
        return any(in_adms)

    AlleleVecs = {}
    for locus in sorted(alleleFieldNames.keys()):
        AlleleVecs[locus] = [[[hasAllele(subject, allele, locus, i)
                               for allele in Alleles[locus]]
                              for i in range(Ploidy)]
                             for subject in data]
    ## get the trait values and censoring information
    if traitCensFieldName is True:
        ## use the default key, based on the traitFieldName
        traitCensFieldName = "{0}_censoring".format(traitFieldName)
    ## now, get the censoring from the data set if traitCensFieldName is a string
    if type(traitCensFieldName) is str:
        CensCodes = np.array([
            subject[traitCensFieldName]
            if traitCensFieldName in subject
            else defn.uncensored_code  ## FIXME: make sure that this is consistent
            for subject in data
        ])
    else:
        ## assume everything is uncensored
        CensCodes = np.array([defn.uncensored_code for _ in data])
    ## get trait values
    if not categorical:
        TraitValues = np.array([
            traitTransform(float(subject[traitFieldName]))
            if cens_code == defn.uncensored_code else np.nan
            for cens_code, subject in zip(CensCodes, data)
        ])
        ## find the upper or lower bound for the left and (resp.) right censored values.
        ## FIXME: for the non-interval-censored data, use an arbitrary (low) trait value as lower bound
        CensBounds = np.array([
            traitTransform(float(subject[traitFieldName]))
            if cens_code == defn.left_censored_code or cens_code == defn.right_censored_code
            else defn.auxiliaryLowerCensBound
            for cens_code, subject in zip(CensCodes, data)
        ])
    else:
        ## the trait is categorical: censoring can only be missing or uncensored
        TraitValues = np.array([
            traitTransform(subject[traitFieldName])
            if cens_code == defn.uncensored_code else np.nan
            for cens_code, subject in zip(CensCodes, data)
        ])
        ## make a list of possible trait values
        Categories = aux.unique([
            x for x, c in zip(TraitValues, CensCodes)
            if c == defn.uncensored_code
        ])
    ## import covariates
    if covariateFieldNames is None:
        covariateFieldNames = []  ## empty list
    Covariates = {}  ## dictionary indexed by covariate name, empty if no covariates
    for cfn in covariateFieldNames:
        ccfn = "{0}_censoring".format(cfn)
        covVals, covCCs, covCBs = fetchContinuousValue(cfn, ccfn, data)
        Covariates[cfn] = {
            "Values": covVals,
            "CensCodes": covCCs,
            "CensBounds": covCBs
        }
    ## do some printing...
    if verbose:
        ## number of subjects
        print("number of subjects: {0}".format(len(data)))
        ## trait statistics
        if not categorical:
            mtrait = np.nanmedian(TraitValues)
            ltrait = np.nanpercentile(TraitValues, 2.5)
            htrait = np.nanpercentile(TraitValues, 97.5)
            print("median trait value: {0:0.2f}, 2.5 - 97.5 percentiles: {1:0.2f} - {2:0.2f}"
                  .format(mtrait, ltrait, htrait))
        else:
            catcounts = {
                cat: len([x for x in TraitValues if x == cat])
                for cat in Categories
            }
            print("category histogram:")
            for cat in Categories:
                print(f"\t{cat}:\t{catcounts[cat]}")
        ## allele statistics
        print("total number of alleles:", np.sum([len(Alleles[locus]) for locus in loci]))
        for locus in sorted(Alleles.keys()):
            print(f"number of {locus} alleles: {len(Alleles[locus])}")
        numCompleteHaplotypes = 0
        for idx in range(len(data)):
            compl = [
                sum(1 for x in alleleVec if x) == 1
                for locus in sorted(Alleles.keys())
                for alleleVec in AlleleVecs[locus][idx]
            ]
            if all(compl):  ## subject has a complete 2-field typed haplotype
                numCompleteHaplotypes += 1
        print("number of complete haplotypes: {0}".format(numCompleteHaplotypes))
        ## covariates
        if len(covariateFieldNames) > 0:
            print("covariate statistics:")
            for cfn in covariateFieldNames:
                mcov = np.nanmean(Covariates[cfn]["Values"])
                lcov = np.nanpercentile(Covariates[cfn]["Values"], 2.5)
                hcov = np.nanpercentile(Covariates[cfn]["Values"], 97.5)
                print(f"\t'{cfn}' mean value: {mcov:0.2f}, 2.5 - 97.5 percentiles: {lcov:0.2f} - {hcov:0.2f}")
    ## return the relevant data in a dictionary
    dataDict = {
        "TraitValues": TraitValues,
        "CensCodes": CensCodes,
        "AlleleVecs": AlleleVecs,
        "Alleles": Alleles,
        "Covariates": Covariates
    }
    ## include censoring bounds if the trait value is a real number
    if not categorical:
        dataDict.update({"CensBounds": CensBounds})
    else:
        dataDict.update({"Categories": Categories})
    return dataDict
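
## End-to-end sketch of importSubjectData (hypothetical file and field names):
## two allele columns per locus, a log10-transformed trait, and censoring codes
## taken from the default "<trait>_censoring" column.
def _example_importSubjectData():
    alleleFieldNames = {X: ["HLA_{}1".format(X), "HLA_{}2".format(X)] for X in "ABC"}
    dataDict = importSubjectData("data/subjects.tsv", "spVL", alleleFieldNames,
                                 traitCensFieldName=True, verbose=True,
                                 traitTransform=np.log10)
    ## dataDict["AlleleVecs"]["A"][0] holds the pair of admissibility vectors
    ## (one per haplotype) for the first subject at locus A
    return dataDict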