def load_ensembl2interpro(self):
    """Load ensembl-gene -> InterPro domain mapping, re-keyed by entrezgene."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
    load_start(datafile)
    # column 1 is the ensembl gene id (key); 4/5/6 are the InterPro
    # accession, short description and full description
    mapping = dict_nodup(tab2dict(datafile, (1, 4, 5, 6), 0))
    mapping = value_convert(
        mapping,
        lambda row: {'id': row[0], 'short_desc': row[1], 'desc': row[2]})
    # wrap each value under the "interpro" field
    mapping = value_convert(mapping, lambda v: {'interpro': v},
                            traverse_list=False)
    load_done('[%d]' % len(mapping))
    return self.convert2entrez(mapping)
def load_ensembl2pos(self):
    """Load genomic position per ensembl gene, re-keyed by entrezgene."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(datafile)
    pos_d = dict_nodup(
        tab2dict(datafile, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))

    def _to_pos(row):
        # row is (start, end, chr, strand); start/end/strand are numeric
        return {'chr': row[2],
                'start': int(row[0]),
                'end': int(row[1]),
                'strand': int(row[3])}

    pos_d = value_convert(pos_d, _to_pos)
    pos_d = value_convert(pos_d, lambda v: {'genomic_pos': v},
                          traverse_list=False)
    load_done('[%d]' % len(pos_d))
    return self.convert2entrez(pos_d)
def _cvt(pli):
    """Group pathway entries by source; sort multi-entry lists by "id"."""
    by_source = value_convert(list2dict(pli, 2), _inner_cvt)
    # in-place sort of list values for deterministic output order
    for entries in by_source.values():
        if isinstance(entries, list):
            entries.sort(key=lambda e: e["id"])
    return {'pathway': by_source}
def _cvt(pli):
    """Group pathway entries by source; sort list values for stable output.

    Entries produced by ``_inner_cvt`` are dicts (the sibling
    implementation sorts them by ``e["id"]``), so a bare ``list.sort()``
    raises TypeError on Python 3 where dicts are unorderable. Sort by the
    entry "id" instead, matching the sibling implementation.
    """
    _d = list2dict(pli, 2)
    _d = value_convert(_d, _inner_cvt)
    for p_source in _d:
        if isinstance(_d[p_source], list):
            _d[p_source].sort(key=lambda e: e["id"])
    return {'pathway': _d}
def _map_line_to_json(fields):
    """Convert one EMV csv row into a variant JSON document keyed by an
    HGVS-style "chrN:..." id.

    Returns None when no id can be built; otherwise the cleaned document
    (empty-string values swept out).
    """
    vid = fields[0].split(":")
    # Extract the numeric chromosome from the id prefix. The character
    # class must include '0': the original r'[1-9]+' truncated "10"/"20"
    # to "1"/"2".
    chrom = re.search(r'[0-9]+', vid[0]).group()
    if chrom == '23':
        # chromosome 23 is encoded numerically in the source; report as X
        chrom = 'X'
    HGVS = "chr%s:%s" % (chrom, vid[1])
    # load as json data (HGVS is always a string here; guard kept for
    # parity with sibling parsers)
    if HGVS is None:
        return
    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }
    return unlist(dict_sweep(value_convert(one_snp_json), vals=[""]))
def load_x(idx, fieldname, cvt_fn=None):
    """Load a generic per-gene field from the UniProt idmapping dump.

    idx is the 0-based column number of the target value; fieldname is the
    key under which the value is stored; cvt_fn optionally converts each
    raw value. Returns a dict keyed by entrezgene id, falling back to the
    ensembl gene id when no entrez mapping exists.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)
    # ensembl gene id -> [entrez gene ids]
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    # one ensembl id may map to several entrez ids
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    # no entrez mapping: key by the ensembl id itself
                    xli2.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(xli2), 0)
    # isinstance(value, list) replaces the Python-2-only
    # `type(value) is types.ListType`, which breaks on Python 3.
    fn = lambda value: {
        fieldname: sorted(value) if isinstance(value, list) else value
    }
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def load_ensembl2pfam(self):
    """Load ensembl-gene -> Pfam accession mapping, re-keyed by entrezgene."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
    load_start(datafile)
    # column 1: ensembl gene id (key); column 4: Pfam accession
    pfam_d = dict_nodup(tab2dict(datafile, (1, 4), 0))
    pfam_d = value_convert(pfam_d, lambda v: {'pfam': v},
                           traverse_list=False)
    load_done('[%d]' % len(pfam_d))
    return self.convert2entrez(pfam_d)
def _load_ensembl_2taxid(self):
    """Return a dict mapping ensembl gene id -> taxid (as int)."""
    datafile = os.path.join(DATA_FOLDER,
                            'gene_ensembl__translation__main.txt')
    load_start(datafile)
    taxid_d = dict_nodup(
        tab2dict(datafile, (0, 1), 1, includefn=_not_LRG))
    # taxid values arrive as strings; store them as integers
    taxid_d = value_convert(taxid_d, int)
    load_done('[%d]' % len(taxid_d))
    return taxid_d
def load_genedoc(self=None):
    """Collect reporter data from every reporter module and merge them
    into a single dict of {'reporter': ...} documents."""
    combined = {}
    for mod in reporter_modules:
        combined.update(mod.loaddata())
    platforms = combined.keys()
    merged = merge_dict([combined[p] for p in platforms], platforms)
    merged = value_convert(merged, lambda v: {'reporter': v},
                           traverse_list=False)
    return merged
def load_pharmgkb():
    """Load gene -> PharmGKB accession mapping from genes.zip."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(datafile)
    # read genes.tsv inside the zip: PharmGKB accession (col 0) keyed by
    # gene id (col 1); rows with an empty gene id are skipped
    gene2pharmgkb = tab2dict((datafile, 'genes.tsv'), (0, 1), 1, header=1,
                             includefn=lambda ld: ld[1] != '')
    gene2pharmgkb = value_convert(gene2pharmgkb,
                                  lambda v: {'pharmgkb': v},
                                  traverse_list=False)
    load_done('[%d]' % len(gene2pharmgkb))
    return gene2pharmgkb
def load_ensembl2pos(self):
    """Load genomic position (including the ensembl gene id itself inside
    genomic_pos), re-keyed by entrezgene."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(datafile)
    # Column 1 appears twice: once as the dict key and once in the value,
    # so the ensembl gene id also ends up inside each genomic_pos record.
    raw = dict_nodup(
        tab2dict(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG))

    def _to_pos(row):
        return {
            'ensemblgene': row[0],
            'chr': row[3],
            'start': int(row[1]),
            'end': int(row[2]),
            'strand': int(row[4]),
        }

    raw = value_convert(raw, _to_pos)
    raw = value_convert(raw, lambda v: {'genomic_pos': v},
                        traverse_list=False)
    load_done('[%d]' % len(raw))
    return self.convert2entrez(raw)
def load_uniprot():
    """Load gene -> UniProt accession mapping from UniProt's
    idmapping_selected.tab.gz dump.

    Returns a dict keyed by entrezgene id (falling back to the ensembl
    gene id when no entrez mapping exists); each value is produced by
    ``_dict_convert`` from the (accession, section) pairs.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        ld = listitems(ld, *(0, 1, 2, 18))
        # UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        for value in dupline_seperator(
                dupline=ld,
                dup_idx=[2, 3],  # GeneID and EnsemblID columns may have duplicates
                dup_sep='; '):
            value = list(value)
            # map the UniProtKB-ID to its section (e.g. Swiss-Prot / TrEMBL)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xli.append(value)
    # ensembl gene id -> [entrez gene ids]
    ensembl2geneid = list2dict([(x[3], x[2]) for x in xli
                                if x[2] != '' and x[3] != ''],
                               0, alwayslist=True)
    xli2 = []
    for uniprot_acc, section, entrez_id, ensembl_id in xli:
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            entrez_id = ensembl2geneid.get(ensembl_id, None)
            if entrez_id:
                #if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            else:
                #otherwise, just use ensembl_id
                xli2.append((uniprot_acc, section, ensembl_id))
    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))
    return gene2uniprot
def _load_ensembl2name(self):
    """loading ensembl gene to symbol+name mapping"""
    DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(DATAFILE)
    # col 1: ensembl gene id (key); col 2: symbol; col 7: name/description
    ensembl2name = tab2dict(DATAFILE, (1, 2, 7), 0, includefn=_not_LRG)

    def _fn(x):
        # build {'symbol': ..., 'name': ...}, skipping empty values and
        # '\N' (the dump's NULL marker)
        out = {}
        if x[0].strip() not in ['', '\\N']:
            out['symbol'] = x[0].strip()
        if x[1].strip() not in ['', '\\N']:
            # drop the trailing " [Source:...]" annotation from the name
            _name = SubStr(x[1].strip(), '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    ensembl2name = value_convert(ensembl2name, _fn)
    load_done('[%d]' % len(ensembl2name))
    return ensembl2name
def _load_ensembl2name(self):
    """Map each ensembl gene id to its symbol and cleaned-up name."""
    datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
    load_start(datafile)
    name_d = tab2dict(datafile, (1, 2, 7), 0, includefn=_not_LRG)

    def _clean(pair):
        # skip empty values and '\N' (the dump's NULL marker)
        symbol, raw_name = pair[0].strip(), pair[1].strip()
        doc = {}
        if symbol not in ('', '\\N'):
            doc['symbol'] = symbol
        if raw_name not in ('', '\\N'):
            # drop the trailing " [Source:...]" annotation
            cleaned = SubStr(raw_name, '', ' [Source:').strip()
            if cleaned:
                doc['name'] = cleaned
        return doc

    name_d = value_convert(name_d, _clean)
    load_done('[%d]' % len(name_d))
    return name_d
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
    entrezgene ids as the keys.'''
    # lazily load the ensembl<->entrez id pairs and the main ensembl table
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)
    #Now make a dictionary indexed by entrez gene id
    print('# of ensembl IDs in total: %d' %
          len(set(ensembl2x) | set(ensembl2entrez)))
    print('# of ensembl IDs match entrez Gene IDs: %d' %
          len(set(ensembl2x) & set(ensembl2entrez)))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
          len(set(ensembl2x) - set(ensembl2entrez)))

    #all genes with matched entrez
    def _fn(eid, taxid=None):
        d = copy.copy(ensembl2x.get(
            eid, {}))  # need to make a copy of the value here.
        return d  # otherwise, it will cause issue when multiple entrezgene ids
        # match the same ensembl gene, for example,
        # ENSMUSG00000027104 --> (11909, 100047997)

    data = value_convert(entrez2ensembl, _fn)

    #add those has no matched entrez geneid, using ensembl id as the key
    for eid in (set(ensembl2x) - set(ensembl2entrez)):
        _g = ensembl2x[eid]
        #_g.update(self.ensembl_main.get(eid, {}))
        data[eid] = _g

    # normalize every value: de-duplicate dicts; merge attribute lists
    for id in data:
        if isinstance(data[id], dict):
            _doc = dict_nodup(data[id], sort=True)
        else:
            #if one entrez gene matches multiple ensembl genes
            _doc = dict_attrmerge(data[id], removedup=True, sort=True)
        data[id] = _doc

    return data
def _map_line_to_json(item):
    """Yield one geno2mp JSON document per ALT allele of a VCF record."""
    hpo_count = item.INFO['HPO_CT']
    for alt in item.ALT:
        hgvs_id, _var_type = get_hgvs_from_vcf(
            item.CHROM, item.POS, item.REF, str(alt), mutant_type=True)
        # stop as soon as an allele cannot be expressed as an HGVS id
        if hgvs_id is None:
            return
        doc = {
            "_id": hgvs_id,
            "geno2mp": {
                "hpo_count": hpo_count,
            }
        }
        yield dict_sweep(unlist(value_convert(doc)), [None])
def load_x(idx, fieldname, cvt_fn=None):
    '''idx is 0-based column number'''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        ld = listitems(ld, *(2, 19, idx))  # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)
    # ensembl gene id -> [entrez gene ids]
    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli
                                            if x[0] != '' and x[1] != '']),
                               0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    # one ensembl id may map to several entrez ids
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    # no entrez mapping: key by the ensembl id itself
                    xli2.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(xli2), 0)
    # wrap under the requested field name, sorting multi-valued entries
    fn = lambda value: {
        fieldname: sorted(value) if isinstance(value, list) else value
    }
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def convert2entrez(self, ensembl2x):
    '''convert a dict with ensembl gene ids as the keys to matching
    entrezgene ids as the keys.'''
    if not self.ensembl2entrez_li:
        self._load_ensembl2entrez_li()
    if not self.ensembl_main:
        self.ensembl_main = self.load_ensembl_main()
    ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
    entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)
    # Now make a dictionary indexed by entrez gene id.
    # Single-argument print() calls behave identically on Python 2 and 3,
    # replacing the Py2-only print statements.
    print('# of ensembl IDs in total: %d'
          % len(set(ensembl2x) | set(ensembl2entrez)))
    print('# of ensembl IDs match entrez Gene IDs: %d'
          % len(set(ensembl2x) & set(ensembl2entrez)))
    print('# of ensembl IDs DO NOT match entrez Gene IDs: %d'
          % len(set(ensembl2x) - set(ensembl2entrez)))

    # all genes with matched entrez
    def _fn(eid, taxid=None):
        # need to make a copy of the value here; otherwise it causes
        # issues when multiple entrezgene ids match the same ensembl
        # gene, for example ENSMUSG00000027104 --> (11909, 100047997)
        return copy.copy(ensembl2x.get(eid, {}))

    data = value_convert(entrez2ensembl, _fn)

    # add those without a matched entrez geneid, using ensembl id as the key
    for eid in (set(ensembl2x) - set(ensembl2entrez)):
        _g = ensembl2x[eid]
        #_g.update(self.ensembl_main.get(eid, {}))
        data[eid] = _g

    # (removed dead `doc_li = []` that was never used)
    for id in data:
        # isinstance(..., dict) replaces the Py2-only types.DictType check
        if isinstance(data[id], dict):
            _doc = dict_nodup(data[id], sort=True)
        else:
            # one entrez gene matches multiple ensembl genes
            _doc = dict_attrmerge(data[id], removedup=True, sort=True)
        data[id] = _doc
    return data
def _map_line_to_json(item):
    """Generate a geno2mp document for each alternate allele in *item*."""
    chrom, pos, ref = item.CHROM, item.POS, item.REF
    hpo_ct = item.INFO['HPO_CT']
    for alt_allele in item.ALT:
        hgvs_id, _ = get_hgvs_from_vcf(chrom, pos, ref, str(alt_allele),
                                       mutant_type=True)
        if hgvs_id is None:
            # no valid HGVS id for this allele -> abort the whole record
            return
        yield dict_sweep(
            unlist(value_convert({
                "_id": hgvs_id,
                "geno2mp": {
                    "hpo_count": hpo_ct,
                }
            })), [None])
def load_uniprot():
    """Load gene -> UniProt accession/section mapping from UniProt's
    idmapping_selected.tab.gz, keyed by entrezgene id with an ensembl gene
    id fallback."""
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        # NOTE(review): a sibling version of this loader selects column 18
        # for Ensembl(Gene); here 19 is used — confirm which 0-based index
        # matches the current idmapping_selected.tab column layout.
        ld = listitems(ld, *(0, 1, 2, 19))  #UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        for value in dupline_seperator(dupline=ld,
                                       dup_idx=[2, 3],  #GeneID and EnsemblID columns may have duplicates
                                       dup_sep='; '):
            value = list(value)
            # map the UniProtKB-ID to its section (e.g. Swiss-Prot / TrEMBL)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xli.append(value)
    # ensembl gene id -> [entrez gene ids]
    ensembl2geneid = list2dict([(x[3], x[2]) for x in xli
                                if x[2] != '' and x[3] != ''],
                               0, alwayslist=True)
    xli2 = []
    for uniprot_acc, section, entrez_id, ensembl_id in xli:
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            entrez_id = ensembl2geneid.get(ensembl_id, None)
            if entrez_id:
                #if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            else:
                #otherwise, just use ensembl_id
                xli2.append((uniprot_acc, section, ensembl_id))
    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))
    return gene2uniprot
def _map_line_to_json(item):
    """Yield one ExAC JSON document per ALT allele of a VCF record.

    Several INFO fields (rank-sum statistics, QD, InbreedingCoeff) are
    optional in the source VCF; absent keys default to None.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _opt(key):
        # Optional INFO field: absent keys become None. Narrowed from the
        # original bare `except:` so unrelated errors are not swallowed.
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _opt('BaseQRankSum')
    clippingranksum = _opt('ClippingRankSum')
    mqranksum = _opt('MQRankSum')
    readposranksum = _opt('ReadPosRankSum')
    qd = _opt('QD')
    inbreedingcoeff = _opt('InbreedingCoeff')
    # stringify alleles in place so the "alleles" field is JSON-friendly
    for i in range(0, len(item.ALT)):
        item.ALT[i] = str(item.ALT[i])
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
def _map_line_to_json(fields):
    """Convert one GRASP row into a JSON document keyed by an HGVS id.

    Resolves the row's rsid to an HGVS _id via a live query against the
    myvariant.info web service (network call), then returns the cleaned
    document with empty strings swept out and comma-joined values split.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    # load as json data
    if rsid is None:
        return
    url = 'http://myvariant.info/v1/query?q=dbsnp.rsid:'\
        + rsid + '&fields=_id'
    r = requests.get(url)
    for hits in r.json()['hits']:
        HGVS = hits['_id']
        one_snp_json = {
            "_id": HGVS,
            "grasp": {
                'hg19': {
                    'chr': fields[5],
                    'pos': fields[6]
                },
                'hupfield': fields[1],
                'last_curation_date': fields[2],
                'creation_date': fields[3],
                'srsid': fields[4],
                'publication': {
                    'journal': fields[16],
                    'title': fields[17],
                    'pmid': fields[7],
                    'snpid': fields[8],
                    'location_within_paper': fields[9],
                    'p_value': fields[10],
                    'phenotype': fields[11],
                    'paper_phenotype_description': fields[12],
                    'paper_phenotype_categories': fields[13],
                    'date_pub': fields[14]
                },
                'includes_male_female_only_analyses': fields[18],
                'exclusively_male_female': fields[19],
                'initial_sample_description': fields[20],
                'replication_sample_description': fields[21],
                'platform_snps_passing_qc': fields[22],
                'gwas_ancestry_description': fields[23],
                'discovery': {
                    'total_samples': fields[25],
                    'european': fields[26],
                    'african': fields[27],
                    'east_asian': fields[28],
                    'indian_south_asian': fields[29],
                    'hispanic': fields[30],
                    'native': fields[31],
                    'micronesian': fields[32],
                    'arab_me': fields[33],
                    'mixed': fields[34],
                    'unspecified': fields[35],
                    'filipino': fields[36],
                    'indonesian': fields[37]
                },
                'replication': {
                    'total_samples': fields[38],
                    'european': fields[39],
                    'african': fields[40],
                    'east_asian': fields[41],
                    'indian_south_asian': fields[42],
                    'hispanic': fields[43],
                    'native': fields[44],
                    'micronesian': fields[45],
                    'arab_me': fields[46],
                    'mixed': fields[47],
                    'unspecified': fields[48],
                    'filipino': fields[49],
                    'indonesian': fields[50]
                },
                'in_gene': fields[51],
                'nearest_gene': fields[52],
                'in_lincrna': fields[53],
                'in_mirna': fields[54],
                'in_mirna_bs': fields[55],
                'oreg_anno': fields[61],
                'conserv_pred_tfbs': fields[62],
                'human_enhancer': fields[63],
                'rna_edit': fields[64],
                'polyphen2': fields[65],
                'sift': fields[66],
                'ls_snp': fields[67],
                'uniprot': fields[68],
                'eqtl_meth_metab_study': fields[69]
            }
        }
        # return the document built for the first matching hit
        return list_split(dict_sweep(unlist(value_convert(one_snp_json)),
                                     [""]), ",")
def _map_line_to_json(cp, hg19):
    """Yield one JSON document per Measure of a ClinVar record *cp*.

    *hg19* selects whether hg19 (GRCh37) or hg38 (GRCh38) coordinates are
    used to build the HGVS _id. Measures of type 'Variation',
    'protein only' or 'Microsatellite' are skipped.

    NOTE(review): the bare `except:` clauses below swallow all errors
    when optional attributes are absent from the parsed XML — they
    presumably guard AttributeError, but any failure is silenced.
    """
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None
    variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    trait = cp.ReferenceClinVarAssertion.TraitSet.Trait[0]
    # collect condition names: 'Preferred' becomes the main name,
    # 'Alternate' entries become synonyms
    synonyms = []
    conditions_name = ''
    for name in trait.Name:
        if name.ElementValue.Type == 'Alternate':
            synonyms.append(name.ElementValue.get_valueOf_())
        if name.ElementValue.Type == 'Preferred':
            conditions_name += name.ElementValue.get_valueOf_()
    # collect cross-reference identifiers keyed by lowercased DB name
    identifiers = {}
    for item in trait.XRef:
        if item.DB == 'Human Phenotype Ontology':
            key = 'Human_Phenotype_Ontology'
        else:
            key = item.DB
        identifiers[key.lower()] = item.ID
    for symbol in trait.Symbol:
        if symbol.ElementValue.Type == 'Preferred':
            conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()
    # MeasureSet.Measure return a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart_19 = None
        chromEnd_19 = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart_19 = SequenceLocation.start
                    chromEnd_19 = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    # fall back to GRCh38 alleles when GRCh37 had none
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is unwrapped; multiples kept as list
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # choose coordinates for the _id based on the requested assembly
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have format different\
            # from other types, should be dealt with seperately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.Attribute.
                                               get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.Attribute.
                                                  get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.
                                                  Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        break
            if chrom and chromStart and chromEnd:
                # build the HGVS _id according to the variation type
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart,
                                                   ref, alt)
                # items whose type belong to 'Indel, Insertion, \
                # Duplication' might not hava explicit alt information, \
                # so we will parse from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position+3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                            (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                        (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position+3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position+3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                (chrom, chromStart, chromEnd, dup_ref)
                elif variation_type == 'copy number loss' or\
                        variation_type == 'copy number gain':
                    if hgvs_genome and chrom:
                        hgvs_id = "chr" + chrom + ":" + \
                            hgvs_genome.split('.')[2]
                elif hgvs_coding:
                    # fall back to the coding-level HGVS id
                    hgvs_id = hgvs_coding
                    coding_hgvs_only = True
                else:
                    print "couldn't find any id", rcv_accession
                    return
            else:
                print 'no measure.attribute', rcv_accession
                return
        # sort each HGVS list for deterministic output
        for key in HGVS:
            HGVS[key].sort()
        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart_19,
                        "end": chromEnd_19
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": {
                            "name": conditions_name,
                            "synonyms": synonyms,
                            "identifiers": identifiers,
                            "age_of_onset": age_of_onset
                        }
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json,
                                                   ['chrom', 'omim', 'id',
                                                    'orphanet', 'gene',
                                                    'rettbase_(cdkl5)',
                                                    'cosmic', 'dbrbc'])),
                              [None, '', 'None']))
            yield obj
def _map_line_to_json(fields):
    """Convert one CADD tab-delimited row into a JSON document keyed by
    the HGVS id built from the VCF-style columns; "NA" values are swept
    out of the returned document."""
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    # load as json data
    if HGVS is None:
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],  # ref aa
            'naa': fields[108],  # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],  # raw CADD score
            'phred': fields[115]  # log-percentile of raw CADD score
        }
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
def _map_line_to_json(fields):
    """Convert one dbNSFP tab-delimited row into a JSON document keyed by
    an HGVS-style id built from the hg19 position; "." values are swept
    out and ";"-joined values are split."""
    # specific variable treatment
    chrom = fields[0]
    # hg18 end is start+1 unless the hg18 position is missing (".")
    if fields[7] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[7])+1
    chromStart = int(fields[1])
    chromEnd = int(fields[1]) + 1
    allele1 = fields[2]
    allele2 = fields[3]
    HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, allele1, allele2)
    # SiPhy base frequencies come as "a:c:g:t"
    if fields[74] == ".":
        siphy = "."
    else:
        freq = fields[74].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # pair up the ";"-separated uniprot accessions and positions
    acc = fields[11].rstrip().rstrip(';').split(";")
    pos = fields[13].rstrip().rstrip(';').split(";")
    # NOTE(review): on Python 3 `map` returns a lazy iterator, not a list —
    # confirm the downstream unlist/value_convert handle that, or that this
    # module runs on Python 2 only.
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "chrom": chrom,
            "hg19": {
                "start": fields[1],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[7],
                "end": hg18_end
            },
            "hg38": {
                "chrom": fields[8],
                "pos": fields[9]
            },
            "allele1": allele1,
            "allele2": allele2,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[23],
                "refcodon": fields[16],
                "codonpos": fields[18],
                "aapos_sift": fields[24],
                "aapos_fathmm": fields[25]
            },
            "genename": fields[10],
            "uniprot": uniprot,
            "interpro_domain": fields[14],
            "cds_strand": fields[15],
            "slr_test_statistic": fields[17],
            "fold-degenerate": fields[19],
            "ancestral_allele": fields[20],
            "ensembl": {
                "geneid": fields[21],
                "transcriptid": fields[22]
            },
            "sift": {
                "score": fields[26],
                "converted_rankscore": fields[27],
                "pred": fields[28]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37]
            },
            "mutationtaster": {
                "score": fields[38],
                "converted_rankscore": fields[39],
                "pred": fields[40]
            },
            "mutationassessor": {
                "score": fields[41],
                "rankscore": fields[42],
                "pred": fields[43]
            },
            "fathmm": {
                "score": fields[44],
                "rankscore": fields[45],
                "pred": fields[46]
            },
            "radialsvm": {
                "score": fields[47],
                "rankscore": fields[48],
                "pred": fields[49]
            },
            "lr": {
                "score": fields[50],
                "rankscore": fields[51],
                "pred": fields[52]
            },
            "reliability_index": fields[53],
            "vest3": {
                "score": fields[54],
                "rankscore": fields[55]
            },
            "cadd": {
                "raw": fields[56],
                "raw_rankscore": fields[57],
                "phred": fields[58]
            },
            "gerp++": {
                "nr": fields[59],
                "rs": fields[60],
                "rs_rankscore": fields[61]
            },
            "phylop": {
                "46way": {
                    "primate": fields[62],
                    "primate_rankscore": fields[63],
                    "placental": fields[64],
                    "placental_rankscore": fields[65],
                },
                "100way": {
                    "vertebrate": fields[66],
                    "vertebrate_rankscore": fields[67]
                }
            },
            "phastcons": {
                "46way": {
                    "primate": fields[68],
                    "primate_rankscore": fields[69],
                    "placental": fields[70],
                    "placental_rankscore": fields[71],
                },
                "100way": {
                    "vertebrate": fields[72],
                    "vertebrate_rankscore": fields[73]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[75],
                "logodds_rankscore": fields[76]
            },
            "lrt_omega": fields[77],
            "unisnp_ids": fields[78],
            "1000gp1": {
                "ac": fields[79],
                "af": fields[80],
                "afr_ac": fields[81],
                "afr_af": fields[82],
                "eur_ac": fields[83],
                "eur_af": fields[84],
                "amr_ac": fields[85],
                "amr_af": fields[86],
                "asn_ac": fields[87],
                "asn_af": fields[88]
            },
            "esp6500": {
                "aa_af": fields[89],
                "ea_af": fields[90]
            },
            "aric5606": {
                "aa_ac": fields[91],
                "aa_af": fields[92],
                "ea_ac": fields[93],
                "ea_af": fields[94]
            },
            "clinvar": {
                "rs": fields[95],
                "clin_sig": fields[96],
                "trait": fields[97]
            }
        }
    }
    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)),
                                         vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields): assert len(fields) == VALID_COLUMN_NO chr_info = re.findall(r"[\w']+", fields[17]) chrom = chr_info[0] # Mutation GRCh37 genome position chromStart = chr_info[1] chromEnd = chr_info[2] HGVS = None cds = fields[13] sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds) ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds) delete = cds.find('del') != -1 del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds) comp = re.search(r'[ATCGMN]+', cds) if sub: HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group()) elif ins: HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group()) elif delete: HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd) elif del_ins: HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, comp.group()) # elif comp: # HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, comp.group()) else: HGVS = fields[12] print "Error2:", fields[15], cds, fields[17] # load as json data if HGVS is None: return one_snp_json = { "sorter": fields[17] + fields[13], "_id": HGVS, "cosmic": { "gene": { "symbol": fields[0], # Gene name "id": fields[3], # HGNC ID "cds_length": fields[2] }, "transcript": fields[1], # Accession Number "sample": { "name": fields[4], # Sample name "id": fields[5] # ID_sample }, "tumour": { "id": fields[6], # ID_tumour "primary_site": fields[7], # Primary site "site_subtype": fields[8], # Site subtype "primary_histology": fields[9], # Primary histology "histology_subtype": fields[10], # Histology subtype "origin": fields[1] }, "mutation": { "id": "COSM" + fields[12], # Mutation ID "cds": cds, # Mutation CDS "aa": fields[14], # Mutation AA "description": fields[15], # Mutation Description "zygosity": fields[16], # Mutation zygosity "somatic_status": fields[21] # Mutation somatic status }, "chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "pubmed": fields[22] # Pubmed_PMID } } return dict_sweep(value_convert(one_snp_json), vals=[""])
def _map_line_to_json(fields):
    """Convert one GRASP row into a MyVariant JSON doc.

    Resolves the row's rsid (fields[8]) to an HGVS "_id" by querying the
    live myvariant.info API — this function performs one HTTP request per
    input line and requires network access.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    # load as json data
    if rsid is None:
        return
    url = "http://myvariant.info/v1/query?q=dbsnp.rsid:" + rsid + "&fields=_id"
    r = requests.get(url)
    for hits in r.json()["hits"]:
        HGVS = hits["_id"]
        # NOTE(review): building/returning inside the loop means only the
        # FIRST hit is emitted; with zero hits HGVS is never bound and no
        # doc is produced — confirm intended handling of multi-hit rsids
        one_snp_json = {
            "_id": HGVS,
            "grasp": {
                "hg19": {"chr": fields[5], "pos": fields[6]},
                "hupfield": fields[1],
                "last_curation_date": fields[2],
                "creation_date": fields[3],
                "srsid": fields[4],
                "publication": {
                    "journal": fields[16],
                    "title": fields[17],
                    "pmid": fields[7],
                    "snpid": fields[8],
                    "location_within_paper": fields[9],
                    "p_value": fields[10],
                    "phenotype": fields[11],
                    "paper_phenotype_description": fields[12],
                    "paper_phenotype_categories": fields[13],
                    "date_pub": fields[14],
                },
                "includes_male_female_only_analyses": fields[18],
                "exclusively_male_female": fields[19],
                "initial_sample_description": fields[20],
                "replication_sample_description": fields[21],
                "platform_snps_passing_qc": fields[22],
                "gwas_ancestry_description": fields[23],
                "discovery": {
                    "total_samples": fields[25],
                    "european": fields[26],
                    "african": fields[27],
                    "east_asian": fields[28],
                    "indian_south_asian": fields[29],
                    "hispanic": fields[30],
                    "native": fields[31],
                    "micronesian": fields[32],
                    "arab_me": fields[33],
                    "mixed": fields[34],
                    "unspecified": fields[35],
                    "filipino": fields[36],
                    "indonesian": fields[37],
                },
                "replication": {
                    "total_samples": fields[38],
                    "european": fields[39],
                    "african": fields[40],
                    "east_asian": fields[41],
                    "indian_south_asian": fields[42],
                    "hispanic": fields[43],
                    "native": fields[44],
                    "micronesian": fields[45],
                    "arab_me": fields[46],
                    "mixed": fields[47],
                    "unspecified": fields[48],
                    "filipino": fields[49],
                    "indonesian": fields[50],
                },
                "in_gene": fields[51],
                "nearest_gene": fields[52],
                "in_lincrna": fields[53],
                "in_mirna": fields[54],
                "in_mirna_bs": fields[55],
                "oreg_anno": fields[61],
                "conserv_pred_tfbs": fields[62],
                "human_enhancer": fields[63],
                "rna_edit": fields[64],
                "polyphen2": fields[65],
                "sift": fields[66],
                "ls_snp": fields[67],
                "uniprot": fields[68],
                "eqtl_meth_metab_study": fields[69],
            },
        }
        # sweep empty strings, flatten 1-element lists, split ","-joined values
        return list_split(dict_sweep(unlist(value_convert(one_snp_json)), [""]), ",")
def _map_line_to_json(item):
    """Convert one parsed ExAC VCF record into MyVariant JSON docs.

    `item` is a PyVCF-style record (attributes CHROM, POS, REF, ALT, INFO).
    Generator: yields one document per ALT allele; stops early if an HGVS id
    cannot be built for an allele.  Documents are cleaned with
    value_convert/unlist/dict_sweep before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # Optional INFO keys: default to None when absent.  This replaces the
    # original try/bare-except chains, which also silently masked unrelated
    # errors (not just missing keys).
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # normalize ALT alleles to plain strings once (PyVCF yields allele
    # objects); the original also re-converted each alt inside the loop,
    # which was a no-op after this pass
    for i in range(0, len(item.ALT)):
        item.ALT[i] = str(item.ALT[i])
    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
def _dot_to_none(scores):
    """Replace "." placeholders with None in a multi-valued score list.

    Mirrors the original inline cleanup exactly: single-element lists are
    returned untouched (their "." is swept later by dict_sweep).
    """
    if len(scores) > 1:
        return [None if s == '.' else s for s in scores]
    return scores


def _map_line_to_json(fields, version):
    """Convert one tab-split dbNSFP (v3.x layout) row into a MyVariant JSON doc.

    `version` selects which assembly ('hg19' or 'hg38') supplies the HGVS
    "_id".  Returns None for rows without an hg19 position.  Missing values
    ("." in dbNSFP) are swept and ";"-joined values split at the end.
    NOTE(review): column indices assume one fixed dbNSFP release layout —
    confirm against that release's readme before reusing.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    else:
        chromStart = int(fields[8])
        chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    # SiPhy base frequencies are encoded as "a:c:g:t"
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # pair GTEx genes with tissues ("|"-separated parallel lists)
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    # pair Uniprot accessions with their positions (";"-separated)
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # multi-valued prediction scores: split on ";" and map "." to None
    # (replaces 17 copy-pasted inline cleanup loops with _dot_to_none)
    provean_score = _dot_to_none(fields[52].split(';'))
    sift_score = _dot_to_none(fields[23].split(';'))
    hdiv_score = _dot_to_none(fields[29].split(';'))
    hvar_score = _dot_to_none(fields[32].split(';'))
    lrt_score = _dot_to_none(fields[35].split(';'))
    dann_score = _dot_to_none(fields[69].split(';'))
    mutationtaster_score = _dot_to_none(fields[39].split(';'))
    mutationassessor_score = _dot_to_none(fields[46].split(';'))
    vest3_score = _dot_to_none(fields[57].split(';'))
    metasvm_score = _dot_to_none(fields[59].split(';'))
    fathmm_score = _dot_to_none(fields[49].split(';'))
    lr_score = _dot_to_none(fields[62].split(';'))
    fathmm_coding_score = _dot_to_none(fields[71].split(';'))
    integrated_fitcons_score = _dot_to_none(fields[82].split(';'))
    gm12878_fitcons_score = _dot_to_none(fields[85].split(';'))
    h1_hesc_fitcons_score = _dot_to_none(fields[88].split(';'))
    huvec_fitcons_score = _dot_to_none(fields[91].split(';'))
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                # NOTE(review): fields[57] is also the score column above;
                # rankscore likely intended fields[58] — confirm vs readme
                "rankscore": fields[57],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }
    # sweep "." placeholders, flatten 1-element lists, split ";"-joined values
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields, version): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) if version == 'hg19': HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) elif version == 'hg38': HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
def _map_line_to_json(fields): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
def load(self, aslist=False):
    '''
    loading ncbi "gene_info" file
    This must be called first to create basic gene documents
    with all basic fields, e.g., name, symbol, synonyms, etc.

    format of gene_info file:
    #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs
             map_location description type_of_gene Symbol_from
             nomenclature_authority Full_name_from_nomenclature_authority
    Nomenclature_status Other_designations Modification_da
    te (tab is used as a separator, pound sign - start of a comment)

    Returns a dict keyed by GeneID, or a list of docs when aslist=True.
    '''
    load_start(self.datafile)
    # pick 11 columns, key on GeneID (col 1) -> 10-tuple values for _ff
    gene_d = tab2dict(self.datafile, (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14),
                      key=1, alwayslist=0, includefn=self.species_filter)

    def _ff(d):
        # build one gene document from the 10-tuple; "-" means "no value"
        (taxid, symbol, locus_tag, synonyms,
         dbxrefs, map_location,
         description, type_of_gene,
         other_designations, modification_date) = d
        out = dict(taxid=int(taxid), symbol=symbol, name=description)
        if map_location != '-':
            out['map_location'] = map_location
        if type_of_gene != '-':
            out['type_of_gene'] = type_of_gene
        if synonyms != '-':
            out['alias'] = normalized_value(synonyms.split('|'))
        if locus_tag != '-':
            out['locus_tag'] = locus_tag
        if other_designations != "-":
            out['other_names'] = normalized_value(
                other_designations.split('|'))
        # when merged, this will become the default timestamp
        out["_timestamp"] = datetime.datetime.strptime(
            modification_date, "%Y%m%d")
        # dbXrefs is "|"-separated "DB:ID" pairs
        for x in dbxrefs.split('|'):
            if x == '-':
                continue
            xd = x.split(':')
            if len(xd) == 3 and xd[0] == xd[1] and \
                    xd[0] in ['VGNC', 'HGNC', 'MGI']:
                # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                xd = xd[1:]
            try:
                _db, _id = xd
            except:
                # unexpected xref shape: log the raw value then re-raise
                print(repr(x))
                raise
            # we don't need ensembl xref from here, we will get it from
            # Ensembl directly
            if _db.lower() in ['ensembl', 'imgt/gene-db']:
                # we don't need 'IMGT/GENE-DB" xref either, because they
                # are mostly the same as gene symbol
                continue
            # add "MGI:" prefix for MGI ids.
            if _db.lower() == 'mgi':
                _id = "MGI:" + _id
            out[_db] = _id
        return out

    gene_d = value_convert(gene_d, _ff)

    # add entrezgene field
    for geneid in gene_d:
        d = gene_d[geneid]
        d['entrezgene'] = int(geneid)
        gene_d[geneid] = d

    load_done('[%d]' % len(gene_d))

    if aslist:
        return dict_to_list(gene_d)
    else:
        return gene_d
def _map_line_to_json(fields):
    """Convert one CADD (older column layout) row into a MyVariant JSON doc.

    Rows that do not have exactly VALID_COLUMN_NO columns are silently
    skipped (implicit None return).  "NA" placeholders are swept at the end.
    """
    if len(fields) == VALID_COLUMN_NO:
        chrom = fields[0]
        chromStart = fields[1]
        allele1 = fields[2]
        allele2 = fields[4]
        # SNV-style HGVS id, e.g. "chr1:g.12345A>C"
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, allele1, allele2)
        # load as json data
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "cadd": {
                'chrom': fields[0],
                'pos': fields[1],
                'ref': fields[2],
                'anc': fields[3],
                'alt': fields[4],
                'type': fields[5],
                'length': fields[6],
                'istv': fields[7],
                'isderived': fields[8],
                'annotype': fields[9],
                'consequence': fields[10],
                'consscore': fields[11],
                'consdetail': fields[12],
                'gc': fields[13],
                'cpg': fields[14],
                'mapability': {
                    '20bp': fields[15],
                    '35bp': fields[16]
                },
                'scoresegdup': fields[17],
                'phast_cons': {
                    'primate': fields[18],
                    'mammalian': fields[19],
                    'vertebrate': fields[20]
                },
                'phylop': {
                    'primate': fields[21],
                    'mammalian': fields[22],
                    'vertebrate': fields[23]
                },
                'gerp': {
                    'n': fields[24],
                    's': fields[25],
                    'rs': fields[26],
                    'rs_pval': fields[27]
                },
                'bstatistic': fields[28],
                'encode': {
                    'exp': fields[29],
                    'h3k27ac': fields[30],
                    'h3k4me1': fields[31],
                    'h3k4me3': fields[32],
                    'nucleo': fields[33],
                    'occ': fields[34],
                    'p_val': {
                        'comb': fields[35],
                        'dnas': fields[36],
                        'faire': fields[37],
                        'polii': fields[38],
                        'ctcf': fields[39],
                        'mycp': fields[40]
                    },
                    'sig': {
                        'dnase': fields[41],
                        'faire': fields[42],
                        'polii': fields[43],
                        'ctcf': fields[44],
                        'myc': fields[45]
                    },
                },
                'segway': fields[46],
                'motif': {
                    'toverlap': fields[47],
                    'dist': fields[48],
                    'ecount': fields[49],
                    'ename': fields[50],
                    'ehipos': fields[51],
                    'escorechng': fields[52]
                },
                'tf': {
                    'bs': fields[53],
                    'bs_peaks': fields[54],
                    'bs_peaks_max': fields[55]
                },
                'isknownvariant': fields[56],
                'esp': {
                    'af': fields[57],
                    'afr': fields[58],
                    'eur': fields[59]
                },
                '1000g': {
                    'af': fields[60],
                    'asn': fields[61],
                    'amr': fields[62],
                    'afr': fields[63],
                    'eur': fields[64]
                },
                'min_dist_tss': fields[65],
                'min_dist_tse': fields[66],
                'gene': {
                    'gene_id': fields[67],
                    'feature_id': fields[68],
                    'ccds_id': fields[69],
                    'genename': fields[70],
                    'cds': {
                        'cdna_pos': fields[71],
                        'rel_cdna_pos': fields[72],
                        'cds_pos': fields[73],
                        'rel_cds_pos': fields[74]
                    },
                    'prot': {
                        'protpos': fields[75],
                        'rel_prot_pos': fields[76],
                        'oaa': fields[81],
                        'naa': fields[82]
                    },
                    'dst_2_splice': fields[77],
                    'dst_2_spltype': fields[78],
                    'exon': fields[79],
                    'intron': fields[80]
                },
                'grantham': fields[83],
                'polyphen': {
                    'cat': fields[84],
                    'val': fields[85]
                },
                'sift': {
                    'cat': fields[86],
                    'val': fields[87]
                },
                'rawscore': fields[88],
                'phred': fields[89]
            }
        }
        return dict_sweep(unlist(value_convert(one_snp_json)), "NA")
def _map_line_to_json(cp):
    """Convert one parsed ClinVar XML record (`cp`, a generateDS-style
    ClinVarSet object) into MyVariant JSON docs.

    Generator: yields one document per Measure in the record's MeasureSet,
    skipping 'Variation'/'protein only'/'Microsatellite' measures and any
    measure for which no HGVS id can be constructed.
    """
    # ReferenceClinVarAssertion sub-elements are optional in the XML, hence
    # the try/except fallbacks to None
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None
    variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # condition (trait) names, synonyms, identifiers and age of onset come
    # from the first Trait of the TraitSet
    trait = cp.ReferenceClinVarAssertion.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for name in trait.Name:
        if name.ElementValue.Type == 'Alternate':
            synonyms.append(name.ElementValue.get_valueOf_())
        if name.ElementValue.Type == 'Preferred':
            conditions_name += name.ElementValue.get_valueOf_()
    identifiers = {}
    for item in trait.XRef:
        if item.DB == 'Human Phenotype Ontology':
            key = 'Human_Phenotype_Ontology'
        else:
            key = item.DB
        identifiers[key.lower()] = item.ID
    for symbol in trait.Symbol:
        if symbol.ElementValue.Type == 'Preferred':
            conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()
    # MeasureSet.Measure return a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    # fall back to GRCh38 alleles only when GRCh37 had none
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # keep a single cytogenetic location as a scalar, several as a list
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have format different\
            # from other types, should be dealt with seperately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # stop at the first top-level-previous genomic HGVS
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items whose type belong to 'Indel, Insertion, \
                # Duplication' might not hava explicit alt information, \
                # so we will parse from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position + 3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                            (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                        (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                (chrom, chromStart, chromEnd, dup_ref)
                elif variation_type == 'copy number loss' or\
                        variation_type == 'copy number gain':
                    if hgvs_genome:
                        hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                            hgvs_genome.split('.')[2]
            elif hgvs_coding:
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        for key in HGVS:
            HGVS[key].sort()
        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": {
                            "name": conditions_name,
                            "synonyms": synonyms,
                            "identifiers": identifiers,
                            "age_of_onset": age_of_onset
                        }
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            # the listed fields are excluded from value_convert's
            # string-to-number conversion (ids must stay strings)
            obj = (dict_sweep(
                unlist(
                    value_convert(one_snp_json, [
                        'chrom', 'omim', 'id', 'orphanet', 'gene',
                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                    ])), [None, '', 'None']))
            yield obj
def _map_line_to_json(fields):
    """Convert one pre-split CADD TSV row into a myvariant-style JSON doc.

    ``fields`` is a list of column strings; the layout is positional and
    must match ``VALID_COLUMN_NO`` exactly (asserted below).  Yields a
    single dict keyed by an HGVS ``_id``, or yields nothing when no HGVS
    id can be built from the VCF-style columns.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    # fields[3] is the ancestral allele ('anc' below), not the alt.
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    # load as json data
    if HGVS is None:
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # NOTE(review): fields[45] was a second 'enh' column and is
                # deliberately skipped here to avoid a duplicate dict key —
                # confirm against the CADD column spec which of the two
                # enhancer states it represents.
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],  # ref aa
            'naa': fields[108],  # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],  # raw CADD score
            'phred': fields[115]  # log-percentile of raw CADD score
        }
    }
    # Strip "NA" placeholders and collapse single-item lists before yielding.
    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
def load(self, aslist=False): ''' loading ncbi "gene_info" file This must be called first to create basic gene documents with all basic fields, e.g., name, symbol, synonyms, etc. format of gene_info file: #Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from _nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_da te (tab is used as a separator, pound sign - start of a comment) ''' load_start(self.datafile) gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1, alwayslist=0, includefn=self.species_filter) def _ff(d): ( taxid, symbol, synonyms, dbxrefs, map_location, description, type_of_gene ) = d out = dict(taxid=int(taxid), symbol=symbol, name=description) if map_location != '-': out['map_location'] = map_location if type_of_gene != '-': out['type_of_gene'] = type_of_gene if synonyms != '-': out['alias'] = normalized_value(synonyms.split('|')) for x in dbxrefs.split('|'): if x == '-': continue xd = x.split(':') if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ['HGNC', 'MGI']: xd = xd[1:] # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328' try: _db, _id = xd except: print(x) raise if _db.lower() in ['ensembl', 'imgt/gene-db']: # we don't need ensembl xref from here, we will get it from Ensembl directly continue # we don't need 'IMGT/GENE-DB" xref either, because they are mostly the same as gene symbol if _db.lower() == 'mgi': # add "MGI:" prefix for MGI ids. _id = "MGI:"+_id out[_db] = _id return out gene_d = value_convert(gene_d, _ff) # add entrezgene field for geneid in gene_d: d = gene_d[geneid] d['entrezgene'] = int(geneid) gene_d[geneid] = d load_done('[%d]' % len(gene_d)) if aslist: return dict_to_list(gene_d) else: return gene_d
def _map_line_to_json(cp):
    """Convert one parsed ClinVar record into myvariant JSON documents.

    ``cp`` is presumably a generateDS-style object for one ClinVarSet
    record (attribute access mirrors the XML schema) — TODO confirm
    against the caller.  Yields one document per usable Measure; records
    without a derivable HGVS id are skipped.
    """
    # record-level fields, shared by every Measure of this record
    clinical_siginificance = cp.ReferenceClinVarAssertion.\
        ClinicalSignificance.Description
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        ReviewStatus
    last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        DateLastEvaluated
    CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:  # NOTE(review): bare except — any missing attribute/index falls back to None
        origin = None
    # MeasureSet.Measure return a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:  # NOTE(review): bare except — symbol is best-effort
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # single location → scalar; multiple locations → list
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have format different\
            # from other types, should be dealt with seperately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        # integerValue 37 marks the GRCh37 assembly entry
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # stop at the first genomic-top-level-previous entry
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items whose type belong to 'Indel, Insertion, \
                # Duplication' might not hava explicit alt information, \
                # so we will parse from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position + 3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                            (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                        (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome:
                    hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                        hgvs_genome.split('.')[2]
            elif hgvs_coding:
                # no GRCh37 coordinates: fall back to the coding HGVS
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        other_ids = ''
        rsid = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';'
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_siginificance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": CLINVAR_ID,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            # drop None values and collapse single-item lists before yielding
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
def _map_line_to_json(df, version, index):
    """Convert one dbNSFP (v3.x) row of DataFrame ``df`` into a JSON doc.

    :param df: pandas DataFrame of the dbNSFP chunk (string cells, '.'
               marks missing values).
    :param version: 'hg19' or 'hg38' — selects which HGVS id becomes ``_id``.
    :param index: row label passed to ``df.get_value``.
    :return: the JSON document dict, or None when no hg19 position exists.

    Fix vs. previous revision: the '.'->None cleanup of the per-transcript
    score lists used a discarded list comprehension (a no-op); it now
    mutates each list in place via slice assignment.
    """
    # specific variable treatment
    chrom = df.get_value(index, "#chr")
    if chrom == 'M':
        chrom = 'MT'
    # hg18 end position; keep '.' as-is, otherwise make it an int
    hg18_end = df.get_value(index, "hg18_pos(1-based)")
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df.get_value(index, "hg19_pos(1-based)") == '.':
        return None
    chromStart = int(df.get_value(index, "hg19_pos(1-based)"))
    chromEnd = chromStart
    chromStart_38 = int(df.get_value(index, "pos(1-based)"))
    ref = df.get_value(index, "ref").upper()
    alt = df.get_value(index, "alt").upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    # SiPhy pi is encoded as "a:c:g:t" frequencies
    siphy_29way_pi = df.get_value(index, "SiPhy_29way_pi")
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # pair up parallel '|'-separated GTEx gene/tissue columns
    gtex_gene = df.get_value(index, "GTEx_V6_gene").split('|')
    gtex_tissue = df.get_value(index, "GTEx_V6_tissue").split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    # pair up parallel ';'-separated Uniprot acc/pos columns
    acc = df.get_value(
        index, "Uniprot_acc_Polyphen2").rstrip().rstrip(';').split(";")
    pos = df.get_value(
        index, "Uniprot_aapos_Polyphen2").rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # per-transcript score lists (';'-separated)
    provean_score = df.get_value(index, "PROVEAN_score").split(';')
    sift_score = df.get_value(index, "SIFT_score").split(';')
    hdiv_score = df.get_value(index, "Polyphen2_HDIV_score").split(';')
    hvar_score = df.get_value(index, "Polyphen2_HVAR_score").split(';')
    lrt_score = df.get_value(index, "LRT_score").split(';')
    m_cap_score = df.get_value(index, "M-CAP_score").split(';')
    mutationtaster_score = df.get_value(index,
                                        "MutationTaster_score").split(';')
    mutationassessor_score = df.get_value(index,
                                          "MutationAssessor_score").split(';')
    vest3_score = df.get_value(index, "VEST3_score").split(';')
    metasvm_score = df.get_value(index, "MetaSVM_score").split(';')
    fathmm_score = df.get_value(index, "FATHMM_score").split(';')
    metalr_score = df.get_value(index, "MetaLR_score").split(';')
    modify_score_list = [
        provean_score, sift_score, hdiv_score, hvar_score, lrt_score,
        m_cap_score, mutationtaster_score, mutationassessor_score,
        vest3_score, metasvm_score, fathmm_score, metalr_score
    ]
    # replace '.' placeholders with None, mutating each list in place so
    # the named score variables above (already embedded below) see the
    # cleaned values.  (Previously the comprehension result was discarded.)
    for _score in modify_score_list:
        _score[:] = [None if item == '.' else item for item in _score]
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df.get_value(index, "rs_dbSNP147"),
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df.get_value(index, "hg18_pos(1-based)"),
                "end": hg18_end
            },
            "hg38": {
                "start": df.get_value(index, "pos(1-based)"),
                "end": df.get_value(index, "pos(1-based)")
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df.get_value(index, "aaref"),
                "alt": df.get_value(index, "aaalt"),
                "pos": df.get_value(index, "aapos"),
                "refcodon": df.get_value(index, "refcodon"),
                "codonpos": df.get_value(index, "codonpos"),
                "codon_degeneracy": df.get_value(index, "codon_degeneracy"),
            },
            "genename": df.get_value(index, "genename"),
            "uniprot": uniprot,
            "interpro_domain": df.get_value(index, "Interpro_domain"),
            "cds_strand": df.get_value(index, "cds_strand"),
            "ancestral_allele": df.get_value(index, "Ancestral_allele"),
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df.get_value(index, "Ensembl_geneid"),
                "transcriptid": df.get_value(index, "Ensembl_transcriptid"),
                "proteinid": df.get_value(index, "Ensembl_proteinid")
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df.get_value(index,
                                                    "SIFT_converted_rankscore"),
                "pred": df.get_value(index, "SIFT_pred")
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df.get_value(index,
                                              "Polyphen2_HDIV_rankscore"),
                    "pred": df.get_value(index, "Polyphen2_HDIV_pred")
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df.get_value(index,
                                              "Polyphen2_HVAR_rankscore"),
                    "pred": df.get_value(index, "Polyphen2_HVAR_pred")
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df.get_value(index,
                                                    "LRT_converted_rankscore"),
                "pred": df.get_value(index, "LRT_pred"),
                "omega": df.get_value(index, "LRT_Omega")
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df.get_value(
                    index, "MutationTaster_converted_rankscore"),
                "pred": df.get_value(index, "MutationTaster_pred"),
                "model": df.get_value(index, "MutationTaster_model"),
                "AAE": df.get_value(index, "MutationTaster_AAE")
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df.get_value(index,
                                          "MutationAssessor_score_rankscore"),
                "pred": df.get_value(index, "MutationAssessor_pred")
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df.get_value(index, "FATHMM_converted_rankscore"),
                "pred": df.get_value(index, "FATHMM_pred")
            },
            "provean": {
                "score": provean_score,
                "rankscore": df.get_value(index,
                                          "PROVEAN_converted_rankscore"),
                "pred": df.get_value(index, "PROVEAN_pred")
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df.get_value(index, "VEST3_rankscore"),
                "transcriptid": df.get_value(index, "Transcript_id_VEST3"),
                "transcriptvar": df.get_value(index, "Transcript_var_VEST3")
            },
            "fathmm-mkl": {
                "coding_score": df.get_value(index, "fathmm-MKL_coding_score"),
                "coding_rankscore": df.get_value(index,
                                                 "fathmm-MKL_coding_rankscore"),
                "coding_pred": df.get_value(index, "fathmm-MKL_coding_pred"),
                "coding_group": df.get_value(index, "fathmm-MKL_coding_group")
            },
            "eigen": {
                "coding_or_noncoding": df.get_value(
                    index, "Eigen_coding_or_noncoding"),
                "raw": df.get_value(index, "Eigen-raw"),
                "phred": df.get_value(index, "Eigen-phred")
            },
            "eigen-pc": {
                "raw": df.get_value(index, "Eigen-PC-raw"),
                "phred": df.get_value(index, "Eigen-PC-phred"),
                "raw_rankscore": df.get_value(index, "Eigen-PC-raw_rankscore")
            },
            "genocanyon": {
                "score": df.get_value(index, "GenoCanyon_score"),
                "rankscore": df.get_value(index, "GenoCanyon_score_rankscore")
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df.get_value(index, "MetaSVM_rankscore"),
                "pred": df.get_value(index, "MetaSVM_pred")
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df.get_value(index, "MetaLR_rankscore"),
                "pred": df.get_value(index, "MetaLR_pred")
            },
            "reliability_index": df.get_value(index, "Reliability_index"),
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df.get_value(index, "M-CAP_rankscore"),
                "pred": df.get_value(index, "M-CAP_pred")
            },
            "dann": {
                "score": df.get_value(index, "DANN_score"),
                "rankscore": df.get_value(index, "DANN_rankscore")
            },
            "gerp++": {
                "nr": df.get_value(index, "GERP++_NR"),
                "rs": df.get_value(index, "GERP++_RS"),
                "rs_rankscore": df.get_value(index, "GERP++_RS_rankscore")
            },
            "integrated": {
                "fitcons_score": df.get_value(index,
                                              "integrated_fitCons_score"),
                "fitcons_rankscore": df.get_value(
                    index, "integrated_fitCons_score_rankscore"),
                "confidence_value": df.get_value(
                    index, "integrated_confidence_value")
            },
            "gm12878": {
                "fitcons_score": df.get_value(index, "GM12878_fitCons_score"),
                "fitcons_rankscore": df.get_value(
                    index, "GM12878_fitCons_score_rankscore"),
                "confidence_value": df.get_value(index,
                                                 "GM12878_confidence_value")
            },
            "h1-hesc": {
                "fitcons_score": df.get_value(index, "H1-hESC_fitCons_score"),
                "fitcons_rankscore": df.get_value(
                    index, "H1-hESC_fitCons_score_rankscore"),
                "confidence_value": df.get_value(index,
                                                 "H1-hESC_confidence_value")
            },
            "huvec": {
                "fitcons_score": df.get_value(index, "HUVEC_fitCons_score"),
                "fitcons_rankscore": df.get_value(
                    index, "HUVEC_fitCons_score_rankscore"),
                "confidence_value": df.get_value(index,
                                                 "HUVEC_confidence_value")
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df.get_value(index,
                                               "phyloP100way_vertebrate"),
                    "vertebrate_rankscore": df.get_value(
                        index, "phyloP100way_vertebrate_rankscore")
                },
                "p20way": {
                    "mammalian": df.get_value(index, "phyloP20way_mammalian"),
                    "mammalian_rankscore": df.get_value(
                        index, "phyloP20way_mammalian_rankscore")
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df.get_value(index,
                                               "phastCons100way_vertebrate"),
                    "vertebrate_rankscore": df.get_value(
                        index, "phastCons100way_vertebrate_rankscore")
                },
                "20way": {
                    "mammalian": df.get_value(index,
                                              "phastCons20way_mammalian"),
                    "mammalian_rankscore": df.get_value(
                        index, "phastCons20way_mammalian_rankscore")
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df.get_value(index, "SiPhy_29way_logOdds"),
                "logodds_rankscore": df.get_value(
                    index, "SiPhy_29way_logOdds_rankscore")
            },
            "1000gp3": {
                "ac": df.get_value(index, "1000Gp3_AC"),
                "af": df.get_value(index, "1000Gp3_AF"),
                "afr_ac": df.get_value(index, "1000Gp3_AFR_AC"),
                "afr_af": df.get_value(index, "1000Gp3_AFR_AF"),
                "eur_ac": df.get_value(index, "1000Gp3_EUR_AC"),
                "eur_af": df.get_value(index, "1000Gp3_EUR_AF"),
                "amr_ac": df.get_value(index, "1000Gp3_AMR_AC"),
                "amr_af": df.get_value(index, "1000Gp3_AMR_AF"),
                "eas_ac": df.get_value(index, "1000Gp3_EAS_AC"),
                "eas_af": df.get_value(index, "1000Gp3_EAS_AF"),
                "sas_ac": df.get_value(index, "1000Gp3_SAS_AC"),
                "sas_af": df.get_value(index, "1000Gp3_SAS_AF")
            },
            "twinsuk": {
                "ac": df.get_value(index, "TWINSUK_AC"),
                "af": df.get_value(index, "TWINSUK_AF")
            },
            "alspac": {
                "ac": df.get_value(index, "ALSPAC_AC"),
                "af": df.get_value(index, "ALSPAC_AF")
            },
            "esp6500": {
                "aa_ac": df.get_value(index, "ESP6500_AA_AC"),
                "aa_af": df.get_value(index, "ESP6500_AA_AF"),
                "ea_ac": df.get_value(index, "ESP6500_EA_AC"),
                "ea_af": df.get_value(index, "ESP6500_EA_AF")
            },
            "exac": {
                "ac": df.get_value(index, "ExAC_AC"),
                "af": df.get_value(index, "ExAC_AF"),
                "adj_ac": df.get_value(index, "ExAC_Adj_AC"),
                "adj_af": df.get_value(index, "ExAC_Adj_AF"),
                "afr_ac": df.get_value(index, "ExAC_AFR_AC"),
                "afr_af": df.get_value(index, "ExAC_AFR_AF"),
                "amr_ac": df.get_value(index, "ExAC_AMR_AC"),
                "amr_af": df.get_value(index, "ExAC_AMR_AF"),
                "eas_ac": df.get_value(index, "ExAC_EAS_AC"),
                "eas_af": df.get_value(index, "ExAC_EAS_AF"),
                "fin_ac": df.get_value(index, "ExAC_FIN_AC"),
                "fin_af": df.get_value(index, "ExAC_FIN_AF"),
                "nfe_ac": df.get_value(index, "ExAC_NFE_AC"),
                "nfe_af": df.get_value(index, "ExAC_NFE_AF"),
                "sas_ac": df.get_value(index, "ExAC_SAS_AC"),
                "sas_af": df.get_value(index, "ExAC_SAS_AF")
            },
            "exac_nontcga": {
                "ac": df.get_value(index, "ExAC_nonTCGA_AC"),
                "af": df.get_value(index, "ExAC_nonTCGA_AF"),
                "adj_ac": df.get_value(index, "ExAC_nonTCGA_Adj_AC"),
                "adj_af": df.get_value(index, "ExAC_nonTCGA_Adj_AF"),
                "afr_ac": df.get_value(index, "ExAC_nonTCGA_AFR_AC"),
                "afr_af": df.get_value(index, "ExAC_nonTCGA_AFR_AF"),
                "amr_ac": df.get_value(index, "ExAC_nonTCGA_AMR_AC"),
                "amr_af": df.get_value(index, "ExAC_nonTCGA_AMR_AF"),
                "eas_ac": df.get_value(index, "ExAC_nonTCGA_EAS_AC"),
                "eas_af": df.get_value(index, "ExAC_nonTCGA_EAS_AF"),
                "fin_ac": df.get_value(index, "ExAC_nonTCGA_FIN_AC"),
                "fin_af": df.get_value(index, "ExAC_nonTCGA_FIN_AF"),
                "nfe_ac": df.get_value(index, "ExAC_nonTCGA_NFE_AC"),
                "nfe_af": df.get_value(index, "ExAC_nonTCGA_NFE_AF"),
                "sas_ac": df.get_value(index, "ExAC_nonTCGA_SAS_AC"),
                "sas_af": df.get_value(index, "ExAC_nonTCGA_SAS_AF")
            },
            "exac_nonpsych": {
                "ac": df.get_value(index, "ExAC_nonpsych_AC"),
                "af": df.get_value(index, "ExAC_nonpsych_AF"),
                "adj_ac": df.get_value(index, "ExAC_nonpsych_Adj_AC"),
                "adj_af": df.get_value(index, "ExAC_nonpsych_Adj_AF"),
                "afr_ac": df.get_value(index, "ExAC_nonpsych_AFR_AC"),
                "afr_af": df.get_value(index, "ExAC_nonpsych_AFR_AF"),
                "amr_ac": df.get_value(index, "ExAC_nonpsych_AMR_AC"),
                "amr_af": df.get_value(index, "ExAC_nonpsych_AMR_AF"),
                "eas_ac": df.get_value(index, "ExAC_nonpsych_EAS_AC"),
                "eas_af": df.get_value(index, "ExAC_nonpsych_EAS_AF"),
                "fin_ac": df.get_value(index, "ExAC_nonpsych_FIN_AC"),
                "fin_af": df.get_value(index, "ExAC_nonpsych_FIN_AF"),
                "nfe_ac": df.get_value(index, "ExAC_nonpsych_NFE_AC"),
                "nfe_af": df.get_value(index, "ExAC_nonpsych_NFE_AF"),
                "sas_ac": df.get_value(index, "ExAC_nonpsych_SAS_AC"),
                "sas_af": df.get_value(index, "ExAC_nonpsych_SAS_AF")
            },
            "clinvar": {
                "rs": df.get_value(index, "clinvar_rs"),
                "clinsig": df.get_value(index, "clinvar_clnsig"),
                "trait": df.get_value(index, "clinvar_trait"),
                "golden_stars": df.get_value(index, "clinvar_golden_stars")
            },
            "gtex": gtex
        }
    }
    # sweep '.' placeholders, collapse single-item lists, split ';' strings
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields, version='hg19'):
    """Convert one pre-split dbNSFP (v2.x layout) row into a JSON doc.

    ``fields`` is the list of column strings for one row ('.' marks a
    missing value).  ``version`` ('hg19' or 'hg38') selects which HGVS id
    is used as the document ``_id``.  Returns the document dict.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] is hg18_pos; keep '.' as-is, otherwise convert to int
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    chromStart = int(fields[8])
    chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    # SiPhy pi column is encoded as "a:c:g:t" frequencies
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # pair up parallel ';'-separated Uniprot acc/pos columns
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }
    # sweep '.' placeholders, collapse single-item lists, split ';' strings
    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)),
                                         vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields, version='hg19'):
    """Convert one pre-split dbNSFP (v2.x layout) row into a JSON doc.

    NOTE(review): this function is a byte-for-byte duplicate of the
    preceding ``_map_line_to_json`` — confirm whether one copy can be
    removed or whether they live in separate modules.

    ``fields`` is the list of column strings for one row ('.' marks a
    missing value).  ``version`` ('hg19' or 'hg38') selects which HGVS id
    is used as the document ``_id``.  Returns the document dict.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] is hg18_pos; keep '.' as-is, otherwise convert to int
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    chromStart = int(fields[8])
    chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    # SiPhy pi column is encoded as "a:c:g:t" frequencies
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # pair up parallel ';'-separated Uniprot acc/pos columns
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }
    # sweep '.' placeholders, collapse single-item lists, split ';' strings
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields, version):
    """Convert one tab-split dbNSFP row (v3.x-era column layout) into a
    JSON-ready document.

    Parameters:
        fields:  list of column strings from one dbNSFP flat-file row.
                 Indices are positional (e.g. gtex at 181/182, interpro at
                 180) -- verify against the release's header row.
        version: 'hg19' or 'hg38'; selects the assembly used for "_id".

    Returns:
        dict keyed by "_id" (HGVS id) with a "dbnsfp" sub-document, or
        None when the row carries no hg19 position.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'  # normalize mitochondrial chromosome name
    # fields[7] in version 2, represent hg18_pos
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    else:
        chromStart = int(fields[8])
        chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    # NOTE(review): any other `version` value leaves HGVS unbound and
    # raises NameError below -- callers must pass 'hg19' or 'hg38'.
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    # SiPhy base frequencies arrive colon-separated; key order here assumes
    # A:C:G:T -- confirm against the dbNSFP readme.
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    # Pair up parallel '|'-separated GTEx gene/tissue lists.
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t),
                         zip(gtex_gene, gtex_tissue)))
    # Pair up parallel ';'-separated uniprot accession/position lists.
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    # Per-transcript scores come ';'-separated; keep them as lists.
    provean_score = fields[52].split(';')
    sift_score = fields[23].split(';')
    hdiv_score = fields[29].split(';')
    hvar_score = fields[32].split(';')
    lrt_score = fields[35].split(';')
    dann_score = fields[69].split(';')
    mutationtaster_score = fields[39].split(';')
    mutationassessor_score = fields[46].split(';')
    vest3_score = fields[57].split(';')
    metasvm_score = fields[59].split(';')
    fathmm_score = fields[49].split(';')
    lr_score = fields[62].split(';')
    fathmm_coding_score = fields[71].split(';')
    integrated_fitcons_score = fields[82].split(';')
    gm12878_fitcons_score = fields[85].split(';')
    h1_hesc_fitcons_score = fields[88].split(';')
    huvec_fitcons_score = fields[91].split(';')
    # For multi-valued score lists, replace '.' placeholders with None so
    # positions still line up per transcript. Single-valued '.' entries are
    # left alone; dict_sweep removes them at the end.
    if len(provean_score) > 1:
        for i in range(len(provean_score)):
            if provean_score[i] == '.':
                provean_score[i] = None
    if len(sift_score) > 1:
        for i in range(len(sift_score)):
            if sift_score[i] == '.':
                sift_score[i] = None
    if len(hdiv_score) > 1:
        for i in range(len(hdiv_score)):
            if hdiv_score[i] == '.':
                hdiv_score[i] = None
    if len(hvar_score) > 1:
        for i in range(len(hvar_score)):
            if hvar_score[i] == '.':
                hvar_score[i] = None
    if len(lrt_score) > 1:
        for i in range(len(lrt_score)):
            if lrt_score[i] == '.':
                lrt_score[i] = None
    if len(mutationtaster_score) > 1:
        for i in range(len(mutationtaster_score)):
            if mutationtaster_score[i] == '.':
                mutationtaster_score[i] = None
    if len(mutationassessor_score) > 1:
        for i in range(len(mutationassessor_score)):
            if mutationassessor_score[i] == '.':
                mutationassessor_score[i] = None
    if len(metasvm_score) > 1:
        for i in range(len(metasvm_score)):
            if metasvm_score[i] == '.':
                metasvm_score[i] = None
    if len(vest3_score) > 1:
        for i in range(len(vest3_score)):
            if vest3_score[i] == '.':
                vest3_score[i] = None
    if len(fathmm_score) > 1:
        for i in range(len(fathmm_score)):
            if fathmm_score[i] == '.':
                fathmm_score[i] = None
    if len(lr_score) > 1:
        for i in range(len(lr_score)):
            if lr_score[i] == '.':
                lr_score[i] = None
    if len(fathmm_coding_score) > 1:
        for i in range(len(fathmm_coding_score)):
            if fathmm_coding_score[i] == '.':
                fathmm_coding_score[i] = None
    if len(dann_score) > 1:
        for i in range(len(dann_score)):
            if dann_score[i] == '.':
                dann_score[i] = None
    if len(integrated_fitcons_score) > 1:
        for i in range(len(integrated_fitcons_score)):
            if integrated_fitcons_score[i] == '.':
                integrated_fitcons_score[i] = None
    if len(gm12878_fitcons_score) > 1:
        for i in range(len(gm12878_fitcons_score)):
            if gm12878_fitcons_score[i] == '.':
                gm12878_fitcons_score[i] = None
    if len(h1_hesc_fitcons_score) > 1:
        for i in range(len(h1_hesc_fitcons_score)):
            if h1_hesc_fitcons_score[i] == '.':
                h1_hesc_fitcons_score[i] = None
    if len(huvec_fitcons_score) > 1:
        for i in range(len(huvec_fitcons_score)):
            if huvec_fitcons_score[i] == '.':
                huvec_fitcons_score[i] = None
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": fields[57],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }
    # Drop "." placeholders, flatten single-element lists, split remaining
    # ';' lists, then force chrom back to str (value_convert may coerce it).
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(cp):
    """Generator: map one parsed ClinVarSet object `cp` (generateDS-style
    binding over the ClinVar XML release, judging by get_valueOf_/valueOf_
    accessors) to zero or more JSON-ready documents, one per Measure.

    Yields:
        dict keyed by "_id" (an HGVS-style id built from GRCh37 coordinates
        or, failing that, the coding HGVS string) with a "clinvar"
        sub-document. Measures whose id cannot be determined are skipped
        (and the whole record is abandoned via `return`).
    """
    clinical_siginificance = cp.ReferenceClinVarAssertion.\
        ClinicalSignificance.Description
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        ReviewStatus
    last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        DateLastEvaluated
    CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # MeasureSet.Measure return a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            # NOTE(review): only the first relationship/XRef is consulted;
            # symbol lookup is best-effort, gene_id lookup is not guarded
            # and can raise IndexError if XRef is empty -- confirm intent.
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # single cytogenetic location -> scalar; multiple -> keep the list
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        # Buckets for every HGVS expression attached to this Measure.
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have format different\
            # from other types, should be dealt with seperately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    # keep the GRCh37 (integerValue == 37) top-level
                    # genomic HGVS for later id construction
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    # NOTE(review): substring tests -- 'non-coding' types
                    # also contain 'coding'; the first `if` (not elif)
                    # means 'genomic' entries are bucketed independently.
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.Attribute.
                                               get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.Attribute.
                                                  get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.
                                                  Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # stop at the first 'top level, previous' entry
                        break
            # Build the "_id" HGVS string; preference order: GRCh37
            # coordinates, then CNV genome string, then coding HGVS.
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items whose type belong to 'Indel, Insertion, \
                # Duplication' might not hava explicit alt information, \
                # so we will parse from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position+3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                            (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                        (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position+3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position+3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome:
                    # rebuild "chr<...>" id from the dotted genome HGVS
                    hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                        hgvs_genome.split('.')[2]
            elif hgvs_coding:
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                # no usable id for this record: abandon the generator
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        other_ids = ''
        rsid = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';'
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_siginificance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": CLINVAR_ID,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            # sweep out None values and flatten single-element lists
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
def _map_line_to_json(fields):
    """Convert one tab-split ClinVar flat-file row into a JSON-ready doc.

    Classifies the variant from its coding-HGVS string (fields[18]) and its
    declared type (fields[1]), pulls REF/ALT from the module-level
    `vcf_reader` at the row's hg19 position, and builds an HGVS "_id".

    Parameters:
        fields: list of column strings; must have exactly VALID_COLUMN_NO
                entries. Indices are positional -- verify against the
                ClinVar release's header.

    Returns:
        dict keyed by "_id" with a "clinvar" sub-document, or None when no
        HGVS id could be constructed / no VCF record was found.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]
    HGVS = None
    # fields[18] looks like "NM_...:c.123A>G"; keep the part after ':'
    cds = fields[18].split(":")
    cds = cds[1]
    # classify the variant from the cds string / declared type
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    # the flags are mutually exclusive: an 'ins' match wins over the
    # declared deletion/indel type, and vice versa
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None
    # parse from vcf file. Input chrom number
    # and chromStart, and return REF, ALT
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if record:
        REF = record.REF
        ALT = record.ALT
        ALT = ALT[0]  # only the first ALT allele is used
        if record.is_snp and len(ALT) < 2:
            mod = [REF, ALT]
        else:
            mod = ALT
    else:
        # no VCF record at this position -> drop the row
        return
    # build the HGVS id; branch order matters (sub before ins before del...)
    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart,
                                              chromEnd, mod)
        except AttributeError:
            print "ERROR:", fields[1], cds
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print 'ERROR:', fields[1], cds
    # load as json data
    if HGVS is None:
        print 'None:', fields[1], cds
        return None
    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": 'rs' + str(fields[6]),
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    # sweep out "-" placeholders and flatten single-element lists
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])