def _map_line_to_json(fields):
    """Build the EMV document for one split input row.

    ``fields[0]`` is split on ``":"``.  The chromosome number is read from
    the part before the colon, and the part after it is appended verbatim to
    form the ``_id`` (``chr<chrom>:<rest>``).  Chromosome ``23`` is reported
    as ``X``.  ``fields[2]`` to ``fields[10]`` are copied into the ``emv``
    sub-document; ``fields[9]`` is split on ``" | "`` into a list.

    Returns ``unlist(dict_sweep(value_convert(doc), vals=[""]))``.
    """
    vid = fields[0].split(":")
    # The chromosome number may contain zeros after its first digit (10, 20).
    # The previous pattern ``[1-9]+`` stopped at the first "0", so a value
    # ending in "...10" was read as chromosome 1.
    chrom = re.search(r'[1-9][0-9]*', vid[0]).group()
    if chrom == '23':
        chrom = 'X'
    HGVS = "chr%s:%s" % (chrom, vid[1])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }
    return unlist(dict_sweep(value_convert(one_snp_json), vals=[""]))
def _map_line_to_json(fields):
    """Turn one split DrugBank row into a variant document.

    The row must contain exactly ``VALID_CO_NUMBER`` columns.  Column 1 is
    used as the document ``_id``; columns 0 and 2-7 populate the ``drugbank``
    sub-document.  Returns ``None`` when the id column is ``None``, otherwise
    the document after ``dict_sweep`` has removed ``'Not Available'`` values.
    """
    assert len(fields) == VALID_CO_NUMBER
    variant_id = fields[1]
    if variant_id is None:
        return
    annotation = {
        'drug': fields[2],
        'interacting_gene_or_enzyme': fields[3],
        'snp_rs_id': fields[0],
        'allele_name': fields[4],
        'defining_change': fields[5],
        'adverse_reaction': fields[6],
        'references': fields[7]
    }
    document = {"_id": variant_id, 'drugbank': annotation}
    return dict_sweep(document, ['Not Available'])
def _map_line_to_json(item):
    """Yield one geno2mp document per alternate allele of a VCF record.

    ``item`` must expose ``CHROM``, ``POS``, ``REF``, ``ALT`` and an ``INFO``
    mapping containing ``'HPO_CT'``.  Each document has the HGVS id returned
    by ``get_hgvs_from_vcf`` as ``_id`` and the HPO count under
    ``geno2mp.hpo_count``; it is passed through ``value_convert``, ``unlist``
    and ``dict_sweep`` (dropping ``None``) before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    hpo_count = item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # Skip only this allele.  A ``return`` here would end the
            # generator and silently drop every remaining allele.
            continue
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
def _map_line_to_json(item):
    """Yield one geno2mp document per alternate allele of a VCF record.

    ``item`` must expose ``CHROM``, ``POS``, ``REF``, ``ALT`` and an ``INFO``
    mapping containing ``'HPO_CT'``.  Each document has the HGVS id returned
    by ``get_hgvs_from_vcf`` as ``_id`` and the HPO count under
    ``geno2mp.hpo_count``; it is passed through ``value_convert``, ``unlist``
    and ``dict_sweep`` (dropping ``None``) before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    hpo_count = item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # Skip only this allele.  A ``return`` here would end the
            # generator and silently drop every remaining allele.
            continue
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
def _map_line_to_json(fields):
    """Build a GRASP document for one split input row.

    The row must have exactly ``VALID_COLUMN_NO`` columns.  The rsid in
    ``fields[8]`` is looked up through the myvariant.info query API (one HTTP
    GET per call) to obtain the ``_id`` of a matching variant, and the
    remaining columns are copied into the ``grasp`` sub-document.

    Returns ``None`` when the rsid is ``None``; otherwise the document after
    ``value_convert``, ``unlist``, ``dict_sweep`` (dropping ``""``) and
    ``list_split`` on ``","``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source.  The final ``return`` is assumed to sit inside the ``for`` loop
    (first hit wins, ``None`` when there are no hits) -- confirm against the
    original file.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    # load as json data
    if rsid is None:
        return
    # rsid is concatenated into the query string as-is (no URL escaping)
    url = "http://myvariant.info/v1/query?q=dbsnp.rsid:" + rsid + "&fields=_id"
    r = requests.get(url)
    for hits in r.json()["hits"]:
        HGVS = hits["_id"]
        one_snp_json = {
            "_id": HGVS,
            "grasp": {
                "hg19": {"chr": fields[5], "pos": fields[6]},
                "hupfield": fields[1],
                "last_curation_date": fields[2],
                "creation_date": fields[3],
                "srsid": fields[4],
                "publication": {
                    "journal": fields[16],
                    "title": fields[17],
                    "pmid": fields[7],
                    "snpid": fields[8],
                    "location_within_paper": fields[9],
                    "p_value": fields[10],
                    "phenotype": fields[11],
                    "paper_phenotype_description": fields[12],
                    "paper_phenotype_categories": fields[13],
                    "date_pub": fields[14],
                },
                "includes_male_female_only_analyses": fields[18],
                "exclusively_male_female": fields[19],
                "initial_sample_description": fields[20],
                "replication_sample_description": fields[21],
                "platform_snps_passing_qc": fields[22],
                "gwas_ancestry_description": fields[23],
                # columns 25-37: discovery-stage sample counts
                "discovery": {
                    "total_samples": fields[25],
                    "european": fields[26],
                    "african": fields[27],
                    "east_asian": fields[28],
                    "indian_south_asian": fields[29],
                    "hispanic": fields[30],
                    "native": fields[31],
                    "micronesian": fields[32],
                    "arab_me": fields[33],
                    "mixed": fields[34],
                    "unspecified": fields[35],
                    "filipino": fields[36],
                    "indonesian": fields[37],
                },
                # columns 38-50: replication-stage sample counts
                "replication": {
                    "total_samples": fields[38],
                    "european": fields[39],
                    "african": fields[40],
                    "east_asian": fields[41],
                    "indian_south_asian": fields[42],
                    "hispanic": fields[43],
                    "native": fields[44],
                    "micronesian": fields[45],
                    "arab_me": fields[46],
                    "mixed": fields[47],
                    "unspecified": fields[48],
                    "filipino": fields[49],
                    "indonesian": fields[50],
                },
                "in_gene": fields[51],
                "nearest_gene": fields[52],
                "in_lincrna": fields[53],
                "in_mirna": fields[54],
                "in_mirna_bs": fields[55],
                # columns 56-60 are not used by this parser
                "oreg_anno": fields[61],
                "conserv_pred_tfbs": fields[62],
                "human_enhancer": fields[63],
                "rna_edit": fields[64],
                "polyphen2": fields[65],
                "sift": fields[66],
                "ls_snp": fields[67],
                "uniprot": fields[68],
                "eqtl_meth_metab_study": fields[69],
            },
        }
        return list_split(dict_sweep(unlist(value_convert(one_snp_json)), [""]), ",")
def _map_line_to_json(cp):
    """Yield ClinVar documents for one parsed ClinVar XML record.

    ``cp`` is a parsed record exposing ``ReferenceClinVarAssertion`` and
    ``ClinVarAssertion``.  One document is yielded for every ``Measure`` of
    the record's ``MeasureSet`` for which an id can be built; Measures of
    type 'Variation', 'protein only' or 'Microsatellite' are skipped.  Only
    GRCh37 coordinates are used.

    The generator stops (bare ``return``) as soon as a Measure has no
    ``AttributeSet`` or no usable id source, after printing the RCV
    accession.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; the placement of the 'copy number' / ``hgvs_coding`` fallbacks
    relative to ``if chrom and chromStart and chromEnd`` and of the
    ``other_ids`` accumulation should be confirmed against the original
    file.  The code uses Python 2 ``print`` statements.
    """
    clinical_siginificance = cp.ReferenceClinVarAssertion.\
        ClinicalSignificance.Description
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        ReviewStatus
    last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        DateLastEvaluated
    CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in the ClinVar XML do not have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # MeasureSet.Measure returns a list; there may be multiple
    # Measures under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item whose type is
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            # the gene symbol is optional; the gene id lookup is not guarded
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is stored as a scalar, otherwise
        # the whole list is kept
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have a format different
            # from other types and are dealt with separately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    # keep the top-level genomic HGVS whose integerValue is 37
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    # 'non-coding' is tested before 'coding' because the
                    # latter is a substring of the former
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # stop scanning once this attribute is found;
                        # later attributes are not added to HGVS
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items of type 'Indel', 'Insertion' or 'Duplication'
                # might not have explicit alt information,
                # so it is parsed from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        # everything after the first 'del' is appended
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position + 3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                                  (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                              (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                      (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                      (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome:
                    # glue the 2nd and 3rd '.'-separated pieces of the
                    # genomic HGVS string onto the "chr" prefix
                    hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                              hgvs_genome.split('.')[2]
            elif hgvs_coding:
                # no genomic coordinates: fall back to the coding HGVS as id
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        other_ids = ''
        rsid = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                # every cross-reference is appended as "<DB>:<ID>;"
                other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';'
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_siginificance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": CLINVAR_ID,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            # convert values, collapse lists, then drop None entries
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
def _map_line_to_json(item):
    """Yield one ExAC document per alternate allele of a VCF record.

    ``item`` must expose ``CHROM``, ``POS``, ``REF``, ``ALT`` and ``INFO``.
    Every entry of ``item.ALT`` is replaced in place by its string form, and
    that list is stored under ``alleles`` in each document.

    ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional INFO
    keys: a missing one becomes ``None`` and is removed by the final
    ``dict_sweep``.  Every other INFO key read here is required; a missing
    one propagates the lookup error.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def optional(key):
        # Only a missing key means "annotation absent"; anything else is a
        # real error and must not be swallowed by a bare ``except``.
        try:
            return info[key]
        except KeyError:
            return None

    def lowered(keys):
        # Sub-document whose keys are the lower-cased INFO key names.
        return dict((key.lower(), info[key]) for key in keys)

    baseqranksum = optional('BaseQRankSum')
    clippingranksum = optional('ClippingRankSum')
    mqranksum = optional('MQRankSum')
    readposranksum = optional('ReadPosRankSum')
    qd = optional('QD')
    inbreedingcoeff = optional('InbreedingCoeff')

    # normalise the allele objects to plain strings (in place, as before)
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)

    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # Skip only this allele.  A ``return`` here would end the
            # generator and silently drop every remaining allele.
            continue
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": lowered(['AC', 'AC_AFR', 'AC_AMR', 'AC_Adj', 'AC_EAS',
                               'AC_FIN', 'AC_Het', 'AC_Hom', 'AC_NFE',
                               'AC_OTH', 'AC_SAS']),
                "af": info['AF'],
                "an": lowered(['AN', 'AN_AFR', 'AN_AMR', 'AN_Adj', 'AN_EAS',
                               'AN_FIN', 'AN_NFE', 'AN_OTH', 'AN_SAS']),
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": lowered(['Het_AFR', 'Het_AMR', 'Het_EAS', 'Het_FIN',
                                'Het_NFE', 'Het_OTH', 'Het_SAS']),
                "hom": lowered(['Hom_AFR', 'Hom_AMR', 'Hom_EAS', 'Hom_FIN',
                                'Hom_NFE', 'Hom_OTH', 'Hom_SAS']),
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
def _map_line_to_json(item):
    """Yield one ExAC document per alternate allele of a VCF record.

    This variant of the parser also reads the sex-specific counts
    (``AC_FEMALE``, ``AC_MALE``, ``AN_FEMALE``, ``AN_MALE``).

    ``item`` must expose ``CHROM``, ``POS``, ``REF``, ``ALT`` and ``INFO``.
    Every entry of ``item.ALT`` is replaced in place by its string form, and
    that list is stored under ``alleles`` in each document.

    ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional INFO
    keys: a missing one becomes ``None`` and is removed by the final
    ``dict_sweep``.  Every other INFO key read here is required; a missing
    one propagates the lookup error.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def optional(key):
        # Only a missing key means "annotation absent"; anything else is a
        # real error and must not be swallowed by a bare ``except``.
        try:
            return info[key]
        except KeyError:
            return None

    def lowered(keys):
        # Sub-document whose keys are the lower-cased INFO key names.
        return dict((key.lower(), info[key]) for key in keys)

    baseqranksum = optional('BaseQRankSum')
    clippingranksum = optional('ClippingRankSum')
    mqranksum = optional('MQRankSum')
    readposranksum = optional('ReadPosRankSum')
    qd = optional('QD')
    inbreedingcoeff = optional('InbreedingCoeff')

    # normalise the allele objects to plain strings (in place, as before)
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)

    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # Skip only this allele.  A ``return`` here would end the
            # generator and silently drop every remaining allele.
            continue
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": lowered(['AC', 'AC_AFR', 'AC_AMR', 'AC_Adj', 'AC_EAS',
                               'AC_FIN', 'AC_Het', 'AC_Hom', 'AC_NFE',
                               'AC_OTH', 'AC_SAS', 'AC_FEMALE', 'AC_MALE']),
                "af": info['AF'],
                "an": lowered(['AN', 'AN_AFR', 'AN_AMR', 'AN_Adj', 'AN_EAS',
                               'AN_FIN', 'AN_NFE', 'AN_OTH', 'AN_SAS',
                               'AN_FEMALE', 'AN_MALE']),
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": lowered(['Het_AFR', 'Het_AMR', 'Het_EAS', 'Het_FIN',
                                'Het_NFE', 'Het_OTH', 'Het_SAS']),
                "hom": lowered(['Hom_AFR', 'Hom_AMR', 'Hom_EAS', 'Hom_FIN',
                                'Hom_NFE', 'Hom_OTH', 'Hom_SAS']),
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
def _map_line_to_json(fields, version):
    """Build the EVS document for one split input row.

    Args:
        fields: the split row.  ``fields[0]`` is ``<chrom>:<pos>`` (the
            existing comment marks it as grch37), ``fields[3]`` is
            ``<ref>><alt>`` and ``fields[30]`` holds a ``<x>:<pos>`` value
            whose position is used for hg38.
        version: ``'hg19'`` or ``'hg38'``; selects which position is used to
            build the ``_id``.

    Returns:
        ``None`` when the row has no mutation (``fields[3]`` empty) or no
        HGVS id could be built; otherwise the document after
        ``value_convert`` and ``dict_sweep`` (dropping "NA", "none" and
        "unknown").

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``.
    """
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")

    # Without a mutation there is no ref/alt and hence no id.  Previously
    # this path failed with UnboundLocalError on the first use of ``HGVS``.
    if not fields[3]:
        return None
    mutation = fields[3].split(">")
    ref = mutation[0]
    alt = mutation[1]
    hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
    hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)
    if version == 'hg19':
        HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)
    else:
        # previously left ``HGVS`` unbound and crashed a few lines later
        raise ValueError("unsupported version: %r" % (version,))

    # load as json data
    if HGVS is None:
        return None
    # parse the polyphen column once instead of once per sub-field
    polyphen_parts = polyphen(fields[21])
    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": chrom,
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen_parts[0],
                "score": polyphen_parts[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    return dict_sweep(value_convert(one_snp_json),
                      vals=["NA", "none", "unknown"])
def _map_line_to_json(fields):
    """Build a GRASP document for one split input row.

    The row must have exactly ``VALID_COLUMN_NO`` columns.  The rsid in
    ``fields[8]`` is looked up through the myvariant.info query API (one HTTP
    GET per call) to obtain the ``_id`` of a matching variant, and the
    remaining columns are copied into the ``grasp`` sub-document.

    Returns ``None`` when the rsid is ``None``; otherwise the document after
    ``value_convert``, ``unlist``, ``dict_sweep`` (dropping ``""``) and
    ``list_split`` on ``","``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source.  The final ``return`` is assumed to sit inside the ``for`` loop
    (first hit wins, ``None`` when there are no hits) -- confirm against the
    original file.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    # load as json data
    if rsid is None:
        return
    # rsid is concatenated into the query string as-is (no URL escaping)
    url = 'http://myvariant.info/v1/query?q=dbsnp.rsid:'\
        + rsid + '&fields=_id'
    r = requests.get(url)
    for hits in r.json()['hits']:
        HGVS = hits['_id']
        one_snp_json = {
            "_id": HGVS,
            "grasp": {
                'hg19': {
                    'chr': fields[5],
                    'pos': fields[6]
                },
                'hupfield': fields[1],
                'last_curation_date': fields[2],
                'creation_date': fields[3],
                'srsid': fields[4],
                'publication': {
                    'journal': fields[16],
                    'title': fields[17],
                    'pmid': fields[7],
                    'snpid': fields[8],
                    'location_within_paper': fields[9],
                    'p_value': fields[10],
                    'phenotype': fields[11],
                    'paper_phenotype_description': fields[12],
                    'paper_phenotype_categories': fields[13],
                    'date_pub': fields[14]
                },
                'includes_male_female_only_analyses': fields[18],
                'exclusively_male_female': fields[19],
                'initial_sample_description': fields[20],
                'replication_sample_description': fields[21],
                'platform_snps_passing_qc': fields[22],
                'gwas_ancestry_description': fields[23],
                # columns 25-37: discovery-stage sample counts
                'discovery': {
                    'total_samples': fields[25],
                    'european': fields[26],
                    'african': fields[27],
                    'east_asian': fields[28],
                    'indian_south_asian': fields[29],
                    'hispanic': fields[30],
                    'native': fields[31],
                    'micronesian': fields[32],
                    'arab_me': fields[33],
                    'mixed': fields[34],
                    'unspecified': fields[35],
                    'filipino': fields[36],
                    'indonesian': fields[37]
                },
                # columns 38-50: replication-stage sample counts
                'replication': {
                    'total_samples': fields[38],
                    'european': fields[39],
                    'african': fields[40],
                    'east_asian': fields[41],
                    'indian_south_asian': fields[42],
                    'hispanic': fields[43],
                    'native': fields[44],
                    'micronesian': fields[45],
                    'arab_me': fields[46],
                    'mixed': fields[47],
                    'unspecified': fields[48],
                    'filipino': fields[49],
                    'indonesian': fields[50]
                },
                'in_gene': fields[51],
                'nearest_gene': fields[52],
                'in_lincrna': fields[53],
                'in_mirna': fields[54],
                'in_mirna_bs': fields[55],
                # columns 56-60 are not used by this parser
                'oreg_anno': fields[61],
                'conserv_pred_tfbs': fields[62],
                'human_enhancer': fields[63],
                'rna_edit': fields[64],
                'polyphen2': fields[65],
                'sift': fields[66],
                'ls_snp': fields[67],
                'uniprot': fields[68],
                'eqtl_meth_metab_study': fields[69]
            }
        }
        return list_split(dict_sweep(unlist(value_convert(one_snp_json)), [""]), ",")
def _map_line_to_json(cp, hg19):
    """Yield ClinVar documents for one parsed ClinVar XML record.

    ``cp`` is a parsed record exposing ``ReferenceClinVarAssertion`` and
    ``ClinVarAssertion``.  ``hg19`` is a truthy/falsy switch: when truthy the
    GRCh37 start/stop are used to build the ``_id``, otherwise the GRCh38
    ones.  Both coordinate pairs are always stored in the document.

    One document is yielded for every ``Measure`` of the record's
    ``MeasureSet`` for which an id can be built; Measures of type
    'Variation', 'protein only' or 'Microsatellite' are skipped.  The
    generator stops (bare ``return``) as soon as a Measure has no
    ``AttributeSet`` or no usable id source, after printing the RCV
    accession.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; the placement of the ``if not ref`` / ``if not alt`` fallbacks
    and of the 'copy number' / ``hgvs_coding`` branches relative to
    ``if chrom and chromStart and chromEnd`` should be confirmed against the
    original file.  The code uses Python 2 ``print`` statements.
    """
    # clinical significance, review status and evaluation date are optional
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None
    variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in the ClinVar XML do not have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # only the first Trait of the TraitSet is described
    trait = cp.ReferenceClinVarAssertion.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for name in trait.Name:
        if name.ElementValue.Type == 'Alternate':
            synonyms.append(name.ElementValue.get_valueOf_())
        if name.ElementValue.Type == 'Preferred':
            conditions_name += name.ElementValue.get_valueOf_()
    # cross-references of the trait, keyed by lower-cased database name
    identifiers = {}
    for item in trait.XRef:
        if item.DB == 'Human Phenotype Ontology':
            key = 'Human_Phenotype_Ontology'
        else:
            key = item.DB
        identifiers[key.lower()] = item.ID
    # the preferred symbol is appended to the name in parentheses
    for symbol in trait.Symbol:
        if symbol.ElementValue.Type == 'Preferred':
            conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()
    # MeasureSet.Measure returns a list; there may be multiple
    # Measures under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item whose type is
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
                == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart_19 = None
        chromEnd_19 = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # chrom is only ever taken from the GRCh37 location
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart_19 = SequenceLocation.start
                    chromEnd_19 = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    # fall back to the GRCh38 alleles when none are set yet
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            # the gene symbol is optional; the gene id lookup is not guarded
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is stored as a scalar, otherwise
        # the whole list is kept
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # pick the coordinates used for the id according to the hg19 switch
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have a format different
            # from other types and are dealt with separately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    # keep the top-level genomic HGVS whose integerValue is 37
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    # 'non-coding' is tested before 'coding' because the
                    # latter is a substring of the former
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.Attribute.
                                               get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.Attribute.
                                                  get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.
                                                  Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # stop scanning once this attribute is found;
                        # later attributes are not added to HGVS
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items of type 'Indel', 'Insertion' or 'Duplication'
                # might not have explicit alt information,
                # so it is parsed from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        # everything after the first 'del' is appended
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position+3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                                  (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                              (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position+3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                      (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position+3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                      (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome and chrom:
                    # chromosome plus the 3rd '.'-separated piece of the
                    # genomic HGVS string
                    hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2]
            elif hgvs_coding:
                # no genomic coordinates: fall back to the coding HGVS as id
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        # sort each HGVS list in place
        for key in HGVS:
            HGVS[key].sort()
        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart_19,
                        "end": chromEnd_19
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": {
                            "name": conditions_name,
                            "synonyms": synonyms,
                            "identifiers": identifiers,
                            "age_of_onset": age_of_onset
                        }
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            # value_convert is given a list of key names as its second
            # argument (presumably keys to leave unconverted -- verify
            # against the helper); None, '' and 'None' values are dropped
            obj = (dict_sweep(unlist(value_convert(one_snp_json,
                                                   ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                    'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
            yield obj
def _map_line_to_json(fields, version='hg19'):
    """Build the dbNSFP document for one split input row.

    ``version`` ('hg19' or 'hg38') chooses which position is used in the
    ``_id``: ``fields[8]`` for hg19, ``fields[1]`` for hg38.  Chromosome 'M'
    is reported as 'MT'.  The document is passed through ``value_convert``,
    ``unlist``, ``dict_sweep`` (dropping ".") and ``list_split`` on ";", and
    ``dbnsfp.chrom`` is then forced back to a string.
    """
    # specific variable treatment
    chrom = 'MT' if fields[0] == 'M' else fields[0]
    # fields[7] in version 2, represent hg18_pos
    hg18_end = "." if fields[10] == "." else int(fields[10])
    hg19_pos = int(fields[8])
    hg38_pos = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    id_template = "chr%s:g.%d%s>%s"
    HGVS_19 = id_template % (chrom, hg19_pos, ref, alt)
    HGVS_38 = id_template % (chrom, hg38_pos, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38

    # SiPhy base frequencies arrive as "a:c:g:t"
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # pair up the ';'-separated accession and position lists
    accessions = fields[26].rstrip().rstrip(';').split(";")
    positions = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(accessions, positions)]

    def scored(first):
        # three consecutive columns: score, rankscore, prediction
        return {
            "score": fields[first],
            "rankscore": fields[first + 1],
            "pred": fields[first + 2]
        }

    # load as json data
    annotation = {
        "rsid": fields[6],
        "chrom": chrom,
        "hg19": {"start": fields[8], "end": hg19_pos},
        "hg18": {"start": fields[10], "end": hg18_end},
        "hg38": {"start": fields[1], "end": fields[1]},
        "ref": ref,
        "alt": alt,
        "aa": {
            "ref": fields[4],
            "alt": fields[5],
            "pos": fields[22],
            "refcodon": fields[13],
            "codonpos": fields[14],
        },
        "genename": fields[11],
        "uniprot": uniprot,
        "interpro_domain": fields[111],
        "cds_strand": fields[12],
        "ancestral_allele": fields[16],
        "ensembl": {"geneid": fields[19], "transcriptid": fields[20]},
        "sift": {
            "score": fields[23],
            "converted_rankscore": fields[24],
            "pred": fields[25]
        },
        "polyphen2": {"hdiv": scored(29), "hvar": scored(32)},
        "lrt": {
            "score": fields[35],
            "converted_rankscore": fields[36],
            "pred": fields[37],
            "omega": fields[38]
        },
        "mutationtaster": {
            "score": fields[39],
            "converted_rankscore": fields[40],
            "pred": fields[41],
            "model": fields[42],
            "AAE": fields[43]
        },
        "mutationassessor": scored(46),
        "fathmm": scored(49),
        "provean": scored(52),
        "metasvm": scored(55),
        "lr": scored(58),
        "reliability_index": fields[61],
        "gerp++": {
            "nr": fields[62],
            "rs": fields[63],
            "rs_rankscore": fields[64]
        },
        "phylop_7way": {
            "vertebrate": fields[65],
            "vertebrate_rankscore": fields[66]
        },
        "phastcons_7way": {
            "vertebrate": fields[67],
            "vertebrate_rankscore": fields[68]
        },
        "siphy_29way": {
            "pi": siphy,
            "logodds": fields[70],
            "logodds_rankscore": fields[71]
        },
        "1000gp1": {
            "ac": fields[72], "af": fields[73],
            "afr_ac": fields[74], "afr_af": fields[75],
            "eur_ac": fields[76], "eur_af": fields[77],
            "amr_ac": fields[78], "amr_af": fields[79],
            "eas_ac": fields[80], "eas_af": fields[81],
            "sas_ac": fields[82], "sas_af": fields[83]
        },
        "twinsuk": {"ac": fields[84], "af": fields[85]},
        "alspac": {"ac": fields[86], "af": fields[87]},
        "esp6500": {
            "aa_ac": fields[88], "aa_af": fields[89],
            "ea_ac": fields[90], "ea_af": fields[91]
        },
        "exac": {
            "ac": fields[92], "af": fields[93],
            "adj_ac": fields[94], "adj_af": fields[95],
            "afr_ac": fields[96], "afr_af": fields[97],
            "amr_ac": fields[98], "amr_af": fields[99],
            "eas_ac": fields[100], "eas_af": fields[101],
            "fin_ac": fields[102], "fin_af": fields[103],
            "nfe_ac": fields[104], "nfe_af": fields[105],
            "sas_ac": fields[106], "sas_af": fields[107]
        },
        "clinvar": {
            "rs": fields[108],
            "clinsig": fields[109],
            "trait": fields[110]
        }
    }
    document = {"_id": HGVS, "dbnsfp": annotation}
    document = list_split(
        dict_sweep(unlist(value_convert(document)), vals=["."]), ";")
    # value conversion may have turned a numeric chromosome into a number
    document["dbnsfp"]["chrom"] = str(document["dbnsfp"]["chrom"])
    return document
def _map_line_to_json(fields):
    """Build the ``cadd`` document for one row of the CADD table.

    ``fields`` must hold exactly VALID_COLUMN_NO column values.  Returns None
    when get_hgvs_from_vcf cannot produce an id; otherwise returns the
    document after value_convert, unlist and dict_sweep (dropping "NA").
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom, position, ref_allele = fields[0], fields[1], fields[2]
    alt_allele = fields[4]
    hgvs_id = get_hgvs_from_vcf(chrom, position, ref_allele, alt_allele)
    if hgvs_id is None:
        return

    chromatin_states = {
        'tssa': fields[39],
        'tssaflnk': fields[40],
        'txflnk': fields[41],
        'tx': fields[42],
        'txwk': fields[43],
        # NOTE(review): column 45 is skipped here; confirm whether 'enh'
        # should read column 44 or 45.
        'enh': fields[44],
        'znfrpts': fields[46],
        'het': fields[47],
        'tssbiv': fields[48],
        'bivflnk': fields[49],
        'enhbiv': fields[50],
        'reprpc': fields[51],
        'reprpcwk': fields[52],
        'quies': fields[53],
    }
    encode_tracks = {
        'exp': fields[54],
        'h3k27ac': fields[55],
        'h3k4me1': fields[56],
        'h3k4me3': fields[57],
        'nucleo': fields[58],
        'occ': fields[59],
        'p_val': {
            'comb': fields[60], 'dnas': fields[61], 'faire': fields[62],
            'polii': fields[63], 'ctcf': fields[64], 'mycp': fields[65],
        },
        'sig': {
            'dnase': fields[66], 'faire': fields[67], 'polii': fields[68],
            'ctcf': fields[69], 'myc': fields[70],
        },
    }
    gene_annotation = {
        'gene_id': fields[92],
        'feature_id': fields[93],
        'ccds_id': fields[94],
        'genename': fields[95],
        'cds': {
            'cdna_pos': fields[96], 'rel_cdna_pos': fields[97],
            'cds_pos': fields[98], 'rel_cds_pos': fields[99],
        },
        'prot': {
            'protpos': fields[100], 'rel_prot_pos': fields[101],
            'domain': fields[102],
        },
    }
    annotation = {
        'chrom': chrom,
        'pos': position,
        'ref': ref_allele,
        'anc': fields[3],
        'alt': alt_allele,
        'type': fields[5],
        'length': fields[6],
        'istv': fields[7],
        'isderived': fields[8],
        'annotype': fields[9],
        'consequence': fields[10],
        'consscore': fields[11],
        'consdetail': fields[12],
        'gc': fields[13],
        'cpg': fields[14],
        'mapability': {'20bp': fields[15], '35bp': fields[16]},
        'scoresegdup': fields[17],
        'phast_cons': {
            'primate': fields[18], 'mammalian': fields[19], 'vertebrate': fields[20],
        },
        'phylop': {
            'primate': fields[21], 'mammalian': fields[22], 'vertebrate': fields[23],
        },
        'gerp': {
            'n': fields[24], 's': fields[25], 'rs': fields[26], 'rs_pval': fields[27],
        },
        'bstatistic': fields[28],
        'mutindex': fields[29],
        'dna': {
            'helt': fields[30], 'mgw': fields[31], 'prot': fields[32], 'roll': fields[33],
        },
        'mirsvr': {'score': fields[34], 'e': fields[35], 'aln': fields[36]},
        'targetscans': fields[37],
        'fitcons': fields[38],
        'chmm': chromatin_states,
        'encode': encode_tracks,
        'segway': fields[71],
        'motif': {
            'toverlap': fields[72], 'dist': fields[73], 'ecount': fields[74],
            'ename': fields[75], 'ehipos': fields[76], 'escorechng': fields[77],
        },
        'tf': {'bs': fields[78], 'bs_peaks': fields[79], 'bs_peaks_max': fields[80]},
        'isknownvariant': fields[81],
        'esp': {'af': fields[82], 'afr': fields[83], 'eur': fields[84]},
        '1000g': {
            'af': fields[85], 'asn': fields[86], 'amr': fields[87],
            'afr': fields[88], 'eur': fields[89],
        },
        'min_dist_tss': fields[90],
        'min_dist_tse': fields[91],
        'gene': gene_annotation,
        'dst2splice': fields[103],
        'dst2spltype': fields[104],
        'exon': fields[105],
        'intron': fields[106],
        'oaa': fields[107],  # ref aa
        'naa': fields[108],  # alt aa
        'grantham': fields[109],
        'polyphen': {'cat': fields[110], 'val': fields[111]},
        'sift': {'cat': fields[112], 'val': fields[113]},
        'rawscore': fields[114],  # raw CADD score
        'phred': fields[115],  # log-percentile of raw CADD score
    }
    document = {"_id": hgvs_id, "cadd": annotation}
    return dict_sweep(unlist(value_convert(document)), ["NA"])
def _map_line_to_json(fields, version): # specific variable treatment chrom = fields[0] if chrom == 'M': chrom = 'MT' # fields[7] in version 2, represent hg18_pos if fields[10] == ".": hg18_end = "." else: hg18_end = int(fields[10]) # in case of no hg19 position provided, remove the item if fields[8] == '.': return None else: chromStart = int(fields[8]) chromEnd = int(fields[8]) chromStart_38 = int(fields[1]) ref = fields[2].upper() alt = fields[3].upper() HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt) HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt) if version == 'hg19': HGVS = HGVS_19 elif version == 'hg38': HGVS = HGVS_38 if fields[105] == ".": siphy = "." else: freq = fields[105].split(":") siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]} gtex_gene = fields[181].split('|') gtex_tissue = fields[182].split('|') gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue))) acc = fields[26].rstrip().rstrip(';').split(";") pos = fields[28].rstrip().rstrip(';').split(";") uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos))) provean_score = fields[52].split(';') sift_score = fields[23].split(';') hdiv_score = fields[29].split(';') hvar_score = fields[32].split(';') lrt_score = fields[35].split(';') dann_score = fields[69].split(';') mutationtaster_score = fields[39].split(';') mutationassessor_score = fields[46].split(';') vest3_score = fields[57].split(';') metasvm_score = fields[59].split(';') fathmm_score = fields[49].split(';') lr_score = fields[62].split(';') fathmm_coding_score = fields[71].split(';') integrated_fitcons_score = fields[82].split(';') gm12878_fitcons_score = fields[85].split(';') h1_hesc_fitcons_score = fields[88].split(';') huvec_fitcons_score = fields[91].split(';') if len(provean_score) > 1: for i in range(len(provean_score)): if provean_score[i] == '.': provean_score[i] = None if len(sift_score) > 1: for i in range(len(sift_score)): if sift_score[i] == 
'.': sift_score[i] = None if len(hdiv_score) > 1: for i in range(len(hdiv_score)): if hdiv_score[i] == '.': hdiv_score[i] = None if len(hvar_score) > 1: for i in range(len(hvar_score)): if hvar_score[i] == '.': hvar_score[i] = None if len(lrt_score) > 1: for i in range(len(lrt_score)): if lrt_score[i] == '.': lrt_score[i] = None if len(mutationtaster_score) > 1: for i in range(len(mutationtaster_score)): if mutationtaster_score[i] == '.': mutationtaster_score[i] = None if len(mutationassessor_score) > 1: for i in range(len(mutationassessor_score)): if mutationassessor_score[i] == '.': mutationassessor_score[i] = None if len(metasvm_score) > 1: for i in range(len(metasvm_score)): if metasvm_score[i] == '.': metasvm_score[i] = None if len(vest3_score) > 1: for i in range(len(vest3_score)): if vest3_score[i] == '.': vest3_score[i] = None if len(fathmm_score) > 1: for i in range(len(fathmm_score)): if fathmm_score[i] == '.': fathmm_score[i] = None if len(lr_score) > 1: for i in range(len(lr_score)): if lr_score[i] == '.': lr_score[i] = None if len(fathmm_coding_score) > 1: for i in range(len(fathmm_coding_score)): if fathmm_coding_score[i] == '.': fathmm_coding_score[i] = None if len(dann_score) > 1: for i in range(len(dann_score)): if dann_score[i] == '.': dann_score[i] = None if len(integrated_fitcons_score) > 1: for i in range(len(integrated_fitcons_score)): if integrated_fitcons_score[i] == '.': integrated_fitcons_score[i] = None if len(gm12878_fitcons_score) > 1: for i in range(len(gm12878_fitcons_score)): if gm12878_fitcons_score[i] == '.': gm12878_fitcons_score[i] = None if len(h1_hesc_fitcons_score) > 1: for i in range(len(h1_hesc_fitcons_score)): if h1_hesc_fitcons_score[i] == '.': h1_hesc_fitcons_score[i] = None if len(huvec_fitcons_score) > 1: for i in range(len(huvec_fitcons_score)): if huvec_fitcons_score[i] == '.': huvec_fitcons_score[i] = None # load as json data one_snp_json = { "_id": HGVS, "dbnsfp": { "rsid": fields[6], #"rsid_dbSNP144": fields[6], 
"chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "hg18": { "start": fields[10], "end": hg18_end }, "hg38": { "start": fields[1], "end": fields[1] }, "ref": ref, "alt": alt, "aa": { "ref": fields[4], "alt": fields[5], "pos": fields[22], "refcodon": fields[13], "codonpos": fields[14], "codon_degeneracy": fields[15] }, "genename": fields[11], "uniprot": uniprot, "interpro_domain": fields[180], "cds_strand": fields[12], "ancestral_allele": fields[16], #"altaineandertal": fields[17], #"denisova": fields[18] "ensembl": { "geneid": fields[19], "transcriptid": fields[20], "proteinid": fields[21] }, "sift": { "score": sift_score, "converted_rankscore": fields[24], "pred": fields[25] }, "polyphen2": { "hdiv": { "score": hdiv_score, "rankscore": fields[30], "pred": fields[31] }, "hvar": { "score": hvar_score, "rankscore": fields[33], "pred": fields[34] } }, "lrt": { "score": lrt_score, "converted_rankscore": fields[36], "pred": fields[37], "omega": fields[38] }, "mutationtaster": { "score": mutationtaster_score, "converted_rankscore": fields[40], "pred": fields[41], "model": fields[42], "AAE": fields[43] }, "mutationassessor": { "score": mutationassessor_score, "rankscore": fields[47], "pred": fields[48] }, "fathmm": { "score": fathmm_score, "rankscore": fields[50], "pred": fields[51] }, "provean": { "score": provean_score, "rankscore": fields[53], "pred": fields[54] }, "vest3": { "score": vest3_score, "rankscore": fields[57], "transcriptid": fields[55], "transcriptvar": fields[56] }, "fathmm-mkl": { "coding_score": fathmm_coding_score, "coding_rankscore": fields[72], "coding_pred": fields[73], "coding_group": fields[74] }, "eigen": { "raw": fields[75], "phred": fields[76], "raw_rankscore": fields[77] }, "eigen-pc": { "raw": fields[78], "raw_rankscore": fields[79] }, "genocanyon": { "score": fields[80], "rankscore": fields[81] }, "metasvm": { "score": metasvm_score, "rankscore": fields[60], "pred": fields[61] }, "metalr": { "score": lr_score, "rankscore": 
fields[63], "pred": fields[64] }, "reliability_index": fields[65], "dann": { "score": dann_score, "rankscore": fields[70] }, "gerp++": { "nr": fields[94], "rs": fields[95], "rs_rankscore": fields[96] }, "integrated": { "fitcons_score": integrated_fitcons_score, "fitcons_rankscore": fields[83], "confidence_value": fields[84] }, "gm12878": { "fitcons_score": gm12878_fitcons_score, "fitcons_rankscore": fields[86], "confidence_value": fields[87] }, "h1-hesc": { "fitcons_score": h1_hesc_fitcons_score, "fitcons_rankscore": fields[89], "confidence_value": fields[90] }, "huvec": { "fitcons_score": huvec_fitcons_score, "fitcons_rankscore": fields[92], "confidence_value": fields[93] }, "phylo": { "p100way": { "vertebrate": fields[97], "vertebrate_rankscore": fields[98] }, "p20way": { "mammalian": fields[99], "mammalian_rankscore": fields[100] } }, "phastcons": { "100way": { "vertebrate": fields[101], "vertebrate_rankscore": fields[102] }, "20way": { "mammalian": fields[103], "mammalian_rankscore": fields[104] } }, "siphy_29way": { "pi": siphy, "logodds": fields[106], "logodds_rankscore": fields[107] }, "1000gp3": { "ac": fields[108], "af": fields[109], "afr_ac": fields[110], "afr_af": fields[111], "eur_ac": fields[112], "eur_af": fields[113], "amr_ac": fields[114], "amr_af": fields[115], "eas_ac": fields[116], "eas_af": fields[117], "sas_ac": fields[118], "sas_af": fields[119] }, "twinsuk": { "ac": fields[120], "af": fields[121] }, "alspac": { "ac": fields[122], "af": fields[123] }, "esp6500": { "aa_ac": fields[124], "aa_af": fields[125], "ea_ac": fields[126], "ea_af": fields[127] }, "exac": { "ac": fields[128], "af": fields[129], "adj_ac": fields[130], "adj_af": fields[131], "afr_ac": fields[132], "afr_af": fields[133], "amr_ac": fields[134], "amr_af": fields[135], "eas_ac": fields[136], "eas_af": fields[137], "fin_ac": fields[138], "fin_af": fields[139], "nfe_ac": fields[140], "nfe_af": fields[141], "sas_ac": fields[142], "sas_af": fields[143] }, "exac_nontcga": { "ac": 
fields[144], "af": fields[145], "adj_ac": fields[146], "adj_af": fields[147], "afr_ac": fields[148], "afr_af": fields[149], "amr_ac": fields[150], "amr_af": fields[151], "eas_ac": fields[152], "eas_af": fields[153], "fin_ac": fields[154], "fin_af": fields[155], "nfe_ac": fields[156], "nfe_af": fields[157], "sas_ac": fields[158], "sas_af": fields[159] }, "exac_nonpsych": { "ac": fields[160], "af": fields[161], "adj_ac": fields[162], "adj_af": fields[163], "afr_ac": fields[164], "afr_af": fields[165], "amr_ac": fields[166], "amr_af": fields[167], "eas_ac": fields[168], "eas_af": fields[169], "fin_ac": fields[170], "fin_af": fields[171], "nfe_ac": fields[172], "nfe_af": fields[173] }, "clinvar": { "rs": fields[176], "clinsig": fields[177], "trait": fields[178], "golden_stars": fields[179] }, "gtex": gtex } } one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";") one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"]) return one_snp_json
def _map_line_to_json(fields):
    """Convert one dbNSFP v2 row into a variant document.

    Args:
        fields: sequence of column strings for one row, addressed by
            position.  The string "." is treated as the missing-value
            placeholder.

    Returns:
        dict with an HGVS-style ``_id`` built from ``fields[0..3]`` and a
        ``dbnsfp`` sub-document, passed through value_convert, unlist,
        dict_sweep (dropping ".") and list_split (on ";").

    Raises:
        ValueError: if ``fields[1]`` (or a non-"." ``fields[7]``) is not an
            integer.
    """
    # specific variable treatment
    chrom = fields[0]
    if fields[7] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[7]) + 1
    chromStart = int(fields[1])
    chromEnd = chromStart + 1
    allele1 = fields[2]
    allele2 = fields[3]
    HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, allele1, allele2)

    if fields[74] == ".":
        siphy = "."
    else:
        freq = fields[74].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair accessions with positions; zip() stops at the shorter list.
    # A list (not a lazy map object) keeps the result usable on Python 3.
    acc = fields[11].rstrip().rstrip(';').split(";")
    pos = fields[13].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "chrom": chrom,
            "hg19": {"start": fields[1], "end": chromEnd},
            "hg18": {"start": fields[7], "end": hg18_end},
            "hg38": {"chrom": fields[8], "pos": fields[9]},
            "allele1": allele1,
            "allele2": allele2,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[23],
                "refcodon": fields[16],
                "codonpos": fields[18],
                "aapos_sift": fields[24],
                "aapos_fathmm": fields[25],
            },
            "genename": fields[10],
            "uniprot": uniprot,
            "interpro_domain": fields[14],
            "cds_strand": fields[15],
            "slr_test_statistic": fields[17],
            "fold-degenerate": fields[19],
            "ancestral_allele": fields[20],
            "ensembl": {"geneid": fields[21], "transcriptid": fields[22]},
            "sift": {
                "score": fields[26],
                "converted_rankscore": fields[27],
                "pred": fields[28],
            },
            "polyphen2": {
                "hdiv": {"score": fields[29], "rankscore": fields[30], "pred": fields[31]},
                "hvar": {"score": fields[32], "rankscore": fields[33], "pred": fields[34]},
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
            },
            "mutationtaster": {
                "score": fields[38],
                "converted_rankscore": fields[39],
                "pred": fields[40],
            },
            "mutationassessor": {
                "score": fields[41], "rankscore": fields[42], "pred": fields[43],
            },
            "fathmm": {"score": fields[44], "rankscore": fields[45], "pred": fields[46]},
            "radialsvm": {
                "score": fields[47], "rankscore": fields[48], "pred": fields[49],
            },
            "lr": {"score": fields[50], "rankscore": fields[51], "pred": fields[52]},
            "reliability_index": fields[53],
            "vest3": {"score": fields[54], "rankscore": fields[55]},
            "cadd": {
                "raw": fields[56], "raw_rankscore": fields[57], "phred": fields[58],
            },
            "gerp++": {"nr": fields[59], "rs": fields[60], "rs_rankscore": fields[61]},
            "phylop": {
                "46way": {
                    "primate": fields[62],
                    "primate_rankscore": fields[63],
                    "placental": fields[64],
                    "placental_rankscore": fields[65],
                },
                "100way": {
                    "vertebrate": fields[66], "vertebrate_rankscore": fields[67],
                },
            },
            "phastcons": {
                "46way": {
                    "primate": fields[68],
                    "primate_rankscore": fields[69],
                    "placental": fields[70],
                    "placental_rankscore": fields[71],
                },
                "100way": {
                    "vertebrate": fields[72], "vertebrate_rankscore": fields[73],
                },
            },
            "siphy_29way": {
                "pi": siphy, "logodds": fields[75], "logodds_rankscore": fields[76],
            },
            "lrt_omega": fields[77],
            "unisnp_ids": fields[78],
            "1000gp1": {
                "ac": fields[79], "af": fields[80],
                "afr_ac": fields[81], "afr_af": fields[82],
                "eur_ac": fields[83], "eur_af": fields[84],
                "amr_ac": fields[85], "amr_af": fields[86],
                "asn_ac": fields[87], "asn_af": fields[88],
            },
            "esp6500": {"aa_af": fields[89], "ea_af": fields[90]},
            "aric5606": {
                "aa_ac": fields[91], "aa_af": fields[92],
                "ea_ac": fields[93], "ea_af": fields[94],
            },
            "clinvar": {
                "rs": fields[95], "clin_sig": fields[96], "trait": fields[97],
            },
        },
    }
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # value_convert may have changed the chromosome's type; force it to str
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(cp): try: clinical_significance = cp.ReferenceClinVarAssertion.\ ClinicalSignificance.Description except: clinical_significance = None rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc try: review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ ReviewStatus except: review_status = None try: last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ DateLastEvaluated except: last_evaluated = None variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID number_submitters = len(cp.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None trait = cp.ReferenceClinVarAssertion.TraitSet.Trait[0] synonyms = [] conditions_name = '' for name in trait.Name: if name.ElementValue.Type == 'Alternate': synonyms.append(name.ElementValue.get_valueOf_()) if name.ElementValue.Type == 'Preferred': conditions_name += name.ElementValue.get_valueOf_() identifiers = {} for item in trait.XRef: if item.DB == 'Human Phenotype Ontology': key = 'Human_Phenotype_Ontology' else: key = item.DB identifiers[key.lower()] = item.ID for symbol in trait.Symbol: if symbol.ElementValue.Type == 'Preferred': conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')' age_of_onset = '' for _set in trait.AttributeSet: if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() # MeasureSet.Measure return a list, there might be multiple # Measure under one MeasureSet for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: variation_type = Measure.Type # exclude any item of which types belong to # 'Variation', 'protein only' or 'Microsatellite' if variation_type == 'Variation' or variation_type\ == 'protein only' or variation_type == 'Microsatellite': continue allele_id = Measure.ID chrom = None chromStart = None chromEnd = None chromStart_38 = None chromEnd_38 = None ref = None alt = None if 
Measure.SequenceLocation: for SequenceLocation in Measure.SequenceLocation: # In this version, only accept information concerning GRCh37 if 'GRCh37' in SequenceLocation.Assembly: chrom = SequenceLocation.Chr chromStart = SequenceLocation.start chromEnd = SequenceLocation.stop ref = SequenceLocation.referenceAllele alt = SequenceLocation.alternateAllele if 'GRCh38' in SequenceLocation.Assembly: chromStart_38 = SequenceLocation.start chromEnd_38 = SequenceLocation.stop if not ref: ref = SequenceLocation.referenceAllele if not alt: alt = SequenceLocation.alternateAllele if Measure.MeasureRelationship: try: symbol = Measure.MeasureRelationship[0].\ Symbol[0].get_ElementValue().valueOf_ except: symbol = None gene_id = Measure.MeasureRelationship[0].XRef[0].ID else: symbol = None gene_id = None if Measure.Name: name = Measure.Name[0].ElementValue.valueOf_ else: name = None if len(Measure.CytogeneticLocation) == 1: cytogenic = Measure.CytogeneticLocation[0] else: cytogenic = Measure.CytogeneticLocation hgvs_coding = None hgvs_genome = None HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []} coding_hgvs_only = None hgvs_id = None # hgvs_not_validated = None if Measure.AttributeSet: # 'copy number loss' or 'gain' have format different\ # from other types, should be dealt with seperately if (variation_type == 'copy number loss') or \ (variation_type == 'copy number gain'): for AttributeSet in Measure.AttributeSet: if 'HGVS, genomic, top level' in AttributeSet.\ Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append( AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: HGVS['non-coding'].append( AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: HGVS['coding'].append( AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: 
HGVS['protein'].append( AttributeSet.Attribute.get_valueOf_()) else: for AttributeSet in Measure.AttributeSet: if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append( AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: HGVS['non-coding'].append( AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: HGVS['coding'].append( AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: HGVS['protein'].append( AttributeSet.Attribute.get_valueOf_()) if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': hgvs_coding = AttributeSet.Attribute.get_valueOf_() elif AttributeSet.Attribute.Type == \ 'HGVS, genomic, top level, previous': hgvs_genome = AttributeSet.Attribute.get_valueOf_() break if chrom and chromStart and chromEnd: if variation_type == 'single nucleotide variant': hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt) # items whose type belong to 'Indel, Insertion, \ # Duplication' might not hava explicit alt information, \ # so we will parse from hgvs_genome elif variation_type == 'Indel': if hgvs_genome: indel_position = hgvs_genome.find('del') indel_alt = hgvs_genome[indel_position + 3:] hgvs_id = "chr%s:g.%s_%sdel%s" % \ (chrom, chromStart, chromEnd, indel_alt) elif variation_type == 'Deletion': hgvs_id = "chr%s:g.%s_%sdel" % \ (chrom, chromStart, chromEnd) elif variation_type == 'Insertion': if hgvs_genome: ins_position = hgvs_genome.find('ins') if 'ins' in hgvs_genome: ins_ref = hgvs_genome[ins_position + 3:] hgvs_id = "chr%s:g.%s_%sins%s" % \ (chrom, chromStart, chromEnd, ins_ref) elif variation_type == 'Duplication': if hgvs_genome: dup_position = hgvs_genome.find('dup') if 'dup' in hgvs_genome: dup_ref = hgvs_genome[dup_position + 3:] hgvs_id = "chr%s:g.%s_%sdup%s" % \ (chrom, chromStart, chromEnd, dup_ref) elif variation_type == 'copy number loss' or\ variation_type == 'copy number gain': if hgvs_genome: hgvs_id = "chr" + 
hgvs_genome.split('.')[1] +\ hgvs_genome.split('.')[2] elif hgvs_coding: hgvs_id = hgvs_coding coding_hgvs_only = True else: print "couldn't find any id", rcv_accession return else: print 'no measure.attribute', rcv_accession return for key in HGVS: HGVS[key].sort() rsid = None cosmic = None dbvar = None uniprot = None omim = None # loop through XRef to find rsid as well as other ids if Measure.XRef: for XRef in Measure.XRef: if XRef.Type == 'rs': rsid = 'rs' + str(XRef.ID) elif XRef.DB == 'COSMIC': cosmic = XRef.ID elif XRef.DB == 'OMIM': omim = XRef.ID elif XRef.DB == 'UniProtKB/Swiss-Prot': uniprot = XRef.ID elif XRef.DB == 'dbVar': dbvar = XRef.ID # make sure the hgvs_id is not none if hgvs_id: one_snp_json = { "_id": hgvs_id, "clinvar": { "allele_id": allele_id, "variant_id": variant_id, "chrom": chrom, "omim": omim, "cosmic": cosmic, "uniprot": uniprot, "dbvar": dbvar, "hg19": { "start": chromStart, "end": chromEnd }, "hg38": { "start": chromStart_38, "end": chromEnd_38 }, "type": variation_type, "gene": { "id": gene_id, "symbol": symbol }, "rcv": { "accession": rcv_accession, "clinical_significance": clinical_significance, "number_submitters": number_submitters, "review_status": review_status, "last_evaluated": str(last_evaluated), "preferred_name": name, "origin": origin, "conditions": { "name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset } }, "rsid": rsid, "cytogenic": cytogenic, "hgvs": HGVS, "coding_hgvs_only": coding_hgvs_only, "ref": ref, "alt": alt } } obj = (dict_sweep( unlist( value_convert(one_snp_json, [ 'chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc' ])), [None, '', 'None'])) yield obj
def _map_line_to_json(cp): clinical_siginificance = cp.ReferenceClinVarAssertion.\ ClinicalSignificance.Description rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ ReviewStatus last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ DateLastEvaluated CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID number_submitters = len(cp.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None # MeasureSet.Measure return a list, there might be multiple # Measure under one MeasureSet for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: variation_type = Measure.Type # exclude any item of which types belong to # 'Variation', 'protein only' or 'Microsatellite' if variation_type == 'Variation' or variation_type\ == 'protein only' or variation_type == 'Microsatellite': continue allele_id = Measure.ID chrom = None chromStart = None chromEnd = None ref = None alt = None if Measure.SequenceLocation: for SequenceLocation in Measure.SequenceLocation: # In this version, only accept information concerning GRCh37 if 'GRCh37' in SequenceLocation.Assembly: chrom = SequenceLocation.Chr chromStart = SequenceLocation.start chromEnd = SequenceLocation.stop ref = SequenceLocation.referenceAllele alt = SequenceLocation.alternateAllele if Measure.MeasureRelationship: try: symbol = Measure.MeasureRelationship[0].\ Symbol[0].get_ElementValue().valueOf_ except: symbol = None gene_id = Measure.MeasureRelationship[0].XRef[0].ID else: symbol = None gene_id = None if Measure.Name: name = Measure.Name[0].ElementValue.valueOf_ else: name = None if len(Measure.CytogeneticLocation) == 1: cytogenic = Measure.CytogeneticLocation[0] else: cytogenic = Measure.CytogeneticLocation hgvs_coding = None hgvs_genome = None HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []} 
coding_hgvs_only = None hgvs_id = None # hgvs_not_validated = None if Measure.AttributeSet: # 'copy number loss' or 'gain' have format different\ # from other types, should be dealt with seperately if (variation_type == 'copy number loss') or \ (variation_type == 'copy number gain'): for AttributeSet in Measure.AttributeSet: if 'HGVS, genomic, top level' in AttributeSet.\ Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append(AttributeSet.Attribute. get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: HGVS['non-coding'].append(AttributeSet.Attribute. get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: HGVS['coding'].append(AttributeSet.Attribute. get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: HGVS['protein'].append(AttributeSet. Attribute.get_valueOf_()) else: for AttributeSet in Measure.AttributeSet: if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append(AttributeSet. Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: HGVS['non-coding'].append(AttributeSet. Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: HGVS['coding'].append(AttributeSet.Attribute. get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: HGVS['protein'].append(AttributeSet. 
Attribute.get_valueOf_()) if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': hgvs_coding = AttributeSet.Attribute.get_valueOf_() elif AttributeSet.Attribute.Type == \ 'HGVS, genomic, top level, previous': hgvs_genome = AttributeSet.Attribute.get_valueOf_() break if chrom and chromStart and chromEnd: if variation_type == 'single nucleotide variant': hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt) # items whose type belong to 'Indel, Insertion, \ # Duplication' might not hava explicit alt information, \ # so we will parse from hgvs_genome elif variation_type == 'Indel': if hgvs_genome: indel_position = hgvs_genome.find('del') indel_alt = hgvs_genome[indel_position+3:] hgvs_id = "chr%s:g.%s_%sdel%s" % \ (chrom, chromStart, chromEnd, indel_alt) elif variation_type == 'Deletion': hgvs_id = "chr%s:g.%s_%sdel" % \ (chrom, chromStart, chromEnd) elif variation_type == 'Insertion': if hgvs_genome: ins_position = hgvs_genome.find('ins') if 'ins' in hgvs_genome: ins_ref = hgvs_genome[ins_position+3:] hgvs_id = "chr%s:g.%s_%sins%s" % \ (chrom, chromStart, chromEnd, ins_ref) elif variation_type == 'Duplication': if hgvs_genome: dup_position = hgvs_genome.find('dup') if 'dup' in hgvs_genome: dup_ref = hgvs_genome[dup_position+3:] hgvs_id = "chr%s:g.%s_%sdup%s" % \ (chrom, chromStart, chromEnd, dup_ref) elif variation_type == 'copy number loss' or\ variation_type == 'copy number gain': if hgvs_genome: hgvs_id = "chr" + hgvs_genome.split('.')[1] +\ hgvs_genome.split('.')[2] elif hgvs_coding: hgvs_id = hgvs_coding coding_hgvs_only = True else: print "couldn't find any id", rcv_accession return else: print 'no measure.attribute', rcv_accession return other_ids = '' rsid = None # loop through XRef to find rsid as well as other ids if Measure.XRef: for XRef in Measure.XRef: if XRef.Type == 'rs': rsid = 'rs' + str(XRef.ID) other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';' # make sure the hgvs_id is not none if hgvs_id: one_snp_json = { "_id": hgvs_id, "clinvar": { 
"allele_id": allele_id, "chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "type": variation_type, "name": name, "gene": { "id": gene_id, "symbol": symbol }, "clinical_significance": clinical_siginificance, "rsid": rsid, "rcv_accession": rcv_accession, "origin": origin, "cytogenic": cytogenic, "review_status": review_status, "hgvs": HGVS, "number_submitters": number_submitters, "last_evaluated": str(last_evaluated), "other_ids": other_ids, "clinvar_id": CLINVAR_ID, "coding_hgvs_only": coding_hgvs_only, "ref": ref, "alt": alt } } obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None])) yield obj
def _map_line_to_json(fields, version='hg19'): # specific variable treatment chrom = fields[0] if chrom == 'M': chrom = 'MT' # fields[7] in version 2, represent hg18_pos if fields[10] == ".": hg18_end = "." else: hg18_end = int(fields[10]) chromStart = int(fields[8]) chromEnd = int(fields[8]) chromStart_38 = int(fields[1]) ref = fields[2].upper() alt = fields[3].upper() HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt) HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt) if version == 'hg19': HGVS = HGVS_19 elif version == 'hg38': HGVS = HGVS_38 if fields[69] == ".": siphy = "." else: freq = fields[69].split(":") siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]} acc = fields[26].rstrip().rstrip(';').split(";") pos = fields[28].rstrip().rstrip(';').split(";") uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos))) # load as json data one_snp_json = { "_id": HGVS, "dbnsfp": { "rsid": fields[6], "chrom": chrom, "hg19": { "start": fields[8], "end": chromEnd }, "hg18": { "start": fields[10], "end": hg18_end }, "hg38": { "start": fields[1], "end": fields[1] }, "ref": ref, "alt": alt, "aa": { "ref": fields[4], "alt": fields[5], "pos": fields[22], "refcodon": fields[13], "codonpos": fields[14], }, "genename": fields[11], "uniprot": uniprot, "interpro_domain": fields[111], "cds_strand": fields[12], "ancestral_allele": fields[16], "ensembl": { "geneid": fields[19], "transcriptid": fields[20] }, "sift": { "score": fields[23], "converted_rankscore": fields[24], "pred": fields[25] }, "polyphen2": { "hdiv": { "score": fields[29], "rankscore": fields[30], "pred": fields[31] }, "hvar": { "score": fields[32], "rankscore": fields[33], "pred": fields[34] } }, "lrt": { "score": fields[35], "converted_rankscore": fields[36], "pred": fields[37], "omega": fields[38] }, "mutationtaster": { "score": fields[39], "converted_rankscore": fields[40], "pred": fields[41], "model": fields[42], "AAE": fields[43] }, "mutationassessor": { "score": 
fields[46], "rankscore": fields[47], "pred": fields[48] }, "fathmm": { "score": fields[49], "rankscore": fields[50], "pred": fields[51] }, "provean": { "score": fields[52], "rankscore": fields[53], "pred": fields[54] }, "metasvm": { "score": fields[55], "rankscore": fields[56], "pred": fields[57] }, "lr": { "score": fields[58], "rankscore": fields[59], "pred": fields[60] }, "reliability_index": fields[61], "gerp++": { "nr": fields[62], "rs": fields[63], "rs_rankscore": fields[64] }, "phylop_7way": { "vertebrate": fields[65], "vertebrate_rankscore": fields[66] }, "phastcons_7way": { "vertebrate": fields[67], "vertebrate_rankscore": fields[68] }, "siphy_29way": { "pi": siphy, "logodds": fields[70], "logodds_rankscore": fields[71] }, "1000gp1": { "ac": fields[72], "af": fields[73], "afr_ac": fields[74], "afr_af": fields[75], "eur_ac": fields[76], "eur_af": fields[77], "amr_ac": fields[78], "amr_af": fields[79], "eas_ac": fields[80], "eas_af": fields[81], "sas_ac": fields[82], "sas_af": fields[83] }, "twinsuk": { "ac": fields[84], "af": fields[85] }, "alspac": { "ac": fields[86], "af": fields[87] }, "esp6500": { "aa_ac": fields[88], "aa_af": fields[89], "ea_ac": fields[90], "ea_af": fields[91] }, "exac": { "ac": fields[92], "af": fields[93], "adj_ac": fields[94], "adj_af": fields[95], "afr_ac": fields[96], "afr_af": fields[97], "amr_ac": fields[98], "amr_af": fields[99], "eas_ac": fields[100], "eas_af": fields[101], "fin_ac": fields[102], "fin_af": fields[103], "nfe_ac": fields[104], "nfe_af": fields[105], "sas_ac": fields[106], "sas_af": fields[107] }, "clinvar": { "rs": fields[108], "clinsig": fields[109], "trait": fields[110] } } } one_snp_json = list_split( dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";") one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"]) return one_snp_json
def _map_line_to_json(fields): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
def annotate_by_snpeff(self, varobj_list):
    """Annotate HGVS ids through ``SNPEFF_CMD`` and yield parsed results.

    Every id in ``varobj_list`` is turned into a VCF record by the matching
    parser/constructor method pair on ``self``; all records are sent to
    ``SNPEFF_CMD`` in one batch on stdin, and each non-header line of its
    stdout is parsed into one document.

    Args:
        varobj_list: iterable of HGVS id strings.

    Yields:
        dict: ``{"id": <hgvs id>, "snpeff": {"ann": ..., "lof": ...,
        "nmd": ..., "vcf": {...}}}`` after unlist and
        ``dict_sweep(vals=['', None])``.

    Raises:
        AssertionError: if the command writes anything to stderr, or a
            three-part INFO column does not carry ``LOF`` second.
    """
    def split_pair(value):
        # "x/y" -> (x, y); an empty value means the pair is absent
        if value:
            (first, second) = value.split('/')
            return first, second
        return None, None

    def parse_gene_effect(info):
        # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
        (gene_id, genename, transcript_count, percent_affected) = \
            info.split('(')[1].split(')')[0].split('|')
        return {
            "gene_id": gene_id,
            "genename": genename,
            "number_of_transcripts_in_gene": transcript_count,
            "percent_of_transcripts_affected": percent_affected
        }

    # title of vcf
    vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
    # extract each item from list, transform into vcf format
    for item in varobj_list:
        if '>' in item:
            parse_id = self.snp_hgvs_id_parser
            build_record = self.snp_vcf_constructer
        elif item.endswith('del'):
            parse_id = self.del_hgvs_id_parser
            build_record = self.del_vcf_constructor
        elif 'ins' in item and 'del' not in item:
            parse_id = self.ins_hgvs_id_parser
            build_record = self.ins_vcf_constructor
        elif 'delins' in item:
            parse_id = self.delins_hgvs_id_parser
            build_record = self.delins_vcf_constructor
        else:
            print(item)
            print('beyond current capacity')
            continue
        hgvs_info = parse_id(item)
        try:
            vcf_stdin += build_record(hgvs_info)
        except TypeError:
            # the id could not be turned into a VCF record; report and skip
            print(item)

    # universal_newlines=True opens the pipes in text mode, so the str input
    # and the str-based parsing below also work on Python 3.
    proc = subprocess.Popen(SNPEFF_CMD,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    (stdout, stderr) = proc.communicate(vcf_stdin)
    # explicit raise instead of ``assert`` so the check survives ``python -O``
    if stderr != '':
        raise AssertionError(stderr)

    for vcf_line in stdout.split('\n'):
        if vcf_line == '' or vcf_line.startswith('#'):
            continue
        info_parts = vcf_line.split(';')
        # assume the first item is 'ANN'
        ann_info = info_parts[0]
        ann = []
        # Multiple annotations per VCF line
        for entry in ann_info.split(','):
            columns = entry.split('|')
            if len(columns) <= 1:
                continue
            (effect, putative_impact, gene_name, gene_id, feature_type,
             feature_id) = columns[1:7]
            (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna, cds,
             protein, distance_to_feature) = columns[7:15]
            (cdna_position, cdna_len) = split_pair(cdna)
            (cds_position, cds_len) = split_pair(cds)
            (protein_position, protein_len) = split_pair(protein)
            (rank, total) = split_pair(exon)
            ann.append({
                "effect": effect,
                "putative_impact": putative_impact,
                "genename": gene_name,
                "gene_id": gene_id,
                "feature_type": feature_type,
                "feature_id": feature_id,
                "transcript_biotype": transcript_biotype,
                "rank": rank,
                "total": total,
                "hgvs.c": hgvs_coding,
                "hgvs.p": hgvs_protein,
                "cdna": {
                    "position": cdna_position,
                    "length": cdna_len
                },
                "cds": {
                    "position": cds_position,
                    "length": cds_len
                },
                "protein": {
                    "position": protein_position,
                    "length": protein_len
                },
                "distance_to_feature": distance_to_feature
            })

        # not all annotations include lof & nmd information.
        # Set them to 'None' as default
        lof = None
        nmd = None
        if len(info_parts) == 3:
            # the case that annotation include 'ann' & 'lof' & 'nmd'
            (lof_info, nmd_info) = info_parts[1:3]
            # assume the second item is 'lof'
            assert lof_info.startswith('LOF')
            lof = parse_gene_effect(lof_info)
            nmd = parse_gene_effect(nmd_info)
        elif len(info_parts) == 2:
            # the case that annotation include 'ann' & 'lof or nmd'
            extra_info = info_parts[1]
            if extra_info.startswith('LOF'):
                lof = parse_gene_effect(extra_info)
            else:
                nmd = parse_gene_effect(extra_info)

        (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
        hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
        one_snp_json = {
            "id": hgvs_id,
            "snpeff": {
                "ann": ann,
                "lof": lof,
                "nmd": nmd,
                "vcf": {
                    "position": pos,
                    "ref": ref,
                    "alt": alt
                }
            }
        }
        yield dict_sweep(unlist(one_snp_json), vals=['', None])
def _map_line_to_json(fields, version):
    """Convert one split dbNSFP row (183-column layout) into a document.

    Args:
        fields: sequence of column values for a single row, addressed by
            position; "." marks a missing value.
        version: assembly used to build ``_id``. "hg19" uses the position in
            ``fields[8]``, "hg38" the position in ``fields[1]``.

    Returns:
        dict with an HGVS-style ``_id`` and a ``dbnsfp`` sub-document, or
        ``None`` when the row has no hg19 position.

    Raises:
        ValueError: if ``version`` is neither "hg19" nor "hg38", or if a
            position column cannot be parsed as an integer.
    """
    def split_scores(column):
        # ";"-separated scores. In multi-valued lists "." becomes None so the
        # remaining values keep their positions; a lone "." is left as-is
        # for the final sweep.
        scores = fields[column].split(';')
        if len(scores) > 1:
            scores = [None if score == '.' else score for score in scores]
        return scores

    def take(start, keys):
        # Label consecutive columns, beginning at ``start``, with ``keys``.
        return dict((key, fields[start + offset])
                    for offset, key in enumerate(keys))

    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    hg18_end = "." if fields[10] == "." else int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Real lists (not lazy ``map`` objects) so the result is the same on
    # Python 2 and Python 3.
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = split_scores(52)
    sift_score = split_scores(23)
    hdiv_score = split_scores(29)
    hvar_score = split_scores(32)
    lrt_score = split_scores(35)
    dann_score = split_scores(69)
    mutationtaster_score = split_scores(39)
    mutationassessor_score = split_scores(46)
    vest3_score = split_scores(57)
    metasvm_score = split_scores(59)
    fathmm_score = split_scores(49)
    lr_score = split_scores(62)
    fathmm_coding_score = split_scores(71)
    integrated_fitcons_score = split_scores(82)
    gm12878_fitcons_score = split_scores(85)
    h1_hesc_fitcons_score = split_scores(88)
    huvec_fitcons_score = split_scores(91)

    exac_keys = ("ac", "af", "adj_ac", "adj_af", "afr_ac", "afr_af",
                 "amr_ac", "amr_af", "eas_ac", "eas_af", "fin_ac", "fin_af",
                 "nfe_ac", "nfe_af", "sas_ac", "sas_af")
    kgp_keys = ("ac", "af", "afr_ac", "afr_af", "eur_ac", "eur_af",
                "amr_ac", "amr_af", "eas_ac", "eas_af", "sas_ac", "sas_af")

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {"start": chromStart, "end": chromEnd},
            "hg18": {"start": fields[10], "end": hg18_end},
            "hg38": {"start": fields[1], "end": fields[1]},
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21],
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25],
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31],
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34],
                },
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38],
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43],
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48],
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51],
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54],
            },
            "vest3": {
                "score": vest3_score,
                # was fields[57], i.e. the score column itself; the rank
                # score sits in the next column, which nothing else read
                "rankscore": fields[58],
                "transcriptid": fields[55],
                "transcriptvar": fields[56],
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74],
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77],
            },
            "eigen-pc": {"raw": fields[78], "raw_rankscore": fields[79]},
            "genocanyon": {"score": fields[80], "rankscore": fields[81]},
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61],
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64],
            },
            "reliability_index": fields[65],
            "dann": {"score": dann_score, "rankscore": fields[70]},
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96],
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84],
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87],
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90],
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93],
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98],
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100],
                },
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102],
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104],
                },
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107],
            },
            "1000gp3": take(108, kgp_keys),         # columns 108-119
            "twinsuk": {"ac": fields[120], "af": fields[121]},
            "alspac": {"ac": fields[122], "af": fields[123]},
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127],
            },
            "exac": take(128, exac_keys),           # columns 128-143
            "exac_nontcga": take(144, exac_keys),   # columns 144-159
            # columns 160-175; the SAS pair (174, 175) used to be skipped
            # although the two parallel ExAC blocks include it
            "exac_nonpsych": take(160, exac_keys),
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179],
            },
            "gtex": gtex,
        },
    }
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # keep chrom a string even if value_convert turned "1" into 1
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(df, version, index=0):
    """Convert one dbNSFP row, addressed by column name, into a document.

    Args:
        df: mapping-like row supporting ``df[column_name]`` lookups whose
            values are strings; "." marks a missing value.
        version: assembly used to build ``_id``. "hg19" uses
            ``pos(1-coor)``, "hg38" uses ``hg38_pos``.
        index: accepted for signature compatibility; not used here.

    Returns:
        dict with an HGVS-style ``_id`` and a ``dbnsfp`` sub-document, or
        ``None`` when the row has no ``pos(1-coor)`` value.

    Raises:
        ValueError: if ``version`` is neither "hg19" nor "hg38", or if a
            position column cannot be parsed as an integer.
    """
    def split_scores(column):
        # ";"-separated scores with "." replaced by None
        return [None if item == '.' else item
                for item in df[column].split(';')]

    def modify_pvalue(pvalue):
        # drop the leading "P = " characters and parse the number
        return float(pvalue.strip('P = '))

    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    hg18_end = df["hg18_pos(1-coor)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    chromStart = int(df["pos(1-coor)"])
    chromEnd = chromStart
    chromStart_38 = int(df["hg38_pos"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")
    revel_score = split_scores("REVEL_score")

    # parse mutpred top 5 features
    mutpred_raw = df["MutPred_Top5features"]
    if mutpred_raw not in ['.', ',', '-']:
        # Each ";"-separated entry is split on " (" after dropping trailing
        # ")" -- presumably "<mechanism> (P = <value>)" -- giving a flat
        # [mechanism, p-value, mechanism, p-value, ...] list.
        # (The original ``split(" (") and split(";")`` always evaluated to
        # the ``split(";")`` result.)
        flat = []
        for feature in mutpred_raw.split(";"):
            flat.extend(feature.rstrip(")").split(" ("))
        # exactly five (mechanism, p-value) pairs are expected
        mechanisms = [{
            "mechanism": flat[i],
            "p_val": modify_pvalue(flat[i + 1])
        } for i in range(0, 10, 2)]
    else:
        mechanisms = '.'

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            "chrom": chrom,
            "hg19": {"start": chromStart, "end": chromEnd},
            "hg18": {"start": df["hg18_pos(1-coor)"], "end": hg18_end},
            "hg38": {"start": df["hg38_pos"], "end": df["hg38_pos"]},
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": uniprot,
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                    df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                        df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore":
                        df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                        df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore":
                        df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore":
                        df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                # NOTE(review): this key ends with a space, unlike every
                # other column name used here -- confirm it matches the
                # header of the input file.
                "ea_af": df["ESP6500_EA_AF "]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": [
                    int(i) for i in df["clinvar_clnsig"].split("|")
                    if i != "."
                ],
                "trait": [
                    i for i in df["clinvar_trait"].split("|") if i != "."
                ],
                "golden_stars": [
                    int(i) for i in df["clinvar_golden_stars"].split("|")
                    if i != "."
                ]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", None]), ";")
    # keep chrom a string even if number conversion turned "1" into 1
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(df, version, index):
    """Convert row ``index`` of a dbNSFP table into a nested document.

    Args:
        df: table offering scalar access by (row label, column name), via
            ``df.get_value(index, column)`` or, when that method is missing,
            ``df.at[index, column]``. Cell values are strings; "." marks a
            missing value.
        version: assembly used to build ``_id``. "hg19" uses
            ``hg19_pos(1-based)``, "hg38" uses ``pos(1-based)``.
        index: row label to read.

    Returns:
        dict with an HGVS-style ``_id`` and a ``dbnsfp`` sub-document, or
        ``None`` when the row has no hg19 position.

    Raises:
        ValueError: if ``version`` is neither "hg19" nor "hg38", or if a
            position column cannot be parsed as an integer.
    """
    # ``get_value`` no longer exists in recent pandas releases; keep using
    # it when present and fall back to the ``.at`` accessor otherwise.
    legacy_getter = getattr(df, "get_value", None)
    if legacy_getter is not None:
        def cell(column):
            return legacy_getter(index, column)
    else:
        def cell(column):
            return df.at[index, column]

    def split_scores(column):
        # ";"-separated scores. The original built the normalised list and
        # threw it away, so "." was never replaced. As in the positional
        # variant of this parser, only multi-valued lists are rewritten so
        # values keep their positions; a lone "." is left for the final
        # sweep, which only removes ".".
        scores = cell(column).split(';')
        if len(scores) > 1:
            scores = [None if score == '.' else score for score in scores]
        return scores

    def by_suffix(prefix, suffixes):
        # {"afr_ac": cell(prefix + "AFR_AC"), ...}
        return dict((suffix.lower(), cell(prefix + suffix))
                    for suffix in suffixes)

    # specific variable treatment
    chrom = cell("#chr")
    if chrom == 'M':
        chrom = 'MT'
    hg18_end = cell("hg18_pos(1-based)")
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if cell("hg19_pos(1-based)") == '.':
        return None
    chromStart = int(cell("hg19_pos(1-based)"))
    chromEnd = chromStart
    chromStart_38 = int(cell("pos(1-based)"))
    ref = cell("ref").upper()
    alt = cell("alt").upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = cell("SiPhy_29way_pi")
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Real lists (not lazy ``map`` objects) so the result is the same on
    # Python 2 and Python 3.
    gtex_gene = cell("GTEx_V6_gene").split('|')
    gtex_tissue = cell("GTEx_V6_tissue").split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]
    acc = cell("Uniprot_acc_Polyphen2").rstrip().rstrip(';').split(";")
    pos = cell("Uniprot_aapos_Polyphen2").rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")

    exac_suffixes = ("AC", "AF", "Adj_AC", "Adj_AF", "AFR_AC", "AFR_AF",
                     "AMR_AC", "AMR_AF", "EAS_AC", "EAS_AF", "FIN_AC",
                     "FIN_AF", "NFE_AC", "NFE_AF", "SAS_AC", "SAS_AF")
    kgp_suffixes = ("AC", "AF", "AFR_AC", "AFR_AF", "EUR_AC", "EUR_AF",
                    "AMR_AC", "AMR_AF", "EAS_AC", "EAS_AF", "SAS_AC",
                    "SAS_AF")

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": cell("rs_dbSNP147"),
            "chrom": chrom,
            "hg19": {"start": chromStart, "end": chromEnd},
            "hg18": {"start": cell("hg18_pos(1-based)"), "end": hg18_end},
            "hg38": {
                "start": cell("pos(1-based)"),
                "end": cell("pos(1-based)")
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": cell("aaref"),
                "alt": cell("aaalt"),
                "pos": cell("aapos"),
                "refcodon": cell("refcodon"),
                "codonpos": cell("codonpos"),
                "codon_degeneracy": cell("codon_degeneracy"),
            },
            "genename": cell("genename"),
            "uniprot": uniprot,
            "interpro_domain": cell("Interpro_domain"),
            "cds_strand": cell("cds_strand"),
            "ancestral_allele": cell("Ancestral_allele"),
            "ensembl": {
                "geneid": cell("Ensembl_geneid"),
                "transcriptid": cell("Ensembl_transcriptid"),
                "proteinid": cell("Ensembl_proteinid")
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": cell("SIFT_converted_rankscore"),
                "pred": cell("SIFT_pred")
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": cell("Polyphen2_HDIV_rankscore"),
                    "pred": cell("Polyphen2_HDIV_pred")
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": cell("Polyphen2_HVAR_rankscore"),
                    "pred": cell("Polyphen2_HVAR_pred")
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": cell("LRT_converted_rankscore"),
                "pred": cell("LRT_pred"),
                "omega": cell("LRT_Omega")
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                    cell("MutationTaster_converted_rankscore"),
                "pred": cell("MutationTaster_pred"),
                "model": cell("MutationTaster_model"),
                "AAE": cell("MutationTaster_AAE")
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": cell("MutationAssessor_score_rankscore"),
                "pred": cell("MutationAssessor_pred")
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": cell("FATHMM_converted_rankscore"),
                "pred": cell("FATHMM_pred")
            },
            "provean": {
                "score": provean_score,
                "rankscore": cell("PROVEAN_converted_rankscore"),
                "pred": cell("PROVEAN_pred")
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": cell("VEST3_rankscore"),
                "transcriptid": cell("Transcript_id_VEST3"),
                "transcriptvar": cell("Transcript_var_VEST3")
            },
            "fathmm-mkl": {
                "coding_score": cell("fathmm-MKL_coding_score"),
                "coding_rankscore": cell("fathmm-MKL_coding_rankscore"),
                "coding_pred": cell("fathmm-MKL_coding_pred"),
                "coding_group": cell("fathmm-MKL_coding_group")
            },
            "eigen": {
                "coding_or_noncoding": cell("Eigen_coding_or_noncoding"),
                "raw": cell("Eigen-raw"),
                "phred": cell("Eigen-phred")
            },
            "eigen-pc": {
                "raw": cell("Eigen-PC-raw"),
                "phred": cell("Eigen-PC-phred"),
                "raw_rankscore": cell("Eigen-PC-raw_rankscore")
            },
            "genocanyon": {
                "score": cell("GenoCanyon_score"),
                "rankscore": cell("GenoCanyon_score_rankscore")
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": cell("MetaSVM_rankscore"),
                "pred": cell("MetaSVM_pred")
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": cell("MetaLR_rankscore"),
                "pred": cell("MetaLR_pred")
            },
            "reliability_index": cell("Reliability_index"),
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": cell("M-CAP_rankscore"),
                "pred": cell("M-CAP_pred")
            },
            "dann": {
                "score": cell("DANN_score"),
                "rankscore": cell("DANN_rankscore")
            },
            "gerp++": {
                "nr": cell("GERP++_NR"),
                "rs": cell("GERP++_RS"),
                "rs_rankscore": cell("GERP++_RS_rankscore")
            },
            "integrated": {
                "fitcons_score": cell("integrated_fitCons_score"),
                "fitcons_rankscore":
                    cell("integrated_fitCons_score_rankscore"),
                "confidence_value": cell("integrated_confidence_value")
            },
            "gm12878": {
                "fitcons_score": cell("GM12878_fitCons_score"),
                "fitcons_rankscore": cell("GM12878_fitCons_score_rankscore"),
                "confidence_value": cell("GM12878_confidence_value")
            },
            "h1-hesc": {
                "fitcons_score": cell("H1-hESC_fitCons_score"),
                "fitcons_rankscore": cell("H1-hESC_fitCons_score_rankscore"),
                "confidence_value": cell("H1-hESC_confidence_value")
            },
            "huvec": {
                "fitcons_score": cell("HUVEC_fitCons_score"),
                "fitcons_rankscore": cell("HUVEC_fitCons_score_rankscore"),
                "confidence_value": cell("HUVEC_confidence_value")
            },
            "phylo": {
                "p100way": {
                    "vertebrate": cell("phyloP100way_vertebrate"),
                    "vertebrate_rankscore":
                        cell("phyloP100way_vertebrate_rankscore")
                },
                "p20way": {
                    "mammalian": cell("phyloP20way_mammalian"),
                    "mammalian_rankscore":
                        cell("phyloP20way_mammalian_rankscore")
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": cell("phastCons100way_vertebrate"),
                    "vertebrate_rankscore":
                        cell("phastCons100way_vertebrate_rankscore")
                },
                "20way": {
                    "mammalian": cell("phastCons20way_mammalian"),
                    "mammalian_rankscore":
                        cell("phastCons20way_mammalian_rankscore")
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": cell("SiPhy_29way_logOdds"),
                "logodds_rankscore": cell("SiPhy_29way_logOdds_rankscore")
            },
            "1000gp3": by_suffix("1000Gp3_", kgp_suffixes),
            "twinsuk": {
                "ac": cell("TWINSUK_AC"),
                "af": cell("TWINSUK_AF")
            },
            "alspac": {
                "ac": cell("ALSPAC_AC"),
                "af": cell("ALSPAC_AF")
            },
            "esp6500": {
                "aa_ac": cell("ESP6500_AA_AC"),
                "aa_af": cell("ESP6500_AA_AF"),
                "ea_ac": cell("ESP6500_EA_AC"),
                "ea_af": cell("ESP6500_EA_AF")
            },
            "exac": by_suffix("ExAC_", exac_suffixes),
            "exac_nontcga": by_suffix("ExAC_nonTCGA_", exac_suffixes),
            "exac_nonpsych": by_suffix("ExAC_nonpsych_", exac_suffixes),
            "clinvar": {
                "rs": cell("clinvar_rs"),
                "clinsig": cell("clinvar_clnsig"),
                "trait": cell("clinvar_trait"),
                "golden_stars": cell("clinvar_golden_stars")
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # keep chrom a string even if value_convert turned "1" into 1
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields):
    """Yield the CADD annotation document for one parsed input row.

    ``fields`` is the row already split into columns and must hold exactly
    ``VALID_COLUMN_NO`` entries (enforced with an ``assert``).  The document
    id is whatever ``get_hgvs_from_vcf`` returns for columns 0, 1, 2 and 4;
    when that is ``None`` the generator finishes without yielding anything.
    Otherwise a single ``{"_id": ..., "cadd": {...}}`` document is yielded
    after being passed through ``value_convert``, ``unlist`` and
    ``dict_sweep(..., ["NA"])``.
    """
    assert len(fields) == VALID_COLUMN_NO

    variant_id = get_hgvs_from_vcf(fields[0], fields[1], fields[2], fields[4])
    if variant_id is None:
        return

    # Nested sections are assembled first and then slotted into the document
    # in the same key order as the input columns.
    mapability = {"20bp": fields[15], "35bp": fields[16]}
    phast_cons = {
        "primate": fields[18],
        "mammalian": fields[19],
        "vertebrate": fields[20],
    }
    phylop = {
        "primate": fields[21],
        "mammalian": fields[22],
        "vertebrate": fields[23],
    }
    gerp = {
        "n": fields[24],
        "s": fields[25],
        "rs": fields[26],
        "rs_pval": fields[27],
    }
    dna = {
        "helt": fields[30],
        "mgw": fields[31],
        "prot": fields[32],
        "roll": fields[33],
    }
    mirsvr = {"score": fields[34], "e": fields[35], "aln": fields[36]}
    chmm = {
        "tssa": fields[39],
        "tssaflnk": fields[40],
        "txflnk": fields[41],
        "tx": fields[42],
        "txwk": fields[43],
        "enh": fields[44],
        # NOTE(review): column 45 is deliberately not mapped here (a second
        # 'enh' entry for it was disabled) -- confirm against the column spec.
        "znfrpts": fields[46],
        "het": fields[47],
        "tssbiv": fields[48],
        "bivflnk": fields[49],
        "enhbiv": fields[50],
        "reprpc": fields[51],
        "reprpcwk": fields[52],
        "quies": fields[53],
    }
    encode = {
        "exp": fields[54],
        "h3k27ac": fields[55],
        "h3k4me1": fields[56],
        "h3k4me3": fields[57],
        "nucleo": fields[58],
        "occ": fields[59],
        "p_val": {
            "comb": fields[60],
            "dnas": fields[61],
            "faire": fields[62],
            "polii": fields[63],
            "ctcf": fields[64],
            "mycp": fields[65],
        },
        "sig": {
            "dnase": fields[66],
            "faire": fields[67],
            "polii": fields[68],
            "ctcf": fields[69],
            "myc": fields[70],
        },
    }
    motif = {
        "toverlap": fields[72],
        "dist": fields[73],
        "ecount": fields[74],
        "ename": fields[75],
        "ehipos": fields[76],
        "escorechng": fields[77],
    }
    tf = {
        "bs": fields[78],
        "bs_peaks": fields[79],
        "bs_peaks_max": fields[80],
    }
    esp = {"af": fields[82], "afr": fields[83], "eur": fields[84]}
    thousand_genomes = {
        "af": fields[85],
        "asn": fields[86],
        "amr": fields[87],
        "afr": fields[88],
        "eur": fields[89],
    }
    gene = {
        "gene_id": fields[92],
        "feature_id": fields[93],
        "ccds_id": fields[94],
        "genename": fields[95],
        "cds": {
            "cdna_pos": fields[96],
            "rel_cdna_pos": fields[97],
            "cds_pos": fields[98],
            "rel_cds_pos": fields[99],
        },
        "prot": {
            "protpos": fields[100],
            "rel_prot_pos": fields[101],
            "domain": fields[102],
        },
    }
    polyphen = {"cat": fields[110], "val": fields[111]}
    sift = {"cat": fields[112], "val": fields[113]}

    annotation = {
        "chrom": fields[0],
        "pos": fields[1],
        "ref": fields[2],
        "anc": fields[3],
        "alt": fields[4],
        "type": fields[5],
        "length": fields[6],
        "istv": fields[7],
        "isderived": fields[8],
        "annotype": fields[9],
        "consequence": fields[10],
        "consscore": fields[11],
        "consdetail": fields[12],
        "gc": fields[13],
        "cpg": fields[14],
        "mapability": mapability,
        "scoresegdup": fields[17],
        "phast_cons": phast_cons,
        "phylop": phylop,
        "gerp": gerp,
        "bstatistic": fields[28],
        "mutindex": fields[29],
        "dna": dna,
        "mirsvr": mirsvr,
        "targetscans": fields[37],
        "fitcons": fields[38],
        "chmm": chmm,
        "encode": encode,
        "segway": fields[71],
        "motif": motif,
        "tf": tf,
        "isknownvariant": fields[81],
        "esp": esp,
        "1000g": thousand_genomes,
        "min_dist_tss": fields[90],
        "min_dist_tse": fields[91],
        "gene": gene,
        "dst2splice": fields[103],
        "dst2spltype": fields[104],
        "exon": fields[105],
        "intron": fields[106],
        "oaa": fields[107],  # ref aa
        "naa": fields[108],  # alt aa
        "grantham": fields[109],
        "polyphen": polyphen,
        "sift": sift,
        "rawscore": fields[114],  # raw CADD score
        "phred": fields[115],  # log-percentile of raw CADD score
    }

    document = {"_id": variant_id, "cadd": annotation}
    yield dict_sweep(unlist(value_convert(document)), ["NA"])
def _map_line_to_json(df, version, index=0):
    """Convert one dbNSFP row into a ``{"_id": ..., "dbnsfp": {...}}`` document.

    Args:
        df: mapping-like row keyed by dbNSFP column name; every value is read
            with ``df[column]`` and treated as a string.
        version: ``'hg19'`` or ``'hg38'``; selects which position is used to
            build the ``_id``.
        index: unused; kept so existing callers keep working.

    Returns:
        The document after ``value_convert_to_number``, ``unlist``,
        ``dict_sweep(vals=[".", None])`` and ``list_split(..., ";")``.
        ``None`` is returned when the row has no hg19 position, or when
        ``version`` is ``'hg38'`` and the row has no hg38 position.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``.
    """

    def split_scores(column):
        # Scores are ';'-separated; a '.' entry becomes None.
        return [None if item == '.' else item
                for item in df[column].split(';')]

    def parse_mutpred(raw):
        # Turn "name (P = x);name (P = y);..." into mechanism/p-value dicts.
        if raw in ('.', ',', '-'):
            return '.'
        tokens = []
        for feature in raw.split(";"):
            tokens.extend(feature.rstrip(")").split(" ("))
        # Tokens alternate mechanism, p-value.  At most five pairs are kept;
        # shorter lists yield fewer entries instead of raising IndexError.
        pairs = list(zip(tokens[0::2], tokens[1::2]))[:5]
        return [
            {"mechanism": name, "p_val": float(pval.strip('P = '))}
            for name, pval in pairs
        ]

    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'

    # hg18 position: '.' is passed through untouched, anything else is an int
    hg18_end = df["hg18_pos(1-coor)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)

    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    chromStart = int(df["pos(1-coor)"])
    chromEnd = chromStart

    ref = df["ref"].upper()
    alt = df["alt"].upper()

    # Only parse the position the requested build needs, so that a missing
    # hg38 coordinate no longer breaks hg19 documents.
    if version == 'hg19':
        id_pos = chromStart
    elif version == 'hg38':
        if df["hg38_pos"] == '.':
            return None
        id_pos = int(df["hg38_pos"])
    else:
        raise ValueError(
            "version must be 'hg19' or 'hg38', got %r" % (version,))
    HGVS = "chr%s:g.%d%s>%s" % (chrom, id_pos, ref, alt)

    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")
    revel_score = split_scores("REVEL_score")

    mechanisms = parse_mutpred(df["MutPred_Top5features"])

    # Prefer the clean column name; fall back to the variant with a trailing
    # space in case some input header really carries it (TODO confirm).
    esp_ea_af_column = "ESP6500_EA_AF"
    if esp_ea_af_column not in df:
        esp_ea_af_column = "ESP6500_EA_AF "

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-coor)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["hg38_pos"],
                "end": df["hg38_pos"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": uniprot,
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore": df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore": df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore": df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                "ea_af": df[esp_ea_af_column]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": [int(i) for i in df["clinvar_clnsig"].split("|")
                            if i != "."],
                "trait": [i for i in df["clinvar_trait"].split("|")
                          if i != "."],
                "golden_stars": [
                    int(i) for i in df["clinvar_golden_stars"].split("|")
                    if i != "."
                ]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", None]),
        ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields):
    """Build a ``{"sorter", "_id", "cosmic"}`` document from one COSMIC row.

    ``fields`` is the row split into columns and must hold exactly
    ``VALID_COLUMN_NO`` entries.  Column 17 is tokenised into chromosome,
    start and end; column 13 (the CDS change) decides which id pattern is
    used, tried in this order: substitution, insertion, deletion,
    deletion-insertion.  When none applies, column 12 is used as the id and
    a diagnostic line is printed.

    Returns the document after ``value_convert`` and
    ``dict_sweep(vals=[""])``, or ``None`` if no id could be determined.
    """
    assert len(fields) == VALID_COLUMN_NO

    # Mutation GRCh37 genome position
    chr_info = re.findall(r"[\w']+", fields[17])
    chrom = chr_info[0]
    chromStart = chr_info[1]
    chromEnd = chr_info[2]

    cds = fields[13]
    sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    delete = 'del' in cds
    del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds)

    HGVS = None
    if sub:
        HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group())
    elif ins:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group())
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif del_ins:
        # The inserted sequence is the first run of base letters in the CDS
        # string; del_ins matching guarantees that such a run exists.
        comp = re.search(r'[ATCGMN]+', cds)
        HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd,
                                          comp.group())
    else:
        HGVS = fields[12]
        # Single pre-formatted argument: prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Error2: %s %s %s" % (fields[15], cds, fields[17]))

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "sorter": fields[17] + fields[13],
        "_id": HGVS,
        "cosmic": {
            "gene": {
                "symbol": fields[0],  # Gene name
                "id": fields[3],  # HGNC ID
                "cds_length": fields[2]
            },
            "transcript": fields[1],  # Accession Number
            "sample": {
                "name": fields[4],  # Sample name
                "id": fields[5]  # ID_sample
            },
            "tumour": {
                "id": fields[6],  # ID_tumour
                "primary_site": fields[7],  # Primary site
                "site_subtype": fields[8],  # Site subtype
                "primary_histology": fields[9],  # Primary histology
                "histology_subtype": fields[10],  # Histology subtype
                # NOTE(review): same column as "transcript" above; looks
                # like it may be the wrong index -- confirm against the
                # export's column list before changing.
                "origin": fields[1]
            },
            "mutation": {
                "id": "COSM" + fields[12],  # Mutation ID
                "cds": cds,  # Mutation CDS
                "aa": fields[14],  # Mutation AA
                "description": fields[15],  # Mutation Description
                "zygosity": fields[16],  # Mutation zygosity
                "somatic_status": fields[21]  # Mutation somatic status
            },
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "pubmed": fields[22]  # Pubmed_PMID
        }
    }
    return dict_sweep(value_convert(one_snp_json), vals=[""])
def _map_line_to_json(fields):
    """Build a ``{"_id": ..., "cadd": {...}}`` document from one CADD row.

    ``fields`` is the row split into columns.  Rows whose length differs from
    ``VALID_COLUMN_NO`` are ignored and ``None`` is returned.  The id is
    formatted as ``chr<col0>:g.<col1><col2>><col4>``.

    Returns the document after ``value_convert``, ``unlist`` and
    ``dict_sweep(..., ["NA"])``.
    """
    if len(fields) != VALID_COLUMN_NO:
        return None

    HGVS = "chr%s:g.%s%s>%s" % (fields[0], fields[1], fields[2], fields[4])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'encode': {
                'exp': fields[29],
                'h3k27ac': fields[30],
                'h3k4me1': fields[31],
                'h3k4me3': fields[32],
                'nucleo': fields[33],
                'occ': fields[34],
                'p_val': {
                    'comb': fields[35],
                    'dnas': fields[36],
                    'faire': fields[37],
                    'polii': fields[38],
                    'ctcf': fields[39],
                    'mycp': fields[40]
                },
                'sig': {
                    'dnase': fields[41],
                    'faire': fields[42],
                    'polii': fields[43],
                    'ctcf': fields[44],
                    'myc': fields[45]
                },
            },
            'segway': fields[46],
            'motif': {
                'toverlap': fields[47],
                'dist': fields[48],
                'ecount': fields[49],
                'ename': fields[50],
                'ehipos': fields[51],
                'escorechng': fields[52]
            },
            'tf': {
                'bs': fields[53],
                'bs_peaks': fields[54],
                'bs_peaks_max': fields[55]
            },
            'isknownvariant': fields[56],
            'esp': {
                'af': fields[57],
                'afr': fields[58],
                'eur': fields[59]
            },
            '1000g': {
                'af': fields[60],
                'asn': fields[61],
                'amr': fields[62],
                'afr': fields[63],
                'eur': fields[64]
            },
            'min_dist_tss': fields[65],
            'min_dist_tse': fields[66],
            'gene': {
                'gene_id': fields[67],
                'feature_id': fields[68],
                'ccds_id': fields[69],
                'genename': fields[70],
                'cds': {
                    'cdna_pos': fields[71],
                    'rel_cdna_pos': fields[72],
                    'cds_pos': fields[73],
                    'rel_cds_pos': fields[74]
                },
                'prot': {
                    'protpos': fields[75],
                    'rel_prot_pos': fields[76],
                    'oaa': fields[81],
                    'naa': fields[82]
                },
                'dst_2_splice': fields[77],
                'dst_2_spltype': fields[78],
                'exon': fields[79],
                'intron': fields[80]
            },
            'grantham': fields[83],
            'polyphen': {
                'cat': fields[84],
                'val': fields[85]
            },
            'sift': {
                'cat': fields[86],
                'val': fields[87]
            },
            'rawscore': fields[88],
            'phred': fields[89]
        }
    }
    # The placeholder goes in as a one-element list, like the other
    # dict_sweep calls in this file.  A bare "NA" string would turn a
    # ``value in vals`` test into substring matching, which also hits "N",
    # "A" and "" (assumes dict_sweep tests membership that way -- confirm).
    return dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
def _map_line_to_json(fields):
    """Build a ``{"_id": ..., "clinvar": {...}}`` document from one ClinVar row.

    ``fields`` is the row split into columns and must hold exactly
    ``VALID_COLUMN_NO`` entries.  REF/ALT alleles are looked up through the
    module-level ``vcf_reader`` at (column 13, column 14); rows without a
    start position or without a matching record return ``None``.

    The id pattern is chosen in this priority order: SNP substitution,
    insertion, deletion, indel, duplication, inversion, generic replacement.
    When no pattern yields an id, a diagnostic line is printed and ``None``
    is returned.

    Returns the document after ``value_convert``, ``unlist`` and
    ``dict_sweep(vals=["-"])``.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    # Only the part after the first ':' of the coding HGVS name is inspected.
    cds = fields[18].split(":")[1]

    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)

    # parse from vcf file. Input chrom number
    # and chromStart, and return REF, ALT
    if not chromStart:
        return
    record = vcf_reader.fetch(chrom, int(chromStart))
    if not record:
        return

    first_alt = record.ALT[0]
    if record.is_snp and len(first_alt) < 2:
        mod = [record.REF, first_alt]
    else:
        mod = first_alt

    HGVS = None
    # The elif order encodes the priority: an insertion wins over a
    # deletion, which wins over an indel, and so on.
    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            # Single pre-formatted argument: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    # load as json data
    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": 'rs' + str(fields[6]),
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])