示例#1
0
def _map_line_to_json(df, version, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end == ".":
        hg18_end = "."
    else:
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V6_gene"].split('|')
    gtex_tissue = df["GTEx_V6_tissue "].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))
    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(" (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [
            {"mechanism": mutpred_mechanisms[0],
             "p_val": modify_pvalue(mutpred_mechanisms[1])},
            {"mechanism": mutpred_mechanisms[2],
             "p_val": modify_pvalue(mutpred_mechanisms[3])},
            {"mechanism": mutpred_mechanisms[4],
             "p_val": modify_pvalue(mutpred_mechanisms[5])},
            {"mechanism": mutpred_mechanisms[6],
             "p_val": modify_pvalue(mutpred_mechanisms[7])},
            {"mechanism": mutpred_mechanisms[8],
             "p_val": modify_pvalue(mutpred_mechanisms[9])}
        ]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

# load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": list(map(int,[i for i in df["clinvar_clnsig"].split("|") if i != "."])),
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": list(map(int,[i for i in df["clinvar_golden_stars"].split("|") if i != "."]))
            },
            "gtex": list(gtex)
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#2
0
def _map_line_to_json(fields, dbsnp_col):
    """Turn one GRASP row into an annotation document for its first dbSNP hit.

    Args:
        fields: Indexable row of column values; its length must equal
            ``VALID_COLUMN_NO``.  Column 8 holds the rsid.
        dbsnp_col: Object exposing ``find(query)``; it is queried with
            ``{"dbsnp.rsid": rsid}`` (presumably a MongoDB collection --
            confirm against the caller).

    Returns:
        ``None`` when the rsid is ``None`` or the lookup yields nothing.
        Otherwise the document built for the FIRST match only (every match
        is fetched, but later ones are ignored), passed through
        ``value_convert_to_number``, ``unlist``, ``dict_sweep`` (with
        ``[""]``) and ``list_split`` (on ``","``).
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    if rsid is None:
        return None

    matches = list(dbsnp_col.find({"dbsnp.rsid": rsid}))
    if not matches:
        return None

    # The ancestry columns come in the same order for both sample groups:
    # discovery starts at column 25, replication at column 38.
    ancestry_keys = (
        'total_samples', 'european', 'african', 'east_asian',
        'indian_south_asian', 'hispanic', 'native', 'micronesian',
        'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian',
    )
    discovery = {
        key: fields[col] for col, key in enumerate(ancestry_keys, 25)
    }
    replication = {
        key: fields[col] for col, key in enumerate(ancestry_keys, 38)
    }

    publication = {
        'journal': fields[16],
        'title': fields[17],
        'pmid': fields[7],
        'snpid': fields[8],
        'location_within_paper': fields[9],
        'p_value': fields[10],
        'phenotype': fields[11],
        'paper_phenotype_description': fields[12],
        'paper_phenotype_categories': fields[13],
        'date_pub': fields[14],
    }

    grasp = {
        'hg19': {'chr': fields[5], 'pos': fields[6]},
        'hupfield': fields[1],
        'last_curation_date': fields[2],
        'creation_date': fields[3],
        'srsid': fields[4],
        'publication': publication,
        'includes_male_female_only_analyses': fields[18],
        'exclusively_male_female': fields[19],
        'initial_sample_description': fields[20],
        'replication_sample_description': fields[21],
        'platform_snps_passing_qc': fields[22],
        'gwas_ancestry_description': fields[23],
        'discovery': discovery,
        'replication': replication,
        'in_gene': fields[51],
        'nearest_gene': fields[52],
        'in_lincrna': fields[53],
        'in_mirna': fields[54],
        'in_mirna_bs': fields[55],
        'oreg_anno': fields[61],
        'conserv_pred_tfbs': fields[62],
        'human_enhancer': fields[63],
        'rna_edit': fields[64],
        'polyphen2': fields[65],
        'sift': fields[66],
        'ls_snp': fields[67],
        'uniprot': fields[68],
        'eqtl_meth_metab_study': fields[69],
    }

    # load as json data, keyed by the _id of the first matching dbSNP record
    document = {"_id": matches[0]['_id'], "grasp": grasp}
    converted = value_convert_to_number(document)
    swept = dict_sweep(unlist(converted), [""])
    return list_split(swept, ",")
def _map_line_to_json(fields, dbsnp_col):
    """Build the GRASP document for the first dbSNP record matching a row.

    ``fields`` is one row of column values (exactly ``VALID_COLUMN_NO`` of
    them); column 8 is the rsid, which is looked up in ``dbsnp_col`` through
    ``find({"dbsnp.rsid": rsid})``.

    Returns ``None`` if the rsid is ``None`` or no record matches.  When
    several records match, all are fetched but only the first one supplies
    the ``_id``.  The assembled document is handed to
    ``value_convert_to_number``, ``unlist``, ``dict_sweep`` (with ``[""]``)
    and ``list_split`` (on ``","``) before being returned.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    if rsid is None:
        return None
    hits = [record for record in dbsnp_col.find({"dbsnp.rsid": rsid})]
    if len(hits) == 0:
        return None

    def pick(layout):
        # layout: (output key, column number) pairs, in output order
        return {key: fields[column] for key, column in layout}

    # Shared key order of the discovery / replication ancestry columns.
    ancestry = ('total_samples', 'european', 'african', 'east_asian',
                'indian_south_asian', 'hispanic', 'native', 'micronesian',
                'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian')

    # Keys are added in the same order as the final document layout.
    grasp = {'hg19': pick((('chr', 5), ('pos', 6)))}
    grasp.update(pick((
        ('hupfield', 1),
        ('last_curation_date', 2),
        ('creation_date', 3),
        ('srsid', 4),
    )))
    grasp['publication'] = pick((
        ('journal', 16),
        ('title', 17),
        ('pmid', 7),
        ('snpid', 8),
        ('location_within_paper', 9),
        ('p_value', 10),
        ('phenotype', 11),
        ('paper_phenotype_description', 12),
        ('paper_phenotype_categories', 13),
        ('date_pub', 14),
    ))
    grasp.update(pick((
        ('includes_male_female_only_analyses', 18),
        ('exclusively_male_female', 19),
        ('initial_sample_description', 20),
        ('replication_sample_description', 21),
        ('platform_snps_passing_qc', 22),
        ('gwas_ancestry_description', 23),
    )))
    # discovery occupies columns 25-37, replication columns 38-50
    grasp['discovery'] = pick(zip(ancestry, range(25, 38)))
    grasp['replication'] = pick(zip(ancestry, range(38, 51)))
    grasp.update(pick((
        ('in_gene', 51),
        ('nearest_gene', 52),
        ('in_lincrna', 53),
        ('in_mirna', 54),
        ('in_mirna_bs', 55),
        ('oreg_anno', 61),
        ('conserv_pred_tfbs', 62),
        ('human_enhancer', 63),
        ('rna_edit', 64),
        ('polyphen2', 65),
        ('sift', 66),
        ('ls_snp', 67),
        ('uniprot', 68),
        ('eqtl_meth_metab_study', 69),
    )))

    # load as json data
    one_snp_json = {"_id": hits[0]['_id'], "grasp": grasp}
    return list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)), [""]),
        ",")
示例#4
0
def _map_line_to_json(df, version, include_gnomad, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end == ".":
        hg18_end = "."
    else:
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V6p_gene"].split('|')
    gtex_tissue = df["GTEx_V6p_tissue"].split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))

    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(
            " (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [{
            "mechanism": mutpred_mechanisms[0],
            "p_val": modify_pvalue(mutpred_mechanisms[1])
        }, {
            "mechanism": mutpred_mechanisms[2],
            "p_val": modify_pvalue(mutpred_mechanisms[3])
        }, {
            "mechanism": mutpred_mechanisms[4],
            "p_val": modify_pvalue(mutpred_mechanisms[5])
        }, {
            "mechanism": mutpred_mechanisms[6],
            "p_val": modify_pvalue(mutpred_mechanisms[7])
        }, {
            "mechanism": mutpred_mechanisms[8],
            "p_val": modify_pvalue(mutpred_mechanisms[9])
        }]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    gnomad = {
        "gnomad_exomes": {
            "ac": df["gnomAD_exomes_AC"],
            "an": df["gnomAD_exomes_AN"],
            "af": df["gnomAD_exomes_AF"],
            "afr_ac": df["gnomAD_exomes_AFR_AC"],
            "afr_af": df["gnomAD_exomes_AFR_AF"],
            "afr_an": df["gnomAD_exomes_AFR_AN"],
            "amr_ac": df["gnomAD_exomes_AMR_AC"],
            "amr_an": df["gnomAD_exomes_AMR_AN"],
            "amr_af": df["gnomAD_exomes_AMR_AF"],
            "asj_ac": df["gnomAD_exomes_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_ASJ_AF"],
            "eas_ac": df["gnomAD_exomes_EAS_AC"],
            "eas_af": df["gnomAD_exomes_EAS_AF"],
            "eas_an": df["gnomAD_exomes_EAS_AN"],
            "fin_ac": df["gnomAD_exomes_FIN_AC"],
            "fin_af": df["gnomAD_exomes_FIN_AF"],
            "fin_an": df["gnomAD_exomes_FIN_AN"],
            "nfe_ac": df["gnomAD_exomes_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_NFE_AN"],
            "sas_ac": df["gnomAD_exomes_SAS_AC"],
            "sas_af": df["gnomAD_exomes_SAS_AF"],
            "sas_an": df["gnomAD_exomes_SAS_AN"],
            "oth_ac": df["gnomAD_exomes_OTH_AC"],
            "oth_af": df["gnomAD_exomes_OTH_AF"],
            "oth_an": df["gnomAD_exomes_OTH_AN"]
        },
        "gnomad_genomes": {
            "ac": df["gnomAD_genomes_AC"],
            "an": df["gnomAD_genomes_AN"],
            "af": df["gnomAD_genomes_AF"],
            "afr_ac": df["gnomAD_genomes_AFR_AC"],
            "afr_af": df["gnomAD_genomes_AFR_AF"],
            "afr_an": df["gnomAD_genomes_AFR_AN"],
            "amr_ac": df["gnomAD_genomes_AMR_AC"],
            "amr_an": df["gnomAD_genomes_AMR_AN"],
            "amr_af": df["gnomAD_genomes_AMR_AF"],
            "asj_ac": df["gnomAD_genomes_ASJ_AC"],
            "asj_an": df["gnomAD_genomes_ASJ_AN"],
            "asj_af": df["gnomAD_genomes_ASJ_AF"],
            "eas_ac": df["gnomAD_genomes_EAS_AC"],
            "eas_af": df["gnomAD_genomes_EAS_AF"],
            "eas_an": df["gnomAD_genomes_EAS_AN"],
            "fin_ac": df["gnomAD_genomes_FIN_AC"],
            "fin_af": df["gnomAD_genomes_FIN_AF"],
            "fin_an": df["gnomAD_genomes_FIN_AN"],
            "nfe_ac": df["gnomAD_genomes_NFE_AC"],
            "nfe_af": df["gnomAD_genomes_NFE_AF"],
            "nfe_an": df["gnomAD_genomes_NFE_AN"],
            "oth_ac": df["gnomAD_genomes_OTH_AC"],
            "oth_af": df["gnomAD_genomes_OTH_AF"],
            "oth_an": df["gnomAD_genomes_OTH_AN"]
        }
    }

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP150"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore":
                    df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore":
                    df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "rs":
                df["clinvar_rs"],
                "clinsig":
                list(
                    map(int, [
                        i for i in df["clinvar_clnsig"].split("|") if i != "."
                    ])),
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars":
                list(
                    map(int, [
                        i for i in df["clinvar_golden_stars"].split("|")
                        if i != "."
                    ]))
            },
            "gtex": list(gtex)
        }
    }
    if include_gnomad:
        one_snp_json['dbnsfp'].update(gnomad)
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", '-', None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#5
0
def _map_line_to_json(df, version, include_gnomad, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end == ".":
        hg18_end = "."
    else:
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V8_gene"].split('|')
    gtex_tissue = df["GTEx_V8_tissue"].split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    entry = df["Uniprot_entry"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'entry'), t),
                            zip(acc, entry)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    sift4g_score = df["SIFT4G_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST4_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    appris = df["APPRIS"].split(";")
    mpc_score = df["MPC_score"].split(';')
    mvp_score = df["MVP_score"].split(';')
    tsl = df["TSL"].split(';')
    vep_canonical = df["VEP_canonical"].split(';')
    deogen2_score = df["DEOGEN2_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        """Parse a p-value token into a float.

        Leading and trailing 'P', ' ' and '=' characters are removed before
        conversion (presumably the tokens look like 'P = 0.0123' -- confirm
        against the MutPred_Top5features column).

        Raises ValueError if the remaining text is not a valid float literal.
        """
        # str.strip treats its argument as a set of characters, not a prefix.
        numeric_text = pvalue.strip('P = ')
        return float(numeric_text)

    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(
            " (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [{
            "mechanism": mutpred_mechanisms[0],
            "p_val": modify_pvalue(mutpred_mechanisms[1])
        }, {
            "mechanism": mutpred_mechanisms[2],
            "p_val": modify_pvalue(mutpred_mechanisms[3])
        }, {
            "mechanism": mutpred_mechanisms[4],
            "p_val": modify_pvalue(mutpred_mechanisms[5])
        }, {
            "mechanism": mutpred_mechanisms[6],
            "p_val": modify_pvalue(mutpred_mechanisms[7])
        }, {
            "mechanism": mutpred_mechanisms[8],
            "p_val": modify_pvalue(mutpred_mechanisms[9])
        }]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        """Return a new list where every item equal to '.' becomes None.

        All other items are kept unchanged and in their original order.
        """
        cleaned = []
        for entry in arr:
            if entry == '.':
                cleaned.append(None)
            else:
                cleaned.append(entry)
        return cleaned

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    gnomad = {
        "gnomad_exomes": {
            "flag": df["gnomAD_exomes_flag"],
            "nhomalt": df["gnomAD_exomes_nhomalt"],
            "ac": df["gnomAD_exomes_AC"],
            "an": df["gnomAD_exomes_AN"],
            "af": df["gnomAD_exomes_AF"],
            "nhomalt": df["gnomAD_exomes_nhomalt"],
            "afr_ac": df["gnomAD_exomes_AFR_AC"],
            "afr_af": df["gnomAD_exomes_AFR_AF"],
            "afr_an": df["gnomAD_exomes_AFR_AN"],
            "afr_nhomalt": df["gnomAD_exomes_AFR_nhomalt"],
            "amr_ac": df["gnomAD_exomes_AMR_AC"],
            "amr_an": df["gnomAD_exomes_AMR_AN"],
            "amr_af": df["gnomAD_exomes_AMR_AF"],
            "amr_nhomalt": df["gnomAD_exomes_AMR_nhomalt"],
            "asj_ac": df["gnomAD_exomes_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_exomes_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_exomes_EAS_AC"],
            "eas_af": df["gnomAD_exomes_EAS_AF"],
            "eas_an": df["gnomAD_exomes_EAS_AN"],
            "eas_nhomalt": df["gnomAD_exomes_EAS_nhomalt"],
            "fin_ac": df["gnomAD_exomes_FIN_AC"],
            "fin_af": df["gnomAD_exomes_FIN_AF"],
            "fin_an": df["gnomAD_exomes_FIN_AN"],
            "fin_nhomalt": df["gnomAD_exomes_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_exomes_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_exomes_NFE_nhomalt"],
            "sas_ac": df["gnomAD_exomes_SAS_AC"],
            "sas_af": df["gnomAD_exomes_SAS_AF"],
            "sas_an": df["gnomAD_exomes_SAS_AN"],
            "sas_nhomalt": df["gnomAD_exomes_SAS_nhomalt"],
            "popmax_ac": df["gnomAD_exomes_POPMAX_AC"],
            "popmax_af": df["gnomAD_exomes_POPMAX_AF"],
            "popmax_an": df["gnomAD_exomes_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_exomes_POPMAX_nhomalt"]
        },
        "gnomad_exomes_controls": {
            "nhomalt": df["gnomAD_exomes_controls_nhomalt"],
            "ac": df["gnomAD_exomes_controls_AC"],
            "an": df["gnomAD_exomes_controls_AN"],
            "af": df["gnomAD_exomes_controls_AF"],
            "nhomalt": df["gnomAD_exomes_controls_nhomalt"],
            "afr_ac": df["gnomAD_exomes_controls_AFR_AC"],
            "afr_af": df["gnomAD_exomes_controls_AFR_AF"],
            "afr_an": df["gnomAD_exomes_controls_AFR_AN"],
            "afr_nhomalt": df["gnomAD_exomes_controls_AFR_nhomalt"],
            "amr_ac": df["gnomAD_exomes_controls_AMR_AC"],
            "amr_an": df["gnomAD_exomes_controls_AMR_AN"],
            "amr_af": df["gnomAD_exomes_controls_AMR_AF"],
            "amr_nhomalt": df["gnomAD_exomes_controls_AMR_nhomalt"],
            "asj_ac": df["gnomAD_exomes_controls_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_controls_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_controls_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_exomes_controls_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_exomes_controls_EAS_AC"],
            "eas_af": df["gnomAD_exomes_controls_EAS_AF"],
            "eas_an": df["gnomAD_exomes_controls_EAS_AN"],
            "eas_nhomalt": df["gnomAD_exomes_controls_EAS_nhomalt"],
            "fin_ac": df["gnomAD_exomes_controls_FIN_AC"],
            "fin_af": df["gnomAD_exomes_controls_FIN_AF"],
            "fin_an": df["gnomAD_exomes_controls_FIN_AN"],
            "fin_nhomalt": df["gnomAD_exomes_controls_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_exomes_controls_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_controls_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_controls_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_exomes_controls_NFE_nhomalt"],
            "sas_ac": df["gnomAD_exomes_controls_SAS_AC"],
            "sas_af": df["gnomAD_exomes_controls_SAS_AF"],
            "sas_an": df["gnomAD_exomes_controls_SAS_AN"],
            "sas_nhomalt": df["gnomAD_exomes_controls_SAS_nhomalt"],
            "popmax_ac": df["gnomAD_exomes_controls_POPMAX_AC"],
            "popmax_af": df["gnomAD_exomes_controls_POPMAX_AF"],
            "popmax_an": df["gnomAD_exomes_controls_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_exomes_controls_POPMAX_nhomalt"]
        },
        "gnomad_genomes": {
            "flag": df["gnomAD_genomes_flag"],
            "nhomalt": df["gnomAD_genomes_nhomalt"],
            "ac": df["gnomAD_genomes_AC"],
            "an": df["gnomAD_genomes_AN"],
            "af": df["gnomAD_genomes_AF"],
            "nhomalt": df["gnomAD_genomes_nhomalt"],
            "afr_ac": df["gnomAD_genomes_AFR_AC"],
            "afr_af": df["gnomAD_genomes_AFR_AF"],
            "afr_an": df["gnomAD_genomes_AFR_AN"],
            "afr_nhomalt": df["gnomAD_genomes_AFR_nhomalt"],
            "ami_ac": df["gnomAD_genomes_AMI_AC"],
            "ami_an": df["gnomAD_genomes_AMI_AN"],
            "ami_af": df["gnomAD_genomes_AMI_AF"],
            "ami_nhomalt": df["gnomAD_genomes_AMI_nhomalt"],
            "amr_ac": df["gnomAD_genomes_AMR_AC"],
            "amr_an": df["gnomAD_genomes_AMR_AN"],
            "amr_af": df["gnomAD_genomes_AMR_AF"],
            "amr_nhomalt": df["gnomAD_genomes_AMR_nhomalt"],
            "asj_ac": df["gnomAD_genomes_ASJ_AC"],
            "asj_an": df["gnomAD_genomes_ASJ_AN"],
            "asj_af": df["gnomAD_genomes_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_genomes_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_genomes_EAS_AC"],
            "eas_af": df["gnomAD_genomes_EAS_AF"],
            "eas_an": df["gnomAD_genomes_EAS_AN"],
            "eas_nhomalt": df["gnomAD_genomes_EAS_nhomalt"],
            "fin_ac": df["gnomAD_genomes_FIN_AC"],
            "fin_af": df["gnomAD_genomes_FIN_AF"],
            "fin_an": df["gnomAD_genomes_FIN_AN"],
            "fin_nhomalt": df["gnomAD_genomes_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_genomes_NFE_AC"],
            "nfe_af": df["gnomAD_genomes_NFE_AF"],
            "nfe_an": df["gnomAD_genomes_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_genomes_NFE_nhomalt"],
            "popmax_ac": df["gnomAD_genomes_POPMAX_AC"],
            "popmax_af": df["gnomAD_genomes_POPMAX_AF"],
            "popmax_an": df["gnomAD_genomes_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_genomes_POPMAX_nhomalt"]
        }
    }

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid":
            df["rs_dbSNP151"],
            #"rsid_dbSNP144": fields[6],
            "chrom":
            chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref":
            ref,
            "alt":
            alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename":
            df["genename"],
            "uniprot":
            list(uniprot),
            "vindijia_neandertal":
            [i for i in df["VindijiaNeandertal"].split("/") if i != "."],
            "interpro_domain":
            df["Interpro_domain"],
            "cds_strand":
            df["cds_strand"],
            "ancestral_allele":
            df["Ancestral_allele"],
            "appris":
            appris,
            "genecode_basic":
            df["GENCODE_basic"],
            "tsl":
            tsl,
            "vep_canonical":
            vep_canonical,
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "sift4g": {
                "score": sift4g_score,
                "pred": df["SIFT4G_score"],
                "converted_rankscore": df["SIFT4G_converted_rankscore"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mvp": {
                "score": mvp_score,
                "rankscore": df["MVP_rankscore"]
            },
            "mpc": {
                "score": mpc_score,
                "rankscore": df["MPC_rankscore"]
            },
            "bstatistic": {
                "score": df['bStatistic'],
                "rankscore": df["bStatistic_rankscore"]
            },
            "aloft": {
                "fraction_transcripts_affected":
                df["Aloft_Fraction_transcripts_affected"].split(';'),
                "prob_tolerant":
                df["Aloft_prob_Tolerant"],
                "prob_recessive":
                df["Aloft_prob_Recessive"],
                "prob_dominant":
                df["Aloft_prob_Dominant"],
                "pred":
                df["Aloft_pred"],
                "confidence":
                df["Aloft_Confidence"],
            },
            "primateai": {
                "score": df["PrimateAI_score"],
                "rankscore": df["PrimateAI_rankscore"],
                "pred": df["PrimateAI_pred"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest4": {
                "score": vest3_score,
                "rankscore": df["VEST4_rankscore"]
            },
            "deogen2": {
                "score": deogen2_score,
                "rankscore": df["DEOGEN2_rankscore"],
                "pred": df["DEOGEN2_pred"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "fathmm-xf": {
                "coding_score": df["fathmm-XF_coding_score"],
                "coding_rankscore": df["fathmm-XF_coding_rankscore"],
                "coding_pred": df["fathmm-XF_coding_pred"]
            },
            "eigen": {
                "raw_coding": df["Eigen-raw_coding"],
                "raw_coding_rankscore": df["Eigen-raw_coding_rankscore"],
                "phred_coding": df["Eigen-pred_coding"]
            },
            "eigen-pc": {
                "raw_coding": df["Eigen-PC-raw_coding"],
                "phred_coding": df["Eigen-PC-phred_coding"],
                "raw_rankscore": df["Eigen-PC-raw_coding_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index":
            df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p30way": {
                    "mammalian": df["phyloP30way_mammalian"],
                    "mammalian_rankscore":
                    df["phyloP30way_mammalian_rankscore"]
                },
                "p17way": {
                    "primate": df["phyloP17way_primate"],
                    "primate_rankscore": df["phyloP17way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "30way": {
                    "mammalian": df["phastCons30way_mammalian"],
                    "mammalian_rankscore":
                    df["phastCons30way_mammalian_rankscore"]
                },
                "p17way": {
                    "primate": df["phastCons17way_primate"],
                    "primate_rankscore": df["phastCons17way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "bayesdel": {
                "add_af": {
                    "score": df["BayesDel_addAF_score"],
                    "rankscore": df["BayesDel_addAF_rankscore"],
                    "pred": df["BayesDel_addAF_pred"]
                },
                "no_af": {
                    "score": df["BayesDel_noAF_score"],
                    "rankscore": df["BayesDel_noAF_rankscore"],
                    "pred": df["BayesDel_noAF_pred"]
                }
            },
            "clinpred": {
                "score": df["ClinPred_score"],
                "rankscore": df["ClinPred_rankscore"],
                "pred": df["ClinPred_pred"]
            },
            "list-s2": {
                "score": df["LIST-S2_score"],
                "rankscore": df["LIST-S2_rankscore"],
                "pred": df["LIST-S2_pred"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "uk10k": {
                "ac": df["UK10K_AC"],
                "af": df["UK10K_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "clinvar_id":
                df["clinvar_id"],
                "clinsig":
                [i for i in df["clinvar_clnsig"].split("/") if i != "."],
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "review":
                [i for i in df["clinvar_review"].split(",") if i != "."],
                "hgvs": [i for i in df["clinvar_hgvs"].split("|") if i != "."],
                "omim":
                [i for i in df["clinvar_OMIM_id"].split("|") if i != "."],
                "medgen":
                [i for i in df["clinvar_MedGen_id"].split("|") if i != "."],
                "orphanet":
                [i for i in df["clinvar_Orphanet_id"].split("|") if i != "."],
                "var_source":
                [i for i in df["clinvar_var_source"].split("|") if i != "."]
            },
            "hgvsc":
            list(
                set(df["HGVSc_ANNOVAR"].split(';') +
                    df["HGVSc_snpEff"].split(';') +
                    df["HGVSc_VEP"].split(';'))),
            "hgvsp":
            list(
                set(df["HGVSp_ANNOVAR"].split(';') +
                    df["HGVSp_snpEff"].split(';') +
                    df["HGVSp_VEP"].split(';'))),
            "gtex":
            list(gtex),
            "geuvadis_eqtl_target_gene":
            df["Geuvadis_eQTL_target_gene"]
        }
    }
    if include_gnomad:
        one_snp_json['dbnsfp'].update(gnomad)
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", '-', "NA", None],
                   remove_invalid_list=True), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json