示例#1
0
def _map_line_to_json(fields):
    """Turn one GRASP row into a document keyed by a myvariant.info ``_id``.

    The rsid found in ``fields[8]`` is looked up through the myvariant.info
    query endpoint.  Because the ``return`` sits inside the loop over the
    returned hits, only the first hit is ever used; when the hit list is
    empty the function falls through and returns ``None``.

    :param fields: sequence of column values for one row; its length must
        equal ``VALID_COLUMN_NO``.
    :return: the document after ``value_convert`` / ``unlist`` /
        ``dict_sweep`` (sweeping ``""``) / ``list_split`` (on ``","``),
        or ``None`` when the rsid is ``None`` or there are no hits.
    """
    assert len(fields) == VALID_COLUMN_NO
    snp_id = fields[8]
    if snp_id is None:
        return

    # Sample-count keys in column order; the discovery counts occupy
    # columns 25-37 and the replication counts occupy columns 38-50.
    ancestry_keys = (
        "total_samples",
        "european",
        "african",
        "east_asian",
        "indian_south_asian",
        "hispanic",
        "native",
        "micronesian",
        "arab_me",
        "mixed",
        "unspecified",
        "filipino",
        "indonesian",
    )

    lookup_url = ("http://myvariant.info/v1/query?q=dbsnp.rsid:"
                  + snp_id + "&fields=_id")
    response = requests.get(lookup_url)

    for hit in response.json()["hits"]:
        variant_id = hit["_id"]

        publication = {
            "journal": fields[16],
            "title": fields[17],
            "pmid": fields[7],
            "snpid": snp_id,
            "location_within_paper": fields[9],
            "p_value": fields[10],
            "phenotype": fields[11],
            "paper_phenotype_description": fields[12],
            "paper_phenotype_categories": fields[13],
            "date_pub": fields[14],
        }
        discovery = dict(zip(ancestry_keys, fields[25:38]))
        replication = dict(zip(ancestry_keys, fields[38:51]))

        grasp = {
            "hg19": {"chr": fields[5], "pos": fields[6]},
            "hupfield": fields[1],
            "last_curation_date": fields[2],
            "creation_date": fields[3],
            "srsid": fields[4],
            "publication": publication,
            "includes_male_female_only_analyses": fields[18],
            "exclusively_male_female": fields[19],
            "initial_sample_description": fields[20],
            "replication_sample_description": fields[21],
            "platform_snps_passing_qc": fields[22],
            "gwas_ancestry_description": fields[23],
            "discovery": discovery,
            "replication": replication,
            "in_gene": fields[51],
            "nearest_gene": fields[52],
            "in_lincrna": fields[53],
            "in_mirna": fields[54],
            "in_mirna_bs": fields[55],
            "oreg_anno": fields[61],
            "conserv_pred_tfbs": fields[62],
            "human_enhancer": fields[63],
            "rna_edit": fields[64],
            "polyphen2": fields[65],
            "sift": fields[66],
            "ls_snp": fields[67],
            "uniprot": fields[68],
            "eqtl_meth_metab_study": fields[69],
        }

        document = {"_id": variant_id, "grasp": grasp}
        converted = value_convert(document)
        swept = dict_sweep(unlist(converted), [""])
        # Returning here means any further hits are never visited.
        return list_split(swept, ",")
示例#2
0
def _fields_split_scores(raw):
    """Split a ';'-separated score column into a list.

    When the column carries more than one value, '.' placeholders are
    replaced by None; a single-valued column is returned unchanged (so a
    lone '.' stays '.').
    """
    scores = raw.split(';')
    if len(scores) > 1:
        scores = [None if item == '.' else item for item in scores]
    return scores


def _fields_pair_records(keys, left, right):
    """Zip two parallel lists into a list of two-key dicts.

    A real list is returned (not a lazy ``map`` object) so the result is
    the same on Python 2 and Python 3.
    """
    first_key, second_key = keys
    return [{first_key: a, second_key: b} for a, b in zip(left, right)]


def _map_line_to_json(fields, version):
    """Convert one positional dbNSFP row into a ``{"_id", "dbnsfp"}`` document.

    :param fields: sequence of column strings for one row, addressed by
        position; '.' is the placeholder for a missing value.
    :param version: ``'hg19'`` or ``'hg38'``; selects which position is
        used to build the HGVS ``_id``.
    :return: ``None`` when the hg19 position (``fields[8]``) is '.';
        otherwise the document after ``value_convert`` / ``unlist`` /
        ``dict_sweep(vals=["."])`` / ``list_split(";")``, with
        ``dbnsfp.chrom`` coerced to ``str``.
    :raises ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``
        (previously this surfaced as an ``UnboundLocalError`` on ``HGVS``),
        or if a position column is neither '.' nor an integer.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] is used as the hg18 position here (the original note says
    # it was fields[7] in the version 2 layout)
    hg18_end = "." if fields[10] == "." else int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        raise ValueError(
            "unsupported genome assembly version: %r "
            "(expected 'hg19' or 'hg38')" % (version,))

    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    gtex = _fields_pair_records(
        ('gene', 'tissue'), fields[181].split('|'), fields[182].split('|'))
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = _fields_pair_records(('acc', 'pos'), acc, pos)

    # Multi-valued score columns: '.' entries become None so the remaining
    # values keep their positions in the list.
    provean_score = _fields_split_scores(fields[52])
    sift_score = _fields_split_scores(fields[23])
    hdiv_score = _fields_split_scores(fields[29])
    hvar_score = _fields_split_scores(fields[32])
    lrt_score = _fields_split_scores(fields[35])
    dann_score = _fields_split_scores(fields[69])
    mutationtaster_score = _fields_split_scores(fields[39])
    mutationassessor_score = _fields_split_scores(fields[46])
    vest3_score = _fields_split_scores(fields[57])
    metasvm_score = _fields_split_scores(fields[59])
    fathmm_score = _fields_split_scores(fields[49])
    lr_score = _fields_split_scores(fields[62])
    fathmm_coding_score = _fields_split_scores(fields[71])
    integrated_fitcons_score = _fields_split_scores(fields[82])
    gm12878_fitcons_score = _fields_split_scores(fields[85])
    h1_hesc_fitcons_score = _fields_split_scores(fields[88])
    huvec_fitcons_score = _fields_split_scores(fields[91])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                # Was fields[57], i.e. the score column itself.  Every other
                # predictor here has its rankscore right after its score,
                # and fields[58] was otherwise unused.
                "rankscore": fields[58],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            # NOTE(review): unlike the two exac groups above, this one has
            # no sas_ac/sas_af, and fields[174]/fields[175] are not read
            # anywhere -- confirm whether the omission is intentional.
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#3
0
def _df_split_scores(raw):
    """Split a ';'-separated score cell into a list.

    When the cell carries more than one value, '.' placeholders are
    replaced by None; a single-valued cell is returned unchanged (so a
    lone '.' stays '.').
    """
    scores = raw.split(';')
    if len(scores) > 1:
        scores = [None if item == '.' else item for item in scores]
    return scores


def _df_pair_records(keys, left, right):
    """Zip two parallel lists into a list of two-key dicts.

    A real list is returned (not a lazy ``map`` object) so the result is
    the same on Python 2 and Python 3.
    """
    first_key, second_key = keys
    return [{first_key: a, second_key: b} for a, b in zip(left, right)]


def _map_line_to_json(df, version, index):
    """Convert one dbNSFP row, read by column name, into a document.

    :param df: object exposing ``get_value(index, column_name)``; cells are
        expected to be strings with '.' as the missing-value placeholder.
    :param version: ``'hg19'`` or ``'hg38'``; selects which position is
        used to build the HGVS ``_id``.
    :param index: row label passed through to ``df.get_value``.
    :return: ``None`` when ``hg19_pos(1-based)`` is '.'; otherwise the
        ``{"_id", "dbnsfp"}`` document after ``value_convert`` / ``unlist``
        / ``dict_sweep(vals=["."])`` / ``list_split(";")``, with
        ``dbnsfp.chrom`` coerced to ``str``.
    :raises ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``
        (previously this surfaced as an ``UnboundLocalError`` on ``HGVS``),
        or if a position cell is neither '.' nor an integer.
    """
    def col(name):
        # NOTE(review): if df is a pandas DataFrame, get_value is
        # deprecated in newer pandas in favour of df.at[index, name] --
        # confirm the pinned pandas version before switching.
        return df.get_value(index, name)

    # specific variable treatment
    chrom = col("#chr")
    if chrom == 'M':
        chrom = 'MT'
    hg18_end = col("hg18_pos(1-based)")
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if col("hg19_pos(1-based)") == '.':
        return None
    chromStart = int(col("hg19_pos(1-based)"))
    chromEnd = chromStart
    chromStart_38 = int(col("pos(1-based)"))
    ref = col("ref").upper()
    alt = col("alt").upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        raise ValueError(
            "unsupported genome assembly version: %r "
            "(expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = col("SiPhy_29way_pi")
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    gtex = _df_pair_records(
        ('gene', 'tissue'),
        col("GTEx_V6_gene").split('|'),
        col("GTEx_V6_tissue").split('|'))
    acc = col("Uniprot_acc_Polyphen2").rstrip().rstrip(';').split(";")
    pos = col("Uniprot_aapos_Polyphen2").rstrip().rstrip(';').split(";")
    uniprot = _df_pair_records(('acc', 'pos'), acc, pos)

    # The original loop built the '.' -> None replacement in a list
    # comprehension and threw the result away, so nothing was masked.  The
    # replacement is now applied, only to multi-valued cells; a lone '.'
    # is left as-is (presumably dropped later by dict_sweep(vals=["."])).
    provean_score = _df_split_scores(col("PROVEAN_score"))
    sift_score = _df_split_scores(col("SIFT_score"))
    hdiv_score = _df_split_scores(col("Polyphen2_HDIV_score"))
    hvar_score = _df_split_scores(col("Polyphen2_HVAR_score"))
    lrt_score = _df_split_scores(col("LRT_score"))
    m_cap_score = _df_split_scores(col("M-CAP_score"))
    mutationtaster_score = _df_split_scores(col("MutationTaster_score"))
    mutationassessor_score = _df_split_scores(col("MutationAssessor_score"))
    vest3_score = _df_split_scores(col("VEST3_score"))
    metasvm_score = _df_split_scores(col("MetaSVM_score"))
    fathmm_score = _df_split_scores(col("FATHMM_score"))
    metalr_score = _df_split_scores(col("MetaLR_score"))

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": col("rs_dbSNP147"),
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": col("hg18_pos(1-based)"),
                "end": hg18_end
            },
            "hg38": {
                "start": col("pos(1-based)"),
                "end": col("pos(1-based)")
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": col("aaref"),
                "alt": col("aaalt"),
                "pos": col("aapos"),
                "refcodon": col("refcodon"),
                "codonpos": col("codonpos"),
                "codon_degeneracy": col("codon_degeneracy"),
            },
            "genename": col("genename"),
            "uniprot": uniprot,
            "interpro_domain": col("Interpro_domain"),
            "cds_strand": col("cds_strand"),
            "ancestral_allele": col("Ancestral_allele"),
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": col("Ensembl_geneid"),
                "transcriptid": col("Ensembl_transcriptid"),
                "proteinid": col("Ensembl_proteinid")
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": col("SIFT_converted_rankscore"),
                "pred": col("SIFT_pred")
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": col("Polyphen2_HDIV_rankscore"),
                    "pred": col("Polyphen2_HDIV_pred")
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": col("Polyphen2_HVAR_rankscore"),
                    "pred": col("Polyphen2_HVAR_pred")
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": col("LRT_converted_rankscore"),
                "pred": col("LRT_pred"),
                "omega": col("LRT_Omega")
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                col("MutationTaster_converted_rankscore"),
                "pred": col("MutationTaster_pred"),
                "model": col("MutationTaster_model"),
                "AAE": col("MutationTaster_AAE")
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": col("MutationAssessor_score_rankscore"),
                "pred": col("MutationAssessor_pred")
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": col("FATHMM_converted_rankscore"),
                "pred": col("FATHMM_pred")
            },
            "provean": {
                "score": provean_score,
                "rankscore": col("PROVEAN_converted_rankscore"),
                "pred": col("PROVEAN_pred")
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": col("VEST3_rankscore"),
                "transcriptid": col("Transcript_id_VEST3"),
                "transcriptvar": col("Transcript_var_VEST3")
            },
            "fathmm-mkl": {
                "coding_score": col("fathmm-MKL_coding_score"),
                "coding_rankscore": col("fathmm-MKL_coding_rankscore"),
                "coding_pred": col("fathmm-MKL_coding_pred"),
                "coding_group": col("fathmm-MKL_coding_group")
            },
            "eigen": {
                "coding_or_noncoding": col("Eigen_coding_or_noncoding"),
                "raw": col("Eigen-raw"),
                "phred": col("Eigen-phred")
            },
            "eigen-pc": {
                "raw": col("Eigen-PC-raw"),
                "phred": col("Eigen-PC-phred"),
                "raw_rankscore": col("Eigen-PC-raw_rankscore")
            },
            "genocanyon": {
                "score": col("GenoCanyon_score"),
                "rankscore": col("GenoCanyon_score_rankscore")
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": col("MetaSVM_rankscore"),
                "pred": col("MetaSVM_pred")
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": col("MetaLR_rankscore"),
                "pred": col("MetaLR_pred")
            },
            "reliability_index": col("Reliability_index"),
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": col("M-CAP_rankscore"),
                "pred": col("M-CAP_pred")
            },
            "dann": {
                "score": col("DANN_score"),
                "rankscore": col("DANN_rankscore")
            },
            "gerp++": {
                "nr": col("GERP++_NR"),
                "rs": col("GERP++_RS"),
                "rs_rankscore": col("GERP++_RS_rankscore")
            },
            "integrated": {
                "fitcons_score": col("integrated_fitCons_score"),
                "fitcons_rankscore":
                col("integrated_fitCons_score_rankscore"),
                "confidence_value": col("integrated_confidence_value")
            },
            "gm12878": {
                "fitcons_score": col("GM12878_fitCons_score"),
                "fitcons_rankscore": col("GM12878_fitCons_score_rankscore"),
                "confidence_value": col("GM12878_confidence_value")
            },
            "h1-hesc": {
                "fitcons_score": col("H1-hESC_fitCons_score"),
                "fitcons_rankscore": col("H1-hESC_fitCons_score_rankscore"),
                "confidence_value": col("H1-hESC_confidence_value")
            },
            "huvec": {
                "fitcons_score": col("HUVEC_fitCons_score"),
                "fitcons_rankscore": col("HUVEC_fitCons_score_rankscore"),
                "confidence_value": col("HUVEC_confidence_value")
            },
            "phylo": {
                "p100way": {
                    "vertebrate": col("phyloP100way_vertebrate"),
                    "vertebrate_rankscore":
                    col("phyloP100way_vertebrate_rankscore")
                },
                "p20way": {
                    "mammalian": col("phyloP20way_mammalian"),
                    "mammalian_rankscore":
                    col("phyloP20way_mammalian_rankscore")
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": col("phastCons100way_vertebrate"),
                    "vertebrate_rankscore":
                    col("phastCons100way_vertebrate_rankscore")
                },
                "20way": {
                    "mammalian": col("phastCons20way_mammalian"),
                    "mammalian_rankscore":
                    col("phastCons20way_mammalian_rankscore")
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": col("SiPhy_29way_logOdds"),
                "logodds_rankscore": col("SiPhy_29way_logOdds_rankscore")
            },
            "1000gp3": {
                "ac": col("1000Gp3_AC"),
                "af": col("1000Gp3_AF"),
                "afr_ac": col("1000Gp3_AFR_AC"),
                "afr_af": col("1000Gp3_AFR_AF"),
                "eur_ac": col("1000Gp3_EUR_AC"),
                "eur_af": col("1000Gp3_EUR_AF"),
                "amr_ac": col("1000Gp3_AMR_AC"),
                "amr_af": col("1000Gp3_AMR_AF"),
                "eas_ac": col("1000Gp3_EAS_AC"),
                "eas_af": col("1000Gp3_EAS_AF"),
                "sas_ac": col("1000Gp3_SAS_AC"),
                "sas_af": col("1000Gp3_SAS_AF")
            },
            "twinsuk": {
                "ac": col("TWINSUK_AC"),
                "af": col("TWINSUK_AF")
            },
            "alspac": {
                "ac": col("ALSPAC_AC"),
                "af": col("ALSPAC_AF")
            },
            "esp6500": {
                "aa_ac": col("ESP6500_AA_AC"),
                "aa_af": col("ESP6500_AA_AF"),
                "ea_ac": col("ESP6500_EA_AC"),
                "ea_af": col("ESP6500_EA_AF")
            },
            "exac": {
                "ac": col("ExAC_AC"),
                "af": col("ExAC_AF"),
                "adj_ac": col("ExAC_Adj_AC"),
                "adj_af": col("ExAC_Adj_AF"),
                "afr_ac": col("ExAC_AFR_AC"),
                "afr_af": col("ExAC_AFR_AF"),
                "amr_ac": col("ExAC_AMR_AC"),
                "amr_af": col("ExAC_AMR_AF"),
                "eas_ac": col("ExAC_EAS_AC"),
                "eas_af": col("ExAC_EAS_AF"),
                "fin_ac": col("ExAC_FIN_AC"),
                "fin_af": col("ExAC_FIN_AF"),
                "nfe_ac": col("ExAC_NFE_AC"),
                "nfe_af": col("ExAC_NFE_AF"),
                "sas_ac": col("ExAC_SAS_AC"),
                "sas_af": col("ExAC_SAS_AF")
            },
            "exac_nontcga": {
                "ac": col("ExAC_nonTCGA_AC"),
                "af": col("ExAC_nonTCGA_AF"),
                "adj_ac": col("ExAC_nonTCGA_Adj_AC"),
                "adj_af": col("ExAC_nonTCGA_Adj_AF"),
                "afr_ac": col("ExAC_nonTCGA_AFR_AC"),
                "afr_af": col("ExAC_nonTCGA_AFR_AF"),
                "amr_ac": col("ExAC_nonTCGA_AMR_AC"),
                "amr_af": col("ExAC_nonTCGA_AMR_AF"),
                "eas_ac": col("ExAC_nonTCGA_EAS_AC"),
                "eas_af": col("ExAC_nonTCGA_EAS_AF"),
                "fin_ac": col("ExAC_nonTCGA_FIN_AC"),
                "fin_af": col("ExAC_nonTCGA_FIN_AF"),
                "nfe_ac": col("ExAC_nonTCGA_NFE_AC"),
                "nfe_af": col("ExAC_nonTCGA_NFE_AF"),
                "sas_ac": col("ExAC_nonTCGA_SAS_AC"),
                "sas_af": col("ExAC_nonTCGA_SAS_AF")
            },
            "exac_nonpsych": {
                "ac": col("ExAC_nonpsych_AC"),
                "af": col("ExAC_nonpsych_AF"),
                "adj_ac": col("ExAC_nonpsych_Adj_AC"),
                "adj_af": col("ExAC_nonpsych_Adj_AF"),
                "afr_ac": col("ExAC_nonpsych_AFR_AC"),
                "afr_af": col("ExAC_nonpsych_AFR_AF"),
                "amr_ac": col("ExAC_nonpsych_AMR_AC"),
                "amr_af": col("ExAC_nonpsych_AMR_AF"),
                "eas_ac": col("ExAC_nonpsych_EAS_AC"),
                "eas_af": col("ExAC_nonpsych_EAS_AF"),
                "fin_ac": col("ExAC_nonpsych_FIN_AC"),
                "fin_af": col("ExAC_nonpsych_FIN_AF"),
                "nfe_ac": col("ExAC_nonpsych_NFE_AC"),
                "nfe_af": col("ExAC_nonpsych_NFE_AF"),
                "sas_ac": col("ExAC_nonpsych_SAS_AC"),
                "sas_af": col("ExAC_nonpsych_SAS_AF")
            },
            "clinvar": {
                "rs": col("clinvar_rs"),
                "clinsig": col("clinvar_clnsig"),
                "trait": col("clinvar_trait"),
                "golden_stars": col("clinvar_golden_stars")
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#4
0
def _map_line_to_json(df, version, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = df["hg18_pos(1-coor)"]
    if hg18_end == ".":
        hg18_end = "."
    else:
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    else:
        chromStart = int(df["pos(1-coor)"])
        chromEnd = chromStart
    chromStart_38 = int(df["hg38_pos"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))

    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(
            " (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [{
            "mechanism": mutpred_mechanisms[0],
            "p_val": modify_pvalue(mutpred_mechanisms[1])
        }, {
            "mechanism": mutpred_mechanisms[2],
            "p_val": modify_pvalue(mutpred_mechanisms[3])
        }, {
            "mechanism": mutpred_mechanisms[4],
            "p_val": modify_pvalue(mutpred_mechanisms[5])
        }, {
            "mechanism": mutpred_mechanisms[6],
            "p_val": modify_pvalue(mutpred_mechanisms[7])
        }, {
            "mechanism": mutpred_mechanisms[8],
            "p_val": modify_pvalue(mutpred_mechanisms[9])
        }]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-coor)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["hg38_pos"],
                "end": df["hg38_pos"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore":
                    df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore":
                    df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore": df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                "ea_af": df["ESP6500_EA_AF "]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs":
                df["clinvar_rs"],
                "clinsig":
                list(
                    map(int, [
                        i for i in df["clinvar_clnsig"].split("|") if i != "."
                    ])),
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars":
                list(
                    map(int, [
                        i for i in df["clinvar_golden_stars"].split("|")
                        if i != "."
                    ]))
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(df, version, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = df["hg18_pos(1-coor)"]
    if hg18_end == ".":
        hg18_end = "."
    else:
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    else:
        chromStart = int(df["pos(1-coor)"])
        chromEnd = chromStart
    chromStart_38 = int(df["hg38_pos"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))
    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(" (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [
            {"mechanism": mutpred_mechanisms[0],
             "p_val": modify_pvalue(mutpred_mechanisms[1])},
            {"mechanism": mutpred_mechanisms[2],
             "p_val": modify_pvalue(mutpred_mechanisms[3])},
            {"mechanism": mutpred_mechanisms[4],
             "p_val": modify_pvalue(mutpred_mechanisms[5])},
            {"mechanism": mutpred_mechanisms[6],
             "p_val": modify_pvalue(mutpred_mechanisms[7])},
            {"mechanism": mutpred_mechanisms[8],
             "p_val": modify_pvalue(mutpred_mechanisms[9])}
        ]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

# load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-coor)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["hg38_pos"],
                "end": df["hg38_pos"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore": df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore": df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore": df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                "ea_af": df["ESP6500_EA_AF "]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": list(map(int,[i for i in df["clinvar_clnsig"].split("|") if i != "."])),
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": list(map(int,[i for i in df["clinvar_golden_stars"].split("|") if i != "."]))
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#6
0
def _map_line_to_json(fields, version='hg19'):
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    chromStart = int(fields[8])
    chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#7
0
def _map_line_to_json(fields):
    """Build a "grasp" document for one row, keyed by a myvariant.info _id.

    The row's rsid (``fields[8]``) is looked up through the myvariant.info
    query endpoint and the ``_id`` of the first hit becomes the document id;
    any further hits are ignored.

    Args:
        fields: Column values of one row, addressed by position. Must hold
            exactly ``VALID_COLUMN_NO`` entries.

    Returns:
        The document after it has been passed through ``value_convert``,
        ``unlist``, ``dict_sweep`` (with ``[""]``) and ``list_split`` (with
        ``","``), or ``None`` when the rsid is ``None`` or the query returns
        no hits.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    if rsid is None:
        return None

    # Ask myvariant.info which _id(s) are registered for this rsid.
    query_url = 'http://myvariant.info/v1/query?q=dbsnp.rsid:' + rsid + '&fields=_id'
    response = requests.get(query_url)

    # Only the first hit is ever used; a private sentinel tells "no hits"
    # apart from whatever value the first hit might be.
    no_hit = object()
    first_hit = next(iter(response.json()['hits']), no_hit)
    if first_hit is no_hit:
        return None
    hgvs_id = first_hit['_id']

    publication = {
        "journal": fields[16],
        "title": fields[17],
        "pmid": fields[7],
        "snpid": fields[8],
        "location_within_paper": fields[9],
        "p_value": fields[10],
        "phenotype": fields[11],
        "paper_phenotype_description": fields[12],
        "paper_phenotype_categories": fields[13],
        "date_pub": fields[14],
    }

    # The discovery and replication tables share one layout: 13 consecutive
    # columns each (25-37 and 38-50), in this key order.
    sample_keys = (
        "total_samples", "european", "african", "east_asian",
        "indian_south_asian", "hispanic", "native", "micronesian",
        "arab_me", "mixed", "unspecified", "filipino", "indonesian",
    )
    discovery = dict(zip(sample_keys, fields[25:38]))
    replication = dict(zip(sample_keys, fields[38:51]))

    grasp = {
        "hg19": {"chr": fields[5], "pos": fields[6]},
        "hupfield": fields[1],
        "last_curation_date": fields[2],
        "creation_date": fields[3],
        "srsid": fields[4],
        "publication": publication,
        "includes_male_female_only_analyses": fields[18],
        "exclusively_male_female": fields[19],
        "initial_sample_description": fields[20],
        "replication_sample_description": fields[21],
        "platform_snps_passing_qc": fields[22],
        "gwas_ancestry_description": fields[23],
        "discovery": discovery,
        "replication": replication,
        "in_gene": fields[51],
        "nearest_gene": fields[52],
        "in_lincrna": fields[53],
        "in_mirna": fields[54],
        "in_mirna_bs": fields[55],
        "oreg_anno": fields[61],
        "conserv_pred_tfbs": fields[62],
        "human_enhancer": fields[63],
        "rna_edit": fields[64],
        "polyphen2": fields[65],
        "sift": fields[66],
        "ls_snp": fields[67],
        "uniprot": fields[68],
        "eqtl_meth_metab_study": fields[69],
    }
    one_snp_json = {"_id": hgvs_id, "grasp": grasp}

    # Post-processing helpers live elsewhere in the project; apply them in
    # the same nesting order as before.
    converted = value_convert(one_snp_json)
    flattened = unlist(converted)
    swept = dict_sweep(flattened, [""])
    return list_split(swept, ",")
示例#8
0
def _map_line_to_json(fields, version='hg19'):
    """Convert one split input row into a ``{"_id", "dbnsfp"}`` document.

    Args:
        fields: Column strings of one row, addressed by position (indexes
            0-111 are read). ``"."`` is the missing-value marker.
        version: Coordinate system used for ``_id``: ``'hg19'`` builds it
            from ``fields[8]``, ``'hg38'`` from ``fields[1]``.

    Returns:
        The document after being passed through ``value_convert``,
        ``unlist``, ``dict_sweep(vals=["."])`` and ``list_split(";")``,
        with ``dbnsfp.chrom`` forced to ``str``.

    Raises:
        ValueError: If ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if ``fields[8]`` / ``fields[1]`` cannot be parsed as integers.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] holds the hg18 position (it was fields[7] in version 2);
    # keep the "." marker when it is missing
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    else:
        # Previously an unknown version left HGVS unbound and failed later
        # with an UnboundLocalError; fail here with a clear message instead.
        raise ValueError(
            "unsupported genome version %r (expected 'hg19' or 'hg38')"
            % (version,))

    # fields[69] is split on ":" into four values stored under a/c/g/t
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair the ";"-separated accession and position columns. Built as a real
    # list: nested map() calls are a lazy one-shot iterator on Python 3.
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # force chrom back to a string after the conversion helpers have run
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _split_scores(raw):
    """Split a ";"-separated score column; with several values, "." becomes None.

    A column holding a single value is returned as a one-element list with
    that value untouched (a lone "." stays ".").
    """
    scores = raw.split(';')
    if len(scores) > 1:
        scores = [None if score == '.' else score for score in scores]
    return scores


def _map_line_to_json(fields, version):
    """Convert one split input row into a ``{"_id", "dbnsfp"}`` document.

    Args:
        fields: Column strings of one row, addressed by position (indexes
            0-182 are read). ``"."`` is the missing-value marker.
        version: Coordinate system used for ``_id``: ``'hg19'`` builds it
            from ``fields[8]``, ``'hg38'`` from ``fields[1]``.

    Returns:
        ``None`` when the row has no hg19 position (``fields[8] == '.'``).
        Otherwise the document after being passed through ``value_convert``,
        ``unlist``, ``dict_sweep(vals=["."])`` and ``list_split(";")``, with
        ``dbnsfp.chrom`` forced to ``str``.

    Raises:
        ValueError: If ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if a position column cannot be parsed as an integer.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] holds the hg18 position (it was fields[7] in version 2);
    # keep the "." marker when it is missing
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    else:
        # Previously an unknown version left HGVS unbound and failed later
        # with an UnboundLocalError; fail here with a clear message instead.
        raise ValueError(
            "unsupported genome version %r (expected 'hg19' or 'hg38')"
            % (version,))

    # fields[105] is split on ":" into four values stored under a/c/g/t
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair up parallel multi-value columns. Built as real lists: nested
    # map() calls are a lazy one-shot iterator on Python 3.
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # Score columns may hold several ";"-separated values; when they do,
    # "." placeholders are turned into None so positions stay aligned.
    provean_score = _split_scores(fields[52])
    sift_score = _split_scores(fields[23])
    hdiv_score = _split_scores(fields[29])
    hvar_score = _split_scores(fields[32])
    lrt_score = _split_scores(fields[35])
    dann_score = _split_scores(fields[69])
    mutationtaster_score = _split_scores(fields[39])
    mutationassessor_score = _split_scores(fields[46])
    vest3_score = _split_scores(fields[57])
    metasvm_score = _split_scores(fields[59])
    fathmm_score = _split_scores(fields[49])
    lr_score = _split_scores(fields[62])
    fathmm_coding_score = _split_scores(fields[71])
    integrated_fitcons_score = _split_scores(fields[82])
    gm12878_fitcons_score = _split_scores(fields[85])
    h1_hesc_fitcons_score = _split_scores(fields[88])
    huvec_fitcons_score = _split_scores(fields[91])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            # columns 17 (altaineandertal) and 18 (denisova) are not loaded
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                # Was fields[57], i.e. the score column again; column 58
                # sits between the vest3 score (57) and the metasvm score
                # (59) and was otherwise never read.
                "rankscore": fields[58],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            # NOTE(review): columns 174-175 are not read, while the "exac"
            # and "exac_nontcga" groups end with sas_ac/sas_af -- confirm
            # whether omitting them here is intentional.
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # force chrom back to a string after the conversion helpers have run
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
示例#10
0
def _map_line_to_json(fields):
    """Convert one split input row into a ``{"_id", "dbnsfp"}`` document.

    Args:
        fields: Column strings of one row, addressed by position (indexes
            0-97 are read). ``"."`` is the missing-value marker.

    Returns:
        The document after being passed through ``value_convert``,
        ``unlist``, ``dict_sweep(vals=["."])`` and ``list_split(";")``,
        with ``dbnsfp.chrom`` forced to ``str``.

    Raises:
        ValueError: If ``fields[1]`` (or a non-"." ``fields[7]``) cannot be
            parsed as an integer.
    """
    # specific variable treatment
    chrom = fields[0]
    # "end" coordinates are stored as position + 1; a missing hg18 position
    # keeps the "." marker
    if fields[7] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[7]) + 1
    chromStart = int(fields[1])
    chromEnd = chromStart + 1
    allele1 = fields[2]
    allele2 = fields[3]
    HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, allele1, allele2)

    # fields[74] is split on ":" into four values stored under a/c/g/t
    if fields[74] == ".":
        siphy = "."
    else:
        freq = fields[74].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair the ";"-separated accession and position columns. Built as a real
    # list: nested map() calls are a lazy one-shot iterator on Python 3.
    acc = fields[11].rstrip().rstrip(';').split(";")
    pos = fields[13].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "chrom": chrom,
            "hg19": {
                "start": fields[1],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[7],
                "end": hg18_end
            },
            "hg38": {
                "chrom": fields[8],
                "pos": fields[9]
            },
            "allele1": allele1,
            "allele2": allele2,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[23],
                "refcodon": fields[16],
                "codonpos": fields[18],
                "aapos_sift": fields[24],
                "aapos_fathmm": fields[25]
            },
            "genename": fields[10],
            "uniprot": uniprot,
            "interpro_domain": fields[14],
            "cds_strand": fields[15],
            "slr_test_statistic": fields[17],
            "fold-degenerate": fields[19],
            "ancestral_allele": fields[20],
            "ensembl": {
                "geneid": fields[21],
                "transcriptid": fields[22]
            },
            "sift": {
                "score": fields[26],
                "converted_rankscore": fields[27],
                "pred": fields[28]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37]
            },
            "mutationtaster": {
                "score": fields[38],
                "converted_rankscore": fields[39],
                "pred": fields[40]
            },
            "mutationassessor": {
                "score": fields[41],
                "rankscore": fields[42],
                "pred": fields[43]
            },
            "fathmm": {
                "score": fields[44],
                "rankscore": fields[45],
                "pred": fields[46]
            },
            "radialsvm": {
                "score": fields[47],
                "rankscore": fields[48],
                "pred": fields[49]
            },
            "lr": {
                "score": fields[50],
                "rankscore": fields[51],
                "pred": fields[52]
            },
            "reliability_index": fields[53],
            "vest3": {
                "score": fields[54],
                "rankscore": fields[55]
            },
            "cadd": {
                "raw": fields[56],
                "raw_rankscore": fields[57],
                "phred": fields[58]
            },
            "gerp++": {
                "nr": fields[59],
                "rs": fields[60],
                "rs_rankscore": fields[61]
            },
            "phylop": {
                "46way": {
                    "primate": fields[62],
                    "primate_rankscore": fields[63],
                    "placental": fields[64],
                    "placental_rankscore": fields[65],
                },
                "100way": {
                    "vertebrate": fields[66],
                    "vertebrate_rankscore": fields[67]
                }
            },
            "phastcons": {
                "46way": {
                    "primate": fields[68],
                    "primate_rankscore": fields[69],
                    "placental": fields[70],
                    "placental_rankscore": fields[71],
                },
                "100way": {
                    "vertebrate": fields[72],
                    "vertebrate_rankscore": fields[73]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[75],
                "logodds_rankscore": fields[76]
            },
            "lrt_omega": fields[77],
            "unisnp_ids": fields[78],
            "1000gp1": {
                "ac": fields[79],
                "af": fields[80],
                "afr_ac": fields[81],
                "afr_af": fields[82],
                "eur_ac": fields[83],
                "eur_af": fields[84],
                "amr_ac": fields[85],
                "amr_af": fields[86],
                "asn_ac": fields[87],
                "asn_af": fields[88]
            },
            "esp6500": {
                "aa_af": fields[89],
                "ea_af": fields[90]
            },
            "aric5606": {
                "aa_ac": fields[91],
                "aa_af": fields[92],
                "ea_ac": fields[93],
                "ea_af": fields[94]
            },
            "clinvar": {
                "rs": fields[95],
                "clin_sig": fields[96],
                "trait": fields[97]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # force chrom back to a string after the conversion helpers have run
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json