Example #1
def load_data():
    d_gene = process_gene(file_path_gene_disease)
    d_snp = process_snp(file_path_snp_disease)
    d_xrefs = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)
    for umls_id in set(list(d_gene.keys()) + list(d_snp.keys())):
        if umls_id in umls_2_mondo:
            mondo_id = umls_2_mondo[umls_id]
            for _mondo in mondo_id:
                _doc = {
                    '_id': _mondo,
                    'disgenet': {
                        'xrefs': d_xrefs.get(umls_id, {}),
                        'genes_related_to_disease': d_gene.get(umls_id, {}),
                        'variants_related_to_disease': d_snp.get(umls_id, {})
                    }
                }
                _doc = (dict_sweep(unlist(_doc), [None]))
                yield _doc
        else:
            _doc = {
                '_id': umls_id,
                'disgenet': {
                    'xrefs': d_xrefs.get(umls_id, {}),
                    'genes_related_to_disease': d_gene.get(umls_id, {}),
                    'variants_related_to_disease': d_snp.get(umls_id, {})
                }
            }
            _doc = (dict_sweep(unlist(_doc), [None]))
            yield _doc
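Both branches funnel the assembled document through unlist and dict_sweep before yielding. The real helpers live in biothings.utils.dataload; as a rough mental model only, a minimal sketch of their assumed behavior:

def unlist(d):
    # Collapse single-element lists into bare values, recursively (sketch).
    if isinstance(d, dict):
        return {k: unlist(v) for k, v in d.items()}
    if isinstance(d, list):
        return unlist(d[0]) if len(d) == 1 else [unlist(v) for v in d]
    return d

def dict_sweep(d, vals=(None,)):
    # Recursively drop keys whose (already swept) values appear in `vals` (sketch).
    if isinstance(d, dict):
        swept = {k: dict_sweep(v, vals) for k, v in d.items()}
        return {k: v for k, v in swept.items() if v not in vals}
    return d

raw = {'_id': 'C0002395', 'disgenet': {'xrefs': None,
                                       'genes_related_to_disease': [{'gene_id': 100}]}}
print(dict_sweep(unlist(raw)))
# {'_id': 'C0002395', 'disgenet': {'genes_related_to_disease': {'gene_id': 100}}}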
Example #2
def load_data(data_folder):
    file_path_disease_hpo = os.path.join(data_folder, 'phenotype.hpoa')
    file_path_mondo = os.path.join(data_folder, 'mondo.json')
    d_hpo = process_disease2hp(file_path_disease_hpo)
    orphanet_omim_2_mondo = construct_orphanet_omim_to_mondo_library(file_path_mondo)
    for disease_id in d_hpo.keys():
        if disease_id in orphanet_omim_2_mondo:
            mondo_id = orphanet_omim_2_mondo[disease_id]
            for _mondo in mondo_id:
                if disease_id.startswith('OMIM'):
                    _doc = {'_id': _mondo,
                            'hpo': {
                                'disease_name': d_hpo.get(disease_id, {})[1],
                                'omim': disease_id.split(':')[1],
                                'phenotype_related_to_disease': d_hpo.get(disease_id, {})[0]
                                }
                           }
                elif disease_id.startswith('ORPHANET'):
                    _doc = {'_id': _mondo,
                            'hpo': {
                                'disease_name': d_hpo.get(disease_id, {})[1],
                                'orphanet': disease_id.split(':')[1],
                                'phenotype_related_to_disease': d_hpo.get(disease_id, {})[0]
                                }
                           }
                else:
                    # unrecognized id prefix: log and skip; the original fell
                    # through and re-yielded the previous _doc here
                    print(disease_id)
                    continue
                _doc = dict_sweep(unlist(_doc), [None])
                yield _doc

        else:
            if disease_id.startswith('OMIM'):
                _doc = {'_id': disease_id,
                        'hpo': {
                            'disease_name': d_hpo.get(disease_id, {})[1],
                            'omim': disease_id.split(':')[1],
                            'phenotype_related_to_disease': d_hpo.get(disease_id, {})[0]
                            }
                        }
            elif disease_id.startswith('ORPHANET'):
                _doc = {'_id': disease_id,
                        'hpo': {
                            'disease_name': d_hpo.get(disease_id, {})[1],
                            'orphanet': disease_id.split(':')[1],
                            'phenotype_related_to_disease': d_hpo.get(disease_id, {})[0]
                            }
                        }
            else:
                _doc = {'_id': disease_id,
                        'hpo': {
                            'disease_name': d_hpo.get(disease_id, {})[1],
                            'decipher': disease_id.split(':')[1],
                            'phenotype_related_to_disease': d_hpo.get(disease_id, {})[0]
                            }
                        }
            _doc = (dict_sweep(unlist(_doc), [None]))
            yield _doc
Example #3
def load_data(data_folder):
    file_path_mondo = os.path.join(data_folder, "mondo.json")
    file_path_gene_disease = os.path.join(
        data_folder, "all_gene_disease_pmid_associations.tsv.gz"
    )
    file_path_snp_disease = os.path.join(
        data_folder, "all_variant_disease_pmid_associations.tsv.gz"
    )
    file_path_disease_mapping = os.path.join(data_folder, "disease_mappings.tsv.gz")
    d_gene = process_gene(file_path_gene_disease)
    d_snp = process_snp(file_path_snp_disease)
    d_xrefs = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)
    for umls_id in set(list(d_gene.keys()) + list(d_snp.keys())):
        if umls_id in d_xrefs and "mondo" in d_xrefs[umls_id]:
            _doc = {
                "_id": d_xrefs[umls_id]["mondo"],
                "disgenet": {
                    "xrefs": d_xrefs.get(umls_id, {}),
                    "genes_related_to_disease": d_gene.get(umls_id, {}),
                    "variants_related_to_disease": d_snp.get(umls_id, {}),
                },
            }
            _doc = dict_sweep(unlist(_doc), [None])
            yield _doc
        elif umls_id in umls_2_mondo:
            mondo_id = umls_2_mondo[umls_id]
            for _mondo in mondo_id:
                _doc = {
                    "_id": _mondo,
                    "disgenet": {
                        "xrefs": d_xrefs.get(umls_id, {}),
                        "genes_related_to_disease": d_gene.get(umls_id, {}),
                        "variants_related_to_disease": d_snp.get(umls_id, {}),
                    },
                }
                _doc = dict_sweep(unlist(_doc), [None])
                yield _doc
        else:
            _doc = {
                "_id": umls_id,
                "disgenet": {
                    "xrefs": d_xrefs.get(umls_id, {}),
                    "genes_related_to_disease": d_gene.get(umls_id, {}),
                    "variants_related_to_disease": d_snp.get(umls_id, {}),
                },
            }
            _doc = dict_sweep(unlist(_doc), [None])
            yield _doc
Example #4
def load_data(data_folder):

    input_file = os.path.join(data_folder, "phewas-catalog.csv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
    with open_anyfile(input_file) as in_f:

        # Remove duplicated lines if any
        header = next(in_f).strip().split(',')
        header = [_item[1:-1] for _item in header]  # strip the surrounding quotes
        lines = set(list(in_f))
        reader = DictReader(lines, fieldnames=header, delimiter=',')

        results = defaultdict(list)
        for row in reader:
            variant = {"associations": {"phenotype": {}}, "variant": {}}
            assert re.match(r"^rs\d+$", row["snp"]) is not None
            variant["variant"]["rsid"] = row["snp"]
            variant["associations"]["phenotype"]["name"] = row["phewas phenotype"]
            variant["associations"]["cases"] = row["cases"]

            variant["associations"]["pval"] = float(row["p-value"])
            variant["associations"]["odds-ratio"] = row["odds-ratio"]
            variant["associations"]["phenotype"]["phewas_code"] = row["phewas code"]
            variant["variant"]["gene"] = row["gene_name"]
            variant["variant"]["gwas_associations"] = row["gwas-associations"].split(',')
            pos_info = row["chromosome"].split(' ')
            if len(pos_info) == 2:
                variant["variant"]["chrom"], variant["variant"]["pos"] = pos_info
            else:
                variant["variant"]["chrom"] = pos_info[0]
            results[variant["variant"]["rsid"]].append(variant)
        # Merge duplications
        rsid_list = [_item for _item in results.keys()]
        hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list)
        for k, v in results.items():
            if k in hgvs_rsid_dict and hgvs_rsid_dict[k]:
                doc = {'_id': hgvs_rsid_dict[k],
                       'phewas': v[0]["variant"]}
                if len(v) == 1:
                    doc["phewas"]["associations"] = v[0]["associations"]
                else:
                    doc["phewas"]["associations"] = [_item["associations"] for _item in v]
                yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NULL'])
Example #5
def load_data(data_folder):
    file_path_mondo = os.path.join(data_folder, "mondo.json")
    file_path_gene_disease = os.path.join(
        data_folder, "curated_gene_disease_associations.tsv.gz")
    file_path_snp_disease = os.path.join(
        data_folder, "all_variant_disease_pmid_associations.tsv.gz")
    file_path_disease_mapping = os.path.join(data_folder,
                                             "disease_mappings.tsv.gz")
    d_gene = process_gene(file_path_gene_disease)
    d_snp = process_snp(file_path_snp_disease)
    d_xrefs = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)
    for umls_id in set(list(d_gene.keys()) + list(d_snp.keys())):
        if umls_id in d_xrefs and 'mondo' in d_xrefs[umls_id]:
            _doc = {
                '_id': d_xrefs[umls_id]['mondo'],
                'disgenet': {
                    'xrefs': d_xrefs.get(umls_id, {}),
                    'genes_related_to_disease': d_gene.get(umls_id, {}),
                    'variants_related_to_disease': d_snp.get(umls_id, {})
                }
            }
            _doc = (dict_sweep(unlist(_doc), [None]))
            yield _doc
        elif umls_id in umls_2_mondo:
            mondo_id = umls_2_mondo[umls_id]
            for _mondo in mondo_id:
                _doc = {
                    '_id': _mondo,
                    'disgenet': {
                        'xrefs': d_xrefs.get(umls_id, {}),
                        'genes_related_to_disease': d_gene.get(umls_id, {}),
                        'variants_related_to_disease': d_snp.get(umls_id, {})
                    }
                }
                _doc = (dict_sweep(unlist(_doc), [None]))
                yield _doc
        else:
            _doc = {
                '_id': umls_id,
                'disgenet': {
                    'xrefs': d_xrefs.get(umls_id, {}),
                    'genes_related_to_disease': d_gene.get(umls_id, {}),
                    'variants_related_to_disease': d_snp.get(umls_id, {})
                }
            }
            _doc = (dict_sweep(unlist(_doc), [None]))
            yield _doc
Example #6
def load_data(data_folder):
    # Ontology data
    go_file = os.path.join(data_folder, "go.json")
    goterms = parse_ontology(go_file)
    # Gene annotation files
    for f in glob.glob(os.path.join(data_folder, "*.gaf.gz")):
        print("Parsing {}".format(f))
        docs = parse_gene_annotations(f)

        # Create gene ID cache. Join all gene sets and fetch ids.
        all_genes = set()
        for _id, annotations in docs.items():
            for key in [
                    "genes", "excluded_genes", "contributing_genes",
                    "colocalized_genes"
            ]:
                if annotations.get(key) is not None:
                    all_genes = all_genes | annotations[key]
        uniprot = [i for i, j in all_genes]
        symbols = [j for i, j in all_genes]
        taxid = annotations['taxid']  # taxid is assumed uniform within one GAF file
        # Fetch gene data from mygene.info
        lookup = IDLookup(taxid)
        lookup.query_mygene(uniprot, "uniprot,retired,accession")
        lookup.retry_failed_with_new_ids(symbols, "symbol")

        for _id, annotations in docs.items():
            # Add ontology annotations
            annotations['go'] = goterms[_id]
            annotations['source'] = 'go'
            # Add gene sets
            if annotations.get("genes") is not None:
                annotations['name'] = annotations['go']['name']
                annotations['description'] = annotations['go']['description']
                new_genes = []
                for u, s in annotations['genes']:
                    if lookup.query_cache.get(u) is not None:
                        new_genes.append(lookup.query_cache[u])
                    elif lookup.query_cache.get(s) is not None:
                        new_genes.append(lookup.query_cache[s])
                annotations['genes'] = new_genes
            else:
                # No genes in set
                continue

            for key in [
                    "excluded_genes", "contributing_genes", "colocalized_genes"
            ]:
                if annotations.get(key) is not None:
                    new_genes = []
                    for u, s in annotations.pop(key):
                        if lookup.query_cache.get(u) is not None:
                            new_genes.append(lookup.query_cache[u])
                        elif lookup.query_cache.get(s) is not None:
                            new_genes.append(lookup.query_cache[s])
                    annotations['go'][key] = new_genes
            # Clean up data
            annotations = unlist(annotations)
            annotations = dict_sweep(annotations)
            yield annotations
Example #7
def load_data(_file):
    # context manager ensures the file handle is closed after iteration
    with open(_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            _dict = restr_dict(row)
            _dict = unlist(dict_sweep(_dict))
            yield _dict
Example #8
def _map_line_to_json(fields):
    vid = fields[0].split(":")
    # use \d+ rather than [1-9]+ so chromosomes containing a 0 (10, 20, ...) parse fully
    chrom = re.search(r'\d+', vid[0]).group()

    if chrom == '23':
        chrom = 'X'
    HGVS = "chr%s:%s" % (chrom, vid[1])

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }

    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
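The parser assumes fields[0] looks like '<chrom>:g.<change>'; the digits are pulled out and 23 is remapped to X. A quick illustration with a hypothetical input value:

import re

vid = "23:g.1234A>G".split(":")            # hypothetical value of fields[0]
chrom = re.search(r'\d+', vid[0]).group()  # -> '23'
if chrom == '23':
    chrom = 'X'
print("chr%s:%s" % (chrom, vid[1]))        # -> chrX:g.1234A>G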
Example #9
def parse_ontology(f):
    "Get GO-term metadata from ontology JSON dump."
    with open(f, 'r') as infile:
        data = json.load(infile)
    nodes = data['graphs'][0]['nodes']
    go_terms = {}
    for node in nodes:
        url = node['id']
        _id = url.split("/")[-1]
        if not _id.startswith("GO_"):
            continue
        go_terms[_id] = {
            "id": _id,
            "url": url,
        }
        properties = node['meta'].get('basicPropertyValues', [])
        for p in properties:
            if p['val'] in [
                    "biological_process", "cellular_component",
                    "molecular_function"
            ]:
                go_terms[_id]["class"] = [p['val']]
        if node.get('lbl'):
            go_terms[_id]['name'] = node['lbl']
        if node['meta'].get("definition"):
            go_terms[_id]['description'] = node['meta']['definition'].get(
                'val')
            go_terms[_id]['xrefs'] = node['meta']['definition'].get('xrefs')
    go_terms = unlist(go_terms)
    go_terms = dict_sweep(go_terms)
    return go_terms
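For reference, a trimmed sketch of the obograph node shape this function expects inside go.json (structure inferred from the parsing code above, not from the GO spec):

node = {
    "id": "http://purl.obolibrary.org/obo/GO_0008150",
    "lbl": "biological_process",
    "meta": {
        "basicPropertyValues": [{"pred": "...hasOBONamespace", "val": "biological_process"}],
        "definition": {"val": "A biological process is ...", "xrefs": ["GOC:pdt"]},
    },
}
# parse_ontology() would turn this into (after unlist collapses singleton lists):
# {"GO_0008150": {"id": "GO_0008150", "url": node["id"],
#                 "class": "biological_process", "name": "biological_process",
#                 "description": "A biological process is ...", "xrefs": "GOC:pdt"}}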
Example #10
def _map_line_to_json(fields):
    vid = fields[0].split(":")
    # use \d+ rather than [1-9]+ so chromosomes containing a 0 (10, 20, ...) parse fully
    chrom = re.search(r'\d+', vid[0]).group()

    if chrom == '23':
        chrom = 'X'
    HGVS = "chr%s:%s" % (chrom, vid[1])

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }

    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
Example #11
def restructure_dict(dictionary):
    restr_dict = dict()
    _flag = 0
    for key in list(dictionary):
        if key == 'molecule_chembl_id':
            restr_dict['_id'] = dictionary[key]
        if key == 'molecule_structures' and isinstance(
                dictionary['molecule_structures'], dict):
            restr_dict['chembl'] = dictionary
            _flag = 1
            for x, y in dictionary['molecule_structures'].items():
                if x == 'standard_inchi_key':
                    restr_dict['chembl'].update(dictionary)
                    restr_dict['chembl'].update({'inchi_key': y})
                if x == 'canonical_smiles':
                    restr_dict['chembl']['smiles'] = y
                if x == 'standard_inchi':
                    restr_dict['chembl']['inchi'] = y

    if _flag == 0:
        restr_dict['chembl'] = dictionary
    del restr_dict['chembl']['molecule_structures']
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, ".", "-", "", "NA", "None", "none", " ",
                                "Not Available", "unknown", "null"
                            ])
    restr_dict = value_convert_to_number(
        restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return restr_dict
Example #12
def load_data(data_folder):
    input_fn = os.path.join(data_folder,"biomuta-master.csv")
    open_file = open(input_fn)
    db_biomuta = csv.reader(open_file)
    index = next(db_biomuta)
    assert len(index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
    index = [clean_index(s) for s in index]
    biomuta = (dict(zip(index, row)) for row in db_biomuta)
    json_rows = map(_map_line_to_json, biomuta)

    fd_tmp, tmp_path = mkstemp(dir=data_folder)
    os.close(fd_tmp)  # close the raw descriptor; the path is reopened below
    try:
        with open(tmp_path, "w") as f:
            dbwriter = csv.writer(f)
            for i, doc in enumerate(json_rows):
                if doc:
                    dbwriter.writerow([doc['_id'], json.dumps(doc)])  

        csvsort(tmp_path, [0,], has_header=False)
        
        with open(tmp_path) as csvfile:
            json_rows = csv.reader(csvfile)
            json_rows = (json.loads(row[1]) for row in json_rows)
            row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
            json_rows = (merge_duplicate_rows(rg, "biomuta") for rg in row_groups)
            json_rows = (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows)
            for res in json_rows:
                yield res

    finally:
        os.remove(tmp_path)
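merge_duplicate_rows is assumed to fold consecutive docs sharing an _id into one doc whose source field accumulates the duplicates; a minimal sketch of that contract (the real helper in biothings.utils.dataload merges field-by-field):

def merge_duplicate_rows(rows, source):
    # Sketch: collapse an iterator of same-_id docs into one, listing the
    # `source` sub-documents. Illustrates the assumed contract only.
    rows = iter(rows)
    merged = next(rows)
    for row in rows:
        if not isinstance(merged[source], list):
            merged[source] = [merged[source]]
        merged[source].append(row[source])
    return merged

docs = [{'_id': 'rs1', 'biomuta': {'pmid': 1}},
        {'_id': 'rs1', 'biomuta': {'pmid': 2}}]
print(merge_duplicate_rows(docs, 'biomuta'))
# {'_id': 'rs1', 'biomuta': [{'pmid': 1}, {'pmid': 2}]}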
Example #13
File: parser.py Project: ravila4/go
def get_gene_ids(symbols, uniprot_ids, taxid):
    """Fetch NCBI, Ensembl, and gene names from UniProt ids or gene symbol."""
    mg = mygene.MyGeneInfo()
    fields = 'entrezgene,ensembl.gene,name,symbol'
    # Fetch ids from  UniProt
    response = mg.querymany(uniprot_ids,
                            scopes='uniprot',
                            fields=fields,
                            species=taxid,
                            returnall=True)
    genes = {}
    for out in response['out']:
        if out.get("_id") is not None:
            query = out['query']
            geneid = out['_id']
            hits = genes.setdefault(query, {})
            hits[geneid] = {
                "mygene_id": geneid,
                "uniprot": query,
                "symbol": out.get('symbol'),
                "name": out.get('name')
            }
            if out.get("entrezgene") is not None:
                hits[geneid].setdefault('ncbigene', [])
                if out['entrezgene'] not in hits[geneid]['ncbigene']:
                    hits[geneid]['ncbigene'].append(out['entrezgene'])
            if out.get("ensembl") is not None:
                hits[geneid].setdefault('ensemblgene', [])
                hits[geneid]['ensemblgene'] = hits[geneid]['ensemblgene'] + \
                    [i['gene'] for i in alwayslist(out['ensembl'])]
    # Retry missing using gene symbol
    retry = [symbols[uniprot_ids.index(k)] for k in response['missing']]
    response = mg.querymany(retry,
                            scopes='symbol',
                            fields=fields,
                            species=taxid,
                            returnall=True)
    for out in response['out']:
        if out.get("_id") is not None:
            query = out['query']
            geneid = out['_id']
            hits = genes.setdefault(query, {})
            hits[geneid] = {
                "mygene_id": geneid,
                "uniprot": uniprot_ids[symbols.index(query)],
                "symbol": out['symbol'],
                "name": out.get('name')
            }
            if out.get("entrezgene") is not None:
                hits[geneid].setdefault('ncbigene', [])
                if out['entrezgene'] not in hits[geneid]['ncbigene']:
                    hits[geneid]['ncbigene'].append(out['entrezgene'])
            if out.get("ensembl") is not None:
                hits[geneid].setdefault('ensemblgene', [])
                hits[geneid]['ensemblgene'] = hits[geneid]['ensemblgene'] + \
                    [i['gene'] for i in alwayslist(out['ensembl'])]
    genes = unlist(genes)
    genes = dict_sweep(genes, vals=[None, 'null'])
    return genes
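With returnall=True, mg.querymany is assumed to return a dict carrying 'out' (hit documents), 'missing' (queries with no hit), and 'dup' (ambiguous queries); the retry pass above maps each missing UniProt id back to its symbol by position. A hypothetical response shape:

response = {
    'out': [{'query': 'P04637', '_id': '7157', 'symbol': 'TP53',
             'name': 'tumor protein p53', 'entrezgene': '7157',
             'ensembl': {'gene': 'ENSG00000141510'}}],
    'missing': ['Q00000'],   # no hit; retried against gene symbols
    'dup': [],
}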
Example #14
def load_data():
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for variant_id in range(MAX_VARIANT_NUMBER):
        if variant_id % 200 == 0:
            print("scanned {} variants".format(variant_id))
        civic_url = 'https://civic.genome.wustl.edu/api/variants/'
        url = civic_url + str(variant_id)
        doc = requests.get(url).json()
        # time delay for 0.5s
        time.sleep(0.5)
        if set(['error', 'status']) != set(doc.keys()):
            [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'reference_bases', 'variant_bases']]
            doc.pop("id")
            new_doc = {}
            doc['variant_id'] = variant_id
            if chrom and ref and alt:
                no_case1 += 1
                try:
                    new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
                except ValueError:
                    print("id has ref, alt, but couldn't be converted to hgvs id: {}".format(variant_id))
                    continue
            # handle cases of deletions where only ref info is provided
            elif chrom and ref and not alt:
                no_case2 += 1
                start = int(pos)
                end = int(pos) + len(ref) - 1
                if start == end:
                    new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
                else:
                    new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
            # handle cases of insertions where only alt info is provided
            elif chrom and alt and not ref:
                no_case3 += 1
                # NOTE: `start`/`end` were never assigned on this path in the
                # original; an insertion between pos and pos+1 is assumed here
                start = int(pos)
                new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, start + 1, alt)
            # handle cases where no ref or alt info provided,
            # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1
            else:
                no_case4 += 1
                new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
            # change doid into its formal representation, e.g. DOID:1
            for _evidence in doc['evidence_items']:
                if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                    _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
            new_doc['civic'] = doc
            yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
    print("number of ids with ref, alt, chrom: {}".format(no_case1))
    print("number of ids with chrom, ref but no alt: {}".format(no_case2))
    print("number of ids with chrom, alt but no ref: {}".format(no_case3))
    print("number of ids with no ref and alt: {}".format(no_case4))
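The deletion and insertion branches hand-build genomic HGVS ids. A hedged summary of the id shapes produced above (assuming 1-based coordinates, with get_hgvs_from_vcf handling the substitution case):

def civic_hgvs_id(chrom, pos, ref, alt):
    # Hypothetical helper mirroring the id shapes built in load_data() above.
    if ref and not alt:                          # deletion
        start, end = int(pos), int(pos) + len(ref) - 1
        if start == end:
            return 'chr{0}:g.{1}del'.format(chrom, start)
        return 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
    if alt and not ref:                          # insertion between pos and pos+1
        start = int(pos)
        return 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, start + 1, alt)
    return None                                  # substitutions use get_hgvs_from_vcf()

print(civic_hgvs_id('7', 140453136, 'TA', None))   # chr7:g.140453136_140453137del
print(civic_hgvs_id('7', 140453136, None, 'AGT'))  # chr7:g.140453136_140453137insAGT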
Example #15
def restructure_dict(dictionary):
    restr_dict = dict()
    restr_dict['_id'] = dictionary['ChEBI ID']
    restr_dict['chebi'] = dictionary
    restr_dict['chebi'] = clean_up(restr_dict['chebi'])
    restr_dict = dict_sweep(restr_dict, vals=[None, ".", "-", "", "NA", "none", " ",
                                              "Not Available", "unknown", "null", "None"])
    restr_dict = value_convert(unlist(restr_dict),
                               skipped_keys=["beilstein_registry_numbers", "pubchem_database_links",
                                             "pubmed_citation_links", "sabio_rk_database_links",
                                             "gmelin_registry_numbers", "molbase_database_links"])
    return restr_dict
Example #16
def load_packages(_file):
    # context manager ensures the file handle is closed after iteration
    with open(_file, 'r', encoding='latin1') as f:
        reader = csv.DictReader(f, dialect='excel-tab')
        for row in reader:
            _dict = package_restr_dict(row)
            _dict = unlist(dict_sweep(_dict))
            _dict["_id"] = _dict["ndc"]["productndc"]
            yield _dict
Example #17
def load_packages(_file):
    # context manager ensures the file handle is closed after iteration
    with open(_file, 'r', encoding='latin1') as f:
        reader = csv.DictReader(f, dialect='excel-tab')
        for row in reader:
            _dict = package_restr_dict(row)
            _dict = unlist(dict_sweep(_dict))
            _dict["_id"] = _dict["ndc"]["productndc"]
            yield _dict
Example #18
def restructure_drug_indications(indication_data):
    """ Group drug indications by molecule_chembl_id
    """
    restr_dict = {}
    for doc in indication_data:
        key = doc["molecule_chembl_id"]
        restr_dict.setdefault(key, []).append(doc)
    restr_dict = unlist(restr_dict)
    return restr_dict
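The same setdefault grouping idiom recurs in the next restructure_* helpers; unlist then collapses groups that hold a single document. For example (field names illustrative):

indications = [
    {'molecule_chembl_id': 'CHEMBL25', 'mesh_heading': 'Pain'},
    {'molecule_chembl_id': 'CHEMBL25', 'mesh_heading': 'Fever'},
    {'molecule_chembl_id': 'CHEMBL521', 'mesh_heading': 'Inflammation'},
]
grouped = restructure_drug_indications(indications)
# grouped['CHEMBL25']  -> a list of two docs (stays a list)
# grouped['CHEMBL521'] -> a single doc (unlist collapsed the one-element list)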
Example #19
def restructure_metabolisms(metabolism_data):
    """ Group metabolism data by molecule_chembl_id
    """
    restr_dict = {}
    for doc in metabolism_data:
        key = doc["drug_chembl_id"]
        restr_dict.setdefault(key, []).append(doc)
    restr_dict = unlist(restr_dict)
    return restr_dict
Example #20
File: parser.py Project: ravila4/go
def load_data(data_folder):
    # Ontology data
    go_file = os.path.join(data_folder, "go.json")
    goterms = parse_ontology(go_file)
    # Gene annotation files
    for f in glob.glob(os.path.join(data_folder, "*.gaf.gz")):
        print("Parsing {}".format(f))
        docs = parse_gene_annotations(f)

        # Create gene ID cache. Join all gene sets and fetch ids.
        all_genes = set()
        for _id, annotations in docs.items():
            for key in [
                    "genes", "excluded_genes", "contributing_genes",
                    "colocalized_genes"
            ]:
                if annotations.get(key) is not None:
                    all_genes = all_genes | annotations[key]
        uniprot = [i for i, j in all_genes]
        symbols = [j for i, j in all_genes]
        taxid = annotations['taxid']  # taxid is assumed uniform within one GAF file
        genecache = get_gene_ids(symbols, uniprot, taxid)

        for _id, annotations in docs.items():
            # Add ontology annotations
            annotations['go'] = goterms[_id]
            # Add gene sets
            if annotations.get("genes") is not None:
                genes = []
                for u, s in annotations['genes']:
                    if genecache.get(u) is not None:
                        genes += [g for g in genecache[u].values()]
                    elif genecache.get(s) is not None:
                        genes += [g for g in genecache[s].values()]
                    else:
                        # += with a dict would add its *keys*; append the dict itself
                        genes.append({'symbol': s, 'uniprot': u})
                annotations['genes'] = genes
            else:
                # No genes in set
                continue

            for key in [
                    "excluded_genes", "contributing_genes", "colocalized_genes"
            ]:
                if annotations.get(key) is not None:
                    genes = []
                    for u, s in annotations[key]:
                        if genecache.get(u) is not None:
                            genes += [g for g in genecache[u].values()]
                        elif genecache.get(s) is not None:
                            genes += [g for g in genecache[s].values()]
                        else:
                            # same fix as above: append the dict, not its keys
                            genes.append({'symbol': s, 'uniprot': u})
                    annotations[key] = genes
            # Clean up data
            annotations = unlist(annotations)
            yield annotations
Example #21
def restructure_activities(activity_data):
    """ Group activities by molecule_chembl_id
    """
    restr_dict = {}
    for doc in activity_data:
        key = doc["molecule_chembl_id"]
        restr_dict.setdefault(key, []).append(doc)
    restr_dict = unlist(restr_dict)
    return restr_dict
Example #22
def restructure_dict(dictionary):
    restr_dict = dict()
    restr_dict['_id'] = dictionary['ChEBI ID']
    restr_dict['chebi'] = dictionary
    restr_dict['chebi'] = clean_up(restr_dict['chebi'])
    restr_dict = dict_sweep(restr_dict, vals=[None, ".", "-", "", "NA", "none", " ",
                                              "Not Available", "unknown", "null", "None", "NaN"])
    restr_dict = value_convert_to_number(unlist(restr_dict),
                                         skipped_keys=["cid", "sid", "beilstein", "pubmed",
                                                       "sabio_rk", "gmelin", "molbase",
                                                       "synonyms", "wikipedia", "url_stub"])
    return restr_dict
Example #23
def load_data(input_file):

    with open_anyfile(input_file) as in_f:
        result = defaultdict(list)
        for line in in_f:
            pharos_id, _id = line.strip().split(',')
            if _id != 'entrez_gene_id' and _id != '0':
                result[str(_id)].append(int(pharos_id))
        for k, v in result.items():
            json_doc = {'_id': str(k), 'pharos': {"target_id": v}}
            yield unlist(json_doc)
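Given lines of 'pharos_id,entrez_gene_id' pairs, the defaultdict inverts the mapping so each gene id collects its Pharos target ids; unlist then flattens singleton lists. With hypothetical input:

# lines in input_file:      docs yielded by load_data():
#   "1,7157"                  {'_id': '7157', 'pharos': {'target_id': [1, 2]}}
#   "2,7157"                  {'_id': '672',  'pharos': {'target_id': 3}}
#   "3,672"                   (the singleton list was collapsed by unlist)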
Example #24
    def reformat(cls, dictionary):
        ret_dict = dict()
        _flag = 0
        for key in list(dictionary):
            if key == 'molecule_chembl_id':
                ret_dict['_id'] = dictionary[key]
            if key == 'molecule_structures' and isinstance(
                    dictionary['molecule_structures'], dict):
                ret_dict['chembl'] = dictionary
                _flag = 1
                for x, y in dictionary['molecule_structures'].items():
                    if x == 'standard_inchi_key':
                        ret_dict['chembl'].update(dictionary)
                        ret_dict['chembl'].update({'inchi_key': y})
                    if x == 'canonical_smiles':
                        ret_dict['chembl']['smiles'] = y
                    if x == 'standard_inchi':
                        ret_dict['chembl']['inchi'] = y

        if _flag == 0:
            ret_dict['chembl'] = dictionary
        if 'cross_references' in ret_dict['chembl'] and ret_dict['chembl'][
                'cross_references']:
            ret_dict['chembl'][
                'xrefs'] = MoleculeCrossReferenceListTransformer.transform_to_dict(
                    ret_dict['chembl']['cross_references'])

        del ret_dict['chembl']['molecule_structures']
        del ret_dict['chembl']['cross_references']

        ret_dict = unlist(ret_dict)

        # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
        if 'chebi_par_id' in ret_dict['chembl'] and ret_dict['chembl'][
                'chebi_par_id']:
            ret_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(
                ret_dict['chembl']['chebi_par_id'])
        else:
            # clean, could be a None
            ret_dict['chembl'].pop("chebi_par_id", None)

        ret_dict = dict_sweep(ret_dict,
                              vals=[
                                  None, ".", "-", "", "NA", "None", "none",
                                  " ", "Not Available", "unknown", "null"
                              ])
        ret_dict = value_convert_to_number(
            ret_dict, skipped_keys=["chebi_par_id", "first_approval"])
        ret_dict = boolean_convert(ret_dict, [
            "topical", "oral", "parenteral", "dosed_ingredient",
            "polymer_flag", "therapeutic_flag", "med_chem_friendly",
            "molecule_properties.ro3_pass"
        ])
        return ret_dict
Example #25
def load_data(tsv_file):
    # context manager ensures the file handle is closed after iteration
    with open(tsv_file) as _file:
        reader = csv.DictReader(_file, delimiter='\t')
        for row in reader:
            _id = row["PharmGKB Accession Id"]
            _d = restr_dict(row)
            _d = clean_up(_d)
            _d = unlist(dict_sweep(_d))
            yield {'_id': _id, 'pharmgkb': _d}
Example #26
def load_data(tsv_file):
    # context manager ensures the file handle is closed after iteration
    with open(tsv_file) as _file:
        reader = csv.DictReader(_file, delimiter='\t')
        for row in reader:
            _id = row["PharmGKB Accession Id"]
            _d = restr_dict(row)
            _d = clean_up(_d)
            _d = unlist(dict_sweep(_d))
            yield {'_id': _id, 'pharmgkb': _d}
Example #27
def load_data():
    pharmacology_class = process_pharmacology_action(file_path_pharma_class)
    faers = process_faers(file_path_faers)
    act = process_act(file_path_act)
    omop = process_omop(file_path_omop)
    approval = process_approval(file_path_approval)
    drug_dosage = process_drug_dosage(file_path_drug_dosage)
    synonyms = process_synonym(file_path_synonym)
    structures = process_structure(file_path_structure)
    identifiers = process_identifier(file_path_identifier)
    for struc_id in set(
            list(pharmacology_class.keys()) + list(faers.keys()) +
            list(act.keys()) + list(omop.keys()) + list(approval.keys()) +
            list(drug_dosage.keys()) + list(identifiers.keys()) +
            list(synonyms.keys()) + list(structures.keys())):
        if structures.get(struc_id, {}).get('inchikey', {}):
            _doc = {
                '_id': structures.get(struc_id, {}).get('inchikey', {}),
                'drugcentral': {
                    "pharmacology_class": pharmacology_class.get(struc_id, {}),
                    "fda_adverse_event": faers.get(struc_id, {}),
                    "bioactivity": act.get(struc_id, {}),
                    "drug_use": omop.get(struc_id, {}),
                    "approval": approval.get(struc_id, {}),
                    "drug_dosage": drug_dosage.get(struc_id, {}),
                    "synonyms": synonyms.get(struc_id, {}),
                    "structures": structures.get(struc_id, {}),
                    "xref": identifiers.get(struc_id, {})
                }
            }
        else:
            _id = xref_2_inchikey(identifiers.get(struc_id, {}))
            if not _id:
                _id = 'DrugCentral:' + str(struc_id)
            _doc = {
                '_id': _id,
                'drugcentral': {
                    "pharmacology_class": pharmacology_class.get(struc_id, {}),
                    "fda_adverse_event": faers.get(struc_id, {}),
                    "bioactivity": act.get(struc_id, {}),
                    "drug_use": omop.get(struc_id, {}),
                    "approval": approval.get(struc_id, {}),
                    "drug_dosage": drug_dosage.get(struc_id, {}),
                    "synonyms": synonyms.get(struc_id, {}),
                    "structures": structures.get(struc_id, {}),
                    "xref": identifiers.get(struc_id, {})
                }
            }
        _doc = (dict_sweep(unlist(_doc), [None]))
        yield _doc
Example #28
    def parse(self, record: vcf.model._Record, doc_key: str):
        """
            When parsing gnomad.genomes.*.vcf.bgz files, `doc_key` should be "gnomad_genome";
            when parsing gnomad.exomes.*.vcf.bgz files, `doc_key` should be "gnomad_exome".

            The returned document has the following structure:

                one_snp_json = {
                    "_id": hgvs_id,
                    doc_key: {
                        "chrom": chrom,
                        ...
                    }
                }
            """
        # in hg38 gnomAD source files the CHROM value starts with 'chr'; strip the prefix first
        if record.CHROM.startswith('chr'):
            record.CHROM = record.CHROM[3:]  # required by the `profile_parser.parse()` method
        if record.CHROM not in CHROM_VALID_VALUES:
            return

        info = record.INFO

        for key in ["AC", "AF", "nhomalt"]:
            if key in info:
                assert len(record.ALT) == len(info[key]), \
                    "length of record.ALT != length of info.%s, at CHROM=%s, POS=%s" % (key, record.CHROM, record.POS)

        profile_list = self.profile_parser.parse(record)
        site_quality_metrics_dict = self.site_quality_metrics_parser.parse(info)

        for i in range(len(record.ALT)):
            hgvs_id, profile_dict = profile_list[i]
            if hgvs_id is None:
                continue

            population_frequency_dict = self.population_frequency_parser.parse(info, i)

            one_snp_json = {
                "_id": hgvs_id,
                doc_key: {
                    **profile_dict,
                    **site_quality_metrics_dict,
                    **population_frequency_dict
                }
            }

            obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json, skipped_keys=['chrom'])), [None]))
            yield obj
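The per-ALT loop depends on VCF 'Number=A' INFO fields (AC, AF, nhomalt) aligning index-for-index with record.ALT, which is what the assertions check. A tiny illustration with plain values standing in for the PyVCF record:

alts = ['T', 'G']                                # record.ALT
info = {'AC': [5, 2], 'AF': [0.00031, 0.00012]}  # one value per ALT allele
for i, alt in enumerate(alts):
    print(alt, info['AC'][i], info['AF'][i])
# T 5 0.00031
# G 2 0.00012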
Example #29
def load_data(data_folder):
    tar = tarfile.open(
        os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.tar"))
    member = tar.getmember(
        "Kaviar-160204-Public/vcfs/Kaviar-160204-Public-hg19.vcf.gz")
    member.name = os.path.basename(member.name)
    tar.extract(member, path=data_folder)
    tar.close()

    input_fn = os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.gz")
    vcf_reader = vcf.Reader(filename=input_fn,
                            compressed=True,
                            strict_whitespace=True)

    json_rows = map(_map_line_to_json, vcf_reader)
    json_rows = chain.from_iterable(json_rows)

    fd_tmp, tmp_path = mkstemp(dir=data_folder)
    os.close(fd_tmp)  # close the raw descriptor; the path is reopened below

    try:
        with open(tmp_path, "w") as f:
            dbwriter = csv.writer(f)
            for doc in json_rows:
                if doc:
                    dbwriter.writerow([doc['_id'], json.dumps(doc)])

        csvsort(tmp_path, [0])

        with open(tmp_path) as csvfile:
            json_rows = csv.reader(csvfile)
            json_rows = (json.loads(row[1]) for row in json_rows)
            row_groups = (
                it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
            json_rows = (merge_duplicate_rows(rg, "kaviar")
                         for rg in row_groups)

            import logging  # local import kept from the original
            for row in json_rows:
                logging.debug(row)
                res = unlist(dict_sweep(row, vals=[None]))
                yield res

    finally:
        os.remove(tmp_path)
        os.remove(input_fn)
Example #30
def get_genesets(obo_filename, genemap_filename):
    disease_ontology = GO()
    obo_is_loaded = disease_ontology.load_obo(obo_filename)

    if obo_is_loaded is False:
        logging.error('Failed to load OBO file.')

    doid_omim_dict = build_doid_omim_dict(obo_filename)

    mim_diseases = build_mim_diseases_dict(genemap_filename)

    entrez_set = add_term_annotations(doid_omim_dict, disease_ontology,
                                      mim_diseases)

    genes_info = query_mygene(entrez_set, TAX_ID)
    disease_ontology.populated = True
    disease_ontology.propagate()

    genesets = list()
    for term_id, term in disease_ontology.go_terms.items():
        # If a term includes any valid gene IDs, add it as a geneset.
        gid_set = set()
        for annotation in term.annotations:
            gid_set.add(annotation.gid)

        if gid_set:
            my_geneset = {}
            my_geneset['_id'] = term_id.replace(":", "_")
            my_geneset['is_public'] = True
            my_geneset['taxid'] = TAX_ID
            my_geneset['source'] = 'do'
            my_geneset['name'] = term.full_name
            do_abstract = create_gs_abstract(term, doid_omim_dict)
            my_geneset['description'] = do_abstract

            # Genes in a geneset are sorted by their IDs to make output reproducible.
            my_geneset['genes'] = [
                genes_info[str(gid)] for gid in sorted(gid_set)
            ]

            my_geneset['do'] = {'id': term_id, 'abstract': do_abstract}

            my_geneset = dict_sweep(my_geneset,
                                    vals=[None],
                                    remove_invalid_list=True)
            my_geneset = unlist(my_geneset)
            genesets.append(my_geneset)

    return genesets
Example #31
def _map_line_to_json(fields):
    """Mapping each lines in csv file into JSON doc
    """
    one_snp_json = {
        "gene": fields[1],
        "variant_id": fields[2],
        "exon": fields[3],
        "egl_variant": fields[4],
        "egl_protein": fields[5],
        "egl_classification": fields[6],
        "egl_classification_date": fields[7],
        "hgvs": fields[8].split(" | ")
    }

    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
Example #32
 def query_mygene(self, ids, id_type):
     """Query information from mygene.info about each gene in 'ids'.
     Args:
         ids (iterable): Array or set of gene ids to query.
         id_type (str): query scope field for the ids.
             Can be a comma-separated string for multiple scopes.
             e.g. 'entrezgene,symbol'
     """
     self.ids = ids
     mg = mygene.MyGeneInfo()
     # Fields to query
     fields = "entrezgene,ensembl.gene,uniprot.Swiss-Prot,symbol,name"
     if id_type == "symbol":
         scopes = "symbol,alias"
     elif id_type == "entrezgene":
         scopes = "entrezgene,retired"
     else:
         scopes = id_type
     response = mg.querymany(ids,
                             scopes=scopes,
                             fields=fields,
                             species=self.species,
                             returnall=True)
     # Save failed queries
     self.missing = response['missing']
     # Format successful queries
     for out in response['out']:
         query = out['query']
         if out.get('notfound'):
             continue
         gene = {'mygene_id': out['_id']}
         if out.get('symbol') is not None:
             gene['symbol'] = out['symbol']
         if out.get('name') is not None:
             gene['name'] = out['name']
         if out.get('entrezgene') is not None:
             gene['ncbigene'] = out['entrezgene']
         if out.get('ensembl') is not None:
             if len(out['ensembl']) > 1:
                 for i in out['ensembl']:
                     gene.setdefault('ensemblgene', []).append(i['gene'])
             else:
                 gene['ensemblgene'] = out['ensembl']['gene']
         if out.get('uniprot') is not None:
             gene['uniprot'] = out['uniprot']['Swiss-Prot']
         gene = dict_sweep(gene)
         gene = unlist(gene)
         self.query_cache[query] = gene
Example #33
def load_data(assembly, input_file, chrom):
    import logging as loggingmod
    global logging
    logging = loggingmod.getLogger("dbsnp_upload")
    logging.info("Processing chr{}...".format(chrom))
    snpdoc_iter = parse_vcf(assembly,
                            input_file,
                            compressed=True,
                            verbose=False,
                            by_id=True,
                            reference=chrom)
    for doc in snpdoc_iter:
        _doc = {'dbsnp': doc}
        _doc['_id'] = doc['_id']
        del doc['_id']
        yield (dict_sweep(unlist(value_convert_to_number(_doc)), [None]))
Example #34
def load_data():
    pharmacology_class = process_pharmacology_action(file_path_pharma_class)
    faers = process_faers(file_path_faers)
    act = process_act(file_path_act)
    omop = process_omop(file_path_omop)
    approval = process_approval(file_path_approval)
    drug_dosage = process_drug_dosage(file_path_drug_dosage)
    synonyms = process_synonym(file_path_synonym)
    structures = process_structure(file_path_structure)
    identifiers = process_identifier(file_path_identifier)
    for struc_id in set(
            list(pharmacology_class.keys()) + list(faers.keys()) +
            list(act.keys()) + list(omop.keys()) + list(approval.keys()) +
            list(drug_dosage.keys()) + list(identifiers.keys()) +
            list(synonyms.keys()) + list(structures.keys())):
        if structures.get(struc_id, {}).get('inchikey', {}):
            _doc = {
                '_id': structures.get(struc_id, {}).get('inchikey', {}),
                'drugcentral': {
                    "pharmacology_class": pharmacology_class.get(struc_id, {}),
                    "fda_adverse_event": faers.get(struc_id, {}),
                    "bioactivity": act.get(struc_id, {}),
                    "drug_use": omop.get(struc_id, {}),
                    "approval": approval.get(struc_id, {}),
                    "drug_dosage": drug_dosage.get(struc_id, {}),
                    "synonyms": synonyms.get(struc_id, {}),
                    "structures": structures.get(struc_id, {}),
                    "xrefs": identifiers.get(struc_id, {})
                }
            }
        else:
            _id = xrefs_2_inchikey(identifiers.get(struc_id, {}))
            if not _id:
                _id = 'DrugCentral:' + str(struc_id)
            _doc = {
                '_id': _id,
                'drugcentral': {
                    "pharmacology_class": pharmacology_class.get(struc_id, {}),
                    "fda_adverse_event": faers.get(struc_id, {}),
                    "bioactivity": act.get(struc_id, {}),
                    "drug_use": omop.get(struc_id, {}),
                    "approval": approval.get(struc_id, {}),
                    "drug_dosage": drug_dosage.get(struc_id, {}),
                    "synonyms": synonyms.get(struc_id, {}),
                    "structures": structures.get(struc_id, {}),
                    "xrefs": identifiers.get(struc_id, {})
                }
            }
        _doc = (dict_sweep(unlist(_doc), [None]))
        yield _doc
Example #35
def load_data(data_folder):
    input_fn = os.path.join(data_folder, "CCLE_DepMap_18q3_maf_20180718.txt")
    db_ccle = csv.reader(open(input_fn), delimiter='\t')
    index = next(db_ccle)
    assert len(index) == VALID_COLUMN_NO, \
        "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
    index = [clean_index(s) for s in index]
    ccle = (dict(zip(index, row)) for row in db_ccle)
    ccle = filter(lambda row: row["chromosome"] != "", ccle)
    json_rows = map(_map_line_to_json, ccle)
    json_rows = (row for row in json_rows if row)
    json_rows = sorted(json_rows, key=lambda k: k['_id'])
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    json_rows = (merge_duplicate_rows(rg, "ccle") for rg in row_groups)
    return (unlist(dict_sweep(row, vals=[
        None,
    ])) for row in json_rows)
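itertools.groupby only merges adjacent rows, which is why the rows are sorted by _id first (the Kaviar and BioMuta loaders above achieve the same adjacency with an on-disk csvsort). A quick demonstration:

from itertools import groupby

rows = [{'_id': 'a', 'v': 1}, {'_id': 'b', 'v': 2}, {'_id': 'a', 'v': 3}]
rows = sorted(rows, key=lambda r: r['_id'])
print([(k, len(list(g))) for k, g in groupby(rows, key=lambda r: r['_id'])])
# [('a', 2), ('b', 1)]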
Example #36
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    hpo_count = item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            continue  # skip this alt; `return` here would drop the remaining alts
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
Example #37
def restructure_dict(dictionary):
    restr_dict = dict()
    _flag = 0
    for key in list(dictionary):
        if key == 'molecule_chembl_id':
            restr_dict['_id'] = dictionary[key]
        if key == 'molecule_structures' and isinstance(dictionary['molecule_structures'], dict):
            restr_dict['chembl'] = dictionary
            _flag = 1
            for x, y in dictionary['molecule_structures'].items():
                if x == 'standard_inchi_key':
                    restr_dict['chembl'].update(dictionary)
                    restr_dict['chembl'].update({'inchi_key': y})
                if x == 'canonical_smiles':
                    restr_dict['chembl']['smiles'] = y
                if x == 'standard_inchi':
                    restr_dict['chembl']['inchi'] = y

    if _flag == 0:
        restr_dict['chembl'] = dictionary
    if 'cross_references' in restr_dict['chembl'] and restr_dict['chembl']['cross_references']:
        restr_dict['chembl']['xrefs'] = restructure_xref(restr_dict['chembl']['cross_references'])

    del restr_dict['chembl']['molecule_structures']
    del restr_dict['chembl']['cross_references']
    restr_dict = unlist(restr_dict)
    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    if 'chebi_par_id' in restr_dict['chembl'] and restr_dict['chembl']['chebi_par_id']:
        restr_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(restr_dict['chembl']['chebi_par_id'])
    else:
        # clean, could be a None
        restr_dict['chembl'].pop("chebi_par_id",None)

    restr_dict = dict_sweep(restr_dict, vals=[None,".", "-", "", "NA", "None","none", " ", "Not Available", "unknown","null"])
    restr_dict = value_convert_to_number(restr_dict, skipped_keys=["chebi_par_id","first_approval"])
    restr_dict = boolean_convert(restr_dict, ["topical","oral","parenteral","dosed_ingredient","polymer_flag",
        "therapeutic_flag","med_chem_friendly","molecule_properties.ro3_pass"])
    return restr_dict
Example #38
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)

    # load as json data
    if HGVS is None:
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],   # ref aa
            'naa': fields[108],   # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],    # raw CADD score
            'phred': fields[115]        # log-percentile of raw CADD score
        }
    }

    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
Example #39
def _map_line_to_json(df, version, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2 represents hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # skip the item if no hg19 position is provided
    if df["hg19_pos(1-based)"] == '.':
        return None
    chromStart = int(df["hg19_pos(1-based)"])
    chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    else:
        raise ValueError("version must be 'hg19' or 'hg38'")
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V6_gene"].split('|')
    gtex_tissue = df["GTEx_V6_tissue "].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    # parse MutPred top 5 features
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))
    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(" (") and mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [
            {"mechanism": mutpred_mechanisms[0],
             "p_val": modify_pvalue(mutpred_mechanisms[1])},
            {"mechanism": mutpred_mechanisms[2],
             "p_val": modify_pvalue(mutpred_mechanisms[3])},
            {"mechanism": mutpred_mechanisms[4],
             "p_val": modify_pvalue(mutpred_mechanisms[5])},
            {"mechanism": mutpred_mechanisms[6],
             "p_val": modify_pvalue(mutpred_mechanisms[7])},
            {"mechanism": mutpred_mechanisms[8],
             "p_val": modify_pvalue(mutpred_mechanisms[9])}
        ]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    # load as JSON document
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": list(map(int,[i for i in df["clinvar_clnsig"].split("|") if i != "."])),
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": list(map(int,[i for i in df["clinvar_golden_stars"].split("|") if i != "."]))
            },
            "gtex": list(gtex)
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
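The nested `map(dict, map(lambda t: zip(...), zip(...)))` construction used for `gtex` and `uniprot` above is terse; the comprehension below is an equivalent expansion, run on hypothetical values to show the shape it produces (in Python 3 `map` returns a lazy iterator, which is why the parser wraps the results in `list(...)` before embedding them in the document):

# Equivalent to the gtex construction above, with hypothetical input values.
gtex_gene = "ENSG00000141510|ENSG00000141510".split('|')
gtex_tissue = "Lung|Whole_Blood".split('|')

gtex = [dict(zip(('gene', 'tissue'), pair)) for pair in zip(gtex_gene, gtex_tissue)]
# [{'gene': 'ENSG00000141510', 'tissue': 'Lung'},
#  {'gene': 'ENSG00000141510', 'tissue': 'Whole_Blood'}]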
Example #40
def _map_line_to_json(cp, hg19):
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except AttributeError:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except AttributeError:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except AttributeError:
        last_evaluated = None
    
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except (AttributeError, IndexError):
        origin = None
    conditions = []
    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
        synonyms = []
        conditions_name = ''
        for name in _trait.Name:
            if name.ElementValue.Type == 'Alternate':
                synonyms.append(name.ElementValue.get_valueOf_())
            if name.ElementValue.Type == 'Preferred':
                conditions_name += name.ElementValue.get_valueOf_()
        identifiers = {}
        for item in _trait.XRef:
            if item.DB == 'Human Phenotype Ontology':
                key = 'Human_Phenotype_Ontology'
            else:
                key = item.DB
            identifiers[key.lower()] = item.ID
        for symbol in _trait.Symbol:
            if symbol.ElementValue.Type == 'Preferred':
                conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
        age_of_onset = ''
        for _set in _trait.AttributeSet:
            if _set.Attribute.Type == 'age of onset':
                age_of_onset = _set.Attribute.get_valueOf_()
        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})

    try:
        genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
    except AttributeError:
        genotypeset = None
    if genotypeset:
        obj_list = []
        id_list = []
        for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
            variant_id = _set.ID
            for _measure in _set.Measure:
                json_obj = parse_measure(_measure, hg19=hg19)
                if json_obj:
                    json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
                        'clinical_significance': clinical_significance,
                        'number_submitters': number_submitters,
                        'review_status': review_status,
                        'last_evaluated': str(last_evaluated),
                        'origin': origin,
                        'conditions': conditions})
                    json_obj['clinvar'].update({'variant_id': variant_id})
                    json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                                               ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
                    obj_list.append(json_obj)
                    id_list.append(json_obj['_id'])
        for _obj in obj_list:
            _obj['clinvar'].update({'genotypeset': {
                    'type': 'CompoundHeterozygote',
                    'genotype': id_list
                    }})
            yield _obj
    else:
        variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
        for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
            json_obj = parse_measure(_measure, hg19=hg19)
            if json_obj:
                json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
                        'clinical_significance': clinical_significance,
                        'number_submitters': number_submitters,
                        'review_status': review_status,
                        'last_evaluated': str(last_evaluated),
                        'origin': origin,
                        'conditions': conditions})
                json_obj['clinvar'].update({'variant_id': variant_id})
                json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                                               ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
                yield json_obj
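The try/except blocks above all probe optional attributes on the generated ClinVar XML bindings; a small helper could express that pattern once. This is a sketch, not part of the original code:

from functools import reduce

def safe_attr(obj, path, default=None):
    """Follow a dotted attribute path, returning `default` when any hop is missing."""
    try:
        return reduce(getattr, path.split('.'), obj)
    except AttributeError:
        return default

# e.g., equivalent to the first try/except in Example #40:
# clinical_significance = safe_attr(cp, 'ReferenceClinVarAssertion.ClinicalSignificance.Description')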
Example #41
def load_data():
    dgidb_docs = fetch_all_docs_from_api()
    for _doc in dgidb_docs:
        _doc['interaction_id'] = _doc.pop('id')
        yield dict_sweep(unlist({'dgidb': _doc}), vals=[None, "", []])
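Example #41 only renames the `id` key and sweeps empty values; with a hypothetical document from `fetch_all_docs_from_api()` (field names here are illustrative only), the transformation looks like this:

# Hypothetical DGIdb API document.
_doc = {'id': 'interaction-123', 'drug_name': 'IMATINIB', 'sources': []}

_doc['interaction_id'] = _doc.pop('id')
swept = dict_sweep(unlist({'dgidb': _doc}), vals=[None, "", []])
# -> {'dgidb': {'interaction_id': 'interaction-123', 'drug_name': 'IMATINIB'}}
# (the empty 'sources' list is removed by dict_sweep)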
Example #42
    def annotate(self, hgvs_vcfs):
        """hgvs_vcfs: list of {"vcf": {}, "_id": ""}"""

        # title of vcf
        vcf_stdin = ['#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO']
        for hgvs_id in hgvs_vcfs:
            vcf = hgvs_vcfs[hgvs_id]["vcf"]
            try:
                self.check_hgvs_info(vcf)
            except (TypeError, ValueError) as e:
                self.logger.warning("Skipping HGVS %s: %s" % (repr(hgvs_vcfs[hgvs_id]),e))
                continue
            # add the HGVS ID at the end so we can be sure which annotations correspond
            # to which ID, instead of rebuilding it from the VCF info (they can differ);
            # this comment will be at the first position in the result line
            vcf_stdin.append(str(vcf["chrom"]) + '\t' + str(vcf["position"]) + '\t' + '.' + '\t' + vcf["ref"] + '\t' + vcf["alt"] + '\t.\t.\t.' + "\t# hgvs:" + hgvs_id)

        if (len(vcf_stdin) - 1) == 0:
            self.logger.info("No HGVS ID as input (previously filtered out)")
            return
        self.logger.info("Running '%s' on %d HGVS IDs" % (self.snpeff_cmd,len(vcf_stdin)-1)) # -1: header
        proc = subprocess.Popen(self.snpeff_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate("\n".join(vcf_stdin).encode())
        stderr = stderr.decode()
        # snpEff prints a "new version" notice on stderr, which is a bad idea when
        # stderr is used to detect errors; try to strip it out
        if "NEW VERSION!" in stderr:
            stderr = stderr.splitlines()
            start = stderr.index("NEW VERSION!")
            # message is 5 lines long (hopefully..)
            end = start + 5
            stderr = stderr[:start] + stderr[end:]
            # rebuild and clean any empty lines
            stderr = "\n".join(stderr).strip()
        if stderr != '':
            fn = "snpeff_err_%s.pickle" % datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            pickle.dump({"input" : hgvs_vcfs,
                         "vcf_stdin" : vcf_stdin,
                         "stderr" : stderr},open(fn,"wb"))
            raise Exception("Something went wrong while generating snpeff annotation (see dump %s for more):\n%s" % (fn,stderr))

        strout = stdout.decode()
        vcf_stdout_raw = strout.splitlines()
        for vcf_line in vcf_stdout_raw:
            if vcf_line.startswith('#'):
                continue
            elif vcf_line == '':
                continue
            else:
                fromi = vcf_line.index("#")
                str_id = vcf_line[fromi:]
                hgvs_info = str_id.replace("#","").strip().split(":")
                # extract HGVS
                assert hgvs_info[0] == "hgvs", "Can't find HGVS ID in VCF line '%s'" % repr(vcf_line)
                hgvs_id = ":".join(hgvs_info[1:])
                # -1: also remove the tab char preceding the '#'
                vcf_line = vcf_line[:fromi-1]
                # assume the following item is 'ANN'
                ann_info = vcf_line.split(';')[0]
                ann = []
                # Multiple annotations per VCF line
                for item in ann_info.split(','):
                    if len(item.split('|')) > 1:
                        (effect, putative_impact, gene_name, gene_id, feature_type, feature_id) = item.split('|')[1:7]
                        (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna, cds, protein, distance_to_feature) = item.split('|')[7:15]
                        if cdna:
                            (cdna_position, cdna_len) = cdna.split('/')
                        else:
                            cdna_position = None
                            cdna_len = None
                        if cds:
                            (cds_position, cds_len) = cds.split('/')
                        else:
                            cds_position = None
                            cds_len = None
                        if protein:
                            (protein_position, protein_len) = protein.split('/')
                        else:
                            protein_position = None
                            protein_len = None
                        if exon:
                            (rank, total) = exon.split('/')
                        else:
                            rank = None
                            total = None
                        ann.append({
                            "effect": effect,
                            "putative_impact": putative_impact,
                            "genename": gene_name,
                            "gene_id": gene_id,
                            "feature_type": feature_type,
                            "feature_id": feature_id,
                            "transcript_biotype": transcript_biotype,
                            "rank": rank,
                            "total": total,
                            "hgvs_c": trim_delseq_from_hgvs(hgvs_coding), # trim long sequence
                            "hgvs_p": hgvs_protein,
                            "cdna": {
                                "position": cdna_position,
                                "length": cdna_len
                            },
                            "cds": {
                                "position": cds_position,
                                "length": cds_len
                            },
                            "protein": {
                                "position": protein_position,
                                "length": protein_len
                            },
                            "distance_to_feature": distance_to_feature
                        })
                # not all annotations include lof & nmd information. Set them to 'None' as default
                lof = None
                nmd = None
                # the case that annotation include 'ann' & 'lof' & 'nmd'
                if len(vcf_line.split(';')) == 3:
                    (lof_info, nmd_info) = vcf_line.split(';')[1:3]
                    # assume the second item is 'lof'
                    assert lof_info.startswith('LOF')
                    # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
                    lof_info = lof_info.split('(')[1].split(')')[0]
                    nmd_info = nmd_info.split('(')[1].split(')')[0]
                    (id_lof, name_lof, nt_lof, pt_lof) = lof_info.split('|')
                    (id_nmd, name_nmd, nt_nmd, pt_nmd) = nmd_info.split('|')
                    lof = {
                        "gene_id": id_lof,
                        "genename": name_lof,
                        "number_of_transcripts_in_gene": nt_lof,
                        "percent_of_transcripts_affected": pt_lof
                    }
                    nmd = {
                        "gene_id": id_nmd,
                        "genename": name_nmd,
                        "number_of_transcripts_in_gene": nt_nmd,
                        "percent_of_transcripts_affected": pt_nmd
                    }
                # the case that annotation include 'ann' & 'lof or nmd'
                elif len(vcf_line.split(';')) == 2:
                    (ann_info, idk_info) = vcf_line.split(';')
                    if idk_info.startswith('LOF'):
                        lof_info = idk_info.split('(')[1].split(')')[0]
                        (id_lof, name_lof, nt_lof, pt_lof) = lof_info.split('|')
                        lof = {
                            "gene_id": id_lof,
                            "genename": name_lof,
                            "number_of_transcripts_in_gene": nt_lof,
                            "percent_of_transcripts_affected": pt_lof
                        }
                    else:
                        nmd_info = idk_info.split('(')[1].split(')')[0]
                        (id_nmd, name_nmd, nt_nmd, pt_nmd) = nmd_info.split('|')
                        nmd = {
                            "gene_id": id_nmd,
                            "genename": name_nmd,
                            "number_of_transcripts_in_gene": nt_nmd,
                            "percent_of_transcripts_affected": pt_nmd
                        }
                (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
                one_snp_json = {
                    "_id": hgvs_id,
                    "snpeff": {
                        "ann": ann,
                        "lof": lof,
                        "nmd": nmd,
                    },
                }
                snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])

                yield snpeff_json
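The slices `item.split('|')[1:7]` and `[7:15]` follow the standard snpEff `ANN` subfield order (Allele|Annotation|Impact|Gene_Name|Gene_ID|Feature_Type|Feature_ID|Biotype|Rank|HGVS.c|HGVS.p|cDNA|CDS|AA|Distance|Errors). A minimal standalone parse of one hypothetical entry shows how the unpacking above lines up:

# One hypothetical ANN entry, subfields per the snpEff ANN specification.
item = ("A|missense_variant|MODERATE|BRCA1|ENSG00000012048|transcript|"
        "ENST00000357654|protein_coding|10/23|c.1067A>G|p.Gln356Arg|"
        "1186/7088|1067/5592|356/1863|0")

(effect, putative_impact, gene_name, gene_id,
 feature_type, feature_id) = item.split('|')[1:7]
(transcript_biotype, exon, hgvs_coding, hgvs_protein,
 cdna, cds, protein, distance_to_feature) = item.split('|')[7:15]

print(effect, gene_name, hgvs_coding)  # missense_variant BRCA1 c.1067A>G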
Example #43
def _map_line_to_json(doc_key, item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # PyVCF exposes INFO as a dict, so .get() handles the optional fields
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(info['AC']), "Expecting len(item.ALT) == len(info['AC']), but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['AF']), "Expecting len(item.ALT) == len(info['AF']), but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['Hom_AFR']), "Expecting len(item.ALT) == len(info['Hom_AFR']), but not for %s" % (HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key : {
                "chrom": chrom,
                "pos": chromStart,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'][i],
                    "ac_afr": info['AC_AFR'][i],
                    "ac_amr": info['AC_AMR'][i],
                    "ac_adj": info['AC_Adj'][i],
                    "ac_eas": info['AC_EAS'][i],
                    "ac_fin": info['AC_FIN'][i],
                    "ac_het": info['AC_Het'][i],
                    "ac_hom": info['AC_Hom'][i],
                    "ac_nfe": info['AC_NFE'][i],
                    "ac_oth": info['AC_OTH'][i],
                    "ac_sas": info['AC_SAS'][i],
                    "ac_male": info['AC_MALE'][i],
                    "ac_female": info['AC_FEMALE'][i]
                },
                "af": info['AF'][i],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']

                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
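`get_hgvs_from_vcf` comes from the MyVariant utility modules and is not shown in this listing. A simplified, SNV-only sketch of the behavior Example #43 relies on (the real helper also builds ins/del/delins HGVS names):

def get_hgvs_from_vcf(chrom, pos, ref, alt, mutant_type=False):
    """Simplified sketch: build an HGVS genomic ID from VCF fields (SNVs only)."""
    if len(ref) == 1 and len(alt) == 1:
        hgvs = "chr%s:g.%d%s>%s" % (chrom, int(pos), ref, alt)
        return (hgvs, 'snp') if mutant_type else hgvs
    # non-SNV cases omitted in this sketch
    return (None, None) if mutant_type else None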
Example #44
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    cds = fields[18].split(":")
    cds = cds[1]
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None
    # parse from the VCF file: given the chrom number and chromStart,
    # fetch the corresponding REF and ALT
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if record:
        REF = record.REF
        ALT = record.ALT
        ALT = ALT[0]
        if record.is_snp and len(ALT) < 2:
            mod = [REF, ALT]
        else:
            mod = ALT
    else:
        return

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print("ERROR:", fields[1], cds)
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print('ERROR:', fields[1], cds)

    # load as json data
    if HGVS is None:
        print('None:', fields[1], cds)
        return None

    one_snp_json = {
        "_id": HGVS,
        "clinvar":
            {
                "allele_id": fields[0],
                "hg19":
                    {
                        "chr": fields[13],
                        "start": fields[14],
                        "end": fields[15]
                    },
                "type": fields[1],
                "name": fields[2],
                "gene":
                    {
                        "id": fields[3],
                        "symbol": fields[4]
                    },
                "clinical_significance": fields[5].split(";"),
                "rsid": 'rs' + str(fields[6]),
                "nsv_dbvar": fields[7],
                "rcv_accession": fields[8].split(";"),
                "tested_in_gtr": fields[9],
                "phenotype_id": other_id(fields[10]),
                "origin": fields[11],
                "cytogenic": fields[16],
                "review_status": fields[17],
                "hgvs":
                    {
                        "coding": fields[18],
                        "protein": fields[19]
                    },
                "number_submitters": fields[20],
                "last_evaluated": fields[21],
                "guidelines": fields[22],
                "other_ids": other_id(fields[23]),
                "clinvar_id": fields[24]
            }
        }
    return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
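The `other_id` helper used for `phenotype_id` and `other_ids` above is not part of this listing. Based on its use on ClinVar's delimiter-joined cross-reference columns (values such as `MedGen:C0023264,OMIM:203750`), a plausible but hypothetical sketch:

def other_id(field):
    """Hypothetical sketch: split 'DB:ID' pairs into a dict keyed by lowercased DB name.
    The real helper's exact behavior is not shown in this listing."""
    ids = {}
    for pair in field.split(','):
        if ':' in pair:
            db, _id = pair.split(':', 1)
            ids[db.lower()] = _id
    return ids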
Example #45
def _map_line_to_json(fields, dbsnp_col):
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    # load as json data
    if rsid is None:
        return
    docs = [d for d in dbsnp_col.find({"dbsnp.rsid":rsid})]
    for doc in docs:
        HGVS = doc['_id']
        one_snp_json = {
            "_id": HGVS,
            "grasp":
                {
                    'hg19':
                        {
                            'chr': fields[5],
                            'pos': fields[6]
                        },
                    'hupfield': fields[1],
                    'last_curation_date': fields[2],
                    'creation_date': fields[3],
                    'srsid': fields[4],
                    'publication':
                        {
                            'journal': fields[16],
                            'title': fields[17],
                            'pmid': fields[7],
                            'snpid': fields[8],
                            'location_within_paper': fields[9],
                            'p_value': fields[10],
                            'phenotype': fields[11],
                            'paper_phenotype_description': fields[12],
                            'paper_phenotype_categories': fields[13],
                            'date_pub': fields[14]
                        },
                    'includes_male_female_only_analyses': fields[18],
                    'exclusively_male_female': fields[19],
                    'initial_sample_description': fields[20],
                    'replication_sample_description': fields[21],
                    'platform_snps_passing_qc': fields[22],
                    'gwas_ancestry_description': fields[23],
                    'discovery':
                        {
                            'total_samples': fields[25],
                            'european': fields[26],
                            'african': fields[27],
                            'east_asian': fields[28],
                            'indian_south_asian': fields[29],
                            'hispanic': fields[30],
                            'native': fields[31],
                            'micronesian': fields[32],
                            'arab_me': fields[33],
                            'mixed': fields[34],
                            'unspecified': fields[35],
                            'filipino': fields[36],
                            'indonesian': fields[37]
                        },
                    'replication':
                        {
                            'total_samples': fields[38],
                            'european': fields[39],
                            'african': fields[40],
                            'east_asian': fields[41],
                            'indian_south_asian': fields[42],
                            'hispanic': fields[43],
                            'native': fields[44],
                            'micronesian': fields[45],
                            'arab_me': fields[46],
                            'mixed': fields[47],
                            'unspecified': fields[48],
                            'filipino': fields[49],
                            'indonesian': fields[50]
                        },
                    'in_gene': fields[51],
                    'nearest_gene': fields[52],
                    'in_lincrna': fields[53],
                    'in_mirna': fields[54],
                    'in_mirna_bs': fields[55],
                    'oreg_anno': fields[61],
                    'conserv_pred_tfbs': fields[62],
                    'human_enhancer': fields[63],
                    'rna_edit': fields[64],
                    'polyphen2': fields[65],
                    'sift': fields[66],
                    'ls_snp': fields[67],
                    'uniprot': fields[68],
                    'eqtl_meth_metab_study': fields[69]
                }
            }
        return list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), [""]), ",")
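`list_split` appears here and in Example #39 with a separator argument; a sketch consistent with that usage, recursively splitting delimiter-joined string values into lists (inferred from the calls, not the real implementation):

def list_split(d, sep):
    """Sketch: recursively split string values containing `sep` into lists."""
    if isinstance(d, dict):
        return {k: list_split(v, sep) for k, v in d.items()}
    if isinstance(d, list):
        return [list_split(v, sep) for v in d]
    if isinstance(d, str) and sep in d:
        return d.split(sep)
    return d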