def rsidsToHG37Positions(rsidList):
    """Look up start positions for a list of rsids via MyVariant.info.

        args:
            rsidList (list of str): the rsids to look up

        returns:
            df (DataFrame): one row per rsid for which the service returned
                both ``dbsnp.rsid`` and ``dbsnp.hg19.start``, indexed by
                rsid with the start position as its value
    """
    client = myvariant.MyVariantInfo()
    hits = client.querymany(rsidList,
                            scopes='dbsnp.rsid',
                            fields='dbsnp.rsid, dbsnp.hg19.start',
                            fetch_all=True,
                            assembly='hg37')

    positions = {}
    for hit in hits:
        try:
            dbsnp = hit['dbsnp']
            key = dbsnp['rsid']
            positions[key] = dbsnp['hg19']['start']
        except KeyError:
            # Hits lacking any of the expected keys are left out of the result.
            pass
    return pd.DataFrame.from_dict(positions, orient='index')
示例#2
0
 def refresh_myvariant_data(self):
     """Refresh the stored MyVariant.info annotations of all variants.

     Fetches the ``clinvar``, ``dbsnp`` and ``exac`` fields for every
     variant in ``self.variants`` (keyed by ``b37_hgvs_id``) and saves the
     result on each variant. A section missing from the response is stored
     as an empty dict. Variants MyVariant.info does not know (no ``_id`` in
     the response entry) get all three sections cleared; their
     ``myvariant_last_update`` is left untouched.
     """
     vars_by_hgvs = {v.b37_hgvs_id: v for v in self.variants.all()}
     mv = myvariant.MyVariantInfo()
     mv_data = mv.getvariants(list(vars_by_hgvs),
                              fields=['clinvar', 'dbsnp', 'exac'])
     for var_data in mv_data:
         if '_id' not in var_data:
             # Not found: the entry only echoes the original query string.
             variant = vars_by_hgvs[var_data['query']]
             variant.myvariant_clinvar = {}
             variant.myvariant_exac = {}
             variant.myvariant_dbsnp = {}
             variant.save()
             continue
         variant = vars_by_hgvs[var_data['_id']]
         try:
             clinvar_data = var_data['clinvar']
             # Always as list - makes downstream code much easier.
             if not isinstance(clinvar_data['rcv'], list):
                 clinvar_data['rcv'] = [clinvar_data['rcv']]
             variant.myvariant_clinvar = clinvar_data
         except KeyError:
             # No ClinVar section (or one without 'rcv'): clear the stored
             # data so stale annotations do not survive the refresh.
             variant.myvariant_clinvar = {}
         variant.myvariant_exac = var_data.get('exac', {})
         variant.myvariant_dbsnp = var_data.get('dbsnp', {})
         variant.myvariant_last_update = django_timezone.now()
         variant.save()
示例#3
0
def annotate(listHGVS):
    """
    annotmvi - accepts HGVS, returns data on mutation
    Parameters: listHGVS: list of HGVS IDs to retrieve annotations for
    Return: list with one myvariant.info result per input ID, in input
            order (whatever ``getvariant`` returns for that ID)
    """
    fields = [
        'dbsnp.rsid',
        'dbsnp.alleles',
        'dbsnp.vartype',
        'dbsnp.gene',
        'clinvar',
        'gnomad_genome.af',
        'gnomad_exome.af',
        'dbnsfp.ensembl',
        'dbnsfp.uniprot',
        'dbnsfp.polyphen2',
        'dbnsfp.sift',
        'dbnsfp.provean',
    ]
    mv = myvariant.MyVariantInfo()
    total = len(listHGVS)
    listmvi = []

    # enumerate() supplies the running position directly; looking it up with
    # list.index() was quadratic and reported the first occurrence for
    # duplicated IDs.
    for position, idHGVS in enumerate(listHGVS):
        listmvi.append(mv.getvariant(idHGVS, fields=fields))

        # Progress tracker
        if position % 100 == 0:
            print(str(position) + ' out of ' + str(total) + ' written...')

    return listmvi
def getHG37PositionsInRange(chromosome, startPos, endPos):
    """Query MyVariant.info for the rsids located in a chromosome range.

        args:
            chromosome (int or str): the chromosome number
            startPos (int or str): the start position on the chromosome
            endPos (int or str): the end position on the chromosome

        returns:
            df (DataFrame): one row per hit that carries both ``dbsnp.rsid``
                and ``dbsnp.hg19.start``, indexed by rsid with the start
                position as its value
            chromosome (int or str): the chromosome argument, passed through
    """
    client = myvariant.MyVariantInfo()
    hits = client.query(f'chr{chromosome}:{startPos}-{endPos}',
                        scopes='dbsnp.rsid',
                        fields='dbsnp.rsid, dbsnp.hg19.start',
                        fetch_all=True,
                        assembly='hg37')

    positions = {}
    for hit in hits:
        try:
            dbsnp = hit['dbsnp']
            key = dbsnp['rsid']
            positions[key] = dbsnp['hg19']['start']
        except KeyError:
            # Hits lacking any of the expected keys are left out of the result.
            pass
    return pd.DataFrame.from_dict(positions, orient='index'), chromosome
示例#5
0
def test_func(num1, num2):
    """Run ``query.genequery`` on a fixed 23andMe sample file.

    Args:
        num1: passed to ``query`` as ``lineBegin``.
        num2: passed to ``query`` as ``lineEnd``.

    Returns:
        Whatever ``genequery()`` returns for that line window.
    """
    # The MyVariantInfo client that used to be built here was never used;
    # ``query`` carries its own client.
    demo = query('23andme_large.txt', 'time_record.txt', '23andme', 'csv', 19,
                 'None', num1, num2)
    return demo.genequery()
示例#6
0
def rsid2bed(rsid_file, OutDir):
    """Write a BED window around each rsid listed in a file.

    Each line of ``rsid_file`` is looked up on MyVariant.info (hg38). For
    the first hit, a 1000 bp window centred on ``vcf.position`` is written
    to ``<OutDir>/tmp.bed`` as
    ``chr<chrom>\\t<begin>\\t<end>\\t<rsid>;<ref>;<alt>;<chrom>;<pos>``.

    Args:
        rsid_file: path of a text file with one rsid per line.
        OutDir: directory in which ``tmp.bed`` is created.

    Returns:
        (num_input, num_out): number of lines read and of BED rows written.
    """
    window_size = 1000
    half_window = window_size // 2
    mv = myvariant.MyVariantInfo()
    num_input = 0
    num_out = 0
    # Context managers guarantee the BED file is flushed and both handles
    # are closed even if a lookup raises.
    with open(rsid_file, 'r') as rsid_handle, \
            open(os.path.join(OutDir, 'tmp.bed'), 'w') as bed_file:
        for line in rsid_handle:
            num_input += 1
            rsid = line.strip()
            if rsid[:2].lower() != 'rs':
                print("Error: Please input valid rsid")
                # Do not send blank/invalid lines to the service.
                continue
            info = mv.query(rsid, assembly='hg38')
            if not info['hits']:
                continue
            hit = info['hits'][0]
            chrom = str(hit['chrom'])
            vcf_fields = hit['vcf']
            pos = str(vcf_fields['position'])
            ref = vcf_fields['ref']
            alt = vcf_fields['alt']

            begin = int(pos) - half_window
            end = int(pos) + half_window
            name = ';'.join([rsid.lower(), ref, alt, chrom, pos])
            bed_file.write(
                '\t'.join(['chr' + chrom, str(begin), str(end), name]) + '\n')
            num_out += 1
    return num_input, num_out
示例#7
0
 def hvgs_ids(self):
     """Return the MyVariant ``_id`` of every hit for ``self.snp_loc``.

     The lookup is done once; the list is then kept on ``self._hvgs_ids``
     and reused on later calls.
     """
     if hasattr(self, '_hvgs_ids'):
         return self._hvgs_ids
     client = myvariant.MyVariantInfo()
     hits = client.query(self.snp_loc, fields='id')['hits']
     self._hvgs_ids = [hit['_id'] for hit in hits]
     return self._hvgs_ids
    def annotate_vcf_file(self):
        '''
        Annotate the first 900 variants of the VCF file with MyVariant.info.

        An ID of the form ``<CHROM>:g.<POS><REF>><ALT>`` is built from each
        record (first ALT allele only) and the IDs are sent in one batch
        through the myvariant client, using the hg38 assembly.

        The ID list itself is now passed to ``getvariants``. Previously the
        url-encoded form body (``ids=...&hg38=true``) was passed, which
        corrupted the first and last ID and dropped the assembly. The raw
        HTTP POST whose response was immediately overwritten is gone.

        NOTE(review): IDs use ``record.CHROM`` verbatim; confirm the VCF
        uses the ``chr``-prefixed names MyVariant.info expects.

        :return: list of annotation dicts, excluding entries the service
                 flagged as ``notfound``
        '''
        max_variants = 900
        hgvs_ids = []  # one HGVS-style ID per VCF record
        with open(self.vcf_path) as my_vcf_fh:
            vcf_reader = vcf.Reader(my_vcf_fh)
            for counter, record in enumerate(vcf_reader):
                hgvs_ids.append(record.CHROM + ":g." + str(record.POS) +
                                record.REF + ">" + str(record.ALT[0]))
                if counter >= max_variants - 1:
                    break

        if not hgvs_ids:
            # Nothing to annotate; skip the network round trip.
            return []

        mv = myvariant.MyVariantInfo()
        annotation_result = mv.getvariants(hgvs_ids, assembly='hg38')

        # Keep every entry that is not explicitly marked as not found.
        return [
            result for result in annotation_result
            if not result.get('notfound')
        ]
示例#9
0
def getSNPannot(ids):
    """Return a DataFrame of MyVariant.info annotations for ``ids``.

    ``ids`` are matched against COSMIC IDs and dbSNP rsids; the requested
    columns are the variant ``_id``, the ClinVar gene symbol, the dbNSFP
    amino-acid ref/pos/alt and the dbNSFP ClinVar trait.
    """
    wanted_fields = '_id, clinvar.gene.symbol, dbnsfp.aa.ref, dbnsfp.aa.pos, \
                dbnsfp.aa.alt, dbnsfp.clinvar.trait'
    client = myvariant.MyVariantInfo()
    return client.querymany(ids,
                            scopes='cosmic.cosmic_id, dbsnp.rsid',
                            fields=wanted_fields,
                            as_dataframe=True)
示例#10
0
文件: __init__.py 项目: nesegunes/aiv
def getvariant(chromosome, start, ref, var):
    """Fetch the MyVariant.info record of a single variant.

    The HGVS ID is built from the chromosome, the start position (converted
    to ``int``), the reference allele and the variant allele.

    Returns whatever ``MyVariantInfo.getvariant`` yields for that ID.
    """
    hgvs_id = myvariant.format_hgvs(chromosome, int(start), ref, var)
    return myvariant.MyVariantInfo().getvariant(hgvs_id)
示例#11
0
 def setUp(self):
     """Create the MyVariant client and the two sample query lists."""
     hgvs_queries = [
         'chr1:g.866422C>T',
         'chr1:g.876664G>A',
         'chr1:g.69635G>C',
         'chr1:g.69869T>A',
         'chr1:g.881918G>A',
         'chr1:g.865625G>A',
         'chr1:g.69892T>C',
         'chr1:g.879381C>T',
         'chr1:g.878330C>G',
     ]
     rsid_queries = [
         'rs374802787',
         'rs1433078',
         'rs1433115',
         'rs377266517',
         'rs587640013',
         'rs137857980',
         'rs199710579',
         'rs186823979',
         'rs2276240',
         'rs372452565',
     ]
     self.mv = myvariant.MyVariantInfo()
     self.query_list1 = hgvs_queries
     self.query_list2 = rsid_queries
示例#12
0
def get_mv_data(chrom, pos, ref_allele, var_allele):
    """Fetch ClinVar/dbSNP/ExAC data for one variant from MyVariant.info.

    Args:
        chrom: chromosome, converted for display by ``get_chrom_display``.
        pos: position of the variant.
        ref_allele: reference allele.
        var_allele: variant allele.

    Returns:
        (hgvs_format, mv_data, allele_freq, freq_url). ``mv_data`` is the
        service response with ``clinvar.rcv`` normalised to a list when
        present. The two frequency values come from ``get_allele_freq``,
        or are both ``None`` when the service returned nothing.
    """
    hgvs_format = myvariant.format_hgvs(get_chrom_display(chrom), pos,
                                        ref_allele, var_allele)
    mv = myvariant.MyVariantInfo()
    mv_data = mv.getvariant(hgvs_format, fields=['clinvar', 'dbsnp', 'exac'])
    if not mv_data:
        return hgvs_format, mv_data, None, None

    if 'clinvar' in mv_data and 'rcv' in mv_data['clinvar']:
        rcv = mv_data['clinvar']['rcv']
        # A single record arrives as a bare dict; wrap it so callers can
        # always iterate.
        if not isinstance(rcv, list):
            mv_data['clinvar']['rcv'] = [rcv]
    allele_freq, freq_url = get_allele_freq(mv_data, var_allele)
    return hgvs_format, mv_data, allele_freq, freq_url
示例#13
0
    def get_dict_myvariant(self, variant_list):
        """
        Query the myvariant.info servers for a batch of variants.

        :param variant_list: list of HGVS variant IDs.
        :return: the list of per-variant dictionaries returned by the service,
        after being passed through ``self.remove_id_key``.
        """
        client = myvariant.MyVariantInfo()
        # as_dataframe=False yields a list of dictionaries.
        raw_records = client.getvariants(variant_list, as_dataframe=False)
        return self.remove_id_key(raw_records)
示例#14
0
def _get_myvariantinfo_annotations_dict(hgvs_ids_list, genome_build_version, verbose_level, num_failed_attempts=0):
    """Retrieve variants from MyVariant.info, retrying on transient errors.

    Args:
        hgvs_ids_list: HGVS IDs to annotate.
        genome_build_version: value passed as ``assembly`` to the client.
        verbose_level: the client is asked to be verbose when this is >= 2.
        num_failed_attempts: number of failures already counted against the
            retry budget of 5 (kept for backward compatibility).

    Returns:
        The list of annotation dicts after ``_remove_unwanted_keys``.

    Raises:
        ValueError: immediately, since repeating the same request cannot help.
        Exception: the last error once 5 attempts have failed.
    """
    max_failed_attempts = 5
    myvariant_fields = [
        'cadd.1000g',
        'cadd.esp',
        'cadd.phred',
        'cadd.gerp',
        'cadd.polyphen',
        'cadd.sift',
        'dbsnp.rsid',
        'cosmic.cosmic_id',
        'cosmic.tumor_site',
        'clinvar.rcv.accession',
        'clinvar.rcv.clinical_significance',
        'clinvar.rcv.conditions',
        'civic.description',
        'civic.evidence_items',
        'cgi',
        'gwassnps',
        'wellderly.alleles'
    ]

    be_verbose = verbose_level >= 2
    mv = myvariant.MyVariantInfo()
    # Retrying in a loop (instead of recursing) keeps the post-processing
    # below from running once per failed attempt.
    while True:
        try:
            myvariantinfo_dicts_list = mv.getvariants(hgvs_ids_list, verbose=int(be_verbose), as_dataframe=False,
                                                      fields=myvariant_fields, assembly=genome_build_version)
            break
        except ValueError:
            # If myvariant.info returned a value error, recalling with the same values won't help so error out now
            raise
        except Exception as error:
            # Anything else may be a connectivity or availability problem,
            # so try again a few times before giving up.
            logging.info('Error: %s while fetching from MyVariant', error)
            num_failed_attempts += 1
            if num_failed_attempts >= max_failed_attempts:
                # give up and re-raise with the original traceback
                raise
            time.sleep(5)
            logging.info("Retrying MyVariant.info fetch")

    return _remove_unwanted_keys(myvariantinfo_dicts_list)
示例#15
0
文件: query.py 项目: MikeDacre/grasp
def get_variant_info(snp_list, fields='dbsnp', pandas=True):
    """Get variant info for a list of SNPs.

    Args:
        snp_list: A list of SNP objects or SNP rsIDs, or a DataFrame whose
                  ``study_snpid`` column (else its index) holds the rsIDs.
        fields:   Choose fields to display from:
                  `<docs.myvariant.info/en/latest/doc/data.html#available-fields>`_
                  Good choices are 'dbsnp', 'clinvar', or 'gwassnps'
                  Can also use 'grasp' to get a different version of this
                  info.
        pandas:   Return a dataframe instead of dictionary.

    Returns:
        With ``pandas``: one DataFrame (empty for empty input).
        Otherwise: the ``querymany`` result when a single request sufficed,
        a list of such results when the input was split into several
        requests, or ``[]`` for empty input.
    """
    mv = _mv.MyVariantInfo()
    if isinstance(snp_list, _pd.DataFrame):
        try:
            snps = list(snp_list.study_snpid)
        except AttributeError:
            snps = list(snp_list.index)
    elif len(snp_list) > 0 and isinstance(snp_list[0], t.SNP):
        snps = [i.study_snpid for i in snp_list]
    else:
        snps = snp_list
    assert isinstance(snps, (list, tuple))

    if not snps:
        # Nothing to look up; previously this crashed on snp_list[0] or in
        # concat().
        return _pd.DataFrame() if pandas else []

    chunks = list(_chunks(snps, 999))
    dfs = []
    for number, q in enumerate(chunks, start=1):
        dfs.append(
            mv.querymany(q,
                         scopes='dbsnp.rsid',
                         fields=fields,
                         as_dataframe=pandas,
                         df_index=True))
        # Pause between consecutive requests only, not after the last one.
        if number < len(chunks):
            _sleep(2)
    if not dfs:
        return _pd.DataFrame() if pandas else []
    if pandas:
        return _pd.concat(dfs)
    return dfs if len(dfs) > 1 else dfs[0]
示例#16
0
def grabVariant(chrNumAndPosition):
    """Fetch one variant (hg38) and pull out its dbSNP gene information.

    Args:
        chrNumAndPosition: HGVS ID, e.g. 'chr7:g.117589482A>G'.

    Returns:
        (geneDict, overallDict, protein_result):
        ``overallDict`` is the full MyVariant.info record, or ``None`` when
        the variant was not found. ``geneDict`` is its ``dbsnp.gene`` entry
        and ``protein_result`` the value of ``findCommonProteinName``; both
        stay ``None`` when the record has no dbSNP section.
    """
    geneDict, overallDict, protein_result = None, None, None
    mv = myvariant.MyVariantInfo()

    # One request is enough: the dbSNP section is part of the full record.
    record = mv.getvariant(chrNumAndPosition, assembly='hg38')
    if not isinstance(record, dict):
        print('Search Query Not Found')
        return geneDict, overallDict, protein_result

    overallDict = record
    dbsnpDict = record.get('dbsnp')
    if isinstance(dbsnpDict, dict) and dbsnpDict:
        try:
            # Gather gene information from dbSNP dictionary
            geneDict = dbsnpDict.get('gene')
            protein_result = findCommonProteinName(overallDict)
            print('Search Query Successful')
        except TypeError:
            print('Search Query Not Found')
    return geneDict, overallDict, protein_result
示例#17
0
    def get_variant_info(self, fields="dbsnp", pandas=True):
        """Look this SNP up through the myvariant API.

        The IDs in ``self.hvgs_ids`` are sent in a single ``getvariants``
        request.

        Args:
            fields: Fields to request, see
                    `docs.myvariant.info/en/latest/doc/data.html#available-fields`_
                    Good choices are 'dbsnp', 'clinvar', or 'gwassnps'
                    Can also use 'grasp' to get a different version of this
                    info.
            pandas: Return a dataframe instead of dictionary.

        Returns:
            A dictionary or a dataframe.
        """
        client = myvariant.MyVariantInfo()
        ids = self.hvgs_ids
        request_options = {
            'fields': fields,
            'as_dataframe': pandas,
            'df_index': True,
        }
        return client.getvariants(ids, **request_options)
示例#18
0
def additional_annotation(request, variant_sample_pk):
    """Placeholder view for extra annotation of a variant sample.

    Loads the ``VariantSample`` (404 if missing) and assembles a MyVariant
    query string for its variant, but does not run that query. The response
    is currently the body fetched from ``http://python.org/`` wrapped in a
    ``JsonResponse``.
    """
    sample = get_object_or_404(VariantSample, pk=variant_sample_pk)
    variant = sample.variant

    # The first three characters of the stored chromosome are dropped.
    clauses = [
        'chrom:' + variant.chromosome[3:],
        'vcf.position:' + str(variant.position),
        'vcf.ref:' + variant.ref,
        'vcf.alt:' + variant.alt,
    ]

    mv = myvariant.MyVariantInfo()
    q = ' AND '.join(clauses)

    # The MyVariant lookup (mv.query(q)) is not wired in yet.

    html = urllib2.urlopen('http://python.org/').read()
    return JsonResponse(html, safe=False)
示例#19
0
def map_23andme_clinvar(user_data, conn):
    """Match each user's parsed variants against the local ClinVar table.

    For every user, the variants from ``parse_user_vcf_data`` are looked up
    in the ``clinvar`` table. Each matching row enriches the variant with
    the row's JSON payload and, unless the alt allele is '.', with GenNotes
    and MyVariant.info data. The result is written to
    ``mapped_user_vcf/<local_filename>.json``.

    Args:
        user_data: iterable of dicts providing ``['user']['username']`` and
            ``['local_filename']``.
        conn: DB connection offering ``cursor()``; the ``?`` placeholders
            suggest sqlite3 -- TODO confirm.

    Returns:
        None. Output goes to the JSON files and to stdout.
    """
    print('Mapping user 23andMe data with ClinVar...')
    c = conn.cursor()
    mv = myvariant.MyVariantInfo()
    # Cache MyVariantInfo requests
    mv.set_caching('./myvariant_cache', verbose=False)
    # Can definitely be optimized to reduce database or HTTP requests.
    for user in user_data:
        parsed = parse_user_vcf_data(user)
        mapped = []
        print('Mapping ' + user['user']['username'] + ' data to ClinVar...')
        print(len(parsed))
        for var in parsed:
            # Match on chromosome + position + alt allele, or on the variant id.
            c.execute('''
                SELECT * FROM clinvar WHERE (chrom=? AND pos=? AND alt LIKE ?) OR (id=?)
            ''', (CHROM_INDEX[str(var['chrom'])], int(var['pos']), var['alt_allele'], var['id']))
            # NOTE(review): `var` is mutated and appended once per matching
            # row, so several rows yield the same dict repeated in `mapped`,
            # holding the last row's data -- confirm this is intended.
            for result in c.fetchall():
                # The last column of the row is parsed as JSON.
                var['clinvar_data'] = json.loads(result[-1])
                var['gennotes_id'] = None
                var['gennotes_data'] = None
                var['hgvs_id'] = None
                var['mv_data'] = None
                # '.' as alt allele: skip the remote lookups.
                if (not var['alt_allele'] == '.'):
                    var['gennotes_id'] = _gennotes_id(var['chrom'], var['pos'], var['ref_allele'], var['alt_allele'])
                    results = requests.get('https://gennotes.herokuapp.com/api/variant/', params={'variant_list': json.dumps([var['gennotes_id']])})
                    var['gennotes_data'] = results.json()
                    var['hgvs_id'] = _hgvs_id(var['chrom'], var['pos'], var['ref_allele'], var['alt_allele'])
                    try:
                        mv_data = mv.getvariant(var['hgvs_id'], fields=['clinvar', 'dbsnp', 'exac'], verbose=False)
                        var['mv_data'] = mv_data
                    except Exception as e:
                        # Best effort: a failed MyVariant lookup is reported
                        # and the variant keeps mv_data = None.
                        print(var['alt_allele'])
                        print(e)
                mapped.append(var)
        with open('mapped_user_vcf/' + user['local_filename'] + '.json', 'w') as f:
            json.dump(mapped, f, indent=4)
    # NOTE(review): not reached if anything above raises; the cursor then
    # stays open.
    c.close()
示例#20
0
def get_dbsnp_obj(variant):
    """Return the dbSNP objects known for ``variant``.

    Objects already attached to the variant under one of the rs-identifier
    type names are returned as-is. Otherwise every rs number from
    ``get_rs_number`` is looked up on MyVariant.info; each result is
    attached to the variant (object type "dbSNP") and returned.

    NOTE(review): results are stored under the type "dbSNP", which is not
    among the type names checked first -- confirm whether a later call is
    meant to reuse them.
    """
    rs_object_types = (
        "reference snp id", "reference snp identifier",
        "reference snp object", "rs", "rs id", "rs identifier",
        "rs object", "rs number", "rsid"
    )
    obj_array = [
        obj["object"] for obj in variant.variation_objects
        if obj["object_type"] in rs_object_types
    ]
    if obj_array:
        return obj_array

    # One client serves all lookups; it used to be rebuilt per rs number.
    mv = myvariant.MyVariantInfo()
    for rs_id in get_rs_number(variant):
        result = mv.query("dbsnp.rsid:" + rs_id, fields='dbsnp')
        gnomics.objects.variation.Variation.add_object(variant,
                                                       obj=result,
                                                       object_type="dbSNP")
        obj_array.append(result)

    return obj_array
示例#21
0
    def get_snp(self):
        """
        Collect the SNPs MyVariant.info reports around the mutation.

        The queried window is ``mutation_pos - range`` to
        ``mutation_pos + range`` on ``no_chromosome`` (at most 1000 hits).
        Only hits whose dbSNP section carries a ``gmaf`` value are kept.
        Internet connection required.

        :return: a list of snp such as [[id, position, gmaf],[...],[...],...]
        """
        centre = int(self.mutation_pos)
        window = (self.no_chromosome + ":" + str(centre - self.range) + "-" +
                  str(centre + self.range))
        client = myvariant.MyVariantInfo()
        response = client.query(window, fields='dbsnp', size=1000)
        return [[
            hit["dbsnp"]["rsid"],
            hit["dbsnp"]["hg19"]["start"],
            hit["dbsnp"]["gmaf"],
        ] for hit in response["hits"]
                if "dbsnp" in hit and "gmaf" in hit["dbsnp"]]
示例#22
0
def get_info(gene_symbol, hg, database='hg19'):
    """Find MyVariant.info hits for a variant of a gene (interactive).

    Written for Python 2 (print statements, ``raw_input``). Looks the gene
    up in the UCSC ``refGene`` table, prompting on stdin when the symbol is
    not found or when several transcripts exist. The chosen transcript's
    exon layout is passed to ``exon_mapping`` / ``get_gen`` to obtain a
    genomic ID for ``hg``, and MyVariant.info is then queried.

    Args:
        gene_symbol: gene symbol; upper-cased before use.
        hg: the variant description, forwarded to ``get_gen`` and embedded
            in the MyVariant query.
        database: name of the UCSC database to connect to.

    Returns:
        (hits, genomic): the MyVariant hits whose ``_id`` equals ``genomic``,
        and ``genomic`` itself as returned by ``get_gen``.

    NOTE(review): all SQL below is built with ``%`` string interpolation of
    ``gene_symbol``; consider parameterised queries. The connection and
    cursor are never closed in this function.
    """
    ##db_index = input("""
    ##Select database:
    ##0: hg38
    ##1: hg19
    ##""")
    ##
    ##database = ['hg38','hg19'][db_index]
    conn = pymysql.connect(host='genome-mysql.cse.ucsc.edu',
                           user='******',
                           password='******',
                           db=database)

    cur = conn.cursor()

    #gene_symbol = raw_input("Please enter the gene symbol: ")
    gene_symbol = gene_symbol.upper()
    # Exact match on the gene symbol first.
    statement1 = "select name from refGene where name2 = '%s'" % gene_symbol
    cur.execute(statement1)
    temp = cur.fetchall()
    fuzzy = False
    # NOTE(review): this inspects the first row; if fetchall() returned no
    # rows, temp[0] presumably raises IndexError -- confirm.
    if len(temp[0]) == 0:
        fuzzy = True
        temp = []
        i = 1
        # Fuzzy pass 1: replace one character at a time by the SQL
        # single-character wildcard '_'.
        while i < len(gene_symbol):
            temp1 = gene_symbol[0:len(gene_symbol) -
                                i] + "_" + gene_symbol[len(gene_symbol) - i +
                                                       1:]
            test = "select name2 from refGene where name2 like '%s'" % temp1
            cur.execute(test)
            holder = cur.fetchall()
            for item in holder:
                if len(item) > 1:
                    for subitem in item:
                        print subitem
                        temp += [subitem[0]]
                else:
                    temp += [item[0]]
            i += 1
    if len(temp[0]) == 0:
        # Fuzzy pass 2: prefix search with growing prefixes (3 characters up).
        i = 3
        while i < len(gene_symbol):
            temp1 = gene_symbol[0:i] + "%"
            test = "select name2 from refGene where name2 like '%s'" % temp1
            cur.execute(test)
            holder = cur.fetchall()
            temp += [holder]
            i += 1

    # De-duplicate the candidates while keeping their order.
    genes = []
    for item in temp:
        if item not in genes:
            genes += [item]

    if fuzzy:
        # Let the user pick one of the candidate symbols.
        print "Did you mean?:"
        for index, item in enumerate(genes):
            print index, ":", item
        print "none : none"
        gene_index = raw_input("Enter your choice: ")
        try:
            gene_index = int(gene_index)
            statement1 = "select name2 from refGene where name2 = '%s'" % genes[
                gene_index]
            cur.execute(statement1)
            temp = cur.fetchall()
        except:
            # Any failure (e.g. a non-numeric answer) triggers a second,
            # broader prefix search and a second prompt.
            temp = []
            i = 0
            while i < len(gene_symbol) + 1:
                temp1 = gene_symbol[0:i + 2] + "%"
                test = "select name2 from refGene where name2 like '%s'" % temp1
                cur.execute(test)
                holder = cur.fetchall()
                temp += [holder]
                i += 1
            genes = []
            for item in temp[0]:
                if item[0] not in genes:
                    genes += [item[0]]
            print "Did you mean?:"
            for index, item in enumerate(genes):
                print index, ":", item
            print "none : none"
            gene_index = raw_input("Enter your choice: ")
            try:
                gene_index = int(gene_index)
                statement1 = "select name from refGene where name2 = '%s'" % genes[
                    gene_index]
                cur.execute(statement1)
                temp = cur.fetchall()
            except:
                print "Damn"

    transcripts = []
    for index, item in enumerate(temp):
        transcripts += [item[0]]
    if len(transcripts) > 1:
        # NOTE(review): only the last (index, item) pair of the loop above is
        # printed here, not the whole list the prompt refers to.
        print index, ": ", item[0]
        trans_choice = input(
            "Please select a transcript from the list above: ")
    else:
        trans_choice = 0
    transcript = transcripts[trans_choice]
    statement2 = "select * from refGene where name = '%s'" % transcript
    cur.execute(statement2)
    info = cur.fetchall()
    info = info[0]

    # Columns of the refGene row are read by position.
    txstart = info[4]
    txend = info[5]
    cdstart = info[6]
    cdend = info[7]
    chrom = info[2]
    exonstarts = info[9].split(",")
    exonends = info[10].split(",")
    # Drop the empty strings produced by split(",").
    while '' in exonstarts:
        exonstarts.remove('')
    while '' in exonends:
        exonends.remove('')
    strand = info[3]
    if strand == "-":
        # Minus strand: swap and reverse starts/ends so that coordinates run
        # in transcript direction.
        temp = exonstarts
        temp2 = exonends
        exonstarts = temp2[::-1]
        exonends = temp[::-1]

        temp = txstart
        temp2 = txend
        txstart = temp2
        txend = temp

        temp = cdstart
        temp2 = cdend
        cdstart = temp2
        cdend = temp

    mv = myvariant.MyVariantInfo()
    #hg=raw_input("Enter variant: ")
    #recreate coding mapping
    es = exon_mapping(cdstart, txstart, cdend, exonstarts, exonends, strand)
    genomic = get_gen(hg, es, exonends, chrom, strand)

    query = "'%s' AND %s" % (hg, gene_symbol)
    cv = mv.query(query)

    # Keep only the hits whose ID equals the computed genomic ID.
    hits = []
    for item in cv['hits']:
        if item['_id'] == genomic:
            hits += [item]
    return hits, genomic
class query:
    # start
    mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1')
    '''
    parameters = input('Enter parameters :')
    filetype = input('file type :')
    hgversion = #input('NCBI snp version (eg. :38): ')
    lo = hgVersionJudge(hgversion)
    '''
    def __init__(self,
                 fileName,
                 outputfile_name,
                 type,
                 outputtype,
                 version=19,
                 fieldKeyWords='None',
                 lineBegin=-1,
                 lineEnd=math.inf):
        """Store the run configuration.

        fileName is kept as ``parameters`` (the input path), ``type`` as
        ``filetype`` and ``version`` as ``hgversion``. ``lo`` holds the
        result of ``hgVersionJudge(version)``. ``lineBegin``/``lineEnd``
        bound the input lines that ``genequery`` processes.
        """
        # Input and output locations / formats.
        self.parameters, self.outputfile_name = fileName, outputfile_name
        self.hgversion = version
        self.filetype, self.outputtype = type, outputtype
        # Lift-over helper for the given genome version.
        self.lo = self.hgVersionJudge(version)
        self.fieldKeyWords = fieldKeyWords
        # Window of input lines to process.
        self.lineBegin, self.lineEnd = lineBegin, lineEnd

    def infosearch(self, line):
        """Print demo matches of the word ``this`` (debug helper).

        Prints the match result for the fixed sample text and then for
        ``line``.
        """
        # Raw string: in a plain literal '\b' is a backspace character, not
        # the regex word boundary.
        p = re.compile(r'\bthis\b')
        print(p.search('no class at all'))
        # re.search(line) lacked its second argument and always raised
        # TypeError; search `line` with the compiled pattern instead.
        print(p.search(line))

    def hgVersionJudge(self, nowVersion):
        """Return a ``LiftOver`` from ``hg<nowVersion>`` to hg19.

        For version 19 no conversion object is built and ``0`` is returned.
        """
        if int(nowVersion) == 19:
            return 0
        source_build = 'hg' + str(nowVersion)
        return LiftOver(source_build, 'hg19')

    def whole_genomeProcessor(self, wordslist, hgVersionNow, lo):
        """Build an HGVS-style query ID from one whole-genome record.

        Columns used (0-based): 3 chromosome, 5 position, 6 variant type,
        7 original base, 8 new base.

        Returns ``'<chromosome>:g.<position><ref>><alt>'``, or the sentinel
        ``'no'`` when the record is too short or is not of type ``snp``.
        ``hgVersionNow`` and ``lo`` are accepted but not used.
        """
        # Index 8 is read below, so at least 9 columns are required. The
        # previous check (< 8) let 8-column records through to an IndexError.
        if (len(wordslist) < 9):
            print(len(wordslist))
            return 'no'
        chromosome = wordslist[3]
        position = wordslist[5]
        vartype = wordslist[6]
        originalBase = wordslist[7]
        postBase = wordslist[8]
        if (vartype != 'snp'):
            return 'no'
        chro = chromosome
        print("mark" + chro)
        print(position)
        print(chromosome)
        queryinfos = (chromosome + ':g.' + position + originalBase + '>' +
                      postBase)
        return queryinfos

    # def hgVersion_ChrPosConvert(self,lo, hgVersionNow,chro,position):
    #    return
    def AncestryAndmeProcessor(self, wordslist, hgVersionNow, lo):
        """Return the query ID of an Ancestry/23andMe record.

        That is the first column of the record (the rsid); the remaining
        columns and the ``hgVersionNow``/``lo`` arguments are not used.
        """
        return wordslist[0]

    def AncestryAndmeProcessor_vcf_title(self, wordslist, hgVersionNow, lo):
        """Format an Ancestry/23andMe record as the leading VCF columns.

        Input columns: rsid, chromosome, position, genotype. The first two
        characters of the genotype become REF and ALT.

        Returns the tab-separated string ``CHROM POS ID REF ALT``.
        """
        rsid, chromosome, position, genotype = (wordslist[0], wordslist[1],
                                                wordslist[2], wordslist[3])
        ref_base, alt_base = genotype[0], genotype[1]
        return '\t'.join([chromosome, position, rsid, ref_base, alt_base])

    def vcfFileProcessor(self, wordslist, hgVersionNow, lo):
        """Build an hg19 HGVS-style query ID from one VCF record.

        Columns used (0-based): 0 chromosome, 1 position, 3 REF, 4 ALT.

        Args:
            wordslist: the tab-split VCF line.
            hgVersionNow: genome build of the input (e.g. 19 or 38).
            lo: the object returned by ``hgVersionJudge``: a ``LiftOver``
                to hg19 for non-hg19 input, ``0`` for hg19 input.

        Returns:
            ``'chr<N>:g.<pos><REF>><ALT>'``, or the sentinel ``'no'`` when
            the position cannot be lifted over to hg19.

        NOTE(review): "chr" is always prepended to column 0, so input whose
        chromosome column is already prefixed yields "chrchr..." -- confirm
        the input format. Confirm also whether positions must be shifted
        between 1-based VCF and the lift-over library's coordinates.
        """
        chromosome = wordslist[0]
        position = wordslist[1]
        originalBase = wordslist[3]
        postBase = wordslist[4]
        chro = "chr" + chromosome
        print("mark" + chro)
        print(position)
        position = int(position)
        # Only non-hg19 input needs converting. The old test (== 19) was
        # inverted and called convert_coordinate on the 0 placeholder that
        # hgVersionJudge returns for hg19.
        if int(hgVersionNow) != 19:
            convert = lo.convert_coordinate(chro, position)
            print(convert)
            if not convert:
                # Unknown chromosome or no hg19 counterpart for the locus.
                return 'no'
            # Read chromosome and position from the first result tuple
            # rather than parsing its str() form; the converted name already
            # includes the "chr" prefix.
            chro, position = convert[0][0], convert[0][1]
            print(chro)
        queryinfo = chro + ':g.' + str(position) + originalBase + '>' + postBase
        return queryinfo

    def expansion(self, dict1, dict0, key1):
        """Flatten a nested dict/list structure into ``dict0``.

        Nested dict keys are joined with '.' to form the flat key (``key1``
        is the prefix built so far, '' at the top level). List elements are
        expanded under the list's own key, so later elements overwrite
        earlier ones for the same flat key. Leaf values are stored as
        ``str``.
        """
        if isinstance(dict1, dict):
            for child_key, child_value in dict1.items():
                if key1 == '':
                    path = str(child_key)
                else:
                    path = str(key1) + '.' + str(child_key)
                self.expansion(child_value, dict0, path)
            return
        if isinstance(dict1, list):
            for element in dict1:
                self.expansion(element, dict0, key1)
            return
        dict0[key1] = str(dict1)

    def genequery(self):
        # get the chrosome, position and genotype and put them into the things suitable for mv commend
        # print the information in the file
        # to open file
        #print('line133 success')
        file = open(self.parameters, "r")
        fileb = open(self.outputfile_name, "a", encoding='utf-8')
        count = 0
        lineNumber = 0
        nonrs = 0
        title = {}
        outputs = []
        queryinfo_list = []
        vcfinfo_list = []
        # print("\ntest end, real begin")
        if (self.outputtype != 'csv'):
            fileb.write(
                '##fileformat=VCFv4.1 \n##fileDate=' +
                str(datetime.datetime.now()) +
                '\n##version=hg19 \n##CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO')
        for lines in file:
            lineNumber = lineNumber + 1
            if (lines[0] != '#' and lines[0] != '>'
                    and lineNumber >= self.lineBegin
                    and lineNumber <= self.lineEnd):
                count = count + 1
                wordslist = []
                words = lines.split("\t")
                queryinfo = ''
                output = ''
                result = ''
                for item in words:
                    wordslist.append(item)
                if (len(wordslist) >= 1):
                    if (self.filetype == 'vcf'):
                        queryinfo = self.vcfFileProcessor(
                            wordslist, self.hgversion, self.lo)
                        if (self.fieldKeyWords == 'None'):
                            print('140 : queryInfo' + self.filetype)
                            print('141: ' + queryinfo)
                            result = self.mv.getvariant(queryinfo)
                        else:
                            result = self.mv.getvariant(
                                queryinfo, fields=self.fieldKeyWords)
                    elif (self.filetype == '23andme'
                          or self.filetype == 'ancestry'):
                        queryinfo = self.AncestryAndmeProcessor(
                            wordslist, self.hgversion, self.lo)
                        #print(lineNumber)
                        queryinfo_list.append(queryinfo)
                        if (self.outputtype == 'vcf'):
                            titleinfo = self.AncestryAndmeProcessor_vcf_title(
                                wordslist, self.hgversion, self.lo)
                            vcfinfo_list.append(titleinfo)
                    elif (self.filetype == 'whole_genome'):
                        queryinfo = self.whole_genomeProcessor(
                            wordslist, self.hgversion, self.lo)
                        if (self.fieldKeyWords == 'None'):
                            result = self.mv.getvariant(queryinfo)
                        else:
                            result = self.mv.getvariant(
                                queryinfo, fields=self.fieldKeyWords)
                    else:
                        print(
                            'possible type : vcf, 23andme, ancestry, whole_genome'
                        )
                        break
                else:
                    continue
                if (queryinfo == 'no'):
                    continue
                #print(queryinfo)
                #got a query list but no result
                #if (count > 20):
                #break
            elif (count >= 0):
                nonrs = nonrs + 1
                #if (self.outputtype != 'csv' and (lines[0] == '#' or lines[0] == '>')):
                #fileb.write(lines)
        #for loop end here

        return queryinfo_list

        if (self.outputtype == 'vcf'):
            count = 0
            dul_count = 0
            test_out = ''
            results = self.mv.querymany(queryinfo_list,
                                        'dbsnp.rsid',
                                        returnall=True)
            for result in results['out']:
                # print('210',result)
                # result = result['hits'][0]
                test_output = {}
                if (True):
                    self.expansion(result, test_output, '')
                    strall = ''
                    for key in test_output.keys():
                        strall = strall + str(key) + ' : ' + test_output.get(
                            key) + ' '
                    test_out = vcfinfo_list[count] + strall + '\n'
                    fileb.write(test_out)
                    for keys in test_output.keys():
                        if (test_output.get(keys, 'a') != 'a'):
                            title[keys] = 1
                            # detect whether there is dup in dup[] list
                    if (dul_count == 0):
                        for item in results['dup']:
                            # print('223',queryinfo_list[count])
                            # print(item)
                            if queryinfo_list[count] in item:
                                dul_count = item[1] - 1
                    else:
                        dul_count = dul_count - 1
                    if (dul_count == 0):
                        count = count + 1
            #count = 0
            #dul_count = 0
            #results = self.mv.querymany(queryinfo_list, 'dbsnp.rsid', returnall=True)
            #print(str(lines) + '\t' + str(result))
            #fileb.write(str(output)+ '\t' + str(result) + '\n'

        if (self.outputtype == 'csv'):
            count = 0
            dul_count = 0
            start_time = time.time()
            results = self.mv.querymany(queryinfo_list,
                                        'dbsnp.rsid',
                                        returnall=True)
            print("--- time " + "%s sconds ---" % (time.time() - start_time))
            fileb.write("%s \n" % (time.time() - start_time))
            for result in results['out']:
                #print('210',result)
                #result = result['hits'][0]
                if (result != ""):
                    test_output = {'info': queryinfo_list[count]}
                    self.expansion(result, test_output, '')
                    #print(type(test_output))
                    #print(type(outputs))
                    outputs.append(test_output)
                    for keys in test_output.keys():
                        if (test_output.get(keys, 'a') != 'a'):
                            title[keys] = 1
                # detect whether there is dup in dup[] list
                    if (dul_count == 0):
                        for item in results['dup']:
                            #print('223',queryinfo_list[count])
                            #print(item)
                            if queryinfo_list[count] in item:
                                dul_count = item[1] - 1
                    else:
                        dul_count = dul_count - 1
                    if (dul_count == 0):
                        count = count + 1
        if (self.outputtype == 'csv'):
            #print('197')
            all_keys = title.keys()
            #print('all keys : ' , all_keys)
            #dict_writer = csv.DictWriter(fileb, title, restval='Nan',)
            #dict_writer.writeheader()
            #dict_writer.writerows(outputs)
        #print("line number is: ", lineNumber)
        #print("identiable line number is: ", count)
        #print("non-identiable line number is: ", nonrs)
        # to close the file
        file.close()
        fileb.close()
        #print("identiable line number is: ", count)
        #print("non-identiable line number is: ", nonrs)
        # to close the file
        file.close()
        fileb.close()


if __name__ == '__main__':
    # Demo driver: run a query over a small 23andMe export and split the
    # returned identifiers into batches.
    import myvariant
    import time
    from multiprocessing import Pool
    # NOTE(review): this pool is never used or closed in the visible code;
    # kept only to avoid changing the script's process-level behaviour.
    p = Pool(2)

    mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1')

    demo = query('23andme_small.txt', 'time_record.txt', '23andme', 'csv', 19,
                 'None', 29, 1000)
    list1 = demo.genequery()
    print(len(list1))

    # Batch the identifiers into lists of at most `chunk_size` items.
    # The previous counter-based loop silently dropped the item that arrived
    # when the counter reached the limit and never stored the final, partially
    # filled batch; slicing keeps every item exactly once.
    chunk_size = 1000
    listall = [
        list1[start:start + chunk_size]
        for start in range(0, len(list1), chunk_size)
    ]
    #print(listall)
示例#25
0
def get_dbsnp(data, region, force=False):
    """Attach dbSNP rsids from MyVariant.info to the mutations in *data*.

    Every MyVariant.info record in *region* that has both ``dbsnp`` and
    ``hg19`` annotations is indexed by ``start - 1`` and by an operation
    string (``SNP.<ref><alt>``, ``INS.<bases>`` or ``DEL.<bases>``). Each
    mutation's ``'dbsnp'`` value is then rewritten in place as a list, and a
    matching rsid is appended when it is not already present.

    Args:
        data: mapping whose values carry a ``'mutations'`` list of dicts with
            ``'pos'``, ``'op'``, ``'dbsnp'`` and ``'old'`` keys.
        region: three values interpolated into the query as ``{}:{}-{}``.
        force: not used by this function.

    Returns:
        The same ``data`` object that was passed in (mutated in place).
    """
    client = myvariant.MyVariantInfo()
    hits = list(
        client.query(
            '_exists_:dbsnp AND _exists_:hg19 AND {}:{}-{}'.format(*region),
            fields='dbsnp',
            fetch_all=True))

    # VCF, dbSNP and myVariant use 1-based indexing
    dbsnp = collections.defaultdict(dict)
    for hit in hits:
        record = hit['dbsnp']
        pos = record['hg19']['start'] - 1
        ref = record['ref']
        alt = record['alt']
        rs = record['rsid']
        if len(ref) > 1 or len(alt) > 1:
            # Indels are expected to share their first (anchor) base.
            assert (ref[0] == alt[0])
        if len(ref) > 1:
            op = 'DEL.{}'.format(ref[1:])
        elif len(alt) > 1:
            op = 'INS.{}'.format(alt[1:].lower())
        else:
            op = 'SNP.{}{}'.format(ref, alt)
        dbsnp[pos][op] = rs

    for allele in sorted(data):
        for m in data[allele]['mutations']:
            if m['pos'] == 'pseudogene':
                continue
            # Normalise the existing annotation to a list ('' and '*' mean
            # "no rsid recorded").
            m['dbsnp'] = [m['dbsnp']] if m['dbsnp'] not in ['', '*'] else []
            pos, op = m['pos'], m['op']
            known = dbsnp[pos]

            if op in known:
                matched_op = op
            elif (len(known) > 0 and op[:3] == 'SNP'
                  and op[:4] + op[4:6][::-1] in known):
                # Same SNP with reference and alternate bases swapped.
                matched_op = op[:4] + op[4:6][::-1]
            else:
                if len(known) != 0:
                    log.trace('How about {} for {}:{} ({})', known, pos, op,
                              m['old'])
                continue

            rsid = str(known[matched_op])
            if rsid in m['dbsnp']:
                log.debug(
                    "dbSNP: Variant {} matches the Karolinska's prediction",
                    rsid)
                continue
            if len(m['dbsnp']) > 0:
                # Mark the pre-existing entry before adding the new rsid.
                m['dbsnp'][0] += '(k)'
            m['dbsnp'].append(rsid)
            log.debug('dbSNP: Variant {} assigned to {}:{}', rsid, pos,
                      matched_op)
    return data
class query:
    """Read a genotype file and look its variants up on MyVariant.info.

    Supported input types (``type`` argument): ``'vcf'``, ``'23andme'``,
    ``'ancestry'`` and ``'whole_genome'``. Processor methods return the
    string ``'no'`` for a line that cannot be turned into a query; callers
    skip such lines.
    """

    # Shared MyVariant.info client used for every lookup.
    mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1')

    def __init__(self, fileName, outputfile_name, type, outputtype, version=19, fieldKeyWords='None', lineBegin=-1, lineEnd=math.inf):
        """Store the run configuration.

        Args:
            fileName: path of the input file.
            outputfile_name: path of the file to write.
            type: input file type (see class docstring).
            outputtype: ``'csv'`` or ``'vcf'``.
            version: genome build number of the input; 19 means no liftover.
            fieldKeyWords: value passed as ``fields=`` to MyVariant, or the
                string ``'None'`` to request the default fields.
            lineBegin: first (1-based) line number to process.
            lineEnd: last (1-based) line number to process.
        """
        self.parameters = fileName
        self.outputfile_name = outputfile_name
        self.hgversion = version
        self.filetype = type
        self.outputtype = outputtype
        self.lo = self.hgVersionJudge(version)
        self.fieldKeyWords = fieldKeyWords
        self.lineBegin = lineBegin
        self.lineEnd = lineEnd

    def infosearch(self, line):
        """Debug helper: print whole-word matches of 'this' in a fixed
        sample string and in *line*.

        The pattern is a raw string so ``\\b`` is a word boundary rather than
        a backspace, and the compiled pattern is applied to *line* (the
        previous ``re.search(line)`` call always raised ``TypeError``).
        """
        p = re.compile(r'\bthis\b')
        print(p.search('no class at all'))
        print(p.search(line))

    def hgVersionJudge(self, nowVersion):
        """Return a ``LiftOver`` from ``hg<nowVersion>`` to hg19, or ``0``
        when the input is already build 19."""
        if int(nowVersion) != 19:
            return LiftOver('hg' + str(nowVersion), 'hg19')
        return 0

    def whole_genomeProcessor(self, wordslist, hgVersionNow, lo):
        """Build an HGVS-style id from one whole-genome row.

        Reads columns 3 (chromosome), 5 (position), 6 (variant type),
        7 (reference base) and 8 (alternate base). Returns ``'no'`` when the
        row is too short or the variant type is not ``'snp'``.
        """
        # Column 8 is read below, so at least nine columns are required
        # (the previous ``< 8`` guard let 8-column rows raise IndexError).
        if len(wordslist) < 9:
            print(len(wordslist))
            return 'no'
        chromosome = wordslist[3]
        position = wordslist[5]
        vartype = wordslist[6]
        originalBase = wordslist[7]
        postBase = wordslist[8]
        if vartype != 'snp':
            return 'no'
        print("mark" + chromosome)
        print(position)
        print(chromosome)
        return chromosome + ':g.' + position + originalBase + '>' + postBase

    # def hgVersion_ChrPosConvert(self,lo, hgVersionNow,chro,position):
    #    return
    def AncestryAndmeProcessor(self, wordslist, hgVersionNow, lo):
        """Return the rsid, i.e. the first column of a 23andMe/Ancestry row."""
        return wordslist[0]

    def vcfFileProcessor(self, wordslist, hgVersionNow, lo):
        """Build an hg19 HGVS id (``chrN:g.POSREF>ALT``) from one VCF row.

        Reads CHROM, POS, REF and ALT from columns 0, 1, 3 and 4. When the
        input build is not 19 the coordinate is lifted over with *lo*.

        Returns:
            The HGVS id, or ``'no'`` when the liftover yields no result.

        Raises:
            ValueError: if the position column is not an integer.
        """
        chromosome = wordslist[0]
        position = wordslist[1]
        originalBase = wordslist[3]
        postBase = wordslist[4]
        # Accept both "1" and "chr1" so the id never starts with "chrchr".
        if chromosome.lower().startswith('chr'):
            chromosome = chromosome[3:]
        chro = "chr" + chromosome
        print("mark" + chro)
        print(position)
        position = int(position)
        # Liftover is needed only when the input is NOT already hg19; in that
        # case hgVersionJudge supplied a LiftOver object (for build 19 it
        # returns 0, which the old ``== 19`` test then tried to call).
        if int(hgVersionNow) != 19:
            # NOTE(review): pyliftover documents 0-based coordinates while
            # VCF POS is 1-based -- confirm whether an offset is required.
            convert = lo.convert_coordinate(chro, position)
            print(convert)
            if not convert:
                # None (unknown chromosome) or [] (locus absent in hg19).
                return 'no'
            # Presumably (chromosome, position, strand, score) tuples, as in
            # pyliftover; only the first two fields of the best hit are used.
            lifted_chrom, lifted_pos = convert[0][0], convert[0][1]
            lifted_chrom = str(lifted_chrom)
            if not lifted_chrom.lower().startswith('chr'):
                lifted_chrom = 'chr' + lifted_chrom
            chro = lifted_chrom
            position = lifted_pos
            print(chro)
        return chro + ':g.' + str(position) + originalBase + '>' + postBase

    def expansion(self, dict1, dict0, key1):
        """Flatten nested dict *dict1* into *dict0* using dotted keys.

        Leaves (anything that is not a dict) are stored as ``str(value)``
        under the dotted path accumulated in *key1*.
        """
        if not isinstance(dict1, dict):
            dict0[key1] = str(dict1)
            return
        for key2, value in dict1.items():
            key3 = str(key1) + '.' + str(key2) if key1 != '' else str(key2)
            self.expansion(value, dict0, key3)

    def queries(self, queryinfo, wordslist):
        """Run a free-text MyVariant query for one input row.

        Returns:
            For ``'vcf'`` output, a tab-separated line prefix built from
            *wordslist*; for ``'csv'`` output, the first hit (or ``None`` if
            there are no hits); otherwise ``None``. Earlier versions computed
            these values but discarded them.
        """
        # Bind the same name in both branches; the default-fields branch used
        # to assign ``result_collection`` and left ``result`` undefined.
        if self.fieldKeyWords == 'None':
            result = self.mv.query(queryinfo)
        else:
            result = self.mv.query(queryinfo, fields=self.fieldKeyWords)
        if self.outputtype == 'vcf':
            return (wordslist[1] + '\t' + wordslist[2] + '\t' + wordslist[0]
                    + '\t' + wordslist[3][0] + '\t' + wordslist[3][1]
                    + '\t.\t.\t')
        if self.outputtype == 'csv':
            print(result)
            hits = result['hits']
            return hits[0] if hits else None
        return None

    def _fetch_variant(self, queryinfo):
        """Fetch one variant by id, restricting fields when configured."""
        if self.fieldKeyWords == 'None':
            return self.mv.getvariant(queryinfo)
        return self.mv.getvariant(queryinfo, fields=self.fieldKeyWords)

    def genequery(self):
        """Process the input file and write the output file.

        For ``'vcf'`` and ``'whole_genome'`` inputs each accepted line is
        looked up individually; for CSV output the flattened results are
        written with a header made of every key seen. For ``'23andme'`` and
        ``'ancestry'`` inputs the rsids are only collected.

        Returns:
            list: the rsids collected from 23andMe/Ancestry rows (empty for
            other input types).
        """
        result = ''
        count = 0
        lineNumber = 0
        nonrs = 0
        title = {}
        outputs = []
        multiple_query = []
        # Context managers guarantee both files are closed even when a lookup
        # or a malformed row raises.
        # NOTE(review): the csv module recommends opening with newline='';
        # left unchanged because the same handle also receives the VCF header.
        with open(self.parameters, "r") as infile, \
                open(self.outputfile_name, "w", encoding='utf-8') as outfile:
            if self.outputtype != 'csv':
                outfile.write(
                    '##fileformat=VCFv4.1 \n##fileDate=' +
                    str(datetime.datetime.now()) +
                    '\n##version=hg19 \n##CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')
            for lines in infile:
                lineNumber += 1
                is_data = lines[0] != '#' and lines[0] != '>'
                if not (is_data
                        and self.lineBegin <= lineNumber <= self.lineEnd):
                    nonrs += 1
                    continue
                count += 1
                # Drop the line terminator so the last column is clean when
                # it ends up inside a query string.
                wordslist = lines.rstrip('\r\n').split("\t")
                if self.filetype == 'vcf':
                    queryinfo = self.vcfFileProcessor(
                        wordslist, self.hgversion, self.lo)
                    # Skip unusable rows before spending an API call on them.
                    if queryinfo == 'no':
                        continue
                    if self.fieldKeyWords == 'None':
                        print('140 : queryInfo' + self.filetype)
                        print('141: ' + queryinfo)
                    result = self._fetch_variant(queryinfo)
                elif (self.filetype == '23andme'
                      or self.filetype == 'ancestry'):
                    queryinfo = self.AncestryAndmeProcessor(
                        wordslist, self.hgversion, self.lo)
                    multiple_query.append(queryinfo)
                    if queryinfo == 'no':
                        continue
                elif self.filetype == 'whole_genome':
                    queryinfo = self.whole_genomeProcessor(
                        wordslist, self.hgversion, self.lo)
                    if queryinfo == 'no':
                        continue
                    result = self._fetch_variant(queryinfo)
                else:
                    print('possible type : vcf, 23andme, ancestry, whole_genome')
                    break
                # NOTE(review): a lookup that finds nothing presumably yields
                # None, which is flattened to a {'': 'None'} row -- confirm
                # whether such rows are wanted.
                if result != "" and self.outputtype == 'csv':
                    test_output = {}
                    self.expansion(result, test_output, '')
                    print(type(test_output))
                    print(type(outputs))
                    outputs.append(test_output)
                    # Register every key as a CSV column. The former
                    # ``value != 'a'`` sentinel test skipped keys whose value
                    # was literally 'a', making DictWriter reject the row.
                    title.update(dict.fromkeys(test_output, 1))
                # Hard cap on the number of processed data lines.
                if count > 2000:
                    break
            if self.outputtype == 'csv':
                print('197')
                dict_writer = csv.DictWriter(outfile, title)
                dict_writer.writeheader()
                dict_writer.writerows(outputs)
        print("line number is: ", lineNumber)
        print("identiable line number is: ", count)
        print("non-identiable line number is: ", nonrs)
        return multiple_query
示例#27
0
def create_missense_variant_item(hgvs, label, login, fast_run=True):
    """Build a ``WDItemEngine`` item for a missense variant and write it.

    The variant is fetched from MyVariant.info by its HGVS id; chromosome,
    hg19 start/end and gene are turned into statements that all cite
    MyVariant.info as reference.

    Args:
        hgvs: HGVS id of the variant.
        label: label to set on the item.
        login: login object handed to ``wdi_helpers.try_write``.
        fast_run: forwarded to ``WDItemEngine``.

    Returns:
        The ``WDItemEngine`` item after the write attempt.

    Raises:
        ValueError: if MyVariant.info returns nothing for *hgvs*, or the
            record lacks ``hg19`` or ``dbnsfp`` data.
    """
    print(hgvs)
    mv = myvariant.MyVariantInfo()
    vd = mv.getvariant(hgvs)
    # Validate before reading any field: previously ``vd['chrom']`` was
    # dereferenced first, so an empty/None lookup result crashed with a
    # TypeError instead of raising the intended ValueError.
    if not vd or 'hg19' not in vd or 'dbnsfp' not in vd:
        raise ValueError(
            "Metadata not found in MyVariant, unable to create item")
    chrom = human_chromosome_map[vd['chrom'].upper()]
    start = str(vd['hg19']['start'])
    end = str(vd['hg19']['end'])
    gene = hgnc_qid[vd['dbnsfp']['genename'].upper()]
    url = "http://myvariant.info/v1/variant/{}".format(quote(hgvs))

    # One reference block: stated in MyVariant.info, URL, retrieval date.
    ref = [
        wdi_core.WDItemID(ITEMS['MyVariant.info'],
                          PROPS['stated in'],
                          is_reference=True),
        wdi_core.WDUrl(url, PROPS['reference URL'], is_reference=True),
        wdi_core.WDTime(strftime("+%Y-%m-%dT00:00:00Z", gmtime()),
                        PROPS['retrieved'],
                        is_reference=True)
    ]
    # Qualifier attached to the positional statements (GRCh37 assembly).
    ga_qual = wdi_core.WDItemID(ITEMS['Genome assembly GRCh37'],
                                PROPS['genomic assembly'],
                                is_qualifier=True)

    s = [
        wdi_core.WDItemID(ITEMS['sequence variant'],
                          PROPS['instance of'],
                          references=[ref]),
        wdi_core.WDItemID(ITEMS['Missense Variant'],
                          PROPS['subclass of'],
                          references=[ref]),
        wdi_core.WDItemID(chrom,
                          PROPS['chromosome'],
                          references=[ref],
                          qualifiers=[ga_qual]),
        wdi_core.WDString(start,
                          PROPS['genomic start'],
                          references=[ref],
                          qualifiers=[ga_qual]),
        wdi_core.WDString(end,
                          PROPS['genomic end'],
                          references=[ref],
                          qualifiers=[ga_qual]),
        wdi_core.WDItemID(gene,
                          PROPS['biological variant of'],
                          references=[ref]),
        wdi_core.WDExternalID(hgvs,
                              PROPS['HGVS nomenclature'],
                              references=[ref]),
    ]

    item = wdi_core.WDItemEngine(
        item_name=label,
        data=s,
        domain="variant",
        fast_run=fast_run,
        fast_run_base_filter={PROPS['HGVS nomenclature']: ''},
        fast_run_use_refs=True,
        ref_handler=update_retrieved_if_new_multiple_refs,
        core_props=core_props)
    item.set_label(label)
    item.set_description("genetic variant")
    wdi_helpers.try_write(item, hgvs, PROPS['HGVS nomenclature'], login)
    return item
示例#28
0
def search(query,
           user=None,
           search_type=None,
           taxon="H**o sapiens",
           source="entrez"):
    result_set = []

    if (source.lower() in ["myvariant", "all"]):
        mv = myvariant.MyVariantInfo()
        result = mv.query(query)

        for hit in result["hits"]:
            temp_identifier_list = []

            temp_var = gnomics.objects.variation.Variation(
                identifier=hit["_id"],
                identifier_type="HGVS ID",
                language=None,
                source="MyVariant",
                taxon="H**o sapiens")
            temp_identifier_list.append(hit["_id"])

            if "gnomad_genome" in hit:
                if hit["gnomad_genome"]["rsid"] not in temp_identifier_list:
                    gnomics.objects.variation.Variation.add_identifier(
                        temp_var,
                        identifier=hit["gnomad_genome"]["rsid"],
                        identifier_type="RS Number",
                        language=None,
                        source="MyVariant",
                        taxon="H**o sapiens")
                    temp_identifier_list.append(hit["gnomad_genome"]["rsid"])

                if "clinvar" in hit:

                    for genomic_hgvs in hit["clinvar"]["hgvs"]["genomic"]:
                        if genomic_hgvs not in temp_identifier_list:
                            gnomics.objects.variation.Variation.add_identifier(
                                temp_var,
                                identifier=genomic_hgvs,
                                identifier_type="Genomic HGVS ID",
                                language=None,
                                source="MyVariant",
                                taxon="H**o sapiens")
                            temp_identifier_list.append(genomic_hgvs)

                    if "coding" in hit["clinvar"]["hgvs"]:
                        if hit["clinvar"]["hgvs"][
                                "coding"] not in temp_identifier_list:
                            gnomics.objects.variation.Variation.add_identifier(
                                temp_var,
                                identifier=hit["clinvar"]["hgvs"]["coding"],
                                identifier_type="Coding HGVS ID",
                                language=None,
                                source="MyVariant",
                                taxon="H**o sapiens")
                            temp_identifier_list.append(
                                hit["clinvar"]["hgvs"]["coding"])

                    if "variant_id" in hit["clinvar"]:
                        if hit["clinvar"][
                                "variant_id"] not in temp_identifier_list:
                            gnomics.objects.variation.Variation.add_identifier(
                                temp_var,
                                identifier=hit["clinvar"]["variant_id"],
                                identifier_type="Variant ID",
                                language=None,
                                source="MyVariant",
                                taxon="H**o sapiens")
                            temp_identifier_list.append(
                                hit["clinvar"]["variant_id"])

                    if "rcv" in hit["clinvar"]:

                        print("here")
                        print(hit)

                        if type(hit["clinvar"]["rcv"]) == list:
                            for sub_hit in hit["clinvar"]["rcv"]:
                                if sub_hit[
                                        "accession"] not in temp_identifier_list:
                                    gnomics.objects.variation.Variation.add_identifier(
                                        temp_var,
                                        identifier=sub_hit["accession"],
                                        identifier_type="ClinVar Accession",
                                        name=sub_hit["preferred_name"],
                                        taxon="H**o sapiens")
                                    temp_identifier_list.append(
                                        sub_hit["accession"])
                        else:
                            if hit["clinvar"]["rcv"][
                                    "accession"] not in temp_identifier_list:
                                gnomics.objects.variation.Variation.add_identifier(
                                    temp_var,
                                    identifier=hit["clinvar"]["rcv"]
                                    ["accession"],
                                    identifier_type="ClinVar Accession",
                                    name=hit["clinvar"]["rcv"]
                                    ["preferred_name"],
                                    taxon="H**o sapiens")
                                temp_identifier_list.append(
                                    hit["clinvar"]["rcv"]["accession"])

            result_set.append(temp_var)

    # Adapted from:
    # https://www.ncbi.nlm.nih.gov/dbvar/content/tools/entrez/
    if (source.lower() in ["ncbi", "entrez", "all"]) and user is not None:

        if user.email is not None:

            Entrez.email = user.email
            paramEutils = {"usehistory": "Y"}
            full_query = "('variant'[Object Type] AND %s)" % query

            eSearch = Entrez.esearch(db="dbvar",
                                     term=full_query,
                                     **paramEutils)
            res = Entrez.read(eSearch)

            if res["IdList"]:
                for iden in res["IdList"]:
                    if taxon == "H**o sapiens":
                        temp_var = gnomics.objects.variation.Variation(
                            identifier=iden,
                            identifier_type="Variant Region ID",
                            language=None,
                            source="dbVar",
                            name=None,
                            taxon=taxon)
                        result_set.append(temp_var)
            else:
                paramEutils = {"usehistory": "Y"}
                eSearch = Entrez.esearch(db="snp", term=query, **paramEutils)
                res = Entrez.read(eSearch)
                for iden in res["IdList"]:
                    if taxon == "H**o sapiens":
                        temp_var = gnomics.objects.variation.Variation(
                            identifier=iden,
                            identifier_type="RS Number",
                            language=None,
                            source="dbSNP",
                            name=None,
                            taxon=taxon)
                        result_set.append(temp_var)

        else:
            print(
                "Search cannot continue without a valid user and a valid email address associated with such a user object."
            )

    if (source.lower() in ["ensembl", "all"]):

        if taxon == "H**o sapiens":
            server = "https://rest.ensembl.org"
            ext = "/variation/human/" + str(query) + "?"

            r = requests.get(server + ext,
                             headers={"Content-Type": "application/json"})

            if not r.ok:
                print("No match found.")
            else:

                decoded = r.json()

                if "name" in decoded:
                    temp_var = gnomics.objects.variation.Variation(
                        identifier=decoded["name"],
                        identifier_type="Ensembl Variation ID",
                        language=None,
                        source="Ensembl",
                        taxon="H**o sapiens")

                    if "rs" in decoded["name"]:
                        gnomics.objects.variation.Variation.add_identifier(
                            temp_var,
                            identifier=decoded["name"],
                            identifier_type="RS Number",
                            language=None,
                            source="Ensembl",
                            taxon="H**o sapiens")

                    for syn in decoded["synonyms"]:
                        gnomics.objects.variation.Variation.add_identifier(
                            temp_var,
                            identifier=syn,
                            identifier_type="Ensembl Synonym",
                            language=None,
                            source="Ensembl",
                            taxon="H**o sapiens")

                    result_set.append(temp_var)

    if (source.lower() in ["ebi", "embl", "proteins api", "all"]):

        var_match = re.compile(
            r"[ARNDBCEQZGHILKMFPSTWYV]\d+[ARNDBCEQZGHILKMFPSTWYV]")
        matched = re.findall(var_match, query)

        var_match_2 = re.compile(r"[ARNDBCEQZGHILKMFPSTWYV]\d+")
        matched_2 = re.findall(var_match_2, query)

        if matched:

            gene = query.split(" ")[0].strip()
            variation = query.split(" ")[1].strip().replace("(", "").replace(
                ")", "").strip()

            # Get Ensembl identifier from gene query.
            server = "https://rest.ensembl.org"
            ext = "/xrefs/symbol/" + taxon.lower().replace(
                " ", "_") + "/" + gene + "?"
            r = requests.get(server + ext,
                             headers={"Content-Type": "application/json"})
            if not r.ok:
                r.raise_for_status()
                sys.exit()
            decoded = r.json()
            ensembl_gene_id = ""
            for x in decoded:
                if "ENSG" in x["id"]:
                    ensembl_gene_id = x["id"]

            # Get UniProt identifier from Ensembl identifier.
            server = "https://rest.ensembl.org"
            ext = "/xrefs/id/" + ensembl_gene_id + "?"
            r = requests.get(server + ext,
                             headers={"Content-Type": "application/json"})
            if not r.ok:
                r.raise_for_status()
                sys.exit()
            decoded = r.json()
            uniprot_accession = ""
            for x in decoded:
                if x["dbname"] == "Uniprot_gn":
                    uniprot_accession = x["primary_id"]

            wild_match = re.compile(
                "([ARNDBCEQZGHILKMFPSTWYV])\d+[ARNDBCEQZGHILKMFPSTWYV]")
            alt_match = re.compile(
                "[ARNDBCEQZGHILKMFPSTWYV]\d+([ARNDBCEQZGHILKMFPSTWYV])")

            wildtype = re.findall(wild_match, variation)[0]
            location_1 = ''.join(filter(str.isdigit, variation))
            location_2 = ''.join(filter(str.isdigit, variation))
            alternativesequence = re.findall(alt_match, variation)[0]

            url = "https://www.ebi.ac.uk/proteins/api/"
            ext = "variation?offset=0&size=100&wildtype=" + wildtype + "&alternativesequence=" + alternativesequence + "&location=" + str(
                location_1) + "-" + str(
                    location_2) + "&accession=" + uniprot_accession

            r = requests.get(url + ext,
                             headers={"Content-Type": "application/json"})

            if not r.ok:
                print("Something went wrong.")
            else:
                decoded = r.json()

                var_array = []
                var_id_array = []
                for x in decoded:

                    for feat in x["features"]:

                        if "ftId" in feat:

                            temp_var = gnomics.objects.variation.Variation(
                                identifier=feat["ftId"],
                                identifier_type="ftId",
                                source="Proteins API")

                            var_id_array.append(feat["ftId"])

                            for xref in feat["xrefs"]:

                                if "COSM" in xref["id"] and xref[
                                        "id"] not in var_id_array:
                                    gnomics.objects.variation.Variation.add_identifier(
                                        temp_var,
                                        identifier=xref["id"],
                                        identifier_type="COSMIC Mutation ID",
                                        source="COSMIC")
                                    var_id_array.append(xref["id"])

                                elif "rs" in xref["id"] and xref[
                                        "id"] not in var_id_array:
                                    gnomics.objects.variation.Variation.add_identifier(
                                        temp_var,
                                        identifier=xref["id"],
                                        identifier_type="RS Number",
                                        source="dbSNP")
                                    var_id_array.append(xref["id"])

                                else:
                                    print("Other identifier found.")
                                    print(xref["id"])

                            result_set.append(temp_var)

                        else:
                            print("No ftId in feature.")
                            print(feat)

        elif matched_2:

            if len(query.split(" ")) > 1:

                gene = query.split(" ")[0].strip()
                variation = query.split(" ")[1].strip().replace(
                    "(", "").replace(")", "").strip()

                # Get Ensembl identifier from gene query.
                server = "https://rest.ensembl.org"
                ext = "/xrefs/symbol/" + taxon.lower().replace(
                    " ", "_") + "/" + gene + "?"
                r = requests.get(server + ext,
                                 headers={"Content-Type": "application/json"})
                if not r.ok:
                    r.raise_for_status()
                    sys.exit()
                decoded = r.json()
                ensembl_gene_id = ""
                for x in decoded:
                    if "ENSG" in x["id"]:
                        ensembl_gene_id = x["id"]

                # Get UniProt identifier from Ensembl identifier.
                server = "https://rest.ensembl.org"
                ext = "/xrefs/id/" + ensembl_gene_id + "?"
                r = requests.get(server + ext,
                                 headers={"Content-Type": "application/json"})
                if not r.ok:
                    r.raise_for_status()
                    sys.exit()
                decoded = r.json()
                uniprot_accession = ""
                for x in decoded:
                    if x["dbname"] == "Uniprot_gn":
                        uniprot_accession = x["primary_id"]

                wild_match = re.compile("([ARNDBCEQZGHILKMFPSTWYV])\d+")

                wildtype = re.findall(wild_match, variation)[0]
                location_1 = ''.join(filter(str.isdigit, variation))
                location_2 = ''.join(filter(str.isdigit, variation))

                url = "https://www.ebi.ac.uk/proteins/api/"
                ext = "variation?offset=0&size=100&wildtype=" + wildtype + "&location=" + str(
                    location_1) + "-" + str(
                        location_2) + "&accession=" + uniprot_accession

                r = requests.get(url + ext,
                                 headers={"Content-Type": "application/json"})

                if not r.ok:
                    print("Something went wrong.")
                else:
                    decoded = r.json()
                    var_array = []
                    var_id_array = []

                    for x in decoded:
                        for feat in x["features"]:
                            if "ftId" in feat:

                                temp_var = gnomics.objects.variation.Variation(
                                    identifier=feat["ftId"],
                                    identifier_type="ftId",
                                    source="Proteins API")
                                var_id_array.append(feat["ftId"])

                                for xref in feat["xrefs"]:
                                    if "COSM" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type=
                                            "COSMIC Mutation ID",
                                            source="COSMIC")
                                        var_id_array.append(xref["id"])

                                    elif "rs" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type="RS Number",
                                            source="dbSNP")
                                        var_id_array.append(xref["id"])

                                    elif "RCV" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type="ClinVar Accession",
                                            source="ClinVar")
                                        var_id_array.append(xref["id"])

                                    else:
                                        continue

                                result_set.append(temp_var)

                            else:

                                temp_var = gnomics.objects.variation.Variation(
                                )
                                for xref in feat["xrefs"]:
                                    if "COSM" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type=
                                            "COSMIC Mutation ID",
                                            source="COSMIC")
                                        var_id_array.append(xref["id"])

                                    elif "rs" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type="RS Number",
                                            source="dbSNP")
                                        var_id_array.append(xref["id"])

                                    elif "RCV" in xref["id"] and xref[
                                            "id"] not in var_id_array:
                                        gnomics.objects.variation.Variation.add_identifier(
                                            temp_var,
                                            identifier=xref["id"],
                                            identifier_type="ClinVar Accession",
                                            source="ClinVar")
                                        var_id_array.append(xref["id"])

                                    else:
                                        continue

                                if len(temp_var.identifiers) > 0:
                                    result_set.append(temp_var)

    if (source.lower() in ["ncbi", "entrez"]) and user is not None:
        print(
            "The Entrez database cannot be searched without a valid user email provided."
        )

    return result_set
示例#29
0
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  1 20:56:40 2019

@author: Nicky
"""
import myvariant  # client for MyVariant.info, used here to fetch ClinVar data
mv = myvariant.MyVariantInfo()

# TODO: read the rs code from the outputdb.txt file that app.py produces
# instead of hard-coding it below.

# Look up the variant by its rs code.
info = mv.querymany(['rs121913364'], scopes='dbsnp.rsid')
# Keep only the HGVS ids of real hits; entries without an '_id' (rs code not
# matched) are skipped instead of raising KeyError.
a = [d['_id'] for d in info if '_id' in d]
# First HGVS id (e.g. 'chr1:g.35367G>A'), or None when nothing was found.
genomeposition = a[0] if a else None
print(genomeposition)
# Fetch the full annotation for that position; skipped when there is no id.
clinvar_result = None
if genomeposition is not None:
    clinvar_result = mv.getvariant(genomeposition)
print(clinvar_result)
# Decide on the raw result, not on its str() form: str(None) is the
# non-empty string 'None', so testing the converted text never detects a miss.
if clinvar_result is None:
    text = "No results found on Clinvar"
else:
    text = str(clinvar_result)
# Write the report; the with-block closes the file even if the write fails.
with open("ClinvarResults.txt", "w") as file:
    file.write(text)
# Read the report back and echo it.
with open('ClinvarResults.txt', 'r') as f2:
    data = f2.read()
    print(data)
示例#30
0
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    The genome is matched against the latest b37 ClinVar VCF. A match is kept
    only if it passes the VCF filters and at least one of its ClinVar records
    has significance '4' or '5'. Kept variants are then annotated with
    myvariant.info 'clinvar' and 'exac' data and written one per CSV row as:
    inputfilename, HGVS id, preferred name, disease name, ClinVar URL,
    ExAC URL, allele frequency, frequency source, GET-Evidence URL.

    Args:
        inputfile: genome VCF input, handed to vcf2clinvar.match_to_clinvar.
        outputfile: path of the CSV report to write (overwritten).
        inputfilename: label stored in the first column of every row.

    Raises:
        IOError: if the ClinVar file name does not end with '.vcf',
            '.vcf.gz' or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    try:
        # Run vcf2clinvar on genome data.
        # NOTE(review): the matches are presumably produced lazily while the
        # ClinVar file is read, so the handle is kept open until the loop
        # below has finished and only then closed.
        clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile,
                                                       input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue
            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue
            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                           genome_vcf_line.start,
                                           genome_vcf_line.ref_allele,
                                           allele.sequence)
            data[poskey] = {
                'genome_vcf_line': genome_vcf_line,
                'clinvar_allele': allele,
                'zygosity': zygosity
            }
    finally:
        # The ClinVar handle was previously leaked; always release it.
        input_clinvar_file.close()

    # Materialise the keys as a list: a dict view cannot be indexed on
    # Python 3, and a list gives one stable order for querying and reporting.
    variants = list(data)

    # Add data from myvariant.info (mainly for ExAC data) using the HGVS
    # positions. Skip the remote call entirely when nothing matched.
    if variants:
        mv = myvariant.MyVariantInfo()
        mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
        # Results are paired with the queried ids by position.
        for var, mv_hit in zip(variants, mv_output):
            if 'clinvar' in mv_hit:
                data[var]['mv_clinvar'] = mv_hit['clinvar']
            if 'exac' in mv_hit:
                data[var]['mv_exac'] = mv_hit['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]
            vcf_line = entry['genome_vcf_line']
            clinvar_allele = entry['clinvar_allele']

            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                clinvar_allele.records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    mv_clinvar['variant_id'])
                # 'rcv' is either a single record or a list of records;
                # normalise to a list so both shapes share one code path.
                rcvs = mv_clinvar['rcv']
                if not isinstance(rcvs, list):
                    rcvs = [rcvs]
                # Sorted so the joined names come out in a deterministic
                # order rather than in arbitrary set order.
                disease_name = ', '.join(
                    sorted(set(rcv['conditions']['name'] for rcv in rcvs)))
                preferred_name = rcvs[0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)

            # chrom[3:] drops the first three characters of the chromosome
            # name (assumes a 'chr' prefix -- TODO confirm).
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                vcf_line.chrom[3:],
                vcf_line.start,
                vcf_line.ref_allele,
                clinvar_allele.sequence)

            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                mv_exac = entry['mv_exac']
                total_freq = str(
                    mv_exac['ac']['ac'] * 1.0 / mv_exac['an']['an'])
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(clinvar_allele.frequency)
                    freq_source = 'ClinVar'
                except (KeyError, AttributeError):
                    # If that fails, give up on frequency. AttributeError is
                    # included because a missing attribute does not raise
                    # KeyError.
                    total_freq = ''
                    freq_source = 'Unknown'

            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url
            ]
            csv_out.writerow(data_row)