def rsidsToHG37Positions(rsidList):
    """Return a DataFrame containing hg19/GRCh37 positions for a list of rsids.

    args:
        rsidList (list of str): the rsids

    returns:
        df (DataFrame): the hg19 start position of each rsid that was found,
            indexed by rsid
    """
    mv = myvariant.MyVariantInfo()
    # MyVariant.info expects 'hg19' or 'hg38' for the assembly (GRCh37 == hg19).
    gen = mv.querymany(rsidList,
                       scopes='dbsnp.rsid',
                       fields='dbsnp.rsid,dbsnp.hg19.start',
                       fetch_all=True,
                       assembly='hg19')
    rsids = {}
    for row in gen:
        try:
            rsid = row['dbsnp']['rsid']
            start = row['dbsnp']['hg19']['start']
            rsids[rsid] = start
        except KeyError:
            continue
    df = pd.DataFrame.from_dict(rsids, orient='index')
    return df
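# Minimal usage sketch for rsidsToHG37Positions, assuming `import myvariant` and
# `import pandas as pd` at module level as the function expects. The rsids are
# arbitrary illustrative examples, not taken from the original code.
example_rsids = ['rs58991260', 'rs2500']
positions_df = rsidsToHG37Positions(example_rsids)
print(positions_df)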
def refresh_myvariant_data(self):
    vars_by_hgvs = {v.b37_hgvs_id: v for v in self.variants.all()}
    mv = myvariant.MyVariantInfo()
    mv_data = mv.getvariants(list(vars_by_hgvs.keys()),
                             fields=['clinvar', 'dbsnp', 'exac'])
    for var_data in mv_data:
        if '_id' not in var_data:
            variant = vars_by_hgvs[var_data['query']]
            variant.myvariant_clinvar = {}
            variant.myvariant_exac = {}
            variant.myvariant_dbsnp = {}
            variant.save()
            continue
        variant = vars_by_hgvs[var_data['_id']]
        try:
            clinvar_data = var_data['clinvar']
            # Always store 'rcv' as a list - makes downstream code much easier.
            if not isinstance(clinvar_data['rcv'], list):
                clinvar_data['rcv'] = [clinvar_data['rcv']]
            variant.myvariant_clinvar = var_data['clinvar']
        except KeyError:
            variant.myvariant_clinvar = {}
        try:
            variant.myvariant_exac = var_data['exac']
        except KeyError:
            variant.myvariant_exac = {}
        try:
            variant.myvariant_dbsnp = var_data['dbsnp']
        except KeyError:
            variant.myvariant_dbsnp = {}
        variant.myvariant_last_update = django_timezone.now()
        variant.save()
def annotate(listHGVS):
    """Accept a list of HGVS IDs and return annotation data for each variant.

    Parameters:
        listHGVS: list of HGVS IDs to retrieve annotations for
    Return:
        list of dictionaries containing the JSON data from myvariant.info
    """
    listmvi = []
    mv = myvariant.MyVariantInfo()
    # For each HGVS ID in the list, retrieve the annotation
    for index, idHGVS in enumerate(listHGVS):
        listmvi.append(
            mv.getvariant(idHGVS,
                          fields=[
                              'dbsnp.rsid',
                              'dbsnp.alleles',
                              'dbsnp.vartype',
                              'dbsnp.gene',
                              'clinvar',
                              'gnomad_genome.af',
                              'gnomad_exome.af',
                              'dbnsfp.ensembl',
                              'dbnsfp.uniprot',
                              'dbnsfp.polyphen2',
                              'dbnsfp.sift',
                              'dbnsfp.provean',
                          ]))
        # Progress tracker
        if index % 100 == 0:
            print(str(index) + ' out of ' + str(len(listHGVS)) + ' written...')
    return listmvi
def getHG37PositionsInRange(chromosome, startPos, endPos):
    """Return a DataFrame containing hg19/GRCh37 positions for all rsids in a range.

    args:
        chromosome (int or str): the chromosome number
        startPos (int or str): the start position on the chromosome
        endPos (int or str): the end position on the chromosome

    returns:
        df (DataFrame): all the rsids found in the genomic range between
            startPos and endPos, indexed by rsid
        chromosome (int or str): the chromosome number
    """
    queryString = f'chr{chromosome}:{startPos}-{endPos}'
    mv = myvariant.MyVariantInfo()
    # MyVariant.info expects 'hg19' or 'hg38' for the assembly (GRCh37 == hg19);
    # 'scopes' only applies to querymany, so it is not passed here.
    gen = mv.query(queryString,
                   fields='dbsnp.rsid,dbsnp.hg19.start',
                   fetch_all=True,
                   assembly='hg19')
    rsids = {}
    for row in gen:
        try:
            rsid = row['dbsnp']['rsid']
            start = row['dbsnp']['hg19']['start']
            rsids[rsid] = start
        except KeyError:
            continue
    df = pd.DataFrame.from_dict(rsids, orient='index')
    return df, chromosome
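# Hedged usage sketch for getHG37PositionsInRange: query a small window on
# chromosome 1 and unpack the (DataFrame, chromosome) tuple it returns. The
# coordinates are illustrative only and `myvariant` / `pandas as pd` are assumed
# to be imported at module level.
df_region, chrom = getHG37PositionsInRange(1, 861000, 871000)
print(df_region.head())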
def test_func(num1, num2):
    mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1')
    demo = query('23andme_large.txt', 'time_record.txt', '23andme', 'csv',
                 19, 'None', num1, num2)
    result = demo.genequery()
    return result
def rsid2bed(rsid_file, OutDir):
    window_size = 1000
    mv = myvariant.MyVariantInfo()
    rsid_file = open(rsid_file, 'r')
    bed_file = open(os.path.join(OutDir, 'tmp.bed'), 'w')
    num_input = 0
    num_out = 0
    for line in rsid_file:
        num_input += 1
        rsid = line.strip()
        if rsid[:2].lower() != 'rs':
            print("Error: please input a valid rsid")
            continue
        info = mv.query(rsid, assembly='hg38')
        if len(info['hits']) == 0:
            continue
        chrom = info['hits'][0]['chrom']
        pos = info['hits'][0]['vcf']['position']
        ref = info['hits'][0]['vcf']['ref']
        alt = info['hits'][0]['vcf']['alt']
        id = rsid.lower()
        begin = int(int(pos) - window_size / 2)
        end = int(int(pos) + window_size / 2)
        bed_file.write('chr' + chrom + '\t' + str(begin) + '\t' + str(end) +
                       '\t' + id + ';' + ref + ';' + alt + ';' + chrom + ';' +
                       str(pos) + '\n')
        num_out += 1
    rsid_file.close()
    bed_file.close()
    return num_input, num_out
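# Possible call pattern for rsid2bed, assuming a plain-text file with one rsid
# per line and an existing output directory; both paths are made-up examples.
n_in, n_out = rsid2bed('rsids.txt', '/tmp')
print('{} rsids read, {} BED records written'.format(n_in, n_out))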
def hvgs_ids(self):
    """The HGVS IDs for this SNP location, as reported by myvariant.info."""
    if not hasattr(self, '_hvgs_ids'):
        mv = myvariant.MyVariantInfo()
        self._hvgs_ids = [
            i['_id'] for i in mv.query(self.snp_loc, fields='_id')['hits']
        ]
    return self._hvgs_ids
def annotate_vcf_file(self):
    '''
    - Annotate the VCF file using the following example code (for 1 variant)
    - Iterate over the variants (use first 900)
    - Store the result in a data structure
    :return:
    '''
    print("TODO")
    ##
    ## Example loop
    ##
    ## Build the connection
    h = httplib2.Http()
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    params_pos = []  # List of variant positions
    with open(self.vcf_path) as my_vcf_fh:
        vcf_reader = vcf.Reader(my_vcf_fh)
        for counter, record in enumerate(vcf_reader):
            params_pos.append(record.CHROM + ":g." + str(record.POS) +
                              record.REF + ">" + str(record.ALT[0]))
            if counter >= 899:
                break
    ## Build the parameters using the list we just built
    params = 'ids=' + ",".join(params_pos) + '&hg38=true'
    ## Perform annotation
    res, con = h.request('http://myvariant.info/v1/variant', 'POST', params,
                         headers=headers)
    annotation_result = con.decode('utf-8')
    # Alternative way with the myvariant package (the plain HTTP request returns
    # a string, not a list/dict). Pass the list of HGVS IDs rather than the
    # URL-encoded payload; assembly='hg38' mirrors the hg38=true parameter above.
    mv = myvariant.MyVariantInfo()
    annotation_result = mv.getvariants(params_pos, assembly='hg38')
    ## TODO now do something with the 'annotation_result'
    reslist = []
    for result in annotation_result:
        # Keep every result that was actually found.
        if not result.get('notfound'):
            reslist.append(result)
    ##
    ## End example code
    ##
    return reslist  ## return the data structure here
def getSNPannot(ids):
    mv = myvariant.MyVariantInfo()
    df = mv.querymany(
        ids,
        scopes='cosmic.cosmic_id,dbsnp.rsid',
        fields='_id,clinvar.gene.symbol,dbnsfp.aa.ref,dbnsfp.aa.pos,'
               'dbnsfp.aa.alt,dbnsfp.clinvar.trait',
        as_dataframe=True)
    return df
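# Sketch of calling getSNPannot with a mix of dbSNP and COSMIC identifiers,
# matching the two scopes used above. The IDs are illustrative placeholders; the
# result is a pandas DataFrame because as_dataframe=True is set in the function.
annot_df = getSNPannot(['rs58991260', 'COSM1061'])
print(annot_df.columns)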
def getvariant(chromosome, start, ref, var):
    # Create a MyVariantInfo client instance
    mv = myvariant.MyVariantInfo()
    # Build the HGVS ID for chromosome, int(start), ref, var
    v = myvariant.format_hgvs(chromosome, int(start), ref, var)
    dir_ = mv.getvariant(v)
    # Return the variant annotation found across all databases as a dictionary
    return dir_
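# Example of the getvariant wrapper above, built around myvariant.format_hgvs.
# The coordinates correspond to the chr1:g.35367G>A example that appears
# elsewhere in these snippets; `import myvariant` at module level is assumed.
annotation = getvariant('1', 35367, 'G', 'A')
print(annotation.keys() if annotation else 'variant not found')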
def setUp(self):
    self.mv = myvariant.MyVariantInfo()
    self.query_list1 = [
        'chr1:g.866422C>T', 'chr1:g.876664G>A', 'chr1:g.69635G>C',
        'chr1:g.69869T>A', 'chr1:g.881918G>A', 'chr1:g.865625G>A',
        'chr1:g.69892T>C', 'chr1:g.879381C>T', 'chr1:g.878330C>G'
    ]
    self.query_list2 = [
        'rs374802787', 'rs1433078', 'rs1433115', 'rs377266517',
        'rs587640013', 'rs137857980', 'rs199710579', 'rs186823979',
        'rs2276240', 'rs372452565'
    ]
def get_mv_data(chrom, pos, ref_allele, var_allele):
    hgvs_format = myvariant.format_hgvs(get_chrom_display(chrom), pos,
                                        ref_allele, var_allele)
    mv = myvariant.MyVariantInfo()
    mv_data = mv.getvariant(hgvs_format, fields=['clinvar', 'dbsnp', 'exac'])
    # Always store clinvar 'rcv' as a list so downstream code can iterate over it.
    if mv_data and 'clinvar' in mv_data and 'rcv' in mv_data['clinvar']:
        if not isinstance(mv_data['clinvar']['rcv'], list):
            mv_data['clinvar']['rcv'] = [mv_data['clinvar']['rcv']]
    if mv_data:
        allele_freq, freq_url = get_allele_freq(mv_data, var_allele)
    else:
        allele_freq, freq_url = None, None
    return hgvs_format, mv_data, allele_freq, freq_url
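# Quick sketch of how get_mv_data might be called for a single VCF-style record.
# The chromosome/position/alleles are placeholders, and get_chrom_display /
# get_allele_freq are helpers assumed to exist elsewhere in this module.
hgvs_id, mv_data, allele_freq, freq_url = get_mv_data('1', 35367, 'G', 'A')
print(hgvs_id, allele_freq, freq_url)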
def get_dict_myvariant(self, variant_list):
    """Place the queries on the myvariant.info servers.

    :param variant_list: list of HGVS variant IDs. Usually retrieved beforehand
        using the method get_variants_from_vcf from the class VariantParsing.
    :return: list of dictionaries. Each dictionary contains data about a single variant.
    """
    mv = myvariant.MyVariantInfo()
    # This will retrieve a list of dictionaries
    variant_data = mv.getvariants(variant_list, as_dataframe=False)
    variant_data = self.remove_id_key(variant_data)
    return variant_data
def _get_myvariantinfo_annotations_dict(hgvs_ids_list, genome_build_version,
                                        verbose_level, num_failed_attempts=0):
    """Retrieve variant annotations from MyVariant.info."""
    max_failed_attempts = 5
    myvariant_fields = [
        'cadd.1000g', 'cadd.esp', 'cadd.phred', 'cadd.gerp', 'cadd.polyphen',
        'cadd.sift', 'dbsnp.rsid', 'cosmic.cosmic_id', 'cosmic.tumor_site',
        'clinvar.rcv.accession', 'clinvar.rcv.clinical_significance',
        'clinvar.rcv.conditions', 'civic.description', 'civic.evidence_items',
        'cgi', 'gwassnps', 'wellderly.alleles'
    ]
    be_verbose = verbose_level >= 2
    mv = myvariant.MyVariantInfo()
    try:
        myvariantinfo_dicts_list = mv.getvariants(hgvs_ids_list,
                                                  verbose=int(be_verbose),
                                                  as_dataframe=False,
                                                  fields=myvariant_fields,
                                                  assembly=genome_build_version)
    except ValueError as unrecoverable_error:
        # If myvariant.info returned a ValueError, recalling with the same values
        # won't help, so error out now.
        raise unrecoverable_error
    except Exception as error:
        # If we got something other than a ValueError, the problem may be with the
        # internet connection or myvariant.info availability, so try again a couple
        # of times in case we can recover.
        logging.info('Error: ' + str(error) + ' while fetching from MyVariant')
        num_failed_attempts += 1
        if num_failed_attempts < max_failed_attempts:
            time.sleep(5)
            logging.info("Retrying MyVariant.info fetch")
            myvariantinfo_dicts_list = _get_myvariantinfo_annotations_dict(
                hgvs_ids_list, genome_build_version, verbose_level,
                num_failed_attempts)
        else:
            # Give up and re-raise the error.
            raise error
    myvariantinfo_dicts_list = _remove_unwanted_keys(myvariantinfo_dicts_list)
    return myvariantinfo_dicts_list
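# Hedged example of calling _get_myvariantinfo_annotations_dict with a couple of
# HGVS IDs. The IDs and the 'hg19' build are illustrative, and the logging/time
# imports plus the _remove_unwanted_keys helper are assumed to be available as in
# the function above.
annotations = _get_myvariantinfo_annotations_dict(
    ['chr1:g.35367G>A', 'chr7:g.55241707G>T'], 'hg19', verbose_level=1)
print(len(annotations))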
def get_variant_info(snp_list, fields='dbsnp', pandas=True):
    """Get variant info for a list of SNPs.

    Args:
        snp_list: A list of SNP objects or SNP rsIDs
        fields:   Choose fields to display from:
                  `<docs.myvariant.info/en/latest/doc/data.html#available-fields>`_
                  Good choices are 'dbsnp', 'clinvar', or 'gwassnps'.
                  Can also use 'grasp' to get a different version of this info.
        pandas:   Return a dataframe instead of a dictionary.

    Returns:
        A dictionary or a dataframe.
    """
    mv = _mv.MyVariantInfo()
    if isinstance(snp_list, _pd.DataFrame):
        try:
            snps = list(snp_list.study_snpid)
        except AttributeError:
            snps = list(snp_list.index)
    elif isinstance(snp_list[0], t.SNP):
        snps = [i.study_snpid for i in snp_list]
    else:
        snps = snp_list
    assert isinstance(snps, (list, tuple))
    dfs = []
    for q in _chunks(snps, 999):
        dfs.append(
            mv.querymany(q, scopes='dbsnp.rsid', fields=fields,
                         as_dataframe=pandas, df_index=True))
        if len(snps) > 999:
            _sleep(2)
    if pandas:
        return _pd.concat(dfs)
    else:
        if len(dfs) > 1:
            return dfs
        else:
            return dfs[0]
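# Sketch of the two return modes of get_variant_info: a DataFrame when
# pandas=True and a dict-style result otherwise. The rsids are examples and the
# module-level aliases (_mv, _pd, t, _chunks, _sleep) are assumed as above.
rsid_list = ['rs58991260', 'rs2500']
df = get_variant_info(rsid_list, fields='dbsnp', pandas=True)
raw = get_variant_info(rsid_list, fields='dbsnp', pandas=False)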
def grabVariant(chrNumAndPosition):
    # Example input: 'chr7:g.117589482A>G'
    geneDict, overallDict, protein_result = None, None, None
    # Build the myvariant client object
    mv = myvariant.MyVariantInfo()
    # Query once and select the dbSNP dictionary from the overall dictionary
    if mv:
        try:
            overallDict = mv.getvariant(chrNumAndPosition, assembly='hg38')
            dbsnpDict = overallDict['dbsnp']
            if dbsnpDict:
                # Gather gene information from the dbSNP dictionary
                geneDict = dbsnpDict['gene']
                protein_result = findCommonProteinName(overallDict)
                print('Search Query Successful')
        except TypeError:
            print('Search Query Not Found')
    return geneDict, overallDict, protein_result
def get_variant_info(self, fields="dbsnp", pandas=True): """Use the myvariant API to get info about this SNP. Note that this service can be very slow. It will be faster to query multiple SNPs. Args: fields: Choose fields to display from: `docs.myvariant.info/en/latest/doc/data.html#available-fields`_ Good choices are 'dbsnp', 'clinvar', or 'gwassnps' Can also use 'grasp' to get a different version of this info. pandas: Return a dataframe instead of dictionary. Returns: A dictionary or a dataframe. """ mv = myvariant.MyVariantInfo() return mv.getvariants(self.hvgs_ids, fields=fields, as_dataframe=pandas, df_index=True)
def additional_annotation(request, variant_sample_pk):
    variant_sample = get_object_or_404(VariantSample, pk=variant_sample_pk)
    variant = variant_sample.variant
    chromosome = variant.chromosome[3:]
    position = variant.position
    ref = variant.ref
    alt = variant.alt
    mv = myvariant.MyVariantInfo()
    q = ('chrom:' + chromosome + ' AND vcf.position:' + str(position) +
         ' AND vcf.ref:' + ref + ' AND vcf.alt:' + alt)
    data = mv.query(q)
    return JsonResponse(data, safe=False)
def map_23andme_clinvar(user_data, conn):
    print('Mapping user 23andMe data with ClinVar...')
    c = conn.cursor()
    mv = myvariant.MyVariantInfo()
    # Cache MyVariantInfo requests
    mv.set_caching('./myvariant_cache', verbose=False)
    # Can definitely be optimized to reduce database or HTTP requests.
    for user in user_data:
        parsed = parse_user_vcf_data(user)
        mapped = []
        print('Mapping ' + user['user']['username'] + ' data to ClinVar...')
        print(len(parsed))
        for var in parsed:
            c.execute('''
                SELECT * FROM clinvar
                WHERE (chrom=? AND pos=? AND alt LIKE ?) OR (id=?)
                ''', (CHROM_INDEX[str(var['chrom'])], int(var['pos']),
                      var['alt_allele'], var['id']))
            for result in c.fetchall():
                var['clinvar_data'] = json.loads(result[-1])
            var['gennotes_id'] = None
            var['gennotes_data'] = None
            var['hgvs_id'] = None
            var['mv_data'] = None
            if var['alt_allele'] != '.':
                var['gennotes_id'] = _gennotes_id(var['chrom'], var['pos'],
                                                  var['ref_allele'],
                                                  var['alt_allele'])
                results = requests.get(
                    'https://gennotes.herokuapp.com/api/variant/',
                    params={'variant_list': json.dumps([var['gennotes_id']])})
                var['gennotes_data'] = results.json()
                var['hgvs_id'] = _hgvs_id(var['chrom'], var['pos'],
                                          var['ref_allele'], var['alt_allele'])
                try:
                    mv_data = mv.getvariant(var['hgvs_id'],
                                            fields=['clinvar', 'dbsnp', 'exac'],
                                            verbose=False)
                    var['mv_data'] = mv_data
                except Exception as e:
                    print(var['alt_allele'])
                    print(e)
            mapped.append(var)
        with open('mapped_user_vcf/' + user['local_filename'] + '.json', 'w') as f:
            json.dump(mapped, f, indent=4)
    c.close()
def get_dbsnp_obj(variant):
    obj_array = []
    for obj in variant.variation_objects:
        if obj["object_type"] in [
                "reference snp id", "reference snp identifier",
                "reference snp object", "rs", "rs id", "rs identifier",
                "rs object", "rs number", "rsid"
        ]:
            obj_array.append(obj["object"])
    if obj_array:
        return obj_array
    mv = myvariant.MyVariantInfo()
    for rs_id in get_rs_number(variant):
        query_string = "dbsnp.rsid:" + rs_id
        result = mv.query(query_string, fields='dbsnp')
        gnomics.objects.variation.Variation.add_object(variant, obj=result,
                                                       object_type="dbSNP")
        obj_array.append(result)
    return obj_array
def get_snp(self):
    """Get all known SNPs (via the myvariant library) in the target sequence.

    SNPs are collected because a SNP must not fall inside the primer
    sequences. An internet connection is required.

    :return: a list of SNPs such as [[id, position, gmaf], [...], ...]
    """
    snp_info = []
    mv = myvariant.MyVariantInfo()
    res = mv.query(self.no_chromosome + ":" +
                   str(int(self.mutation_pos) - self.range) + "-" +
                   str(int(self.mutation_pos) + self.range),
                   fields='dbsnp', size=1000)
    for element in res["hits"]:
        if "dbsnp" in element:
            if "gmaf" in element["dbsnp"]:
                snp_info.append([
                    element["dbsnp"]["rsid"],
                    element["dbsnp"]["hg19"]["start"],
                    element["dbsnp"]["gmaf"]
                ])
    return snp_info
def get_info(gene_symbol, hg, database='hg19'): ##db_index = input(""" ##Select database: ##0: hg38 ##1: hg19 ##""") ## ##database = ['hg38','hg19'][db_index] conn = pymysql.connect(host='genome-mysql.cse.ucsc.edu', user='******', password='******', db=database) cur = conn.cursor() #gene_symbol = raw_input("Please enter the gene symbol: ") gene_symbol = gene_symbol.upper() statement1 = "select name from refGene where name2 = '%s'" % gene_symbol cur.execute(statement1) temp = cur.fetchall() fuzzy = False if len(temp[0]) == 0: fuzzy = True temp = [] i = 1 while i < len(gene_symbol): temp1 = gene_symbol[0:len(gene_symbol) - i] + "_" + gene_symbol[len(gene_symbol) - i + 1:] test = "select name2 from refGene where name2 like '%s'" % temp1 cur.execute(test) holder = cur.fetchall() for item in holder: if len(item) > 1: for subitem in item: print subitem temp += [subitem[0]] else: temp += [item[0]] i += 1 if len(temp[0]) == 0: i = 3 while i < len(gene_symbol): temp1 = gene_symbol[0:i] + "%" test = "select name2 from refGene where name2 like '%s'" % temp1 cur.execute(test) holder = cur.fetchall() temp += [holder] i += 1 genes = [] for item in temp: if item not in genes: genes += [item] if fuzzy: print "Did you mean?:" for index, item in enumerate(genes): print index, ":", item print "none : none" gene_index = raw_input("Enter your choice: ") try: gene_index = int(gene_index) statement1 = "select name2 from refGene where name2 = '%s'" % genes[ gene_index] cur.execute(statement1) temp = cur.fetchall() except: temp = [] i = 0 while i < len(gene_symbol) + 1: temp1 = gene_symbol[0:i + 2] + "%" test = "select name2 from refGene where name2 like '%s'" % temp1 cur.execute(test) holder = cur.fetchall() temp += [holder] i += 1 genes = [] for item in temp[0]: if item[0] not in genes: genes += [item[0]] print "Did you mean?:" for index, item in enumerate(genes): print index, ":", item print "none : none" gene_index = raw_input("Enter your choice: ") try: gene_index = int(gene_index) statement1 = "select name from refGene where name2 = '%s'" % genes[ gene_index] cur.execute(statement1) temp = cur.fetchall() except: print "Damn" transcripts = [] for index, item in enumerate(temp): transcripts += [item[0]] if len(transcripts) > 1: print index, ": ", item[0] trans_choice = input( "Please select a transcript from the list above: ") else: trans_choice = 0 transcript = transcripts[trans_choice] statement2 = "select * from refGene where name = '%s'" % transcript cur.execute(statement2) info = cur.fetchall() info = info[0] txstart = info[4] txend = info[5] cdstart = info[6] cdend = info[7] chrom = info[2] exonstarts = info[9].split(",") exonends = info[10].split(",") while '' in exonstarts: exonstarts.remove('') while '' in exonends: exonends.remove('') strand = info[3] if strand == "-": temp = exonstarts temp2 = exonends exonstarts = temp2[::-1] exonends = temp[::-1] temp = txstart temp2 = txend txstart = temp2 txend = temp temp = cdstart temp2 = cdend cdstart = temp2 cdend = temp mv = myvariant.MyVariantInfo() #hg=raw_input("Enter variant: ") #recreate coding mapping es = exon_mapping(cdstart, txstart, cdend, exonstarts, exonends, strand) genomic = get_gen(hg, es, exonends, chrom, strand) query = "'%s' AND %s" % (hg, gene_symbol) cv = mv.query(query) hits = [] for item in cv['hits']: if item['_id'] == genomic: hits += [item] return hits, genomic
class query: # start mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1') ''' parameters = input('Enter parameters :') filetype = input('file type :') hgversion = #input('NCBI snp version (eg. :38): ') lo = hgVersionJudge(hgversion) ''' def __init__(self, fileName, outputfile_name, type, outputtype, version=19, fieldKeyWords='None', lineBegin=-1, lineEnd=math.inf): self.parameters = fileName self.outputfile_name = outputfile_name self.hgversion = version self.filetype = type self.outputtype = outputtype self.lo = self.hgVersionJudge(version) self.fieldKeyWords = fieldKeyWords self.lineBegin = lineBegin self.lineEnd = lineEnd def infosearch(self, line): p = re.compile('\bthis\b') print(p.search('no class at all')) print(re.search(line)) def hgVersionJudge(self, nowVersion): if (int(nowVersion) != 19): strs = 'hg' + str(nowVersion) lo = LiftOver(strs, 'hg19') return lo else: return 0 def whole_genomeProcessor(self, wordslist, hgVersionNow, lo): if (len(wordslist) < 8): print(len(wordslist)) return 'no' chromosome = wordslist[3] position = wordslist[5] vartype = wordslist[6] originalBase = wordslist[7] postBase = wordslist[8] if (vartype != 'snp'): return 'no' chro = chromosome print("mark" + chro) print(position) print(chromosome) queryinfos = (chromosome + ':g.' + position + originalBase + '>' + postBase) return queryinfos # def hgVersion_ChrPosConvert(self,lo, hgVersionNow,chro,position): # return def AncestryAndmeProcessor(self, wordslist, hgVersionNow, lo): rsid = wordslist[0] # chromosome = wordslist[1] # position = wordslist[2] # genotype = wordslist[3] # originalBase = genotype[0] # postBase = genotype[1] queryinfom = (rsid) return queryinfom def AncestryAndmeProcessor_vcf_title(self, wordslist, hgVersionNow, lo): # CHROM POS ID REF ALT QUAL FILTER INFO rsid = wordslist[0] chromosome = wordslist[1] position = wordslist[2] genotype = wordslist[3] originalBase = genotype[0] postBase = genotype[1] return (chromosome + '\t' + position + '\t' + rsid + '\t' + originalBase + '\t' + postBase) def vcfFileProcessor(self, wordslist, hgVersionNow, lo): # rsid = wordslist[0] chromosome = wordslist[0] position = wordslist[1] # genotype = wordslist[3] originalBase = wordslist[3] postBase = wordslist[4] chro = "chr" + chromosome print("mark" + chro) print(position) convert = [] position = int(position) if (hgVersionNow == 19): convert = lo.convert_coordinate(chro, position) print(convert) resultt = str(convert[0]) ss = resultt.split(",") cc = ss[0] dd = [] dd = cc.split("'") chromosome = dd[1] print(chromosome) position = ss[1] position = position.strip() print(ss) position = str(position) queryinfo = 'chr' + chromosome + ':g.' + position + originalBase + '>' + postBase return queryinfo def expansion(self, dict1, dict0, key1): #print(type(dict1)) if (isinstance(dict1, dict)): for key2 in dict1.keys(): #print('keys: ' + key2) if (key1 != ''): key3 = str(key1) + '.' 
+ str(key2) else: key3 = str(key2) self.expansion(dict1.get(key2), dict0, key3) elif (isinstance(dict1, list)): for item in dict1: self.expansion(item, dict0, key1) else: dict0[key1] = str(dict1) def genequery(self): # get the chrosome, position and genotype and put them into the things suitable for mv commend # print the information in the file # to open file #print('line133 success') file = open(self.parameters, "r") fileb = open(self.outputfile_name, "a", encoding='utf-8') count = 0 lineNumber = 0 nonrs = 0 title = {} outputs = [] queryinfo_list = [] vcfinfo_list = [] # print("\ntest end, real begin") if (self.outputtype != 'csv'): fileb.write( '##fileformat=VCFv4.1 \n##fileDate=' + str(datetime.datetime.now()) + '\n##version=hg19 \n##CHROM POS ID REF ALT QUAL FILTER INFO') for lines in file: lineNumber = lineNumber + 1 if (lines[0] != '#' and lines[0] != '>' and lineNumber >= self.lineBegin and lineNumber <= self.lineEnd): count = count + 1 wordslist = [] words = lines.split("\t") queryinfo = '' output = '' result = '' for item in words: wordslist.append(item) if (len(wordslist) >= 1): if (self.filetype == 'vcf'): queryinfo = self.vcfFileProcessor( wordslist, self.hgversion, self.lo) if (self.fieldKeyWords == 'None'): print('140 : queryInfo' + self.filetype) print('141: ' + queryinfo) result = self.mv.getvariant(queryinfo) else: result = self.mv.getvariant( queryinfo, fields=self.fieldKeyWords) elif (self.filetype == '23andme' or self.filetype == 'ancestry'): queryinfo = self.AncestryAndmeProcessor( wordslist, self.hgversion, self.lo) #print(lineNumber) queryinfo_list.append(queryinfo) if (self.outputtype == 'vcf'): titleinfo = self.AncestryAndmeProcessor_vcf_title( wordslist, self.hgversion, self.lo) vcfinfo_list.append(titleinfo) elif (self.filetype == 'whole_genome'): queryinfo = self.whole_genomeProcessor( wordslist, self.hgversion, self.lo) if (self.fieldKeyWords == 'None'): result = self.mv.getvariant(queryinfo) else: result = self.mv.getvariant( queryinfo, fields=self.fieldKeyWords) else: print( 'possible type : vcf, 23andme, ancestry, whole_genome' ) break else: continue if (queryinfo == 'no'): continue #print(queryinfo) #got a query list but no result #if (count > 20): #break elif (count >= 0): nonrs = nonrs + 1 #if (self.outputtype != 'csv' and (lines[0] == '#' or lines[0] == '>')): #fileb.write(lines) #for loop end here return queryinfo_list if (self.outputtype == 'vcf'): count = 0 dul_count = 0 test_out = '' results = self.mv.querymany(queryinfo_list, 'dbsnp.rsid', returnall=True) for result in results['out']: # print('210',result) # result = result['hits'][0] test_output = {} if (True): self.expansion(result, test_output, '') strall = '' for key in test_output.keys(): strall = strall + str(key) + ' : ' + test_output.get( key) + ' ' test_out = vcfinfo_list[count] + strall + '\n' fileb.write(test_out) for keys in test_output.keys(): if (test_output.get(keys, 'a') != 'a'): title[keys] = 1 # detect whether there is dup in dup[] list if (dul_count == 0): for item in results['dup']: # print('223',queryinfo_list[count]) # print(item) if queryinfo_list[count] in item: dul_count = item[1] - 1 else: dul_count = dul_count - 1 if (dul_count == 0): count = count + 1 #count = 0 #dul_count = 0 #results = self.mv.querymany(queryinfo_list, 'dbsnp.rsid', returnall=True) #print(str(lines) + '\t' + str(result)) #fileb.write(str(output)+ '\t' + str(result) + '\n' if (self.outputtype == 'csv'): count = 0 dul_count = 0 start_time = time.time() results = self.mv.querymany(queryinfo_list, 
'dbsnp.rsid', returnall=True) print("--- time " + "%s sconds ---" % (time.time() - start_time)) fileb.write("%s \n" % (time.time() - start_time)) for result in results['out']: #print('210',result) #result = result['hits'][0] if (result != ""): test_output = {'info': queryinfo_list[count]} self.expansion(result, test_output, '') #print(type(test_output)) #print(type(outputs)) outputs.append(test_output) for keys in test_output.keys(): if (test_output.get(keys, 'a') != 'a'): title[keys] = 1 # detect whether there is dup in dup[] list if (dul_count == 0): for item in results['dup']: #print('223',queryinfo_list[count]) #print(item) if queryinfo_list[count] in item: dul_count = item[1] - 1 else: dul_count = dul_count - 1 if (dul_count == 0): count = count + 1 if (self.outputtype == 'csv'): #print('197') all_keys = title.keys() #print('all keys : ' , all_keys) #dict_writer = csv.DictWriter(fileb, title, restval='Nan',) #dict_writer.writeheader() #dict_writer.writerows(outputs) #print("line number is: ", lineNumber) #print("identiable line number is: ", count) #print("non-identiable line number is: ", nonrs) # to close the file file.close() fileb.close()
#print("identiable line number is: ", count) #print("non-identiable line number is: ", nonrs) # to close the file file.close() fileb.close() if __name__ == '__main__': import myvariant import time from multiprocessing import Pool p = Pool(2) listall = [] listunit = [] mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1') demo = query('23andme_small.txt', 'time_record.txt', '23andme', 'csv', 19, 'None', 29, 1000) list1 = demo.genequery() print(len(list1)) count = 0 for item in list1: if count < 1000: listunit.append(item) count = count + 1 else: count = 0 listall.append(listunit) listunit = [] #print(listall)
def get_dbsnp(data, region, force=False):
    mv = myvariant.MyVariantInfo()
    q = mv.query(
        '_exists_:dbsnp AND _exists_:hg19 AND {}:{}-{}'.format(*region),
        fields='dbsnp',
        fetch_all=True)
    snps = list(q)

    # VCF, dbSNP and MyVariant use 1-based indexing
    dbsnp = collections.defaultdict(dict)
    for snp in snps:
        pos, ref, alt, rs = (snp['dbsnp']['hg19']['start'] - 1,
                             snp['dbsnp']['ref'], snp['dbsnp']['alt'],
                             snp['dbsnp']['rsid'])
        if len(ref) > 1 or len(alt) > 1:
            assert ref[0] == alt[0]
        if len(ref) > 1:
            op = 'DEL.{}'.format(ref[1:])
        elif len(alt) > 1:
            op = 'INS.{}'.format(alt[1:].lower())
        else:
            op = 'SNP.{}{}'.format(ref, alt)
        dbsnp[pos][op] = rs

    mutations = {}
    for a in sorted(data):
        for m in data[a]['mutations']:
            if m['pos'] == 'pseudogene':
                continue
            if m['dbsnp'] not in ['', '*']:
                m['dbsnp'] = [m['dbsnp']]
            else:
                m['dbsnp'] = []
            pos, op = m['pos'], m['op']
            # Check the variant as-is, then check the reversed SNP
            if op in dbsnp[pos]:
                rsid = str(dbsnp[pos][op])
                if rsid not in m['dbsnp']:
                    if len(m['dbsnp']) > 0:
                        m['dbsnp'][0] += '(k)'
                    m['dbsnp'].append(rsid)
                    log.debug('dbSNP: Variant {} assigned to {}:{}',
                              rsid, pos, op)
                else:
                    log.debug("dbSNP: Variant {} matches the Karolinska's prediction",
                              rsid)
            elif len(dbsnp[pos]) > 0 and (op[:3] == 'SNP' and
                                          op[:4] + op[4:6][::-1] in dbsnp[pos]):
                op = op[:4] + op[4:6][::-1]
                rsid = str(dbsnp[pos][op])
                if rsid not in m['dbsnp']:
                    if len(m['dbsnp']) > 0:
                        m['dbsnp'][0] += '(k)'
                    m['dbsnp'].append(rsid)
                    log.debug('dbSNP: Variant {} assigned to {}:{}',
                              rsid, pos, op)
                else:
                    log.debug("dbSNP: Variant {} matches the Karolinska's prediction",
                              rsid)
            elif len(dbsnp[pos]) != 0:
                log.trace('How about {} for {}:{} ({})',
                          dbsnp[pos], pos, op, m['old'])
    return data
class query: # start mv = myvariant.MyVariantInfo(url='http://myvariant.info/v1') ''' parameters = input('Enter parameters :') filetype = input('file type :') hgversion = #input('NCBI snp version (eg. :38): ') lo = hgVersionJudge(hgversion) ''' def __init__(self,fileName, outputfile_name, type, outputtype, version = 19, fieldKeyWords = 'None', lineBegin = -1, lineEnd = math.inf): self.parameters = fileName self.outputfile_name = outputfile_name self.hgversion = version self.filetype = type self.outputtype = outputtype self.lo = self.hgVersionJudge(version) self.fieldKeyWords = fieldKeyWords self.lineBegin = lineBegin self.lineEnd = lineEnd def infosearch(self,line): p = re.compile('\bthis\b') print(p.search('no class at all')) print(re.search(line)) def hgVersionJudge(self, nowVersion): if (int(nowVersion) != 19): strs = 'hg' + str(nowVersion) lo = LiftOver(strs, 'hg19') return lo else: return 0 def whole_genomeProcessor(self, wordslist, hgVersionNow, lo): if (len(wordslist) < 8): print(len(wordslist)) return 'no' chromosome = wordslist[3] position = wordslist[5] vartype = wordslist[6] originalBase = wordslist[7] postBase = wordslist[8] if (vartype != 'snp'): return 'no' chro = chromosome print("mark" + chro) print(position) print(chromosome) queryinfos = (chromosome + ':g.' + position + originalBase + '>' + postBase) return queryinfos # def hgVersion_ChrPosConvert(self,lo, hgVersionNow,chro,position): # return def AncestryAndmeProcessor(self,wordslist, hgVersionNow, lo): rsid = wordslist[0] # chromosome = wordslist[1] # position = wordslist[2] # genotype = wordslist[3] # originalBase = genotype[0] # postBase = genotype[1] queryinfom = (rsid) return queryinfom def vcfFileProcessor(self, wordslist, hgVersionNow, lo): # rsid = wordslist[0] chromosome = wordslist[0] position = wordslist[1] # genotype = wordslist[3] originalBase = wordslist[3] postBase = wordslist[4] chro = "chr" + chromosome print("mark" + chro) print(position) convert = [] position = int(position) if (hgVersionNow == 19): convert = lo.convert_coordinate(chro, position) print(convert) resultt = str(convert[0]) ss = resultt.split(",") cc = ss[0] dd = [] dd = cc.split("'") chromosome = dd[1] print(chromosome) position = ss[1] position = position.strip() print(ss) position = str(position) queryinfo = 'chr' + chromosome + ':g.' + position + originalBase + '>' + postBase return queryinfo def expansion(self, dict1, dict0, key1): #print(type(dict1)) if (isinstance(dict1, dict)): for key2 in dict1.keys(): #print('keys: ' + key2) if (key1 != ''): key3 = str(key1) + '.' 
+ str(key2) else: key3 = str(key2) self.expansion(dict1.get(key2), dict0, key3) else: dict0[key1] = str(dict1) def queries(self, queryinfo,wordslist): if (self.fieldKeyWords == 'None'): # print('130: ' + queryinfo) result_collection = self.mv.query(queryinfo) else: # print('133: ' + queryinfo + ',fields=' + self.fieldKeyWords) result = self.mv.query(queryinfo, fields=self.fieldKeyWords) if (self.outputtype == 'vcf'): output = wordslist[1] + '\t' + wordslist[2] + '\t' + wordslist[0] + '\t' + wordslist[3][0] + '\t' + \ wordslist[3][1] + '\t.\t.\t' elif (self.outputtype == 'csv'): print(result) result = result['hits'][0] def genequery(self): # get the chrosome, position and genotype and put them into the things suitable for mv commend # print the information in the file # to open file result = '' file = open(self.parameters, "r") fileb = open(self.outputfile_name, "w", encoding='utf-8') count = 0 lineNumber = 0 nonrs = 0 title = {} outputs = [] multiple_query = [] list_of_wordlist = [] # print("\ntest end, real begin") if (self.outputtype != 'csv') : fileb.write('##fileformat=VCFv4.1 \n##fileDate=' + str(datetime.datetime.now()) + '\n##version=hg19 \n##CHROM POS ID REF ALT QUAL FILTER INFO') for lines in file: lineNumber = lineNumber + 1 if (lines[0] != '#' and lines[0] != '>' and lineNumber >= self.lineBegin and lineNumber <= self.lineEnd): count = count + 1 wordslist = [] words = lines.split("\t") queryinfo = '' output = '' for item in words: wordslist.append(item) if (len(wordslist) >= 1): if (self.filetype == 'vcf'): queryinfo = self.vcfFileProcessor(wordslist, self.hgversion, self.lo) if (self.fieldKeyWords == 'None') : print('140 : queryInfo' + self.filetype) print('141: ' + queryinfo) result = self.mv.getvariant(queryinfo) else: result = self.mv.getvariant(queryinfo ,fields =self.fieldKeyWords) elif (self.filetype == '23andme' or self.filetype == 'ancestry'): queryinfo = self.AncestryAndmeProcessor(wordslist, self.hgversion, self.lo) #print(queryinfo) multiple_query.append(queryinfo) list_of_wordlist.append(wordslist) elif (self.filetype == 'whole_genome'): queryinfo = self.whole_genomeProcessor(wordslist, self.hgversion, self.lo) if (self.fieldKeyWords == 'None') : result = self.mv.getvariant(queryinfo) else: result = self.mv.getvariant(queryinfo ,fields =self.fieldKeyWords) else: print('possible type : vcf, 23andme, ancestry, whole_genome') break else: continue if (queryinfo == 'no'): continue #print(queryinfo) if (result != "" and self.outputtype =='vcf'): lineNumber = lineNumber #print(str(lines) + '\t' + str(result)) #fileb.write(str(output)+ '\t' + str(result) + '\n' ) elif (result != "" and self.outputtype == 'csv') : test_output = {} self.expansion(result, test_output, '') print(type(test_output)) print(type(outputs)) outputs.append(test_output) for keys in test_output.keys() : if (test_output.get(keys, 'a') != 'a'): title[keys] = 1 #w = csv.DictWriter(fileb, test_output.keys()) #print(test_output) #w.writeheader() #w.writerow(test_output) if (count > 2000): break elif (count >= 0): nonrs = nonrs + 1 if (self.outputtype != 'csv' and (lines[0] == '#' or lines[0] == '>')): #fileb.write(lines) 1 + 1 if (self.outputtype == 'csv'): print('197') all_keys = title.keys() dict_writer = csv.DictWriter(fileb, title) dict_writer.writeheader() dict_writer.writerows(outputs) print("line number is: ", lineNumber) print("identiable line number is: ", count) print("non-identiable line number is: ", nonrs) # to close the file file.close() fileb.close() return multiple_query
def create_missense_variant_item(hgvs, label, login, fast_run=True):
    print(hgvs)
    mv = myvariant.MyVariantInfo()
    vd = mv.getvariant(hgvs)
    chrom = human_chromosome_map[vd['chrom'].upper()]
    if 'hg19' not in vd or 'dbnsfp' not in vd:
        raise ValueError("Metadata not found in MyVariant, unable to create item")
    start = str(vd['hg19']['start'])
    end = str(vd['hg19']['end'])
    gene = hgnc_qid[vd['dbnsfp']['genename'].upper()]
    url = "http://myvariant.info/v1/variant/{}".format(quote(hgvs))

    ref = [
        wdi_core.WDItemID(ITEMS['MyVariant.info'], PROPS['stated in'],
                          is_reference=True),
        wdi_core.WDUrl(url, PROPS['reference URL'], is_reference=True),
        wdi_core.WDTime(strftime("+%Y-%m-%dT00:00:00Z", gmtime()),
                        PROPS['retrieved'], is_reference=True)
    ]
    ga_qual = wdi_core.WDItemID(ITEMS['Genome assembly GRCh37'],
                                PROPS['genomic assembly'], is_qualifier=True)

    s = []
    s.append(wdi_core.WDItemID(ITEMS['sequence variant'], PROPS['instance of'],
                               references=[ref]))
    s.append(wdi_core.WDItemID(ITEMS['Missense Variant'], PROPS['subclass of'],
                               references=[ref]))
    s.append(wdi_core.WDItemID(chrom, PROPS['chromosome'], references=[ref],
                               qualifiers=[ga_qual]))
    s.append(wdi_core.WDString(start, PROPS['genomic start'], references=[ref],
                               qualifiers=[ga_qual]))
    s.append(wdi_core.WDString(end, PROPS['genomic end'], references=[ref],
                               qualifiers=[ga_qual]))
    s.append(wdi_core.WDItemID(gene, PROPS['biological variant of'],
                               references=[ref]))
    s.append(wdi_core.WDExternalID(hgvs, PROPS['HGVS nomenclature'],
                                   references=[ref]))

    item = wdi_core.WDItemEngine(
        item_name=label,
        data=s,
        domain="variant",
        fast_run=fast_run,
        fast_run_base_filter={PROPS['HGVS nomenclature']: ''},
        fast_run_use_refs=True,
        ref_handler=update_retrieved_if_new_multiple_refs,
        core_props=core_props)
    item.set_label(label)
    item.set_description("genetic variant")
    wdi_helpers.try_write(item, hgvs, PROPS['HGVS nomenclature'], login)
    return item
def search(query, user=None, search_type=None, taxon="H**o sapiens", source="entrez"): result_set = [] if (source.lower() in ["myvariant", "all"]): mv = myvariant.MyVariantInfo() result = mv.query(query) for hit in result["hits"]: temp_identifier_list = [] temp_var = gnomics.objects.variation.Variation( identifier=hit["_id"], identifier_type="HGVS ID", language=None, source="MyVariant", taxon="H**o sapiens") temp_identifier_list.append(hit["_id"]) if "gnomad_genome" in hit: if hit["gnomad_genome"]["rsid"] not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=hit["gnomad_genome"]["rsid"], identifier_type="RS Number", language=None, source="MyVariant", taxon="H**o sapiens") temp_identifier_list.append(hit["gnomad_genome"]["rsid"]) if "clinvar" in hit: for genomic_hgvs in hit["clinvar"]["hgvs"]["genomic"]: if genomic_hgvs not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=genomic_hgvs, identifier_type="Genomic HGVS ID", language=None, source="MyVariant", taxon="H**o sapiens") temp_identifier_list.append(genomic_hgvs) if "coding" in hit["clinvar"]["hgvs"]: if hit["clinvar"]["hgvs"][ "coding"] not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=hit["clinvar"]["hgvs"]["coding"], identifier_type="Coding HGVS ID", language=None, source="MyVariant", taxon="H**o sapiens") temp_identifier_list.append( hit["clinvar"]["hgvs"]["coding"]) if "variant_id" in hit["clinvar"]: if hit["clinvar"][ "variant_id"] not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=hit["clinvar"]["variant_id"], identifier_type="Variant ID", language=None, source="MyVariant", taxon="H**o sapiens") temp_identifier_list.append( hit["clinvar"]["variant_id"]) if "rcv" in hit["clinvar"]: print("here") print(hit) if type(hit["clinvar"]["rcv"]) == list: for sub_hit in hit["clinvar"]["rcv"]: if sub_hit[ "accession"] not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=sub_hit["accession"], identifier_type="ClinVar Accession", name=sub_hit["preferred_name"], taxon="H**o sapiens") temp_identifier_list.append( sub_hit["accession"]) else: if hit["clinvar"]["rcv"][ "accession"] not in temp_identifier_list: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=hit["clinvar"]["rcv"] ["accession"], identifier_type="ClinVar Accession", name=hit["clinvar"]["rcv"] ["preferred_name"], taxon="H**o sapiens") temp_identifier_list.append( hit["clinvar"]["rcv"]["accession"]) result_set.append(temp_var) # Adapted from: # https://www.ncbi.nlm.nih.gov/dbvar/content/tools/entrez/ if (source.lower() in ["ncbi", "entrez", "all"]) and user is not None: if user.email is not None: Entrez.email = user.email paramEutils = {"usehistory": "Y"} full_query = "('variant'[Object Type] AND %s)" % query eSearch = Entrez.esearch(db="dbvar", term=full_query, **paramEutils) res = Entrez.read(eSearch) if res["IdList"]: for iden in res["IdList"]: if taxon == "H**o sapiens": temp_var = gnomics.objects.variation.Variation( identifier=iden, identifier_type="Variant Region ID", language=None, source="dbVar", name=None, taxon=taxon) result_set.append(temp_var) else: paramEutils = {"usehistory": "Y"} eSearch = Entrez.esearch(db="snp", term=query, **paramEutils) res = Entrez.read(eSearch) for iden in res["IdList"]: if taxon == "H**o sapiens": temp_var = gnomics.objects.variation.Variation( identifier=iden, 
identifier_type="RS Number", language=None, source="dbSNP", name=None, taxon=taxon) result_set.append(temp_var) else: print( "Search cannot continue without a valid user and a valid email address associated with such a user object." ) if (source.lower() in ["ensembl", "all"]): if taxon == "H**o sapiens": server = "https://rest.ensembl.org" ext = "/variation/human/" + str(query) + "?" r = requests.get(server + ext, headers={"Content-Type": "application/json"}) if not r.ok: print("No match found.") else: decoded = r.json() if "name" in decoded: temp_var = gnomics.objects.variation.Variation( identifier=decoded["name"], identifier_type="Ensembl Variation ID", language=None, source="Ensembl", taxon="H**o sapiens") if "rs" in decoded["name"]: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=decoded["name"], identifier_type="RS Number", language=None, source="Ensembl", taxon="H**o sapiens") for syn in decoded["synonyms"]: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=syn, identifier_type="Ensembl Synonym", language=None, source="Ensembl", taxon="H**o sapiens") result_set.append(temp_var) if (source.lower() in ["ebi", "embl", "proteins api", "all"]): var_match = re.compile( r"[ARNDBCEQZGHILKMFPSTWYV]\d+[ARNDBCEQZGHILKMFPSTWYV]") matched = re.findall(var_match, query) var_match_2 = re.compile(r"[ARNDBCEQZGHILKMFPSTWYV]\d+") matched_2 = re.findall(var_match_2, query) if matched: gene = query.split(" ")[0].strip() variation = query.split(" ")[1].strip().replace("(", "").replace( ")", "").strip() # Get Ensembl identifier from gene query. server = "https://rest.ensembl.org" ext = "/xrefs/symbol/" + taxon.lower().replace( " ", "_") + "/" + gene + "?" r = requests.get(server + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() decoded = r.json() ensembl_gene_id = "" for x in decoded: if "ENSG" in x["id"]: ensembl_gene_id = x["id"] # Get UniProt identifier from Ensembl identifier. server = "https://rest.ensembl.org" ext = "/xrefs/id/" + ensembl_gene_id + "?" 
r = requests.get(server + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() decoded = r.json() uniprot_accession = "" for x in decoded: if x["dbname"] == "Uniprot_gn": uniprot_accession = x["primary_id"] wild_match = re.compile( "([ARNDBCEQZGHILKMFPSTWYV])\d+[ARNDBCEQZGHILKMFPSTWYV]") alt_match = re.compile( "[ARNDBCEQZGHILKMFPSTWYV]\d+([ARNDBCEQZGHILKMFPSTWYV])") wildtype = re.findall(wild_match, variation)[0] location_1 = ''.join(filter(str.isdigit, variation)) location_2 = ''.join(filter(str.isdigit, variation)) alternativesequence = re.findall(alt_match, variation)[0] url = "https://www.ebi.ac.uk/proteins/api/" ext = "variation?offset=0&size=100&wildtype=" + wildtype + "&alternativesequence=" + alternativesequence + "&location=" + str( location_1) + "-" + str( location_2) + "&accession=" + uniprot_accession r = requests.get(url + ext, headers={"Content-Type": "application/json"}) if not r.ok: print("Something went wrong.") else: decoded = r.json() var_array = [] var_id_array = [] for x in decoded: for feat in x["features"]: if "ftId" in feat: temp_var = gnomics.objects.variation.Variation( identifier=feat["ftId"], identifier_type="ftId", source="Proteins API") var_id_array.append(feat["ftId"]) for xref in feat["xrefs"]: if "COSM" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="COSMIC Mutation ID", source="COSMIC") var_id_array.append(xref["id"]) elif "rs" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="RS Number", source="dbSNP") var_id_array.append(xref["id"]) else: print("Other identifier found.") print(xref["id"]) result_set.append(temp_var) else: print("No ftId in feature.") print(feat) elif matched_2: if len(query.split(" ")) > 1: gene = query.split(" ")[0].strip() variation = query.split(" ")[1].strip().replace( "(", "").replace(")", "").strip() # Get Ensembl identifier from gene query. server = "https://rest.ensembl.org" ext = "/xrefs/symbol/" + taxon.lower().replace( " ", "_") + "/" + gene + "?" r = requests.get(server + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() decoded = r.json() ensembl_gene_id = "" for x in decoded: if "ENSG" in x["id"]: ensembl_gene_id = x["id"] # Get UniProt identifier from Ensembl identifier. server = "https://rest.ensembl.org" ext = "/xrefs/id/" + ensembl_gene_id + "?" 
r = requests.get(server + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() decoded = r.json() uniprot_accession = "" for x in decoded: if x["dbname"] == "Uniprot_gn": uniprot_accession = x["primary_id"] wild_match = re.compile("([ARNDBCEQZGHILKMFPSTWYV])\d+") wildtype = re.findall(wild_match, variation)[0] location_1 = ''.join(filter(str.isdigit, variation)) location_2 = ''.join(filter(str.isdigit, variation)) url = "https://www.ebi.ac.uk/proteins/api/" ext = "variation?offset=0&size=100&wildtype=" + wildtype + "&location=" + str( location_1) + "-" + str( location_2) + "&accession=" + uniprot_accession r = requests.get(url + ext, headers={"Content-Type": "application/json"}) if not r.ok: print("Something went wrong.") else: decoded = r.json() var_array = [] var_id_array = [] for x in decoded: for feat in x["features"]: if "ftId" in feat: temp_var = gnomics.objects.variation.Variation( identifier=feat["ftId"], identifier_type="ftId", source="Proteins API") var_id_array.append(feat["ftId"]) for xref in feat["xrefs"]: if "COSM" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type= "COSMIC Mutation ID", source="COSMIC") var_id_array.append(xref["id"]) elif "rs" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="RS Number", source="dbSNP") var_id_array.append(xref["id"]) elif "RCV" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="ClinVar Accession", source="ClinVar") var_id_array.append(xref["id"]) else: continue result_set.append(temp_var) else: temp_var = gnomics.objects.variation.Variation( ) for xref in feat["xrefs"]: if "COSM" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type= "COSMIC Mutation ID", source="COSMIC") var_id_array.append(xref["id"]) elif "rs" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="RS Number", source="dbSNP") var_id_array.append(xref["id"]) elif "RCV" in xref["id"] and xref[ "id"] not in var_id_array: gnomics.objects.variation.Variation.add_identifier( temp_var, identifier=xref["id"], identifier_type="ClinVar Accession", source="ClinVar") var_id_array.append(xref["id"]) else: continue if len(temp_var.identifiers) > 0: result_set.append(temp_var) if (source.lower() in ["ncbi", "entrez"]) and user is not None: print( "The Entrez database cannot be searched without a valid user email provided." ) return result_set
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 1 20:56:40 2019

@author: Nicky
"""
import myvariant  # import that makes it possible to fetch ClinVar data

mv = myvariant.MyVariantInfo()

# Still to add: pull the rs code from the outputdb.txt file created in app.py
info = mv.querymany(['rs121913364'], scopes='dbsnp.rsid')  # fetch information for the rs code
a = [d['_id'] for d in info]  # grab the record needed to fetch the ClinVar data
genomeposition = a[0]  # store e.g. 'chr1:g.35367G>A' in the variable genomeposition
print(genomeposition)

clinvar_result = mv.getvariant(genomeposition)  # fetch the ClinVar results
text = str(clinvar_result)  # store the results in the variable text
print(clinvar_result)

file = open("ClinvarResults.txt", "w")  # open the file named ClinvarResults.txt
if clinvar_result is None:  # if no annotation was found, note that there are no results
    file.write("No results found on Clinvar")
else:  # otherwise write all data to the txt file
    file.write(text)
file.close()

with open('ClinvarResults.txt', 'r') as f2:
    data = f2.read()
print(data)
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # Run vcf2clinvar on genome data.
    clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile, input_clinvar_file)

    # Set up to get myvariant.info data (mainly for ExAC data).
    mv = myvariant.MyVariantInfo()

    # Iterate through all ClinVar matches.
    for genome_vcf_line, allele, zygosity in clinvar_matches:
        # Discard low quality data.
        if genome_vcf_line.filters and 'PASS' not in genome_vcf_line.filters:
            continue
        # Check significance. Only keep this as a notable variant if one of the
        # submissions has reported "pathogenic" or "likely pathogenic" effect.
        sigs = [rec.sig for rec in allele.records]
        if not ('4' in sigs or '5' in sigs):
            continue
        # Store data in a dict according to HGVS position.
        poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                       genome_vcf_line.start,
                                       genome_vcf_line.ref_allele,
                                       allele.sequence)
        data[poskey] = {
            'genome_vcf_line': genome_vcf_line,
            'clinvar_allele': allele,
            'zygosity': zygosity
        }

    # Add data from myvariant.info using the HGVS positions.
    # dict.keys() is not indexable in Python 3, so materialize it as a list.
    variants = list(data.keys())
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    for i in range(len(variants)):
        if 'clinvar' in mv_output[i]:
            data[variants[i]]['mv_clinvar'] = mv_output[i]['clinvar']
        if 'exac' in mv_output[i]:
            data[variants[i]]['mv_exac'] = mv_output[i]['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            # ClinVar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                data[var]['clinvar_allele'].records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in data[var]:
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    data[var]['mv_clinvar']['variant_id'])
                try:
                    disease_name = data[var]['mv_clinvar']['rcv']['conditions']['name']
                    preferred_name = data[var]['mv_clinvar']['rcv']['preferred_name']
                except TypeError:
                    disease_name = ', '.join(
                        set([rcv['conditions']['name']
                             for rcv in data[var]['mv_clinvar']['rcv']]))
                    preferred_name = data[var]['mv_clinvar']['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                data[var]['genome_vcf_line'].chrom[3:],
                data[var]['genome_vcf_line'].start,
                data[var]['genome_vcf_line'].ref_allele,
                data[var]['clinvar_allele'].sequence)
            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in data[var]:
                total_freq = (data[var]['mv_exac']['ac']['ac'] * 1.0 /
                              data[var]['mv_exac']['an']['an'])
                total_freq = str(total_freq)
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(data[var]['clinvar_allele'].frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'
            data_row = [inputfilename, var, preferred_name, disease_name,
                        cv_url, exac_url, total_freq, freq_source, getev_url]
            csv_out.writerow(data_row)
    return