def test_pc_iter_2():
    """Check the pseudocontigs produced for two adjacent substitutions.

    The iterator is built twice, with 4 and then 5 as its third constructor
    argument, and each run must yield exactly the two expected sequences.
    """

    def drain(iterator):
        # Keep calling next() until the iterator hands back a falsy value.
        collected = []
        while True:
            contig = iterator.next()
            if not contig:
                return collected
            collected.append(contig)

    seq = 'AAAAAAAAAA'
    #      0123456789
    #          CG
    variants = [
        Variant('t', 't', 4, 'A', 'C', 0.25),
        Variant('t', 't', 5, 'A', 'G', 0.25),
    ]

    pcs = drain(PseudocontigIterator(seq, variants, 4))
    assert 2 == len(pcs)
    assert 'AAACAAA' in pcs
    assert 'AACGAA' in pcs

    pcs = drain(PseudocontigIterator(seq, variants, 5))
    assert 2 == len(pcs)
    assert 'AAAACAAAA' in pcs
    assert 'AAACGAAA' in pcs
def group_inversions(cls, adjs):
    """Group 2 inversion adjacencies into a single event.

    The adjacencies are ordered by first chromosome and first breakpoint.
    Two neighbours become one two-adjacency 'INV' Variant (the ('L', 'L')
    adjacency listed first) when they share the first chromosome, their first
    breakpoints are at most ``max_homology`` apart and their orientations are
    ('L', 'L') / ('R', 'R') in either order.  Every other adjacency that is
    not flagged ``dubious`` becomes a single-adjacency 'INV' Variant.

    Returns the list of Variant objects created.
    """
    max_homology = 25
    ordered = sorted(adjs, key=lambda adj: (adj.chroms[0], adj.breaks[0]))
    last = len(ordered) - 1

    def is_pair(first, second):
        # Same chromosome, close breakpoints and complementary orientations.
        if first.chroms[0] != second.chroms[0]:
            return False
        if second.breaks[0] - first.breaks[0] > max_homology:
            return False
        return ((first.orients == ('L', 'L') and second.orients == ('R', 'R')) or
                (first.orients == ('R', 'R') and second.orients == ('L', 'L')))

    variants = []
    idx = 0
    while idx < last:
        current, following = ordered[idx], ordered[idx + 1]
        if is_pair(current, following):
            if current.orients == ('L', 'L'):
                members = [current, following]
            else:
                members = [following, current]
            variants.append(Variant('INV', members))
            idx += 2
            continue
        if not current.dubious:
            variants.append(Variant('INV', [current]))
        idx += 1

    # The loop never looks at the final element on its own; handle it here
    # unless it was already consumed as the second half of a pair.
    if idx == last and not ordered[idx].dubious:
        variants.append(Variant('INV', [ordered[idx]]))
    return variants
def main():
    """Convert the VCF named by ``options.input_vcf`` into a separated file.

    A header produced by ``Variant.create_psv_header`` is written to
    ``options.output_vcf``, followed by one ``put_to_psv`` entry for every
    record whose FILTER is empty.  ``options.non_model`` and
    ``options.ploidy`` are forwarded to the ``Variant`` constructor.
    """
    global options, args

    separator = '|'

    # Hold the input handle in a ``with`` block so it is closed even when
    # parsing or writing fails (the reader itself never closes it).
    with open(options.input_vcf, 'r') as input_handle:
        input_vcf = vcf.Reader(input_handle)

        if options.non_model:
            variant = Variant(samples=input_vcf.samples,
                              organism_type='non_model',
                              ploidy=options.ploidy)
        else:
            variant = Variant(samples=input_vcf.samples,
                              ploidy=options.ploidy)

        with open(options.output_vcf, 'w') as output_psv:
            # Generate output file header
            output_psv.write(variant.create_psv_header(separator=separator))

            # Now parse lines in .vcf and output with new format:
            for record in input_vcf:
                # Only output sites that have not been filtered out.  PyVCF
                # reports a missing FILTER ('.') as None and PASS as an empty
                # list, so test truthiness instead of calling len() on None.
                if not record.FILTER:
                    variant.get_from_record(record=record)
                    output_psv.write(variant.put_to_psv(separator=separator))
def getVariants():
    """Return initialized variants for testing, keyed by chromosome name."""
    pid = "DCIS_1"
    rows = ["", ""]
    # (chromosome, start, end) for every fixture variant, in insertion order.
    coordinates = [
        ("1", "100.0", "200.0"),
        ("1", "1025", "1119"),
        ("2", "25006", "25124"),
        ("X", "90045", "90157.5"),
    ]
    ret = {}
    for chrom, start, end in coordinates:
        ret.setdefault(chrom, []).append(Variant(pid, chrom, start, end, rows))
    return ret
def parse_MAF(self):
    ''' MAF parser: builds one Variant from the current row.

    Reads Start_position, TTotCov, TVarCov, Chromosome, Reference_Allele and
    Tumor_Seq_Allele2 through ``self.fieldId``; a "-" allele becomes "".
    Input: InputParser object. Output: Variant object '''
    record, columns = self.row, self.fieldId
    header = self.header
    source_path = self.fn

    def column(name):
        # Look a MAF column up by its header name.
        return record[columns[name]]

    # Column names are matched case sensitively. What if 'Start_Position'
    # is used instead? Case-insensitive lookup, or lowercase everything
    # before making comparisons?
    position = int(str(column('Start_position')).split('.')[0])
    dp = int(str(column('TTotCov')).split('.')[0])
    vf = float(column('TVarCov')) / float(dp)
    chrom = str(column('Chromosome'))
    ref = str(column('Reference_Allele'))
    alt = str(column('Tumor_Seq_Allele2'))
    effect = self.eff
    fc = self.fc

    # "-" marks an empty allele.
    ref = "" if ref == "-" else ref
    alt = "" if alt == "-" else alt

    return Variant(source=source_path.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def group_trls(cls, adjs):
    """Group 2 translocation adjacencies into single reciprocal event.

    Non-dubious adjacencies are ordered by first chromosome and first
    breakpoint.  Two neighbours are merged into one 'TRL' Variant when both
    chromosomes match, both breakpoints lie within ``neighborhood`` bases of
    each other and their orientations are complementary.

    Returns a tuple ``(variants, trls_remained)``: the grouped Variant
    objects and the non-dubious adjacencies that were not grouped.
    """
    neighborhood = 10000
    # (orients of first, orients of second) combinations that are reciprocal.
    reciprocal = (
        (('L', 'R'), ('R', 'L')),
        (('R', 'L'), ('L', 'R')),
        (('L', 'L'), ('R', 'R')),
        (('R', 'R'), ('L', 'L')),
    )

    trls = sorted((adj for adj in adjs if not adj.dubious),
                  key=lambda adj: (adj.chroms[0], adj.breaks[0]))
    grouped_trl_ids = set()
    variants = []

    i = 0
    while i < len(trls) - 1:
        first, second = trls[i], trls[i + 1]
        if (first.chroms[0] == second.chroms[0] and
                first.chroms[1] == second.chroms[1] and
                abs(second.breaks[0] - first.breaks[0]) <= neighborhood and
                abs(second.breaks[1] - first.breaks[1]) <= neighborhood and
                (first.orients, second.orients) in reciprocal):
            variants.append(Variant('TRL', [first, second]))
            grouped_trl_ids.update((first.id, second.id))
            i += 2
        else:
            i += 1

    # The collected ids used to be discarded right before this filter, which
    # made every grouped adjacency show up again in the remainder.
    trls_remained = [trl for trl in trls if trl.id not in grouped_trl_ids]
    return variants, trls_remained
def get_product_skus(self, product):
    """Scrape the product's variants and stock status from its info JSON.

    Each entry of the response's 'options' mapping is logged and turned into
    a Variant built from the size key, its 'SKU' and its 'stockStatus'.
    Returns the list of Variant objects; on a KeyError the error is logged
    and the process exits with -1.
    """
    logt(self.tid, 'fetching product variants')
    variants = []
    try:
        query = {
            "expand": "variations,informationBlocks,customisations",
            "channel": "iphone-app"
        }
        url = "https://commerce.mesh.mx/stores/{}/products/{}".format(self.sitename, product)
        response = requests.request('GET', url, headers=self.headers, params=query)
        r = response.json()

        options = r['options']
        for size in options:
            sku = options[size]['SKU']
            stock_status = options[size]['stockStatus']
            logt(self.tid, "[size] {} \t sku {} \t {}".format(size, sku, stock_status))
            variants.append(Variant(size, sku, stock_status))
        return variants
    except KeyError:
        logt(self.tid, "[error] exception while getting product info json")
        exit(-1)
def parse_MuTectOUT(self):
    ''' MuTect '.out' parser: builds one Variant from the current row.

    Allele fraction comes from the tumor_f column and depth is the sum of
    t_ref_count and t_alt_count.
    Input: InputParser object. Output: Variant object '''
    record, columns = self.row, self.fieldId
    header = self.header
    source_path = self.fn

    chrom, ref, alt = record[0], record[3], record[4]
    effect = self.eff
    fc = self.fc

    def count(name):
        # Read an integer count column, tolerating surrounding whitespace.
        return int(str(record[columns[name]]).strip())

    vf = float(record[columns['tumor_f']])
    dp = count('t_ref_count') + count('t_alt_count')
    position = int(record[columns['position']])

    return Variant(source=source_path.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def process_vcf(self, cols):
    """Build the object from the VCF file at ``self.name``.

    The '##' meta-information lines are read first: INFO and FORMAT
    definitions are parsed into VcfHeader objects, the caller name is taken
    from the '##source=' line and all remaining meta lines are kept in
    ``self.meta_info``.  The first non-'##' line is stored as
    ``self.header``.  Every following line is parsed into a Variant, reduced
    to the selected INFO/FORMAT columns and stored in ``self.variants``
    under its ``variant_key``.

    Args:
        cols: requested column names; passed to ``extract_cols``.  When it
            contains 'AF' and the file defines no INFO AF, a definition
            built from ``AF_LINE`` is added.

    Returns:
        self

    Exits through ``sys.exit`` when no caller could be identified.
    """
    info_dict, format_dict = {}, {}

    # The context manager closes the file on every exit path, including the
    # sys.exit() below.
    with open(self.name, 'r') as handle:
        # Read the meta-information lines from the vcf
        for i, line in enumerate(handle):
            # The FORMAT AF definition is dropped: AF is calculated
            # regardless.
            if line.startswith('##FORMAT=<ID=AF'):
                pass
            # Select the INFO/FORMAT lines
            elif line.startswith('##FORMAT'):
                vcf_header = VcfHeader(line)
                format_dict[vcf_header.meta_id] = vcf_header
            elif line.startswith('##INFO'):
                vcf_header = VcfHeader(line)
                info_dict[vcf_header.meta_id] = vcf_header
            # Keep other meta-info lines
            elif line.startswith('##'):
                if line.startswith('##source='):
                    self.caller = line.replace('##source=', '').strip()
                self.meta_info.append(line)
            else:
                break

        # Only extract the (filtered) DP in the format: the INFO DP is
        # dropped only when FORMAT really defines DP too.  The previous
        # check tested whether format_dict was non-empty instead.
        if "DP" in info_dict and "DP" in format_dict:
            info_dict.pop("DP", None)

        if not self.caller:
            sys.exit("Cannot identify caller from file {}\n"
                     "Please add caller identify line "
                     "'##source=(caller name)' to vcf header"
                     .format(self.name))

        # When the user asks for AF and the vcf does not define it, add the
        # definition so it can be calculated for the user.
        if 'AF' in cols and 'AF' not in info_dict:
            vcf_header = VcfHeader(AF_LINE)
            info_dict[vcf_header.meta_id] = vcf_header

        # Select the columns from INFO/FORMAT
        info_cols, format_cols = extract_cols(info_dict, format_dict, cols)

        # Add the INFO lines (with caller) / FORMAT lines (unchanged) to the
        # collected header lines.
        self.meta_info += [VcfHeader.write(VcfHeader.add_caller(v, self.caller))
                           for v in info_cols.values()]
        self.meta_info += [VcfHeader.write(v) for v in format_cols.values()]
        self.header = line

        # Continue to read the file, this time the variants
        for j, line in enumerate(handle):
            variant = Variant().process_variant(line, caller=self.caller)
            if variant.alt == '*':
                # i is the 0-based index of the header line, so this variant
                # sits on 1-based line i + j + 2.
                print("Warning: Vcf {} line {} has variant with alt=*".format(
                    self.caller, str(i + j + 2)))
            cleaned_variant = Variant.select_info(variant, info_cols, format_cols)
            # The dictionary is queried by chr\tpos\tref\talt
            self.variants.update({cleaned_variant.variant_key: cleaned_variant})
    return self
def parse_SomaticIndelDetector(self):
    ''' GATK SomaticIndelDetector vcf parser function.

    Depth is the sample's DP value (kept as the string read from the file)
    and the allele fraction is the second AD value divided by DP.  The two
    FORMAT keys may appear in any order.
    Input: InputParser object. Output: Variant object
    Raises KeyError when the FORMAT column lacks AD or DP. '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom, ref, alt = row[0], row[3], row[4]
    effect = self.eff
    fc = self.fc

    # Assumes that the sample ID is the final column in self.header.
    # If that is not always true, adopt the parse_mutect solution here too.
    sample_id = header[-1]

    # Pair every FORMAT key with the sample's value so the lookup does not
    # depend on AD being listed before DP.
    format_keys = row[fieldId['FORMAT']].split(':')
    sample_values = row[fieldId[sample_id]].split(':')
    sample = dict(zip(format_keys, sample_values))

    alt_count = sample['AD'].split(',')[1]
    dp = sample['DP']
    vf = float(alt_count) / float(dp)

    position = int(row[fieldId['POS']])
    return Variant(source=fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def parseVCF(self, file):
    """Parse the tab-separated VCF at path ``file`` into Variant objects.

    Lines starting with '#' are skipped.  Every other line must provide at
    least ten columns; a Variant built from the first ten is appended to
    the instance's private variant list.

    Raises IndexError for short lines and ValueError when the position or
    quality column is not numeric.
    """
    # The context manager closes the file even if a line fails to parse.
    with open(file, "r") as fileReader:
        for line in fileReader:
            # Header and meta lines start with '#'.
            if line.startswith("#"):
                continue

            lineTokens = line.split("\t")

            # Name the columns so it is clear what is being used.
            chromosome = lineTokens[0]
            position = int(lineTokens[1])
            variantId = lineTokens[2]
            referenceAllele = lineTokens[3]
            alternateAllele = lineTokens[4]
            qualityScore = float(lineTokens[5])
            filterFlag = lineTokens[6]
            infoGroup = lineTokens[7]
            formatGroup = lineTokens[8]
            # NOTE(review): the line is split without stripping, so the last
            # column of a line still carries its trailing newline.
            noneGroup = lineTokens[9]

            variant = Variant(chromosome, position, variantId, referenceAllele,
                              alternateAllele, qualityScore, filterFlag,
                              infoGroup, formatGroup, noneGroup)
            self.__variants.append(variant)
def parse_SamTools(self):
    ''' samtools vcf parser: builds one Variant from the current row.

    Depth and allele fraction are derived from the four DP4 counts in INFO:
    the first two are summed as reference reads, the last two as alternate
    reads.
    Input: InputParser object. Output: Variant object '''
    record, columns = self.row, self.fieldId
    header = self.header
    source_path = self.fn
    chrom, ref, alt = record[0], record[3], record[4]
    effect = self.eff
    fc = self.fc
    position = int(record[columns['POS']])

    for entry in record[columns['INFO']].split(';'):
        if not entry.startswith("DP4="):
            continue
        counts = entry.split('=')[1].split(',')
        ro = int(counts[0]) + int(counts[1])
        ao = int(counts[2]) + int(counts[3])
        dp = ro + ao
        vf = float(ao) / float(dp)

    return Variant(source=source_path.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def parse_VarScan(self):
    ''' varscan vcf parser: builds one Variant from the current row.

    Depth is the sample's DP value; allele fraction is the sample's FREQ
    percentage divided by 100.  The sample column is the one named by the
    last header entry.
    Input: InputParser object. Output: Variant object '''
    record, columns = self.row, self.fieldId
    header = self.header
    source_path = self.fn
    chrom, ref, alt = record[0], record[3], record[4]
    effect = self.eff
    fc = self.fc
    position = int(record[columns['POS']])

    def sample_field(index):
        # Value of the sample column at the given FORMAT index.
        return record[columns[header[-1]]].split(':')[index]

    for index, key in enumerate(record[columns['FORMAT']].split(':')):
        if str(key) == "DP":
            dp = int(sample_field(index))
        if str(key) == "FREQ":
            vf = float(str(sample_field(index)).strip('%')) / float(100)

    return Variant(source=source_path.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def parse_HapCaller(self):
    ''' GATK haplotype caller vcf parser function.

    Depth is the sample's DP value.  Allele fraction is alt / (ref + alt)
    from the first two AD counts, or 0.0 when both counts are zero.  When
    the FORMAT column provides no usable AD or DP, the row is echoed to
    stderr and the missing value defaults to 0.0.
    Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom, ref, alt = row[0], row[3], row[4]
    effect = self.eff
    fc = self.fc
    position = int(row[fieldId['POS']])

    # None marks "not found in FORMAT"; this replaces probing for an
    # unbound local with a bare except.
    vf = None
    dp = None

    for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if str(key) == "DP":
            dp = int(row[fieldId[header[-1]]].split(':')[j])
        if str(key) == "AD":
            ad = str(row[fieldId[header[-1]]].split(':')[j])
            if ',' in ad:
                ref_count = int(ad.split(',')[0])
                alt_count = int(ad.split(',')[1])
                try:
                    vf = float(alt_count) / (float(ref_count) + float(alt_count))
                except ZeroDivisionError:
                    # No reads support either allele.
                    vf = 0.0
            else:
                abortWithMessage(
                    "Sample {0} may not have Haplotype Caller mutations with no ALT or vf"
                    .format(header[-1]))

    if vf is None:
        print(row, file=sys.stderr)
        vf = 0.0
    if dp is None:
        print(row, file=sys.stderr)
        dp = 0.0

    return Variant(source=fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def parse_MiSeq(self):
    ''' MiSeq vcf parser: builds one Variant from the current row.

    Depth is the raw DP value from INFO and allele fraction is the sample's
    VF value.  When the parser holds no functional consequence yet, the FC
    entry of INFO supplies consequence and effect; both default to "?".
    Input: InputParser object. Output: Variant object '''
    record, columns = self.row, self.fieldId
    header = self.header
    source_path = self.fn
    chrom, ref, alt = record[0], record[3], record[4]
    fc = self.fc
    effect = self.eff

    for item in record[columns['INFO']].split(';'):
        if item.startswith("DP="):
            dp = item.split('=')[1]
        # If the MiSeq software reported functional consequence and effect
        # and the file is not snpEff annotated, the MiSeq annotations are
        # used instead.
        if item.startswith("FC=") and not fc:
            for annotation in item.split('=')[1].split(','):
                pieces = annotation.split('_')
                if str(pieces[0]) not in str(fc):
                    fc += str(pieces[0]) + ";"
                # Broad except kept as in the original; it skips, among
                # others, annotations that carry no '_' separated effect.
                try:
                    if str(pieces[1]) not in str(effect):
                        effect += str(pieces[1]) + ";"
                except:
                    pass
        elif str(item) == "EXON":
            fc += 'EXON'

    if not fc:
        fc = "?"
    if not effect:
        effect = "?"

    # A disabled fallback that derived depth from AD when VF is absent used
    # to live in this loop; only VF is read.
    for index, key in enumerate(record[columns['FORMAT']].split(':')):
        if str(key) == "VF":
            vf = float(record[columns[header[-1]]].split(':')[index])

    position = int(record[columns['POS']])
    return Variant(source=source_path.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def test_pc_iter_3():
    """Check the pseudocontigs produced for three adjacent substitutions."""
    seq = 'AAAAAAAAAAA'
    #      01234567890
    #          CGT
    variants = [
        Variant('t', 't', 4, 'A', 'C', 0.25),
        Variant('t', 't', 5, 'A', 'G', 0.25),
        Variant('t', 't', 6, 'A', 'T', 0.25),
    ]
    it = PseudocontigIterator(seq, variants, 4)

    # Drain the iterator; it signals exhaustion with a falsy value.
    pcs = []
    while True:
        contig = it.next()
        if not contig:
            break
        pcs.append(contig)

    for expected in ('AAACAAA', 'AACGAA', 'ACATA', 'ACGTA'):
        assert expected in pcs
def __setVariant__(self, row):
    """Store one row of the variant file under its patient id and chromosome.

    The Patient, Chr, Start, End and Name columns are located through
    ``self.vhead``; the chromosome value is passed through
    ``__setChromosome__`` before it is used as a key.
    """
    columns = self.vhead
    pid = row[columns["Patient"]]
    c = self.__setChromosome__(row[columns["Chr"]])
    start = row[columns["Start"]]
    end = row[columns["End"]]
    name = row[columns["Name"]]

    # Create the nested patient -> chromosome -> list containers on demand.
    by_chromosome = self.variants.setdefault(pid, {})
    by_chromosome.setdefault(c, []).append(Variant(pid, c, start, end, row, name))
def variant_from_index_list(idx_list, line):
    """Build a Variant from one input line.

    Args:
        idx_list: positions within ``line`` of, in order, chromosome, start,
            end, ref, alt, gene and variant type.
        line: one line of the input file as a list.

    Spaces in the variant type are replaced by underscores before the value
    is stripped.  The last Variant argument is always None.
    """
    chrom, start, end, ref, alt, gene = (line[idx_list[k]] for k in range(6))
    var_type = line[idx_list[6]].replace(" ", "_").strip()
    return Variant(chrom, start, end, ref, alt, gene, var_type, None)
def test_pc_iter_4():
    """A substitution next to an 'N' in the sequence must yield nothing."""
    seq = 'AAANAAAAA'
    #      012345678
    #          T
    variants = [Variant('t', 't', 4, 'A', 'T', 0.25)]
    it = PseudocontigIterator(seq, variants, 4)

    # Drain the iterator; it signals exhaustion with a falsy value.
    pcs = []
    while True:
        contig = it.next()
        if not contig:
            break
        pcs.append(contig)

    assert 0 == len(pcs)
def test_pc_iter_deletion_2():
    """A three-base deletion must yield exactly one pseudocontig."""
    seq = 'AAAAAAAAA'
    #      012345678
    #         xxx
    variants = [Variant('t', 't', 3, 'AAA', [''], 0.25)]
    it = PseudocontigIterator(seq, variants, 4)

    # Drain the iterator; it signals exhaustion with a falsy value.
    pcs = []
    while True:
        contig = it.next()
        if not contig:
            break
        pcs.append(contig)

    assert 1 == len(pcs)
    assert 'AAAAAA' == pcs[0]
def parse_variants(line):
    """Parse variant information from one annovar-annotated vcf line.

    Columns 0-4 hold chromosome, start, end, ref and alt; column 7 is split
    on ';' and handed to ``parse_info``, which yields the genes, the variant
    type and the allele frequency.  One Variant is returned per gene.
    """
    chromosome, start, end = line[0], line[1], line[2]
    ref, alt = line[3], line[4]
    info = line[7]
    genes, var_type, af = parse_info(info.split(';'))
    return [
        Variant(chromosome, start, end, ref, alt, gene, var_type, af, None)
        for gene in genes
    ]
def test_pc_iter_insertion_2():
    """A two-base insertion must yield exactly one pseudocontig."""
    seq = 'AAAAAAAAA'
    #      012345678
    #          ^
    #          TT
    variants = [Variant('t', 't', 4, '', ['TT'], 0.25)]
    it = PseudocontigIterator(seq, variants, 4)

    # Drain the iterator; it signals exhaustion with a falsy value.
    pcs = []
    while True:
        contig = it.next()
        if not contig:
            break
        pcs.append(contig)

    assert 1 == len(pcs)
    assert 'AATTAA' == pcs[0]
def uniqueVariants(self):
    '''Return the set of unique variants from the set of all variants (for this feature).

    Variants are unique by (contig, pos, ref, alt).  Each returned Variant
    merges all of its occurrences: source, frac, dp, eff and fc become
    ", "-joined strings listing the values in ``self.variants`` order.
    '''
    # Group the occurrences of every (contig, pos, ref, alt) tuple in one
    # pass instead of rescanning all variants for each unique tuple.
    grouped = {}
    for var in self.variants:
        key = (var.pos.chrom, var.pos.pos, var.ref, var.alt)
        grouped.setdefault(key, []).append(var)

    def joined(values):
        # Same text the former "value, value, ".strip(", ") produced.
        return ", ".join(values).strip(", ")

    # sort by chr, then position
    # TO DO: this sorts as: chr1, chr10, chr2, chr20, chrX. Fix.
    ordered_keys = sorted(grouped, key=lambda varx: (varx[0] + str(varx[1])))

    uniqueVariants = set()
    for uniqueVarTup in ordered_keys:
        members = grouped[uniqueVarTup]
        pos = HTSeq.GenomicPosition(uniqueVarTup[0], uniqueVarTup[1])
        uniqueVar = Variant(
            joined(m.source for m in members),
            pos,
            ref=uniqueVarTup[2],
            alt=uniqueVarTup[3],
            frac=joined(str(m.frac) for m in members),
            dp=joined(str(m.dp) for m in members),
            eff=joined(str(m.eff) for m in members),
            fc=joined(str(m.fc) for m in members))
        uniqueVariants.add(uniqueVar)
    return uniqueVariants
def parse_IonTorrent(self):
    ''' Ion Torrent vcf parser function.

    AO, RO and DP are read from INFO.  A comma-separated AO is summed over
    all alternate alleles.  Allele fraction is AO / (RO + AO) and depth is
    the raw DP value.
    Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom, ref, alt = row[0], row[3], row[4]
    effect = self.eff
    fc = self.fc

    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("AO="):
            tempval = i.split('=')[1]
        if i.startswith("RO="):
            ro = i.split('=')[1]
        if i.startswith("DP="):
            dp = i.split("=")[1]

    if ',' in str(tempval):
        # Convert inside the try block: the conversion is the step that can
        # fail, and it used to sit outside where the handler could not see it.
        try:
            ao = sum(int(count) for count in tempval.split(','))
        except ValueError:
            abortWithMessage(
                "AO should be an int, or a list of ints: AO = {0}/".format(
                    tempval))
            raise
    else:
        ao = tempval

    vf = float(ao) / (float(ro) + float(ao))

    # A former loop over the ALT alleles re-assigned this same value for
    # deletions; the position is the POS column in every case.
    position = int(row[fieldId['POS']])

    return Variant(source=fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def parse_GenericGATK(self):
    ''' Generic GATK parser function.

    This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf
    files, but may apply to more filetypes.  Depth is the sum of all AD
    counts; allele fraction is the summed alternate counts divided by that
    depth, giving one value for all alternate alleles on the line.
    Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom, ref, alt = row[0], row[3], row[4]
    effect = self.eff
    fc = self.fc
    position = int(row[fieldId['POS']])

    for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if str(key) != "AD":
            continue
        counts = [int(x)
                  for x in row[fieldId[header[-1]]].split(':')[j].split(',')]
        ro = counts[0]
        # Sum every alternate count: taking only the last one fails when the
        # mutation has two alternate alleles on the same VCF line.
        ao = sum(counts[1:])
        dp = ro + ao
        try:
            vf = float(ao) / float(dp)
        except ZeroDivisionError:
            # All counts are zero, so no fraction can be computed.
            print("\nwarning: no vaf?\n" + str(row) + "\n")
            vf = 0
        break

    return Variant(source=fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, position),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def main(): global options, args # ********************** # store in DBNLVar # ********************** # Define connection parameters andd perform connection: connection = httplib2.Http(".cache") connection.add_credentials('*****@*****.**', 'prueba') # Open annotation file and parse each line in it annotation_vcf = vcf.Reader(open(options.input_vcf, 'r')) # Load metadata in variant object variant = Variant(samples=annotation_vcf.samples) for record in annotation_vcf: # Load variant information in DBNLVar, from consequences variant.get_from_record(record=record) for consequence in variant.consequences: resp = load_consequence(consequence=consequence) quit() # Store consequence non-relating data in DBNLVar if not check_record(table='chromosome', value=chrom_to_number(record.CHROM)): #payload = {'id': chrom_to_number(record.CHROM), 'name': number_to_chrom(chrom_to_number(record.CHROM))} load_record(payload=record) # Store consequence relating data in DBNLVar for consequence in record.INFO['CSQ']: for index, annotation in enumerate(consequence.split(separator)): payload = {} resp = requests.get(uri + 'chromosome/id/3/24.json', auth=auth) print resp.json() if resp.status_code == 200: content = resp.json()['content'] else: print "ERROR: Problem in query" raise print content
def __setBlastResults__(self, name, infile):
    """Read ``infile`` into ``self.results``, keyed by chromosome.

    Each file is one sample.  The delimiter is detected from the first line
    with ``getDelim``.  A row is kept only when ``__evaluateRows__`` accepts
    it and its subject id is a chromosome present in
    ``self.variants[name]``; kept rows are stored as Variant objects built
    from the query id, chromosome, sstart and send columns.
    """
    with open(infile, "r") as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                # The first line determines the delimiter for the whole file.
                delim = getDelim(line)
            row = line.strip().split(delim)
            c = row[self.bhead["subjectid"]]
            pas = self.__evaluateRows__(row)
            # Only proceed if there is sufficient match quality and the
            # chromosome is present in the variants.
            if not (pas == True and c in self.variants[name]):
                continue
            qid = row[self.bhead["queryid"]]
            start = row[self.bhead["sstart"]]
            end = row[self.bhead["send"]]
            self.results.setdefault(c, []).append(Variant(qid, c, start, end, row))
def main():
    # Formats two hard-coded variants as HGVS names, prints the result and
    # then exits.  Python 2 code (print statements).
    # NOTE(review): `genome` and `get_transcript` are not defined in this
    # function; presumably module-level names - verify.
    global options, args

    separator = '|'

    # Parse the HGVS name into genomic coordinates and alleles.
    #chrom, offset, ref, alt = hgvs.parse_hgvs_name('ENST00000515609.1:c.30G>T', genome, get_transcript=get_transcript)
    #print chrom, offset, ref, alt

    # Format an HGVS name.
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'G')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    print hgvs_name

    # Same position with a different alternate allele; the formatted name is
    # also parsed back into an HGVSName object.
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'GA')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    hgvs_var = hgvs.HGVSName(hgvs_name)
    hgvs_str = 'ENST00000359218.5:c.10597+1079_10597+1080delTTinsT'
    hgvs_var2 = hgvs.HGVSName(hgvs_str)
    print hgvs_name

    # NOTE(review): quit() terminates the process here, so the VCF
    # conversion below is never reached.
    quit()

    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    variant = Variant(samples=input_vcf.samples)

    # Open output file
    with open(options.output_vcf, 'w') as output_psv:
        # Generate output file header
        #variant = ConsequenceType(input_vcf.samples)
        output_psv.write(variant.create_psv_header(separator=separator))

        # Now parse lines in .vcf and output with new format:
        for record in input_vcf:
            # Only output sites that have not been filtered out
            if len(record.FILTER) == 0:
                #for consequence in range(0, len(record.INFO['CSQ'])):
                variant.get_from_record(record=record)
                output_psv.write(variant.put_to_psv(separator=separator))
def variantParser(variantFile):
    """Map each variant's position column to a Variant built from its row.

    ``variantFile`` is read as tab-delimited rows and the first row is
    skipped.  Columns 1, 2, 18, 5, 6 and 4 supply, in that order, the
    position, ancestral value, substitution, counts, frequency and total
    counts passed to Variant.
    """
    rows = reader(variantFile, delimiter="\t")
    # Discard the first row; an empty input simply yields no rows.
    next(rows, None)

    returnDict = {}
    for ro in rows:
        # Pull out the variant determinants.
        iposition = ro[1]
        iancestral = ro[2]
        icounts = ro[5]
        iTotCounts = ro[4]
        ifrequency = ro[6]
        isubstitution = ro[18]
        # Map the variant information to its position.
        returnDict[iposition] = Variant(iposition, iancestral, isubstitution,
                                        icounts, ifrequency, iTotCounts)
    return returnDict
def get_product_variants(self, product):
    """Fetch ``<product.url>.json`` and return the product's variants.

    Args:
        product: a Product instance; its ``url`` is used to build the
            endpoint.

    Returns:
        A list with one Variant (id, title) per entry of the response's
        'variants' list, or None when the request returns an HTTP error
        status.

    Raises:
        Exception: if ``product`` is not a Product.
    """
    if not isinstance(product, Product):
        raise Exception('Expected product object')

    log('[{}.json] Getting product variants'.format(product.url), color='blue')
    endpoint = '{}.json'.format(product.url)
    # NOTE(security): verify=False disables TLS certificate verification;
    # left unchanged here, but it should be reviewed.
    r = self.S.get(
        endpoint,
        headers=self.headers,
        verify=False
    )
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log('[error][{}][{}.json] Failed to get variants'.format(r.status_code, product.url), color='red')
        return None

    # The decoded JSON body is a plain object, not a context manager, so it
    # must not be used in a ``with`` statement.
    payload = r.json()
    return [Variant(var['id'], var['title']) for var in payload['variants']]