def group_inversions(cls, adjs):
    """Group 2 inversion adjacencies into a single event"""
    inversions = sorted(adjs, key=lambda adj: (adj.chroms[0], adj.breaks[0]))
    max_homology = 25
    variants = []
    i = 0
    while i < len(inversions) - 1:
        if inversions[i].chroms[0] == inversions[i + 1].chroms[0] and\
           inversions[i + 1].breaks[0] - inversions[i].breaks[0] <= max_homology and\
           ((inversions[i].orients == ('L', 'L') and inversions[i + 1].orients == ('R', 'R')) or
            (inversions[i].orients == ('R', 'R') and inversions[i + 1].orients == ('L', 'L'))):
            (adj1, adj2) = (inversions[i], inversions[i + 1]) if inversions[i].orients == ('L', 'L')\
                else (inversions[i + 1], inversions[i])
            variants.append(Variant('INV', [adj1, adj2]))
            i += 2
        else:
            if not inversions[i].dubious:
                variants.append(Variant('INV', [inversions[i]]))
            i += 1
    if i == len(inversions) - 1 and not inversions[i].dubious:
        variants.append(Variant('INV', [inversions[i]]))
    return variants
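# A minimal usage sketch for group_inversions (assumptions: the adjacency
# objects only need the attributes read above -- chroms, breaks, orients,
# dubious -- and `SVGrouper` is a hypothetical stand-in for the owning class).
from collections import namedtuple

Adj = namedtuple('Adj', 'chroms breaks orients dubious')
adjs = [Adj(('chr1', 'chr1'), (1000, 5000), ('L', 'L'), False),
        Adj(('chr1', 'chr1'), (1010, 5010), ('R', 'R'), False)]
# Same chromosome, first breakpoints within max_homology (25 bp), and
# complementary orientations: the pair is grouped into one INV variant.
variants = SVGrouper.group_inversions(adjs)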
def test_pc_iter_2():
    seq = 'AAAAAAAAAA'
    #      0123456789
    #          CG
    variants = [Variant('t', 't', 4, 'A', 'C', 0.25),
                Variant('t', 't', 5, 'A', 'G', 0.25)]
    it = PseudocontigIterator(seq, variants, 4)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 2 == len(pcs)
    assert 'AAACAAA' in pcs
    assert 'AACGAA' in pcs

    it = PseudocontigIterator(seq, variants, 5)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 2 == len(pcs)
    assert 'AAAACAAAA' in pcs
    assert 'AAACGAAA' in pcs
def process_vcf(self, cols):
    """Build object from vcf"""
    vcf = open(self.name, 'r')
    info_dict, format_dict = {}, {}
    # Read the meta-information lines from the vcf
    for i, line in enumerate(vcf):
        # Handle exceptions: the AF will be calculated regardless
        if line.startswith('##FORMAT=<ID=AF'):
            pass
        # Select the INFO/FORMAT lines
        elif line.startswith('##FORMAT'):
            vcf_header = VcfHeader(line)
            format_dict.update({vcf_header.meta_id: vcf_header})
        elif line.startswith('##INFO'):
            vcf_header = VcfHeader(line)
            info_dict.update({vcf_header.meta_id: vcf_header})
        # Keep other meta-info lines
        elif line.startswith('##'):
            if line.startswith('##source='):
                self.caller = line.replace('##source=', '').strip()
            self.meta_info.append(line)
        else:
            break
    # Prefer the (filtered) DP from FORMAT: drop the INFO DP when FORMAT fields exist
    if "DP" in info_dict.keys() and format_dict.keys():
        info_dict.pop("DP", None)
    if not self.caller:
        sys.exit("Cannot identify caller from file {}\nPlease add caller \
identification line '##source=(caller name)' to vcf header".format(self.name))
    # When the user requests AF and the vcf does not provide it, try to
    # calculate it for the user
    if ('AF' in cols) and ('AF' not in info_dict.keys()):
        vcf_header = VcfHeader(AF_LINE)
        info_dict.update({vcf_header.meta_id: vcf_header})
    # Select the columns from INFO/FORMAT
    info_cols, format_cols = extract_cols(info_dict, format_dict, cols)
    # Add the INFO lines (with caller) / FORMAT lines (unchanged) to the header list
    self.meta_info += [VcfHeader.write(VcfHeader.add_caller(v, self.caller))
                       for k, v in info_cols.items()]
    self.meta_info += [VcfHeader.write(v) for k, v in format_cols.items()]
    self.header = line
    # Continue to read the file, this time the variants
    for j, line in enumerate(vcf):
        variant = Variant().process_variant(line, caller=self.caller)
        if variant.alt == '*':
            print("Warning: Vcf {} line {} has variant with alt=*".format(
                self.caller, str(i + j + 1)))
        cleaned_variant = Variant.select_info(variant, info_cols, format_cols)
        # The dictionary is keyed by chr\tpos\tref\talt
        self.variants.update({cleaned_variant.variant_key: cleaned_variant})
    return self
def getVariants():
    # Returns initialized variants for testing
    ret = {}
    pid = "DCIS_1"
    rows = ["", ""]
    ret["1"] = [Variant(pid, "1", "100.0", "200.0", rows)]
    ret["1"].append(Variant(pid, "1", "1025", "1119", rows))
    ret["2"] = [Variant(pid, "2", "25006", "25124", rows)]
    ret["X"] = [Variant(pid, "X", "90045", "90157.5", rows)]
    return ret
def main():
    global options, args
    separator = '|'
    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    if options.non_model:
        variant = Variant(samples=input_vcf.samples,
                          organism_type='non_model', ploidy=options.ploidy)
    else:
        variant = Variant(samples=input_vcf.samples, ploidy=options.ploidy)
    # Open output file
    with open(options.output_vcf, 'w') as output_psv:
        # Generate output file header
        #variant = ConsequenceType(input_vcf.samples)
        output_psv.write(variant.create_psv_header(separator=separator))
        # Now parse lines in the .vcf and output with the new format,
        # keeping only sites that haven't been filtered out
        for record in input_vcf:
            if len(record.FILTER) == 0:
                #for consequence in range(0, len(record.INFO['CSQ'])):
                variant.get_from_record(record=record)
                output_psv.write(variant.put_to_psv(separator=separator))
def group_trls(cls, adjs):
    """Group 2 translocation adjacencies into single reciprocal event"""
    trls = sorted([adj for adj in adjs if not adj.dubious],
                  key=lambda adj: (adj.chroms[0], adj.breaks[0]))
    grouped_trl_ids = Set()
    neighborhood = 10000
    variants = []
    i = 0
    if len(trls) > 1:
        while i < len(trls) - 1:
            if trls[i].chroms[0] == trls[i + 1].chroms[0] and\
               trls[i].chroms[1] == trls[i + 1].chroms[1] and\
               abs(trls[i + 1].breaks[0] - trls[i].breaks[0]) <= neighborhood and\
               abs(trls[i + 1].breaks[1] - trls[i].breaks[1]) <= neighborhood and\
               ((trls[i].orients == ('L', 'R') and trls[i + 1].orients == ('R', 'L')) or\
                (trls[i].orients == ('R', 'L') and trls[i + 1].orients == ('L', 'R')) or\
                (trls[i].orients == ('L', 'L') and trls[i + 1].orients == ('R', 'R')) or\
                (trls[i].orients == ('R', 'R') and trls[i + 1].orients == ('L', 'L'))):
                variants.append(Variant('TRL', [trls[i], trls[i + 1]]))
                grouped_trl_ids.add(trls[i].id)
                grouped_trl_ids.add(trls[i + 1].id)
                i += 2
            else:
                i += 1
    # Report adjacencies that were not paired into a reciprocal event.
    # (Resetting grouped_trl_ids here, as an earlier revision did, would make
    # the filter a no-op, so the set is kept intact.)
    trls_remained = [trl for trl in trls if trl.id not in grouped_trl_ids]
    return variants, trls_remained
def main():
    if len(sys.argv) < 3:
        print('usage:\nconvert <pipeline>.xml <output>.sh (arg_name=arg_value)*')
        return
    pl_file = sys.argv[1]
    script = sys.argv[2]
    args = {arg: Variant.from_string(value, 'string')
            for (arg, value) in [item.split('=') for item in sys.argv[3:]]}
    pm = PackageManager()
    include_sh = open('Test/diff_expr/include.sh', 'w')
    include_sh.write(pm.get_header())
    include_sh.close()
    pipeline = Pipeline(pl_file, pm)
    out_file = open(script, 'w')
    output = pipeline.generate(args)
    out_file.write('DIR="${BASH_SOURCE%/*}"\n. "$DIR/include.sh"\n\n')
    out_file.write(output)
    out_file.close()
    system('chmod +x ' + script)
    print(output)
def _get_explicit_value(self, node):
    '''Return Variant with step's arg value'''
    if 'val' in node.attrib:
        return Variant.from_string(node.attrib['val'], node.attrib.get('type'))
    elif 'ref' in node.attrib:
        parts = node.attrib['ref'].split('.')
        if len(parts) == 1:
            # local variable
            option_name = parts[0]
            if option_name in self._inputs:
                return self._inputs[option_name].get()
            if option_name in self._outputs:
                return self._outputs[option_name].get()
            else:
                raise RuntimeError('Reference to undefined option: ' + option_name)
        elif len(parts) == 2:
            # some pipeline output
            step_name, output_name = parts
            if step_name not in self._step_pipelines:
                raise RuntimeError('Reference to undefined step: ' + step_name)
            output = self._step_pipelines[step_name]._get_output(output_name)
            if output is None:
                raise RuntimeError('Undefined step output %s.%s has:%s' %
                                   (parts[0], parts[1], self._step_pipelines[parts[0]]._outputs))
            return output.get()
        else:
            raise RuntimeError('Wrong reference format')
    return None
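# A minimal sketch of the two reference forms _get_explicit_value resolves
# (assumption: nodes come from xml.etree.ElementTree, which matches the
# .attrib access used above; the attribute names are taken from the code).
import xml.etree.ElementTree as ET

node_val = ET.fromstring('<arg val="42" type="int"/>')   # explicit value branch
node_ref_local = ET.fromstring('<arg ref="threads"/>')   # local input/output option
node_ref_step = ET.fromstring('<arg ref="align.bam"/>')  # output 'bam' of step 'align'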
def parse_MAF(self):
    ''' maf filetype parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    # Case sensitive: what if the column is 'Start_Position' instead?
    # A case-insensitive hash lookup, or lowercasing everything before
    # making comparisons, would be more robust.
    position = int(str(row[fieldId['Start_position']]).split('.')[0])
    dp = int(str(row[fieldId['TTotCov']]).split('.')[0])
    vf = float(float(row[fieldId['TVarCov']]) / float(dp))
    chrom = str(row[fieldId['Chromosome']])
    ref = str(row[fieldId['Reference_Allele']])
    alt = str(row[fieldId['Tumor_Seq_Allele2']])
    effect = self.eff
    fc = self.fc
    if ref == "-":
        ref = ""
    if alt == "-":
        alt = ""
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_MuTectOUT(self):
    ''' MuTect '.out' parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    vf = float(row[fieldId['tumor_f']])
    dp = int(int(str(row[fieldId['t_ref_count']]).strip()) +
             int(str(row[fieldId['t_alt_count']]).strip()))
    position = int(row[fieldId['position']])
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def get_product_skus(self, product):
    # Scrape product variants and stock status from its info.
    # Returns a list of Variant objects.
    logt(self.tid, 'fetching product variants')
    variants = []
    try:
        params = {
            "expand": "variations,informationBlocks,customisations",
            "channel": "iphone-app"
        }
        url = "https://commerce.mesh.mx/stores/{}/products/{}".format(self.sitename, product)
        r = requests.request('GET', url, headers=self.headers, params=params).json()
        for size in r['options']:
            logt(self.tid, "[size] {} \t sku {} \t {}".format(
                size, r['options'][size]['SKU'], r['options'][size]['stockStatus']))
            v = Variant(size, r['options'][size]['SKU'], r['options'][size]['stockStatus'])
            variants.append(v)
        return variants
    except KeyError:
        logt(self.tid, "[error] exception while getting product info json")
        exit(-1)
def parseVCF(self, file):
    # load the file to be parsed
    fileReader = open(file, "r")
    # loop over the file
    for line in fileReader:
        # skip header lines, which start with '#'
        if line.startswith("#"):
            continue
        # tokenize the line
        lineTokens = line.split("\t")
        # set up the variables just so it is clear what we are using
        chromosome = lineTokens[0]
        position = int(lineTokens[1])
        id = lineTokens[2]
        referenceAllele = lineTokens[3]
        alternateAllele = lineTokens[4]
        qualityScore = float(lineTokens[5])
        filterFlag = lineTokens[6]
        infoGroup = lineTokens[7]
        formatGroup = lineTokens[8]
        noneGroup = lineTokens[9]
        # create the variant and add it
        variant = Variant(chromosome, position, id, referenceAllele,
                          alternateAllele, qualityScore, filterFlag,
                          infoGroup, formatGroup, noneGroup)
        self.__variants.append(variant)
def parse_SamTools(self):
    ''' samtools vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    position = int(row[fieldId['POS']])
    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("DP4="):
            j = i.split('=')[1].split(',')
            ro = int(int(j[0]) + int(j[1]))
            ao = int(int(j[2]) + int(j[3]))
            dp = int(int(ro) + int(ao))
            vf = float(float(ao) / float(dp))
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_SomaticIndelDetector(self):
    ''' GATK SomaticIndelDetector vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    j = 0
    # Below attempts to grab the sample ID.
    # Assumes that the sample ID is the final column in self.header. Always true?
    # If not always true, adopt the parse_mutect solution here as well.
    tmpsampID = header[-1]
    for i in row[fieldId['FORMAT']].split(':'):
        if i == "AD":
            ALT_count = row[fieldId[tmpsampID]].split(':')[j].split(',')[1]
        elif i == "DP":
            dp = row[fieldId[tmpsampID]].split(':')[j]
            vf = float(float(ALT_count) / float(dp))
        j += 1
    position = int(row[fieldId['POS']])
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_VarScan(self):
    ''' varscan vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    j = 0
    position = int(row[fieldId['POS']])
    for i in row[fieldId['FORMAT']].split(':'):
        if str(i) == "DP":
            dp = int(row[fieldId[header[-1]]].split(':')[j])
        if str(i) == "FREQ":
            vf = float(float(str(row[fieldId[header[-1]]].split(':')[j]).strip('%')) / float(100))
        j += 1
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_HapCaller(self):
    ''' GATK haplotype caller vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    j = 0
    position = int(row[fieldId['POS']])
    '''
    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("DP="):
            dp = i.split('=')[1]
        if i.startswith("AF="):
            vf1 = float(i.split('=')[1])
    '''
    for i in row[fieldId['FORMAT']].split(':'):
        if str(i) == "DP":
            dp = int(row[fieldId[header[-1]]].split(':')[j])
        if str(i) == "AD":
            ad = str(row[fieldId[header[-1]]].split(':')[j])
            if str(',') in ad:
                ref_count = int(ad.split(',')[0])
                alt_count = int(ad.split(',')[1])
                try:
                    vf = float(float(alt_count) / (float(ref_count) + float(alt_count)))
                except ZeroDivisionError:
                    vf = 0.0
            else:
                abortWithMessage(
                    "Sample {0} may not have Haplotype Caller mutations with no ALT or vf"
                    .format(header[-1]))
        j += 1
    # Guard against records where DP or AD never appeared in FORMAT
    try:
        vf
    except NameError:
        print(row, file=sys.stderr)
        vf = 0.0
    try:
        dp
    except NameError:
        print(row, file=sys.stderr)
        dp = 0.0
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_MiSeq(self):
    ''' MiSeq vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    fc = self.fc
    effect = self.eff
    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("DP="):
            dp = i.split('=')[1]
        # If the MiSeq software reported functional consequence and effect and
        # the file is not snpEff annotated, the MiSeq annotations are used instead
        if i.startswith("FC=") and not fc:
            for j in i.split('=')[1].split(','):
                if str(j.split('_')[0]) not in str(fc):
                    fc += str(j.split('_')[0]) + ";"
                try:
                    if str(j.split('_')[1]) not in str(effect):
                        effect += str(j.split('_')[1]) + ";"
                except IndexError:
                    pass
        elif str(i) == "EXON":
            fc += 'EXON'
    if not fc:
        fc = str("?")
    if not effect:
        effect = str("?")
    k = 0
    for i in row[fieldId['FORMAT']].split(':'):
        if str(i) == "VF":
            vf = float(row[fieldId[header[-1]]].split(':')[k])
        '''
        # for when vf is not in the format column, but AD is
        if str(i) == "AD" and not dp or not vf:
            dp = 0
            rd = int(row[fieldId[header[-1]]].split(':')[k].split(',')[0])
            ad = int(row[fieldId[header[-1]]].split(':')[k].split(',')[1])
            dp = int(rd) + int(ad)
        '''
        k += 1
    position = int(row[fieldId['POS']])
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def main():
    global options, args
    # **********************
    # store in DBNLVar
    # **********************
    # Define connection parameters and perform connection:
    connection = httplib2.Http(".cache")
    connection.add_credentials('*****@*****.**', 'prueba')
    # Open annotation file and parse each line in it
    annotation_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    # Load metadata in variant object
    variant = Variant(samples=annotation_vcf.samples)
    for record in annotation_vcf:
        # Load variant information in DBNLVar, from consequences
        variant.get_from_record(record=record)
        for consequence in variant.consequences:
            resp = load_consequence(consequence=consequence)
        quit()
        # Store consequence non-relating data in DBNLVar
        if not check_record(table='chromosome', value=chrom_to_number(record.CHROM)):
            #payload = {'id': chrom_to_number(record.CHROM), 'name': number_to_chrom(chrom_to_number(record.CHROM))}
            load_record(payload=record)
        # Store consequence relating data in DBNLVar
        for consequence in record.INFO['CSQ']:
            for index, annotation in enumerate(consequence.split(separator)):
                payload = {}
                resp = requests.get(uri + 'chromosome/id/3/24.json', auth=auth)
                print resp.json()
                if resp.status_code == 200:
                    content = resp.json()['content']
                else:
                    print "ERROR: Problem in query"
                    raise
                print content
def test_pc_iter_3():
    seq = 'AAAAAAAAAAA'
    #      01234567890
    #          CGT
    variants = [Variant('t', 't', 4, 'A', 'C', 0.25),
                Variant('t', 't', 5, 'A', 'G', 0.25),
                Variant('t', 't', 6, 'A', 'T', 0.25)]
    it = PseudocontigIterator(seq, variants, 4)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 'AAACAAA' in pcs
    assert 'AACGAA' in pcs
    assert 'ACATA' in pcs
    assert 'ACGTA' in pcs
def variant_from_index_list(idx_list, line):
    # Inputs: list of indexes of the line,
    #         one line of the input file as a list
    chrom = line[idx_list[0]]
    start = line[idx_list[1]]
    end = line[idx_list[2]]
    ref = line[idx_list[3]]
    alt = line[idx_list[4]]
    gene = line[idx_list[5]]
    var_type = line[idx_list[6]].replace(" ", "_")
    var_type = var_type.strip()
    return Variant(chrom, start, end, ref, alt, gene, var_type, None)
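# A minimal usage sketch (assumption: idx_list gives the column positions of
# chrom, start, end, ref, alt, gene, and variant type, in that order).
line = ['chr7', '140453136', '140453136', 'A', 'T', 'BRAF', 'missense variant']
idx_list = [0, 1, 2, 3, 4, 5, 6]
v = variant_from_index_list(idx_list, line)  # var_type becomes 'missense_variant'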
def __setVariant__(self, row):
    # Reads row of variant file into dict by id and chromosome
    pid = row[self.vhead["Patient"]]
    c = self.__setChromosome__(row[self.vhead["Chr"]])
    start = row[self.vhead["Start"]]
    end = row[self.vhead["End"]]
    name = row[self.vhead["Name"]]
    if pid not in self.variants.keys():
        self.variants[pid] = {}
    if c not in self.variants[pid].keys():
        self.variants[pid][c] = []
    self.variants[pid][c].append(Variant(pid, c, start, end, row, name))
def main():
    global options, args
    separator = '|'
    # Parse the HGVS name into genomic coordinates and alleles.
    #chrom, offset, ref, alt = hgvs.parse_hgvs_name('ENST00000515609.1:c.30G>T', genome, get_transcript=get_transcript)
    #print chrom, offset, ref, alt
    # Format an HGVS name.
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'G')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    print hgvs_name
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'GA')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    hgvs_var = hgvs.HGVSName(hgvs_name)
    hgvs_str = 'ENST00000359218.5:c.10597+1079_10597+1080delTTinsT'
    hgvs_var2 = hgvs.HGVSName(hgvs_str)
    print hgvs_name
    quit()
    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    variant = Variant(samples=input_vcf.samples)
    # Open output file
    with open(options.output_vcf, 'w') as output_psv:
        # Generate output file header
        #variant = ConsequenceType(input_vcf.samples)
        output_psv.write(variant.create_psv_header(separator=separator))
        # Now parse lines in the .vcf and output with the new format,
        # keeping only sites that haven't been filtered out
        for record in input_vcf:
            if len(record.FILTER) == 0:
                #for consequence in range(0, len(record.INFO['CSQ'])):
                variant.get_from_record(record=record)
                output_psv.write(variant.put_to_psv(separator=separator))
def create_variant(self, value, scope, datatype=None):
    """Creates a `Variant` of this topic name with the specified string
    `value` and `scope`.

    If `datatype` is None, the newly created `Variant` will have the
    datatype xsd:string. The newly created `Variant` will contain all
    themes from the parent name and the themes specified in `scope`.

    :param value: the string value or locator which represents an IRI
    :type value: string or `Locator`
    :param scope: list of themes
    :type scope: list of `Topic`s
    :rtype: `Variant`

    """
    if value is None:
        raise ModelConstraintException(self, 'The value may not be None')
    if not scope:
        raise ModelConstraintException(self, 'The scope may not be None')
    if not isinstance(scope, (list, tuple)):
        scope = [scope]
    if scope == list(self.get_scope()):
        raise ModelConstraintException(
            self, 'The variant would be in the same scope as the parent')
    if datatype is None:
        if isinstance(value, Locator):
            datatype = Locator(XSD_ANY_URI)
        elif isinstance(value, str):
            datatype = Locator(XSD_STRING)
    if isinstance(value, Locator):
        value = value.to_external_form()
    variant = Variant(name=self, datatype=datatype.to_external_form(),
                      value=value, topic_map=self.topic_map)
    variant.save()
    for theme in scope:
        variant.scope.add(theme)
    return variant
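# A minimal usage sketch (assumptions: `name` is the parent topic-name object
# exposing create_variant as defined above, and `display_theme` is a `Topic`
# serving as the scope theme; both are hypothetical stand-ins).
variant = name.create_variant('Shortened display label', [display_theme])
# No datatype was passed and the value is a plain string, so the new
# variant's datatype defaults to xsd:string.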
def test_pc_iter_4():
    seq = 'AAANAAAAA'
    #      012345678
    #          T
    variants = [Variant('t', 't', 4, 'A', 'T', 0.25)]
    it = PseudocontigIterator(seq, variants, 4)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 0 == len(pcs)
def test_pc_iter_deletion_2():
    seq = 'AAAAAAAAA'
    #      012345678
    #         xxx
    variants = [Variant('t', 't', 3, 'AAA', [''], 0.25)]
    it = PseudocontigIterator(seq, variants, 4)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 1 == len(pcs)
    assert 'AAAAAA' == pcs[0]
def find_nocov_variants(covlist, chrom='', caller='', min_cov=5):
    variants = []
    assert min(covlist[1:]) >= 0
    nocov = [i for i, v in enumerate(covlist) if v < min_cov]
    nocov.remove(0)  # take off the -1 at index 0
    if len(covlist) - 1 == len(nocov):
        return None  # entire sequence has no coverage
    nocov_intervals = list(intervals(nocov))
    for iv in nocov_intervals:
        data = {'chrom': chrom, 'caller': caller, 'pos': iv[0], 'type': 'no_cov'}
        data['length'] = iv[1] - iv[0] + 1
        data['mean_cov'] = scipy.mean(covlist[iv[0]:(iv[1] + 1)])
        variants.append(Variant.from_dict(data))
    return variants
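# A minimal usage sketch (assumption: as the assert and remove(0) above imply,
# index 0 of the coverage list is a -1 sentinel and real coverage starts at 1).
covs = [-1, 12, 11, 0, 0, 1, 9, 10]  # positions 3-5 fall below min_cov=5
low = find_nocov_variants(covs, chrom='chr1', caller='demo')
# Expect one 'no_cov' Variant with pos=3, length=3, mean_cov of roughly 0.33.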
def parse_variants(line):
    # Parses variant information from annovar-annotated vcf-file.
    variants = []
    chromosome = line[0]
    start = line[1]
    end = line[2]
    ref = line[3]
    alt = line[4]
    info = line[7]
    genes, type, af = parse_info(info.split(';'))
    for g in genes:
        new_var = Variant(chromosome, start, end, ref, alt, g, type, af, None)
        variants.append(new_var)
    return variants
def test_pc_iter_insertion_2():
    seq = 'AAAAAAAAA'
    #      012345678
    #          ^
    #          TT
    variants = [Variant('t', 't', 4, '', ['TT'], 0.25)]
    it = PseudocontigIterator(seq, variants, 4)
    pc = it.next()
    pcs = []
    while pc:
        pcs.append(pc)
        pc = it.next()
    assert 1 == len(pcs)
    assert 'AATTAA' == pcs[0]
def uniqueVariants(self):
    '''Return the set of unique variants from the set of all variants (for this feature)'''
    # Exploit the hashtable and uniqueness of sets to quickly find
    # unique tuples (contig, pos, ref, alt) of variant info,
    # sorted by chrom, pos.
    uniqueVariantsTemp = set()
    for var in self.variants:
        candidate = (var.pos.chrom, var.pos.pos, var.ref, var.alt)
        uniqueVariantsTemp.add(candidate)
    # Sort by chr, then position.
    # TO DO: python sorted() will sort as: chr1, chr10, chr2, chr20, chrX. Fix.
    uniqueVariantsTemp = sorted(uniqueVariantsTemp,
                                key=lambda varx: (varx[0] + str(varx[1])))
    # Now construct a returnable set of Variant objects,
    # specifying multiple "sources" in the source field.
    # This loop's inner-product is #unique variants * #total variants, times
    # #features, and is a major inefficiency.
    uniqueVariants = set()
    for uniqueVarTup in uniqueVariantsTemp:
        source = ""
        frac = ""
        dp = ""
        eff = ""
        fc = ""
        #annot = ""
        for varClass in self.variants:
            if (varClass.pos.chrom, varClass.pos.pos, varClass.ref, varClass.alt) == uniqueVarTup:
                source += varClass.source + ", "
                frac += str(varClass.frac) + ", "
                dp += str(varClass.dp) + ", "
                eff += str(varClass.eff) + ", "
                fc += str(varClass.fc) + ", "
                #annot += str(varClass.annot) + ", "
        pos = HTSeq.GenomicPosition(uniqueVarTup[0], uniqueVarTup[1])
        uniqueVar = Variant(source.strip(", "), pos,
                            ref=uniqueVarTup[2], alt=uniqueVarTup[3],
                            frac=str(frac).strip(", "), dp=str(dp).strip(", "),
                            eff=str(eff).strip(", "),
                            fc=str(fc).strip(", "))  ######## Karl Modified ##############
        uniqueVariants.add(uniqueVar)
    return uniqueVariants
def parse_IonTorrent(self):
    ''' Ion Torrent vcf parser function. Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("AO="):
            tempval = i.split('=')[1]
        if i.startswith("RO="):
            ro = i.split('=')[1]
        if i.startswith("DP="):
            dp = i.split("=")[1]
    if str(',') in str(tempval):
        tempval2 = [int(numeric_string) for numeric_string in tempval.split(',')]
        try:
            ao = sum(tempval2)
        except:
            abortWithMessage("AO should be an int, or a list of ints: AO = {0}/".format(tempval2))
    else:
        ao = tempval
    vf = float(float(ao) / float(float(ro) + float(ao)))
    position = int(row[fieldId['POS']])
    for i in str(row[fieldId['ALT']]).split(','):
        if len(str(row[fieldId['REF']])) > len(i):
            # this is a deletion in Ion Torrent data
            position = int(row[fieldId['POS']])
            break
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def parse_GenericGATK(self):
    ''' Generic GATK parser function.
    This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf
    files, but may apply to more filetypes.
    Input: InputParser object. Output: Variant object '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fn = self.fn
    chrom = row[0]
    ref = row[3]
    alt = row[4]
    effect = self.eff
    fc = self.fc
    j = 0
    position = int(row[fieldId['POS']])
    for i in row[fieldId['FORMAT']].split(':'):
        if str(i) == "AD":
            ro = int(row[fieldId[header[-1]]].split(':')[j].split(',')[0])
            #ao = int(row[fieldId[header[-1]]].split(':')[j].split(',')[-1])
            # The line above fails when the mutation has two alternate alleles
            # in the same VCF line; summing all alternate-allele depths works.
            ao = sum([int(x) for x in row[fieldId[header[-1]]].split(':')[j].split(',')[1:]])
            dp = ro + ao
            try:
                # One VF for all possible alternate alleles. Nothing unusual,
                # unless the mutation has multiple alt alleles in one vcf line.
                vf = float(float(ao) / float(dp))
            except ZeroDivisionError:
                print("\nwarning: no vaf?\n" + str(row) + "\n")
                vf = 0
            break
        j += 1
    var = Variant(source=fn.split('/')[-1],
                  pos=HTSeq.GenomicPosition(chrom, int(position)),
                  ref=ref, alt=alt, frac=vf, dp=dp,
                  eff=effect.strip(';'), fc=fc.strip(';'))
    return var
def __setBlastResults__(self, name, infile):
    # Reads in infile as a dictionary stored by chromosome (each file is one sample)
    first = True
    with open(infile, "r") as f:
        for line in f:
            if first == True:
                delim = getDelim(line)
                first = False
            row = line.strip().split(delim)
            c = row[self.bhead["subjectid"]]
            pas = self.__evaluateRows__(row)
            if pas == True and c in self.variants[name].keys():
                # Only proceed if there is sufficient match quality and the
                # chromosome is present in variants
                qid = row[self.bhead["queryid"]]
                start = row[self.bhead["sstart"]]
                end = row[self.bhead["send"]]
                if c not in self.results.keys():
                    self.results[c] = []
                self.results[c].append(Variant(qid, c, start, end, row))
def get_product_variants(self, product):
    if not isinstance(product, Product):
        raise Exception('Expected product object')
    log('[{}.json] Getting product variants'.format(product.url), color='blue')
    endpoint = '{}.json'.format(product.url)
    r = self.S.get(endpoint, headers=self.headers, verify=False)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log('[error][{}][{}.json] Failed to get variants'.format(r.status_code, product.url), color='red')
        return None
    # r.json() returns a plain dict, not a context manager, so read it directly
    data = r.json()
    variant_objects = []
    for var in data['variants']:
        variant_objects.append(Variant(var['id'], var['title']))
    return variant_objects
def _process_option(self, node, args):
    '''Return Option that contains values produced using args coming from the step declaration'''
    opt = Option(node.attrib.get('repr'))
    name = node.attrib['name']
    if 'default' in node.attrib:
        opt.set_default_val(Variant.from_string(node.attrib['default'], node.attrib['type']))
    elif 'default_ref' in node.attrib:
        ref = node.attrib['default_ref'].strip()
        if ref in self._inputs:
            opt.set_default_val(self._inputs[ref].get())
        else:
            raise RuntimeError('Reference to currently undefined symbol: ' + ref)
    else:
        for child in node:  # check by RELAXNG
            if child.tag == 'default':
                opt.set_default_val(self._eval_expression(child))
    if name in args:
        opt.set_val(args[name])
    return opt
def join(mod_params, args):
    str_value = (mod_params or '').join([arg.to_string() for arg in args])
    return Variant.from_string(str_value, 'string')
def base_name(mod_params, args):
    assert len(args) == 1
    return Variant.from_string('.'.join(args[0].to_string().split('.')[:-1]))
def to_list(mod_params, args):
    return Variant.from_variant_list(args)
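# A minimal sketch of how these modifier helpers compose (assumption: Variant
# is the pipeline's value wrapper with the from_string/to_string methods used
# above, and mod_params carries the join separator).
parts = [Variant.from_string(s, 'string') for s in ('reads', 'fastq', 'gz')]
joined = join('.', parts)         # Variant holding 'reads.fastq.gz'
stem = base_name(None, [joined])  # Variant holding 'reads.fastq'
as_list = to_list(None, parts)    # single Variant wrapping all three values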
def find_variants(covlist, seq, chrom, min_cov=5, min_score=30,
                  exclude_edges=False, exclude_overlaps=False):
    ''' identify coverage variants in covlist
        Returns dict with keys 'mean_cov', 'pct_cov', and 'variants', where
        dict['variants'] is a list of Variant objects
    '''
    assert min(covlist[1:]) >= 0
    assert len(covlist) - 1 == len(seq), \
        "Number of coverage values (%d) is not equal to sequence length (%d)" % (len(covlist) - 1, len(seq))
    retval = {}
    nocov = [i for i, v in enumerate(covlist) if v < min_cov]
    nocov.remove(0)
    retval['mean_cov'] = scipy.mean(covlist[1:])
    retval['pct_cov'] = 1 - (float(len(nocov)) / (len(covlist) - 1))
    if len(nocov) == len(seq):
        return retval
    nocov_intervals = list(intervals(nocov))
    #covscores, localmeans = local_coverage_score(covlist)
    covscores, localmeans = adjusted_coverage_score(covlist)
    covdip = [i for i, v in enumerate(covscores) if v >= min_score]
    covdip_intervals = list(intervals(covdip))
    # refine intervals
    if exclude_edges:
        # ignore intervals that overlap the beginning and end of reference
        covdip_intervals = [iv for iv in covdip_intervals
                            if not iv[0] == 1 and not iv[1] == (len(covlist) - 1)]
    if exclude_overlaps:
        # ignore covdip intervals that overlap with nocov intervals
        covdip_intervals = remove_overlap(covdip_intervals, nocov_intervals)
    # covdip = list(itertools.chain(*[range(v1, v2+1) for v1, v2 in covdip_intervals]))
    # positions with no coverage are not considered to be coverage dips
    covdip = [p for p in covdip if p not in nocov]
    variants = []
    for iv in nocov_intervals:
        data = {'chrom': chrom, 'pos': iv[0], 'type': 'no_cov'}
        data['length'] = iv[1] - iv[0] + 1
        data['mean_cov'] = scipy.mean(covlist[iv[0]:(iv[1] + 1)])
        variants.append(Variant.from_dict(data))
    for iv in covdip_intervals:
        data = {'chrom': chrom, 'pos': iv[0], 'type': 'cov_dip'}
        data['length'] = iv[1] - iv[0] + 1
        data['mean_cov'] = scipy.mean(covlist[iv[0]:(iv[1] + 1)])
        intscores = covscores[iv[0]:(iv[1] + 1)]
        intmeans = localmeans[iv[0]:(iv[1] + 1)]
        data['quality'] = max(intscores)
        data['info'] = {'CovScores': '%s' % ','.join(['%d' % int(round(v)) for v in intscores]),
                        'LocalMeans': '%s' % ','.join(['%d' % int(round(v)) for v in intmeans]),
                        }
        data['ref'] = str(seq[iv[0]:(iv[1] + 1)].seq).upper()
        # data['alt'] = data['ref'].lower()
        variants.append(Variant.from_dict(data))
    if variants:
        retval['variants'] = variants
    return retval
def script(res1, res2, out, **kwargs):
    variants = Variant.load_res_file(res1)
    rand_variants = Variant.load_res_file(res2)
#import matplotlib.pyplot as plt
from variant import Variant


def plot_freq(variants):
    """Plot AF vs. silva score, given a list of variants"""
    afs = [x.af for x in variants]
    scores = [x.score for x in variants]
    #plt.scatter(scores, afs)
    #plt.show()
    f = open('/dupa-filer/talf/silva-pipeline/test.out', 'w')
    for a, s in zip(afs, scores):
        f.write('\t'.join([str(a), str(s)]) + '\n')
    f.close()


if __name__ == '__main__':
    variants = Variant.load_res_file('/dupa-filer/talf/silva-pipeline/1000gp_rare_results.txt')
    plot_freq(variants)
if not os.path.isdir(args.jobdir):
    sys.exit('Error: directory "%s" does not exist' % args.jobdir)
if not os.path.exists(args.reffile):
    sys.exit('Error: reference file "%s" does not exist' % args.reffile)

job_path = os.path.abspath(args.jobdir)
reference_file = os.path.abspath(args.reffile)

# load sequences
seqs = dict([(s.id, s) for s in SeqIO.parse(reference_file, 'fasta')])
summaries = dict(((name, {'variants': []}) for name in seqs.keys()))

''' GATK variants '''
print >>sys.stderr, "[ Reading GATK variants ]"
vlines = [l.strip('\n') for l in open('%s/GATK/snps.gatk.vcf' % job_path, 'rU')
          if not l.startswith('#')]
for l in vlines:
    v = Variant.from_vcf(l)
    v.caller = 'gatk'
    summaries[v.chrom]['variants'].append(v)

''' PacBio variants '''
print >>sys.stderr, "[ Reading GenCons variants ]"
glines = [l.strip('\n') for l in gzip.open('%s/data/variants.gff.gz' % job_path, 'rb')
          if not l.startswith('#')]
for l in glines:
    v = Variant.from_gff(l)
    v.caller = 'gencons'
    summaries[v.chrom]['variants'].append(v)

''' coverage variants '''
print >>sys.stderr, "[ Reading coverage variants ]"
covdata = parse_covdepth('%s/GATK/covdepth' % job_path)
covvars = {}