def __apply_filter_step(self, step_dict, entries_list):
    """Apply one filter method of ``VCFfilters`` and write the result to disk.

    The output file name is derived from ``self.fname`` by replacing
    ``.vcf`` with ``.<order>_<name>.vcf`` and is stored in
    ``self.out_fname`` even when the step is skipped.

    :param step_dict: step description; ``order``, ``name`` and ``method``
        are read here and the whole dict is forwarded to the filter method
        as keyword arguments.
    :param entries_list: records handed to the filter method.
    :return: list of records returned by the filter method, or
        ``entries_list`` unchanged when ``self.skip`` is set.
    :raises AttributeError: when ``VCFfilters`` has no attribute named
        ``step_dict['method']``.
    """
    self.out_fname = self.fname.replace(
        ".vcf", ".%02d_%s.vcf" % (step_dict['order'], step_dict['name']))
    if self.skip:
        return entries_list

    # Resolve the method before touching the file system: a missing method
    # then leaves no empty output file behind, and an AttributeError raised
    # inside a filter is no longer misreported as "not defined".
    try:
        filter_method = getattr(VCFfilters(), step_dict['method'])
    except AttributeError:
        print(u"[Error] Method %s not defined." % step_dict['method'])
        raise

    out_path = os.path.join(self.out_dir, self.out_fname)
    # The context manager closes the handle even when the filter fails.
    with open(out_path, "w") as out_handle:
        vcfout_filtered = vcf.VCFWriter(out_handle, self.vcf_template)
        entries_list = list(
            filter_method(entries_list,
                          template=self.vcf_template,
                          sample=self.sample,
                          **step_dict))
        for record in entries_list:
            vcfout_filtered.write_record(record)
    return entries_list
def filter_by_background(in_vcf, full_vcf, background, data):
    """Filter SV calls also present in background samples.

    Skips filtering of inversions, which are not characterized differently
    between cases and controls in test datasets.

    Records of ``in_vcf`` and ``full_vcf`` are walked in lockstep. A record
    gets the ``InBackground`` filter when the sample's genotype type in
    ``full_vcf`` is 0 or equals the genotype type of any background sample.
    Every record of ``in_vcf`` is written, flagged or not.

    :param in_vcf: VCF whose records are flagged and written out.
    :param full_vcf: VCF providing the genotypes used for the comparison.
    :param background: iterable of background sample data objects.
    :param data: data object of the sample; ``data["config"]`` is passed to
        ``vcfutils.bgzip_and_index``.
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the output.
    """
    Filter = collections.namedtuple('Filter', ['id', 'desc'])
    back_filter = Filter(id='InBackground',
                         desc='Rejected due to presence in background sample')
    out_file = "%s-filter.vcf" % utils.splitext_plus(in_vcf)[0]
    # out_file already ends in ".vcf", so the compressed result is expected
    # at out_file + ".gz"; the former ".vcf.gz" suffix produced a doubled
    # extension that could never match.
    # NOTE(review): assumes bgzip_and_index writes "<out_file>.gz" -- confirm.
    if (not utils.file_uptodate(out_file, in_vcf)
            and not utils.file_uptodate(out_file + ".gz", in_vcf)):
        # Loop-invariant lookups; materialising the names also keeps a
        # one-shot iterable of background samples usable for every record.
        sample_name = dd.get_sample_name(data)
        background_names = [dd.get_sample_name(x) for x in background]
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                reader = vcf.VCFReader(filename=in_vcf)
                reader.filters["InBackground"] = back_filter
                full_reader = vcf.VCFReader(filename=full_vcf)
                writer = vcf.VCFWriter(out_handle, template=reader)
                for out_rec, rec in zip(reader, full_reader):
                    rec_type = rec.genotype(sample_name).gt_type
                    if rec_type == 0 or any(
                            rec_type == rec.genotype(name).gt_type
                            for name in background_names):
                        out_rec.add_filter("InBackground")
                    writer.write_record(out_rec)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _add_reject_flag(in_file, config):
    """Add REJECT flag to all records that aren't flagged somatic (SS=2).

    The flagged copy is written next to ``in_file`` with a ``-rejectfix``
    suffix, compressed with ``bgzip_and_index`` and then moved over
    ``in_file``; the previous ``in_file`` is kept as ``<in_file>.orig``.
    Nothing happens when ``in_file`` does not exist.

    :param in_file: VCF file to fix in place.
    :param config: configuration handed to ``file_transaction`` and
        ``bgzip_and_index``.
    """
    FilterEntry = namedtuple('Filter', ['id', 'desc'])
    reject = FilterEntry(id='REJECT',
                         desc='Rejected as non-SOMATIC or by quality')
    # NOTE: PyVCF will write an uncompressed VCF
    stem, _ = utils.splitext_plus(in_file)
    out_file = "{0}-{1}{2}".format(stem, "rejectfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    vcf_in = vcf.VCFReader(filename=in_file)
    # Register the filter in the header of the reader used as template
    vcf_in.filters["REJECT"] = reject
    with file_transaction(config, out_file) as tx_out_file:
        with open(tx_out_file, "wb") as handle:
            vcf_out = vcf.VCFWriter(handle, template=vcf_in)
            for rec in vcf_in:
                # VarScan encodes SS as a string
                # TODO: Set it as integer when cleaning
                if "SS" in rec.INFO and rec.INFO["SS"] != "2":
                    rec.add_filter("REJECT")
                vcf_out.write_record(rec)

    # Re-compress the file and swap it in for the original
    out_file = bgzip_and_index(out_file, config)
    move_vcf(in_file, "{0}.orig".format(in_file))
    move_vcf(out_file, in_file)
    with open(out_file, "w") as marker:
        marker.write("Moved to {0}".format(in_file))
def exclude(entries, distance, exfile, **kwargs):
    """Performs exclusion operations with ``bedtools window -v``.

    The records are written to a temporary VCF, which is then run through
    ``BedTool.window`` against ``exfile`` with ``v=True``; the result is
    written to a second temporary VCF.

    :param entries: iterable of VCF records to test.
    :param distance: window size, passed to bedtools as ``w``.
    :param exfile: path of the exclusion file; it is formatted with
        ``SAMPLE=kwargs['sample']`` when a ``sample`` keyword is given.
    :param kwargs: must contain ``template`` (template for the VCF writers);
        may contain ``sample``.
    :return: a ``vcf.VCFReader`` on the temporary output file when the
        window result is non-empty, otherwise an empty list.
    """
    # Temporary VCF files for the bedtools input and output
    vcf_temp_in = tempfile.NamedTemporaryFile(suffix=".vcf")
    vcf_temp_out = tempfile.NamedTemporaryFile(suffix=".vcf")
    vcfin = vcf.VCFWriter(vcf_temp_in, kwargs["template"])
    vcfout = vcf.VCFWriter(vcf_temp_out, kwargs["template"])

    output_in_use = False
    try:
        for record in entries:
            vcfin.write_record(record)
        # Flush once after the last record so that bedtools sees the complete
        # file; this also covers an empty ``entries``, where a flush inside
        # the loop would never run.
        vcfin.flush()

        bed_in = BedTool(vcf_temp_in.name)

        # format string (e.g. for sample-specific exclude files)
        if 'sample' in kwargs:
            exfile = exfile.format(SAMPLE=kwargs['sample'])

        window_result = bed_in.window(
            exfile, w=distance, v=True, output=vcf_temp_out.name)

        if len(window_result) > 0:
            passed_vcf = vcf.VCFReader(vcf_temp_out)
            # The returned reader still reads from the temporary output file,
            # so that file must stay open.
            # TODO implement closing of vcfout without breaking reading
            output_in_use = True
        else:
            passed_vcf = []
        return passed_vcf
    finally:
        # Closing a writer closes (and thereby deletes) its temporary file.
        vcfin.close()
        if not output_in_use:
            vcfout.close()
def filter_file(source, destination):
    """Copy the records with at least one called sample and index the copy.

    :param source: path of the VCF to read.
    :param destination: path the retained records are written to.
    :return: the value returned by ``pysam.tabix_index`` for ``destination``.
    """
    vcf_in = vcf.VCFReader(filename=source)
    with open(destination, "w") as out:
        vcf_out = vcf.VCFWriter(out, vcf_in)
        for rec in vcf_in:
            # Keep the record as soon as one sample carries a call
            if any(call.called for call in rec):
                vcf_out.write_record(rec)
    return pysam.tabix_index(destination, preset="vcf", force=True)
def add_annotation(self):
    """Annotate every record of the input VCF and write it to the output VCF.

    Reads ``self.inputFile``, registers the ``TSSOL``, ``CCURI`` and
    ``SAMPURI`` INFO headers, passes each record through
    ``self.get_annotation`` and writes the result to ``self.outputFile``.
    A throughput line is printed after every 100 records.
    """
    in_handle = open(self.inputFile, 'r')
    try:
        vcfReader = vcf.Reader(in_handle)
        # How to add info header:
        # <http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41>
        vcfReader.infos['TSSOL'] = VcfInfo(
            'TSSOL', vcf_field_counts['A'], 'String',
            'Info indicates whether the variant overlapping with the'
            ' transcription start site(TSS)')
        vcfReader.infos['CCURI'] = VcfInfo(
            'CCURI', vcf_field_counts['A'], 'String',
            'Info includes the URL of the cage cluster to which the'
            ' variant overlapping')
        vcfReader.infos['SAMPURI'] = VcfInfo(
            'SAMPURI', vcf_field_counts['A'], 'String',
            'Info includes the URL of the samples with to which the'
            ' variant overlapping')

        vcfWriter = vcf.VCFWriter(open(self.outputFile, 'w'), vcfReader)
        try:
            cnt = 0
            cnt_block = 100
            t1 = time.time()
            for record in vcfReader:
                vcfWriter.write_record(self.get_annotation(record))
                cnt += 1
                # Report once per completed block so the rate really covers
                # cnt_block records (the first report used to cover only 2).
                if cnt % cnt_block == 0:
                    t2 = time.time()
                    elapsed = t2 - t1
                    # A coarse clock may not advance within a block; skip the
                    # report instead of dividing by zero.
                    if elapsed > 0:
                        ips = cnt_block / elapsed
                        print("speed: %.2f iters/s = %d iters p/h = %.1f hours/million iters" %
                              (ips, ips * 3600, 1000000 / ips / 3600))
                    t1 = t2
        finally:
            # Close the output even when annotating a record fails.
            vcfWriter.close()
    finally:
        in_handle.close()
def fix_somatic_calls(in_file, config):
    """Fix somatic variant output, standardize it to the SOMATIC flag.

    Records whose ``VT`` INFO value is ``somatic`` get the ``SOMATIC`` INFO
    flag; all other records get the ``REJECT`` filter. The ``VT`` entry is
    removed from every record that has one. The rewritten file is compressed
    and moved over ``in_file``; the previous file is kept as
    ``<in_file>.orig``. Nothing happens when ``in_file`` does not exist.

    :param in_file: VCF file to fix in place.
    :param config: configuration handed to ``file_transaction`` and
        ``bgzip_and_index``.
    :raises ImportError: when PyVCF (``vcf``) is not available.
    """
    if vcf is None:
        raise ImportError("Require PyVCF for manipulating cancer VCFs")

    # HACK: Needed to replicate the structure used by PyVCF
    InfoEntry = namedtuple('Info', ['id', 'num', 'type', 'desc'])
    FilterEntry = namedtuple('Filter', ['id', 'desc'])
    somatic = InfoEntry(id='SOMATIC', num=0, type='Flag',
                        desc='Somatic event')
    reject = FilterEntry(id='REJECT',
                         desc='Rejected as non-SOMATIC or by quality')

    # NOTE: PyVCF will write an uncompressed VCF
    stem, _ = utils.splitext_plus(in_file)
    out_file = "{0}-{1}{2}".format(stem, "somaticfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    vcf_in = vcf.VCFReader(filename=in_file)
    # Register the new header entries on the reader used as template
    vcf_in.infos["SOMATIC"] = somatic
    vcf_in.filters["REJECT"] = reject

    # Remove compressed output and index left over from an earlier run
    for suffix in [".gz", ".gz.tbi"]:
        stale = out_file + suffix
        if os.path.exists(stale):
            os.remove(stale)

    with file_transaction(config, out_file) as tx_out_file:
        with open(tx_out_file, "wb") as handle:
            vcf_out = vcf.VCFWriter(handle, template=vcf_in)
            for rec in vcf_in:
                # Handle FreeBayes
                somatic_call = False
                if "VT" in rec.INFO:
                    somatic_call = rec.INFO["VT"] == "somatic"
                    if somatic_call:
                        rec.add_info("SOMATIC", True)
                    # Discard old record
                    del rec.INFO["VT"]
                if not somatic_call:
                    rec.add_filter("REJECT")
                vcf_out.write_record(rec)

    # Re-compress the file and swap it in for the original
    out_file = bgzip_and_index(out_file, config)
    move_vcf(in_file, "{0}.orig".format(in_file))
    move_vcf(out_file, in_file)
    with open(out_file, "w") as marker:
        marker.write("Moved to {0}".format(in_file))
def _bias_pvalue(ref_quals, alt_quals):
    """Return the p-value used for one quality-bias test.

    The second element of the ``mannwhitneyu`` result is returned when both
    lists are non-empty, their values are not all identical and the mean of
    ``alt_quals`` is below the mean of ``ref_quals``; otherwise 1.0.
    """
    # mannwhitneyu fails if all values the same
    if len(set(ref_quals).union(alt_quals)) == 1:
        return 1.0
    if len(ref_quals) == 0 or len(alt_quals) == 0:
        return 1.0
    # compute only if alternate quals are smaller on average
    if mean(alt_quals) < mean(ref_quals):
        return mannwhitneyu(ref_quals, alt_quals)[1]
    return 1.0


def main():
    """The main function

    Loads all variants of the input VCF, and for every variant not tagged
    ``INDEL`` collects mapping and base qualities of the reads carrying the
    reference or the alternate base. The phred-scaled bias p-values are
    stored in the INFO fields ``MB`` (mapping quality), ``BB`` (base
    quality) and ``CB`` (combined). Unless ``args.mtc`` is ``none``, a
    multiple testing correction is run on the values of ``args.mtc_tag`` and
    rejected variants get a filter tag. Finally the variants are written to
    the output VCF (only unfiltered ones with ``args.pass_only``).
    """
    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    assert os.path.exists(args.bam), (
        "BAM file %s does not exist" % args.bam)
    samfh = pysam.Samfile(args.bam)

    # setup vcf_reader
    if args.vcfin == '-':
        vcf_reader = vcf.VCFReader(sys.stdin)
    else:
        vcf_reader = vcf.VCFReader(filename=args.vcfin)

    variants = [r for r in vcf_reader]
    LOG.info("Loaded %d variants" % len(variants))

    # The lower-cased value can never equal 'None'; compare with 'none' so
    # that disabling the correction is actually reported as such.
    use_mtc = args.mtc.lower() != 'none'
    if use_mtc:
        LOG.info("Will use %s for MTC on %s with alpha %f" %
                 (args.mtc, args.mtc_tag, args.mtc_alpha))
    else:
        LOG.info("No multiple testing correction will be done")

    # setup vcf_writer
    if args.vcfout == '-':
        fh_out = sys.stdout
    else:
        if os.path.exists(args.vcfout):
            LOG.fatal(
                "Cowardly refusing to overwrite already existing file %s" %
                (args.vcfout))
            sys.exit(1)
        if args.vcfout[-3:] == '.gz':
            fh_out = gzip.open(args.vcfout, 'w')
        else:
            fh_out = open(args.vcfout, 'w')

    # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf clone
    # didn't
    vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep)

    pvalues = []
    # Index into ``variants`` for every entry of ``pvalues``. Skipped indels
    # contribute no p-value, so positions in ``pvalues`` and ``variants``
    # differ as soon as one variant is skipped.
    tested_idxs = []
    for (var_no, var) in enumerate(variants):
        if var_no % 500 == 1:
            LOG.info("Computing bias for var %d of %d" %
                     (var_no, len(variants)))

        if 'INDEL' in var.INFO:
            LOG.warning("Skipping unsupported indel variant %s:%d" %
                        (var.CHROM, var.POS))
            continue

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        ref_mquals = []
        alt_mquals = []
        ref_bquals = []
        alt_bquals = []

        for r in reads:
            if skip_read(r):
                continue

            # paired flag set without the proper-pair flag
            orphan = (r.flag & 0x1) and not (r.flag & 0x2)
            if orphan and not args.use_orphan:
                continue

            if r.mapq < args.min_mq:
                continue

            # position of the variant on the read
            query_positions = [
                qpos for (qpos, rpos) in r.aligned_pairs
                if rpos == var.POS - 1]
            assert len(query_positions) == 1
            vpos_on_read = query_positions[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            bq = ord(r.qqual[vpos_on_read]) - 33
            mq = r.mapq

            if bq < args.min_bq:
                continue

            assert len(var.REF) == 1 and len(var.ALT) == 1
            if b.upper() == var.REF[0].upper():
                ref_mquals.append(mq)
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_mquals.append(mq)
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" %
                          (b, var.CHROM, var.POS))
                continue

        LOG.debug("After filtering at %s:%d: %d ref mquals and %d alt mquals" %
                  (var.CHROM, var.POS, len(ref_mquals), len(alt_mquals)))

        m_pv = _bias_pvalue(ref_mquals, alt_mquals)
        b_pv = _bias_pvalue(ref_bquals, alt_bquals)
        c_pv = fisher_comb(m_pv, b_pv)

        LOG.debug("%s %d: mb %f bb %f cb %f" %
                  (var.CHROM, var.POS, m_pv, b_pv, c_pv))

        var.INFO['MB'] = prob_to_phredqual(m_pv)
        var.INFO['BB'] = prob_to_phredqual(b_pv)
        var.INFO['CB'] = prob_to_phredqual(c_pv)

        if use_mtc:
            pvalues.append(phredqual_to_prob(int(var.INFO[args.mtc_tag])))
            tested_idxs.append(var_no)

    if use_mtc:
        ftag = "%s<%f" % (args.mtc, args.mtc_alpha)
        rej_idxs = []
        if args.mtc in ('bonf', 'holmbonf'):
            # NOTE(review): 'holmbonf' runs the plain Bonferroni correction,
            # exactly as before -- confirm whether a Holm-Bonferroni
            # implementation was intended here.
            rej_idxs = [
                i for (i, p) in enumerate(
                    multiple_testing.Bonferroni(pvalues).corrected_pvals)
                if p < args.mtc_alpha]
        elif args.mtc == 'fdr':
            rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha)
        else:
            raise ValueError("unknown MTC method %s" % args.mtc)

        for i in rej_idxs:
            # pyvcf filter is empty if not set. lofreq's vcf clone was . or
            # PASS. ``i`` indexes ``pvalues``; map it back to the variant.
            variants[tested_idxs[i]].FILTER.append(ftag)

        LOG.info("%d of %d variants didn't pass filter" %
                 (len(rej_idxs), len(variants)))

    # pyvcf doesn't need write_metainfo or write_header
    for var in variants:
        filtered = len(var.FILTER) > 0 and var.FILTER not in [".", "PASS"]
        if args.pass_only and filtered:
            continue
        # LoFreq's vcf clone called this write_rec()
        vcf_writer.write_record(var)

    if fh_out is not sys.stdout:
        fh_out.close()
try: self.input_vcf_file = input_vcf_file self.reader = vcf.VCFReader(filename=self.input_vcf_file) except Exception, e: logging.error("Error opening input VCF file: " + str(e)) raise ValueError("Error opening input VCF file: " + str(e)) else: logging.error("Input VCF file does not exist!") raise ValueError("Input VCF file does not exist!") # loads writer try: if output_vcf_file is None or output_vcf_file == "": output_vcf = sys.stdout else: output_vcf = open(output_vcf_file, 'w') self.writer = vcf.VCFWriter(output_vcf, self.reader) except Exception, e: logging.error("Error opening output VCF file: " + str(e)) raise ValueError("Error opening output VCF file: " + str(e)) # loads writer for duplicated variants try: if output_vcf_file is None or output_vcf_file == "": output_duplicated_vcf = sys.stderr else: duplicated_vcf_file = os.path.join( os.path.dirname(os.path.realpath(output_vcf_file)), os.path.splitext(os.path.basename(output_vcf_file))[0] + ".duplicated.vcf") output_duplicated_vcf = open(duplicated_vcf_file, 'w') self.writer_duplicated = vcf.VCFWriter(output_duplicated_vcf, self.reader)
def main():
    """The main function

    Streams the variants of the input VCF and, for every single-base
    variant, counts the reads of the BAM file that show the reference or
    the alternate base at the variant position. A variant is tagged with
    ``FILTER_TAG`` when at least two alt observations (or at least 3% of
    the ref+alt bases) are seen and the alt base qualities sum to more than
    20. Tagged variants are dropped from the output with
    ``args.pass_only``; all other processed variants are written.
    """
    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    assert os.path.exists(args.bam), (
        "BAM file %s does not exist" % args.bam)
    samfh = pysam.Samfile(args.bam)

    # setup vcf_reader
    if args.vcfin[-3:] == '.gz':
        fh_in = gzip.open(args.vcfin)
        compressed = True
    else:
        compressed = False
        if args.vcfin == '-':
            fh_in = sys.stdin
        else:
            fh_in = open(args.vcfin)
    # NOTE(review): for '.gz' input an already decompressing handle is
    # passed together with compressed=True -- confirm the reader does not
    # decompress a second time.
    vcf_reader = vcf.VCFReader(fh_in, compressed)

    # setup vcf_writer
    if args.vcfout == '-':
        fh_out = sys.stdout
    else:
        if os.path.exists(args.vcfout):
            LOG.fatal("Cowardly refusing to overwrite already existing"
                      " file %s" % (args.vcfout))
            sys.exit(1)
        if args.vcfout[-3:] == '.gz':
            fh_out = gzip.open(args.vcfout, 'w')
        else:
            fh_out = open(args.vcfout, 'w')

    # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf
    # clone didn't
    vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep)
    # FIXME should add filter description to header

    for (var_no, var) in enumerate(vcf_reader):
        if var_no % 500 == 1:
            LOG.info("Analyzing variant %d" % (var_no))

        if 'INDEL' in var.INFO:
            LOG.warning("Skipping indel %s:%d" % (var.CHROM, var.POS))
            continue
        if len(var.REF) > 1 or len(var.ALT) > 1:
            LOG.warning("Skipping indel (not tagged as such) %s:%d" %
                        (var.CHROM, var.POS))
            continue
        # From here on REF and ALT hold at most one element each, which is
        # what the per-read assertion used to re-check for every read.

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        ref_bquals = []
        alt_bquals = []

        # FIXME huge code overlap with lofreq2_bias.py
        for r in reads:
            if skip_read(r):
                continue

            # determine position on read for variant to then determine
            # the current base and its basequal
            query_positions = [
                qpos for (qpos, rpos) in r.aligned_pairs
                if rpos == var.POS - 1]
            assert len(query_positions) == 1
            vpos_on_read = query_positions[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            bq = ord(r.qqual[vpos_on_read]) - 33

            if b.upper() == var.REF[0].upper():
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" %
                          (b, var.CHROM, var.POS))
                continue

        # " A candidate is rejected if, in the control data, there are
        # (i) >= 2 observations of the alternate allele or they represent
        # >= 3% of the reads; and (ii) their sum of quality scores is >=
        # 20."
        # NOTE(review): the quoted rule says ">= 20" while the check below
        # uses "> 20" -- confirm which one is intended.
        # FIXME set filter var.INFO['AN'] = True
        print_this_var = True
        num_alt = len(alt_bquals)
        num_ref = len(ref_bquals)
        num_both = num_alt + num_ref
        if num_both == 0:
            LOG.warning("No alt or ref bases for var %s" % var)
        elif ((num_alt >= 2 or num_alt / float(num_both) >= 0.03)
              and sum(alt_bquals) > 20):
            var.FILTER.append(FILTER_TAG)
            if args.pass_only:
                print_this_var = False

        if print_this_var:
            # LoFreq's vcf clone called this write_rec()
            vcf_writer.write_record(var)

    # The input handle has to be compared with stdin (not stdout), otherwise
    # the process' stdin is closed when reading from '-'.
    if fh_in is not sys.stdin:
        fh_in.close()
    if fh_out is not sys.stdout:
        fh_out.close()
def makevcf(lMutatationsFile, lVcfTemplate, lMixtureRatio, lJobName):
    """
    Create vcf file according to values in the mutations file
    The header will be picked up from the template vcf file

    One file ``<lJobName>_contributor<N>.vcf`` is written per entry of the
    ``:``-separated mixture ratio. For every csv row the N-th ``/``-separated
    value of the 'Alternate Alleles' and 'Allele Frequency' columns is used;
    a row is skipped for a contributor when either value is missing or is
    '-'.

    :param lMutatationsFile: Mutations csv file
    :param lVcfTemplate: Template vcf file for header
    :param lMixtureRatio: Mixture ratio
    :param lJobName: Jobname
    :return: A dict with paths of vcf files created
    """
    rVcfFilePathDict = {}
    contributorList = lMixtureRatio.split(":")
    NoAlleleSpecifiedChar = '-'

    # Reading in the vcf template file for header and a dummy record.
    # The handle is only needed until the first record has been read.
    with open(lVcfTemplate, 'r') as templateHandle:
        vcfTemplateObj = vcf.Reader(templateHandle)
        dummyRecord = next(vcfTemplateObj)

    # Create a vcf file for each contributor
    for currentContributor in range(len(contributorList)):
        contributorStrRep = 'contributor' + str(currentContributor + 1)
        vcfFileName = lJobName + "_" + contributorStrRep + ".vcf"
        vcfWriteObj = vcf.VCFWriter(open(vcfFileName, 'w'), vcfTemplateObj)
        logging.info('Creating vcf file for %s' % contributorStrRep)
        try:
            # NOTE: 'rU' is kept from the original; this mode is not
            # accepted by recent Python 3 releases.
            with open(lMutatationsFile, 'rU') as csvFileName:
                csvFile = csv.DictReader(csvFileName, dialect=csv.excel)
                for indRow in csvFile:
                    alternateAlleles = indRow['Alternate Alleles'].split('/')
                    alleleFreq = indRow['Allele Frequency'].split('/')
                    try:
                        currentAlternateAllele = alternateAlleles[
                            currentContributor]
                        currentAlleleFreq = alleleFreq[currentContributor]
                    except IndexError:
                        # The message formats the contributor with %d, so it
                        # needs the number, not the 'contributorN' string.
                        logging.warning(
                            'No alternate allele or freq given for contributor %d at site %s:%s',
                            currentContributor + 1, indRow['Chromosome'],
                            indRow['Position'])
                        continue

                    if (currentAlternateAllele == NoAlleleSpecifiedChar
                            or currentAlleleFreq == NoAlleleSpecifiedChar):
                        continue

                    currentRecord = deepcopy(dummyRecord)
                    currentRecord.CHROM = indRow['Chromosome']
                    currentRecord.POS = indRow['Position']
                    currentRecord.REF = indRow['Reference Allele']
                    # pyvcf takes a list for ALT
                    currentRecord.ALT = currentAlternateAllele.split(',')
                    currentRecord.INFO['AF'] = currentAlleleFreq
                    # if HOM then pl=3 else pl=default 2 from template
                    if float(currentAlleleFreq) >= 1.0:
                        currentRecord.INFO['pl'] = 3

                    # Mutation type: SUBSTITUTE for SNP, INSERT/DELETE for
                    # ins/del. Only the first alternate allele is looked at,
                    # as one contributor is not expected to specify several.
                    # The three length comparisons are exhaustive, so no
                    # "unknown type" branch is needed.
                    refLength = len(currentRecord.REF)
                    altLength = len(currentRecord.ALT[0])
                    if refLength == altLength:
                        currentRecord.INFO['mt'] = 'SUBSTITUTE'
                    elif refLength > altLength:
                        currentRecord.INFO['mt'] = 'DELETE'
                    else:
                        currentRecord.INFO['mt'] = 'INSERT'
                    vcfWriteObj.write_record(currentRecord)
        finally:
            # Close the output even when a row cannot be processed.
            vcfWriteObj.close()
        rVcfFilePathDict[contributorStrRep] = os.path.abspath(vcfFileName)
    return rVcfFilePathDict
def addTSSInfo(self, vcfInputFile):
    """Add the ``TSSOL`` INFO field to the records of a VCF file.

    Start and end of every record are lifted over with the
    ``hg38ToHg19.over.chain.gz`` chain, the lifted region is looked up with
    ``query.getTSS`` and the outcome is stored in the ``TSSOL`` INFO field.
    Records without a liftover result are written without that field. All
    records go to ``output.vcf`` in the current directory; two summary
    counts are printed at the end.

    :param vcfInputFile: path of the VCF file to read.
    """
    with open(vcfInputFile, 'r') as in_handle, \
            open('output.vcf', 'w') as out_handle:
        vcf_reader = vcf.Reader(in_handle)
        vcf_reader.infos['TSSOL'] = VcfInfo(
            'TSSOL', vcf_field_counts['A'], 'String',
            'Info indicates whether the variant overlapping with the'
            ' transcription start site(TSS)')
        vcf_writer = vcf.VCFWriter(out_handle, vcf_reader)
        query = SPARQLQueries.sparqlQueries()
        totalVar = 0
        tssOLVar = 0
        lo = LiftOver('hg38ToHg19.over.chain.gz')

        for record in vcf_reader:
            variantStart = record.start
            variantEnd = record.end
            variantChromosome = record.CHROM
            variantSubType = record.var_subtype
            isOverlapping = False

            # Adding chr prefix to the chromosome
            if "chr" not in variantChromosome:
                variantChromosome = "chr" + str(record.CHROM)

            # liftover to hg19 (chain file: hg38ToHg19)
            startHits = lo.convert_coordinate(variantChromosome, variantStart)
            print(variantStart)
            print(variantEnd)

            # The lookup can yield None as well as an empty list; both mean
            # there is nothing to lift over, and the end coordinate needs
            # the same check as the start coordinate.
            endHits = None
            if startHits:
                endHits = lo.convert_coordinate(variantChromosome, variantEnd)

            if startHits and endHits:
                variantChromosomehg19 = startHits[-1][0]
                variantStarthg19 = startHits[-1][1]
                variantEndhg19 = endHits[-1][1]

                # SPARQL query
                result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql',
                                      variantStarthg19, variantEndhg19,
                                      variantChromosomehg19)
                for row in result:
                    values = sparql.unpack_row(row)
                    cageStart = values[1]
                    # NOTE(review): the untranslated variantStart is compared
                    # with the query result and the cage end is not used --
                    # confirm this overlap rule.
                    if variantSubType == 'ins':
                        overlaps = variantStart > cageStart
                    else:
                        overlaps = cageStart > 0
                    if overlaps:
                        isOverlapping = True
                        tssOLVar = tssOLVar + 1
                        break
                totalVar = totalVar + 1
                record.add_info('TSSOL', [isOverlapping])
            else:
                # record.ID may be None, so it cannot be concatenated as is
                print("No liftover found for this pos = " + str(record.ID))
            vcf_writer.write_record(record)

    print("No of variants = " + str(totalVar))
    print("No of tss overlapping variants = " + str(tssOLVar))
self.v1 = vcf.Reader(open(vcf1)) self.v2 = vcf.Reader(open(vcf2)) #self.it = itertools.product(self.v1, self.v2) def intersect(self): i = j = 0 for left in self.v1: i += 1 for right in self.v2: j += 1 if left == right: print "(%d, %d) %s %s %s" % (i, j, left.start, left.end, left.CHROM) else: print "(%d, %d)" % (i, j) if __name__ == "__main__": #isec = intersection(sys.argv[1], sys.argv[2]) #isec.intersect() r1 = vcf.Reader(open(sys.argv[1])) r2 = vcf.Reader(open(sys.argv[2])) w = vcf.VCFWriter(open(sys.argv[3], 'w'), r1) intersectIter2(r1, r2, w)