def main(args, logger):
    """Main wrapper script for removing non-standard variants"""
    # Allowed bases
    good = set(['A', 'T', 'C', 'G'])

    # Reader
    reader = pysam.VariantFile(args.input_vcf)

    # Writer
    mode = 'wz' if args.output_filename.endswith('gz') else 'w'
    writer = pysam.VariantFile(args.output_filename, mode=mode, header=reader.header)

    # Process
    try:
        for record in reader.fetch():
            alleles = list(record.alleles)
            alleles_set = set(''.join(alleles).upper())
            check = alleles_set - good
            if check:
                logger.warning('Removing %s:%s:%s', record.chrom, record.pos, ','.join(alleles))
            else:
                writer.write(record)
    finally:
        reader.close()
        writer.close()

    if mode == 'wz':
        pysam.tabix_index(args.output_filename, preset='vcf', force=True)
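# A minimal sketch of how the filter above might be driven. It assumes only
# that `main` receives an object with `input_vcf` and `output_filename`
# attributes plus a stdlib logger; the argument names here are hypothetical.
if __name__ == '__main__':
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Remove non-ACGT variants')
    parser.add_argument('input_vcf')
    parser.add_argument('output_filename')
    main(parser.parse_args(), logging.getLogger('filter-vcf'))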
def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
    """Run snpEff 'eff' on a VCF file and write an annotated VCF.

    If outVcf ends in .vcf.gz, the output is bgzip-compressed and
    tabix-indexed.
    """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    else:
        tmpVcf = outVcf

    args = ' '.join([
        'eff',
        '-c', '{}/snpEff.config'.format(self.executable_path()),
        '-i', in_format,
        '-o', out_format,
        genome,
        '-treatAllAsProteinCoding false',
        '-noLog',
        '-ud 0',
        '-noStats',
        eff_options
    ])

    if inVcf.endswith('.gz'):
        pre_pipe = "zcat {} | ".format(inVcf)
    else:
        pre_pipe = "cat {} | ".format(inVcf)
    post_pipe = " > {}".format(tmpVcf)
    self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe, post_pipe=post_pipe)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)
def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genome,
        inVcf
    ]

    with open(tmpVcf, 'wt') as outf:
        self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz', seq_col=2, start_col=4, end_col=5,
                          meta_char='#', force=True)
        sys.stdout.write('OK\n')
    else:
        print('Compressing file...')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        print('Indexing file...')
        pysam.tabix_index(filename + '.gz', seq_col=1, start_col=2, end_col=2,
                          meta_char='#', force=True)
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compressed {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath)
                )
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath)
            )

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)"
        )
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
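# A small usage sketch for ensureIndexed, assuming a sorted 6-column BED file
# (the path is hypothetical). Once indexed, the bgzipped file can be queried
# region-by-region with pysam's TabixFile.
def _example_ensure_indexed(bed_path="annotations.sorted.bed"):
    indexed_path = ensureIndexed(bed_path, preset="bed")
    with pysam.TabixFile(indexed_path) as tbx:
        for row in tbx.fetch("chr1", 10000, 20000):
            print(row.split("\t")[:4])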
def testIndexPresetCompressed(self):
    '''test indexing via preset.'''
    pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
    pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
    checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
    checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
def batchTestHelper(self, modFile, pool, refLens):
    tmpName = tempfile.mkstemp('.tsv')[1]
    tmpfp = open(tmpName, 'wb')
    for line in modFile:
        tmpfp.write(line)
    tmpfp.close()
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2,
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    modFile.close()

    self.chromoID = '1'
    self.modobj = mod.Mod(tmpName)
    self.modobj.load(self.chromoID)

    # Build the read iterator once from the pool of test tuples.
    bamIter = [Read(tup[0], tup[1] + 1, tup[2]) for tup in pool]
    a = annot.Annotator(self.chromoID, refLens[self.chromoID],
                        self.modobj, bamIter)
    results = a.execute()
    for i, res in enumerate(results):
        self.assertEqual(polish(res[0]), pool[i][3])
        self.assertEqual(res[1], pool[i][4])
        self.assertEqual(res[2], pool[i][5])
        self.assertEqual(res[3], pool[i][6])
        self.assertEqual(res[4], pool[i][7])

    os.remove(tmpName)
    os.remove(tmpName + '.tbi')
def make_bias_track(args, bases=500000, splitsize=1000):
    """function to compute bias track"""
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j, itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset="bed", force=True)
def _make_assembly_vcf(self):
    tmp_vcf = self.final_assembly_vcf + '.tmp'
    cmd = ' '.join([
        self.samtools_exe, 'mpileup',
        '-t INFO/DPR,DV',
        '-A',
        '-f', self.final_assembly_fa,
        '-u',
        '-v',
        self.final_assembly_bam,
        '>', tmp_vcf
    ])
    common.syscall(cmd, verbose=self.verbose)

    cmd = ' '.join([
        self.bcftools_exe, 'call -m', tmp_vcf,
        '|',
        self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
        '>', self.final_assembly_read_depths + '.tmp'
    ])
    common.syscall(cmd, verbose=self.verbose)
    pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
    pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.final_assembly_read_depths + '.tmp')

    cmd = ' '.join([
        self.bcftools_exe, 'call -m -v', tmp_vcf,
        '|',
        self.bcftools_exe, 'filter',
        '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
        ' & MIN(DV)>=' + str(self.bcf_min_dv),
        ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
        ' & QUAL >=', str(self.bcf_min_qual), '"',
        '-o', self.final_assembly_vcf
    ])
    common.syscall(cmd, verbose=self.verbose)
    os.unlink(tmp_vcf)
def addVariantSet(self, variantFileName, dataset, referenceSet,
                  ontology, biosamples):
    inputVcf = os.path.join(self.inputDirectory, variantFileName)
    outputVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(inputVcf, outputVcf)
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [os.path.abspath(outputVcf + ".gz")],
        [os.path.abspath(outputVcf + ".gz.tbi")])
    variantSet.checkConsistency()
    for callSet in variantSet.getCallSets():
        for biosample in biosamples:
            if biosample.getLocalId() == callSet.getLocalId():
                callSet.setBiosampleId(biosample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def classDisbyMotif(paras):
    path_dis = paras["output_dis"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    print("[INFO] Scanning the distribution file of microsatellite!")
    vcffile = pysam.VariantFile(path_dis)
    File_motif = {}
    recordNum = 0
    for rec in vcffile.fetch():
        recordNum += 1
        recordInfo = rec.info
        motif = recordInfo["motif"]
        support_reads = int(recordInfo["support_reads"].split("|")[0])
        if support_reads > min_support_reads:
            if motif not in File_motif:
                File_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w', header=vcffile.header)
            File_motif[motif].write(rec)
    motifList = []
    for motif in File_motif:
        File_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                          force=True, preset="vcf")
        motifList.append(motif)
    set_value("motifList", motifList)
def make_index(file_name):
    """Make index file for input file"""
    f_bs, f_ext = os.path.splitext(file_name)

    def indexed(fn, ext):
        return os.path.exists(fn + ext)

    def uptodate(fn, ext):
        # the index is up to date if it is newer than the data file
        return os.path.getmtime(fn) < os.path.getmtime(fn + ext)

    infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
    if f_ext == ".fa":
        if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
            print(infomsg)
        else:
            pysam.faidx(file_name)
    elif f_ext in [".bam", ".cram"]:
        if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
            print(infomsg)
        else:
            pysam.index(file_name)
    elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
        if indexed(file_name, ".gz.tbi") and uptodate(file_name, ".gz.tbi"):
            print(infomsg)
        else:
            pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
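# A hedged sketch of calling make_index on different input types; the file
# names are hypothetical. Note the tabix branch compresses the file in place
# (appending ".gz") before indexing, so the input must be coordinate-sorted.
if __name__ == '__main__':
    for path in ("reference.fa", "alignments.bam", "calls.vcf"):
        make_index(path)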
def get_cov(args, bases=50000, splitsize=1000):
    """function to get coverages"""
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.cov.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset="bed", force=True)
def class_dis_by_motif(paras):
    path_pre = paras["output_pre"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    logger.info("Scanning the distribution file of microsatellite!")
    vcf_file = pysam.VariantFile(path_pre)
    files_motif = {}
    record_num = 0
    for rec in vcf_file.fetch():
        record_num += 1
        record_info = rec.info
        motif = record_info["motif"]
        support_reads = record_info["depth"]
        if support_reads > min_support_reads:
            if motif not in files_motif:
                files_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w', header=vcf_file.header)
            files_motif[motif].write(rec)
    motif_list = []
    for motif in files_motif:
        files_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                          force=True, preset="vcf")
        motif_list.append(motif)
    set_value("motif_list", motif_list)
def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c', '--chr-col', type='int', default=0, dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', default=1, dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname, seq_col=options.chrom_col,
                      start_col=options.start_col, end_col=options.end_col)
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, index_fname, out_fname = args

    # Create index.
    if options.preset:
        # Preset type.
        pysam.tabix_index(filename=index_fname, preset=options.preset,
                          keep_original=True, index_filename=out_fname)
    else:
        # For interval files; column indices are 0-based.
        pysam.tabix_index(filename=index_fname,
                          seq_col=(options.chrom_col - 1),
                          start_col=(options.start_col - 1),
                          end_col=(options.end_col - 1),
                          keep_original=True, index_filename=out_fname)

    if os.path.getsize(out_fname) == 0:
        sys.stderr.write(
            "The converted tabix index file is empty, meaning the input data is invalid.")
def compressVcf(vcfname, forceflag=False, remove=False):
    """Runs bgzip and tabix on input VCF file.

    Using the pysam library, this function runs the bgzip and tabix utilities
    on the given input file. By default, this will not overwrite an existing
    zipped file, but will overwrite an existing index. `remove` can be set to
    delete the unzipped file.

    Parameters
    ----------
    vcfname : str
        Name of uncompressed VCF file
    forceflag : bool (False)
        If true, will overwrite (vcfname).gz if it exists
    remove : bool (False)
        If true, will delete uncompressed source file

    Returns
    -------
    cvcfname : str
        Filepath to compressed VCF file
    """
    cvcfname = vcfname + ".gz"
    pysam.tabix_compress(vcfname, cvcfname, force=forceflag)
    pysam.tabix_index(cvcfname, preset="vcf", force=True)
    if remove:
        os.remove(vcfname)
    return cvcfname
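# Example use of compressVcf, assuming an uncompressed, position-sorted
# "variants.vcf" in the working directory (hypothetical path): compress,
# index, and drop the original in one call.
def _example_compress_vcf():
    gz_path = compressVcf("variants.vcf", forceflag=True, remove=True)
    # gz_path == "variants.vcf.gz"; "variants.vcf.gz.tbi" now sits alongside it.
    return gz_path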
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None,
             start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset,
                                        keep_original=True, index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True, index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.exit("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
def run(argv):
    if '-h' in argv or '--help' in argv:
        print('Make a single large tabixed file of all phenotypes data')
        exit(1)

    if should_run():
        # we don't need `ffi.new('char[]', ...)` because args are `const`
        ret = lib.cffi_make_matrix(sites_filepath.encode('utf8'),
                                   common_filepaths['pheno']('*').encode('utf8'),
                                   matrix_gz_tmp_filepath.encode('utf8'))
        ret_bytes = ffi.string(ret, maxlen=1000)
        if ret_bytes != b'ok':
            raise PheWebError(
                'The portion of `pheweb matrix` written in c++/cffi failed with the message ' +
                repr(ret_bytes))
        os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath)
        pysam.tabix_index(
            filename=matrix_gz_filepath, force=True,
            seq_col=0, start_col=1, end_col=1  # note: these are 0-based, but `/usr/bin/tabix` is 1-based
        )
    else:
        print('matrix is up-to-date!')
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph, bedgraph + '.gz')
    pysam.tabix_index(bedgraph + '.gz', seq_col=0, start_col=1, end_col=2, zerobased=True)
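# A usage sketch: bedGraph is 0-based, half-open, hence seq_col=0, start_col=1,
# end_col=2 and zerobased=True above. After indexing, intervals can be fetched
# by region (the file name here is hypothetical).
def _example_fetch_bedgraph(bedgraph="signal.bedgraph"):
    tabix_bedgraph(bedgraph)
    with pysam.TabixFile(bedgraph + '.gz') as tbx:
        return [line for line in tbx.fetch("chr1", 0, 1000)]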
def gff32tabix(self, file_sorted_gff3, file_sorted_gz_gff3, file_gff3_tbi):  # pylint: disable=no-self-use,unused-argument
    """
    GFF3 to Tabix

    Compresses the sorted GFF3 file and then uses Tabix to generate an index
    of the GFF3 file.

    Parameters
    ----------
    file_sorted_gff3 : str
        Location of a sorted GFF3 file
    file_sorted_gz_gff3 : str
        Location of the bgzip compressed GFF3 file
    file_gff3_tbi : str
        Location of the Tabix index file

    Example
    -------
    .. code-block:: python
       :linenos:

       if not self.gff32tabix(self, file_sorted_gff3, gz_file, tbi_file):
           output_metadata.set_exception(
               Exception(
                   "gff32tabix: Could not process files {}, {}.".format(*input_files)))
    """
    pysam.tabix_compress(file_sorted_gff3, file_sorted_gz_gff3)  # pylint: disable=no-member
    pysam.tabix_index(file_sorted_gz_gff3, preset='gff')  # pylint: disable=no-member
    return True
def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join(sorted(genomes))
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)
        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)
def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join(
        [util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)
        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)
def testEmptyFileVCFGZWithIndex(self):
    with get_temp_context("tmp_testEmptyFile.vcf") as fn:
        with open(fn, "w"):
            pass
        # tabix_index will automatically compress
        pysam.tabix_index(fn, preset="vcf", force=True)
        self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
def setUp(self):
    self.tmpfilename = "tmp_%s.vcf" % id(self)
    shutil.copyfile(self.filename, self.tmpfilename)
    pysam.tabix_index(self.tmpfilename, preset="vcf")
    self.tabix = pysam.Tabixfile(self.tmpfilename + ".gz")
    self.compare = [x[:-1].split("\t")
                    for x in open(self.filename, "r")
                    if not x.startswith("#")]
def test_indexing_to_custom_location_works(self):
    '''test indexing a file with a non-default location.'''
    index_path = get_temp_filename(suffix='custom.tbi')
    pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True)
    self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
    os.unlink(index_path)
def testIndexPresetUncompressed(self):
    '''test indexing via preset.'''
    pysam.tabix_index(self.tmpfilename, preset=self.preset)
    # check if uncompressed file has been removed
    self.assertEqual(os.path.exists(self.tmpfilename), False)
    checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
    checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
def indexingVariantFile(self, varFile):
    """Index variant file with Tabix"""
    logging.info('Trying to index file: {}'.format(varFile))
    pysam.tabix_index(varFile, force=True, preset="vcf")
    if not self.isVariantFileIndexed(varFile):
        raise Exception("Can not index file: {}".format(varFile))
    return True
def get_refseq_allele(vcf_file, fasta_file, out_file, verbose=False):
    """
    Output a reference VCF file that contains the ref/alt allele coding
    with the ref allele matching the reference sequence. This assumes that
    all alleles are on the forward strand. If neither the ref nor the alt
    allele is found in the reference sequence, the alleles are output as
    missing.

    Parameters
    ----------
    vcf_file : str
        VCF/BCF file name.
    fasta_file : str
        Fasta file name. Must be unzipped and have an fai index.
    out_file : str
        Output VCF file name.
    """
    rdr = fasta_fai.Reader(fasta_file)
    vcf_in = pysam.VariantFile(vcf_file, mode='r')
    vcf_out = pysam.VariantFile(out_file, mode='w')
    for r in vcf_in.header.records:
        vcf_out.header.add_record(r)

    counter = 0
    for rec in vcf_in.fetch():
        counter += 1
        if verbose and counter % 1000 == 0:
            print(counter, "records")
        rec_out = vcf_out.new_record()
        rec_out.id = rec.id
        rec_out.pos = rec.pos
        rec_out.chrom = rec.chrom

        # Find the allele that matches the reference sequence and swap it
        # into the first (REF) position.
        o = list(range(len(rec.alleles)))
        orv = list(reversed(o))
        ref_i = 0
        for i in orv:
            a = rec.alleles[i]
            refseq_base = rdr.get_seq(rec.chrom, rec.pos - 1, len(a))
            if a == refseq_base:
                ref_i = i
        o[ref_i] = 0
        o[0] = ref_i
        alleles = [rec.alleles[i] for i in o]
        if len(alleles) == 1:
            alleles.append('.')
        rec_out.alleles = tuple(alleles)
        vcf_out.write(rec_out)

    rdr.close()
    vcf_in.close()
    vcf_out.close()

    if out_file.endswith(".gz"):
        pysam.tabix_index(out_file, preset="vcf", force=True)
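# Hedged usage sketch for get_refseq_allele; all paths are hypothetical. An
# output name ending in ".gz" triggers the function's own tabix indexing of
# the recoded VCF.
def _example_refseq_allele():
    get_refseq_allele("input.vcf.gz", "genome.fa", "recoded.vcf.gz", verbose=True)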
def test_indexing_with_lineskipping_works(self):
    '''test indexing via preset and lineskip.'''
    pysam.tabix_index(self.tmpfilename, seq_col=0, start_col=3, end_col=4,
                      line_skip=1, zerobased=False)
    self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
def test_vcf_with_tbi_index(self):
    with get_temp_context("tmp_fn.vcf") as fn:
        shutil.copyfile(self.vcf_filename, fn)
        pysam.tabix_index(fn, preset="vcf", force=True)
        self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
        self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))

        with pysam.VariantFile(fn + ".gz") as inf:
            self.assertEqual(len(list(inf.fetch("20"))), 3)
def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7,
                      meta_char='#', force=True)
    sys.stdout.write('OK\n')
def run_nfr(args):
    """run nfr calling"""
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict=chrs_fasta,
                                min_offset=max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source="BAM file")
    chunks.merge()
    maxQueueSize = args.cores * 10
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam,
                           max_occ=args.max_occ, max_occ_upper=args.max_occ_upper,
                           fasta=args.fasta, pwm=args.pwm)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    nfr_handle = open(args.out + '.nfrpos.bed', 'w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target=_writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph', 'w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target=_writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz', force=True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset="bed", force=True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force=True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset="bed", force=True)
def make_non_somatic_panel(file_lst, panelname, genome, cosmic_db, cnt):
    indel_lst = filter_indels(file_lst, genome, cosmic_db, cnt)
    vcf_data = to_vcf_data(indel_lst)
    with open(panelname, "w") as f:
        f.write(vcf_header() + "\n")
        f.write("\n".join(vcf_data))
    pysam.tabix_index(panelname, preset="vcf")
def test_indexing_with_explict_columns_works(self):
    '''test indexing via explicit columns.'''
    pysam.tabix_index(self.tmpfilename, seq_col=0, start_col=3, end_col=4,
                      line_skip=0, zerobased=False)
    self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f),
                         os.path.join(options.output_dir, f + '.gz'),
                         force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'),
                      seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')
def test_indexing_to_custom_location_works(self):
    '''test indexing a file with a non-default location.'''
    index_path = get_temp_filename(suffix='custom.tbi')
    pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True)
    self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
    os.unlink(index_path)
def _index_with_tabix(self):
    """Compress and index output file by Tabix"""
    pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
    pysam.tabix_index(self._fn + '.gz', seq_col=self.idx_chrom,
                      start_col=self.idx_start, end_col=self.idx_end,
                      meta_char='#', force=True)
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info("Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warn(
                        "Removing loci {0}:{1} where ref and alt alleles are same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf + '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)
    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )
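# Reading the indexed file back — a sketch assuming the file's first two
# columns are chrom and pos with one header line, as the index options above
# declare; the path and region are hypothetical.
def _example_read_indexed(ivf_path="variants.tsv.gz"):
    with pysam.TabixFile(ivf_path) as tbx:
        return [row.split('\t') for row in tbx.fetch('1', 100000, 200000)]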
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed', 'w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed',
                         args.out + '.nucmap_combined.bed.gz', force=True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset="bed", force=True)
def testEmptyFileVCFGZWithIndex(self):
    with open("tmp_testEmptyFile.vcf", "w"):
        pass
    pysam.tabix_index("tmp_testEmptyFile.vcf", preset="vcf", force=True)
    self.assertRaises(ValueError, pysam.VariantFile, "tmp_testEmptyFile.vcf.gz")
    os.unlink("tmp_testEmptyFile.vcf.gz")
    os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
def index(destDir, inputFilename, fileColumnNumList=None, preset=None):
    """
    Create a tabix index file for genomic position datasource tsv files.

    Prerequisites (for genomic position indexed): the input file has three
    columns that can be mapped to chromosome, start position, and end
    position without any modification. For example,
    ['hg19.oreganno.chrom', 'hg19.oreganno.chromStart', 'hg19.oreganno.chromEnd']
    in oreganno.hg19.txt.

    This will overwrite an existing index (since the force parameter is set
    to True in the pysam.tabix_index() call). Also, in cases where
    inputFilename doesn't end with ".gz", a compressed file will be created
    and indexed.

    :param destDir: destination directory
    :param inputFilename: tsv file input
    :param fileColumnNumList: ordered list of the corresponding entries
        (column numbers) in the tsv file. Typically, this would be
        [chr, start, end] or [gene, startAA, endAA]
    :param preset: if preset is provided, the column coordinates are taken
        from a preset. Valid values for preset are "gff", "bed", "sam",
        "vcf", "psltbl", and "pileup".
    """
    fileColumnNumList = [] if fileColumnNumList is None else fileColumnNumList
    inputFilename = os.path.abspath(inputFilename)
    fileDir = os.path.dirname(inputFilename)
    fileName, fileExtension = os.path.splitext(os.path.basename(inputFilename))

    if fileExtension in (".gz",):
        # Ensure the .gz.tbi file is there as well
        inputIndexFilename = os.path.join(fileDir, ".".join([inputFilename, "tbi"]))
        if not os.path.exists(inputIndexFilename):
            msg = "Missing tabix index file %s." % inputIndexFilename
            raise TabixIndexerFileMissingError(msg)

        outputFilename = os.path.join(destDir, ".".join([fileName, "gz"]))
        shutil.copyfile(inputFilename, outputFilename)

        outputIndexFilename = os.path.join(destDir, ".".join([fileName, "gz", "tbi"]))
        shutil.copyfile(inputIndexFilename, outputIndexFilename)

        return outputFilename

    outputFilename = os.path.join(destDir, "".join([fileName, ".tabix_indexed", fileExtension]))

    # Copy the input file to the output file.
    shutil.copyfile(inputFilename, outputFilename)

    if preset in ("gff", "bed", "sam", "vcf", "psltbl", "pileup"):
        tabix_index = pysam.tabix_index(filename=outputFilename, force=True, preset=preset)
    else:
        # Have to specify min_size=0 in pysam 0.8.1 to get pysam to correctly output a .tbi file
        tabix_index = pysam.tabix_index(filename=outputFilename, force=True,
                                        seq_col=fileColumnNumList[0],
                                        start_col=fileColumnNumList[1],
                                        end_col=fileColumnNumList[2])
    return tabix_index
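# Example calls for index(), with hypothetical paths: a preset-based VCF index
# and an explicit-column index using 0-based column numbers as the docstring
# describes.
def _example_index_datasource():
    vcf_out = index("/tmp/ds", "calls.vcf", preset="vcf")
    tsv_out = index("/tmp/ds", "oreganno.hg19.txt", fileColumnNumList=[1, 2, 3])
    return vcf_out, tsv_out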
def run(args):
    fin = IO.fopen(args.input, "r")
    outfile = args.input
    if not args.sorted:
        rows = [i for i in TableIO.parse(fin, args.format)]
        rows.sort()
        name = splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile, "w")
        for i in rows:
            print(i, file=out)
        out.close()
    # strip trailing digits from the format name, e.g. "bed6" -> "bed"
    format = args.format.translate(str.maketrans('', '', digits))
    tabix_index(outfile, preset=format)
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None,
             start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset,
                                        keep_original=True, index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True, index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.stderr.write("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
def merge_vcfs(in_vcfs_dir, contigs, out_vcf):
    logger.info("Merging per-chromosome VCFs from %s" % in_vcfs_dir)
    header_done = False
    out_vcf_file = open(out_vcf, "w")
    for contig in contigs:
        chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name)
        if os.path.isfile(chr_vcf):
            chr_tabix_file = pysam.Tabixfile(chr_vcf)
            if not header_done:
                print_header(chr_tabix_file.header, out_vcf_file)
                header_done = True
            for entry in chr_tabix_file.fetch():
                out_vcf_file.write("%s\n" % entry)
            chr_tabix_file.close()
    out_vcf_file.close()
    pysam.tabix_index(out_vcf, force=True, preset="vcf")
def _prepare(self):
    if not os.path.isfile(self.ingtf + ".gz.tbi"):
        print("Generate indexed GTF (tabix) file: '{0}'...".format(self.ingtf))
        compressed_gtf = pysam.tabix_index(self.ingtf, preset="gff")
    else:
        compressed_gtf = self.ingtf + ".gz"
    self.tabixfile = pysam.Tabixfile(compressed_gtf)
def addVariantSet(self, variantFileName, dataset, referenceSet, ontology):
    inputVcf = os.path.join(self.inputDirectory, variantFileName)
    outputVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(inputVcf, outputVcf)
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [outputVcf + ".gz"], [outputVcf + ".gz.tbi"])
    variantSet.checkConsistency()
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def bgzip_index(original_file, new_file, file_format):
    """
    Compress and index a file with bgzip/tabix.

    :param original_file: path of the uncompressed input file
    :param new_file: path of the bgzip-compressed output file (Fasta only)
    :param file_format: 'fa' for Fasta or 'vcf' for VCF input
    :return:
    """
    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
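# A short sketch of bgzip_index for both supported formats; paths are
# hypothetical. The 'fa' branch writes new_file plus a .fai index, while the
# 'vcf' branch ignores new_file and compresses original_file in place to
# original_file + '.gz' with a .tbi index.
def _example_bgzip_index():
    bgzip_index("genome.fa", "genome.fa.gz", "fa")
    bgzip_index("calls.vcf", "calls.vcf.gz", "vcf")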
def __tabix(self, file_name):
    """tabix into gz and tbi file"""
    return pysam.tabix_index(file_name, force=True,
                             seq_col=combivep_settings.LJB_PARSED_0_INDEX_CHROM,
                             start_col=combivep_settings.LJB_PARSED_0_INDEX_POS,
                             end_col=combivep_settings.LJB_PARSED_0_INDEX_POS,
                             zerobased=False)
def getTabixMod(filename):
    '''Unzip a mod file, use bgzip to rezip it, and build a tabix index.'''
    logger = logging.getLogger('tmpmod')
    logger.info('extracting MOD file ...')
    modfp = gzip.open(filename, 'rb')
    tmpName = tempfile.mkstemp('.tsv')[1]
    tmpfp = open(tmpName, 'wb')
    tmpfp.writelines(modfp)
    tmpfp.close()
    modfp.close()
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2,
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    logger.info('temporary file %s created', tmpName)
    return tmpName

# print(getTabixMod("../data/B.mod"))