def main(args, logger):
    """Main wrapper script for removing non-standard variants"""
    # Allowed bases
    good = set(['A', 'T', 'C', 'G'])

    # Reader
    reader = pysam.VariantFile(args.input_vcf)

    # Writer
    mode = 'wz' if args.output_filename.endswith('gz') else 'w'
    writer = pysam.VariantFile(args.output_filename, mode=mode, header=reader.header)

    # Process
    try:
        for record in reader.fetch():
            alleles = list(record.alleles)
            alleles_set = set(''.join(alleles).upper())
            check = alleles_set - good
            if check:
                logger.warning('Removing %s:%s:%s', record.chrom, record.pos, ','.join(alleles))
            else:
                writer.write(record)
    finally:
        reader.close()
        writer.close()

    if mode == 'wz':
        pysam.tabix_index(args.output_filename, preset='vcf', force=True)
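# A minimal sketch of how the filter above might be driven. It assumes only
# that `main` receives an object with `input_vcf` and `output_filename`
# attributes plus a stdlib logger; the argument names here are hypothetical.
if __name__ == '__main__':
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Remove non-ACGT variants')
    parser.add_argument('input_vcf')
    parser.add_argument('output_filename')
    main(parser.parse_args(), logging.getLogger('filter-vcf'))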
def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
    """Run snpEff 'eff' on a VCF file and write an annotated VCF.

    If outVcf ends in .vcf.gz, the output is bgzip-compressed and
    tabix-indexed.
    """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    else:
        tmpVcf = outVcf

    args = ' '.join([
        'eff',
        '-c', '{}/snpEff.config'.format(self.executable_path()),
        '-i', in_format,
        '-o', out_format,
        genome,
        '-treatAllAsProteinCoding false',
        '-noLog',
        '-ud 0',
        '-noStats',
        eff_options
    ])

    if inVcf.endswith('.gz'):
        pre_pipe = "zcat {} | ".format(inVcf)
    else:
        pre_pipe = "cat {} | ".format(inVcf)
    post_pipe = " > {}".format(tmpVcf)
    self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe, post_pipe=post_pipe)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)
def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genome,
        inVcf
    ]

    with open(tmpVcf, 'wt') as outf:
        self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz', seq_col=2, start_col=4, end_col=5,
                          meta_char='#', force=True)
        sys.stdout.write('OK\n')
    else:
        print('Compressing file...')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        print('Indexing file...')
        pysam.tabix_index(filename + '.gz', seq_col=1, start_col=2, end_col=2,
                          meta_char='#', force=True)
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compressed {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath)
                )
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath)
            )

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)"
        )
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
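# A small usage sketch for ensureIndexed, assuming a sorted 6-column BED file
# (the path is hypothetical). Once indexed, the bgzipped file can be queried
# region-by-region with pysam's TabixFile.
def _example_ensure_indexed(bed_path="annotations.sorted.bed"):
    indexed_path = ensureIndexed(bed_path, preset="bed")
    with pysam.TabixFile(indexed_path) as tbx:
        for row in tbx.fetch("chr1", 10000, 20000):
            print(row.split("\t")[:4])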
def testIndexPresetCompressed(self):
    '''test indexing via preset.'''
    pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
    pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
    checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
    checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
def batchTestHelper(self, modFile, pool, refLens):
    tmpName = tempfile.mkstemp('.tsv')[1]
    tmpfp = open(tmpName, 'wb')
    for line in modFile:
        tmpfp.write(line)
    tmpfp.close()
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2,
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    modFile.close()

    self.chromoID = '1'
    self.modobj = mod.Mod(tmpName)
    self.modobj.load(self.chromoID)

    # Build the read iterator once from the pool of test tuples.
    bamIter = [Read(tup[0], tup[1] + 1, tup[2]) for tup in pool]
    a = annot.Annotator(self.chromoID, refLens[self.chromoID],
                        self.modobj, bamIter)
    results = a.execute()
    for i, res in enumerate(results):
        self.assertEqual(polish(res[0]), pool[i][3])
        self.assertEqual(res[1], pool[i][4])
        self.assertEqual(res[2], pool[i][5])
        self.assertEqual(res[3], pool[i][6])
        self.assertEqual(res[4], pool[i][7])

    os.remove(tmpName)
    os.remove(tmpName + '.tbi')
def make_bias_track(args, bases=500000, splitsize=1000):
    """function to compute bias track"""
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j, itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset="bed", force=True)
def _make_assembly_vcf(self):
    tmp_vcf = self.final_assembly_vcf + '.tmp'
    cmd = ' '.join([
        self.samtools_exe, 'mpileup',
        '-t INFO/DPR,DV',
        '-A',
        '-f', self.final_assembly_fa,
        '-u',
        '-v',
        self.final_assembly_bam,
        '>', tmp_vcf
    ])
    common.syscall(cmd, verbose=self.verbose)

    cmd = ' '.join([
        self.bcftools_exe, 'call -m', tmp_vcf,
        '|',
        self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
        '>', self.final_assembly_read_depths + '.tmp'
    ])
    common.syscall(cmd, verbose=self.verbose)
    pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
    pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.final_assembly_read_depths + '.tmp')

    cmd = ' '.join([
        self.bcftools_exe, 'call -m -v', tmp_vcf,
        '|',
        self.bcftools_exe, 'filter',
        '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
        ' & MIN(DV)>=' + str(self.bcf_min_dv),
        ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
        ' & QUAL >=', str(self.bcf_min_qual), '"',
        '-o', self.final_assembly_vcf
    ])
    common.syscall(cmd, verbose=self.verbose)
    os.unlink(tmp_vcf)
def addVariantSet(self, variantFileName, dataset, referenceSet,
                  ontology, biosamples):
    inputVcf = os.path.join(self.inputDirectory, variantFileName)
    outputVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(inputVcf, outputVcf)
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [os.path.abspath(outputVcf + ".gz")],
        [os.path.abspath(outputVcf + ".gz.tbi")])
    variantSet.checkConsistency()
    for callSet in variantSet.getCallSets():
        for biosample in biosamples:
            if biosample.getLocalId() == callSet.getLocalId():
                callSet.setBiosampleId(biosample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def classDisbyMotif(paras):
    path_dis = paras["output_dis"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    print("[INFO] Scanning the distribution file of microsatellite!")
    vcffile = pysam.VariantFile(path_dis)
    File_motif = {}
    recordNum = 0
    for rec in vcffile.fetch():
        recordNum += 1
        recordInfo = rec.info
        motif = recordInfo["motif"]
        support_reads = int(recordInfo["support_reads"].split("|")[0])
        if support_reads > min_support_reads:
            if motif not in File_motif:
                File_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w', header=vcffile.header)
            File_motif[motif].write(rec)
    motifList = []
    for motif in File_motif:
        File_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                          force=True, preset="vcf")
        motifList.append(motif)
    set_value("motifList", motifList)
def make_index(file_name):
    """Make index file for input file"""
    f_bs, f_ext = os.path.splitext(file_name)

    def indexed(fn, ext):
        return os.path.exists(fn + ext)

    def uptodate(fn, ext):
        # the index is up to date if it is newer than the data file
        return os.path.getmtime(fn) < os.path.getmtime(fn + ext)

    infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
    if f_ext == ".fa":
        if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
            print(infomsg)
        else:
            pysam.faidx(file_name)
    elif f_ext in [".bam", ".cram"]:
        if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
            print(infomsg)
        else:
            pysam.index(file_name)
    elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
        if indexed(file_name, ".gz.tbi") and uptodate(file_name, ".gz.tbi"):
            print(infomsg)
        else:
            pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
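# A hedged sketch of calling make_index on different input types; the file
# names are hypothetical. Note the tabix branch compresses the file in place
# (appending ".gz") before indexing, so the input must be coordinate-sorted.
if __name__ == '__main__':
    for path in ("reference.fa", "alignments.bam", "calls.vcf"):
        make_index(path)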
def get_cov(args, bases=50000, splitsize=1000):
    """function to get coverages"""
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.cov.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset="bed", force=True)
def class_dis_by_motif(paras):
    path_pre = paras["output_pre"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    logger.info("Scanning the distribution file of microsatellite!")
    vcf_file = pysam.VariantFile(path_pre)
    files_motif = {}
    record_num = 0
    for rec in vcf_file.fetch():
        record_num += 1
        record_info = rec.info
        motif = record_info["motif"]
        support_reads = record_info["depth"]
        if support_reads > min_support_reads:
            if motif not in files_motif:
                files_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w', header=vcf_file.header)
            files_motif[motif].write(rec)
    motif_list = []
    for motif in files_motif:
        files_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                          force=True, preset="vcf")
        motif_list.append(motif)
    set_value("motif_list", motif_list)
def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c', '--chr-col', type='int', default=0, dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', default=1, dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname, seq_col=options.chrom_col,
                      start_col=options.start_col, end_col=options.end_col)
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, index_fname, out_fname = args

    # Create index.
    if options.preset:
        # Preset type.
        pysam.tabix_index(filename=index_fname, preset=options.preset,
                          keep_original=True, index_filename=out_fname)
    else:
        # For interval files; column indices are 0-based.
        pysam.tabix_index(filename=index_fname,
                          seq_col=(options.chrom_col - 1),
                          start_col=(options.start_col - 1),
                          end_col=(options.end_col - 1),
                          keep_original=True, index_filename=out_fname)

    if os.path.getsize(out_fname) == 0:
        sys.stderr.write(
            "The converted tabix index file is empty, meaning the input data is invalid.")
def compressVcf(vcfname, forceflag=False, remove=False):
    """Runs bgzip and tabix on input VCF file.

    Using the pysam library, this function runs the bgzip and tabix utilities
    on the given input file. By default, this will not overwrite an existing
    zipped file, but will overwrite an existing index. `remove` can be set to
    delete the unzipped file.

    Parameters
    ----------
    vcfname : str
        Name of uncompressed VCF file
    forceflag : bool (False)
        If true, will overwrite (vcfname).gz if it exists
    remove : bool (False)
        If true, will delete uncompressed source file

    Returns
    -------
    cvcfname : str
        Filepath to compressed VCF file
    """
    cvcfname = vcfname + ".gz"
    pysam.tabix_compress(vcfname, cvcfname, force=forceflag)
    pysam.tabix_index(cvcfname, preset="vcf", force=True)
    if remove:
        os.remove(vcfname)
    return cvcfname
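# Example use of compressVcf, assuming an uncompressed, position-sorted
# "variants.vcf" in the working directory (hypothetical path): compress,
# index, and drop the original in one call.
def _example_compress_vcf():
    gz_path = compressVcf("variants.vcf", forceflag=True, remove=True)
    # gz_path == "variants.vcf.gz"; "variants.vcf.gz.tbi" now sits alongside it.
    return gz_path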
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None,
             start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset,
                                        keep_original=True, index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True, index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.exit("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
def run(argv):
    if '-h' in argv or '--help' in argv:
        print('Make a single large tabixed file of all phenotypes data')
        exit(1)

    if should_run():
        # we don't need `ffi.new('char[]', ...)` because args are `const`
        ret = lib.cffi_make_matrix(sites_filepath.encode('utf8'),
                                   common_filepaths['pheno']('*').encode('utf8'),
                                   matrix_gz_tmp_filepath.encode('utf8'))
        ret_bytes = ffi.string(ret, maxlen=1000)
        if ret_bytes != b'ok':
            raise PheWebError(
                'The portion of `pheweb matrix` written in c++/cffi failed with the message ' +
                repr(ret_bytes))
        os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath)
        pysam.tabix_index(
            filename=matrix_gz_filepath, force=True,
            seq_col=0, start_col=1, end_col=1  # note: these are 0-based, but `/usr/bin/tabix` is 1-based
        )
    else:
        print('matrix is up-to-date!')
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph, bedgraph + '.gz')
    pysam.tabix_index(bedgraph + '.gz', seq_col=0, start_col=1, end_col=2, zerobased=True)
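# A usage sketch: bedGraph is 0-based, half-open, hence seq_col=0, start_col=1,
# end_col=2 and zerobased=True above. After indexing, intervals can be fetched
# by region (the file name here is hypothetical).
def _example_fetch_bedgraph(bedgraph="signal.bedgraph"):
    tabix_bedgraph(bedgraph)
    with pysam.TabixFile(bedgraph + '.gz') as tbx:
        return [line for line in tbx.fetch("chr1", 0, 1000)]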
def gff32tabix(self, file_sorted_gff3, file_sorted_gz_gff3, file_gff3_tbi):  # pylint: disable=no-self-use,unused-argument
    """
    GFF3 to Tabix

    Compresses the sorted GFF3 file and then uses Tabix to generate an index
    of the GFF3 file.

    Parameters
    ----------
    file_sorted_gff3 : str
        Location of a sorted GFF3 file
    file_sorted_gz_gff3 : str
        Location of the bgzip compressed GFF3 file
    file_gff3_tbi : str
        Location of the Tabix index file

    Example
    -------
    .. code-block:: python
       :linenos:

       if not self.gff32tabix(self, file_sorted_gff3, gz_file, tbi_file):
           output_metadata.set_exception(
               Exception(
                   "gff32tabix: Could not process files {}, {}.".format(*input_files)))
    """
    pysam.tabix_compress(file_sorted_gff3, file_sorted_gz_gff3)  # pylint: disable=no-member
    pysam.tabix_index(file_sorted_gz_gff3, preset='gff')  # pylint: disable=no-member
    return True
def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join(sorted(genomes))
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)
        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)
def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """Annotate variants in VCF file with translation consequences using snpEff."""
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join(
        [util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)
        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)
def testEmptyFileVCFGZWithIndex(self):
    with get_temp_context("tmp_testEmptyFile.vcf") as fn:
        with open(fn, "w"):
            pass
        # tabix_index will automatically compress
        pysam.tabix_index(fn, preset="vcf", force=True)
        self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
def setUp(self):
    self.tmpfilename = "tmp_%s.vcf" % id(self)
    shutil.copyfile(self.filename, self.tmpfilename)
    pysam.tabix_index(self.tmpfilename, preset="vcf")
    self.tabix = pysam.Tabixfile(self.tmpfilename + ".gz")
    self.compare = [x[:-1].split("\t")
                    for x in open(self.filename, "r")
                    if not x.startswith("#")]
def test_indexing_to_custom_location_works(self):
    '''test indexing a file with a non-default location.'''
    index_path = get_temp_filename(suffix='custom.tbi')
    pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True)
    self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
    os.unlink(index_path)
def testIndexPresetUncompressed(self):
    '''test indexing via preset.'''
    pysam.tabix_index(self.tmpfilename, preset=self.preset)
    # check if uncompressed file has been removed
    self.assertEqual(os.path.exists(self.tmpfilename), False)
    checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
    checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
def indexingVariantFile(self, varFile):
    """Index variant file with Tabix"""
    logging.info('Trying to index file: {}'.format(varFile))
    pysam.tabix_index(varFile, force=True, preset="vcf")
    if not self.isVariantFileIndexed(varFile):
        raise Exception("Can not index file: {}".format(varFile))
    return True
def get_refseq_allele(vcf_file, fasta_file, out_file, verbose=False):
    """
    Output a reference VCF file that contains the ref/alt allele coding
    with the ref allele matching the reference sequence. This assumes that
    all alleles are on the forward strand. If neither the ref nor the alt
    allele is found in the reference sequence, the alleles are output as
    missing.

    Parameters
    ----------
    vcf_file : str
        VCF/BCF file name.
    fasta_file : str
        Fasta file name. Must be unzipped and have an fai index.
    out_file : str
        Output VCF file name.
    """
    rdr = fasta_fai.Reader(fasta_file)
    vcf_in = pysam.VariantFile(vcf_file, mode='r')
    vcf_out = pysam.VariantFile(out_file, mode='w')
    for r in vcf_in.header.records:
        vcf_out.header.add_record(r)

    counter = 0
    for rec in vcf_in.fetch():
        counter += 1
        if verbose and counter % 1000 == 0:
            print(counter, "records")
        rec_out = vcf_out.new_record()
        rec_out.id = rec.id
        rec_out.pos = rec.pos
        rec_out.chrom = rec.chrom

        # Find the allele that matches the reference sequence and swap it
        # into the first (REF) position.
        o = list(range(len(rec.alleles)))
        orv = list(reversed(o))
        ref_i = 0
        for i in orv:
            a = rec.alleles[i]
            refseq_base = rdr.get_seq(rec.chrom, rec.pos - 1, len(a))
            if a == refseq_base:
                ref_i = i
        o[ref_i] = 0
        o[0] = ref_i
        alleles = [rec.alleles[i] for i in o]
        if len(alleles) == 1:
            alleles.append('.')
        rec_out.alleles = tuple(alleles)
        vcf_out.write(rec_out)

    rdr.close()
    vcf_in.close()
    vcf_out.close()

    if out_file.endswith(".gz"):
        pysam.tabix_index(out_file, preset="vcf", force=True)
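# Hedged usage sketch for get_refseq_allele; all paths are hypothetical. An
# output name ending in ".gz" triggers the function's own tabix indexing of
# the recoded VCF.
def _example_refseq_allele():
    get_refseq_allele("input.vcf.gz", "genome.fa", "recoded.vcf.gz", verbose=True)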
def test_indexing_with_lineskipping_works(self):
    '''test indexing via preset and lineskip.'''
    pysam.tabix_index(self.tmpfilename, seq_col=0, start_col=3, end_col=4,
                      line_skip=1, zerobased=False)
    self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
def test_vcf_with_tbi_index(self):
    with get_temp_context("tmp_fn.vcf") as fn:
        shutil.copyfile(self.vcf_filename, fn)
        pysam.tabix_index(fn, preset="vcf", force=True)
        self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
        self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))

        with pysam.VariantFile(fn + ".gz") as inf:
            self.assertEqual(len(list(inf.fetch("20"))), 3)
def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7,
                      meta_char='#', force=True)
    sys.stdout.write('OK\n')
def run_nfr(args):
    """run nfr calling"""
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict=chrs_fasta,
                                min_offset=max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source="BAM file")
    chunks.merge()
    maxQueueSize = args.cores * 10
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam,
                           max_occ=args.max_occ, max_occ_upper=args.max_occ_upper,
                           fasta=args.fasta, pwm=args.pwm)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    nfr_handle = open(args.out + '.nfrpos.bed', 'w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target=_writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph', 'w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target=_writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz', force=True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset="bed", force=True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force=True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset="bed", force=True)
def make_non_somatic_panel(file_lst, panelname, genome, cosmic_db, cnt):
    indel_lst = filter_indels(file_lst, genome, cosmic_db, cnt)
    vcf_data = to_vcf_data(indel_lst)
    with open(panelname, "w") as f:
        f.write(vcf_header() + "\n")
        f.write("\n".join(vcf_data))
    pysam.tabix_index(panelname, preset="vcf")
def test_indexing_with_explict_columns_works(self):
    '''test indexing via explicit columns.'''
    pysam.tabix_index(self.tmpfilename, seq_col=0, start_col=3, end_col=4,
                      line_skip=0, zerobased=False)
    self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f),
                         os.path.join(options.output_dir, f + '.gz'),
                         force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'),
                      seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')
def test_indexing_to_custom_location_works(self):
    '''test indexing a file with a non-default location.'''
    index_path = get_temp_filename(suffix='custom.tbi')
    pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True)
    self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
    os.unlink(index_path)
def _index_with_tabix(self):
    """Compress and index output file by Tabix"""
    pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
    pysam.tabix_index(self._fn + '.gz', seq_col=self.idx_chrom,
                      start_col=self.idx_start, end_col=self.idx_end,
                      meta_char='#', force=True)
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info("Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warn(
                        "Removing loci {0}:{1} where ref and alt alleles are same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf + '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)
    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )
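# Reading the indexed file back — a sketch assuming the file's first two
# columns are chrom and pos with one header line, as the index options above
# declare; the path and region are hypothetical.
def _example_read_indexed(ivf_path="variants.tsv.gz"):
    with pysam.TabixFile(ivf_path) as tbx:
        return [row.split('\t') for row in tbx.fetch('1', 100000, 200000)]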
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed', 'w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed',
                         args.out + '.nucmap_combined.bed.gz', force=True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset="bed", force=True)
def testEmptyFileVCFGZWithIndex(self):
    with open("tmp_testEmptyFile.vcf", "w"):
        pass
    pysam.tabix_index("tmp_testEmptyFile.vcf", preset="vcf", force=True)
    self.assertRaises(ValueError, pysam.VariantFile, "tmp_testEmptyFile.vcf.gz")
    os.unlink("tmp_testEmptyFile.vcf.gz")
    os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
def index(destDir, inputFilename, fileColumnNumList=None, preset=None):
    """
    Create a tabix index file for genomic position datasource tsv files.

    Prerequisites (for genomic position indexed): the input file has three
    columns that can be mapped to chromosome, start position, and end
    position without any modification. For example,
    ['hg19.oreganno.chrom', 'hg19.oreganno.chromStart', 'hg19.oreganno.chromEnd']
    in oreganno.hg19.txt.

    This will overwrite an existing index (since the force parameter is set
    to True in the pysam.tabix_index() call). Also, in cases where
    inputFilename doesn't end with ".gz", a compressed file will be created
    and indexed.

    :param destDir: destination directory
    :param inputFilename: tsv file input
    :param fileColumnNumList: ordered list of the corresponding entries
        (column numbers) in the tsv file. Typically, this would be
        [chr, start, end] or [gene, startAA, endAA]
    :param preset: if preset is provided, the column coordinates are taken
        from a preset. Valid values for preset are "gff", "bed", "sam",
        "vcf", "psltbl", and "pileup".
    """
    fileColumnNumList = [] if fileColumnNumList is None else fileColumnNumList
    inputFilename = os.path.abspath(inputFilename)
    fileDir = os.path.dirname(inputFilename)
    fileName, fileExtension = os.path.splitext(os.path.basename(inputFilename))

    if fileExtension in (".gz",):
        # Ensure the .gz.tbi file is there as well
        inputIndexFilename = os.path.join(fileDir, ".".join([inputFilename, "tbi"]))
        if not os.path.exists(inputIndexFilename):
            msg = "Missing tabix index file %s." % inputIndexFilename
            raise TabixIndexerFileMissingError(msg)

        outputFilename = os.path.join(destDir, ".".join([fileName, "gz"]))
        shutil.copyfile(inputFilename, outputFilename)

        outputIndexFilename = os.path.join(destDir, ".".join([fileName, "gz", "tbi"]))
        shutil.copyfile(inputIndexFilename, outputIndexFilename)

        return outputFilename

    outputFilename = os.path.join(destDir, "".join([fileName, ".tabix_indexed", fileExtension]))

    # Copy the input file to the output file.
    shutil.copyfile(inputFilename, outputFilename)

    if preset in ("gff", "bed", "sam", "vcf", "psltbl", "pileup"):
        tabix_index = pysam.tabix_index(filename=outputFilename, force=True, preset=preset)
    else:
        # Have to specify min_size=0 in pysam 0.8.1 to get pysam to correctly output a .tbi file
        tabix_index = pysam.tabix_index(filename=outputFilename, force=True,
                                        seq_col=fileColumnNumList[0],
                                        start_col=fileColumnNumList[1],
                                        end_col=fileColumnNumList[2])
    return tabix_index
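# Example calls for index(), with hypothetical paths: a preset-based VCF index
# and an explicit-column index using 0-based column numbers as the docstring
# describes.
def _example_index_datasource():
    vcf_out = index("/tmp/ds", "calls.vcf", preset="vcf")
    tsv_out = index("/tmp/ds", "oreganno.hg19.txt", fileColumnNumList=[1, 2, 3])
    return vcf_out, tsv_out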
def run(args):
    fin = IO.fopen(args.input, "r")
    outfile = args.input
    if not args.sorted:
        rows = [i for i in TableIO.parse(fin, args.format)]
        rows.sort()
        name = splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile, "w")
        for i in rows:
            print(i, file=out)
        out.close()
    # strip trailing digits from the format name, e.g. "bed6" -> "bed"
    format = args.format.translate(str.maketrans('', '', digits))
    tabix_index(outfile, preset=format)
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None,
             start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset,
                                        keep_original=True, index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True, index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.stderr.write("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
def merge_vcfs(in_vcfs_dir, contigs, out_vcf):
    logger.info("Merging per-chromosome VCFs from %s" % in_vcfs_dir)
    header_done = False
    out_vcf_file = open(out_vcf, "w")
    for contig in contigs:
        chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name)
        if os.path.isfile(chr_vcf):
            chr_tabix_file = pysam.Tabixfile(chr_vcf)
            if not header_done:
                print_header(chr_tabix_file.header, out_vcf_file)
                header_done = True
            for entry in chr_tabix_file.fetch():
                out_vcf_file.write("%s\n" % entry)
            chr_tabix_file.close()
    out_vcf_file.close()
    pysam.tabix_index(out_vcf, force=True, preset="vcf")
def _prepare(self):
    if not os.path.isfile(self.ingtf + ".gz.tbi"):
        print("Generate indexed GTF (tabix) file: '{0}'...".format(self.ingtf))
        compressed_gtf = pysam.tabix_index(self.ingtf, preset="gff")
    else:
        compressed_gtf = self.ingtf + ".gz"
    self.tabixfile = pysam.Tabixfile(compressed_gtf)
def addVariantSet(self, variantFileName, dataset, referenceSet, ontology):
    inputVcf = os.path.join(self.inputDirectory, variantFileName)
    outputVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(inputVcf, outputVcf)
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [outputVcf + ".gz"], [outputVcf + ".gz.tbi"])
    variantSet.checkConsistency()
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def bgzip_index(original_file, new_file, file_format):
    """
    Compress and index a file with bgzip/tabix.

    :param original_file: path of the uncompressed input file
    :param new_file: path of the bgzip-compressed output file (Fasta only)
    :param file_format: 'fa' for Fasta or 'vcf' for VCF input
    :return:
    """
    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
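# A short sketch of bgzip_index for both supported formats; paths are
# hypothetical. The 'fa' branch writes new_file plus a .fai index, while the
# 'vcf' branch ignores new_file and compresses original_file in place to
# original_file + '.gz' with a .tbi index.
def _example_bgzip_index():
    bgzip_index("genome.fa", "genome.fa.gz", "fa")
    bgzip_index("calls.vcf", "calls.vcf.gz", "vcf")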
def __tabix(self, file_name):
    """tabix into gz and tbi file"""
    return pysam.tabix_index(file_name, force=True,
                             seq_col=combivep_settings.LJB_PARSED_0_INDEX_CHROM,
                             start_col=combivep_settings.LJB_PARSED_0_INDEX_POS,
                             end_col=combivep_settings.LJB_PARSED_0_INDEX_POS,
                             zerobased=False)
def getTabixMod(filename):
    '''Unzip a mod file, use bgzip to rezip it, and build a tabix index.'''
    logger = logging.getLogger('tmpmod')
    logger.info('extracting MOD file ...')
    modfp = gzip.open(filename, 'rb')
    tmpName = tempfile.mkstemp('.tsv')[1]
    tmpfp = open(tmpName, 'wb')
    tmpfp.writelines(modfp)
    tmpfp.close()
    modfp.close()
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2,
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    logger.info('temporary file %s created', tmpName)
    return tmpName

# print(getTabixMod("../data/B.mod"))