Пример #1
0
def extracAllels(chrom, vcf_coord, var_descr, genref):
    '''Compute the reference/alternative allele pair for one variant.

    var_descr must contain one of sub(C->T), ins(CCCT) or del(5).

    Parameters:
        chrom: chromosome name (string).
        vcf_coord: variant position as a string; it is concatenated into
            the faidx region and converted with int() for deletions.
        var_descr: variant description in one of the forms above.
        genref: reference FASTA path handed to pysam.faidx.

    Returns:
        The reference and alternative alleles joined by a tab.

    Writes a diagnostic to stderr and exits with status 1 when the
    described reference base disagrees with the FASTA, or when the
    description has none of the known forms.
    '''
    def _abort(message, with_context=True):
        # All diagnostics go to stderr.  (sys.api_version is an int, not a
        # stream, so it cannot be used as a print target.)
        sys.stderr.write(message + '\n')
        if with_context:
            sys.stderr.write(' '.join([chrom, vcf_coord, var_descr]) + '\n')
        sys.exit(1)

    # Element [1] of the faidx result is taken as the sequence line.
    # NOTE(review): assumes pysam.faidx returns a list of lines with the
    # header first -- confirm against the installed pysam version.
    ref_allel = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' +
                            vcf_coord)[1].strip()

    if 'sub' in var_descr:
        yy = var_descr.split('->')
        ref = yy[0][-1]  # last character before '->'
        if ref != ref_allel:
            _abort('ref allels do not match, exiting...')
        return '\t'.join([ref, yy[1][0]])

    if 'ins' in var_descr:
        # Text between '(' and the final character is the inserted sequence.
        inserted = var_descr.split('(')[1][:-1]
        return '\t'.join([ref_allel, ref_allel + inserted])

    if 'del' in var_descr:
        del_len = int(var_descr.split('(')[1][:-1])
        vcf_coord_end = str(int(vcf_coord) + del_len)
        # Anchor base plus the deleted bases.
        ref = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' +
                          vcf_coord_end)[1].strip()
        if ref[0] != ref_allel:
            _abort('ref allels do not match in del, exiting...')
        return '\t'.join([ref, ref_allel])

    _abort('format not found, exiting...', with_context=False)
Пример #2
0
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels with GATK.

    Indexes the BAM and the reference, optionally subsets the BAM to
    `region`, and runs target creation plus indel realignment when the
    BAM has aligned reads.  Without aligned reads the input BAM is copied
    to `out_file` (if given) or returned unchanged.

    Returns the path of the resulting BAM file.
    """
    runner = broad.runner_from_config(config)
    for task_name, target in (("picard_index", align_bam),
                              ("picard_index_ref", ref_file)):
        runner.run_fn(task_name, target)

    # Make sure a samtools-style index of the reference exists.
    fai_present = os.path.exists("%s.fai" % ref_file)
    if not fai_present:
        pysam.faidx(ref_file)

    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        runner.run_fn("picard_index", align_bam)

    if not has_aligned_reads(align_bam, region):
        # Nothing to realign: pass the input through.
        if out_file:
            shutil.copy(align_bam, out_file)
            return out_file
        return align_bam

    variant_regions = config["algorithm"].get("variant_regions", None)
    target_intervals = gatk_realigner_targets(
        runner, align_bam, ref_file, dbsnp, region, out_file, deep_coverage,
        variant_regions)
    # A separate picard_fixmate pass is not run here; per the original
    # note, recent GATK (> Feb 2011) does this on the fly.
    return gatk_indel_realignment(
        runner, align_bam, ref_file, target_intervals, region, out_file,
        deep_coverage)
Пример #3
0
    def __init__(self, filename):
        """Open a FASTA file and load its .fai index, building it if absent."""
        index_path = filename + '.fai'
        if not os.path.exists(index_path):
            # pysam is only needed when the index has to be created.
            import pysam
            pysam.faidx(filename)

        self.fasta = open(filename)
        self.index = self.load_index(index_path)
Пример #4
0
def resolved_tool_contract_runner(resolved_contract):
    """Run the variant caller as described by a resolved tool contract.

    Input files are read as [alignments, reference] and output files as
    [gff, contigset dataset, fastq].  The FASTA output path is derived
    from the dataset path by replacing ".contigset.xml" with ".fasta".
    When args_runner returns 0, the FASTA is indexed and wrapped in a
    ContigSet that is written to the dataset path.

    Returns the value returned by args_runner.
    """
    task = resolved_contract.task
    alignment_path = task.input_files[0]
    reference_path = task.input_files[1]
    gff_path = task.output_files[0]
    dataset_path = task.output_files[1]
    # Dots are escaped so that only a literal ".contigset.xml" is rewritten;
    # unescaped, each "." would match any character.
    fasta_path = re.sub(r"\.contigset\.xml", ".fasta", dataset_path)
    fastq_path = task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    # Kept in its own name instead of rebinding the contract variable.
    exit_code = args_runner(args_)
    if exit_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return exit_code
def upstream_and_downstream_seq(args):
    """Write the 1000-base flanks of a region to FASTA files.

    Reads args.coords (parsed with split_coords), args.genome (reference
    FASTA) and args.directory (output prefix).  Two files are written to
    args.directory + "tmp/": "downstream.fa" (the 1000 bases before the
    start) and "upstream.fa" (the reverse complement of the 1000 bases
    after the end).  The directory is created when missing.
    """
    chromosome = split_coords(args.coords)[0]
    start = str(split_coords(args.coords)[1])
    downstream = str(int(start) - 1000)
    end = str(split_coords(args.coords.replace('"', ""))[2])
    upstream = str(int(end) + 1000)

    # samtools faidx pulls the flanking regions from the reference genome.
    downstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + downstream + "-" + start),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + end + "-" + upstream),
        generic_dna)

    # Skip as many characters as the first (header) line is long, drop the
    # final character, and upper-case the rest.
    downstream_seq = downstream_fa[
        len(downstream_fa.split('\n')[0]):-1].upper()
    # Same trimming, then reverse-complement.
    reverse_compliment_upstream_seq = upstream_fa[
        len(upstream_fa.split('\n')[0]):-1].upper().reverse_complement()

    # Wrap both sequences in records carrying an ID for the FASTA header.
    downstream_seq = SeqRecord(downstream_seq, id="downstream_sequence")
    reverse_compliment_upstream_seq = SeqRecord(
        reverse_compliment_upstream_seq, id="upstream_sequence")

    tmp_dir = args.directory + "tmp/"
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # `with` guarantees both files are flushed and closed.
    with open(os.path.join(tmp_dir, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_seq.id) + "\n" + str(downstream_seq.seq))

    with open(os.path.join(tmp_dir, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(reverse_compliment_upstream_seq.id) + "\n" +
            str(reverse_compliment_upstream_seq.seq))
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    """Run variantCaller, then wrap its FASTA output in a contigset.

    The contigset conversion is best-effort: any exception is reported
    through say() and the conversion is skipped.
    """
    # Extra switches arrive through `options`, e.g. '--log-file foo.log',
    # '--verbose', '--debug' (requires 'ipdb'), '-j NWORKERS',
    # '--algorithm quiver', '--diploid', '--minConfidence 40',
    # '--minCoverage 5', '--alignmentSetRefWindows'.
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    command_line = cmd.format(
        log_level=log_level,
        options=options,
        referenceset=referenceset,
        fastq=fastq,
        gff=gff,
        fasta=fasta,
        alignmentset=alignmentset,
    )
    system(command_line)
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))

        # Index the FASTA, then convert it to contigset.xml.
        import pysam
        pysam.faidx(fasta) # pylint: disable=no-member

        wrapped = ContigSet(fasta, strict=True)
        wrapped.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
Пример #7
0
        def do_download(output_path):
            """Resolve the download link on the page and fetch it.

            The page at base_url + url is scanned with each regexp in turn;
            the first regexp yielding exactly one match wins.  The resolved
            link is recorded in "<output file>.url", the file is downloaded,
            and ".fasta" outputs are indexed with pysam.faidx.
            """
            real_url = self.base_url + url
            raw = get_page(real_url)
            if not raw:  # pragma: no cover
                raise ValueError("Retrieving url failed: %s" % real_url)

            hit = None
            for pattern in regexps:
                found = re.findall(pattern, raw)
                if len(found) == 1:
                    hit = found[0]
                    break
            if hit is None:
                raise ValueError(  # pragma: no cover - defensive
                    "Found either too few or too many for every regexps. \nRaw was %s"
                    % (raw,)
                )

            target = output_path / output_filename
            # Keep a record of where the file came from.
            Path(str(target) + ".url").write_text((real_url + hit))
            download_func(real_url + match_transformer(hit), target)

            if Path(output_filename).suffix == ".fasta":
                import pysam

                pysam.faidx(str((output_path / output_filename).absolute()))
Пример #8
0
def chrom_length(fasta_in):
    """
    Index a genome FASTA file and store the resulting .fai next to it.

    The .fai file format is described at
    http://www.htslib.org/doc/faidx.html

    Parameters
    ----------
    fasta_in : str
        Path to genome FASTA file (can be .gz).

    Returns
    -------
    str
        Absolute path to the generated .fai file.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    # Index a decompressed temporary copy, then move the index so that it
    # sits beside the input file.
    unpacked = iCount.files.decompress_to_tempfile(fasta_in)
    pysam.faidx(unpacked)  # pylint: disable=no-member

    destination = os.path.abspath(fasta_in + '.fai')
    shutil.move(unpacked + '.fai', destination)
    LOGGER.info('Fai file saved to : %s', destination)
    return destination
Пример #9
0
    def __init__(self, filename):
        """Open `filename` and load its .fai index, creating the index first
        when it does not exist yet."""
        fai_name = filename + '.fai'
        index_missing = not os.path.exists(fai_name)
        if index_missing:
            # Lazy import: pysam is required only to build a missing index.
            import pysam
            pysam.faidx(filename)

        self.fasta = open(filename)
        self.index = self.load_index(fai_name)
def _faidx_region_sequence(reference, chrom, pos):
    """Return the sequence pysam.faidx reports for the region chrom:pos-pos.

    The faidx output is flattened; the '>' marker and the region name are
    then removed so that only sequence characters remain.
    """
    region = chrom + ":" + str(pos) + "-" + str(pos)
    seq = "".join(item.rstrip('\n') for item in pysam.faidx(reference, region))
    return seq.replace('>', '').replace(region, '')


def merge_mut2(mutation_file_list, output_file, reference):
    """Merge mutation calls from several samples into one VCF-like table.

    Each input is either a VCF (".vcf" / ".vcf.gz"; the first eight
    columns are used as the key) or a tab-separated table with columns
    chromosome, position, <unused>, ref, alt.  In the table form "-" as
    ref marks an insertion and "-" as alt a deletion; both are rewritten
    with an anchor base fetched from `reference`.

    Every mutation that is NOT present in all samples is written to
    `output_file`, followed by the comma-separated 1-based indices of the
    samples carrying it.

    Parameters:
        mutation_file_list: paths of the per-sample mutation files.
        output_file: path of the merged output.
        reference: reference FASTA, used only for indels in table inputs.
    """
    mut2sample = {}
    sample_num = 0
    for sample_num, mut_file in enumerate(mutation_file_list, 1):
        is_vcf = mut_file.endswith((".vcf", ".vcf.gz"))
        opener = gzip.open if mut_file.endswith(".gz") else open
        # NOTE(review): gzip.open(..., 'r') yields bytes on Python 3; the
        # original mode is kept as is -- confirm the target interpreter.
        hin2 = opener(mut_file, 'r')
        try:
            for line2 in hin2:
                F2 = line2.rstrip('\n').split('\t')
                if F2[0].startswith('#') or F2[0] == "Chr":
                    continue  # comment or header line

                if is_vcf:
                    key = '\t'.join(F2[0:8])
                else:
                    pos, ref, alt = F2[1], F2[3], F2[4]

                    # insertion: anchor on the base at the given position
                    if F2[3] == "-":
                        seq = _faidx_region_sequence(reference, F2[0], F2[1])
                        ref, alt = seq, seq + F2[4]

                    # deletion: anchor on the base just before the position
                    if F2[4] == "-":
                        anchor = int(F2[1]) - 1
                        seq = _faidx_region_sequence(reference, F2[0], anchor)
                        pos, ref, alt = str(anchor), seq + F2[3], seq

                    QUAL = 60
                    INFO = "SOMATIC"
                    key = '\t'.join([F2[0], pos, '.', ref, alt, str(QUAL),
                                     "PASS", INFO])

                mut2sample.setdefault(key, []).append(str(sample_num))
        finally:
            # Close every input, even when a line fails to parse.
            hin2.close()

    with open(output_file, 'w') as hout:
        for mut in sorted(mut2sample):
            # Mutations shared by every sample are dropped.
            if len(mut2sample[mut]) == sample_num:
                continue
            hout.write(mut + '\t' + ','.join(mut2sample[mut]) + '\n')
Пример #11
0
def extracAllels(chrom, vcf_coord, var_descr, genref):
    '''Compute the reference/alternative allele pair for one variant.

    var_descr must contain one of sub(C->T), ins(CCCT) or del(5).

    Parameters:
        chrom: chromosome name (string).
        vcf_coord: variant position as a string.
        var_descr: variant description in one of the forms above.
        genref: reference FASTA path handed to pysam.faidx.

    Returns:
        The reference and alternative alleles joined by a tab.

    On a reference mismatch or an unrecognised description a diagnostic
    is written to stderr (so it cannot mix with regular output) and the
    process exits with status 1.
    '''
    def _die(message, show_variant=True):
        # sys.stderr.write works under both Python 2 and Python 3.
        sys.stderr.write(message + '\n')
        if show_variant:
            sys.stderr.write(' '.join([chrom, vcf_coord, var_descr]) + '\n')
        sys.exit(1)

    # Element [1] of the faidx result is taken as the sequence line.
    # NOTE(review): assumes pysam.faidx returns a list of lines with the
    # header first -- confirm against the installed pysam version.
    ref_allel = pysam.faidx(genref, chrom+':'+vcf_coord+'-'+vcf_coord)[1].strip()

    if 'sub' in var_descr:
        yy = var_descr.split('->')
        ref = yy[0][-1]  # last character before '->'
        if ref != ref_allel:
            _die('ref allels do not match, exiting...')
        return '\t'.join([ref, yy[1][0]])

    if 'ins' in var_descr:
        # Text between '(' and the final character is the inserted sequence.
        inserted = var_descr.split('(')[1][:-1]
        return '\t'.join([ref_allel, ref_allel + inserted])

    if 'del' in var_descr:
        del_len = int(var_descr.split('(')[1][:-1])
        vcf_coord_end = str(int(vcf_coord) + del_len)
        # Anchor base plus the deleted bases.
        ref = pysam.faidx(genref, chrom+':'+vcf_coord+'-'+vcf_coord_end)[1].strip()
        if ref[0] != ref_allel:
            _die('ref allels do not match in del, exiting...')
        return '\t'.join([ref, ref_allel])

    _die('format not found, exiting...', show_variant=False)
Пример #12
0
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    """Write the reference sequence of each BED region as FASTA.

    Parameters:
        bed: iterable of regions exposing chrom, start, end, strand and
            (when include_name is set) name.
        ref_fasta: reference FASTA path; indexed with pysam.faidx when no
            .fai file exists.
        min_size: regions shorter than this are skipped.
        stranded: when true and the region has a strand, the strand is
            added to the header and '-' regions are reverse-complemented.
        include_name: prefix each header with "<region name>|".
        out: writable stream receiving the FASTA records.

    Regions on chromosomes missing from the index are skipped.
    """
    if not os.path.exists('%s.fai' % ref_fasta):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)
    try:
        # First column of the .fai file lists the indexed sequence names.
        with open('%s.fai' % ref_fasta) as f:
            refs = set(line.split('\t')[0].strip() for line in f)

        name = ''
        for region in bed:
            if include_name:
                name = '%s|' % (region.name.strip())

            if region.end - region.start >= min_size and region.chrom in refs:
                seq = fasta.fetch(region.chrom, region.start, region.end)
                if stranded and region.strand:
                    if region.strand == '-':
                        seq = revcomp(seq)
                    out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
                else:
                    # Five placeholders for five values; a sixth "%s" here
                    # would raise "not enough arguments for format string".
                    out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))
    finally:
        # Release the FASTA handle even when fetching or writing fails.
        fasta.close()
Пример #13
0
def finalize_outputs(options, tdb_writer, out_fasta, out_genepred,
                     out_genepred_annovar, out_fasta_annovar, gbk_dir, out_id,
                     out_excl):
    """Close all output files and build the derived indexes/archives.

    Finalizes the transcript-database writer, closes the output handles,
    indexes the FASTA outputs with pysam.faidx and, when options.gbk is
    set, zips and removes the GenBank directory.
    """
    tdb_writer.finalize(options)

    for handle in (out_fasta, out_id, out_excl):
        handle.close()

    # The main FASTA is closed by now, so it can be indexed.
    pysam.faidx(options.output + '.fa')

    out_genepred.close()

    if options.annovar:
        for handle in (out_genepred_annovar, out_fasta_annovar):
            handle.close()
        annovar_fasta = '{}_refGeneMrna.fa'.format(options.output)
        pysam.faidx(annovar_fasta)

    if options.gbk:
        archive_base = '{}_gbk'.format(options.output)
        shutil.make_archive(archive_base, "zip", './', gbk_dir)
        shutil.rmtree(gbk_dir)
Пример #14
0
def gen_restricted_reference(reference,
                             regions_bed,
                             out_reference,
                             use_short_contigs_names=False):
    """Write a FASTA containing only the BED regions of `reference`.

    Each region becomes one contig, named either by its 1-based position
    in the BED file (use_short_contigs_names) or "<chrom>_<start>_<end>".
    The new FASTA is indexed with pysam.faidx.

    Returns `out_reference`.
    """
    logger = logging.getLogger(gen_restricted_reference.__name__)

    reference_handle = pysam.Fastafile(reference)
    try:
        regions_bedtool = pybedtools.BedTool(regions_bed)

        with open(out_reference, "w") as out_fasta:
            for region_index, region in enumerate(regions_bedtool, start=1):
                sequence = reference_handle.fetch(reference=str(region.chrom),
                                                  start=region.start,
                                                  end=region.end)
                region_name = str(region_index) if use_short_contigs_names else (
                    "%s_%d_%d" % (str(region.chrom), region.start, region.end))
                # Records are newline-separated; no trailing newline is
                # written after the last one.
                separator = "" if region_index == 1 else "\n"
                out_fasta.write("{}>{}\n{}".format(separator, region_name,
                                                   sequence))
        pysam.faidx(out_reference)
        logger.info("Lifted over the reference to {}".format(out_reference))
    finally:
        # Close the source FASTA even when a fetch or write fails.
        reference_handle.close()

    return out_reference
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options,
        log_level):
    """Invoke variantCaller and wrap the FASTA it produces in a contigset.

    Failures during the contigset conversion are reported via say() and
    do not propagate.
    """
    # `options` may carry switches such as '--log-file foo.log',
    # '--verbose', '--debug' (requires 'ipdb'), '-j NWORKERS',
    # '--algorithm quiver', '--diploid', '--minConfidence 40',
    # '--minCoverage 5' or '--alignmentSetRefWindows'.
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    fields = dict(
        log_level=log_level,
        options=options,
        referenceset=referenceset,
        fastq=fastq,
        gff=gff,
        fasta=fasta,
        alignmentset=alignmentset,
    )
    system(cmd.format(**fields))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))

        # The FASTA must be indexed before it is wrapped as contigset.xml.
        import pysam
        pysam.faidx(fasta)  # pylint: disable=no-member

        contigs = ContigSet(fasta, strict=True)
        contigs.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(
            fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
Пример #16
0
    def make_index(file_name):
        """Create an index for file_name unless an up-to-date one exists.

        The indexer is chosen from the file extension:
        ".fa" uses pysam.faidx (index suffix ".fai"), ".bam"/".cram" use
        pysam.index (".bai"), and ".gff"/".bed"/".vcf"/".sam" use
        pysam.tabix_index (".gz.tbi").  Other extensions are ignored.
        An index counts as up to date when it exists and was modified
        after the data file; in that case a message is printed instead.
        """
        f_ext = os.path.splitext(file_name)[1]

        def indexed(fn, ext):
            return os.path.exists(fn + ext)

        def uptodate(fn, ext):
            # getmtime lives in os.path; the os module itself has none.
            return os.path.getmtime(fn) < os.path.getmtime(fn + ext)

        infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
        if f_ext == ".fa":
            if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
                print(infomsg)
            else:
                pysam.faidx(file_name)
        elif f_ext in [".bam", ".cram"]:
            if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
                print(infomsg)
            else:
                pysam.index(file_name)
        elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
            if indexed(file_name, ".gz.tbi") and uptodate(
                    file_name, ".gz.tbi"):
                print(infomsg)
            else:
                # The tabix preset is the extension without its dot.
                pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
Пример #17
0
def bed_tofasta(bed,
                ref_fasta,
                min_size=50,
                stranded=True,
                include_name=False,
                out=sys.stdout):
    """Emit the reference sequence of every BED region in FASTA format.

    Regions shorter than `min_size`, or on a chromosome absent from the
    reference index, are skipped.  With `stranded` set and a strand
    present, the strand is added to the header and '-' regions are
    reverse-complemented.  `include_name` prefixes headers with the
    region name followed by '|'.
    """
    index_path = '%s.fai' % ref_fasta
    if not os.path.exists(index_path):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)

    # Sequence names known to the index (first column of the .fai file).
    with open(index_path) as index_file:
        refs = set(row.split('\t')[0].strip() for row in index_file)

    name = ''
    for region in bed:
        if include_name:
            name = '%s|' % (region.name.strip())

        if region.end - region.start < min_size or region.chrom not in refs:
            continue

        seq = fasta.fetch(region.chrom, region.start, region.end)
        if not (stranded and region.strand):
            out.write('>%s%s:%d-%d\n%s\n' %
                      (name, region.chrom, region.start, region.end, seq))
            continue

        if region.strand == '-':
            seq = revcomp(seq)
        out.write('>%s%s:%d-%d[%s]\n%s\n' %
                  (name, region.chrom, region.start, region.end,
                   region.strand, seq))

    fasta.close()
Пример #18
0
def resolved_tool_contract_runner(resolved_contract):
    """Run the variant caller as described by a resolved tool contract.

    Input files are read as [alignments, reference] and output files as
    [gff, vcf, contigset dataset, fastq].  The FASTA output path is
    derived from the dataset path by replacing ".contigset.xml" with
    ".fasta".  When args_runner returns 0, the FASTA is indexed and
    wrapped in a ContigSet that is written to the dataset path.

    Returns the value returned by args_runner.
    """
    task = resolved_contract.task
    alignment_path = task.input_files[0]
    reference_path = task.input_files[1]
    gff_path = task.output_files[0]
    vcf_path = task.output_files[1]
    dataset_path = task.output_files[2]
    # Dots are escaped so that only a literal ".contigset.xml" is rewritten;
    # unescaped, each "." would match any character.
    fasta_path = re.sub(r"\.contigset\.xml", ".fasta", dataset_path)
    fastq_path = task.output_files[3]
    # Masking enabled -> default radius; disabled -> radius "0".
    if bool(task.options[Constants.MASKING_ID]):
        mask_radius = str(Constants.DEFAULT_MASK_RADIUS)
    else:
        mask_radius = "0"
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", mask_radius,
        "--algorithm", task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    # Kept in its own name instead of rebinding the contract variable.
    exit_code = args_runner(args_)
    if exit_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return exit_code
Пример #19
0
def resolved_tool_contract_runner(resolved_contract):
    """Translate a resolved tool contract into a variant-caller run.

    Inputs are [alignments, reference]; outputs are [gff, vcf, contigset
    dataset, fastq].  The FASTA path is the dataset path with
    ".contigset.xml" replaced by ".fasta".  If args_runner returns 0 the
    FASTA is indexed with pysam.faidx and written out as a ContigSet.

    Returns the value returned by args_runner.
    """
    task = resolved_contract.task
    inputs, outputs, options = task.input_files, task.output_files, task.options
    alignment_path, reference_path = inputs[0], inputs[1]
    gff_path, vcf_path, dataset_path, fastq_path = (
        outputs[0], outputs[1], outputs[2], outputs[3])
    # Escape the dots: as a regex, a bare "." matches any character, which
    # could rewrite text that is not the literal ".contigset.xml" suffix.
    fasta_path = re.sub(r"\.contigset\.xml", ".fasta", dataset_path)
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(options[Constants.MIN_CONFIDENCE_ID]),
        # Default radius when masking is on, "0" when it is off.
        "--maskRadius", str(Constants.DEFAULT_MASK_RADIUS) if
                        bool(options[Constants.MASKING_ID]) else "0",
        "--algorithm", options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    # A separate name avoids rebinding the contract variable.
    return_code = args_runner(args_)
    if return_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return return_code
Пример #20
0
def gatk_realigner(align_bam,
                   ref_file,
                   config,
                   dbsnp=None,
                   region=None,
                   out_file=None,
                   deep_coverage=False):
    """Realign a BAM file around indels using GATK and return its path.

    The BAM and reference are indexed first; when `region` is given the
    BAM is subset to it.  If no reads are aligned, the input is copied to
    `out_file` (when provided) or returned as is.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)
    runner.run_fn("picard_index_ref", ref_file)

    # GATK also needs the samtools .fai index of the reference.
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)

    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        runner.run_fn("picard_index", align_bam)

    if has_aligned_reads(align_bam, region):
        algorithm_config = config["algorithm"]
        variant_regions = algorithm_config.get("variant_regions", None)
        intervals = gatk_realigner_targets(runner, align_bam, ref_file,
                                           dbsnp, region, out_file,
                                           deep_coverage, variant_regions)
        # No picard_fixmate step: per the original note, recent GATK
        # (> Feb 2011) handles it on the fly.
        return gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                                      region, out_file, deep_coverage)

    if not out_file:
        return align_bam
    shutil.copy(align_bam, out_file)
    return out_file
Пример #21
0
def generate_data_files(dir_name=None):
    """Create the FASTA/BAM test fixtures in dir_name (or the current dir).

    Writes and indexes "tst1.fasta", then converts every entry of
    sam_strings to "tst_<n>_subreads.bam" with samtools and indexes each
    BAM with pbindex -- except tst_2_subreads.bam, which is deliberately
    left without a .pbi.
    """
    import logging
    logging.basicConfig(level=logging.INFO)
    if dir_name is not None:
        os.chdir(dir_name)

    with open("tst1.fasta", "w") as fasta_out:
        fasta_out.write(">ecoliK12_pbi_March2013_2955000_to_2980000\n")
        fasta_out.write("AAAGAGAGAG" * 2500)
    pysam.faidx("tst1.fasta")

    for number, sam_text in enumerate(sam_strings, start=1):
        sam_file = "tst_%d_subreads.sam" % number
        bam_file = "tst_%d_subreads.bam" % number
        with open(sam_file, "w") as sam_out:
            sam_out.write(sam_text)
        logging.info("Converting {s} to BAM".format(s=sam_file))
        # FIXME (from the original author): pysam could not handle this
        # unmapped input, so the conversion goes through samtools instead.
        args = ["samtools", "view", "-b", "-o", bam_file, sam_file]
        assert subprocess.call(args) == 0, args
        os.remove(sam_file)
        # XXX don't create .pbi for this file, we want it to be absent
        if bam_file == "tst_2_subreads.bam":
            continue
        logging.info("Indexing {b}".format(b=bam_file))
        subprocess.call(["pbindex", bam_file])
Пример #22
0
    def run(self):
        """Call a consensus sequence for the mapped reads.

        Converts the SAM file to a sorted, indexed BAM, runs
        samtools mpileup | bcftools | vcfutils.pl to produce
        "<reads>_<reference>_Consensus.vcf/.fastq" in the output directory,
        removes the reference .fai again, and finally replaces the
        consensus FASTQ with its formatted version.
        """
        AbstractAnalysis.run(self) #Call base method to do some logging
        localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
        localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
        sorted_bam = localSortedBamFile + ".bam"

        samToBamFile(self.samFile, localBamFile)
        pysam.sort(localBamFile, localSortedBamFile)
        pysam.index(sorted_bam)
        pysam.faidx(self.referenceFastaFile)

        # Output names combine the base names of the reads and reference.
        reads_stem = self.readFastqFile.split(".fastq")[0].split("/")[-1]
        reference_stem = self.referenceFastaFile.split(".fa")[0].split("/")[-1]
        file_header = reads_stem +  "_" + reference_stem
        consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
        consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")

        pileup_cmd = "samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s" \
                % (self.referenceFastaFile, sorted_bam, consensus_vcf)
        system(pileup_cmd)
        system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
        # The .fai created above is removed again.
        system("rm -rf %s" % (self.referenceFastaFile + ".fai"))

        formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")

        formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
        system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))

        self.finish()
Пример #23
0
def fetch_file(options):
    """Download UCSC annotation or genome files and write them to a file.

    `options` is an argv-style list: [program, genome, kind, out] where
    genome is one of hg19/hg38/mm9/mm10 and kind is one of
    ref (RefSeq), kg (KnownGenes), ens (Ensembl) or fa (genome FASTA,
    which is also indexed with pysam.faidx).

    Exits via sys.exit with an explanatory message on a wrong argument
    count, an unsupported genome or kind, or a request for Ensembl
    annotations on hg38/mm10.
    """
    if len(options) != 4:
        sys.exit('fetch_ucsc.py hg19/hg38/mm9/mm10 ref/kg/ens/fa out')
    if options[1] in {'hg19', 'hg38', 'mm9', 'mm10'}:
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/' % options[1]
    else:
        sys.exit('Only support human or mouse!')
    # str.translate table: code point 32 (space) -> 95 (underscore).
    s = {32: 95}
    if options[2] == 'ref':  # RefSeq gene annotations
        download_file(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        # `with` closes the gzip handle, which a bare gzip.open().read()
        # would leave open.
        with gzip.open('refFlat.txt.gz', 'rb') as gz_in:
            with open(options[3], 'wb') as outf:
                outf.write(gz_in.read())
    elif options[2] == 'kg':  # KnownGenes gene annotations
        download_file(path + 'database/knownGene.txt.gz', 'knownGene.txt.gz')
        download_file(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                fields = line.decode().split('\t')  # decode once per line
                kg_iso[fields[0]] = fields[4].translate(s)
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(options[3], 'w') as outf:
                for line in kg_f:
                    entry = line.decode().split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif options[2] == 'ens':  # Ensembl gene annotations
        if options[1] == 'hg38' or options[1] == 'mm10':
            sys.exit('No Ensembl gene annotations for hg38 or mm10!')
        download_file(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        download_file(path + 'database/ensemblToGeneName.txt.gz',
                      'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.decode().split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(options[3], 'w') as outf:
                for line in ens_f:
                    entry = line.decode().split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif options[2] == 'fa':  # Genome sequences
        if options[1] == 'hg38':
            fa_path = 'bigZips/hg38.chromFa.tar.gz'
        else:
            fa_path = 'bigZips/chromFa.tar.gz'
        download_file(path + fa_path, 'chromFa.tar.gz')
        # Concatenate every regular file of the archive into one FASTA.
        with tarfile.open('chromFa.tar.gz', 'r:gz') as fa:
            with open(options[3], 'w') as outf:
                for f in fa:
                    if f.isfile():
                        content = fa.extractfile(f).read()
                        outf.write(content.decode())
        pysam.faidx(options[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
Пример #24
0
def fetch_file(options):
    """Download UCSC annotation or genome files and write them to a file.

    `options` is an argv-style list: [program, genome, kind, out] where
    genome is one of hg19/hg38/mm10 and kind is one of
    ref (RefSeq), kg (KnownGenes), ens (Ensembl) or fa (genome FASTA,
    which is also indexed with pysam.faidx).

    Exits via sys.exit with an explanatory message on a wrong argument
    count, an unsupported genome or kind, or a request for Ensembl
    annotations on hg38.
    """
    if len(options) != 4:
        sys.exit('fetch_ucsc.py hg19/hg38/mm10 ref/kg/ens/fa out')
    if options[1] in {'hg19', 'hg38', 'mm10'}:
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/' % options[1]
    else:
        sys.exit('Only support human or mouse!')
    if options[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        # `with` closes the gzip handle, which a bare gzip.open().read()
        # would leave open.
        with gzip.open('refFlat.txt.gz', 'rb') as gz_in:
            with open(options[3], 'w') as outf:
                outf.write(gz_in.read())
    elif options[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz',
                           'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        # Translation table replacing spaces with underscores; built here
        # because only this branch uses it.
        s = string.maketrans(' ', '_')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                fields = line.split('\t')  # split once per line
                kg_iso[fields[0]] = fields[4].translate(s)
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(options[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif options[2] == 'ens':  # Ensembl gene annotations
        if options[1] == 'hg38':
            sys.exit('No Ensembl gene annotations for hg38!')
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz',
                           'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(options[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif options[2] == 'fa':  # Genome sequences
        if options[1] == 'hg38':
            fa_path = 'bigZips/hg38.chromFa.tar.gz'
        else:
            fa_path = 'bigZips/chromFa.tar.gz'
        urllib.urlretrieve(path + fa_path, 'chromFa.tar.gz')
        # Concatenate every regular file of the archive into one FASTA.
        with tarfile.open('chromFa.tar.gz', 'r:gz') as fa:
            with open(options[3], 'w') as outf:
                for f in fa:
                    if f.isfile():
                        outf.write(fa.extractfile(f).read())
        pysam.faidx(options[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
Пример #25
0
Файл: faidx.py Проект: ys4/ariba
def write_fa_subset(seq_names, infile, outfile):
    """Write the named sequences of a FASTA file to `outfile`.

    Parameters:
        seq_names: iterable of sequence names to extract.
        infile: source FASTA; indexed with pysam.faidx when no .fai exists.
        outfile: destination path, opened through pyfastaq.
    """
    if not os.path.exists(infile + '.fai'):
        pysam.faidx(infile)

    f = pyfastaq.utils.open_file_write(outfile)
    try:
        for name in seq_names:
            # The faidx output is written as returned, hence end=''.
            print(pysam.faidx(infile, name), end='', file=f)
    finally:
        # Close the output even when a lookup fails part-way through.
        pyfastaq.utils.close(f)
 def _generate_chunk_output_file(self, i=None):
     """Write contig `i` of CHUNK_CONTIGS to an indexed temporary FASTA.

     The header gets the suffix "|arrow".  Returns whatever
     self._make_dataset_file returns for the new file.
     """
     suffix = "|arrow"
     header, seq = self.CHUNK_CONTIGS[i]
     # delete=False keeps the file after the handle closes.  Taking .name
     # from a discarded NamedTemporaryFile would delete the file and then
     # reuse its name, which another process could claim in between.
     with tempfile.NamedTemporaryFile(mode="w", suffix=".fasta",
                                      delete=False) as f:
         f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
         fn = f.name
     pysam.faidx(fn)
     return self._make_dataset_file(fn)
Пример #27
0
def check_fasta(fa_f, pysam_flag=True):
    """Ensure ``fa_f`` has a ``.fai`` index, then return a handle or the path.

    :param fa_f: path of the FASTA file
    :param pysam_flag: when truthy return ``pysam.FastaFile(fa_f)``,
        otherwise return ``fa_f`` unchanged
    """
    index_missing = not os.path.isfile(fa_f + '.fai')
    if index_missing:
        pysam.faidx(fa_f)
    if not pysam_flag:
        # caller only wants the (now indexed) file path
        return fa_f
    return pysam.FastaFile(fa_f)
Пример #28
0
def index_fasta(infile):
	'''Index a fasta file using samTools unless its index already exists.

	The check is made on the ``.fai`` file next to ``infile``; testing
	``infile`` itself would skip indexing for every FASTA that exists.
	Progress is reported on stderr.
	'''
	if os.path.isfile(infile + '.fai'):
		return
	sys.stderr.write("Indexing " + infile + ' ... ')
	pysam.faidx(infile)
	sys.stderr.write("Done!\n")
Пример #29
0
def index_fasta(infile):
    """Index a fasta file using samTools unless its index already exists.

    The check is made on the ``.fai`` file next to ``infile``; testing
    ``infile`` itself would skip indexing for every FASTA that exists.
    Progress is reported on stderr.
    """
    if os.path.isfile(infile + ".fai"):
        return
    sys.stderr.write("Indexing " + infile + " ... ")
    pysam.faidx(infile)
    sys.stderr.write("Done!\n")
Пример #30
0
def check_fasta(fa_f, pysam_flag=True):
    """Ensure ``fa_f`` has a ``.fai`` index, then return a handle or the path.

    :param fa_f: path of the FASTA file
    :param pysam_flag: when truthy return ``pysam.FastaFile(fa_f)``,
        otherwise return ``fa_f`` unchanged
    """
    index_missing = not os.path.isfile(fa_f + '.fai')
    if index_missing:
        pysam.faidx(fa_f)
    if not pysam_flag:
        # caller only wants the (now indexed) file path
        return fa_f
    return pysam.FastaFile(fa_f)
Пример #31
0
def ensure_fasta_index(fasta_fname):
    """Return the ``.fai`` path for ``fasta_fname``, (re)building it if needed.

    The index is regenerated with ``pysam.faidx`` whenever
    ``is_newer_than(index, fasta)`` is false. An ``AssertionError`` is raised
    if the index file is still absent afterwards.
    """
    fai_fname = fasta_fname + '.fai'
    index_is_current = is_newer_than(fai_fname, fasta_fname)
    if not index_is_current:
        echo("Indexing FASTA file", fasta_fname)
        pysam.faidx(fasta_fname)
    assert os.path.isfile(fai_fname), "Failed to generate index " + fai_fname
    return fai_fname
Пример #32
0
    def __init__(self, num, refname):
        """Store the threshold and open the reference FASTA.

        ``num`` is coerced to ``int``. The ``.fai`` index for ``refname`` is
        built first when it is not on disk, then the reference is opened with
        ``pysam.Fastafile`` and kept as ``self.ref``.
        """
        self.num = int(num)
        self.refname = refname

        index_path = '%s.fai' % refname
        index_present = os.path.exists(index_path)
        if not index_present:
            pysam.faidx(refname)

        self.ref = pysam.Fastafile(refname)
Пример #33
0
    def __init__(self, num, refname):
        """Store the threshold and open the reference FASTA.

        ``num`` is coerced to ``int``. The ``.fai`` index for ``refname`` is
        built first when it is not on disk, then the reference is opened with
        ``pysam.Fastafile`` and kept as ``self.ref``.
        """
        self.num = int(num)
        self.refname = refname

        index_path = '%s.fai' % refname
        index_present = os.path.exists(index_path)
        if not index_present:
            pysam.faidx(refname)

        self.ref = pysam.Fastafile(refname)
Пример #34
0
def ensure_fasta_index(fasta_fname):
    """Return the ``.fai`` path for ``fasta_fname``, (re)building it if needed.

    The index is regenerated with ``pysam.faidx`` whenever
    ``is_newer_than(index, fasta)`` is false. An ``AssertionError`` is raised
    if the index file is still absent afterwards.
    """
    fai_fname = fasta_fname + '.fai'
    index_is_current = is_newer_than(fai_fname, fasta_fname)
    if not index_is_current:
        echo("Indexing FASTA file", fasta_fname)
        pysam.faidx(fasta_fname)
    assert os.path.isfile(fai_fname), "Failed to generate index " + fai_fname
    return fai_fname
Пример #35
0
def index_fasta(infile):
    '''Index a fasta file using samTools unless its index already exists.

    The check is made on the ``.fai`` file next to ``infile``; testing
    ``infile`` itself would skip indexing for every FASTA that exists.
    Progress is reported on stderr.
    '''
    if os.path.isfile(infile + '.fai'):
        return
    print("Indexing " + infile + ' ...', end=' ', file=sys.stderr)
    pysam.faidx(infile)
    print("Done!", file=sys.stderr)
Пример #36
0
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    """Assemble the intervals of a BED file with SPAdes using a process pool.

    Intervals are read from ``bed`` (restricted to ``chrs`` when given) and
    split by ``should_be_assembled`` / ``shouldnt_be_assembled``. The selected
    intervals are dealt round-robin to ``nthreads`` ``run_spades_single``
    jobs, each working in ``<work>/<job index>``. The FASTA files collected by
    ``run_spades_single_callback`` are concatenated into
    ``<work>/spades_assembled.fa`` and indexed with ``pysam.faidx``.

    Returns a tuple ``(assembled_fasta, ignored_bed)``. ``assembled_fasta`` is
    ``None`` when nothing was assembled, ``ignored_bed`` is ``None`` when no
    interval was ignored, and both are ``None`` when ``bed`` is empty.
    """
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool if not chrs or interval.chrom in chrs]
    # Build real lists: the selection is re-scanned once per worker and the
    # ignored set is truth-tested below, neither of which works on a lazy
    # filter object.
    selected_intervals = [interval for interval in all_intervals
                          if should_be_assembled(interval, max_interval_size=max_interval_size,
                                                 svs_to_assemble=svs_to_assemble)]
    ignored_intervals = [interval for interval in all_intervals
                         if shouldnt_be_assembled(interval, max_interval_size=max_interval_size,
                                                  svs_to_assemble=svs_to_assemble)]

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in range(nthreads):
        # Round-robin split: worker i takes every nthreads-th interval.
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        # Open each result explicitly: fileinput.input() on an empty list
        # would fall back to reading sys.argv / stdin instead of doing nothing.
        for assembly_fasta in assembly_fastas:
            with open(assembly_fasta) as assembly_fd:
                for line in assembly_fd:
                    assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Пример #37
0
def checkFASTA( fastaFileStr ):
	"""Verify that a FASTA file exists and make sure it has a ``.fai`` index.

	Prints an error and calls ``exit()`` when the FASTA file is missing.
	When only the index is missing, prints a warning and builds it with
	``pysam.faidx``. Returns ``True`` otherwise.
	"""
	indexPath = fastaFileStr + '.fai'
	if not os.path.isfile(fastaFileStr):
		print('ERROR: FASTA file does not exist')
		exit()
	if not os.path.isfile(indexPath):
		print('WARNING: FASTA index file does not exist...creating')
		pysam.faidx( fastaFileStr )
	return True
Пример #38
0
def checkFASTA(fastaFileStr):
    """Verify that a FASTA file exists and make sure it has a ``.fai`` index.

    Prints an error and calls ``exit()`` when the FASTA file is missing.
    When only the index is missing, prints a warning and builds it with
    ``pysam.faidx``. Returns ``True`` otherwise.
    """
    indexPath = fastaFileStr + '.fai'
    if not os.path.isfile(fastaFileStr):
        print('ERROR: FASTA file does not exist')
        exit()
    if not os.path.isfile(indexPath):
        print('WARNING: FASTA index file does not exist...creating')
        pysam.faidx(fastaFileStr)
    return True
Пример #39
0
def write_fasta(seqs, fasta_path, index=True):
    """Write ``seqs`` to ``fasta_path`` as FASTA wrapped at 80 columns.

    :param seqs: mapping of record name to sequence string
    :param fasta_path: output file, overwritten if present
    :param index: when truthy, run ``pysam.faidx`` on the written file
    :return: always ``True``
    """
    line_width = 80
    with open(fasta_path, 'w') as out:
        for name in seqs:
            sequence = seqs[name]
            record = ['>%s' % name]
            for offset in range(0, len(sequence), line_width):
                record.append(sequence[offset:(offset + line_width)])
            # The final '\n' element is joined like any other line, so every
            # record is followed by one blank line.
            record.append('\n')
            out.write('\n'.join(record))
    if index:
        pysam.faidx(fasta_path)  # reindex
    return True
Пример #40
0
    def __init__(self, num, refname, dbsnpname):
        """Set up the filter with a reference FASTA and a dbSNP source.

        Writes an "experimental" notice to stderr, coerces ``num`` to ``int``,
        wraps ``dbsnpname`` in ``DBSNP`` and opens ``refname`` with
        ``pysam.Fastafile`` after building its ``.fai`` index if it is absent.
        """
        sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
        self.num = int(num)
        self.refname = refname
        self.dbsnp = DBSNP(dbsnpname)

        index_path = '%s.fai' % refname
        index_present = os.path.exists(index_path)
        if not index_present:
            pysam.faidx(refname)

        self.ref = pysam.Fastafile(refname)
Пример #41
0
    def __init__(self, num, refname, dbsnpname):
        """Set up the filter with a reference FASTA and a dbSNP source.

        Writes an "experimental" notice to stderr, coerces ``num`` to ``int``,
        wraps ``dbsnpname`` in ``DBSNP`` and opens ``refname`` with
        ``pysam.Fastafile`` after building its ``.fai`` index if it is absent.
        """
        sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
        self.num = int(num)
        self.refname = refname
        self.dbsnp = DBSNP(dbsnpname)

        index_path = '%s.fai' % refname
        index_present = os.path.exists(index_path)
        if not index_present:
            pysam.faidx(refname)

        self.ref = pysam.Fastafile(refname)
Пример #42
0
def get_genome_stats(genome_fasta):
    """Return ``(number of sequences, total bases)`` for a FASTA file.

    The ``.fai`` index is built first (with a message on stdout) when it is
    missing; both counts are read from a ``pysam.FastaFile`` opened on
    ``genome_fasta``.
    """
    if not os.path.exists(genome_fasta + '.fai'):
        print("\nIndexing %s\n" % os.path.abspath(genome_fasta))
        pysam.faidx(genome_fasta)

    genome = pysam.FastaFile(genome_fasta)
    base_count = sum(genome.lengths)
    contig_count = genome.nreferences
    return contig_count, base_count
Пример #43
0
 def __faidx(self):
     """Build the ``.fai`` index for ``self.fafile`` if it is missing.

     Returns ``True`` when an index was created and ``False`` when one was
     already on disk. Raises ``RuntimeError`` if ``pysam.faidx`` fails.
     """
     # The path must come from the instance; a bare ``fafile`` is undefined
     # in this scope.
     if os.path.isfile(self.fafile + '.fai'):
         print("already exist")
         return False
     try:
         pysam.faidx(self.fafile)
     except Exception as err:
         raise RuntimeError("failed to index %s: %s" % (self.fafile, err))
     return True
Пример #44
0
def get_reference_sequence(ref_location, contig, start_pos, end_pos):
    """Fetch ``contig[start_pos:end_pos]`` from an indexed reference FASTA.

    The ``.fai`` index is created on demand: first with the ``samtools``
    executable, then with ``pysam.faidx`` if the index is still missing.

    :param ref_location: path of the reference FASTA
    :param contig: reference name passed to ``FastaFile.fetch``
    :param start_pos: start coordinate passed to ``FastaFile.fetch``
    :param end_pos: end coordinate passed to ``FastaFile.fetch``
    :return: the fetched sequence
    """
    fai_path = "{}.fai".format(ref_location)

    # ensure faidx
    if not os.path.isfile(fai_path):
        try:
            subprocess.check_call(['samtools', 'faidx', ref_location])
        except (OSError, subprocess.CalledProcessError):
            # samtools is not installed or exited non-zero; do not abort
            # here, otherwise the pysam fallback below could never run.
            pass
    if not os.path.isfile(fai_path):
        pysam.faidx(ref_location)

    # use pysam
    with closing(pysam.FastaFile(ref_location)) as ref:
        return ref.fetch(reference=contig, start=start_pos, end=end_pos)
Пример #45
0
 def index_genomefq(self):
     """
     Index the whole-genome FASTA at ``self.whole_genome`` with samtools.

     Best effort: any exception is reported on stdout and not re-raised.
     :return: None
     """
     try:
         genome_path = self.whole_genome
         pysam.faidx(genome_path)
     except Exception as err:
         print('Problem in pysam faidx')
         print(err)
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    """Write a synthetic FASTA and, for ``.xml`` targets, a ContigSet too.

    ``n_records`` records named ``chr0`` .. ``chr<n-1>`` with the sequence
    ``acgtacgtacgt`` are written to the FASTA path derived from ``file_name``
    (``.contigset.xml`` replaced by ``.fasta``). The FASTA is indexed when
    ``make_faidx`` is true. When ``file_name`` ends with ``.xml`` a
    ``ContigSet`` built from the FASTA is written to ``file_name``.
    """
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    with open(fasta_file, "w") as handle:
        handle.write("\n".join(">chr%d\nacgtacgtacgt" % idx
                               for idx in range(n_records)))
        handle.flush()
    if make_faidx:
        pysam.faidx(fasta_file)
    if not file_name.endswith(".xml"):
        return
    cs = ContigSet(fasta_file, strict=make_faidx)
    cs.write(file_name)
Пример #47
0
def main():
    """Download UCSC annotation or genome files selected by ``sys.argv``.

    Usage: ``fetch_ucsc.py human/mouse ref/kg/ens/fa out``

    * ``ref`` - decompress ``refFlat.txt.gz`` into ``out``
    * ``kg``  - write knownGene rows prefixed with the gene name from kgXref
    * ``ens`` - write ensGene rows prefixed with the gene name from
      ensemblToGeneName
    * ``fa``  - concatenate the files of ``chromFa.tar.gz`` into ``out`` and
      index the result with ``pysam.faidx``

    Downloads are stored in the current directory. Exits through
    ``sys.exit`` with a message on wrong usage or unsupported choices.
    """
    if len(sys.argv) != 4:
        sys.exit('fetch_ucsc.py human/mouse ref/kg/ens/fa out')
    if sys.argv[1] == 'human':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/hg19/'
    elif sys.argv[1] == 'mouse':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/'
    else:
        sys.exit('Only support human or mouse!')
    # translation table turning spaces in gene names into underscores
    s = string.maketrans(' ', '_')
    if sys.argv[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(sys.argv[3], 'w') as outf:
            # context manager so the gzip handle is closed as well
            with gzip.open('refFlat.txt.gz', 'rb') as ref_f:
                outf.write(ref_f.read())
    elif sys.argv[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz',
                           'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                fields = line.split('\t')
                kg_iso[fields[0]] = fields[4].translate(s)
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(sys.argv[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif sys.argv[2] == 'ens':  # Ensembl gene annotations
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz',
                           'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(sys.argv[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif sys.argv[2] == 'fa':  # Genome sequences
        urllib.urlretrieve(path + 'bigZips/chromFa.tar.gz', 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as seq:
            with open(sys.argv[3], 'w') as outf:
                for f in seq:
                    # extractfile() returns None for directories and other
                    # non-file members, so only regular files are copied
                    if f.isfile():
                        outf.write(seq.extractfile(f).read())
        pysam.faidx(sys.argv[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
Пример #48
0
        def prep(output_filename):
            """Merge the FASTA files in ``filenames`` into one indexed file.

            Record names are cut at the first space. Each sequence is
            re-wrapped through ``wrappedIterator(80)`` (presumably 80 columns
            per line - confirm against its definition) and the result is
            indexed with ``pysam.faidx``.
            """
            import pysam

            def first_word(name):
                # keep only the part of the header before the first space
                return name[:name.find(b" ")] if b" " in name else name

            with open(output_filename, "wb") as op:
                for fn in filenames:
                    for key, seq in iter_fasta(fn, first_word):
                        wrapped = b"\n".join(wrappedIterator(80)(seq))
                        op.write(b">%s\n%s\n" % (key, wrapped))
            pysam.faidx(output_filename)
Пример #49
0
    def create_index(cls, fasta_file, force_overwrite=False):
        """Create the ``.fai`` index for ``fasta_file`` when needed.

        Logs an error and calls ``exit(1)`` if ``fasta_file`` is not a file.
        The index is built with ``pysam.faidx`` when it is absent or when
        ``force_overwrite`` is truthy.
        """
        logger = logging.getLogger(cls.__name__)

        fasta_file = Path(fasta_file)
        if not fasta_file.is_file():
            logger.error("File {} not found".format(fasta_file))
            exit(1)

        index_file = fasta_file.with_name(fasta_file.name + '.fai')
        if index_file.is_file() and not force_overwrite:
            # an index is already there and no rebuild was requested
            return
        pysam.faidx(str(fasta_file))
Пример #50
0
def main():
    """Download UCSC annotation or genome files selected by ``sys.argv``.

    Usage: ``fetch_ucsc.py human/mouse ref/kg/ens/fa out``

    * ``ref`` - decompress ``refFlat.txt.gz`` into ``out``
    * ``kg``  - write knownGene rows prefixed with the gene name from kgXref
    * ``ens`` - write ensGene rows prefixed with the gene name from
      ensemblToGeneName
    * ``fa``  - concatenate the files of ``chromFa.tar.gz`` into ``out`` and
      index the result with ``pysam.faidx``

    Downloads are stored in the current directory. Exits through
    ``sys.exit`` with a message on wrong usage or unsupported choices.
    """
    if len(sys.argv) != 4:
        sys.exit('fetch_ucsc.py human/mouse ref/kg/ens/fa out')
    if sys.argv[1] == 'human':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/hg19/'
    elif sys.argv[1] == 'mouse':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/'
    else:
        sys.exit('Only support human or mouse!')
    # translation table turning spaces in gene names into underscores
    s = string.maketrans(' ', '_')
    if sys.argv[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(sys.argv[3], 'w') as outf:
            # context manager so the gzip handle is closed as well
            with gzip.open('refFlat.txt.gz', 'rb') as ref_f:
                outf.write(ref_f.read())
    elif sys.argv[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz',
                           'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                fields = line.split('\t')
                kg_iso[fields[0]] = fields[4].translate(s)
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(sys.argv[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif sys.argv[2] == 'ens':  # Ensembl gene annotations
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz',
                           'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(sys.argv[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif sys.argv[2] == 'fa':  # Genome sequences
        urllib.urlretrieve(path + 'bigZips/chromFa.tar.gz', 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as seq:
            with open(sys.argv[3], 'w') as outf:
                for f in seq:
                    # extractfile() returns None for directories and other
                    # non-file members, so only regular files are copied
                    if f.isfile():
                        outf.write(seq.extractfile(f).read())
        pysam.faidx(sys.argv[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
Пример #51
0
 def build_index(self, force=False):
     """Build the ``.fai`` index for the FASTA file named by ``self.fos``.

     Nothing is done when a readable index exists that is at least as recent
     as the FASTA file, unless ``force`` is true.

     Raises ``TypeError`` when ``self.fos`` is not a file name (``str``) and
     ``SystemError`` when no readable index exists after indexing.
     """
     self._import_pysam()
     if not isinstance(self.fos, str):
         raise TypeError("This function only works with FastaReader objects " + "connected to a fasta file via file name")
     index_filename = self.fos + ".fai"
     if os.access(index_filename, os.R_OK):
         # Compare against the same path that is indexed below (self.fos).
         if (not force) and os.stat(self.fos).st_mtime <= os.stat(index_filename).st_mtime:
             # index is up to date
             return
     pysam.faidx(self.fos)
     if not os.access(index_filename, os.R_OK):
         raise SystemError("Building of Fasta index failed due to unknown error.")
Пример #52
0
def index_fasta(infile):
	'''
	Index fasta file using samTools.

	Indexing is skipped only when a non-empty ``.fai`` file already exists
	next to ``infile``; a missing or zero-byte index is (re)built with
	``pysam.faidx``.
	'''
	index_file = infile + '.fai'
	try:
		if os.path.getsize(index_file):
			logging.debug("\"%s\" exists. Skip indexing!" % index_file)
			return
		# A zero-byte index is unusable, so fall through and rebuild it.
		logging.warning("The index file is empty: \"%s\"" % index_file)
	except OSError:
		logging.warning("Can not find the index file: \"%s\"" % index_file)
	logging.info("Indexing \"%s\" using the \"pysam\" module..." % infile)
	pysam.faidx(infile)
	logging.info("Done!")
def _make_barcodes(file_name=None):
    """Create a synthetic barcode FASTA plus BarcodeSet and return its path.

    When ``file_name`` is ``None`` a temporary ``.barcodeset.xml`` name is
    generated. 1010 records named ``NNNN_Forward`` with sequence ``A`` x 16
    are written to the FASTA (``.barcodeset.xml`` replaced by ``.fasta``, or
    ``file_name`` itself otherwise), the FASTA is indexed, and a
    ``BarcodeSet`` built from it is written to ``file_name``.
    """
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
    if file_name.endswith(".barcodeset.xml"):
        fasta_file_name = re.sub(".barcodeset.xml", ".fasta", file_name)
    else:
        fasta_file_name = file_name
    barcode_seq = "A" * 16
    with FastaWriter(fasta_file_name) as fa_out:
        for idx in range(1010):
            fa_out.writeRecord("%04d_Forward" % idx, barcode_seq)
    pysam.faidx(fasta_file_name, catch_stdout=False)
    barcode_set = BarcodeSet(fasta_file_name, strict=True)
    barcode_set.write(file_name)
    return file_name
Пример #54
0
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning the realigned BAM.

    Runs the ``picard_index`` and ``picard_index_ref`` steps, makes sure
    ``ref_file`` has a ``.fai`` index, computes the realignment targets and
    returns the result of ``gatk_indel_realignment``.
    """
    runner = broad.runner_from_config(config)
    for step, target in (("picard_index", align_bam),
                         ("picard_index_ref", ref_file)):
        runner.run_fn(step, target)
    fai_file = "%s.fai" % ref_file
    if not os.path.exists(fai_file):
        pysam.faidx(ref_file)
    targets = gatk_realigner_targets(runner, align_bam, ref_file, dbsnp, deep_coverage)
    # A separate picard_fixmate pass is no longer required in recent GATK
    # (> Feb 2011) -- it is now done on the fly.
    return gatk_indel_realignment(runner, align_bam, ref_file, targets, deep_coverage)
Пример #55
0
def check_fasta(fa, return_handle=True):
    '''
    Validate a fasta file and make sure it is indexed.

    Exits via ``sys.exit`` when ``fa`` is not a file and builds the ``.fai``
    index when it is missing. Returns ``pysam.FastaFile(fa)`` when
    ``return_handle`` is truthy, otherwise the path ``fa``.
    http://pysam.readthedocs.io/en/latest/api.html?highlight=faidx#fasta-files
    '''
    if not os.path.isfile(fa):
        sys.exit('No such file: %s!' % fa)
    index_present = os.path.isfile(fa + '.fai')
    if not index_present:
        pysam.faidx(fa)
    return pysam.FastaFile(fa) if return_handle else fa
    def makeTwoReference(self, chr,start,end,ref,alt, output):
        """Write a two-record FASTA with the reference and variant sequence.

        The region ``chr:(start - window + 1)-(end + window)`` is fetched from
        ``self.reference_genome`` with ``pysam.faidx`` and upper-cased. The
        ``_ref`` record holds that sequence unchanged; the ``_alt`` record
        holds it with the variant applied:

        * ``ref == "-"`` (insertion): ``alt`` is inserted after the first
          ``window + 1`` bases
        * ``alt == "-"`` (deletion): only the two ``window``-sized flanks
          are kept
        * otherwise (SNV): ``alt`` is placed between the two flanks

        Both record names are ``chr,start,end,ref,alt`` plus the suffix.
        """
        label = ','.join([chr, str(start), str(end), ref, alt])
        region = chr + ":" + str(int(start) - self.window + 1) +"-"+ str(int(end) + self.window)

        fetched = pysam.faidx(self.reference_genome, region)
        # NOTE(review): newer pysam releases appear to return the command
        # output as a single string rather than a list of lines; normalise so
        # the loop below always sees whole lines.
        if isinstance(fetched, str):
            fetched = fetched.splitlines()
        seq = "".join(item.rstrip('\n').upper() for item in fetched
                      if not item.startswith(">"))

        # for insertion
        if ref == "-":
            alt_seq = seq[0:(self.window + 1)] + alt + seq[-self.window:]
        # for deletion
        elif alt == "-":
            alt_seq = seq[0:self.window] + seq[-self.window:]
        # for SNV
        else:
            alt_seq = seq[0:self.window] + alt + seq[-self.window:]

        # the context manager closes the file even if a write fails
        with open(output, 'w') as hOUT:
            hOUT.write('>' + label + "_ref" + '\n')
            hOUT.write(seq + '\n')
            hOUT.write('>' + label + "_alt" + '\n')
            hOUT.write(alt_seq + '\n')
Пример #57
0
def bgzip_index(original_file, new_file, file_format):
    """
    Compress and/or index a file according to its format.

    :param original_file: input file; for 'fa' it is deleted once compressed
    :param new_file: compressed output path (used for 'fa' only)
    :param file_format: 'fa' or 'vcf', compared case-insensitively
    :return: None
    :raises G2GValueError: if ``file_format`` is neither 'fa' nor 'vcf'
    """
    fmt = file_format.lower()

    if fmt == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
        return
    if fmt == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
        return
    raise G2GValueError("Unknown file format: {0}".format(file_format))
Пример #58
0
def read_pysam(f, headers):
    tstart = time.time()
    for k in islice(headers, 0, None, 100):
        for start, end in intervals:
            if time.time() - tstart > 300:
                print(k)
            tstart = time.time()
            str(pysam.faidx(f, '{0}:{1}-{2}'.format(k, start + 1, end)))