예제 #1
0
def split_vcf(in_file, config, out_dir=None):
    """
    Split a VCF file into separate files by chromosome.

    Requires tabix, bgzip and samtools to be available (samtools is only
    invoked when the FASTA index is missing; bgzip only when the input is
    not already a ``.gz`` file).

    Chromosome names are taken from the first column of the FASTA index
    (``<fasta>.fai``), and tabix is asked for each of them in turn.

    Args:
        in_file: path to the VCF file, plain or ending in ``.gz``.
        config: mapping providing ``config["ref"]["fasta"]`` and
            ``config["program"]``; the latter may carry ``"samtools"`` and
            ``"tabix"`` entries, which default to the bare command names.
        out_dir: directory for the per-chromosome files. Defaults to a
            ``split`` directory next to ``in_file``.

    Returns:
        A list of output file paths, one per chromosome, in the order the
        chromosomes appear in the FASTA index.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")

    fasta_file = config["ref"]["fasta"]
    fasta_index = fasta_file + ".fai"
    samtools_path = config["program"].get("samtools", "samtools")
    tabix_path = config["program"].get("tabix", "tabix")

    # Build the FASTA index on demand; it is the source of chromosome names.
    if not file_exists(fasta_index):
        samtools = sh.Command(samtools_path)
        samtools.faidx(fasta_file)

    # If in_file is not compressed, compress it.
    # This must be a value comparison: ``ext is not ".gz"`` tests object
    # identity, which is true for the freshly built string splitext returns,
    # so already-compressed inputs were being compressed a second time.
    (_, ext) = os.path.splitext(in_file)
    if ext != ".gz":
        gzip_file = in_file + ".gz"
        if not file_exists(gzip_file):
            sh.bgzip("-c", in_file, _out=gzip_file)
        in_file = gzip_file

    # Create the tabix index (tabix_index is a helper defined elsewhere;
    # presumably it skips work when the index already exists -- confirm).
    tabix_index(in_file)

    # Find the chromosome names from the FASTA index file.
    chroms = str(sh.cut("-f1", fasta_index)).split()

    def chr_out(chrom):
        # Derive the per-chromosome file name from the input name via the
        # project helpers, then place it in out_dir.
        out_file = replace_suffix(append_stem(in_file, chrom), ".vcf")
        return os.path.join(out_dir, os.path.basename(out_file))

    def run_tabix(chrom):
        # Extract one chromosome (with header, -h) unless it is already done.
        tabix = sh.Command(tabix_path)
        out_file = chr_out(chrom)
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            tabix("-h", in_file, chrom, _out=tmp_out_file)
        return out_file

    # Build the list eagerly: on Python 3 ``map`` is lazy, so the tabix calls
    # would never run unless the caller happened to iterate the result.
    return [run_tabix(chrom) for chrom in chroms]
예제 #2
0
def index_gff(gff, compress):
    """Index and optionally compress a GFF file.

    The path must have an extension listed in GFF_TYPES and must not have
    one listed in COMPRESSED_TYPES; otherwise an error is logged and the
    process exits with status 1. Returns the path that was indexed, which
    is the ".gz" path when compression was requested.

        \b
    Examples:
        bionorm index_gff Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.ann1.FAKE.gene_models_main.gff3
    """
    # Imported lazily so the external tools are only looked up when this
    # command actually runs.
    from sh import bgzip  # isort:skip
    from sh import tabix  # isort:skip

    target = Path(gff)
    if not target.suffixes:
        error_message = f"Target {target} does not have a file extension."
        logger.error(error_message)
        sys.exit(1)

    # Only the last extension decides whether the file is acceptable.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} before indexing.")
        sys.exit(1)
    if extension not in GFF_TYPES:
        logger.error(
            f"File {target} does not have a recognized GFF extension.")
        sys.exit(1)

    if compress:
        bgzip(["-f", str(target)])
        # From here on, work with the ".gz" path.
        target = target.with_name(f"{target.name}.gz")

    # Request two indexes: tabix's default format and a CSI index (--csi).
    tabix(["-p", "gff", str(target)])
    tabix(["--csi", "-p", "gff", str(target)])
    return target
예제 #3
0
def index_fasta(fasta, compress):
    """Index and optionally compress a fasta file.

    The path must have an extension listed in FASTA_TYPES and must not have
    one listed in COMPRESSED_TYPES; otherwise an error is logged and the
    process exits with status 1. Returns the path that was indexed, which
    is the ".gz" path when compression was requested.

        \b
    Examples:
        bionorm index_fasta Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.FAKE.genome_main.fna
    """
    # Imported lazily so the external tools are only looked up when this
    # command actually runs.
    from sh import bgzip  # isort:skip
    from sh import samtools  # isort:skip

    target = Path(fasta)
    if not target.suffixes:
        error_message = f"Target {target} does not have a file extension."
        logger.error(error_message)
        sys.exit(1)

    # Only the last extension decides whether the file is acceptable.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} before indexing.")
        sys.exit(1)
    if extension not in FASTA_TYPES:
        logger.error(
            f"File {target} does not have a recognized FASTA extension.")
        sys.exit(1)

    if compress:
        # --index asks bgzip to write its own index alongside the
        # compressed file.
        bgzip(["-f", "--index", str(target)])
        # From here on, work with the ".gz" path.
        target = target.with_name(f"{target.name}.gz")

    samtools(["faidx", str(target)])
    return target
예제 #4
0
    def _break_vcf(self, in_file):
        """Split a VCF file into one file per chromosome using tabix.

        Chromosome names come from the first column of ``self.fasta_index``,
        which is generated from ``self.fasta_file`` with ``samtools faidx``
        when it does not exist yet. An input that does not end in ``.gz`` is
        bgzip-compressed to ``<in_file>.gz`` first. The per-chromosome files
        are written to a ``break`` directory next to the (compressed) input.

        Args:
            in_file: path to the VCF file, plain or ending in ``.gz``.

        Returns:
            A list of output file paths, one per chromosome, in the order
            the chromosomes appear in the FASTA index.
        """
        if not file_exists(self.fasta_index):
            sh.samtools.faidx(self.fasta_file)

        # If the file is not compressed, compress it.
        # This must be a value comparison: ``ext is not ".gz"`` tests object
        # identity, which is true for the freshly built string splitext
        # returns, so compressed inputs were being compressed a second time.
        (_, ext) = os.path.splitext(in_file)
        if ext != ".gz":
            gzip_file = in_file + ".gz"
            sh.bgzip("-c", in_file, _out=gzip_file)
            in_file = gzip_file

        # Create the tabix index if it does not exist already.
        if not file_exists(in_file + ".tbi"):
            sh.tabix("-p", "vcf", in_file)

        # Find the chromosome names from the FASTA index file.
        chroms = str(sh.cut("-f1", self.fasta_index)).split()
        break_dir = os.path.join(os.path.dirname(in_file), "break")
        safe_makedir(break_dir)

        def chr_out(chrom):
            # Name the per-chromosome file via the project helpers.
            out_file = os.path.join(break_dir, append_stem(in_file, chrom))
            out_file = replace_suffix(out_file, "vcf")
            return out_file

        def tabix(chrom):
            # Extract one chromosome (with header, -h) unless already done.
            out_file = chr_out(chrom)
            if file_exists(out_file):
                return out_file
            with file_transaction(out_file) as tmp_out_file:
                sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
            return out_file

        # Use tabix to separate out the variants based on chromosome. Build
        # the list eagerly: on Python 3 ``map`` is lazy, so the tabix calls
        # would not run unless the caller iterated the result.
        return [tabix(chrom) for chrom in chroms]
예제 #5
0
파일: stages.py 프로젝트: anindya028/bipy
    def _break_vcf(self, in_file):
        """Split a VCF file into one file per chromosome using tabix.

        Chromosome names come from the first column of ``self.fasta_index``,
        which is generated from ``self.fasta_file`` with ``samtools faidx``
        when it does not exist yet. An input that does not end in ``.gz`` is
        bgzip-compressed to ``<in_file>.gz`` first. The per-chromosome files
        are written to a ``break`` directory next to the (compressed) input.

        Args:
            in_file: path to the VCF file, plain or ending in ``.gz``.

        Returns:
            A list of output file paths, one per chromosome, in the order
            the chromosomes appear in the FASTA index.
        """
        if not file_exists(self.fasta_index):
            sh.samtools.faidx(self.fasta_file)

        # If the file is not compressed, compress it.
        # This must be a value comparison: ``ext is not ".gz"`` tests object
        # identity, which is true for the freshly built string splitext
        # returns, so compressed inputs were being compressed a second time.
        (_, ext) = os.path.splitext(in_file)
        if ext != ".gz":
            gzip_file = in_file + ".gz"
            sh.bgzip("-c", in_file, _out=gzip_file)
            in_file = gzip_file

        # Create the tabix index if it does not exist already.
        if not file_exists(in_file + ".tbi"):
            sh.tabix("-p", "vcf", in_file)

        # Find the chromosome names from the FASTA index file.
        chroms = str(sh.cut("-f1", self.fasta_index)).split()
        break_dir = os.path.join(os.path.dirname(in_file), "break")
        safe_makedir(break_dir)

        def chr_out(chrom):
            # Name the per-chromosome file via the project helpers.
            out_file = os.path.join(break_dir, append_stem(in_file, chrom))
            out_file = replace_suffix(out_file, "vcf")
            return out_file

        def tabix(chrom):
            # Extract one chromosome (with header, -h) unless already done.
            out_file = chr_out(chrom)
            if file_exists(out_file):
                return out_file
            with file_transaction(out_file) as tmp_out_file:
                sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
            return out_file

        # Use tabix to separate out the variants based on chromosome. Build
        # the list eagerly: on Python 3 ``map`` is lazy, so the tabix calls
        # would not run unless the caller iterated the result.
        return [tabix(chrom) for chrom in chroms]