def index_gff(gff, compress):
    """Index and optionally compress a GFF file.

    \b
    Examples:
        bionorm index_gff Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.ann1.FAKE.gene_models_main.gff3

    """
    # NOTE(review): the "\b" marker suggests this docstring doubles as CLI
    # help text, so parameter documentation lives in comments instead:
    #   gff      -- path to the GFF file to index.
    #   compress -- when true, bgzip the file in place before indexing.
    # Returns the Path that was indexed (the ".gz" path when compressed).
    # Calls sys.exit(1) after logging an error when the path has no
    # extension, is already compressed, or is not a recognized GFF type.
    from sh import bgzip  # isort:skip
    from sh import tabix  # isort:skip

    target = Path(gff)
    if not target.suffixes:
        error_message = f"Target {target} does not have a file extension."
        logger.error(error_message)
        sys.exit(1)

    # Only the final suffix is examined, without its leading dot.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} before indexing.")
        sys.exit(1)
    if extension not in GFF_TYPES:
        logger.error(
            f"File {target} does not have a recognized GFF extension."
        )
        sys.exit(1)

    if compress:
        # "-f" forces bgzip to overwrite an existing compressed file.
        bgzip(["-f", str(target)])
        target = Path(target.parent) / f"{target.name}.gz"
    # NOTE(review): tabix normally expects bgzip-compressed input; when
    # compress is false the file is passed as-is -- confirm this is intended.

    # Two tabix runs: the default index, then a CSI-format index.
    tabix(["-p", "gff", str(target)])
    tabix(["--csi", "-p", "gff", str(target)])
    return target
def tabix(chrom):
    """Extract the records for one chromosome into their own file.

    The destination path comes from ``chr_out(chrom)``. If that file
    already exists it is reused; otherwise ``tabix -h`` is run against
    ``in_file`` for ``chrom`` and its output is sent to the temporary
    path supplied by ``file_transaction``.

    Returns the destination path in both cases.
    """
    destination = chr_out(chrom)
    if not file_exists(destination):
        # Write via the transaction's temporary path rather than directly
        # to the destination.
        with file_transaction(destination) as tx_destination:
            sh.tabix("-h", in_file, chrom, _out=tx_destination)
    return destination
def _break_vcf(self, in_file):
    """Split a VCF file into one VCF file per chromosome.

    Steps:
      1. Create the FASTA index with ``samtools faidx`` if
         ``self.fasta_index`` does not exist.
      2. bgzip ``in_file`` to ``in_file + ".gz"`` unless it already has a
         ``.gz`` extension.
      3. Create a tabix index (``.tbi``) for the compressed file if missing.
      4. For every name in the first column of the FASTA index, write that
         chromosome's records to a file in a ``break`` directory next to
         the input.

    Args:
        in_file: path to the VCF file, plain or ``.gz``.

    Returns:
        A list of per-chromosome output paths, in FASTA index order.
    """
    if not file_exists(self.fasta_index):
        sh.samtools.faidx(self.fasta_file)

    # if file is not compressed, compress it
    (_, ext) = os.path.splitext(in_file)
    # Compare by value: an identity test (`is not`) against a string
    # literal is effectively always true, which re-compressed files that
    # were already ".gz".
    if ext != ".gz":
        gzip_file = in_file + ".gz"
        sh.bgzip("-c", in_file, _out=gzip_file)
        in_file = gzip_file

    # create tabix index if it does not exist already
    if not file_exists(in_file + ".tbi"):
        sh.tabix("-p", "vcf", in_file)

    # find the chromosome names from the fasta index file
    chroms = str(sh.cut("-f1", self.fasta_index)).split()
    break_dir = os.path.join(os.path.dirname(in_file), "break")
    safe_makedir(break_dir)

    def chr_out(chrom):
        """Return the output path for ``chrom`` inside ``break_dir``."""
        out_file = os.path.join(break_dir, append_stem(in_file, chrom))
        out_file = replace_suffix(out_file, "vcf")
        return out_file

    def tabix(chrom):
        """Write the records of ``chrom`` to its own file, unless present."""
        out_file = chr_out(chrom)
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
        return out_file

    # use tabix to separate out the variants based on chromosome
    # Build the list eagerly: a lazy map() on Python 3 would postpone the
    # extraction until the caller happened to iterate the result.
    out_files = [tabix(chrom) for chrom in chroms]
    return out_files