def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """Compress ``fpath`` with bgzip and index the result with tabix.

    :param fpath: path of the file to compress and index.
    :param reuse: if True and an up-to-date ``.gz`` plus ``.tbi`` pair
        already exists, skip all work and return the existing ``.gz`` path.
    :param tabix_parameters: extra command-line options forwarded to tabix.
    :return: path to the bgzipped file (``fpath + '.gz'``), or the original
        ``fpath`` when the required executables are not available.
    """
    gzipped_fpath = join(fpath + '.gz')
    tbi_fpath = gzipped_fpath + '.tbi'

    # Reuse only when both outputs exist and are at least as new as the
    # files they were derived from (the source may already be gone, hence
    # the conditional getctime comparison).
    if reuse and \
       file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
       file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')

    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # BUG FIX: the original bailed out only when BOTH tools were missing
    # (`and`), so a single missing tool let execution continue and the
    # literal string "None" was formatted into the command line below.
    if not bgzip or not tabix:
        return fpath

    # Remove stale outputs so bgzip/tabix never refuse to overwrite.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
def ungzip_if_needed(cnf, fpath, silent=False):
    """Make sure the uncompressed version of ``fpath`` is on disk.

    A trailing ``.gz`` is stripped from ``fpath``; if the plain file is
    missing but its ``.gz`` companion exists, it is decompressed with a
    shell ``gunzip -c ... > ...`` redirect.

    :param cnf: configuration object (unused here, kept for the caller API).
    :param fpath: path that may or may not carry a ``.gz`` suffix.
    :param silent: suppress the blank info() line after decompression.
    :return: the uncompressed path, or None when decompression failed.
    """
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]

    needs_unpack = not file_exists(fpath) and file_exists(fpath + '.gz')
    if needs_unpack:
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath} > {fpath}'.format(**locals())
        res = run_simple(cmdline)
        if not silent:
            info()
        if not res:
            return None

    return fpath
def ungzip_if_needed(cnf, fpath, silent=False):
    """Make sure the uncompressed version of ``fpath`` is on disk.

    NOTE(review): this redefines the ``ungzip_if_needed`` declared just
    above; at import time this later definition wins. It differs only in
    decompressing via ``run(..., output_fpath=...)`` instead of a shell
    redirect — confirm which variant is intended and drop the other.

    :param cnf: configuration object (unused here, kept for the caller API).
    :param fpath: path that may or may not carry a ``.gz`` suffix.
    :param silent: suppress the blank info() line after decompression.
    :return: the uncompressed path, or None when decompression failed.
    """
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]

    needs_unpack = not file_exists(fpath) and file_exists(fpath + '.gz')
    if needs_unpack:
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath}'.format(**locals())
        res = run(cmdline, output_fpath=fpath)
        if not silent:
            info()
        if not res:
            return None

    return fpath
def tx2genefile(gtf, out_file=None):
    """
    write out a file of transcript->gene mappings.
    use the installed tx2gene.csv if it exists, else write a new one out

    :param gtf: path to the GTF annotation file.
    :param out_file: destination CSV path; reused if it already exists.
    :return: path to the tx2gene CSV file.
    """
    installed_tx2gene = os.path.join(os.path.dirname(gtf), "tx2gene.csv")
    if file_exists(installed_tx2gene):
        return installed_tx2gene
    # BUG FIX: out_file defaults to None, and the original called
    # file_exists(out_file) unguarded, which can raise TypeError. Guard with
    # truthiness first, matching the pattern used by partition_gtf in this
    # module: `if out_file and file_exists(out_file)`.
    if out_file and file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for k, v in transcript_to_gene(gtf).items():
                out_handle.write(",".join([k, v]) + "\n")
    return out_file
def partition_gtf(gtf, coding=False, out_file=False):
    """
    return a GTF file of all non-coding or coding transcripts. the GTF
    must be annotated with gene_biotype = "protein_coding" or to have the
    source column set to the biotype for all coding transcripts. set
    coding to True to get only the coding, false to get only the non-coding

    :param gtf: path to the annotated GTF file.
    :param coding: True keeps protein_coding features, False keeps the rest.
    :param out_file: destination path; a temp .gtf is created when falsy.
    :return: path to the partitioned GTF file.
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gtf").name

    # Select the predicate once, outside the feature loop.
    if coding:
        def keep(biotype):
            return biotype and biotype == "protein_coding"
    else:
        def keep(biotype):
            return biotype and biotype != "protein_coding"

    biotype_lookup = _biotype_lookup_fn(gtf)
    db = get_gtf_db(gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.all_features():
                if keep(biotype_lookup(feature)):
                    out_handle.write(str(feature) + "\n")
    return out_file
def gtf_to_bed(gtf, alt_out_dir=None):
    """
    create a BED file of transcript-level features with attached gene name
    or gene ids

    :param gtf: path to the GTF file.
    :param alt_out_dir: fallback directory when the GTF's directory is not
        writable; IOError is raised if it is needed but not given.
    :return: path to the BED file (``<gtf basename>.bed``).
    """
    out_file = os.path.splitext(gtf)[0] + '.bed'
    if file_exists(out_file):
        return out_file

    # Fall back to alt_out_dir when we cannot write next to the GTF.
    target_dir = os.path.dirname(out_file)
    if not os.access(target_dir, os.W_OK | os.X_OK):
        if not alt_out_dir:
            raise IOError('Cannot write transcript BED output file %s' % out_file)
        out_file = os.path.join(alt_out_dir, os.path.basename(out_file))

    with open(out_file, "w") as out_handle:
        db = get_gtf_db(gtf)
        for tx in db.features_of_type('transcript', order_by=("seqid", "start", "end")):
            # Prefer the human-readable gene_name, fall back to gene_id.
            attr_keys = tx.attributes.keys()
            label = tx['gene_name'][0] if 'gene_name' in attr_keys else tx['gene_id'][0]
            fields = [tx.chrom, tx.start, tx.end, label, ".", tx.strand]
            out_handle.write("\t".join(str(f) for f in fields) + "\n")
    return out_file
def get_gtf_db(gtf, in_memory=False):
    """
    create a gffutils DB

    Reuses an existing on-disk database (``<gtf>.db``, with any ``.gz``
    suffix stripped) when present, otherwise builds one.

    :param gtf: path to the GTF file, optionally gzip-compressed.
    :param in_memory: build the database in ':memory:' instead of on disk.
    :return: a gffutils.FeatureDB instance.
    """
    db_file = gtf + '.db'
    if gtf.endswith('.gz'):
        db_file = gtf[:-3] + '.db'
    if file_exists(db_file):
        return gffutils.FeatureDB(db_file)
    # BUG FIX (dead code): the original re-tested file_exists(db_file) here
    # and had an `else: return gffutils.FeatureDB(db_file)` branch, which is
    # unreachable — an existing on-disk DB already returned above, and the
    # ':memory:' path never exists on disk. The creation path is all that
    # can run at this point.
    db_file = ':memory:' if in_memory else db_file
    debug('GTF database does not exist, creating...')
    infer_extent = guess_infer_extent(gtf)
    db = gffutils.create_db(gtf, dbfn=db_file, infer_gene_extent=infer_extent)
    return db