def add_promoter(genes_gff, chromsize, promoter_length):
    """Add a promoter of the given length upstream of each gene in the input
    GFF file and return the promoters as a pybedtools BedTool object."""
    # Parse the GFF file containing only genes.
    genes = BedTool(genes_gff)
    # Extract promoters upstream using the chromosome-sizes file and the
    # specified promoter length. l: bp to add upstream of the start
    # coordinate; r: bp to add to the end coordinate; s: based on strand.
    promoters = genes.flank(g=chromsize, l=promoter_length, r=0, s=True)
    return promoters
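
# A minimal usage sketch for add_promoter. The file names are hypothetical;
# the chromsize file is the two-column <chrom><TAB><length> format that
# bedtools flank expects via g=.
from pybedtools import BedTool

promoters = add_promoter('genes_only.gff', 'genome.chrom.sizes',
                         promoter_length=1000)
promoters.saveas('promoters_1kb.bed')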

class GenomicSubset(object):

    def __init__(self, name, path=paths.genome_subsets, assembly='hg19'):
        self.assembly = assembly
        self.name = name
        self.bedtool = BedTool(path + name + '.bed').sort()

        # Intersect the pathway with the appropriate genome build.
        # TODO: this step should be unnecessary if the pathways are correct
        if name != self.assembly:
            self.bedtool = GenomicSubset.reference_genome(
                self.assembly).bedtool.intersect(self.bedtool).sort().saveas()

    def expand_by(self, expansion_in_each_direction_Mb):
        window_size_str = str(expansion_in_each_direction_Mb) + 'Mb'
        print('total size before window addition:',
              self.bedtool.total_coverage(), 'bp')

        # Compute the flanks.
        # TODO: use 1cM instead of 1Mb
        print('computing flanks')
        flanks = self.bedtool.flank(
            genome=self.assembly,
            b=expansion_in_each_direction_Mb * 1000000).sort().merge().saveas()

        # Compute the union of the flanks and the pathway.
        print('computing union')
        union = self.bedtool.cat(flanks, postmerge=False).sort()
        merged = union.merge().saveas()
        print('total size after window addition:',
              merged.total_coverage(), 'bp')
        self.bedtool = merged

    def restricted_to_chrom_bedtool(self, chrnum):
        return self.bedtool.filter(
            lambda x: x[0] == 'chr' + str(int(chrnum))).saveas()

    @classmethod
    def reference_genome(cls, assembly='hg19'):
        return GenomicSubset(assembly, path=paths.reference, assembly=assembly)

    @classmethod
    def reference_chrom_bedtool(cls, chrnum, assembly='hg19'):
        return cls.reference_genome(
            assembly=assembly).restricted_to_chrom_bedtool(chrnum)

    @classmethod
    def whole_genome(cls, assembly='hg19'):
        return cls(assembly, path=paths.reference)
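
# A hedged usage sketch for GenomicSubset: 'Height' is a hypothetical subset
# name that must exist as a .bed file under paths.genome_subsets. Expands each
# interval by 1 Mb per side (flank + merge, as defined above) and then
# restricts to chr22.
subset = GenomicSubset('Height', assembly='hg19')
subset.expand_by(1)
chr22 = subset.restricted_to_chrom_bedtool(22)
print(chr22.total_coverage(), 'bp retained on chr22')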

def get_gene_dataframe(self, peak_bed, up=100000, down=100000):
    # All overlapping enhancer-TSS pairs (TSS -100000 to +100000),
    # with distances.
    peaks = BedTool(peak_bed)
    b = BedTool(self.gene_bed)
    b = b.flank(l=1, r=0, s=True, g=self.gsize).slop(  # noqa: E741
        l=up, r=down, g=self.gsize, s=True  # noqa: E741
    )
    # bedtools flank -l 1 -r 0 -s -i b.bed -g gsize
    #   1 bp upstream of each gene, i.e. the TSS: e.g. Chr01 12800 12801 for
    #   Chr01 4170 12800 Xetrov90000001m.g 0 -
    # bedtools slop -l up -r down -s -i b.bed -g gsize
    #   |up--TSS--down|
    vals = []
    # for f in b.intersect(peaks, wo=True, nonamecheck=True):
    for f in b.intersect(peaks, wo=True):
        # bedtools intersect -wo -a b.bed -b peaks.bed
        chrom = f[0]
        strand = f[5]
        if strand == "+":
            tss = f.start + up
        else:
            tss = f.start + down
        gene = f[3]
        peak_start, peak_end = int(f[13]), int(f[14])
        vals.append([chrom, tss, gene, peak_start, peak_end])
    p = pd.DataFrame(
        vals, columns=["chrom", "tss", "gene", "peak_start", "peak_end"])
    # Peak midpoint, cast to int so the distance is an integer.
    p["peak"] = [int(i) for i in (p["peak_start"] + p["peak_end"]) / 2]
    p["dist"] = np.abs(p["tss"] - p["peak"])
    p["loc"] = (p["chrom"] + ":" + p["peak_start"].astype(str) + "-"
                + p["peak_end"].astype(str))
    p = p.sort_values("dist").drop_duplicates(
        ["loc", "gene"], keep="first")[["gene", "loc", "dist"]]
    # Remove interactions more than `up` (100 kb) away; used for the weight
    # calculation.
    p = p[p["dist"] < up - 1]
    p.gene = [i.upper() for i in list(p.gene)]
    return p
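
# A self-contained sketch of the flank/slop idiom used above: flank(l=1, r=0,
# s=True) reduces each gene to its 1 bp TSS, and slop then grows a stranded
# window around it. File names here are hypothetical.
from pybedtools import BedTool

genes = BedTool('genes.bed')  # stranded BED6
tss = genes.flank(l=1, r=0, s=True, g='genome.chrom.sizes')
tss_windows = tss.slop(l=100000, r=100000, s=True, g='genome.chrom.sizes')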

def get_promoter_dataframe(self, peak_bed, up=2000, down=2000):
    # All overlapping promoter-peak pairs (TSS -2000 to +2000).
    peaks = BedTool(peak_bed)
    b = BedTool(self.gene_bed)
    b = b.flank(l=1, r=0, s=True, g=self.gsize).slop(  # noqa: E741
        l=up, r=down, g=self.gsize, s=True  # noqa: E741
    )
    vals = []
    # for f in b.intersect(peaks, wo=True, nonamecheck=True):
    for f in b.intersect(peaks, wo=True):
        chrom = f[0]
        gene = f[3]
        peak_start, peak_end = int(f[13]), int(f[14])
        vals.append([chrom, gene, peak_start, peak_end])
    prom = pd.DataFrame(
        vals, columns=["chrom", "gene", "peak_start", "peak_end"])
    prom["loc"] = (prom["chrom"] + ":" + prom["peak_start"].astype(str)
                   + "-" + prom["peak_end"].astype(str))
    prom.gene = [i.upper() for i in list(prom.gene)]
    return prom
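
# Hedged usage sketch: `ann` stands for a hypothetical instance of the
# surrounding class, with `gene_bed` and `gsize` attributes set.
# prom = ann.get_promoter_dataframe('atac_peaks.bed', up=2000, down=2000)
# prom.groupby('gene')['loc'].nunique()  # overlapping peaks per promoter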

def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
    Description: Create a matrix to be used by 'profile' and 'heatmap' commands.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message("The region (-u/-d) needs to be extended given the number "
                    "of bins (--bin-nb)",
                    type="ERROR")

    # -------------------------------------------------------------------------
    # Check that the output file name does not end with .zip
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check that the input file is in BED or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message("--ft-type can not be set to user_regions"
                    " when a gtf is provided.",
                    type="ERROR")
    else:
        try:
            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message("Set --ft-type to 'user_regions' or 'single_nuc'"
                        " when using an input bed file.",
                        type="ERROR")

            # Check that the strand is provided and
            # located in the right column (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message("Regions in bed file should have "
                                "unique identifier (col 4).",
                                type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message("Region length should be 1 nucleotide "
                                    "long when 'single_nuc' is set. "
                                    "Use 'user_regions'.",
                                    type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message("Region length should not be 1 nucleotide "
                                    "long when 'user_regions' is set. "
                                    "Use 'single_nuc'.",
                                    type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams,
    # taking user input into account.
    # -------------------------------------------------------------------------

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels matches the number of bigwig files.
        if len(labels) != len(bigwiglist):
            message("The number of labels should be the same as the number of"
                    " bigwig files.",
                    type="ERROR")
        # Ensure labels are non-redundant.
        if len(labels) > len(set(labels)):
            message("Labels must be unique.",
                    type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [
                os.path.splitext(os.path.basename(bigwiglist[i].name))[0]
            ]

    # -------------------------------------------------------------------------
    # Get the requested transcript lines in bed format.
    # Transcripts are restricted to those found on chromosomes
    # declared in the bigwig files.
    # -------------------------------------------------------------------------

    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:
        message('Selecting chromosomes declared in bigwig from gtf.')
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated with
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        # Slop tss and promoters.
        # Not needed if a transcript was requested (it will be flanked by
        # upstream and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")
            main_region_bo = tmp.to_bed(name=["transcript_id"])
        elif ft_type == 'promoter':
            message("Getting promoter regions [-%d,+%d]." %
                    (upstream, downstream))
            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        elif ft_type == 'tts':
            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
    else:
        message("Loading regions")
        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for traceability.
    main_region_bed = make_tmp_file(prefix="region" + ft_type,
                                    suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    # Print a header in the output file
    # -------------------------------------------------------------------------

    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested regions.
    # Each worker will send a file.
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If a transcript or user regions were requested,
    # we must process the flanking regions.
    # We need to retrieve coverage of the promoter [-upstream, 0]
    # separately, since the transcript coverage window size depends on
    # transcript length, whereas for promoters the window length is fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for the regions around the TSS and TTS.
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" + ft_type,
                                              suffix=".bed")
            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." % around_bin_nb)
                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)

            dws_bed_file = make_tmp_file(prefix="downstream_region" + ft_type,
                                         suffix=".bed")
            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    # Merge the files using pandas
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")

    # Save strand, start and end; they will be re-added after the merges.
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(['start', 'end'], axis=1)
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(['start', 'end'], axis=1)
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # Re-add start and end.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    # Compress
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
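
# A self-contained sketch of how mk_matrix derives its flanking regions when
# ft_type is 'user_regions' (file names hypothetical): flank with l only
# yields the stranded upstream block, flank with r only the downstream one.
from pybedtools import BedTool

regions = BedTool('user_regions.bed')  # stranded BED6
ups = regions.flank(s=True, l=1000, r=0, g='genome.chrom.sizes')
dws = regions.flank(s=True, l=0, r=1000, g='genome.chrom.sizes')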

    temp_string = (l_split[1] + '\t' + 'na' + '\t' + 'na' + '\t'
                   + l_split[2] + '\t' + l_split[2] + '\t' + '.' + '\t'
                   + '.' + '\t' + '.' + '\t' + l_split[0] + '\n')
    gff_string = gff_string + temp_string
    i += 1

target_gff = BedTool(gff_string, from_string=True)

## Annotation files.
print('Calculating annotations ...')
gencode_ann = BedTool(path_to_gencode_ann).sort()
protein_coding_genes_ann = gencode_ann.filter(
    lambda x: x[2] == 'gene').filter(
    lambda x: 'gene_type "protein_coding"' in x[8]).sort()
CGI_ann = BedTool(path_to_CGI).sort()
# Shores: the 2 kb flanking each CGI.
shore_ann = CGI_ann.flank(g=path_to_chr_lengths, b=2000).sort()
# Shelves: the next 2 kb out (4 kb flanks minus the shores).
shelf_ann = CGI_ann.flank(g=path_to_chr_lengths, b=4000).subtract(shore_ann).sort()
ChrHMM_ann = BedTool(path_to_ChrHMM).sort()

## Intersections
print('Performing gene bodies / CGI intersections ...')
in_gene_bodies_cgs = list(set(list(target_gff.intersect(
    protein_coding_genes_ann).sort().to_dataframe()['attributes'])))
# 15319 / 21368 CpGs are in gene bodies
in_CGI_cgs = list(set(list(target_gff.intersect(
    CGI_ann).sort().to_dataframe()['attributes'])))
# 9319 / 21368 CpGs are in CGIs
in_shore_cgs = list(set(list(target_gff.intersect(
    shore_ann).sort().to_dataframe()['attributes'])))
# 7920 / 21368 CpGs are in shores
in_shelf_cgs = list(set(list(target_gff.intersect(
    shelf_ann).sort().to_dataframe()['attributes'])))
# 1138 / 21368 CpGs are in shelves
# 2991 / 21368 CpGs are in the open sea
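
# Worked example of the shore/shelf geometry above: for a CGI at
# chr1:10000-11000, flank(b=2000) yields shores chr1:8000-10000 and
# chr1:11000-13000, while flank(b=4000).subtract(shores) leaves the shelves
# chr1:6000-8000 and chr1:13000-15000.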

def create(
    cls: Type[T],
    outdir: str,
    data_files: List[str],
    enhancer_file: str,
    annotation_file: str,
    genome: str,
    window: Optional[int] = 2000,
    anno_file: Optional[str] = None,
    anno_from: Optional[str] = None,
    anno_to: Optional[str] = None,
    gene_mapping: Optional[str] = None,
    threshold: Optional[float] = 1.0,
    version: Optional[str] = "0.1.0",
) -> T:
    outdir = Path(outdir)
    basename = outdir.name
    meanstd_file = outdir / f"{basename}.{genome}.meanstd.tsv.gz"
    target_file = outdir / f"{basename}.{genome}.target.npz"
    gene_file = outdir / "annotation.tss.merged1kb.bed"
    link_file = outdir / "enhancers2genes.feather"

    g = Genome(genome)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    info = {
        "genes": "genes.txt",
        "enhancers": "enhancers.feather",
        "link_file": os.path.basename(link_file),
        "genome": genome,
        "window": window,
        "meanstd_file": os.path.basename(meanstd_file),
        "target_file": os.path.basename(target_file),
        "gene_file": os.path.basename(gene_file),
        "version": version,
        "schema_version": __schema_version__,
    }

    if anno_file is not None:
        if not os.path.exists(anno_file):
            raise ValueError(f"{anno_file} does not exist")
        if anno_from is None or anno_to is None:
            raise ValueError("Need anno_from and anno_to columns!")
        copyfile(anno_file, outdir / os.path.basename(anno_file))
        info.update({
            "anno_file": os.path.basename(anno_file),
            "anno_from": anno_from,
            "anno_to": anno_to,
        })

    if gene_mapping is not None:
        if not os.path.exists(gene_mapping):
            raise ValueError(f"{gene_mapping} does not exist")
        copyfile(gene_mapping, outdir / os.path.basename(gene_mapping))
        info["gene_mapping"] = os.path.basename(gene_mapping)

    logger.info("processing gene annotation")
    # Convert the gene annotation: keep only chromosomes present in the
    # enhancer file, reduce each gene to its 1 bp TSS with flank, then merge
    # TSSes within 1 kb.
    b = BedTool(annotation_file)
    chroms = set([f.chrom for f in BedTool(enhancer_file)])
    b = b.filter(lambda x: x.chrom in chroms)
    b = (b.flank(g=g.sizes_file, l=1, r=0)  # noqa: E741
         .sort()
         .merge(d=1000, c=4, o="distinct"))
    b.saveas(str(gene_file))

    logger.info("processing data files")
    # Create the coverage table.
    df = coverage_table(
        enhancer_file,
        data_files,
        window=window,
        log_transform=True,
        normalization="quantile",
        ncpus=12,
    )

    df.index.rename("loc", inplace=True)
    df.reset_index().to_feather(f"{outdir}/enhancers.feather")
    np.savez(target_file, target=df.iloc[:, 0].sort_values())

    meanstd = pd.DataFrame(index=df.index)
    meanstd["mean"] = df.mean(1)
    meanstd["std"] = df.std(1)
    meanstd = meanstd.reset_index().rename(columns={"loc": "index"})
    meanstd.to_csv(meanstd_file, compression="gzip", index=False, sep="\t")

    df.index.rename("loc", inplace=True)
    # Scale each region to mean 0 and unit standard deviation; this overwrites
    # the raw table written above.
    df = df.sub(df.mean(1), axis=0)
    df = df.div(df.std(1), axis=0)
    df.reset_index().to_feather(f"{outdir}/enhancers.feather")

    link = create_link_file(meanstd_file, gene_file, genome=genome)
    link.to_feather(link_file)

    genes = _create_gene_table(
        df,
        meanstd_file,
        gene_file,
        gene_mapping,
        genome=genome,
        link_file=link_file,
        threshold=threshold,
    )
    genes.to_csv(f"{outdir}/genes.txt", sep="\t")

    with open(f"{outdir}/info.yaml", "w") as f:
        yaml.dump(info, f)

    return ScepiaDataset(outdir)
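
# Hedged usage sketch (paths hypothetical; assumes the genome is installed so
# that Genome(genome).sizes_file resolves, and that create() is a classmethod
# of ScepiaDataset, as its return value suggests):
# dataset = ScepiaDataset.create(
#     outdir="my_dataset",
#     data_files=["sample1.bw", "sample2.bw"],
#     enhancer_file="enhancers.bed",
#     annotation_file="genes.bed12",
#     genome="hg38",
# )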

import numpy as np
import pandas as pd
from scipy import stats
from pybedtools import BedTool


def filter_chipseq_files(infile):
    """Keep only peaks whose signal z-score exceeds 1.64 (~95th percentile)."""
    df = pd.read_table(infile, sep='\t', header=None, names=[
        'chrom', 'start', 'end', 'name', 'score', 'strand',
        'signal', 'p', 'q', 'peak'
    ])
    df['signal_zscore'] = stats.zscore(df.signal)
    fltr_df = df[df.signal_zscore > 1.64]
    return BedTool.from_dataframe(fltr_df)


# Make promoter file | general promoter set
genes = BedTool('rnaseq_ensembl_genemodels_filter.bed')
tss_window = genes.flank(l=2000, r=0, s=True, genome='hg19')
tss_window.saveas('hg19_tss_2kb.bed')

# Make promoter file | requires overlap with high-signal H3K27ac/H3K4me3
h3k27ac = filter_chipseq_files('E073-H3K27ac.narrowPeak.gz')
h3k4me3 = filter_chipseq_files('E073-H3K4me3.narrowPeak.gz')
tss_high_conf = tss_window.intersect(h3k27ac, u=True).intersect(h3k4me3, u=True)
tss_high_conf.saveas('hg19_tss_high_conf.bed')
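
# Optional sanity check (a sketch using the objects defined above): report
# what fraction of the general promoter set survives the ChIP-seq filter.
n_all = len(tss_window)
n_conf = len(tss_high_conf)
print(f'{n_conf}/{n_all} promoters ({n_conf / n_all:.1%}) are high-confidence')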