def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):
    """Convert a BigWig file into a per-window coverage array via an external
    averaging tool (bigWigAverageOverBed-style command).

    Parameters
    ----------
    bigwig : str
        Path to the input BigWig file.
    species : str
        Species key used to load the binned genome.
    bigwig_cmd_path : str
        Path to the external BigWig-averaging executable.

    Returns
    -------
    np.ndarray
        Array of length ``len(genome)`` with mean coverage per window.

    Raises
    ------
    AssertionError
        If the external command exits with a non-zero status; the message is
        the command's stderr.
    """
    log = Log()
    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))
    log.append('Converting BigWig file to coverage array ...')
    # The external tool reads the genome bins from disk; write them lazily.
    if not os.path.exists(cls._get_genome_bin_path(species)):
        log.append('Writing bins ...')
        cls._write_genome_bins(species)
    # Create the temp file BEFORE entering the try block: if creation fails,
    # the finally clause would otherwise raise NameError on `temp.name` and
    # mask the real error.
    temp = tempfile.NamedTemporaryFile('w', delete=False)
    temp.close()
    try:
        process = subprocess.run(
            [bigwig_cmd_path, bigwig, cls._get_genome_bin_path(species), temp.name],
            capture_output=True)
        if process.returncode == 0:
            with open(temp.name, 'r') as cmd_output:
                for line in cmd_output:
                    fields = line.strip().split('\t')
                    # fields[0] is the window index, fields[4] the mean value;
                    # convert explicitly rather than relying on numpy's
                    # implicit string-to-float cast on assignment.
                    coverage_array[int(fields[0])] = float(fields[4])
            return coverage_array
        else:
            raise AssertionError(process.stderr.decode('utf-8'))
    finally:
        os.remove(temp.name)
def convert_bigwig(cls, bigwig, species, log=None):
    """Convert a BigWig file into a per-window coverage array using the
    in-process BigWig reader (`bw.open`).

    Parameters
    ----------
    bigwig : str
        Path to the input BigWig file.
    species : str
        Species key used to load the binned genome.
    log : Log, optional
        Logger to report progress to; a fresh Log is created when omitted.

    Returns
    -------
    np.ndarray
        Array of length ``len(genome)`` with mean coverage per window;
        NaNs are replaced with 0 via ``np.nan_to_num``.
    """
    if log is None:
        log = Log()
    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))
    log.append('Converting BigWig file to coverage array ...')
    bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)
    # Open the BigWig BEFORE the try block: if opening fails there is nothing
    # to close, and putting it inside try made the finally clause raise
    # NameError on `coverage_bw`, masking the real exception.
    coverage_bw = bw.open(bigwig)
    try:
        log.append(bar, update_line=True)
        for i, window in enumerate(genome.list_windows()):
            if window.chromosome in coverage_bw.chroms():
                mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                # stats() can report no data for a window; leave the
                # pre-filled 0.0 in that case (identical to what
                # nan_to_num would produce).
                if mean_coverage is not None:
                    coverage_array[i] = mean_coverage
            # Refresh the progress bar once per 1000 windows.
            if i % 1000 == 0:
                log.append(bar, update_line=True)
        return np.nan_to_num(coverage_array)
    finally:
        coverage_bw.close()
def main(species, motif_bed, window_size, gamma_threshold=0.95):
    """Bin motif hits from a gzipped BED file into genome windows, fit a gamma
    distribution to a sample of the binned scores, and return the indices of
    windows whose score exceeds the gamma_threshold quantile, along with the
    factor name taken from the first BED record.
    """
    genome = DataInterface.load_genome(species, window_size)
    log = Log(target=stderr)
    with gzip.open(motif_bed, 'rb') as handle:
        bed_lines = handle.readlines()
    bar = LoadingBar('Binning {} motif hits'.format(str(len(bed_lines))),
                     len(bed_lines), cold_start=True)
    factor_name = None
    binned_windows = []
    binned_scores = []
    for line_num, raw_line in enumerate(bed_lines):
        chrom, start, end, factor, relscore, log_pval, strand = raw_line.decode(
            'utf-8').strip().split('\t')
        if line_num == 0:
            factor_name = factor
        try:
            hit_windows = genome.get_region_windows(Region(chrom, start, end))
        except BadRegionError:
            # Hits outside the genome's windows are skipped.
            pass
        else:
            binned_windows.extend(hit_windows)
            binned_scores.extend([float(log_pval) / 100] * len(hit_windows))
        log.append(bar, update_line=True)
    log.append('')
    log.append('Done')
    # Round-trip through COO canonicalizes the column vector, summing any
    # duplicate window entries.
    hits = sparse.csc_matrix(
        (binned_scores, binned_windows, [0, len(binned_windows)]),
        shape=(len(genome), 1)).tocoo().tocsc()
    sample_hit_scores = np.random.choice(
        np.array(hits.todense()).reshape(-1), size=10000)
    min_bin_score = gamma(*gamma.fit(sample_hit_scores)).ppf(gamma_threshold)
    keep = (hits.data >= min_bin_score) & (hits.data > 0)
    return hits.indices[keep], factor_name
def main(species, window_size, path):
    """Map the regions of a BED file onto genome window indices.

    Returns a list of the distinct window indices covered by any region;
    regions that fall outside the genome are silently skipped.
    """
    genome = DataInterface.load_genome(species, window_size)
    bed_regions = [Region(*fields) for fields in parse_bedfile(path, header = False)]
    unique_windows = set()
    for bed_region in bed_regions:
        try:
            unique_windows.update(genome.get_region_windows(bed_region))
        except BadRegionError:
            # Region not mappable onto the genome's windows — skip it.
            pass
    return list(unique_windows)
def main(*, species, motif_bed, window_size, dataset_id, output, pval_cutoff=430):
    """Bin significant motif hits from a gzipped JASPAR BED file into genome
    windows and write tab-separated (dataset_id, window, -log10 p-value) rows
    to *output*; a final (dataset_id, FACTOR, JASPAR) summary row goes to
    stdout.

    Parameters
    ----------
    species : str
        Species key used to load the binned genome.
    motif_bed : str
        Path to the gzipped BED file of motif hits.
    window_size : int
        Window size used to bin the genome.
    dataset_id : str
        Identifier written as the first column of every output row.
    output : str
        Path of the file to write binned hits to.
    pval_cutoff : int, optional
        Minimum (scaled) -log10 p-value a hit must reach to be binned.
        Default 430 preserves the previously hard-coded threshold; the old
        comment claimed it was adjusted by file size, but the code never did —
        it is now a real parameter instead.
    """
    genome = DataInterface.load_genome(species, window_size)
    factor_name = None
    with open(output, 'w') as o, gzip.open(motif_bed, 'rb') as bed:
        for i, line in enumerate(bed):
            chrom, start, end, factor, relscore, log_pval, strand = line.decode(
                'utf-8').strip().split('\t')
            if i == 0:
                # All records share one factor; report it once up front.
                factor_name = factor
                print('Binning {} motifs with pval cutoff of {} ...'.format(
                    factor_name.upper(), str(pval_cutoff)), file = stderr)
            neg_log10_pval = int(log_pval)
            if neg_log10_pval >= pval_cutoff:
                try:
                    hit_windows = genome.get_region_windows(Region(chrom, start, end))
                except BadRegionError:
                    # Hits outside the genome's windows are skipped.
                    pass
                else:
                    for hit_window in hit_windows:
                        print(dataset_id, hit_window, neg_log10_pval,
                              sep = '\t', file = o)
    # Guard against an empty input file: factor_name would still be None and
    # the unconditional `.upper()` used to raise AttributeError.
    if factor_name is not None:
        print(dataset_id, factor_name.upper(), 'JASPAR', sep = '\t')