def main(species, window_size, motif_metadata, bin_sorted_hits, group_loci=100000):
    """Stream bin-sorted motif hits into a sparse binding dataset.

    Reads motif metadata, registers a binding dataset, then scans the
    hits file (sorted ascending by bin number) and appends sparse CSR
    segments of roughly *group_loci* bins at a time.

    Parameters
    ----------
    species : str
        Species identifier understood by ``DataInterface``.
    window_size : int
        Genome bin (window) size.
    motif_metadata : str
        Path to a headerless TSV with columns (dataset_id, factor, source).
    bin_sorted_hits : str
        Path to a whitespace-delimited file of (motif_id, bin_num, score)
        rows, sorted ascending by bin_num.
    group_loci : int
        Approximate number of hit rows to accumulate before flushing a
        matrix segment.

    Raises
    ------
    Exception
        If the hits file is not sorted by bin number.

    Note: the original body contained leftover debug code
    (``print(data.path); raise Exception()``) that aborted the function
    unconditionally before any work was done; it has been removed.
    """
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species, window_size=window_size,
                         download_if_not_exists=False, make_new=False,
                         load_genes=False)

    data.create_binding_dataset(TECHNOLOGY, motif_metadata.index.values,
                                **motif_metadata.to_dict('list'))

    # Map each dataset id to its column index in the binding matrix.
    dataset_ids = data.list_binding_datasets(TECHNOLOGY)
    id_to_idx_map = dict(zip(dataset_ids, np.arange(len(dataset_ids))))

    current_pos = 0
    last_added_chunk = 0  # bin offset of the segment currently being built
    i = 0
    rows, cols, scores = [], [], []
    with open(bin_sorted_hits, 'r') as f:
        for line in f:
            motif_id, bin_num, score = line.strip().split()
            bin_num = int(bin_num)

            if bin_num < current_pos:
                raise Exception('Input file not sorted!')
            elif bin_num > current_pos and i >= group_loci:
                # Flush the accumulated segment. Row indices are stored
                # relative to the start of the segment so each appended
                # CSR block begins at row 0.
                print('Adding matrix segment ...')
                matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
                data.append_csr(TECHNOLOGY, matrix_form)
                last_added_chunk = bin_num
                i = 0
                rows, cols, scores = [], [], []

            tf_idx = id_to_idx_map[motif_id]
            rows.append(bin_num - last_added_chunk)
            cols.append(tf_idx)
            scores.append(int(score))
            current_pos = bin_num
            i += 1

    # Flush whatever remains after the last full segment.
    if len(rows) > 0:
        matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
        data.append_csr(TECHNOLOGY, matrix_form)
def convert_bigwig(cls, bigwig, species, bigwig_cmd_path, log=None):
    """Convert a BigWig file to a per-bin coverage array via an external tool.

    Runs the external command (e.g. bigWigAverageOverBed) against the
    class's genome-bin BED file and parses its TSV output into a numpy
    array indexed by bin number.

    Parameters
    ----------
    bigwig : str
        Path to the input BigWig file.
    species : str
        Species identifier understood by ``DataInterface``.
    bigwig_cmd_path : str
        Path to the external BigWig-averaging executable.
    log : Log, optional
        Logger to append progress messages to; a fresh ``Log`` is created
        when omitted (added for consistency with the pyBigWig-based
        variant; default behavior is unchanged).

    Returns
    -------
    np.ndarray
        Coverage value per genome bin.

    Raises
    ------
    AssertionError
        With the subprocess's stderr text when the command fails.
    """
    if log is None:
        log = Log()
    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))
    log.append('Converting BigWig file to coverage array ...')

    if not os.path.exists(cls._get_genome_bin_path(species)):
        log.append('Writing bins ...')
        cls._write_genome_bins(species)

    # Create the temp file BEFORE the try block: if creation failed inside
    # the try, the finally clause would hit a NameError on ``temp``.
    temp = tempfile.NamedTemporaryFile('w', delete=False)
    temp.close()
    try:
        process = subprocess.run(
            [bigwig_cmd_path, bigwig, cls._get_genome_bin_path(species), temp.name],
            capture_output=True)
        if process.returncode == 0:
            with open(temp.name, 'r') as cmd_output:
                for line in cmd_output:
                    fields = line.strip().split('\t')
                    # fields[0]: bin index; fields[4]: coverage value
                    # (presumably the tool's mean-coverage column — the
                    # exact output layout depends on the external command).
                    coverage_array[int(fields[0])] = float(fields[4])
            return coverage_array
        else:
            raise AssertionError(process.stderr.decode('utf-8'))
    finally:
        os.remove(temp.name)
def convert_bigwig(cls, bigwig, species, log=None):
    """Convert a BigWig file to a per-bin mean-coverage array using pyBigWig.

    Iterates every window of the binned genome, querying the BigWig for
    the window's mean coverage when the window's chromosome exists in the
    BigWig file.

    Parameters
    ----------
    bigwig : str
        Path to the input BigWig file.
    species : str
        Species identifier understood by ``DataInterface``.
    log : Log, optional
        Logger for progress output; a fresh ``Log`` is created when omitted.

    Returns
    -------
    np.ndarray
        Mean coverage per genome window, with NaNs replaced by 0.
    """
    if log is None:
        log = Log()
    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))
    log.append('Converting BigWig file to coverage array ...')
    bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

    # Open OUTSIDE the try block: in the original, a failed ``bw.open``
    # left ``coverage_bw`` unbound, so the finally clause raised a
    # NameError that masked the real error.
    coverage_bw = bw.open(bigwig)
    try:
        log.append(bar, update_line=True)
        for i, window in enumerate(genome.list_windows()):
            if window.chromosome in coverage_bw.chroms():
                # NOTE(review): pyBigWig ``stats`` can return [None] for
                # empty intervals — presumably upstream data guarantees
                # coverage here; confirm if crashes are seen.
                mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                coverage_array[i] = mean_coverage
            if i % 1000 == 0:
                log.append(bar, update_line=True)
        # Windows with NaN coverage are treated as zero coverage.
        return np.nan_to_num(coverage_array)
    finally:
        coverage_bw.close()
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Register a 'Motif' binding dataset from the motif metadata table.

    Loads cistrome and motif metadata tables and creates an (empty)
    'Motif' binding dataset keyed by the motif dataset ids.
    ``index_files`` is accepted for interface compatibility but not used
    by this step.
    """
    cistrome_metadata = pd.read_csv(cistrome_metadata, sep='\t').set_index('DCid')
    # DCids are matched as strings elsewhere in the pipeline.
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    motif_metadata = (
        pd.read_csv(motif_metadata, sep='\t', header=None,
                    names=['dataset_id', 'factor', 'source'])
          .set_index('dataset_id')
          .drop_duplicates()
    )

    data = DataInterface(
        species,
        window_size=window_size,
        download_if_not_exists=False,
        make_new=False,
        load_genes=False,
    )

    data.create_binding_dataset('Motif', motif_metadata.index.values)
def main(species, motif_bed, window_size, gamma_threshold=0.95):
    """Bin motif hits from a gzipped BED file and keep high-scoring bins.

    Aggregates per-window motif scores, fits a gamma distribution to a
    random sample of the binned scores, and returns the indices of windows
    whose aggregate score exceeds the gamma quantile at *gamma_threshold*.

    Returns a tuple of (hit window indices, factor name from the BED file).
    """
    genome = DataInterface.load_genome(species, window_size)
    log = Log(target=stderr)
    factor_name = None
    window_nums, scores = [], []
    # Read the whole (gzipped) BED file up front so the loading bar knows
    # the total line count.
    with gzip.open(motif_bed, 'rb') as f:
        bed = f.readlines()
    bar = LoadingBar('Binning {} motif hits'.format(str(len(bed))), len(bed), cold_start=True)
    for i, line in enumerate(bed):
        # BED columns: chrom, start, end, factor, relative score,
        # -log10 p-value (scaled), strand.
        chrom, start, end, factor, relscore, log_pval, strand = line.decode(
            'utf-8').strip().split('\t')
        if i == 0:
            # All lines are assumed to describe the same factor; take the
            # name from the first record.
            factor_name = factor
        try:
            # NOTE(review): start/end are passed as strings — presumably
            # Region converts them; confirm against its constructor.
            hit_windows = genome.get_region_windows(
                Region(chrom, start, end))
            window_nums.extend(hit_windows)
            # Each overlapped window receives the hit's scaled score.
            scores.extend([float(log_pval) / 100] * len(hit_windows))
        except BadRegionError:
            # Regions outside the genome are silently skipped.
            pass
        log.append(bar, update_line=True)
    log.append('')
    log.append('Done')
    # Build a single-column CSC matrix of per-window scores; the
    # tocoo().tocsc() round trip sums duplicate entries so repeated hits
    # in the same window accumulate.
    hits = sparse.csc_matrix((scores, window_nums, [0, len(window_nums)]),
                             shape=(len(genome), 1)).tocoo().tocsc()
    # Sample 10k window scores (with replacement, zeros included) to fit
    # the background score distribution.
    sample_hit_scores = np.random.choice(np.array(hits.todense()).reshape(-1),
                                         size=10000)
    # Score cutoff = gamma quantile at the requested threshold.
    min_bin_score = gamma(*gamma.fit(sample_hit_scores)).ppf(gamma_threshold)
    # Keep windows that pass the cutoff and have a strictly positive score.
    hit_indices = hits.indices[(hits.data >= min_bin_score) & (hits.data > 0)]
    return hit_indices, factor_name
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Load hit-bin index files and register each as binding data.

    Each index file is named ``<technology>_<dataset_id>.<ext>`` and
    contains one window index per line. Metadata is looked up in the
    motif table for 'Motifs' files and in the cistrome table otherwise.
    """
    cistrome_metadata = pd.read_csv(cistrome_metadata, sep='\t').set_index('DCid')
    # DCids are compared as strings when looking up metadata below.
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    motif_metadata = (
        pd.read_csv(motif_metadata, sep='\t', header=None,
                    names=['dataset_id', 'factor', 'source'])
          .set_index('dataset_id')
          .drop_duplicates()
    )

    data = DataInterface(
        species,
        window_size=window_size,
        download_if_not_exists=False,
        make_new=False,
        load_genes=False,
    )

    for index_file in index_files:
        with open(index_file, 'r') as handle:
            hit_bins = np.array([int(entry.strip()) for entry in handle])

        # Filename encodes technology and dataset id as
        # "<technology>_<dataset_id>.<ext>"; strip the extension.
        technology, dataset_id = os.path.basename(index_file).split('_')
        dataset_id = '.'.join(dataset_id.split('.')[:-1])

        metadata_headers = data.get_metadata_headers(technology)
        if technology == 'Motifs':
            meta_dict = motif_metadata.loc[dataset_id, metadata_headers].to_dict()
            meta_dict['source'] = 'jaspar'
        else:
            meta_dict = cistrome_metadata.loc[dataset_id, metadata_headers].to_dict()

        data.add_binding_data(technology, dataset_id, hit_bins, **meta_dict)
def main(species, window_size, path):
    """Return the unique genome window indices covered by a BED file's regions.

    Regions that fall outside the binned genome are skipped.
    """
    genome = DataInterface.load_genome(species, window_size)
    covered_windows = set()
    for fields in parse_bedfile(path, header=False):
        try:
            covered_windows.update(genome.get_region_windows(Region(*fields)))
        except BadRegionError:
            # Region not representable in this genome — ignore it.
            pass
    return list(covered_windows)
def main(*, species, motif_bed, window_size, dataset_id, output, pval_cutoff=430):
    """Bin significant motif hits from a gzipped JASPAR BED file.

    Writes one ``dataset_id<TAB>window<TAB>score`` line per overlapped
    window to *output* for hits passing the p-value cutoff, and prints a
    ``dataset_id<TAB>FACTOR<TAB>JASPAR`` metadata line to stdout.

    Parameters
    ----------
    species : str
        Species identifier understood by ``DataInterface``.
    motif_bed : str
        Path to a gzipped BED file of motif hits.
    window_size : int
        Genome bin (window) size.
    dataset_id : str
        Identifier written with every output row.
    output : str
        Path for the binned-hits output file.
    pval_cutoff : int
        Minimum scaled -log10 p-value for a hit to be kept. The original
        hard-coded 430 with a stale comment claiming it was adjusted by
        file size; it is now a parameter with the same default, so
        existing callers are unaffected.
    """
    genome = DataInterface.load_genome(species, window_size)
    factor_name = None

    with open(output, 'w') as o, gzip.open(motif_bed, 'rb') as bed:
        for i, line in enumerate(bed):
            # BED columns: chrom, start, end, factor, relative score,
            # -log10 p-value (scaled), strand.
            chrom, start, end, factor, relscore, log_pval, strand = \
                line.decode('utf-8').strip().split('\t')

            if i == 0:
                # All lines describe the same factor; announce once.
                factor_name = factor
                print('Binning {} motifs with pval cutoff of {} ...'.format(
                    factor_name.upper(), str(pval_cutoff)), file=stderr)

            neg_log10_pval = int(log_pval)
            if neg_log10_pval >= pval_cutoff:
                try:
                    hit_windows = genome.get_region_windows(Region(chrom, start, end))
                    for hit_window in hit_windows:
                        print(dataset_id, hit_window, neg_log10_pval,
                              sep='\t', file=o)
                except BadRegionError:
                    # Regions outside the genome are skipped.
                    pass

    # Metadata line on stdout for downstream collection.
    # NOTE(review): an empty input file leaves factor_name as None and
    # this line would raise AttributeError — presumably inputs are never
    # empty; confirm upstream.
    print(dataset_id, factor_name.upper(), 'JASPAR', sep='\t')
def _write_genome_bins(cls, species):
    """Write the genome's window (bin) BED definitions to the class bin path.

    Fixes a NameError: the original passed an undefined local
    ``window_size`` to ``_get_genome_bin_path``. The sibling
    ``convert_bigwig`` calls ``cls._get_genome_bin_path(species)`` with a
    single argument, so the same call form is used here; the window size
    comes from the class attribute when building the BED string.
    """
    bedstr = DataInterface.get_window_bedfile_str(species, cls.window_size)
    with open(cls._get_genome_bin_path(species), 'w') as bed:
        bed.write(bedstr)
def main(args):
    """Ingest coverage arrays as profile data, building RP maps if absent.

    Ensures 'basic_10K' and 'enhanced_10K' regulatory-potential maps exist
    (building and registering them on first run), then loads each coverage
    array, resolves its technology/dataset id from the filename
    (``<technology>_<dataset_id>.<ext>``), looks up cistrome metadata, and
    stores the profile.
    """
    cistrome_metadata = pd.read_csv(args.cistrome_metadata, sep='\t').set_index('DCid')
    # DCids are compared as strings during metadata lookup.
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    data = DataInterface(
        args.species,
        window_size=args.window_size,
        download_if_not_exists=False,
        make_new=False,
        load_genes=True,
    )

    if len(data.get_rp_maps()) == 0:
        # First run: build both RP maps at 10 kb decay and register them.
        basic_rp_map = data.build_binned_rp_map('basic', 10000)
        enhanced_rp_map = data.build_binned_rp_map('enhanced', 10000)
        data.add_rp_map('basic_10K', basic_rp_map)
        data.add_rp_map('enhanced_10K', enhanced_rp_map)
    else:
        # Maps already registered: load them.
        basic_rp_map = data.get_rp_map('basic_10K')
        enhanced_rp_map = data.get_rp_map('enhanced_10K')

    for arr_name in args.coverage_arrays:
        coverage_array = np.load(arr_name)

        # Filename encodes "<technology>_<dataset_id>.<ext>".
        technology, dataset_id = os.path.basename(arr_name).split('_')
        dataset_id = '.'.join(dataset_id.split('.')[:-1])

        metadata_headers = data.get_metadata_headers(technology)
        meta_dict = cistrome_metadata.loc[dataset_id, metadata_headers].to_dict()

        data.add_profile_data(
            technology, dataset_id, coverage_array,
            [basic_rp_map, enhanced_rp_map],
            ['basic_10K', 'enhanced_10K'],
            **meta_dict,
        )