def main(species, window_size, motif_metadata, bin_sorted_hits, group_loci=100000):
    """Stream bin-sorted motif hits into a chunked sparse binding dataset.

    Fix: removed a leftover debugging pair (``print(data.path)`` followed by a
    bare ``raise Exception()``) that made everything after the DataInterface
    construction unreachable.

    Parameters
    ----------
    species : str
        Species key forwarded to DataInterface.
    window_size : int
        Genomic bin (window) size for the DataInterface.
    motif_metadata : str
        Path to a headerless TSV with columns (dataset_id, factor, source).
    bin_sorted_hits : str
        Path to a whitespace-delimited file of (motif_id, bin_num, score)
        rows, sorted ascending by bin_num.
    group_loci : int, optional
        Minimum number of accumulated rows before a CSR segment is flushed.

    Raises
    ------
    Exception
        If the hits file is not sorted by bin number.
    """
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species, window_size=window_size,
                         download_if_not_exists=False, make_new=False,
                         load_genes=False)

    data.create_binding_dataset(TECHNOLOGY, motif_metadata.index.values,
                                **motif_metadata.to_dict('list'))

    # Map dataset id -> column index in the binding matrix.  Hoisted the
    # repeated list_binding_datasets() call into a single lookup.
    dataset_ids = data.list_binding_datasets(TECHNOLOGY)
    id_to_idx_map = dict(zip(dataset_ids, np.arange(len(dataset_ids))))

    current_pos = 0
    last_added_chunk = 0  # bin offset of the start of the current segment
    i = 0
    rows, cols, scores = [], [], []
    with open(bin_sorted_hits, 'r') as f:
        for line in f:
            motif_id, bin_num, score = line.strip().split()
            bin_num = int(bin_num)

            if bin_num < current_pos:
                raise Exception('Input file not sorted!')
            elif bin_num > current_pos and i >= group_loci:
                # Flush only when we have moved past the current bin, so a
                # single bin's hits never straddle two segments.
                print('Adding matrix segment ...')
                matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
                data.append_csr(TECHNOLOGY, matrix_form)

                last_added_chunk = bin_num
                i = 0
                rows, cols, scores = [], [], []

            tf_idx = id_to_idx_map[motif_id]
            # Row indices are relative to the start of the current segment.
            rows.append(bin_num - last_added_chunk)
            cols.append(tf_idx)
            scores.append(int(score))

            current_pos = bin_num
            i += 1

    # Flush the final (possibly short) segment.
    if len(rows) > 0:
        matrix_form = sparse.coo_matrix((scores, (rows, cols))).tocsr()
        data.append_csr(TECHNOLOGY, matrix_form)
def main(args):
    """Attach coverage profiles (with 10K basic/enhanced RP maps) to the dataset.

    Expects ``args`` to provide: cistrome_metadata (TSV path keyed by DCid),
    species, window_size, and coverage_arrays (paths named
    "<technology>_<dataset_id>.<ext>").
    """
    cistrome_metadata = pd.read_csv(args.cistrome_metadata, sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    data = DataInterface(args.species, window_size=args.window_size,
                         download_if_not_exists=False, make_new=False,
                         load_genes=True)

    # Build the 10K RP maps on first run; reuse the stored ones afterwards.
    if len(data.get_rp_maps()) == 0:
        basic_rp_map = data.build_binned_rp_map('basic', 10000)
        enhanced_rp_map = data.build_binned_rp_map('enhanced', 10000)
        data.add_rp_map('basic_10K', basic_rp_map)
        data.add_rp_map('enhanced_10K', enhanced_rp_map)
    else:
        basic_rp_map = data.get_rp_map('basic_10K')
        enhanced_rp_map = data.get_rp_map('enhanced_10K')

    for arr_name in args.coverage_arrays:
        coverage_array = np.load(arr_name)

        # Filename encodes technology and dataset id: "<tech>_<id>.<ext>".
        technology, dataset_id = os.path.basename(arr_name).split('_')
        dataset_id = '.'.join(dataset_id.split('.')[:-1])  # strip extension

        headers = data.get_metadata_headers(technology)
        meta_dict = cistrome_metadata.loc[dataset_id, headers].to_dict()

        data.add_profile_data(technology, dataset_id, coverage_array,
                              [basic_rp_map, enhanced_rp_map],
                              ['basic_10K', 'enhanced_10K'],
                              **meta_dict)
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Create a 'Motif' binding dataset whose ids come from the motif metadata TSV.

    NOTE(review): cistrome_metadata is parsed but never used afterwards, and
    index_files is accepted but unused here — presumably kept for a shared CLI
    signature; confirm against the callers.
    """
    cistrome_metadata = pd.read_csv(cistrome_metadata, sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    # Headerless TSV: (dataset_id, factor, source); dedupe on all columns.
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id').drop_duplicates()

    data = DataInterface(species, window_size=window_size,
                         download_if_not_exists=False, make_new=False,
                         load_genes=False)

    dataset_ids = motif_metadata.index.values
    data.create_binding_dataset('Motif', dataset_ids)
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):
    """Register binding hit-bin indices from per-dataset index files.

    Each index file is named "<technology>_<dataset_id>.<ext>" and contains
    one integer bin index per line. Metadata is drawn from the motif table
    for the 'Motifs' technology and from the cistrome table otherwise.

    NOTE(review): the comparison uses 'Motifs' while the dataset-creation
    script uses 'Motif' — looks inconsistent; confirm which spelling the
    technology labels actually use.
    """
    cistrome_metadata = pd.read_csv(cistrome_metadata, sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    # Headerless TSV: (dataset_id, factor, source); dedupe on all columns.
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id').drop_duplicates()

    data = DataInterface(species, window_size=window_size,
                         download_if_not_exists=False, make_new=False,
                         load_genes=False)

    for index_file in index_files:
        # One bin index per line.
        with open(index_file, 'r') as handle:
            hit_bins = np.array([int(entry.strip()) for entry in handle.readlines()])

        # Filename encodes technology and dataset id: "<tech>_<id>.<ext>".
        technology, dataset_id = os.path.basename(index_file).split('_')
        dataset_id = '.'.join(dataset_id.split('.')[:-1])  # strip extension

        metadata_headers = data.get_metadata_headers(technology)
        if technology == 'Motifs':
            meta_dict = motif_metadata.loc[dataset_id, metadata_headers].to_dict()
            meta_dict['source'] = 'jaspar'
        else:
            meta_dict = cistrome_metadata.loc[dataset_id, metadata_headers].to_dict()

        data.add_binding_data(technology, dataset_id, hit_bins, **meta_dict)