def initialize_sequences(generator, init_sequences, p_init):
    """Bias the generator's 'policy_pwm' layer logits toward seed sequences.

    For every position of every seed sequence, the logit of the seed
    nucleotide is set so that its (independent) sigmoid/softmax probability
    is ~p_init, while the three remaining nucleotides share the residual
    (1 - p_init) mass equally.

    Parameters
    ----------
    generator : keras.Model
        Model containing a layer named 'policy_pwm' whose single weight
        matrix holds the flattened PWM logits.
    init_sequences : list of str
        Equal-length seed sequences over the alphabet 'ACGT'.
    p_init : float
        Target probability of the seed nucleotide at each position
        (0 < p_init < 1).

    Raises
    ------
    ValueError
        If a seed sequence contains a character outside 'ACGT'.
    """
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # BUGFIX: a Keras Layer has no .reshape(); read the layer's weight
    # matrix first, then view it as (n_seqs, seq_len, 4, 1). This mirrors
    # the set_weights([...reshape(1, -1)]) call at the end.
    onehot_logits = generator.get_layer('policy_pwm').get_weights()[0].reshape(
        (len(init_sequences), len(init_sequences[0]), 4, 1))

    # Logits chosen so the seed base gets ~p_init probability and the
    # off-mass (1 - p_init) is split evenly over the other three bases.
    on_logit = np.log(p_init / (1. - p_init))
    p_off = (1. - p_init) / 3.
    off_logit = np.log(p_off / (1. - p_off))

    for i, init_sequence in enumerate(init_sequences):
        for j, nt in enumerate(init_sequence):
            if nt not in nt_index:
                # Previously this silently reused a stale index (or raised
                # UnboundLocalError on the first position); fail loudly.
                raise ValueError("Unexpected nucleotide '" + nt + "' in init sequence.")
            onehot_logits[i, j, :, :] = off_logit
            onehot_logits[i, j, nt_index[nt], :] = on_logit

    generator.get_layer('policy_pwm').set_weights(
        [onehot_logits.reshape(1, -1)])
def initialize_sequence_templates(generator, sequence_templates):
    """Load fixed templates and trainable-position masks into the generator.

    Fixed positions (any symbol other than 'N' or 'X') get strongly peaked
    logits (+10.0 on the templated residue, -4.0 elsewhere), 'X' positions
    are uniformly suppressed (-1.0), and 'N' positions keep their raw
    one-hot encoding. The mask is 1.0 only at 'N' positions, marking them
    as free to optimize. Both the 'template_dense' and 'mask_dense' layers
    are overwritten and frozen.
    """
    template_rows = []
    mask_rows = []

    for template in sequence_templates:
        length = len(template)
        # NOTE(review): 20 channels here (vs. 4 elsewhere in this file) —
        # presumably a 20-letter alphabet; confirm against iso.OneHotEncoder.
        encoded = iso.OneHotEncoder(seq_length=length)(template).reshape((length, 20, 1))
        mask = np.zeros((length, 20, 1))

        for pos, symbol in enumerate(template):
            if symbol == 'N':
                # Free position: template untouched, mask enabled.
                mask[pos, :, :] = 1.0
            elif symbol == 'X':
                # Blocked position: uniformly down-weighted, not optimized.
                encoded[pos, :, :] = -1.0
            else:
                # Fixed residue: peak the template at the encoded residue.
                peak = np.argmax(encoded[pos, :, 0])
                encoded[pos, :, :] = -4.0
                encoded[pos, peak, :] = 10.0

        template_rows.append(encoded.reshape(1, -1))
        mask_rows.append(mask.reshape(1, -1))

    generator.get_layer('template_dense').set_weights(
        [np.concatenate(template_rows, axis=0)])
    generator.get_layer('template_dense').trainable = False
    generator.get_layer('mask_dense').set_weights(
        [np.concatenate(mask_rows, axis=0)])
    generator.get_layer('mask_dense').trainable = False
def get_apadb_encoder():
    """Return a closure that packs APADB proximal/distal site pairs into
    the list of arrays expected by the model.

    The returned encoder one-hot encodes both sequence sets to shape
    (batch, 205, 4, 1), reshapes the cut coordinates into column vectors,
    log-transforms the site distances, and appends a zero library vector
    (batch, 13) plus a distal-PAS indicator of ones (batch, 1).
    """
    onehot_encoder = iso.OneHotEncoder(205)

    def encode_for_apadb(prox_sequences, dist_sequences, prox_cut_starts,
                         prox_cut_ends, dist_cut_starts, dist_cut_ends,
                         site_distances):
        def stack(sequences):
            # One-hot each sequence to (1, L, 4, 1) and stack along batch.
            return np.concatenate([
                onehot_encoder(sequence).reshape((1, len(sequence), 4, 1))
                for sequence in sequences
            ], axis=0)

        def as_column(values):
            return np.array(values).reshape(-1, 1)

        n = len(prox_sequences)
        return [
            stack(prox_sequences),
            stack(dist_sequences),
            as_column(prox_cut_starts),
            as_column(prox_cut_ends),
            as_column(dist_cut_starts),
            as_column(dist_cut_ends),
            np.log(as_column(site_distances)),
            np.zeros((n, 13)),  # library one-hot placeholder
            np.ones((n, 1)),    # distal PAS indicator
        ]

    return encode_for_apadb
def get_aparent_legacy_encoder():
    """Return a closure that encodes sequences for the legacy APARENT model.

    The encoder yields [one-hots of shape (batch, 1, 185, 4), a zero
    library vector (batch, 36), and a distal-PAS indicator of ones
    (batch, 1)].
    """
    onehot_encoder = iso.OneHotEncoder(185)

    def encode_for_aparent(sequences):
        batch = [
            onehot_encoder(sequence).reshape((1, 1, len(sequence), 4))
            for sequence in sequences
        ]
        n = len(sequences)
        return [
            np.concatenate(batch, axis=0),
            np.zeros((n, 36)),  # library one-hot placeholder
            np.ones((n, 1)),    # distal PAS indicator
        ]

    return encode_for_aparent
def predict_mut_map(model, seq, isoform_start=80, isoform_end=105):
    """Exhaustive single-nucleotide mutagenesis map for one sequence.

    Every position of seq is substituted with each of A/C/G/T, the model
    is run on the mutant, and two maps are returned:

    Returns
    -------
    mut_map : np.ndarray of shape (len(seq), 4)
        Summed cut probability over [isoform_start, isoform_end) per mutant.
    cut_map : np.ndarray of shape (len(seq), 4, len(seq) + 1)
        Full flattened cut distribution per mutant.
    """
    encoder = iso.OneHotEncoder(len(seq))
    nucleotides = ['A', 'C', 'G', 'T']
    mut_map = np.zeros((len(seq), 4))
    cut_map = np.zeros((len(seq), 4, len(seq) + 1))

    for pos in range(len(seq)):
        prefix, suffix = seq[:pos], seq[pos + 1:]
        for nt_ix, nt in enumerate(nucleotides):
            mutant = prefix + nt + suffix
            _, cut_pred = model.predict(
                x=aparent_single_example_batch(encoder(mutant)))
            cuts = np.ravel(cut_pred)
            cut_map[pos, nt_ix, :] = cuts
            mut_map[pos, nt_ix] = np.sum(cuts[isoform_start:isoform_end])

    return mut_map, cut_map
def initialize_sequence_templates(generator, sequence_templates):
    """Write fixed-sequence templates and optimization masks into the generator.

    Positions holding a concrete base (neither 'N' nor 'X') get peaked
    logits (+10 on the templated base, -4 on the rest), 'X' positions get
    a uniform -1, and 'N' positions keep their one-hot encoding. The mask
    is 1.0 only at 'N' positions (the optimizable ones). The
    'template_dense' and 'mask_dense' layers are overwritten and frozen.
    """
    n_templates = len(sequence_templates)
    template_len = len(sequence_templates[0])
    encoder = iso.OneHotEncoder(seq_length=template_len)

    onehot_templates = np.concatenate([
        encoder(template).reshape((1, len(template), 4, 1))
        for template in sequence_templates
    ], axis=0)
    onehot_masks = np.zeros((n_templates, template_len, 4, 1))

    for i, template in enumerate(sequence_templates):
        for j, symbol in enumerate(template):
            if symbol == 'N':
                # Optimizable position: mask on, template left as encoded.
                onehot_masks[i, j, :, :] = 1.0
            elif symbol == 'X':
                # Suppressed position.
                onehot_templates[i, j, :, :] = -1
            else:
                # Fixed base: peak the template logits at the encoded base.
                nt_ix = np.argmax(onehot_templates[i, j, :, 0])
                onehot_templates[i, j, :, :] = -4
                onehot_templates[i, j, nt_ix, :] = 10

    generator.get_layer('template_dense').set_weights(
        [onehot_templates.reshape(1, -1)])
    generator.get_layer('template_dense').trainable = False
    generator.get_layer('mask_dense').set_weights(
        [onehot_masks.reshape(1, -1)])
    generator.get_layer('mask_dense').trainable = False
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0, file_path=''):
    """Build iso.DataGenerator splits over the designed 'master_seq' MPRA array data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned to the validation / test splits;
        the leading remainder (in stored order, no shuffling) is training.
        NOTE(review): if test_set_size is 0.0 the slice `[:-0]` below
        yields an empty array — the splits assume test_set_size > 0.
    file_path : str
        Directory prefix of the isoio data store.

    Returns
    -------
    dict
        Maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator yielding
        one-hot sequences, a 36-way library one-hot, and a distal-PAS flag
        as inputs, with normalized proximal usage as the output.
    """
    # Load array data
    array_dict = isoio.load(file_path + 'apa_array_data_master_seq')
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    # FIX: np.int was removed from NumPy (deprecated 1.20, removed 1.24);
    # the builtin int is the drop-in replacement. Also dropped a duplicate
    # computation of this same index array.
    array_index = np.arange(len(array_df), dtype=int)
    print('Designed MPRA size = ' + str(array_index.shape[0]))

    # Generate training and test set indexes (contiguous slices).
    array_train_index = array_index[:-int(len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] + array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    # Manually set sublibrary intercept terms:
    # gene name -> (library intercept index, distal PAS indicator).
    array_df['library_index'] = np.zeros(len(array_df), dtype=int)
    array_df['distal_pas'] = np.ones(len(array_df))
    library_settings = {
        'doubledope': (20, 1),
        'simple': (22, 0),
        'tomm5': (8, 1),
        'aar': (30, 0),
        'atr': (31, 0),
        'hsp': (32, 0),
        'snh': (33, 0),
        'sox': (34, 0),
        'wha': (35, 0),
    }
    for gene, (lib_ix, distal_pas) in library_settings.items():
        array_df.loc[array_df['gene'] == gene, 'library_index'] = lib_ix
        array_df.loc[array_df['gene'] == gene, 'distal_pas'] = distal_pas

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext', start_pos=200 + 1, end_pos=200 + 1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    'encoder': iso.CategoricalEncoder(
                        n_categories=36,
                        categories=np.arange(36, dtype=int).tolist()),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['distal_pas'],
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=200 + 1,
                                                    end_pos=200 + 1 + 185,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [('all', array_index), ('train', array_train_index),
                            ('valid', array_valid_index), ('test', array_test_index)]
    }

    return array_gens
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0, file_path=''):
    """Build iso.DataGenerator splits over pair-wise Leslie/APADB native APA data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned to validation / test; the leading
        remainder (in stored order) is training.
    file_path : str
        Directory prefix of the isoio data store.

    Returns
    -------
    dict
        Maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator yielding
        proximal/distal one-hot sequences, relative cut coordinates, a
        log site distance, a zero library vector, and a distal-PAS flag as
        inputs, with a dummy zero output (custom-loss setup).
    """
    # Load leslie/apadb pair-wise data
    native_dict = isoio.load(file_path + 'apa_leslie_apadb_pair_data')
    native_df = native_dict['df_pair']

    # FIX: np.int was removed from NumPy (deprecated 1.20, removed 1.24);
    # the builtin int is the drop-in replacement.
    native_index = np.arange(len(native_df), dtype=int)
    print('Pair-wise Native APA (APADB + Leslie) size = ' + str(native_index.shape[0]))

    native_train_index = native_index[:-int(len(native_df) * (valid_set_size + test_set_size))]
    native_valid_index = native_index[native_train_index.shape[0]:-int(len(native_df) * test_set_size)]
    native_test_index = native_index[native_train_index.shape[0] + native_valid_index.shape[0]:]

    print('Training set size = ' + str(native_train_index.shape[0]))
    print('Validation set size = ' + str(native_valid_index.shape[0]))
    print('Test set size = ' + str(native_test_index.shape[0]))

    # Calculate relative APADB cut start and end positions within each
    # sequence. The four original helpers differed only in which columns
    # they read, so they are generated from one factory.
    # + strand: offset +70 downstream of the PAS; - strand: mirrored, +76.
    def make_rel_pos(cut_start_col, cut_end_col, pas_col):
        def start_pos(row):
            if row['strand'] == '+':
                return row[cut_start_col] - row[pas_col] + 70
            return row[pas_col] - row[cut_end_col] + 76

        def end_pos(row):
            if row['strand'] == '+':
                return row[cut_end_col] - row[pas_col] + 70
            return row[pas_col] - row[cut_start_col] + 76

        return start_pos, end_pos

    get_start_pos_prox, get_end_pos_prox = make_rel_pos(
        'cut_start_prox', 'cut_end_prox', 'pas_pos_prox')
    get_start_pos_dist, get_end_pos_dist = make_rel_pos(
        'cut_start_dist', 'cut_end_dist', 'pas_pos_dist')

    native_df['rel_start_prox'] = native_df.apply(get_start_pos_prox, axis=1)
    native_df['rel_end_prox'] = native_df.apply(get_end_pos_prox, axis=1)
    native_df['rel_start_dist'] = native_df.apply(get_start_pos_dist, axis=1)
    native_df['rel_end_dist'] = native_df.apply(get_end_pos_dist, axis=1)

    native_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': native_df},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq_prox',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': iso.SequenceExtractor('wide_seq_ext_prox', start_pos=105, end_pos=105 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    'id': 'seq_dist',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': iso.SequenceExtractor('wide_seq_ext_dist', start_pos=105, end_pos=105 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    'id': 'start_prox',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': lambda row, index: row['rel_start_prox'],
                    'transformer': None,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'end_prox',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': lambda row, index: row['rel_end_prox'],
                    'transformer': None,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'start_dist',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': lambda row, index: row['rel_start_dist'],
                    'transformer': None,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'end_dist',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': lambda row, index: row['rel_end_dist'],
                    'transformer': None,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'site_distance',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': lambda row, index: np.log(
                        np.abs(row['cut_start_dist'] - row['cut_start_prox'])),
                    'transformer': None,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: np.zeros(13),
                    'encoder': None,
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    # Dummy target: the real objective consumes the inputs.
                    'id': 'dummy_output',
                    'source_type': 'zeros',
                    'dim': (1,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [('all', native_index), ('train', native_train_index),
                            ('valid', native_valid_index), ('test', native_test_index)]
    }

    return native_gens
def load_data(batch_size=32, valid_set_size=0.025, test_set_size=0.025,
              file_path='', kept_libraries=None, canonical_pas=False,
              no_dse_canonical_pas=False):
    """Build prediction DataGenerator splits over the legacy plasmid MPRA data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float or int
        If both are <= 1.0 they are interpreted as fractions of the
        (filtered) data; otherwise as absolute row counts. The leading
        remainder, in stored order, is the training split.
    file_path : str
        Directory prefix of the isoio data store.
    kept_libraries : list or None
        If given, keep only rows whose 'library_index' is in this list.
    canonical_pas : bool
        If True, keep only rows with AATAAA at seq[50:56].
    no_dse_canonical_pas : bool
        If True, drop rows containing AATAAA downstream of position 56.

    Returns
    -------
    dict
        Maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator yielding
        one-hot sequences, library one-hots and a distal-PAS flag as
        inputs, with proximal usage and cut counts as outputs.
    """
    # Load plasmid data
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data_legacy')
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    # Optional row filters; plasmid_cuts rows are kept in sync with the df.
    if kept_libraries is not None:
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if canonical_pas:
        keep_index = np.nonzero(plasmid_df.seq.str.slice(50, 56) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if no_dse_canonical_pas:
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(56).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Generate training and test set indexes (contiguous slices).
    # FIX: np.int was removed from NumPy (deprecated 1.20, removed 1.24);
    # the builtin int is the drop-in replacement.
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    plasmid_train_index, plasmid_valid_index, plasmid_test_index = None, None, None
    if valid_set_size <= 1.0 and test_set_size <= 1.0:
        # Fractional split sizes.
        plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]
    else:
        # Absolute split sizes (row counts).
        plasmid_train_index = plasmid_index[:-(valid_set_size + test_set_size)]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-test_set_size]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    plasmid_prediction_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq', start_pos=1, end_pos=1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    'encoder': iso.CategoricalEncoder(
                        n_categories=36,
                        categories=np.arange(36, dtype=int).tolist()),
                    'sparsify': False
                },
                {
                    # Libraries 2/5/8/11/20 carry a distal PAS.
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1 if row['library_index'] in [2, 5, 8, 11, 20] else 0,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['proximal_count'] / row['total_count'],
                    'transformer': lambda t: t,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=0, end_pos=186, sparse_source=False),
                    'transformer': lambda t: t,
                    'dim': (186,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False,
            densify_batch_matrices=True
        )
        for gen_id, idx in [('all', plasmid_index), ('train', plasmid_train_index),
                            ('valid', plasmid_valid_index), ('test', plasmid_test_index)]
    }

    return plasmid_prediction_gens
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0,
              file_path='', data_version=''):
    """Build iso.DataGenerator splits over the versioned designed MPRA array data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned to validation / test; the leading
        remainder (stored order) is training.
    file_path : str
        Directory prefix of the isoio data store.
    data_version : str
        Suffix of the data store name ('apa_array_data' + data_version).

    Returns
    -------
    dict
        Maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator; every
        row is tagged with the fixed 'array' library category.
    """
    # Load array data
    array_dict = isoio.load(file_path + 'apa_array_data' + data_version)
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    # FIX: np.int was removed from NumPy (deprecated 1.20, removed 1.24);
    # the builtin int is the drop-in replacement. Also dropped a duplicate
    # computation of this same index array.
    array_index = np.arange(len(array_df), dtype=int)
    print('Designed MPRA size = ' + str(array_index.shape[0]))

    # Generate training and test set indexes (contiguous slices).
    array_train_index = array_index[:-int(len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] + array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    # Fixed library vocabulary (order defines the one-hot category index).
    # FIX: np.object was removed from NumPy; builtin object is equivalent.
    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20',
        'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha',
        'array', 'aar'
    ], dtype=object)

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    # Every array row uses the fixed 'array' library class.
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 'array',
                    'encoder': iso.CategoricalEncoder(
                        n_categories=len(unique_libraries),
                        categories=unique_libraries),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180,
                                                    end_pos=180 + 205,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [('all', array_index), ('train', array_train_index),
                            ('valid', array_valid_index), ('test', array_test_index)]
    }

    return array_gens
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0, file_path=''):
    """Build iso.DataGenerator splits over single-site Leslie/APADB native pA data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned to validation / test; the leading
        remainder (stored order) is training.
    file_path : str
        Directory prefix of the isoio data store.

    Returns
    -------
    dict
        Maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator yielding
        a one-hot sequence, a zero library vector, and a distal-PAS flag
        as inputs, with a dummy zero output (custom-loss setup).
    """
    # Load array data
    native_dict = isoio.load(file_path + 'apa_leslie_apadb_data')
    native_df = native_dict['df']

    # FIX: np.int was removed from NumPy (deprecated 1.20, removed 1.24);
    # the builtin int is the drop-in replacement.
    native_index = np.arange(len(native_df), dtype=int)
    print('Native pA (APADB + Leslie) size = ' + str(native_index.shape[0]))

    native_train_index = native_index[:-int(len(native_df) * (valid_set_size + test_set_size))]
    native_valid_index = native_index[native_train_index.shape[0]:-int(len(native_df) * test_set_size)]
    native_test_index = native_index[native_train_index.shape[0] + native_valid_index.shape[0]:]

    print('Training set size = ' + str(native_train_index.shape[0]))
    print('Validation set size = ' + str(native_valid_index.shape[0]))
    print('Test set size = ' + str(native_test_index.shape[0]))

    native_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': native_df},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq_prox',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': iso.SequenceExtractor('wide_seq_ext', start_pos=105, end_pos=105 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: np.zeros(13),
                    'encoder': None,
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    # Dummy target: the real objective consumes the inputs.
                    'id': 'dummy_output',
                    'source_type': 'zeros',
                    'dim': (1,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [('all', native_index), ('train', native_train_index),
                            ('valid', native_valid_index), ('test', native_test_index)]
    }

    return native_gens
def load_data(batch_size=64, valid_set_size=0.025, test_set_size=0.025,
              file_path='', data_version='_v2', kept_libraries=None,
              canonical_pas=False, no_dse_canonical_pas=False,
              no_clinvar_wt=True):
    """Build training and prediction DataGenerator splits over the plasmid MPRA data.

    Parameters
    ----------
    batch_size : int
        Batch size for every generator.
    valid_set_size, test_set_size : float
        Fractions of the (filtered) data used for validation / test; the
        leading remainder (stored order, no pre-shuffle) is training.
    file_path : str
        Directory prefix of the isoio data store.
    data_version : str
        Suffix of the data store name ('apa_plasmid_data' + data_version).
    kept_libraries : list or None
        If given, keep only rows whose 'library_index' is in this list.
    canonical_pas : bool
        If True, keep only rows with AATAAA at seq[70:76].
    no_dse_canonical_pas : bool
        If True, drop rows containing AATAAA downstream of position 76.
    no_clinvar_wt : bool
        If True, drop the 'clinvar_wt' sublibrary.

    Returns
    -------
    (plasmid_training_gens, plasmid_prediction_gens) : tuple of dict
        Each maps 'all'/'train'/'valid'/'test' to an iso.DataGenerator.
        Training generators shuffle and expose counts/usage as *inputs*
        with a dummy output (custom-loss setup); prediction generators do
        not shuffle and expose usage/cuts as outputs.
    """
    # Load plasmid data
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data' + data_version)
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    # Fixed library vocabulary (order defines the one-hot category index).
    # FIX: np.object was removed from NumPy (deprecated 1.20, removed
    # 1.24); the builtin object is the drop-in replacement.
    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20',
        'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha',
        'array', 'aar'
    ], dtype=object)

    # Optional row filters; plasmid_cuts rows are kept in sync with the df.
    if kept_libraries is not None:
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if canonical_pas:
        keep_index = np.nonzero(plasmid_df.seq.str.slice(70, 76) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if no_dse_canonical_pas:
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(76).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if no_clinvar_wt:
        print("size before filtering out clinvar_wt = " + str(len(plasmid_df)))
        keep_index = np.nonzero(plasmid_df.sublibrary != 'clinvar_wt')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]
        print("size after filtering out clinvar_wt = " + str(len(plasmid_df)))

    # Generate training and test set indexes (contiguous slices).
    # FIX: np.int was removed from NumPy; builtin int is equivalent here.
    # (Also removed the unused prox_range / norm_range locals.)
    plasmid_index = np.arange(len(plasmid_df), dtype=int)
    plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
    plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
    plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    gen_splits = [('all', plasmid_index), ('train', plasmid_train_index),
                  ('valid', plasmid_valid_index), ('test', plasmid_test_index)]

    plasmid_training_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (1, 205, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library'],
                    'encoder': iso.CategoricalEncoder(
                        n_categories=len(unique_libraries),
                        categories=unique_libraries),
                    'sparsify': False
                },
                {
                    # Total read count over the window (for count-weighted loss).
                    'id': 'total_count',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: np.sum(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: iso_normalizer(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: cut_normalizer(t),
                    'dim': (206,),
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    # Dummy target: the real loss consumes the inputs above.
                    'id': 'dummy_output',
                    'source_type': 'zeros',
                    'dim': (1,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=True,
            densify_batch_matrices=True
        )
        for gen_id, idx in gen_splits
    }

    plasmid_prediction_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (1, 205, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library'],
                    'encoder': iso.CategoricalEncoder(
                        n_categories=len(unique_libraries),
                        categories=unique_libraries),
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: iso_normalizer(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: cut_normalizer(t),
                    'dim': (206,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False,
            densify_batch_matrices=True
        )
        for gen_id, idx in gen_splits
    }

    return plasmid_training_gens, plasmid_prediction_gens
def predict_ref(model, seq, isoform_start=80, isoform_end=105):
    """Predict the reference (unmutated) cut profile of one sequence.

    Returns
    -------
    (isoform_usage, cut_distribution)
        isoform_usage is the summed cut probability over
        [isoform_start, isoform_end); cut_distribution is the full
        flattened per-position cut prediction.
    """
    encoded = iso.OneHotEncoder(len(seq))(seq)
    _, cut_pred = model.predict(x=aparent_single_example_batch(encoded))
    cut_distribution = np.ravel(cut_pred)
    isoform_usage = np.sum(cut_distribution[isoform_start:isoform_end])
    return isoform_usage, cut_distribution