import numpy as np

import isolearn.io as isoio
import isolearn.keras as iso


# NOTE: each load_data variant below appears to live in its own loader module;
# they are collected here side by side for reference.

# Loader for the designed-array MPRA data (master-sequence version).
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0, file_path=''):
    # Load array data
    array_dict = isoio.load(file_path + 'apa_array_data_master_seq')
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    # Generate training, validation and test set indexes
    array_train_index = array_index[:-int(len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] + array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    # Manually set sublibrary intercept terms (library index and distal PAS flag per gene)
    array_df['library_index'] = np.zeros(len(array_df), dtype=int)
    array_df['distal_pas'] = np.ones(len(array_df))

    for gene, library_index, distal_pas in [
        ('doubledope', 20, 1), ('simple', 22, 0), ('tomm5', 8, 1),
        ('aar', 30, 0), ('atr', 31, 0), ('hsp', 32, 0),
        ('snh', 33, 0), ('sox', 34, 0), ('wha', 35, 0)
    ]:
        array_df.loc[array_df['gene'] == gene, 'library_index'] = library_index
        array_df.loc[array_df['gene'] == gene, 'distal_pas'] = distal_pas

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext', start_pos=200 + 1, end_pos=200 + 1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    'encoder': iso.CategoricalEncoder(n_categories=36, categories=np.arange(36, dtype=int).tolist()),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['distal_pas'],
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=200 + 1, end_pos=200 + 1 + 185, static_poses=[-1], sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [
            ('all', array_index),
            ('train', array_train_index),
            ('valid', array_valid_index),
            ('test', array_test_index)
        ]
    }

    return array_gens
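
# The 'transformer' callbacks above and below reference iso_normalizer, and the
# plasmid loaders further down also reference cut_normalizer; neither helper is
# defined in this file. A minimal sketch of plausible implementations follows,
# assuming a 206-element cut vector (205 positions plus one static distal bin)
# and a proximal isoform window of [80, 105); both window choices are
# illustrative assumptions, not confirmed by this file.

def iso_normalizer(t):
    # Fraction of all reads whose cut position falls inside the proximal window
    iso = 0.0
    if np.sum(t) > 0.0:
        iso = np.sum(t[80: 80 + 25]) / np.sum(t)
    return iso


def cut_normalizer(t):
    # Normalize the cut-count vector into a probability distribution; with no
    # reads, place all mass on the final (distal) bin
    cuts = np.concatenate([np.zeros(205), np.ones(1)])
    if np.sum(t) > 0.0:
        cuts = t / np.sum(t)
    return cuts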

# Loader for the designed-array MPRA data (versioned release); uses the same
# library vocabulary as the plasmid loaders below, mapping every array member
# to the shared 'array' category.
def load_data(batch_size=32, valid_set_size=0.0, test_set_size=1.0, file_path='', data_version=''):
    # Load array data
    array_dict = isoio.load(file_path + 'apa_array_data' + data_version)
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    # Generate training, validation and test set indexes
    array_train_index = array_index[:-int(len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] + array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20',
        'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha',
        'array', 'aar'
    ], dtype=object)

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    # Every array member is assigned the shared 'array' library category
                    'extractor': lambda row, index: 'array',
                    'encoder': iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False
        )
        for gen_id, idx in [
            ('all', array_index),
            ('train', array_train_index),
            ('valid', array_valid_index),
            ('test', array_test_index)
        ]
    }

    return array_gens
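
# Hypothetical sanity check for the loader above, in the module where it lives
# (batch_size, file_path and the printed shapes are assumptions): isolearn
# DataGenerators behave like keras.utils.Sequence objects, so a single batch
# can be pulled by integer index to verify tensor shapes before training.
def _demo_array_batch():
    # Defaults put the full dataset into the 'test' generator
    array_gens = load_data(batch_size=32, file_path='data/')
    x_batch, y_batch = array_gens['test'][0]  # first test batch as (inputs, outputs)
    print([x.shape for x in x_batch])  # one-hot sequence, one-hot library, distal PAS flag
    print([y.shape for y in y_batch])  # proximal usage targets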

# Loader for the random plasmid MPRA data (legacy release), with optional
# filtering by sublibrary and by (non-)canonical PAS content.
def load_data(batch_size=32, valid_set_size=0.025, test_set_size=0.025, file_path='', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False):
    # Load plasmid data
    #plasmid_dict = pickle.load(open('apa_plasmid_data' + data_version + '.pickle', 'rb'))
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data_legacy')
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    # Optionally keep only a subset of sublibraries
    if kept_libraries is not None:
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Optionally keep only sequences with a canonical proximal PAS (AATAAA at positions 50-55)
    if canonical_pas:
        keep_index = np.nonzero(plasmid_df.seq.str.slice(50, 56) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Optionally drop sequences with a canonical PAS in the downstream sequence element
    if no_dse_canonical_pas:
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(56).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Generate training and test set indexes; sizes <= 1.0 are interpreted as
    # fractions of the dataset, larger values as absolute row counts
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    if valid_set_size <= 1.0 and test_set_size <= 1.0:
        plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]
    else:
        plasmid_train_index = plasmid_index[:-(valid_set_size + test_set_size)]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-test_set_size]
        plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    plasmid_prediction_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq', start_pos=1, end_pos=1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    'encoder': iso.CategoricalEncoder(n_categories=36, categories=np.arange(36, dtype=int).tolist()),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    # Sublibraries with a fixed distal PAS
                    'extractor': lambda row, index: 1 if row['library_index'] in [2, 5, 8, 11, 20] else 0,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['proximal_count'] / row['total_count'],
                    'transformer': lambda t: t,
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=0, end_pos=186, sparse_source=False),
                    'transformer': lambda t: t,
                    'dim': (186,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False,
            densify_batch_matrices=True
        )
        for gen_id, idx in [
            ('all', plasmid_index),
            ('train', plasmid_train_index),
            ('valid', plasmid_valid_index),
            ('test', plasmid_test_index)
        ]
    }

    return plasmid_prediction_gens
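
# Hypothetical invocations of the loader above illustrating its two split
# modes (the file path and sizes are placeholders): values <= 1.0 are treated
# as fractions of the dataset, larger values as absolute row counts held out
# from the end of the dataframe.
def _demo_plasmid_splits():
    # 2.5% of rows each for validation and test
    gens_frac = load_data(batch_size=32, valid_set_size=0.025, test_set_size=0.025, file_path='data/')
    # last 5000 + 5000 rows held out for validation and test
    gens_abs = load_data(batch_size=32, valid_set_size=5000, test_set_size=5000, file_path='data/')
    return gens_frac, gens_abs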

# Loader for the random plasmid MPRA data (versioned release); returns one set
# of generators for training and one for prediction.
def load_data(batch_size=64, valid_set_size=0.025, test_set_size=0.025, file_path='', data_version='_v2', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False, no_clinvar_wt=True):
    # Load plasmid data
    #plasmid_dict = pickle.load(open('apa_plasmid_data' + data_version + '.pickle', 'rb'))
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data' + data_version)
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20',
        'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha',
        'array', 'aar'
    ], dtype=object)  # plasmid_df['library'].unique()

    # Optionally keep only a subset of sublibraries
    if kept_libraries is not None:
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Optionally keep only sequences with a canonical proximal PAS (AATAAA at positions 70-75)
    if canonical_pas:
        keep_index = np.nonzero(plasmid_df.seq.str.slice(70, 76) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Optionally drop sequences with a canonical PAS in the downstream sequence element
    if no_dse_canonical_pas:
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(76).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    # Optionally drop the clinvar wildtype sublibrary
    if no_clinvar_wt:
        print('size before filtering out clinvar_wt = ' + str(len(plasmid_df)))
        keep_index = np.nonzero(plasmid_df.sublibrary != 'clinvar_wt')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]
        print('size after filtering out clinvar_wt = ' + str(len(plasmid_df)))

    # Generate training and test set indexes
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
    plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
    plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    prox_range = (np.arange(30, dtype=int) + 80).tolist()
    norm_range = np.arange(206).tolist()

    # Training generators: the count-derived targets (total_count, prox_usage,
    # prox_cuts) are fed to the model as inputs and the declared output is a
    # dummy zero vector, so the loss is computed inside the model graph.
    plasmid_training_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (1, 205, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library'],
                    'encoder': iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify': False
                },
                {
                    'id': 'total_count',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: np.sum(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: iso_normalizer(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: cut_normalizer(t),
                    'dim': (206,),
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'dummy_output',
                    'source_type': 'zeros',
                    'dim': (1,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=True,
            densify_batch_matrices=True
        )
        for gen_id, idx in [
            ('all', plasmid_index),
            ('train', plasmid_train_index),
            ('valid', plasmid_valid_index),
            ('test', plasmid_test_index)
        ]
    }

    # Prediction generators: inputs are just sequence and library; the observed
    # proximal usage and normalized cut distribution are exposed as outputs.
    plasmid_prediction_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': plasmid_df, 'cuts': plasmid_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (1, 205, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library'],
                    'encoder': iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: iso_normalizer(t),
                    'dim': (1,),
                    'sparsify': False
                },
                {
                    'id': 'prox_cuts',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer': lambda t: cut_normalizer(t),
                    'dim': (206,),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False,
            densify_batch_matrices=True
        )
        for gen_id, idx in [
            ('all', plasmid_index),
            ('train', plasmid_train_index),
            ('valid', plasmid_valid_index),
            ('test', plasmid_test_index)
        ]
    }

    return plasmid_training_gens, plasmid_prediction_gens
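
# Hypothetical end-to-end sketch for the loader above (the model object, epoch
# count and file path are assumptions, not defined in this file): the training
# generators suit a Keras model that computes its loss internally against the
# dummy zero output, while the prediction generators expose observed usage and
# cut distributions as ordinary outputs. isolearn generators are Sequence-like,
# so they can be passed to fit_generator / predict_generator directly.
def _demo_plasmid_training(model):
    training_gens, prediction_gens = load_data(batch_size=64, file_path='data/', data_version='_v2')
    model.fit_generator(
        training_gens['train'],
        validation_data=training_gens['valid'],
        epochs=10
    )
    return model.predict_generator(prediction_gens['test'])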