Example #1
0
def initialize_sequences(generator, init_sequences, p_init):
    """Seed the generator's 'policy_pwm' weights to favor init_sequences.

    For each position, the logit of the templated nucleotide is set so its
    independent sigmoid probability is p_init, and each of the other three
    nucleotides gets an equal share of the remaining probability mass.

    Parameters
    ----------
    generator : model exposing get_layer('policy_pwm') with set_weights()
        (weights of total size len(init_sequences) * seq_length * 4 —
        TODO confirm against the generator's architecture).
    init_sequences : list of equal-length DNA strings over 'ACGT'.
    p_init : float in (0, 1); target probability of the templated nucleotide.

    Raises
    ------
    KeyError
        If a sequence contains a character outside 'ACGT'.
    """

    # Logit giving the templated nucleotide probability p_init.
    on_logit = np.log(p_init / (1. - p_init))

    # The remaining mass is split evenly over the other 3 nucleotides.
    p_off = (1. - p_init) / 3.
    off_logit = np.log(p_off / (1. - p_off))

    # Every entry is assigned in the loops below, so start from zeros.
    # (The original called .reshape() directly on the layer object, which
    # raises AttributeError — a Keras layer is not an array. It also built
    # an unused one-hot encoding of the sequences, which is dropped here.)
    onehot_logits = np.zeros(
        (len(init_sequences), len(init_sequences[0]), 4, 1))

    nt_to_ix = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for i, init_sequence in enumerate(init_sequences):
        for j, nt in enumerate(init_sequence):
            # Dict lookup raises KeyError on non-ACGT input instead of
            # silently reusing the previous position's channel index
            # (the original if/elif chain left nt_ix stale, or unbound
            # on the very first position).
            nt_ix = nt_to_ix[nt]

            onehot_logits[i, j, :, :] = off_logit
            onehot_logits[i, j, nt_ix, :] = on_logit

    generator.get_layer('policy_pwm').set_weights(
        [onehot_logits.reshape(1, -1)])
Example #2
0
def initialize_sequence_templates(generator, sequence_templates):
    """Bake fixed templates and editable-position masks into the generator.

    Positions with a concrete letter get strongly biased logits (+10 for the
    templated channel, -4 elsewhere), 'X' positions are uniformly suppressed
    (-1 on all 20 channels), and 'N' positions keep their encoded logits and
    are the only ones enabled in the mask. Both weight matrices are frozen.
    """
    template_rows = []
    mask_rows = []

    for template in sequence_templates:
        n = len(template)
        # 20-channel one-hot encoding of the template, shape (n, 20, 1).
        logits = iso.OneHotEncoder(seq_length=n)(template).reshape((n, 20, 1))

        for pos, ch in enumerate(template):
            if ch == 'X':
                logits[pos, :, :] = -1.0
            elif ch != 'N':
                fixed_ix = np.argmax(logits[pos, :, 0])
                logits[pos, :, :] = -4.0
                logits[pos, fixed_ix, :] = 10.0

        # Mask is 1 only where the template allows free choice ('N').
        mask = np.zeros((n, 20, 1))
        for pos, ch in enumerate(template):
            if ch == 'N':
                mask[pos, :, :] = 1.0

        template_rows.append(logits.reshape(1, -1))
        mask_rows.append(mask.reshape(1, -1))

    embedding_templates = np.concatenate(template_rows, axis=0)
    embedding_masks = np.concatenate(mask_rows, axis=0)

    generator.get_layer('template_dense').set_weights([embedding_templates])
    generator.get_layer('template_dense').trainable = False

    generator.get_layer('mask_dense').set_weights([embedding_masks])
    generator.get_layer('mask_dense').trainable = False
Example #3
0
def get_apadb_encoder():
    """Return a closure that assembles the input list for the APADB model."""
    onehot_encoder = iso.OneHotEncoder(205)

    def _stack_onehots(sequences):
        # One-hot tensor of shape (batch, seq_len, 4, 1).
        return np.concatenate(
            [np.reshape(onehot_encoder(seq), (1, len(seq), 4, 1))
             for seq in sequences],
            axis=0)

    def _as_column(values):
        # Column vector of shape (batch, 1).
        return np.array(values).reshape(-1, 1)

    def encode_for_apadb(prox_sequences, dist_sequences, prox_cut_starts,
                         prox_cut_ends, dist_cut_starts, dist_cut_ends,
                         site_distances):
        # Final two entries are a (batch, 13) zero matrix and a (batch, 1)
        # ones column — presumably library indicator and distal-PAS flag,
        # matching the loaders elsewhere in this file; verify against model.
        return [
            _stack_onehots(prox_sequences),
            _stack_onehots(dist_sequences),
            _as_column(prox_cut_starts),
            _as_column(prox_cut_ends),
            _as_column(dist_cut_starts),
            _as_column(dist_cut_ends),
            np.log(_as_column(site_distances)),
            np.zeros((len(prox_sequences), 13)),
            np.ones((len(prox_sequences), 1))
        ]

    return encode_for_apadb
Example #4
0
def get_aparent_legacy_encoder():
    """Return a closure that assembles inputs for the legacy APARENT model."""
    onehot_encoder = iso.OneHotEncoder(185)

    def encode_for_aparent(sequences):
        # One-hot tensor of shape (batch, 1, seq_len, 4).
        one_hots = np.concatenate(
            [np.reshape(onehot_encoder(seq), (1, 1, len(seq), 4))
             for seq in sequences],
            axis=0)

        n = len(sequences)
        # Auxiliary inputs: (n, 36) zeros and (n, 1) ones — presumably the
        # library one-hot and distal-PAS flag used by the plasmid loaders
        # in this file; confirm against the model's input signature.
        return [one_hots, np.zeros((n, 36)), np.ones((n, 1))]

    return encode_for_aparent
Example #5
0
def predict_mut_map(model, seq, isoform_start=80, isoform_end=105):
    """Score every single-nucleotide variant of seq with the model.

    Returns
    -------
    mut_map : (len(seq), 4) array — per-variant sum of the predicted cut
        distribution over [isoform_start, isoform_end).
    cut_map : (len(seq), 4, len(seq) + 1) array — full raveled cut
        prediction per variant (assumes the model's second output ravels
        to len(seq) + 1 values — TODO confirm).
    """
    seq_len = len(seq)
    encoder = iso.OneHotEncoder(seq_len)

    mut_map = np.zeros((seq_len, 4))
    cut_map = np.zeros((seq_len, 4, seq_len + 1))

    for pos in range(seq_len):
        prefix, suffix = seq[:pos], seq[pos + 1:]
        for j, nt in enumerate(['A', 'C', 'G', 'T']):
            # Substitute a single nucleotide and predict.
            one_hot = encoder(prefix + nt + suffix)
            _, cut_pred = model.predict(
                x=aparent_single_example_batch(one_hot))

            cut_flat = np.ravel(cut_pred)
            mut_map[pos, j] = np.sum(cut_flat[isoform_start:isoform_end])
            cut_map[pos, j, :] = cut_flat

    return mut_map, cut_map
Example #6
0
def initialize_sequence_templates(generator, sequence_templates):
    """Bake fixed templates and editable-position masks into the generator.

    Fixed letters get strongly biased logits (+10 on the templated channel,
    -4 elsewhere), 'X' positions are uniformly suppressed (-1 everywhere),
    and 'N' positions keep their encoded values and are the only positions
    enabled (1.0) in the mask. Both layers are frozen after loading.
    """
    n_templates = len(sequence_templates)
    seq_len = len(sequence_templates[0])

    encoder = iso.OneHotEncoder(seq_length=seq_len)
    # (n_templates, seq_len, 4, 1) one-hot stack of all templates.
    onehot_templates = np.concatenate(
        [encoder(template).reshape((1, len(template), 4, 1))
         for template in sequence_templates],
        axis=0)

    for i, template in enumerate(sequence_templates):
        for j, ch in enumerate(template):
            if ch == 'N':
                continue
            if ch == 'X':
                onehot_templates[i, j, :, :] = -1
            else:
                nt_ix = np.argmax(onehot_templates[i, j, :, 0])
                onehot_templates[i, j, :, :] = -4
                onehot_templates[i, j, nt_ix, :] = 10

    # Mask is 1 only at freely-optimizable ('N') positions.
    onehot_masks = np.zeros((n_templates, seq_len, 4, 1))
    for i, template in enumerate(sequence_templates):
        for j, ch in enumerate(template):
            if ch == 'N':
                onehot_masks[i, j, :, :] = 1.0

    for layer_name, weights in (('template_dense', onehot_templates),
                                ('mask_dense', onehot_masks)):
        layer = generator.get_layer(layer_name)
        layer.set_weights([weights.reshape(1, -1)])
        layer.trainable = False
def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path=''):
    """Load the designed MPRA master-sequence data and build data generators.

    Parameters
    ----------
    batch_size : int
        Batch size for each iso.DataGenerator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned sequentially to validation / test.
    file_path : str
        Directory prefix for the isoio data store.

    Returns
    -------
    dict
        Keys 'all', 'train', 'valid', 'test' mapping to iso.DataGenerator.
    """

    #Load array data
    array_dict = isoio.load(file_path + 'apa_array_data_master_seq')
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    #Generate training and test set indexes.
    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    # (The original computed this index array twice; once is enough.)
    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    #Sequential (unshuffled) train/valid/test split.
    array_train_index = array_index[:-int(
        len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[
        array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] +
                                   array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    #Manually set sublibrary intercept terms: (library_index, distal_pas)
    #per gene; rows with other genes keep the defaults (0, 1).
    array_df['library_index'] = np.zeros(len(array_df), dtype=int)
    array_df['distal_pas'] = np.ones(len(array_df))

    gene_settings = {
        'doubledope': (20, 1),
        'simple': (22, 0),
        'tomm5': (8, 1),
        'aar': (30, 0),
        'atr': (31, 0),
        'hsp': (32, 0),
        'snh': (33, 0),
        'sox': (34, 0),
        'wha': (35, 0),
    }
    for gene, (library_index, distal_pas) in gene_settings.items():
        gene_rows = array_df['gene'] == gene
        array_df.loc[gene_rows, 'library_index'] = library_index
        array_df.loc[gene_rows, 'distal_pas'] = distal_pas

    #One generator per split, all sharing the same input/output pipeline.
    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext',
                                                       start_pos=200 + 1,
                                                       end_pos=200 + 1 + 185),
                    'encoder': iso.OneHotEncoder(seq_length=185),
                    'dim': (1, 185, 4),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['library_index'],
                    # list(range(36)) == np.arange(36, dtype=np.int).tolist()
                    # without the removed np.int alias.
                    'encoder': iso.CategoricalEncoder(
                        n_categories=36,
                        categories=list(range(36))),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: row['distal_pas'],
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=200 + 1,
                                                    end_pos=200 + 1 + 185,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in [('all', array_index),
                            ('train', array_train_index),
                            ('valid', array_valid_index),
                            ('test', array_test_index)]
    }

    return array_gens
def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path=''):
    """Load pair-wise Leslie/APADB native APA data and build data generators.

    Parameters
    ----------
    batch_size : int
        Batch size for each iso.DataGenerator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned sequentially to validation / test.
    file_path : str
        Directory prefix for the isoio data store.

    Returns
    -------
    dict
        Keys 'all', 'train', 'valid', 'test' mapping to iso.DataGenerator.
    """

    #Load leslie/apadb pair-wise data
    native_dict = isoio.load(file_path + 'apa_leslie_apadb_pair_data')
    native_df = native_dict['df_pair']

    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    native_index = np.arange(len(native_df), dtype=int)

    print('Pair-wise Native APA (APADB + Leslie) size = ' +
          str(native_index.shape[0]))

    #Sequential (unshuffled) train/valid/test split.
    native_train_index = native_index[:-int(
        len(native_df) * (valid_set_size + test_set_size))]
    native_valid_index = native_index[
        native_train_index.shape[0]:-int(len(native_df) * test_set_size)]
    native_test_index = native_index[native_train_index.shape[0] +
                                     native_valid_index.shape[0]:]

    print('Training set size = ' + str(native_train_index.shape[0]))
    print('Validation set size = ' + str(native_valid_index.shape[0]))
    print('Test set size = ' + str(native_test_index.shape[0]))

    #Calculate relative APADB cut start and end positions within each
    #sequence. The +70 / +76 offsets shift genomic coordinates into the
    #extracted sequence window (presumably matching the wide_seq_ext_*
    #extraction below — TODO confirm); on the '-' strand the coordinates
    #are mirrored, so the opposite cut bound is used.
    def _rel_pos(row, plus_col, minus_col, pas_col):
        #One helper replaces four copy-pasted functions with identical shape.
        if row['strand'] == '+':
            return row[plus_col] - row[pas_col] + 70
        return row[pas_col] - row[minus_col] + 76

    native_df['rel_start_prox'] = native_df.apply(
        lambda row: _rel_pos(row, 'cut_start_prox', 'cut_end_prox',
                             'pas_pos_prox'), axis=1)
    native_df['rel_end_prox'] = native_df.apply(
        lambda row: _rel_pos(row, 'cut_end_prox', 'cut_start_prox',
                             'pas_pos_prox'), axis=1)
    native_df['rel_start_dist'] = native_df.apply(
        lambda row: _rel_pos(row, 'cut_start_dist', 'cut_end_dist',
                             'pas_pos_dist'), axis=1)
    native_df['rel_end_dist'] = native_df.apply(
        lambda row: _rel_pos(row, 'cut_end_dist', 'cut_start_dist',
                             'pas_pos_dist'), axis=1)

    def _seq_input(input_id, seq_column):
        #One-hot sequence input drawn from a 205 nt window.
        return {
            'id': input_id,
            'source': 'df',
            'source_type': 'dataframe',
            'extractor': iso.SequenceExtractor(seq_column,
                                               start_pos=105,
                                               end_pos=105 + 205),
            'encoder': iso.OneHotEncoder(seq_length=205),
            'dim': (205, 4, 1),
            'sparsify': False
        }

    def _scalar_input(input_id, column):
        #Plain scalar column input; bind the column name as a default to
        #avoid the late-binding closure pitfall.
        return {
            'id': input_id,
            'source': 'df',
            'source_type': 'dataframe',
            'extractor': lambda row, index, col=column: row[col],
            'transformer': None,
            'dim': (1, ),
            'sparsify': False
        }

    native_gens = {
        gen_id: iso.DataGenerator(
            idx, {'df': native_df},
            batch_size=batch_size,
            inputs=[
                _seq_input('seq_prox', 'wide_seq_ext_prox'),
                _seq_input('seq_dist', 'wide_seq_ext_dist'),
                _scalar_input('start_prox', 'rel_start_prox'),
                _scalar_input('end_prox', 'rel_end_prox'),
                _scalar_input('start_dist', 'rel_start_dist'),
                _scalar_input('end_dist', 'rel_end_dist'),
                {
                    'id': 'site_distance',
                    'source': 'df',
                    'source_type': 'dataframe',
                    #Log distance between proximal and distal cut starts.
                    'extractor': lambda row, index: np.log(
                        np.abs(row['cut_start_dist'] - row['cut_start_prox'])),
                    'transformer': None,
                    'dim': (1, ),
                    'sparsify': False
                },
                {
                    #Library indicator is all-zero for native data.
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: np.zeros(13),
                    'encoder': None,
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[{
                'id': 'dummy_output',
                'source_type': 'zeros',
                'dim': (1, ),
                'sparsify': False
            }],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in [('all', native_index),
                            ('train', native_train_index),
                            ('valid', native_valid_index),
                            ('test', native_test_index)]
    }

    return native_gens
def load_data(batch_size=32, valid_set_size=0.025, test_set_size=0.025, file_path='', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False) :
    """Load legacy plasmid MPRA data and build prediction data generators.

    Parameters
    ----------
    batch_size : int
        Batch size for each iso.DataGenerator.
    valid_set_size, test_set_size : float or int
        If both are <= 1.0 they are treated as fractions of the data,
        otherwise as absolute row counts.
    file_path : str
        Directory prefix for the isoio data store.
    kept_libraries : list or None
        If given, keep only rows whose library_index is in this list.
    canonical_pas : bool
        Keep only sequences with a canonical AATAAA PAS at positions 50-56.
    no_dse_canonical_pas : bool
        Drop sequences with another AATAAA downstream of position 56.

    Returns
    -------
    dict
        Keys 'all', 'train', 'valid', 'test' mapping to iso.DataGenerator.
    """

    #Load plasmid data
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data_legacy')
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    def _keep(df, cuts, keep_index):
        #Apply a row filter consistently to the dataframe and cut matrix.
        return df.iloc[keep_index].copy(), cuts[keep_index, :]

    if kept_libraries is not None:
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df, plasmid_cuts = _keep(plasmid_df, plasmid_cuts, keep_index)

    if canonical_pas:
        keep_index = np.nonzero(plasmid_df.seq.str.slice(50, 56) == 'AATAAA')[0]
        plasmid_df, plasmid_cuts = _keep(plasmid_df, plasmid_cuts, keep_index)

    if no_dse_canonical_pas:
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(56).str.contains('AATAAA'))[0]
        plasmid_df, plasmid_cuts = _keep(plasmid_df, plasmid_cuts, keep_index)

    #Generate training and test set indexes.
    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    if valid_set_size <= 1.0 and test_set_size <= 1.0:
        #Fractional split sizes.
        plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
    else:
        #Absolute split sizes (row counts).
        plasmid_train_index = plasmid_index[:-(valid_set_size + test_set_size)]
        plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-test_set_size]
    plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    plasmid_prediction_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('seq', start_pos=1, end_pos=1 + 185),
                    'encoder' : iso.OneHotEncoder(seq_length=185),
                    'dim' : (1, 185, 4),
                    'sparsify' : False
                },
                {
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library_index'],
                    # list(range(36)) == np.arange(36, dtype=np.int).tolist()
                    # without the removed np.int alias.
                    'encoder' : iso.CategoricalEncoder(n_categories=36, categories=list(range(36))),
                    'sparsify' : False
                },
                {
                    'id' : 'distal_pas',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    #Libraries with a distal PAS present in the construct.
                    'extractor' : lambda row, index: 1 if row['library_index'] in [2, 5, 8, 11, 20] else 0,
                    'encoder' : None,
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    'id' : 'prox_usage',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['proximal_count'] / row['total_count'],
                    'transformer' : lambda t: t,
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=0, end_pos=186, sparse_source=False),
                    'transformer' : lambda t: t,
                    'dim' : (186,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = False,
            densify_batch_matrices=True
        ) for gen_id, idx in [('all', plasmid_index), ('train', plasmid_train_index), ('valid', plasmid_valid_index), ('test', plasmid_test_index)]
    }

    return plasmid_prediction_gens
Example #10
0
def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path='',
              data_version=''):
    """Load (versioned) designed MPRA array data and build data generators.

    Parameters
    ----------
    batch_size : int
        Batch size for each iso.DataGenerator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned sequentially to validation / test.
    file_path : str
        Directory prefix for the isoio data store.
    data_version : str
        Suffix appended to the 'apa_array_data' store name.

    Returns
    -------
    dict
        Keys 'all', 'train', 'valid', 'test' mapping to iso.DataGenerator.
    """

    #Load array data
    array_dict = isoio.load(file_path + 'apa_array_data' + data_version)
    array_df = array_dict['array_df']
    array_cuts = array_dict['pooled_cuts']

    #Generate training and test set indexes.
    # np.int / np.object were removed in NumPy 1.24; builtins are
    # equivalent. (The original computed this index array twice.)
    array_index = np.arange(len(array_df), dtype=int)

    print('Designed MPRA size = ' + str(array_index.shape[0]))

    #Sequential (unshuffled) train/valid/test split.
    array_train_index = array_index[:-int(
        len(array_df) * (valid_set_size + test_set_size))]
    array_valid_index = array_index[
        array_train_index.shape[0]:-int(len(array_df) * test_set_size)]
    array_test_index = array_index[array_train_index.shape[0] +
                                   array_valid_index.shape[0]:]

    print('Training set size = ' + str(array_train_index.shape[0]))
    print('Validation set size = ' + str(array_valid_index.shape[0]))
    print('Test set size = ' + str(array_test_index.shape[0]))

    #Sublibrary categories; every array row is encoded as 'array'.
    unique_libraries = np.array([
        'tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20',
        'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20', 'doubledope',
        'simple', 'atr', 'hsp', 'snh', 'sox', 'wha', 'array', 'aar'
    ], dtype=object)

    array_gens = {
        gen_id: iso.DataGenerator(
            idx,
            {'df': array_df, 'cuts': array_cuts},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': iso.SequenceExtractor('seq_ext',
                                                       start_pos=180,
                                                       end_pos=180 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 'array',
                    'encoder': iso.CategoricalEncoder(
                        n_categories=len(unique_libraries),
                        categories=unique_libraries),
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[
                {
                    'id': 'prox_usage',
                    'source_type': 'matrix',
                    'source': 'cuts',
                    'extractor': iso.CountExtractor(start_pos=180,
                                                    end_pos=180 + 205,
                                                    static_poses=[-1],
                                                    sparse_source=True),
                    'transformer': lambda t: iso_normalizer(t),
                    'sparsify': False
                }
            ],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in [('all', array_index),
                            ('train', array_train_index),
                            ('valid', array_valid_index),
                            ('test', array_test_index)]
    }

    return array_gens
Example #11
0
def load_data(batch_size=32,
              valid_set_size=0.0,
              test_set_size=1.0,
              file_path=''):
    """Load native pA (APADB + Leslie) data and build data generators.

    Parameters
    ----------
    batch_size : int
        Batch size for each iso.DataGenerator.
    valid_set_size, test_set_size : float
        Fractions of the data assigned sequentially to validation / test.
    file_path : str
        Directory prefix for the isoio data store.

    Returns
    -------
    dict
        Keys 'all', 'train', 'valid', 'test' mapping to iso.DataGenerator.
    """

    #Load array data
    native_dict = isoio.load(file_path + 'apa_leslie_apadb_data')
    native_df = native_dict['df']

    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    native_index = np.arange(len(native_df), dtype=int)

    print('Native pA (APADB + Leslie) size = ' + str(native_index.shape[0]))

    #Sequential (unshuffled) train/valid/test split.
    native_train_index = native_index[:-int(
        len(native_df) * (valid_set_size + test_set_size))]
    native_valid_index = native_index[
        native_train_index.shape[0]:-int(len(native_df) * test_set_size)]
    native_test_index = native_index[native_train_index.shape[0] +
                                     native_valid_index.shape[0]:]

    print('Training set size = ' + str(native_train_index.shape[0]))
    print('Validation set size = ' + str(native_valid_index.shape[0]))
    print('Test set size = ' + str(native_test_index.shape[0]))

    native_gens = {
        gen_id: iso.DataGenerator(
            idx, {'df': native_df},
            batch_size=batch_size,
            inputs=[
                {
                    'id': 'seq_prox',
                    'source': 'df',
                    'source_type': 'dataframe',
                    'extractor': iso.SequenceExtractor('wide_seq_ext',
                                                       start_pos=105,
                                                       end_pos=105 + 205),
                    'encoder': iso.OneHotEncoder(seq_length=205),
                    'dim': (205, 4, 1),
                    'sparsify': False
                },
                {
                    #Library indicator is all-zero for native data.
                    'id': 'lib',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: np.zeros(13),
                    'encoder': None,
                    'sparsify': False
                },
                {
                    'id': 'distal_pas',
                    'source_type': 'dataframe',
                    'source': 'df',
                    'extractor': lambda row, index: 1,
                    'encoder': None,
                    'sparsify': False
                }
            ],
            outputs=[{
                'id': 'dummy_output',
                'source_type': 'zeros',
                'dim': (1, ),
                'sparsify': False
            }],
            randomizers=[],
            shuffle=False)
        for gen_id, idx in [('all', native_index),
                            ('train', native_train_index),
                            ('valid', native_valid_index),
                            ('test', native_test_index)]
    }

    return native_gens
Example #12
0
def load_data(batch_size=64, valid_set_size=0.025, test_set_size=0.025, file_path='', data_version='_v2', kept_libraries=None, canonical_pas=False, no_dse_canonical_pas=False, no_clinvar_wt=True) :
    """Load the APA plasmid dataset and build training/prediction generators.

    Parameters
    ----------
    batch_size : int
        Batch size for the returned ``iso.DataGenerator`` objects.
    valid_set_size, test_set_size : float
        Fractions of the (filtered) data held out for validation / test.
        The split is positional: the tail rows of the dataframe are used,
        with no shuffling before splitting.
    file_path : str
        Directory prefix of the serialized dataset.
    data_version : str
        Version suffix of the dataset file (e.g. ``'_v2'``).
    kept_libraries : list or None
        If given, keep only rows whose ``library_index`` is in this list.
    canonical_pas : bool
        If True, keep only sequences with a canonical AATAAA PAS at 70:76.
    no_dse_canonical_pas : bool
        If True, drop sequences containing AATAAA downstream of position 76.
    no_clinvar_wt : bool
        If True, drop the ``'clinvar_wt'`` sublibrary.

    Returns
    -------
    (dict, dict)
        Two dicts keyed by 'all'/'train'/'valid'/'test':
        training generators (shuffled; counts/usage supplied as inputs) and
        prediction generators (unshuffled; usage/cuts supplied as outputs).
    """

    #Load plasmid data (dataframe of sequences + matrix of cut-site counts)
    plasmid_dict = isoio.load(file_path + 'apa_plasmid_data' + data_version)
    plasmid_df = plasmid_dict['plasmid_df']
    plasmid_cuts = plasmid_dict['plasmid_cuts']

    #FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24 — use the
    #builtin `object` dtype instead. Library order is fixed here so that the
    #categorical (one-hot) library encoding is stable across dataset versions.
    unique_libraries = np.array(['tomm5_up_n20c20_dn_c20', 'tomm5_up_c20n20_dn_c20', 'tomm5_up_n20c20_dn_n20', 'tomm5_up_c20n20_dn_n20', 'doubledope', 'simple', 'atr', 'hsp', 'snh', 'sox', 'wha', 'array', 'aar'], dtype=object)#plasmid_df['library'].unique()

    #Each filter below keeps the dataframe and the cut matrix row-aligned by
    #applying the same keep_index to both.
    if kept_libraries is not None :
        keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if canonical_pas :
        keep_index = np.nonzero(plasmid_df.seq.str.slice(70, 76) == 'AATAAA')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if no_dse_canonical_pas :
        keep_index = np.nonzero(~plasmid_df.seq.str.slice(76).str.contains('AATAAA'))[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]

    if no_clinvar_wt :
        print("size before filtering out clinvar_wt = " + str(len(plasmid_df)))
        keep_index = np.nonzero(plasmid_df.sublibrary != 'clinvar_wt')[0]
        plasmid_df = plasmid_df.iloc[keep_index].copy()
        plasmid_cuts = plasmid_cuts[keep_index, :]
        print("size after filtering out clinvar_wt = " + str(len(plasmid_df)))

    #Generate training and test set indexes (positional tail split).
    #FIX: np.int was removed in NumPy 1.24 — use the builtin `int`.
    plasmid_index = np.arange(len(plasmid_df), dtype=int)

    #NOTE(review): if int(len * fraction) evaluates to 0 (very small datasets),
    #the `[:-0]` slice yields an EMPTY train set — confirm callers never pass
    #fractions that round the holdout down to zero.
    plasmid_train_index = plasmid_index[:-int(len(plasmid_df) * (valid_set_size + test_set_size))]
    plasmid_valid_index = plasmid_index[plasmid_train_index.shape[0]:-int(len(plasmid_df) * test_set_size)]
    plasmid_test_index = plasmid_index[plasmid_train_index.shape[0] + plasmid_valid_index.shape[0]:]

    print('Training set size = ' + str(plasmid_train_index.shape[0]))
    print('Validation set size = ' + str(plasmid_valid_index.shape[0]))
    print('Test set size = ' + str(plasmid_test_index.shape[0]))

    #Training generators: shuffled; the supervision signals (total counts,
    #proximal usage, per-position cut distribution) are wired in as *inputs*
    #so a custom loss layer can consume them; the Keras output is a dummy.
    plasmid_training_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder' : iso.OneHotEncoder(seq_length=205),
                    'dim' : (1, 205, 4),
                    'sparsify' : False
                },
                {
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library'],
                    'encoder' : iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify' : False
                },
                {
                    'id' : 'total_count',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: np.sum(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    'id' : 'prox_usage',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: iso_normalizer(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: cut_normalizer(t),
                    'dim' : (206,),
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    'id' : 'dummy_output',
                    'source_type' : 'zeros',
                    'dim' : (1,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = True,
            densify_batch_matrices=True
        ) for gen_id, idx in [('all', plasmid_index), ('train', plasmid_train_index), ('valid', plasmid_valid_index), ('test', plasmid_test_index)]
    }

    #Prediction generators: unshuffled; only sequence + library as inputs,
    #with the true usage/cut targets exposed as outputs for evaluation.
    plasmid_prediction_gens = {
        gen_id : iso.DataGenerator(
            idx,
            {'df' : plasmid_df, 'cuts' : plasmid_cuts},
            batch_size=batch_size,
            inputs = [
                {
                    'id' : 'seq',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : iso.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                    'encoder' : iso.OneHotEncoder(seq_length=205),
                    'dim' : (1, 205, 4),
                    'sparsify' : False
                },
                {
                    'id' : 'lib',
                    'source_type' : 'dataframe',
                    'source' : 'df',
                    'extractor' : lambda row, index: row['library'],
                    'encoder' : iso.CategoricalEncoder(n_categories=len(unique_libraries), categories=unique_libraries),
                    'sparsify' : False
                }
            ],
            outputs = [
                {
                    'id' : 'prox_usage',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: iso_normalizer(t),
                    'dim' : (1,),
                    'sparsify' : False
                },
                {
                    'id' : 'prox_cuts',
                    'source_type' : 'matrix',
                    'source' : 'cuts',
                    'extractor' : iso.CountExtractor(start_pos=180, end_pos=180 + 205, static_poses=[-1], sparse_source=False),
                    'transformer' : lambda t: cut_normalizer(t),
                    'dim' : (206,),
                    'sparsify' : False
                }
            ],
            randomizers = [],
            shuffle = False,
            densify_batch_matrices=True
        ) for gen_id, idx in [('all', plasmid_index), ('train', plasmid_train_index), ('valid', plasmid_valid_index), ('test', plasmid_test_index)]
    }

    return plasmid_training_gens, plasmid_prediction_gens
示例#13
0
def predict_ref(model, seq, isoform_start=80, isoform_end=105):
    """Run the model on a single reference sequence.

    One-hot encodes ``seq``, predicts its cut-site distribution, and sums the
    probability mass in the proximal isoform window.

    Returns a tuple of (summed probability over
    ``[isoform_start:isoform_end]``, flattened full cut distribution).
    """
    encoded_seq = iso.OneHotEncoder(len(seq))(seq)
    _, cut_pred = model.predict(x=aparent_single_example_batch(encoded_seq))

    cut_flat = np.ravel(cut_pred)
    isoform_prob = np.sum(cut_flat[isoform_start:isoform_end])
    return isoform_prob, cut_flat