def violin_plot(dataset):
    """Draw a violin plot of normalized localization values and save it.

    Genes are loaded with ``Gene_Wrapper.seq_data_loader(False, dataset)``.
    Each gene's ``dist`` is divided by its sum, and one violin is drawn per
    location label. The figure is saved to ``Graph/violin_<dataset>.png``;
    the ``Graph/`` directory is not created here.

    Args:
        dataset: ``"cefra-seq"`` or ``"apex-rip"``.

    Raises:
        RuntimeError: If ``dataset`` is neither of the two known names.
    """
    data = Gene_Wrapper.seq_data_loader(False, dataset)

    # Resolve the tick labels before touching matplotlib, so an unknown
    # dataset fails without leaving a half-built figure behind.
    if dataset == "cefra-seq":
        locations = ["cytosol", "insoluble", "membrane", "nucleus"]
    elif dataset == "apex-rip":
        locations = ['KDEL', 'Mito', 'NES', 'NLS']
    else:
        raise RuntimeError('No such dataset')

    # One row per gene, each row rescaled to sum to 1.
    y = np.array([np.array(gene.dist) / sum(gene.dist) for gene in data])

    def set_axis_style(ax, labels):
        """Put one labelled tick under each violin and name the y axis."""
        ax.get_xaxis().set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticks(np.arange(1, len(labels) + 1))
        ax.set_xticklabels(labels)
        ax.set_xlim(0.25, len(labels) + 0.75)
        ax.set_ylabel('Localization values')

    fig = plt.figure(figsize=(12, 12))
    # One violin per label (four for both datasets), so violins and tick
    # labels cannot drift apart.
    plt.violinplot([y[:, i] for i in range(len(locations))], showmeans=True)
    ax = fig.axes[0]
    set_axis_style(ax, locations)
    plt.xticks(rotation=-20)
    plt.savefig('Graph/violin_{}.png'.format(dataset))
def preprocess_data(dataset):
    """Load annotated genes and turn them into padded index matrices.

    Genes come from ``Gene_Wrapper.seq_data_loader(True, dataset, 0, 4000,
    permute=False)``. Sequence and annotation symbols are upper-cased, mapped
    to their position in ``seq_encoding_keys`` / ``annotation_encoding_keys``,
    and padded to length 4000 with the ``'UNK'`` index.

    Args:
        dataset: Dataset selector forwarded to the loader.

    Returns:
        ``(X_seq, X_ann, y, folds)`` where ``y`` holds ``label_dist(gene.dist)``
        per gene and ``folds`` is the iterator produced by a shuffled 10-fold
        ``KFold.split`` (seed 1234).
    """
    max_len = 4000

    def to_padded(symbol_rows, vocabulary):
        # Integer-encode each row against the vocabulary, then pad with 'UNK'.
        indexed = [[vocabulary.index(sym.upper()) for sym in row]
                   for row in symbol_rows]
        return pad_sequences(indexed,
                             maxlen=max_len,
                             dtype=np.int8,
                             value=vocabulary.index('UNK'))

    genes = Gene_Wrapper.seq_data_loader(True, dataset, 0, max_len,
                                         permute=False)

    X_seq = to_padded((g.seq for g in genes), seq_encoding_keys)
    X_ann = to_padded((g.ann for g in genes), annotation_encoding_keys)
    y = np.array([label_dist(g.dist) for g in genes])

    from sklearn.model_selection import KFold
    splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
    return X_seq, X_ann, y, splitter.split(X_seq, y)
def preprocess_data(left, right, dataset):
    """Encode both sequence ends of every gene and build a padding mask.

    Genes are loaded with ``Gene_Wrapper.load_sequence(dataset, left, right)``.
    ``gene.seqleft`` is index-encoded and padded *after* the sequence to
    ``left`` positions; ``gene.seqright`` is padded *before* the sequence to
    ``right`` positions. The two halves are concatenated along the last axis.

    Args:
        left: Width of the left window.
        right: Width of the right window.
        dataset: Dataset selector forwarded to the loader.

    Returns:
        ``(X, y, mask_label, encoding_keys, encoding_vectors)``.
        ``mask_label`` is a float32 array with 1 at real positions and 0 at
        padded positions, laid out like ``X``.
    """
    genes = Gene_Wrapper.load_sequence(dataset, left, right)
    print('padding and indexing data')

    encoding_keys = seq_encoding_keys
    encoding_vectors = seq_encoding_vectors

    # Left half: real symbols first, padding at the end.
    left_tokens = [[encoding_keys.index(ch) for ch in g.seqleft]
                   for g in genes]
    X_left = pad_sequences(left_tokens,
                           maxlen=left,
                           dtype=np.int8,
                           value=encoding_keys.index('UNK'),
                           padding='post')

    # Right half: padding first, real symbols at the end.
    right_tokens = [[encoding_keys.index(ch) for ch in g.seqright]
                    for g in genes]
    X_right = pad_sequences(right_tokens,
                            maxlen=right,
                            dtype=np.int8,
                            value=encoding_keys.index('UNK'),
                            padding='pre')

    print(f"X_left shape is {X_left.shape!s}")
    print(f"X_right shape is {X_right.shape!s}")

    X = np.concatenate([X_left, X_right], axis=-1)
    print(f"X shape is {X.shape!s}")

    y = np.array([label_dist(g.dist) for g in genes])

    # Masks mirror the padding layout above: ones over real symbols,
    # zeros over the padded tail (left) or padded head (right).
    left_rows = []
    for g in genes:
        n_real = len(g.seqleft)
        left_rows.append(
            np.concatenate([np.ones(n_real), np.zeros(left - n_real)]))
    mask_label_left = np.array(left_rows, dtype='float32')

    right_rows = []
    for g in genes:
        n_real = len(g.seqright)
        right_rows.append(
            np.concatenate([np.zeros(right - n_real), np.ones(n_real)]))
    mask_label_right = np.array(right_rows, dtype='float32')

    mask_label = np.concatenate([mask_label_left, mask_label_right], axis=-1)

    print(f"training shapes{X.shape!s} {y.shape!s}")
    print(f"Example y is {y[0, :]!s}")
    return X, y, mask_label, encoding_keys, encoding_vectors
return np.array(dist) / np.sum(dist) encoding_seq = OrderedDict([ ('UNK', [0, 0, 0, 0]), ('A', [1, 0, 0, 0]), ('C', [0, 1, 0, 0]), ('G', [0, 0, 1, 0]), ('T', [0, 0, 0, 1]), ('N', [0.25, 0.25, 0.25, 0.25]), # A or C or G or T ]) encoding_keys = list(encoding_seq.keys()) encoding_vectors = np.array(list(encoding_seq.values())) reverse_mapping = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'U'} gene_data = Gene_Wrapper.seq_data_loader(False, 'cefra-seq', 0, np.inf) X = np.array([ np.array([encoding_keys.index(c) for c in gene.seq]) for gene in gene_data ]) Y = np.array([label_dist(gene.dist) for gene in gene_data]) def cnn_bilstm_model(pooling_size=3, nb_filters=32, filters_length=10, lstm_units=32, attention_size=50): '''build model''' input = Input(shape=(None, ), dtype='int8') embedding_layer = Embedding(len(encoding_vectors),
# One-hot table for per-position structure annotations. Insertion order
# matters: a symbol's position in this dict becomes its integer token below.
# NOTE(review): the keys here are lowercase; any lookup that upper-cases the
# annotation symbol first will not find them -- verify against callers.
encoding_annotation = OrderedDict([
    ('UNK', [0, 0, 0, 0, 0, 0]),  # for padding use
    ('f', [1, 0, 0, 0, 0, 0]),  # 'dangling start',
    ('t', [0, 1, 0, 0, 0, 0]),  # dangling end',
    ('i', [0, 0, 1, 0, 0, 0]),  # 'internal loop',
    ('h', [0, 0, 0, 1, 0, 0]),  # 'hairpin loop',
    ('m', [0, 0, 0, 0, 1, 0]),  # 'multi loop',
    ('s', [0, 0, 0, 0, 0, 1])  # 'stem'
])

# Parallel lookups: the index of a symbol in *_keys is its integer token, and
# the same row of *_vectors is that token's encoding vector.
seq_encoding_keys = list(encoding_seq.keys())
seq_encoding_vectors = np.array(list(encoding_seq.values()))
annotation_encoding_keys = list(encoding_annotation.keys())
annotation_encoding_vectors = np.array(list(encoding_annotation.values()))

# NOTE(review): the second positional argument is `False` here; confirm that
# this is a valid dataset selector for seq_data_loader.
gene_data = Gene_Wrapper.seq_data_loader(False, False, 0, 4000)
encoding_keys = seq_encoding_keys
encoding_vectors = seq_encoding_vectors
# Integer-encode every sequence and pad to 4000 positions with the 'UNK'
# index; pad_sequences' default padding/truncating side is used.
X = pad_sequences([[encoding_keys.index(c) for c in gene.seq]
                   for gene in gene_data],
                  maxlen=4000,
                  dtype=np.int8,
                  value=encoding_keys.index('UNK'))  # , truncating='post')
y = np.array([label_dist(gene.dist) for gene in gene_data])
ids = np.array([gene.id for gene in gene_data])
# Sequence length of each gene as loaded, before padding.
true_length = np.array([len(gene.seq) for gene in gene_data])

# An empty --saved_expr means no previous run was given, so a new output
# path is assembled.
if args.saved_expr == "":
    print('New experiment')
    OUTPATH = os.path.join(
        basedir, 'Results', 'cefra-seq', 'SGDModel-10foldcv',
def _build_combined_encoding():
    """Build the joint (nucleotide, annotation) encoding table.

    Keys are ``'UNK'`` (all zeros, used for padding) followed by every
    two-character combination of a base in ``A, C, G, T, N`` and an annotation
    in ``F, T, I, H, M, S``, in that nesting order. Concrete bases get a
    24-wide one-hot vector; ``'N' + ann`` gets 0.25 on each of the four slots
    that belong to ``A/C/G/T`` with the same annotation.
    """
    bases = ['A', 'C', 'G', 'T']
    annotations = ['F', 'T', 'I', 'H', 'M', 'S']
    width = len(bases) * len(annotations)  # dim([a,c,g,t]) * dim([f,t,i,h,m,s])

    table = OrderedDict()
    table['UNK'] = [0] * width

    # Plain one-hot rows for the concrete bases.
    for slot, (base, ann) in enumerate(itertools.product(bases, annotations)):
        vector = [0] * width
        vector[slot] = 1
        table[base + ann] = vector

    # 'N' spreads its weight evenly over the four concrete bases that share
    # the same annotation.
    for ann in annotations:
        vector = [0] * width
        for base in bases:
            vector[table[base + ann].index(1)] = 0.25
        table['N' + ann] = vector

    return table


def preprocess_data(lower_bound,
                    upper_bound,
                    use_annotations,
                    dataset,
                    max_len,
                    randomization_test=False):
    """Load genes, integer-encode and pad them, and prepare 10-fold splits.

    Side effect: the module-level ``gene_ids`` is rebound to an array of the
    loaded genes' ``id`` values.

    Args:
        lower_bound: Forwarded to ``Gene_Wrapper.seq_data_loader``.
        upper_bound: Forwarded to ``Gene_Wrapper.seq_data_loader``.
        use_annotations: If true, each position is encoded as a combined
            (nucleotide, annotation) token; otherwise only the nucleotide is
            encoded using ``seq_encoding_keys``.
        dataset: Dataset selector forwarded to the loader.
        max_len: Length every encoded sequence is padded to by
            ``pad_sequences``.
        randomization_test: Forwarded to the loader as ``permute``.

    Returns:
        ``(X, y, folds, encoding_keys, encoding_vectors)``. ``folds`` is the
        iterator returned by a shuffled 10-fold ``KFold.split`` (seed 1234),
        so it can be consumed only once.
    """
    global gene_ids
    from sklearn.model_selection import KFold

    gene_data = Gene_Wrapper.seq_data_loader(use_annotations,
                                             dataset,
                                             lower_bound,
                                             upper_bound,
                                             permute=randomization_test)
    print('padding and indexing data')

    if use_annotations:
        print(
            'Using unified one-hot encoding for both sequence and annotation features'
        )
        combined_encoding = _build_combined_encoding()
        encoding_keys = list(combined_encoding.keys())
        encoding_vectors = np.array(list(combined_encoding.values()))
        # Message repeated on purpose: console output is kept as it was.
        print('padding and indexing data')
        indexed = [[
            encoding_keys.index(s.upper() + a.upper())
            for s, a in zip(gene.seq, gene.ann)
        ] for gene in gene_data]
    else:
        encoding_keys = seq_encoding_keys
        encoding_vectors = seq_encoding_vectors
        indexed = [[encoding_keys.index(c) for c in gene.seq]
                   for gene in gene_data]

    X = pad_sequences(indexed,
                      maxlen=max_len,
                      dtype=np.int8,
                      value=encoding_keys.index('UNK'))
    y = np.array([label_dist(gene.dist) for gene in gene_data])

    gene_ids = np.array([gene.id for gene in gene_data])

    # sklearn k-fold splitting; fixed seed keeps folds reproducible.
    kf = KFold(n_splits=10, shuffle=True, random_state=1234)
    folds = kf.split(X, y)
    return X, y, folds, encoding_keys, encoding_vectors
type=str, default='cefra-seq', choices=['cefra-seq', 'apex-rip'], help='choose from cefra-seq and apex-rip') parser.add_argument('--model', type=str, default='cnn_bilstm', choices=['cnn', 'cnn_bilstm', 'resnet'], help='') parser.add_argument('--message', type=str, default="", help='') parser.add_argument('--epochs', type=int, default=100, help='') args = parser.parse_args() # no clipping, no padding gene_data = Gene_Wrapper.seq_data_loader(False, args.dataset, lower_bound=0, upper_bound=np.inf) X = np.array([[encoding_keys.index(c) for c in gene.seq] for gene in gene_data]) y = np.array([label_dist(gene.dist) for gene in gene_data]) kf = KFold(n_splits=10, shuffle=True, random_state=1234) folds = kf.split(X, y) if args.dataset == "cefra-seq": locations = ['KDEL', 'Mito', 'NES', 'NLS'] elif args.dataset == "apex-rip": locations = ["cytoplasm", "insoluble", "membrane", "nucleus"] else: raise RuntimeError('No such dataset') '''prepare extract path'''