def write_12mers(mirname, mirseq, outfile):
    """Serialize every 12-mer generated around a miRNA's 8-mer site to a TFRecord file.

    The 8-mer site is the reverse complement of ``mirseq[1:8]`` followed by
    ``'A'``. ``generate_12mers`` must return exactly 262144 sequences.

    :param mirname: miRNA name, stored UTF-8 encoded under the ``'mir'`` key
    :param mirseq: miRNA sequence; one-hot encoded into every record
    :param outfile: path of the TFRecord file to write
    :raises ValueError: if the number of generated 12-mers is not 262144
    """
    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    candidates = generate_12mers(site8)
    if len(candidates) != 262144:
        raise ValueError("all_12mers should be 262144 in length")

    with tf.python_io.TFRecordWriter(outfile) as record_writer:
        for candidate in candidates:
            stype = utils.get_centered_stype(site8, candidate)
            # Sequences without a site get a very small keep probability.
            keep_prob = 0.001 if stype == 'no site' else 1.0

            features = tf.train.Features(feature={
                'mir': tf_utils._bytes_feature(mirname.encode('utf-8')),
                'mir_1hot': tf_utils._float_feature(utils.one_hot_encode(mirseq)),
                'seq_1hot': tf_utils._float_feature(utils.one_hot_encode(candidate)),
                'log_kd': tf_utils._float_feature([-0.0]),
                'keep_prob': tf_utils._float_feature([keep_prob]),
                'stype': tf_utils._bytes_feature(stype.encode('utf-8')),
            })
            serialized = tf.train.Example(features=features).SerializeToString()
            record_writer.write(serialized)
def load_npz(path, input_shape, label_shape, batch_size=16, onehot_encode=True):
    """Load an ``.npz`` dataset and expose it as endless train/test batch generators.

    The archive must contain arrays ``x`` and ``y``. Channel index 2 of the
    last axis of ``x`` is divided by 255. The first 80% of the samples form the
    training split and the remainder the test split.

    :param path: path to the ``.npz`` archive
    :param input_shape: unused; kept for interface compatibility
    :param label_shape: unused; kept for interface compatibility
    :param batch_size: number of samples per yielded batch
    :param onehot_encode: if True, labels are passed through ``one_hot_encode``
    :return: ``[train_generator, test_generator, n_train, n_test]``; each
        generator yields ``(data_batch, label_batch)`` forever, wrapping around
        to the start once the split is exhausted (the last batch of a pass may
        be shorter than ``batch_size``)
    """
    print("Loading data from {} ...".format(path))
    # Close the archive as soon as the arrays have been read into memory.
    with np.load(path) as ds:
        x, y = ds['x'], ds['y']
    # NOTE(review): if x has an integer dtype this assignment truncates the
    # scaled values — confirm x is stored as float.
    x[..., 2] = x[..., 2] / 255

    split_x = int(0.8 * x.shape[0])
    split_y = int(0.8 * y.shape[0])
    train_data, test_data = x[:split_x], x[split_x:]
    train_labels, test_labels = y[:split_y], y[split_y:]
    if onehot_encode:
        train_labels = one_hot_encode(train_labels)
        test_labels = one_hot_encode(test_labels)

    def generator(data, labels, batch_size=batch_size):
        ind = 0
        while True:
            yield data[ind:ind + batch_size], labels[ind:ind + batch_size]
            # Advance to the next batch; previously the index never moved, so
            # the first batch was yielded forever.
            ind += batch_size
            if ind >= data.shape[0]:
                ind = 0

    train_dataset = generator(train_data, train_labels, batch_size)
    test_dataset = generator(test_data, test_labels, batch_size)
    print("Dataset Loaded.")
    print("Train dataset input shape: {} label shape: {}".format(train_data.shape, train_labels.shape))
    print("Test dataset input shape: {} label shape: {}".format(test_data.shape, test_labels.shape))
    return [train_dataset, test_dataset, train_data.shape[0], test_data.shape[0]]
def backward(self, x: np.ndarray, labels: np.ndarray, hs: Dict, ps: Dict):
    """
    Backward pass: gradients of the loss w.r.t. w_hx, w_hh and w_hy.

    For every time step t the output error (p_t - label_t) is pushed back to the
    hidden layer and then propagated through all earlier steps k = t, ..., 0.
    The accumulated gradients are clipped element-wise to [-5, 5].

    :param x: array of input character indices (its length is the sequence length)
    :param labels: array of target character indices, same length as x
    :param hs: hidden states indexed by time step (first output of self.forward);
        an entry for index -1 is read when k == 0 — presumably the state before
        the first step, TODO confirm
    :param ps: network predictions indexed by time step (second output of self.forward)
    :return: gradients of w_hx, w_hh, w_hy
    """
    x_onehot = one_hot_encode(x, self.vocabulary_size)
    y_onehot = one_hot_encode(labels, self.vocabulary_size)

    grad_hx = np.zeros_like(self.w_hx)
    grad_hh = np.zeros_like(self.w_hh)
    grad_hy = np.zeros_like(self.w_hy)

    for t in range(len(x) - 1, -1, -1):
        # dl/dy_t = p_t - label_t
        out_err = ps[t] - y_onehot[t]
        # dl/dw_hy accumulates out_err * h_t^T
        grad_hy += np.dot(out_err, hs[t].T)
        # error reaching the hidden state, then the pre-activation z_t
        hidden_err = np.dot(self.w_hy.T, out_err)
        delta = hidden_err * self.f_prime(hs[t])

        # walk back through time: each step k contributes delta_k * h_{k-1}^T
        # to dw_hh and delta_k * x_k^T to dw_hx
        for k in range(t, -1, -1):
            prev_state = hs[k - 1]
            grad_hh += np.dot(delta, prev_state.T)
            grad_hx += np.dot(delta, x_onehot[k].T)
            # delta_{k-1} = (w_hh^T delta_k) * f'(h_{k-1})
            delta = np.dot(self.w_hh.T, delta) * self.f_prime(prev_state)

    # clip in place to mitigate exploding gradients
    for grad in (grad_hx, grad_hh, grad_hy):
        np.clip(grad, -5, 5, out=grad)

    return grad_hx, grad_hh, grad_hy
def fetch_batch(self, args, mode='train', sample_strategy='random', augment=True):
    """Sample a batch of episodes: image sequences with their encoded labels.

    For each batch element ``args.n_classes`` distinct classes are drawn from
    the selected split, a label sequence of length ``args.seq_length`` is built,
    and one sample per step is drawn from the matching class and passed through
    ``self.augment``.

    :param args: object providing ``n_classes``, ``batch_size``, ``seq_length``
        and ``label_type`` (``'one_hot'`` or ``'five_hot'``)
    :param mode: ``'train'`` (uses ``self.train_data``) or ``'test'``
        (uses ``self.test_data``)
    :param sample_strategy: ``'random'`` draws labels independently per step;
        ``'uniform'`` gives every class ``int(seq_length / n_classes)`` steps
        and shuffles them
    :param augment: when False, ``self.augment`` is called with ``only_resize=True``
    :return: ``(seq_pic, seq_encoded_shifted, seq_encoded)`` where
        ``seq_encoded_shifted`` is ``seq_encoded`` delayed by one step with a
        zero vector in the first position
    :raises ValueError: for an unsupported ``mode``, ``sample_strategy`` or
        ``args.label_type``
    """
    n_classes, batch_size, seq_length = args.n_classes, args.batch_size, args.seq_length

    # Validate the selectors up front; previously an unknown value surfaced
    # later as an UnboundLocalError on `data`, `seq` or `seq_encoded`.
    if mode not in ('train', 'test'):
        raise ValueError("Unsupported mode: {!r}".format(mode))
    if sample_strategy not in ('random', 'uniform'):
        raise ValueError("Unsupported sample_strategy: {!r}".format(sample_strategy))
    if args.label_type not in ('one_hot', 'five_hot'):
        raise ValueError("Unsupported label_type: {!r}".format(args.label_type))

    data = self.train_data if mode == 'train' else self.test_data

    classes = [
        np.random.choice(range(len(data)), replace=False, size=n_classes)
        for _ in range(batch_size)
    ]

    if sample_strategy == 'random':
        # number of samples per class may not be equal
        seq = np.random.randint(0, n_classes, [batch_size, seq_length])
    else:
        # number of samples per class is equal
        # NOTE(review): when seq_length is not a multiple of n_classes the rows
        # are shorter than seq_length — confirm callers guarantee divisibility.
        seq = np.array([
            np.concatenate([[j] * int(seq_length / n_classes) for j in range(n_classes)])
            for _ in range(batch_size)
        ])
        for i in range(batch_size):
            np.random.shuffle(seq[i, :])

    seq_pic = [[
        self.augment(
            data[classes[i][j]][np.random.randint(0, len(data[classes[i][j]]))],
            only_resize=not augment)
        for j in seq[i, :]
    ] for i in range(batch_size)]

    if args.label_type == 'one_hot':
        seq_encoded = one_hot_encode(seq, n_classes)
        seq_encoded_shifted = np.concatenate(
            [np.zeros(shape=[batch_size, 1, n_classes]), seq_encoded[:, :-1, :]],
            axis=1)
    else:
        # Each class gets a distinct code in [0, 5**5), written as base-5
        # digits and right-padded with zeros to five digits.
        label_dict = []
        for _ in range(batch_size):
            codes = np.random.choice(range(5**5), replace=False, size=n_classes)
            per_class = []
            for code in codes:
                digits = [int(ch) for ch in baseN(code, 5)]
                per_class.append(digits + [0] * (5 - len(digits)))
            label_dict.append(per_class)
        seq_encoded_ = np.array([[label_dict[b][i] for i in seq[b]]
                                 for b in range(batch_size)])
        seq_encoded = np.reshape(one_hot_encode(seq_encoded_, dim=5),
                                 newshape=[batch_size, seq_length, -1])
        seq_encoded_shifted = np.concatenate(
            [np.zeros(shape=[batch_size, 1, 25]), seq_encoded[:, :-1, :]],
            axis=1)

    return seq_pic, seq_encoded_shifted, seq_encoded
def obtain_data(data_dir, namefile, batch_s):
    """Read a headerless CSV and wrap it in a PyTorch ``DataLoader``.

    Columns 0-33 are the targets, column 34 the sequence length and columns
    34-68 the input tokens. Inputs are one-hot encoded with a dictionary size
    of 34 and a sequence length of 35.

    :param data_dir: directory containing the CSV file
    :param namefile: CSV file name inside ``data_dir``
    :param batch_s: batch size handed to the ``DataLoader``
    :return: ``DataLoader`` over a ``TensorDataset`` of ``(x, y, length)``
    """
    # Load the training data.
    train_sample = pd.read_csv(os.path.join(data_dir, namefile), header=None, names=None)
    print('Loaded csv')

    train_sample_y = train_sample[train_sample.columns[0:34]]
    train_sample_len = train_sample[train_sample.columns[34]]
    # NOTE(review): column 34 is read both as the length and as the first input
    # column — confirm this overlap is intended.
    train_sample_X = train_sample[train_sample.columns[34:69]]
    # Arguments now follow the order of the labels in the message (the Y and
    # len shapes used to be swapped).
    print('Size read from csv -> X: {}, Y: {}, len: {}'.format(
        train_sample_X.shape, train_sample_y.shape, train_sample_len.shape))

    X_np = train_sample_X.to_numpy(copy=True)
    len_np = train_sample_len.to_numpy(copy=True)
    Y_np = train_sample_y.to_numpy(copy=True)
    print('To numpied')

    dict_size = 34
    seq_len = 35
    batch_size = len(train_sample_X)
    input_seq = one_hot_encode(X_np, dict_size, seq_len, batch_size)
    print('One hot encoded')

    # NOTE(review): squeeze() also drops the batch axis when the CSV holds a
    # single row — confirm that case cannot occur.
    train_torch_x = torch.from_numpy(input_seq).float().squeeze()
    train_torch_len = torch.from_numpy(len_np).float().squeeze().type(torch.long)
    train_torch_y = torch.from_numpy(Y_np).float().squeeze().type(torch.long)
    print('Torched')

    train_sample_ds = torch.utils.data.TensorDataset(train_torch_x, train_torch_y, train_torch_len)
    train_loader = torch.utils.data.DataLoader(train_sample_ds, batch_size=batch_s)
    print('Train loaded')

    return train_loader
def sample_points(self, n_sample=100, temp=1.0, prime_text="^", maxlen=100):
    """Sample ``n_sample`` strings from the model and keep the valid molecules.

    Each string starts from ``prime_text`` and grows one sampled character at a
    time until it ends with ``'$'`` or reaches ``maxlen`` characters. Strings
    accepted by ``is_valid_mol`` are printed and collected.

    :param n_sample: number of strings to sample
    :param temp: sampling temperature passed to ``transform_temp``
    :param prime_text: seed characters; each must be a key of ``self.token_indices``
    :param maxlen: maximum string length, seed included
    :return: list of the values returned by ``is_valid_mol`` for valid samples
    """
    print("\n\n----- SAMPLING POINTS AT TEMP %.2f -----" % temp)
    accepted = []
    for _ in range(n_sample):
        smiles = ""
        token_ids = []
        # seed the string and its token indices from the prime text
        for ch in prime_text:
            smiles += ch
            token_ids.append(self.token_indices[ch])

        # keep sampling until the end token appears or maxlen is reached
        while not (smiles[-1] == '$' or len(smiles) >= maxlen):
            encoded = one_hot_encode([token_ids], self.n_chars)
            preds = self.model.predict(encoded, verbose=0)[0]
            next_id = transform_temp(preds[-1, :], temp)
            smiles += self.indices_token[str(next_id)]
            token_ids.append(next_id)

        is_valid, canonical = is_valid_mol(smiles, True)
        if is_valid:
            print(canonical)
            accepted.append(canonical)
    return accepted
def __getitem__(self, index):
    """Return ``(image, label, target)`` for item ``index`` of the active split.

    The split is chosen by ``self.mode`` (case-insensitive): ``train``,
    ``valid`` or ``test``. ``target`` is the one-hot encoding of the
    (optionally transformed) label as a ``torch.FloatTensor``.

    :raises RuntimeError: if ``self.mode`` is not one of the supported splits
    """
    split = self.mode.lower()
    if split not in ('train', 'valid', 'test'):
        raise RuntimeError(
            'Unexpected dataset mode. Supported modes are: train, valid and test'
        )
    # the attributes are named <split>_data / <split>_labels
    data_path = getattr(self, split + '_data')[index]
    label_path = getattr(self, split + '_labels')[index]

    image, label = utils.pil_loader(data_path, label_path)

    if self.data_transform is not None:
        image = self.data_transform(image)
    if self.label_transform is not None:
        label = self.label_transform(label)

    # one-hot encode the label after any label transform
    target = torch.FloatTensor(utils.one_hot_encode(label))
    return image, label, target
def train_model(model, vocab, train_dl, learning_rate=0.003, num_epochs=5):
    """Train ``model`` with Adam and cross-entropy loss, checkpointing as it goes.

    Each batch must provide ``'sequence'``, ``'length'`` and ``'function'``
    entries. Sequences are padded, one-hot encoded against ``vocab`` and cast
    to float. The loss is logged every 100 steps, the state dict is saved every
    1000 steps and once more after the last epoch.

    :param model: module called as ``model(sequences, lengths)``
    :param vocab: vocabulary passed to ``one_hot_encode``
    :param train_dl: iterable of training batches supporting ``len()``
    :param learning_rate: Adam learning rate
    :param num_epochs: number of passes over ``train_dl``
    :return: the trained model
    """
    def _save_checkpoint():
        # file name carries a timestamp so checkpoints do not overwrite each other
        stamp = time.strftime("%Y%m%d-%H%M%S")
        torch.save(model.state_dict(), models_dir + 'model_{}.ckpt'.format(stamp))

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    total_step = len(train_dl)

    for epoch in range(num_epochs):
        for step, batch in enumerate(train_dl, start=1):
            sequences = batch['sequence']
            lengths = batch['length']
            functions = batch['function']

            sequences = pad_char_sequences(sequences)
            sequences = one_hot_encode(sequences, vocab).float()

            sequences = sequences.to(device)
            lengths = lengths.cpu()
            functions = functions.to(device)

            output = model(sequences, lengths)
            loss = criterion(output, functions)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch + 1, num_epochs, step, total_step, loss.item()))
            if step % 1000 == 0:
                _save_checkpoint()
                print("Saved model state to disk")

    _save_checkpoint()
    return model
def __init__(self, data_type='CT', train_ratio=None, fold_k=None, norm=None,
             expand_dim=None, seed=233):
    """Load a dataset archive, one-hot encode its labels and split train/test.

    The archive at ``datapaths[data_type]`` must hold ``x_train``, ``x_test``,
    ``y_train`` and ``y_test``; train and test parts are concatenated before
    ``self.divide_train_test`` is called.

    :param data_type: key into ``datapaths``
    :param train_ratio: forwarded to ``self.divide_train_test``
    :param fold_k: forwarded to ``self.divide_train_test``
    :param norm: if not None, ``self.data`` is divided by this value
    :param expand_dim: stored on the instance as ``self.expand_dim``
    :param seed: stored on the instance as ``self.seed``
    """
    self.seed = seed

    archive = np.load(datapaths[data_type])
    self.data = np.concatenate([archive["x_train"], archive["x_test"]])
    self.labels = np.concatenate([archive["y_train"], archive["y_test"]])
    # number of classes is inferred from the largest label value
    self.class_num = np.max(self.labels) + 1
    self.labels = one_hot_encode(self.labels, self.class_num)
    del archive

    self.divide_train_test(train_ratio, fold_k)

    if norm is not None:
        self.data = self.data / norm  # [0,1]
    # keep only the first two entries along the third axis
    self.data = self.data[:, :, :2]

    self.train_cur_pos = 0
    self.test_cur_pos = 0
    self.expand_dim = expand_dim
def test_model(model, vocab, test_ds, test_dl) -> None:
    """Evaluate ``model`` on ``test_dl`` and print its accuracy.

    Each batch must provide ``'sequence'``, ``'length'`` and ``'function'``
    entries. Inputs are prepared the same way as in ``train_model``: padded,
    one-hot encoded against ``vocab`` and cast to float.

    :param model: module called as ``model(sequences, lengths)``
    :param vocab: vocabulary passed to ``one_hot_encode``
    :param test_ds: test dataset; only its length is used, in the report
    :param test_dl: iterable of test batches
    :return: None
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dl:
            sequences, lengths, functions = batch['sequence'], batch['length'], batch['function']
            sequences = pad_char_sequences(sequences)
            # cast to float so evaluation feeds the model the same dtype as training
            sequences = one_hot_encode(sequences, vocab).float()
            sequences, lengths, functions = sequences.to(device), lengths.cpu(), functions.to(device)
            output = model(sequences, lengths)
            _, predicted = torch.max(output, 1)
            total += len(functions)
            correct += (predicted == functions).sum().item()

    # An empty loader used to raise ZeroDivisionError; report NaN instead.
    accuracy = 100 * correct / total if total else float('nan')
    print('Test Accuracy of the model on the {} test sequences: {} %'.format(
        len(test_ds), accuracy))
def __init__(self, data, logit, dequantize, rng):
    """Store inputs and labels, optionally dequantizing and logit-transforming.

    :param data: indexable pair; ``data[0]`` holds the inputs, ``data[1]`` the
        numeric labels
    :param logit: if truthy, apply ``self._logit_transform`` to the inputs
    :param dequantize: if truthy, apply ``self._dequantize`` first
    :param rng: random generator forwarded to ``self._dequantize``
    """
    inputs = data[0]
    if dequantize:
        inputs = self._dequantize(inputs, rng)  # dequantize pixels
    if logit:
        inputs = self._logit_transform(inputs)  # logit
    self.x = inputs

    self.labels = data[1]  # numeric labels
    self.y = utils.one_hot_encode(self.labels, 10)  # 1-hot encoded labels
    self.N = self.x.shape[0]  # number of datapoints
def train_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a two-hidden-layer MLP classifier.

    Labels are one-hot encoded into 10 classes. The network is
    784 -> 300 (relu) -> 100 (relu) -> 10 (softmax), trained with RMSprop and
    categorical cross-entropy for 5 epochs with a batch size of 20.

    :return: ``(model, metrics)`` where ``metrics`` is ``{"acc": test_accuracy}``
    """
    # Hyper-parameters; these could be moved to a separate configuration file.
    n_inputs = 28 * 28
    n_h1, n_h2 = 300, 100
    n_outputs = 10
    n_epochs = 5
    batch_size = 20
    learning_rate = 0.001

    y_train = one_hot_encode(y_train, n_outputs)
    y_test = one_hot_encode(y_test, n_outputs)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

    # Build a simple MLP: two hidden layers followed by the output layer.
    model = Sequential()
    layers = (
        Dense(n_h1, activation='relu', input_shape=(n_inputs, )),
        Dense(n_h2, activation='relu'),
        Dense(n_outputs, activation='softmax'),
    )
    for layer in layers:
        model.add(layer)

    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=learning_rate),
                  metrics=['accuracy'])
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=n_epochs,
              verbose=2,
              validation_data=(X_test, y_test))

    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)[:2]
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model, {"acc": test_acc}
def pos_neg(motif_name, seq_length, num_seq, min_counts, max_counts, GC_fraction, central_bp=None):
    """Build one-hot encoded training/validation sets of motif-positive and motif-free sequences.

    Positive sequences come from ``motif_density`` with the given motif counts;
    negative sequences come from ``motif_density`` with zero motif counts and a
    GC fraction of 0.4. Each set is shuffled and split 30% validation / 70%
    training. Data and labels are both ordered negatives first, then positives
    (label 0 = negative, 1 = positive).

    :param motif_name: motif passed to ``motif_density``
    :param seq_length: length of each generated sequence
    :param num_seq: number of sequences per set
    :param min_counts: minimum motif count for the positive set
    :param max_counts: maximum motif count for the positive set
    :param GC_fraction: GC fraction for the positive set
    :param central_bp: forwarded to ``motif_density`` for the positive set
    :return: ``(training_labels, training_set, validation_labels, validation_set,
        positive_positions_arr, positive_motif_name_arr,
        negative_positions_arr, negative_motif_name_arr)``
    """
    def _split(sequences):
        # shuffle in place, then use the first 30% for validation
        random.shuffle(sequences)
        thresh = int(0.3 * (len(sequences)))
        return sequences[0:thresh], sequences[thresh:]

    # Positive examples. central_bp is now forwarded instead of being
    # overridden with None.
    positive_set, _, positive_positions_arr, positive_motif_name_arr = motif_density(
        motif_name, seq_length, num_seq, min_counts, max_counts, GC_fraction,
        central_bp=central_bp)
    validation_positive_set, training_positive_set = _split(positive_set)

    # Negative examples: no motif instances at all.
    negative_set, _, negative_positions_arr, negative_motif_name_arr = motif_density(
        motif_name, seq_length, num_seq, min_counts=0, max_counts=0,
        GC_fraction=.4, central_bp=None)
    validation_negative_set, training_negative_set = _split(negative_set)

    validation_set = np.concatenate((validation_negative_set, validation_positive_set), axis=0)
    training_set = np.concatenate((training_negative_set, training_positive_set), axis=0)

    # Labels are concatenated in the same order as the data (negatives first).
    # They used to be positives first, which attached each label to a sequence
    # of the opposite class.
    training_labels = np.concatenate(
        (np.zeros((len(training_negative_set), 1)),
         np.ones((len(training_positive_set), 1))), axis=0)
    validation_labels = np.concatenate(
        (np.zeros((len(validation_negative_set), 1)),
         np.ones((len(validation_positive_set), 1))), axis=0)

    training_set = one_hot_encode(np.array(list(training_set)))
    validation_set = one_hot_encode(np.array(list(validation_set)))

    return (training_labels, training_set, validation_labels, validation_set,
            positive_positions_arr, positive_motif_name_arr,
            negative_positions_arr, negative_motif_name_arr)
def generate_xdy(self, indexes):
    """Build one-hot input and target windows for a batch of SMILES strings.

    Every selected string is cut into windows of ``self.window`` characters
    taken every ``self.step`` characters; the target of a window is the same
    window shifted one character to the right.

    :param indexes: indices into ``self.smiles``
    :return: ``(x, y)`` arrays of one-hot encoded input and target windows
    """
    encoded_inputs = []
    encoded_targets = []
    for idx in indexes:
        smi = self.smiles[idx]

        # window start offsets for this string
        starts = range(0, len(smi) - self.window, self.step)
        windows = [smi[i:i + self.window] for i in starts]
        shifted = [smi[(i + 1):(i + self.window + 1)] for i in starts]

        # tokenize both sets of windows, then one-hot encode them
        input_tokens = tokenize_molecules(windows, self.t2i)
        target_tokens = tokenize_molecules(shifted, self.t2i)
        encoded_inputs.extend(one_hot_encode(input_tokens, len(self.t2i)).tolist())
        encoded_targets.extend(one_hot_encode(target_tokens, len(self.t2i)).tolist())

    return np.array(encoded_inputs), np.array(encoded_targets)
def __getitem__(self, idx: int):
    """Return a fixed-length audio clip and its label vector for row ``idx``.

    The waveform is loaded from ``self.datadir / ebird_code / <name>.npy``.
    Recordings longer than ``self.period`` seconds are cropped at a random
    whole-second offset; shorter ones are tiled 15 times and truncated. The
    clip must contain exactly ``self.sample_rate * self.period`` samples.

    :param idx: row position in ``self.df``
    :return: ``{"image": clip, "targets": labels}``
    :raises ValueError: if the clip does not have the expected length
    """
    warnings.filterwarnings("ignore")

    sample = self.df.iloc[idx, :]
    # waveforms are stored as .npy files named after the original mp3/wav
    wav_name = sample["filename"].replace("mp3", "npy").replace("wav", "npy")
    ebird_code = sample["ebird_code"]
    duration = sample["duration"]
    wav_path = self.datadir / ebird_code / wav_name

    effective_length = self.sample_rate * self.period
    # Bound before the try block so the error report below cannot itself fail
    # with an unbound name when loading raises.
    y = None
    try:
        if duration > self.period:
            offset = int(np.random.rand() * (duration - self.period - 1))
            y = np.load(wav_path)
            y = y[offset * self.sample_rate:(offset + self.period) * self.sample_rate]
        else:
            y = np.load(wav_path)
            y = np.tile(y, 15)  # the shortest rec in the train set is 0.39 sec
            y = y[:effective_length]

        # Compare against the configured clip length rather than a hard-coded
        # 160000 samples.
        if len(y) != effective_length:
            raise ValueError("expected {} samples, got {}".format(effective_length, len(y)))

        if self.composer:
            y = self.composer(y)
    except Exception:
        # report which recording failed, then propagate the original error
        print(wav_path)
        print(duration)
        print(len(y) if y is not None else None)
        raise

    labels = utils.one_hot_encode(ebird_code)
    if self.secondary_label is not None:
        labels = utils.add_secondary_label(labels,
                                           wav_name.replace("npy", "mp3"),
                                           self.secondary_label)

    return {"image": y, "targets": labels}
def gen():
    """Endlessly yield batches of three miRNA/target pairs labelled ``'no site'``.

    Every batch holds, in order: a random miRNA with a target from
    ``utils.get_target_no_match``; a training miRNA with a target from
    ``utils.get_target_no_match``; and a target built around the reverse
    complement of a training miRNA's ``guide_seq[1:7]`` paired with a miRNA
    from ``utils.get_mir_no_match``.

    :return: generator of ``(names, images, labels, site_types)`` where images
        are outer products of the one-hot miRNA and target encodings and every
        label is ``0.0``
    """
    def _pair_image(mirseq, target):
        return np.outer(utils.one_hot_encode(mirseq), utils.one_hot_encode(target))

    while True:
        # 1) fully random miRNA with a non-matching target
        rand_mirseq = utils.generate_random_seq(options.MIRLEN)
        rand_target = utils.get_target_no_match(rand_mirseq, SEQLEN)
        rand_image = _pair_image(rand_mirseq, rand_target)

        # 2) training miRNA with a non-matching target
        first_mir = np.random.choice(TRAIN_MIRS_KDS)
        first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
        first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
        first_image = _pair_image(first_mirseq, first_target)

        # 3) target derived from a training miRNA, paired with a
        #    non-matching miRNA sequence
        second_mir = np.random.choice(TRAIN_MIRS_KDS)
        left_flank = utils.generate_random_seq(3)
        core = utils.rev_comp(MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
        right_flank = utils.generate_random_seq(3)
        second_target = left_flank + core + right_flank
        second_mirseq = utils.get_mir_no_match(second_target, options.MIRLEN)
        second_image = _pair_image(second_mirseq, second_target)

        names = np.array([
            b'random', first_mir.encode('utf-8'), second_mir.encode('utf-8')
        ])
        images = np.stack([rand_image, first_image, second_image])
        labels = np.array([[0.0], [0.0], [0.0]])
        site_types = np.array([b'no site', b'no site', b'no site'])
        yield names, images, labels, site_types
def predict_probs(model, hidden, character, vocab, device):
    """Run the model on a character sequence and return next-step probabilities.

    :param model: module called as ``model(inputs, hidden)`` returning ``(out, hidden)``
    :param hidden: hidden state passed to the model
    :param character: iterable of characters, each a key of ``vocab``
    :param vocab: mapping from character to integer index
    :param device: device the encoded input is moved to
    :return: ``(prob, hidden)`` where ``prob`` is the softmax of the last row of
        the model output and ``hidden`` is the updated hidden state
    """
    # map characters to indices (batch of one) and one-hot encode them
    token_ids = np.array([[vocab[ch] for ch in character]])
    encoded = one_hot_encode(token_ids, len(vocab))
    inputs = torch.from_numpy(encoded).to(device)

    with torch.no_grad():
        out, hidden = model(inputs, hidden)
        last_step = out[-1]
        prob = nn.functional.softmax(last_step, dim=0).data

    return prob, hidden
def train(model, data_loader, optimizer, epoch):
    """Run one training epoch of the CapsuleNet model.

    Stops early as soon as a batch loss falls below ``args.loss_threshold``.

    :param model: the CapsuleNet model (unwrapped via ``.module`` when
        ``args.cuda`` is set)
    :param data_loader: iterator over ``(data, target)`` batches
    :param optimizer: optimization algorithm
    :param epoch: current epoch number, used only for logging
    :return: loss of the last processed batch, or None if there were no batches
    """
    print('===> Training mode')

    # Switch to train mode
    model.train()

    use_cuda = args.cuda
    if use_cuda:
        # the model is wrapped in DataParallel for multi-GPU runs
        model = model.module

    last_loss = None
    for batch_idx, (data, target) in enumerate(data_loader):
        one_hot_target = utils.one_hot_encode(target, length=args.num_classes)
        data = Variable(data)
        target = Variable(one_hot_target)

        if use_cuda:
            data = data.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        output = model(data)  # output from DigitCaps (out_digit_caps)
        # data is passed in for the image reconstruction term
        loss = model.loss(data, output, target)
        loss.backward()
        last_loss = loss.data[0]
        optimizer.step()

        if batch_idx % args.log_interval == 0:
            seen = batch_idx * len(data)
            percent = 100. * batch_idx / len(data_loader)
            mesg = 'Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, seen, len(data_loader.dataset), percent, loss.data[0])
            print(mesg)

        if last_loss < args.loss_threshold:
            # Stop training early
            break

    return last_loss
def compute_eval_loss_pred(self, query_edge_losses, query_node_accs, all_label_in_edge,
                           point_similarities, query_edge_mask, evaluation_mask,
                           num_supports, support_label, query_label):
    """
    Compute the query edge loss and query classification accuracy, and append
    both to the given containers.

    Only the last entry of ``point_similarities`` is evaluated.

    :param query_edge_losses: container collecting the query edge losses
    :param query_node_accs: container collecting the query accuracies
    :param all_label_in_edge: ground truth label in edge form of point graph
    :param point_similarities: prediction edges of point graph
    :param query_edge_mask: mask for queries
    :param evaluation_mask: mask for evaluation (for unsupervised setting)
    :param num_supports: number of samples in support set
    :param support_label: label of support set
    :param query_label: label of query set
    :return: ``(query_node_accs, query_edge_losses)`` — accuracies first
    """
    last_similarity = point_similarities[-1]
    edge_loss = self.edge_loss(1 - last_similarity, 1 - all_label_in_edge)

    # normalisers: masked positive and negative query edges
    pos_mask = query_edge_mask * all_label_in_edge * evaluation_mask
    neg_mask = query_edge_mask * (1 - all_label_in_edge) * evaluation_mask

    pos_loss = torch.sum(
        edge_loss * query_edge_mask * all_label_in_edge * evaluation_mask
    ) / torch.sum(pos_mask)
    neg_loss = torch.sum(
        edge_loss * query_edge_mask * (1 - all_label_in_edge) * evaluation_mask
    ) / torch.sum(neg_mask)

    # weighted loss for balancing pos/neg
    query_edge_loss = pos_loss + neg_loss

    # prediction: query-to-support similarities times one-hot support labels
    query_to_support = last_similarity[:, num_supports:, :num_supports]
    support_one_hot = one_hot_encode(self.eval_opt['num_ways'],
                                     support_label.long(),
                                     self.arg.device)
    query_node_pred = torch.bmm(query_to_support, support_one_hot)

    # test accuracy
    predicted_class = torch.max(query_node_pred, -1)[1]
    query_node_acc = torch.eq(predicted_class, query_label.long()).float().mean()

    query_edge_losses += [query_edge_loss.item()]
    query_node_accs += [query_node_acc.item()]

    return query_node_accs, query_edge_losses
def __init__(self, x, l, logit, flip, dequantize, rng):
    """Store pixel data split into three colour components, plus labels.

    The columns of ``x`` are treated as three equal blocks (red, green, blue).
    Optional steps are applied in order: dequantize, logit transform, flip
    augmentation. With ``flip`` the labels are duplicated to match.

    :param x: 2-D array of pixel values
    :param l: numeric labels
    :param logit: if truthy, apply ``self._logit_transform``
    :param flip: if truthy, apply ``self._flip_augmentation``
    :param dequantize: if truthy, apply ``self._dequantize``
    :param rng: random generator forwarded to ``self._dequantize``
    """
    n_pixels = x.shape[1] // 3  # number of pixels per colour component

    if dequantize:
        x = self._dequantize(x, rng)
    if logit:
        x = self._logit_transform(x)
    if flip:
        x = self._flip_augmentation(x)

    self.x = x  # pixel values
    self.r = self.x[:, :n_pixels]  # red component
    self.g = self.x[:, n_pixels:2 * n_pixels]  # green component
    self.b = self.x[:, 2 * n_pixels:]  # blue component

    if flip:
        self.labels = np.hstack([l, l])  # numeric labels, duplicated for flips
    else:
        self.labels = l  # numeric labels
    self.y = utils.one_hot_encode(self.labels, 10)  # 1-hot encoded labels
    self.N = self.x.shape[0]  # number of datapoints
def sample(self, temp=1.0, prime_text="^", maxlen=100):
    """Sample one string from the model, starting from ``prime_text``.

    Characters are appended one at a time until the string ends with ``'$'``
    or reaches ``maxlen`` characters.

    :param temp: sampling temperature passed to ``transform_temp``
    :param prime_text: seed characters; each must be a key of ``self.token_indices``
    :param maxlen: maximum string length, seed included
    :return: the generated string
    """
    text = ""
    token_ids = []
    # seed the string and its token indices from the prime text
    for ch in prime_text:
        text += ch
        token_ids.append(self.token_indices[ch])

    while not (text[-1] == '$' or len(text) >= maxlen):
        encoded = one_hot_encode([token_ids], self.n_chars)
        preds = self.model.predict(encoded, verbose=0)[0]
        next_id = transform_temp(preds[-1, :], temp)
        text += self.indices_token[str(next_id)]
        token_ids.append(next_id)

    return text
def process_data():
    """Load the flower dataset, split it 80/10/10 and save the splits as ``.npy`` files.

    Images and labels are shuffled, the labels are one-hot encoded, and the
    train/valid/test images and labels are written to ``data/flower_npy``.
    The first image is checked to be scaled to at most 1.
    """
    images, labels, _ = flower_photos_data.load_flower_datasets()
    assert np.max(images[0]) <= 1, 'The image should be scaled to 0-1'

    images, labels = utils.shuffle_data(images, labels)
    labels_onehot = utils.one_hot_encode(labels)
    train_images, train_labels, valid_images, valid_labels, test_images, test_labels = \
        utils.split_data(images, labels_onehot, train_size=0.8, valid_size=0.1, test_size=0.1)

    out_dir = 'data/flower_npy'
    # makedirs with exist_ok: re-running no longer fails because the directory
    # already exists, and a missing parent 'data' directory is created too.
    os.makedirs(out_dir, exist_ok=True)

    splits = (
        ('train_images', train_images),
        ('train_labels', train_labels),
        ('valid_images', valid_images),
        ('valid_labels', valid_labels),
        ('test_images', test_images),
        ('test_labels', test_labels),
    )
    for name, array in splits:
        np.save('{}/{}.npy'.format(out_dir, name), array)
def __init__(self, path, target, cat_preproc_type='one-hot', columns=None,
             drop=None, transforms=None):
    '''
    Load a CSV dataset, separate the target and encode categorical columns.

    :param path: path to dataframe
    :param target: target column name
    :param cat_preproc_type: type of preprocessing for categorical data:
        'no-preproc', 'one-hot', 'binary', 'backward'
    :param columns: list of new column names or None, values of target and
        drop args have to be in this list
    :param drop: column name(s) to remove from the features, or None
    :param transforms: class Compose or Transform or None; ignored (with a
        printed notice) when cat_preproc_type is 'no-preproc'
    :raises ValueError: if cat_preproc_type is not one of the supported values
    '''
    self.path = path

    # First column is read as the index; if an 'index' column is present the
    # file is re-read without index_col and that column is dropped instead.
    frame = pd.read_csv(path, index_col=0)
    if 'index' in frame.columns:
        frame = pd.read_csv(path).drop(columns=['index'])
    self.data = frame

    if columns is not None:
        self.data.columns = columns

    self.y = self.data[target]
    self.data = self.data.drop(columns=[target])
    if drop is not None:
        self.data = self.data.drop(columns=drop)

    raw_features = cat_preproc_type == 'no-preproc'
    if raw_features:
        self.X = self.data.values
    else:
        self.cat_data = self.data.select_dtypes(include=['object']).copy()
        if cat_preproc_type == 'one-hot':
            self.cat_data = one_hot_encode(self.cat_data)
        elif cat_preproc_type == 'binary':
            self.cat_data = binary_encode(self.cat_data)
        elif cat_preproc_type == 'backward':
            self.cat_data = backward_encode(self.cat_data)
        else:
            raise ValueError(
                "Categorical preprocessing type is not valid.")
        # encoded categoricals first, then the int64/float64 columns
        numeric = self.data.select_dtypes(include=['int64', 'float64'])
        self.X = self.cat_data.join(numeric)

    if transforms is not None and raw_features:
        print('Transforming is impossible when "no-preproc"')
        self.transforms = None
    else:
        self.transforms = transforms
def _process(self, image, mask):
    """One-hot encode the mask, then run the optional augmentation and preprocessing.

    :param image: input image
    :param mask: mask, encoded against ``self.class_rgb_values``
    :return: ``(image, mask)`` after all configured stages
    """
    # one-hot-encode the mask
    mask = one_hot_encode(mask, self.class_rgb_values).astype('float')

    # augmentation first, preprocessing second; each stage is optional and is
    # called with image/mask keywords, returning a mapping with the same keys
    if self.augmentation:
        augmented = self.augmentation(image=image, mask=mask)
        image = augmented['image']
        mask = augmented['mask']

    if self.preprocessing:
        preprocessed = self.preprocessing(image=image, mask=mask)
        image = preprocessed['image']
        mask = preprocessed['mask']

    return image, mask
def gen():
    """Endlessly yield batches of three synthetic miRNA/target pairs tagged ``'extra'``.

    Every batch holds, in order:
      1. a training miRNA with a target from ``utils.get_target_no_match``,
         label 2.0;
      2. a target built around the reverse complement of a training miRNA's
         ``guide_seq[1:7]`` paired with a miRNA from
         ``utils.get_mir_no_match``, label 2.0;
      3. a random miRNA with a target made of the reverse complement of
         ``mirseq[1:8]`` plus ``'A'`` between two 2-nt flanks; the label is -5
         plus a per-nucleotide flank contribution.

    :return: generator of ``(names, images, labels, site_types)``
    """
    # contribution of each flanking nucleotide to the label of the third pair
    flank_vals = {
        'A': -0.34923908,
        'T': -0.24840472,
        'C': 0.12640774,
        'G': 0.47123606
    }

    def _pair_image(mirseq, target):
        return np.outer(utils.one_hot_encode(mirseq), utils.one_hot_encode(target))

    while True:
        # 1) one of the RBNS miRNAs with a target that has no pairing
        first_mir = np.random.choice(TRAIN_MIRS_KDS)
        first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
        first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
        first_image = _pair_image(first_mirseq, first_target)

        # 2) target derived from an RBNS miRNA, paired with a non-matching miRNA
        second_mir = np.random.choice(TRAIN_MIRS_KDS)
        left = utils.generate_random_seq(3)
        core = utils.rev_comp(MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
        right = utils.generate_random_seq(3)
        second_target = left + core + right
        second_mirseq = utils.get_mir_no_match(second_target, options.MIRLEN)
        second_image = _pair_image(second_mirseq, second_target)

        # 3) random miRNA with a flanked 8mer target
        rand_mirseq = utils.generate_random_seq(options.MIRLEN)
        up_flank = utils.generate_random_seq(2)
        down_flank = utils.generate_random_seq(2)
        rand_target = up_flank + utils.rev_comp(rand_mirseq[1:8]) + 'A' + down_flank
        rand_image = _pair_image(rand_mirseq, rand_target)

        flanks = up_flank + down_flank
        rand_label = -5
        for nt, val in flank_vals.items():
            rand_label += val * flanks.count(nt)

        names = np.array([
            first_mir.encode('utf-8'), second_mir.encode('utf-8'), b'random'
        ])
        images = np.stack([first_image, second_image, rand_image])
        labels = np.array([[2.0], [2.0], [rand_label]])
        site_types = np.array([b'extra', b'extra', b'extra'])
        yield names, images, labels, site_types
def predict(body):
    """Encode a single message and send it to the SageMaker endpoint.

    The message is one-hot encoded and vectorized against
    ``vocabulary_length``, posted as JSON to ``ENDPOINT_NAME``, and the
    endpoint's JSON reply is returned decoded.

    :param body: the message to classify
    :return: the decoded JSON response of the endpoint
    """
    client = boto3.client('sagemaker-runtime')

    messages = [body]
    one_hot = one_hot_encode(messages, vocabulary_length)
    vectorized = vectorize_sequences(one_hot, vocabulary_length)
    payload = json.dumps(vectorized.tolist())

    response = client.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                      Body=payload,
                                      ContentType='application/json')

    raw_reply = response['Body'].read().decode("utf-8")
    return json.loads(raw_reply)
def preprocess_and_save(batch_id):
    """Normalize, one-hot encode and split one CIFAR-10 batch, then save it.

    The batch is split 80/10/10 into train/valid/test and stored as a single
    dictionary under ``folder_path`` in ``preprocess_batch_<batch_id>``.

    :param batch_id: identifier of the batch to load and of the output file
    """
    images, labels = load_cifar10_batch(batch_id)
    images = utils.normalize_data(images)
    labels = utils.one_hot_encode(labels, 10)

    train_x, train_y, valid_x, valid_y, test_x, test_y = utils.split_data(
        images, labels, train_size=0.8, valid_size=0.1, test_size=0.1)

    batch = dict(
        train_images=train_x,
        train_labels=train_y,
        valid_images=valid_x,
        valid_labels=valid_y,
        test_images=test_x,
        test_labels=test_y,
    )

    # the dictionary is wrapped in an array so np.save can store it
    target = os.path.join(folder_path, 'preprocess_batch_' + str(batch_id))
    np.save(target, np.asarray(batch))
def gen_new_train(param):
    """Load the training set, augment and pre-process it, and one-hot encode labels.

    :param param: augmentation settings forwarded to ``utils.augment_data``
    :return: ``(X_train, y_train, oh_y_train)`` — float32 pre-processed inputs,
        the augmented labels, and their one-hot encoding
    """
    # load data
    inputs, targets = utils.load_data('./data/train.p')

    # data augmentation
    inputs, targets = utils.augment_data(inputs, targets, param)

    # pre-process every sample individually
    processed = []
    for i in range(len(inputs)):
        processed.append(utils.pre_process(inputs[i]))
    inputs = np.array(processed, dtype=np.float32)

    # one hot
    return inputs, targets, utils.one_hot_encode(targets)
def forward(self, x: Tensor, update_state: bool) -> Tuple[Tensor, Tensor]:
    """
    Forward pass through the network:

        z_{t} = w_hh * h_{t-1} + w_hx * x_{t}
        h_{t} = f(z_{t})
        y_{t} = w_hy * h_{t}
        log p_{t} = log_softmax(y_{t})

    The first step starts from ``self.current_state``.

    :param x: the array of integers, where each item is the index of character,
        the size of array will be the sequence length
    :param update_state: bool, if True replaces the current state with the last
        computed state
    :return: the tuple ``(states, log_probabilities)``
        states - tensor of hidden states, size = (sequence length, hidden size)
        log_probabilities - tensor of log-probabilities for each character in
        vocabulary, size = (sequence length, vocabulary size)
    """
    seq_len = len(x)

    # one hot encoding of input
    one_hot_inputs = one_hot_encode(x, self.vocabulary_size, self.dtype)

    log_probs = torch.zeros(seq_len, self.vocabulary_size, dtype=self.dtype)
    states = torch.zeros(seq_len, self.hidden_size, dtype=self.dtype)

    for t in range(seq_len):
        # state at t - 1: the stored state for the first step, otherwise the
        # previously computed one
        if t == 0:
            prev_state = self.current_state.clone()
        else:
            prev_state = states[t - 1].clone()

        # state at t
        recurrent_term = torch.matmul(self.w_hh, prev_state)
        input_term = torch.matmul(self.w_hx, one_hot_inputs[t])
        state = self.f(recurrent_term + input_term)

        # log probabilities for the next character
        step_log_probs = F.log_softmax(torch.matmul(self.w_hy, state), dim=0)

        # record hidden state and log probabilities for this step
        states[t], log_probs[t] = state, step_log_probs

    if update_state:
        self.current_state = states[-1].clone()  # updating the current state

    return states, log_probs
def __init__(self, train_ratio=None, fold_k=None, norm=False, expand_dim=False, seed=233):
    """Load MNIST from ``data/mnist/mnist.npz``, one-hot encode labels and split.

    The archive's train and test parts are concatenated before
    ``self.divide_train_test`` is called.

    :param train_ratio: forwarded to ``self.divide_train_test``
    :param fold_k: forwarded to ``self.divide_train_test``
    :param norm: if truthy, ``self.data`` is divided by 255
    :param expand_dim: stored on the instance as ``self.expand_dim``
    :param seed: stored on the instance as ``self.seed``
    """
    self.seed = seed

    archive = np.load("data/mnist/mnist.npz")
    self.data = np.concatenate([archive["x_train"], archive["x_test"]])
    self.labels = np.concatenate([archive["y_train"], archive["y_test"]])
    self.labels = one_hot_encode(self.labels, 10)
    del archive

    self.divide_train_test(train_ratio, fold_k)

    if norm:
        self.data = self.data / 255.  # [0,1]

    self.train_cur_pos = 0
    self.test_cur_pos = 0
    self.expand_dim = expand_dim
def test(model, data_loader):
    """Evaluate the model on a validation set and print loss and accuracy.

    The predicted class of a sample is the output capsule with the largest
    vector magnitude.

    :param model: the CapsuleNet model (unwrapped via ``.module`` when
        ``args.cuda`` is set)
    :param data_loader: iterator over ``(data, target)`` batches; its
        ``dataset`` length is used for averaging
    """
    print('===> Evaluate mode')

    # Switch to evaluate mode
    model.eval()

    use_cuda = args.cuda
    if use_cuda:
        # the model is wrapped in DataParallel for multi-GPU runs
        model = model.module

    test_loss = 0
    correct = 0

    for data, target in data_loader:
        target_indices = target
        one_hot_target = utils.one_hot_encode(target_indices, length=args.num_classes)

        data = Variable(data, volatile=True)
        target = Variable(one_hot_target)

        if use_cuda:
            data = data.cuda()
            target = target.cuda()

        output = model(data)  # output from DigitCaps (out_digit_caps)

        # sum up batch loss; data is passed in for image reconstruction
        batch_loss = model.loss(data, output, target, size_average=False)
        test_loss += batch_loss.data[0]

        # evaluate: length of each capsule vector, then arg-max over capsules
        v_magnitud = torch.sqrt((output**2).sum(dim=2, keepdim=True))
        pred = v_magnitud.data.max(1, keepdim=True)[1].cpu()
        correct += pred.eq(target_indices.view_as(pred)).sum()

    n_samples = len(data_loader.dataset)
    test_loss /= n_samples

    mesg = 'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, n_samples, 100. * correct / n_samples)
    print(mesg)