def main():
    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")
    # uncomment for cancer
    # X, y, X_final, y_final, dataset = dataset_reader.load_cancer()
    X, y, X_final, y_final, dataset = dataset_reader.load_higgs()
    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=42)
    ii = 0
    for train, test in skf:
        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]
        nums = [5, 10, 30, 50]
        layer = Layer(RandomForestClassifier,
                      {"max_depth": 1, "n_estimators": nums[ii]},
                      x_train, y_train, 10)
        predictions = layer.predictAll(x_train)
        lr = Layer(LogisticRegression,
                   {"n_jobs": -1, "max_iter": 1000},
                   predictions, y_train, 1)
        network = Network([layer, lr])
        evaluate_test(network, X_final, y_final, nums[ii], dataset)
        ii += 1
def main():
    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")
    # uncomment for cancer
    # X, y, X_final, y_final, dataset = dataset_reader.load_cancer()
    X, y, X_final, y_final, dataset = dataset_reader.load_higgs()
    input_s = (30,)
    batch_size = 25
    classes = 2
    num_nodes = [5, 10, 30, 50]
    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=42)
    best_acc = 0
    ii = 0
    for train, test in skf:
        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]
        y_train = to_categorical(y_train, classes)
        y_test = to_categorical(y_test, classes)
        neural_net = Sequential()
        neural_net.add(Dense(num_nodes[ii], activation='sigmoid',
                             input_shape=input_s,
                             kernel_initializer="TruncatedNormal"))
        neural_net.add(Dropout(.01))
        neural_net.add(Dense(2, activation='softmax'))
        neural_net.compile(optimizer="RMSProp", loss='binary_crossentropy',
                           metrics=['accuracy'])
        neural_net.fit(x_train, y_train, batch_size=batch_size, epochs=100,
                       verbose=0, validation_data=(x_test, y_test))
        predictions = neural_net.predict(x_test)
        predictions = [round(x[1]) for x in predictions]
        y_test = [x[1] for x in y_test]
        acc = 0.0
        for i, prediction in enumerate(predictions):
            if prediction == y_test[i]:
                acc += 1
        acc /= len(predictions)
        if acc > best_acc:
            best_classifier = neural_net
            best_num_nodes = num_nodes[ii]
            best_acc = acc
        ii += 1
    evaluate_test(best_classifier, X_final, y_final, best_num_nodes, dataset)
def main():
    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")
    X, y, X_final, y_final, dataset = dataset_reader.load_cancer()
    # uncomment for higgs
    # X, y, X_final, y_final, dataset = dataset_reader.load_higgs()
    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=42)
    dtree_params = [1, 5, 30, 50]
    ii = 0
    best_acc = 0
    for train, test in skf:
        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]
        clf = DecisionTreeClassifier(max_depth=dtree_params[ii],
                                     max_features=1.0, random_state=42)
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        acc = 0.0
        for i, prediction in enumerate(predictions):
            if prediction == y_test[i]:
                acc += 1
        acc /= len(predictions)
        if acc > best_acc:
            best_classifier = clf
            best_depth = dtree_params[ii]
            best_acc = acc
        ii += 1
    evaluate_test(best_classifier, X_final, y_final, best_depth, dataset)
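# Note that each fold above evaluates a different max_depth, so depth choice and
# fold assignment are entangled.  A minimal alternative sketch (an assumption, not
# part of the original project, and using the modern sklearn.model_selection API
# rather than the older StratifiedKFold(y, n_folds=...) form seen above) that
# tries every depth on every fold:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

def tune_depth(X, y, depths=(1, 5, 30, 50)):
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                          param_grid={'max_depth': list(depths)},
                          scoring='accuracy', cv=cv)
    search.fit(X, y)
    return search.best_estimator_, search.best_params_['max_depth']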
def test(test_args):
    """Computes test perplexity for test data

    :param test_args: system args
    """
    start = time.time()
    data_reader = DatasetReader(test_args, train=False)
    test_data = data_reader.test_data

    # load hyperparameters and other flags
    with open(os.path.join(test_args.save_dir, 'config.pkl'), 'rb') as f:
        args = cPickle.load(f)

    assert test_data is not None, 'test data is not read!'
    args.vocab_size = data_reader.vocab_size
    print('vocab_size: {}'.format(args.vocab_size))

    print('Start testing...')
    with tf.Graph().as_default(), tf.Session(
            config=gpu_config if args.with_gpu else None) as sess:
        with tf.variable_scope('train_model', reuse=None):
            m_test = Model(args, is_training=False)
        saver = tf.train.Saver(tf.global_variables())
        tf.global_variables_initializer().run()
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        test_pp = run_epoch(sess, m_test, test_data, data_reader, tf.no_op())
        print('Test Perplexity: %.3f' % test_pp)
        print("Test time: %.0f min" % ((time.time() - start) / 60))
class KnapsackBruteForce:
    def __init__(self, n, weight, profit, max_weight):
        self.n = n
        self.weight = weight
        self.profit = profit
        self.max_weight = max_weight

    def run(self):
        self.best_value = 0
        self.total_weight = 0
        self.s = []
        self.solve(self.n - 1, self.s, 0, 0)

    def solve(self, n, s, current_w, current_v):
        if n == -1 and current_w <= self.max_weight and current_v > self.best_value:
            self.best_value = current_v
            self.total_weight = current_w
            self.s = s.copy()
        if n == -1:
            return
        self.solve(n - 1, [0] + s, current_w, current_v)
        self.solve(n - 1, [1] + s,
                   current_w + self.weight[n],
                   current_v + self.profit[n])


dataset = DatasetReader().read('p08')
kbf = KnapsackBruteForce(len(dataset[0]), dataset[0], dataset[1], dataset[2])
ExecutionLogger().run(kbf)
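# A minimal standalone sketch of the same exhaustive search on a hand-made
# instance (the numbers are illustrative; no DatasetReader or ExecutionLogger
# is needed to exercise the class):
tiny = KnapsackBruteForce(n=3, weight=[2, 3, 4], profit=[3, 4, 5], max_weight=5)
tiny.run()
# items 0 and 1 fit (weight 5) and give the best profit of 7
print(tiny.best_value, tiny.total_weight, tiny.s)  # -> 7 5 [1, 1, 0]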
def main(NetClass, key_name, scale):
    assert scale in [32, 64, 128]
    torch.set_grad_enabled(False)
    model_id = NetClass.model_id

    save_dir = '{}.{}'.format(simple_net_save_dir_prefix, scale)
    os.makedirs(save_dir, exist_ok=True)
    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)
    ck_name = '{}/model_{}_{}.pt'.format(save_dir, model_id, key_name)
    cm_test_name = '{}/cm_test_{}_{}.png'.format(save_dir, model_id, key_name)

    test_dataset = DatasetReader(test_dataset_path, target_hw=(scale, scale))

    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name, map_location='cpu'))
    net = net.to(device)
    net.eval()

    all_pred = []
    all_label = []
    for i in range(len(test_dataset)):
        ims, cls = test_dataset.get_im_patch_list_to_combind_predict(i, one_im=False)
        batch_im = torch.tensor(ims.astype(np.int32), dtype=torch.float) / 65535
        # batch_cls = torch.tensor([cls]).repeat(len(batch_im))
        batch_im = batch_im.permute(0, 3, 1, 2)
        batch_im = batch_im.to(device)
        # batch_cls = batch_cls.to(device)
        net_out = net(batch_im)
        out = torch.argmax(net_out, 1)
        all_label.append(cls)
        if out.sum(dtype=torch.float).item() > out.shape[0] * simple_thresh:
            all_pred.append(1)
        else:
            all_pred.append(0)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')
    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=0, average='binary')
    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = 'test acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} ' \
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{} x{}'.format(
                   _accuracy, _malignant_precision, _malignant_recall, _malignant_f1,
                   _benign_precision, _benign_recall, _benign_f1, model_id, key_name, scale)
    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm, list(test_dataset.class2id.keys()), cm_test_name)
def main(NetClass, key_name):
    torch.set_grad_enabled(False)
    target_hw = (256, 256)
    model_id = NetClass.model_id

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)
    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)
    ck_name = '{}/model_{}_{}.pt'.format(seg_net_save_dir, model_id, key_name)

    train_dataset = DatasetReader(train_dataset_path, target_hw=target_hw)
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=target_hw)
    test_dataset = DatasetReader(test_dataset_path, target_hw=target_hw)

    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name, map_location='cpu'))
    net = net.to(device)
    net.eval()

    for dataset_type_id, dataset in enumerate([train_dataset, eval_dataset, test_dataset]):
        dataset_type = ['train', 'eval', 'test'][dataset_type_id]
        if dataset_type not in big_dict:
            big_dict[dataset_type] = {}
        for i in range(len(dataset)):
            im_name, im, cm, cls = dataset.get_im_patch_list_to_combind_predict(
                i, need_im_name=True)
            im_id = os.path.splitext(im_name)[0]
            if im_id not in big_dict[dataset_type]:
                big_dict[dataset_type][im_id] = {}

            batch_im = torch.tensor([im], dtype=torch.float) / 65535
            # batch_cm = torch.tensor([cm])
            batch_im = batch_im.permute(0, 3, 1, 2)
            batch_im = batch_im.to(device)
            # batch_cm = batch_cm.to(device)

            net_out = net(batch_im)
            out = torch.argmax(net_out, 1)
            cls1_pixel_num = (out == 1).sum(dtype=torch.float).item()
            cls2_pixel_num = (out == 2).sum(dtype=torch.float).item()
            if cls1_pixel_num + cls2_pixel_num == 0:
                out_pred = 0
            else:
                out_pred = cls2_pixel_num / (cls1_pixel_num + cls2_pixel_num)
            assert not np.isnan(out_pred)

            if 'pred' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['pred'] = out_pred
            else:
                raise AssertionError('Error, found pred setting in 2 times')

            if 'class' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['class'] = cls - 1
            else:
                assert big_dict[dataset_type][im_id]['class'] == cls
def main(NetClass, key_name, scale):
    assert scale in [32, 64, 128]
    target_hw = (scale, scale)
    model_id = NetClass.model_id

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)
    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)
    ck_name = '{}.{}/model_{}_{}.pt'.format(simple_net_save_dir_prefix, scale,
                                            model_id, key_name)

    train_dataset = DatasetReader(train_dataset_path, target_hw=target_hw)
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=target_hw)
    test_dataset = DatasetReader(test_dataset_path, target_hw=target_hw)

    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name, map_location='cpu'))
    net = net.to(device)
    net.eval()
    torch.set_grad_enabled(False)

    for dataset_type_id, dataset in enumerate([train_dataset, eval_dataset, test_dataset]):
        dataset_type = ['train', 'eval', 'test'][dataset_type_id]
        if dataset_type not in big_dict:
            big_dict[dataset_type] = {}
        for i in range(len(dataset)):
            im_name, ims, cls = dataset.get_im_patch_list_to_combind_predict(
                i, one_im=False, need_im_name=True)
            im_id = os.path.splitext(im_name)[0]
            if im_id not in big_dict[dataset_type]:
                big_dict[dataset_type][im_id] = {}

            batch_im = torch.tensor(ims.astype(np.int32), dtype=torch.float) / 65535
            # batch_cls = torch.tensor([cls]).repeat(len(batch_im))
            batch_im = batch_im.permute(0, 3, 1, 2)
            batch_im = batch_im.to(device)
            # batch_cls = batch_cls.to(device)

            net_out = net(batch_im)
            net_out = torch.softmax(net_out, 1)
            net_out = net_out[:, 1]
            out_pred = torch.mean(net_out).item()

            if 'pred{}'.format(scale) not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['pred{}'.format(scale)] = out_pred
            else:
                raise AssertionError(
                    'Error, found pred{} setting in 2 times'.format(scale))

            if 'class' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['class'] = cls
            else:
                assert big_dict[dataset_type][im_id]['class'] == cls
class KnapsackBruteForceOpt:
    def __init__(self, n, weight, profit, max_weight):
        self.n = n
        self.weight = weight
        self.profit = profit
        self.max_weight = max_weight

    def run(self):
        self.best_value = 0
        self.total_weight = 0
        self.s = []
        self.solve(self.n - 1, self.s, 0, 0)

    def solve(self, n, s, current_w, current_v):
        # prune any branch that already exceeds the capacity
        if current_w > self.max_weight:
            return
        if n == -1:
            if current_w <= self.max_weight and current_v > self.best_value:
                self.best_value = current_v
                self.total_weight = current_w
                self.s = s.copy()
            return
        self.solve(n - 1, [0] + s, current_w, current_v)
        self.solve(n - 1, [1] + s,
                   current_w + self.weight[n],
                   current_v + self.profit[n])


dataset = DatasetReader().read('c09')
kbfo = KnapsackBruteForceOpt(len(dataset[0]), dataset[0], dataset[1], dataset[2])
ExecutionLogger().run(kbfo)
def word2vec(
    files=[],
    directories=[],
    skip=[],
    save_dir=None,
    num_epochs=5,
    unigram_dictionary=None,
    noise_ratio=15,
    kernel=[1, 2, 3, 4, 5, 5, 4, 3, 2, 1],
    t=1.0e-5,
    batch_size=1000,  # Number of *signal* examples per batch
    num_embedding_dimensions=500,
    word_embedding_init=Normal(),
    context_embedding_init=Normal(),
    learning_rate=0.1,
    momentum=0.9,
    num_processes=3,
    load_dictionary_dir=None,
    min_frequency=10,
    macrobatch_size=100000,
    max_queue_size=0,
    verbose=True
):
    '''
    Helper function that handles all concerns involved in training a
    word2vec model using the approach of Mikolov et al.  It surfaces all
    of the options.

    For customizations going beyond simply tweaking existing options and
    hyperparameters, substitute this function by writing your own training
    routine using the provided classes.  This function would be a starting
    point for you.
    '''
    # Make a Word2VecMinibatcher, pass through parameters sent by caller
    reader = DatasetReader(
        files=files,
        directories=directories,
        skip=skip,
        noise_ratio=noise_ratio,
        t=t,
        num_processes=num_processes,
        unigram_dictionary=unigram_dictionary,
        kernel=kernel,
        max_queue_size=max_queue_size,
        macrobatch_size=macrobatch_size,
        verbose=verbose
    )

    # Prepare the minibatch generator
    # (this produces the counter_sampler stats)
    if load_dictionary_dir is None and unigram_dictionary is None:
        if verbose:
            print 'preparing dictionaries...'
        reader.prepare(save_dir=save_dir)

    # If min_frequency was specified, prune the dictionaries
    if min_frequency is not None:
        if verbose:
            print 'pruning dictionaries...'
        reader.prune(min_frequency)

    # Make a symbolic minibatcher
    minibatcher = NoiseContrastiveTheanoMinibatcher(
        batch_size=batch_size,
        noise_ratio=noise_ratio,
        dtype="int32",
        num_dims=2
    )

    # Make a Word2VecEmbedder object, feed it the combined input.
    # Note that the full batch includes noise examples and signal_examples,
    # so it is larger than batch_size, which is the number of signal_examples
    # only per batch.
    full_batch_size = batch_size * (1 + noise_ratio)
    embedder = Word2VecEmbedder(
        input_var=minibatcher.get_batch(),
        batch_size=full_batch_size,
        vocabulary_size=reader.get_vocab_size(),
        num_embedding_dimensions=num_embedding_dimensions,
        word_embedding_init=word_embedding_init,
        context_embedding_init=context_embedding_init
    )

    # Architecture is ready.  Make the loss function, and use it to create
    # the parameter updates responsible for learning.
    loss = get_noise_contrastive_loss(embedder.get_output(), batch_size)
    updates = nesterov_momentum(
        loss, embedder.get_params(), learning_rate, momentum
    )

    # Include minibatcher updates, which cause the symbolic batch to move
    # through the dataset like a sliding window
    updates.update(minibatcher.get_updates())

    # Use the loss function and the updates to compile a training function.
    # Note that it takes no inputs because the dataset is fully loaded using
    # theano shared variables.
    train = function([], loss, updates=updates)

    # Iterate through the dataset, training the embeddings
    for epoch in range(num_epochs):
        if verbose:
            print 'starting epoch %d' % epoch

        macrobatches = reader.generate_dataset_serial()
        macrobatch_num = 0
        for signal_macrobatch, noise_macrobatch in macrobatches:
            macrobatch_num += 1
            if verbose:
                print 'running macrobatch %d' % (macrobatch_num - 1)

            minibatcher.load_dataset(signal_macrobatch, noise_macrobatch)
            losses = []
            for batch_num in range(minibatcher.get_num_batches()):
                if verbose:
                    print 'running minibatch', batch_num
                losses.append(train())
            if verbose:
                print '\taverage loss: %f' % np.mean(losses)

    # Save the model (the embeddings) if save_dir was provided
    if save_dir is not None:
        embedder.save(save_dir)

    # Return the trained embedder and the dictionary mapping tokens to ids
    return embedder, reader
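# A minimal usage sketch of the helper above (Python 2, matching the print
# statements).  The corpus path and output directory are placeholders, not
# files from the original project:
embedder, reader = word2vec(
    files=['corpus/tokenized.txt'],   # hypothetical tokenized corpus file
    save_dir='w2v-output',            # hypothetical directory for the saved embeddings
    num_epochs=5,
    num_embedding_dimensions=500,
    verbose=True
)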
def train(args):
    """Train the model on the training corpus

    :param args: system args
    """
    start = time.time()
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    with open(os.path.join(save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)

    data_reader = DatasetReader(args)
    train_data = data_reader.train_data
    assert train_data is not None, 'training data is not read!'
    print('Number of train running words: {}'.format(len(train_data)))
    dev_data = data_reader.dev_data
    if dev_data:
        print('Number of dev set running words: {}'.format(len(dev_data)))

    out_file = os.path.join(args.save_dir, args.output)
    fout = codecs.open(out_file, "w", encoding="UTF-8")

    args.vocab_size = data_reader.vocab_size
    print('vocab size: {}'.format(args.vocab_size))
    fout.write('vocab size: {}\n'.format(str(args.vocab_size)))

    print('Start training....')
    with tf.Graph().as_default(), tf.Session(
            config=gpu_config if args.with_gpu else None) as sess:
        if args.init_scale:
            initializer = tf.random_uniform_initializer(-args.init_scale,
                                                        +args.init_scale)
        else:
            initializer = tf.glorot_uniform_initializer()

        # build models
        with tf.variable_scope('train_model', reuse=None, initializer=initializer):
            m_train = Model(args)
        if dev_data:
            # reuse the same embedding matrix
            with tf.variable_scope('train_model', reuse=True, initializer=initializer):
                m_dev = Model(args, is_training=False)
        else:
            m_dev = None

        # save only the last model
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        tf.global_variables_initializer().run()

        best_pp = 10000000.0  # only used when we have dev
        e = 0
        decay_counter = 1
        lr = args.lr
        while e < args.num_epochs:
            # apply lr decay
            if e >= args.start_epoch_decay:
                lr_decay = args.decay_rate ** decay_counter
                lr *= lr_decay
                decay_counter += 1

            print('Epoch: %d' % (e + 1))
            m_train.assign_lr(sess, lr)
            print('Learning rate: %.6f' % sess.run(m_train.lr))
            fout.write("Epoch: %d\n" % (e + 1))
            fout.write("Learning rate: %.3f\n" % sess.run(m_train.lr))

            train_pp = run_epoch(sess, m_train, train_data, data_reader,
                                 m_train.train_op, verbose=True)
            print('Train Perplexity: {}'.format(train_pp))
            fout.write("Train Perplexity: %.3f\n" % train_pp)

            if m_dev:
                dev_pp = run_epoch(sess, m_dev, dev_data, data_reader, tf.no_op())
                print("Valid Perplexity: %.3f\n" % dev_pp)
                fout.write("Valid Perplexity: %.3f\n" % dev_pp)
                if dev_pp < best_pp:
                    print("Achieved lowest perplexity on dev set, saving model.")
                    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e)
                    print("model saved to {}".format(checkpoint_path))
                    best_pp = dev_pp
                else:
                    break
            else:
                checkpoint_path = os.path.join(save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e)
                print("model saved to {}".format(checkpoint_path))
            fout.flush()
            e += 1

    print("Training time: %.0f min" % ((time.time() - start) / 60))
    fout.write("Training time: %.0f min\n" % ((time.time() - start) / 60))
    fout.flush()
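# train() above pulls everything from `args`.  A sketch of the argparse flags it
# reads is below; the default values are assumptions, not the original project's
# defaults, and DatasetReader(args) may read further fields that do not appear
# in this snippet:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--save_dir', default='checkpoints')       # where config.pkl and model.ckpt go
parser.add_argument('--output', default='train.log')           # log file written inside save_dir
parser.add_argument('--with_gpu', action='store_true')         # selects the gpu_config session config
parser.add_argument('--init_scale', type=float, default=0.1)   # falsy value falls back to glorot init
parser.add_argument('--lr', type=float, default=1.0)
parser.add_argument('--num_epochs', type=int, default=10)
parser.add_argument('--start_epoch_decay', type=int, default=4)
parser.add_argument('--decay_rate', type=float, default=0.5)
args = parser.parse_args()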
if train_filename == test_filename:
    print('Dev run.')
    ignore_closest = True
else:
    ignore_closest = False

debug = False
dummy = False

print('Reading model..')
if not dummy:
    model_reader = ModelReader(model_params_filename)
    model = model_reader.model
else:
    model = DummyContextModel()

dataset_reader = DatasetReader(model)

print('Reading train dataset..')
train_set, train_key2ind, train_ind2key = dataset_reader.read_dataset(
    train_filename, train_filename + '.key', True, isolate_target_sentence)
knn = Knn(k, train_set, train_key2ind)

print('Reading test dataset..')
test_set, test_key2ind, test_ind2key = dataset_reader.read_dataset(
    test_filename, test_filename + '.key', False, isolate_target_sentence)

print('Starting to classify test set:')
with open(result_filename, 'w') as o:
    for ind, key_set in enumerate(test_set):
        key = test_ind2key[ind]
        if debug:
            print('KEY:', key)
            print()
def main(NetClass, key_name):
    torch.set_grad_enabled(False)
    model_id = NetClass.model_id

    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)
    ck_32_name = '{}.32/model_{}_{}.pt'.format(simple_net_save_dir_prefix, model_id, key_name)
    ck_64_name = '{}.64/model_{}_{}.pt'.format(simple_net_save_dir_prefix, model_id, key_name)
    ck_128_name = '{}.128/model_{}_{}.pt'.format(simple_net_save_dir_prefix, model_id, key_name)
    cm_net3_test_name = '{}_{}_{}.png'.format(simple_net_3_merge_test_cm_prefix,
                                              model_id, key_name)
    os.makedirs(os.path.split(cm_net3_test_name)[0], exist_ok=True)

    test_dataset_32 = DatasetReader(test_dataset_path, target_hw=(32, 32))
    test_dataset_64 = DatasetReader(test_dataset_path, target_hw=(64, 64))
    test_dataset_128 = DatasetReader(test_dataset_path, target_hw=(128, 128))

    net_32 = NetClass(in_dim)
    net_64 = NetClass(in_dim)
    net_128 = NetClass(in_dim)
    net_32.load_state_dict(torch.load(ck_32_name, map_location=torch.device('cpu')))
    net_64.load_state_dict(torch.load(ck_64_name, map_location=torch.device('cpu')))
    net_128.load_state_dict(torch.load(ck_128_name, map_location=torch.device('cpu')))
    net_32 = net_32.to(device)
    net_64 = net_64.to(device)
    net_128 = net_128.to(device)
    net_32.eval()
    net_64.eval()
    net_128.eval()

    all_pred = []
    all_label = []
    for i in range(len(test_dataset_32)):
        ims_32, cls_32 = test_dataset_32.get_im_patch_list_to_combind_predict(i, one_im=False)
        ims_64, cls_64 = test_dataset_64.get_im_patch_list_to_combind_predict(i, one_im=False)
        ims_128, cls_128 = test_dataset_128.get_im_patch_list_to_combind_predict(i, one_im=False)
        assert cls_32 == cls_64 == cls_128

        tmp_x = [[net_32, ims_32, cls_32], [net_64, ims_64, cls_64], [net_128, ims_128, cls_128]]
        tmp_y = []
        for net, ims, cls in tmp_x:
            batch_im = torch.tensor(ims.astype(np.int32), dtype=torch.float) / 65535
            # batch_cls = torch.tensor([cls]).repeat(len(batch_im))
            batch_im = batch_im.permute(0, 3, 1, 2)
            batch_im = batch_im.to(device)
            # batch_cls = batch_cls.to(device)
            net_out = net(batch_im)
            out = torch.argmax(net_out, 1)
            if out.sum(dtype=torch.float).item() > out.shape[0] * simple_thresh:
                tmp_y.append(1)
            else:
                tmp_y.append(0)

        all_label.append(tmp_x[0][-1])
        if np.sum(tmp_y) > simple_merge_thresh:
            all_pred.append(1)
        else:
            all_pred.append(0)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')
    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=0, average='binary')
    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = 'test acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} ' \
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{}'.format(
                   _accuracy, _malignant_precision, _malignant_recall, _malignant_f1,
                   _benign_precision, _benign_recall, _benign_f1, model_id, key_name)
    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm, list(test_dataset_32.class2id.keys()), cm_net3_test_name)
def main(NetClass, key_name, dataset_type='test'):
    torch.set_grad_enabled(False)
    model_id = NetClass.model_id

    test_dataset_path = '{}/{}/{}'.format(dataset_path, key_name, dataset_type)
    test_dataset = DatasetReader(test_dataset_path)
    ck_name = '{}/model_{}_{}.pt'.format(seg_net_save_dir, model_id, key_name)
    cm_test_name = '{}/cm_{}_{}_{}.png'.format(seg_net_save_dir, dataset_type,
                                               model_id, key_name)

    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name))
    net = net.to(device)
    net.eval()

    all_pred = []
    all_label = []
    for i in range(len(test_dataset)):
        im, cm, cls = test_dataset.get_im_patch_list_to_combind_predict(i)
        batch_im = torch.tensor([im], dtype=torch.float) / 65535
        batch_im = batch_im.permute(0, 3, 1, 2)
        batch_im = batch_im.to(device)
        net_out = net(batch_im)
        out = torch.argmax(net_out, 1)
        all_label.append(cls)
        cls1_pixel_num = (out == 1).sum().item()
        cls2_pixel_num = (out == 2).sum().item()
        if cls1_pixel_num + cls2_pixel_num == 0:
            all_pred.append(1)
        else:
            if cls2_pixel_num / (cls1_pixel_num + cls2_pixel_num) > seg_thresh:
                all_pred.append(2)
            else:
                all_pred.append(1)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=2, average='binary')
    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')
    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = '{} acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} ' \
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{}'.format(
                   dataset_type, _accuracy, _malignant_precision, _malignant_recall,
                   _malignant_f1, _benign_precision, _benign_recall, _benign_f1,
                   model_id, key_name)
    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm, list(test_dataset.class2id.keys())[1:], cm_test_name)
import numpy as np
import matplotlib.pyplot as plt

from dataset_reader import DatasetReader
from dim_reducer import DimReducer
from text_embedder import TextEmbedder
from utils import *

# Initialization
reader = DatasetReader()
embedder = TextEmbedder()
reducer = DimReducer(alg='umap')
article_list = []

# Get article_list
num_limit = 100
i = 0
for article in reader:
    article_list.append(article)
    i += 1
    if i == num_limit:
        break
corpus = get_corpus(article_list)

# Get Embeddings
embedding_list = embedder.fit_transform(corpus)
for i, embedding in enumerate(embedding_list):
    article_list[i].set_embedding(embedding)

# Getting Coordinates
embedding_list = np.array(embedding_list)
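# The snippet stops after collecting the embeddings.  A hedged sketch of the
# likely next step follows: reduce to 2-D and scatter-plot the articles.  The
# reducer.fit_transform call is an assumption about DimReducer's interface
# (mirroring TextEmbedder above), not a documented API:
coords = reducer.fit_transform(embedding_list)   # assumed to return an (n_articles, 2) array
plt.scatter(coords[:, 0], coords[:, 1], s=10)
plt.title('UMAP projection of article embeddings')
plt.show()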
        if cMutantIndividual[nRandomChange] == "0":
            aMutantIndividual[nRandomChange] = "1"
        else:
            aMutantIndividual[nRandomChange] = "0"
        cNewIndividual = "".join(aMutantIndividual)

        '''
        # Random swap of chromosomes
        cNewIndividual = ""
        for cChromo in cMutantIndividual:
            nRandomChange = random.randint(0, 1)
            # If it is a multiple of 2 (even), swap it
            if nRandomChange == 1:
                if cChromo == "1":
                    cNewIndividual += "0"
                else:
                    cNewIndividual += "1"
            else:
                cNewIndividual += cChromo
        '''

        aNewPopulation.append(cNewIndividual)

    # Add the parents to the new population
    aNewPopulation.append(cFather1)
    aNewPopulation.append(cFather2)
    return aNewPopulation


dataset = DatasetReader().read('c11')
kbf = KnapsackGeneticAlgorithm(len(dataset[0]), dataset[0], dataset[1], dataset[2])
ExecutionLogger().run(kbf)
def main(NetClass, key_name, scale=32):
    torch.set_grad_enabled(True)
    assert scale in [32, 64, 128]
    model_id = NetClass.model_id

    save_dir = '{}.{}'.format(simple_net_save_dir_prefix, scale)
    os.makedirs(save_dir, exist_ok=True)

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)
    ck_name = '{}/model_{}_{}.pt'.format(save_dir, model_id, key_name)
    ck_extra_name = '{}/extra_{}_{}.yml'.format(save_dir, model_id, key_name)
    cm_name = '{}/cm_valid_{}_{}.png'.format(save_dir, model_id, key_name)
    logdir = '{}_{}_{}.{}'.format(simple_net_train_logs_dir_prefix, model_id,
                                  key_name, scale)
    sw = SummaryWriter(logdir)

    train_dataset = DatasetReader(train_dataset_path, is_require_cls_blance=True,
                                  target_hw=(scale, scale))
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=(scale, scale))

    net = NetClass(in_dim)
    net = net.to(device)

    batch_count = train_dataset.get_batch_count(batch_size)
    optim = torch.optim.Adam(net.parameters(), 1e-3, eps=1e-8)
    optim_adjust = torch.optim.lr_scheduler.MultiStepLR(optim, [90, 180, 270], gamma=0.1)

    max_valid_value = 0.
    class_weight_for_loss = torch.tensor([1, 1], dtype=torch.float, device=device)

    for e in range(epoch):
        net.train()
        optim_adjust.step(e)
        train_acc = 0
        train_loss = 0
        for b in range(batch_count):
            batch_im, batch_cls = train_dataset.get_batch(batch_size)
            batch_im = torch.tensor(batch_im.astype(np.int32), dtype=torch.float) / 65535
            # batch_im += (torch.rand_like(batch_im) * 0.1 - 0.05)
            batch_cls = torch.tensor(batch_cls, dtype=torch.long)
            batch_im = batch_im.permute(0, 3, 1, 2)
            batch_im = batch_im.to(device)
            batch_cls = batch_cls.to(device)

            net_out = net(batch_im)
            # net_out = net_train(batch_im)
            with torch.no_grad():
                out = torch.argmax(net_out, 1)
                acc = torch.eq(out, batch_cls).sum(dtype=torch.float) / len(out)
            loss = F.cross_entropy(net_out, batch_cls, class_weight_for_loss)
            train_acc += acc.item()
            train_loss += loss.item()
            print('epoch: {} train acc: {:.3f} loss: {:.3f}'.format(
                e, acc.item(), loss.item()))

            optim.zero_grad()
            loss.backward()
            optim.step()

        train_acc = train_acc / batch_count
        train_loss = train_loss / batch_count
        sw.add_scalar('train_acc', train_acc, global_step=e)
        sw.add_scalar('train_loss', train_loss, global_step=e)

        # here to check eval
        if (e + 1) % 3 == 0:
            with torch.no_grad():
                net.eval()
                all_pred = []
                all_label = []
                for i in range(len(eval_dataset)):
                    ims, cls = eval_dataset.get_im_patch_list_to_combind_predict(
                        i, one_im=False)
                    batch_im = torch.tensor(ims.astype(np.int32), dtype=torch.float) / 65535
                    # batch_cls = torch.tensor([cls]).repeat(len(batch_im))
                    batch_im = batch_im.permute(0, 3, 1, 2)
                    batch_im = batch_im.to(device)
                    # batch_cls = batch_cls.to(device)
                    net_out = net(batch_im)
                    out = torch.argmax(net_out, 1)
                    all_label.append(cls)
                    if out.sum(dtype=torch.float).item() > out.shape[0] * simple_thresh:
                        all_pred.append(1)
                    else:
                        all_pred.append(0)

                _accuracy = accuracy_score(all_label, all_pred)
                _malignant_precision, _malignant_recall, _malignant_f1, _ = \
                    precision_recall_fscore_support(all_label, all_pred,
                                                    pos_label=1, average='binary')
                _benign_precision, _benign_recall, _benign_f1, _ = \
                    precision_recall_fscore_support(all_label, all_pred,
                                                    pos_label=0, average='binary')
                _accuracy = float(_accuracy)
                _malignant_precision = float(_malignant_precision)
                _malignant_recall = float(_malignant_recall)
                _malignant_f1 = float(_malignant_f1)
                _benign_precision = float(_benign_precision)
                _benign_recall = float(_benign_recall)
                _benign_f1 = float(_benign_f1)

                sw.add_scalar('eval_acc', _accuracy, global_step=e)
                sw.add_scalar('eval_m_prec', _malignant_precision, global_step=e)
                sw.add_scalar('eval_m_recall', _malignant_recall, global_step=e)
                sw.add_scalar('eval_m_f1', _malignant_f1, global_step=e)
                sw.add_scalar('eval_b_prec', _benign_precision, global_step=e)
                sw.add_scalar('eval_b_recall', _benign_recall, global_step=e)
                sw.add_scalar('eval_b_f1', _benign_f1, global_step=e)

                print('epoch: {} eval acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} '
                      'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f}'.format(
                          e, _accuracy, _malignant_precision, _malignant_recall,
                          _malignant_f1, _benign_precision, _benign_recall, _benign_f1))

                avg_f1 = (_malignant_f1 + _benign_f1) / 2

                # if _benign_precision - _malignant_precision > 0.2:
                #     class_weight_for_loss[1] += 0.1

                if avg_f1 >= max_valid_value:
                    max_valid_value = avg_f1
                    torch.save(net.state_dict(), ck_name)
                    extra = {
                        'accuracy': _accuracy,
                        'm_precision': _malignant_precision,
                        'm_recall': _malignant_recall,
                        'm_f1': _malignant_f1,
                        'b_precision': _benign_precision,
                        'b_recall': _benign_recall,
                        'b_f1': _benign_f1,
                    }
                    yaml.safe_dump(extra, open(ck_extra_name, 'w'))
                    cm = confusion_matrix(all_label, all_pred)
                    draw_confusion_matrix(cm, list(eval_dataset.class2id.keys()), cm_name)

                # early exit
                if _accuracy == 1.:
                    print('found valid acc == 1. , early exit')
                    break

    sw.close()
def main():
    if not (args.use_w1_w2_embeddings or args.use_paraphrase_vectors):
        raise ValueError('At least one of "use_w1_w2_embeddings" or '
                         '"use_paraphrase_vectors" should be set.')

    # Load the datasets
    logger.info('Loading the datasets from {}'.format(args.dataset_prefix))
    train_set = DatasetReader(args.dataset_prefix + '/train.tsv')
    val_set = DatasetReader(args.dataset_prefix + '/val.tsv',
                            label2index=train_set.label2index)
    test_set = DatasetReader(args.dataset_prefix + '/test.tsv',
                             label2index=train_set.label2index)

    # Generate the feature vectors using the paraphrasing model
    logger.info('Generating feature vectors...')
    train_features, val_features, test_features = [], [], []

    if args.use_paraphrase_vectors:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_model))
        wv, model_words = load_binary_embeddings(args.word_embeddings_for_model)

        logger.info('Loading paraphrasing model from {}...'.format(
            args.paraphrase_model_dir))
        model = Model.load_model(args.language_model_dir, wv)

        model_words = ['[w1]', '[w2]', '[par]'] + model_words
        modelw2index = {w: i for i, w in enumerate(model_words)}
        UNK = modelw2index['unk']

    if args.use_w1_w2_embeddings:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_dist))
        wv, words = load_binary_embeddings(args.word_embeddings_for_dist)
        w2index = {w: i for i, w in enumerate(words)}
        UNK = w2index['unk']

        train_features.append(np.vstack([
            np.concatenate([wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
            for (w1, w2) in train_set.noun_compounds]))
        val_features.append(np.vstack([
            np.concatenate([wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
            for (w1, w2) in val_set.noun_compounds]))
        test_features.append(np.vstack([
            np.concatenate([wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
            for (w1, w2) in test_set.noun_compounds]))

    # Tune the hyper-parameters using the validation set
    logger.info('Classifying...')
    reg_values = [0.5, 1, 2, 5, 10]
    penalties = ['l2']
    k_values = [10, 15, 25, 50] if args.use_paraphrase_vectors else [0]
    classifiers = ['logistic', 'svm']
    f1_results = []
    descriptions = []
    models = []
    all_test_instances = []

    for k in k_values:
        curr_train_features, curr_val_features, curr_test_features = \
            train_features, val_features, test_features
        if args.use_paraphrase_vectors:
            curr_train_features += [predict_paraphrases(model, train_set.noun_compounds,
                                                        model_words, modelw2index, UNK, k)]
            curr_val_features += [predict_paraphrases(model, val_set.noun_compounds,
                                                      model_words, modelw2index, UNK, k)]
            curr_test_features += [predict_paraphrases(model, test_set.noun_compounds,
                                                       model_words, modelw2index, UNK, k)]

        train_instances = [np.concatenate(list(f)) for f in zip(*curr_train_features)]
        val_instances = [np.concatenate(list(f)) for f in zip(*curr_val_features)]
        test_instances = [np.concatenate(list(f)) for f in zip(*curr_test_features)]

        for cls in classifiers:
            for reg_c in reg_values:
                for penalty in penalties:
                    descriptions.append('K: {}, Classifier: {}, Penalty: {}, C: {:.2f}'.format(
                        k, cls, penalty, reg_c))

                    # Create the classifier
                    if cls == 'logistic':
                        classifier = LogisticRegression(penalty=penalty, C=reg_c,
                                                        multi_class='multinomial',
                                                        n_jobs=20, solver='sag')
                    else:
                        classifier = LinearSVC(penalty=penalty, dual=False, C=reg_c)

                    logger.info('Training with classifier: {}, penalty: {}, c: {:.2f}...'
                                .format(cls, penalty, reg_c))
                    classifier.fit(train_instances, train_set.labels)
                    val_pred = classifier.predict(val_instances)
                    p, r, f1, _ = evaluate(val_set.labels, val_pred, val_set.index2label,
                                           do_full_reoprt=False)
                    logger.info('K: {}, Classifier: {}, penalty: {}, c: {:.2f}, '
                                'precision: {:.3f}, recall: {:.3f}, F1: {:.3f}'.format(
                                    k, cls, penalty, reg_c, p, r, f1))
                    f1_results.append(f1)
                    models.append(classifier)
                    all_test_instances.append(test_instances)

    best_index = np.argmax(f1_results)
    description = descriptions[best_index]
    classifier = models[best_index]
    logger.info('Best hyper-parameters: {}'.format(description))

    # Save the best model to a file
    logger.info('Copying the best model...')
    joblib.dump(classifier, '{}/best.pkl'.format(args.model_dir))

    # Evaluate on the test set
    logger.info('Evaluation:')
    test_instances = all_test_instances[best_index]
    test_pred = classifier.predict(test_instances)
    precision, recall, f1, support = evaluate(test_set.labels, test_pred,
                                              test_set.index2label, do_full_reoprt=True)
    logger.info('Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}'.format(precision, recall, f1))

    # Write the predictions to a file
    output_predictions(args.model_dir + '/predictions.tsv', test_set.index2label,
                       test_pred, test_set.noun_compounds, test_set.labels)