def train_data():
    """Assemble the fixture-based train/test splits and model configuration.

    Returns a ``Data`` bundle holding a batched training iterator, the fully
    loaded test set, and the hyperparameter dict.
    """
    test_file = "./fixtures/net_20180312_201803114_100k_preprocessed.test"
    train_file = "./fixtures/net_20180312_201803114_100k_preprocessed.train"
    fmap_file = "./fixtures/net_20180312_201803114_100k.feature_map"

    # Feature count = number of lines in the feature map minus one
    # (the original counter started at -1, skipping one line's worth).
    with open(fmap_file) as fmap:
        num_features = sum(1 for _ in fmap) - 1

    conf = {
        "feature_cols": "PageID OrderID".split(),
        "target_col": "IsClick",
        "num_fields": 2,
        "num_features": num_features,
        "dim": 10,
        "use_unary": True,
        "num_iter": 5,
        "opt_cls": optim.Adam,
        "opt_kwargs": {"lr": 1e-3},
        "batch_size": 64,
    }
    test = read_dataset(test_file, conf["feature_cols"], conf["target_col"])
    train_iter = BatchIter(train_file, conf["feature_cols"],
                           conf["target_col"], batch_size=conf["batch_size"])
    return Data(train_iter, test, conf)
def run(dataset_name, output_file_name):
    """Run the J-vs-N and F1-vs-lambda experiments for one dataset.

    Results are written to per-network CSV files in the working directory.
    ``output_file_name`` is kept for interface compatibility (unused here).
    """
    dataset = data.read_dataset('datasets/' + dataset_name + '.csv',
                                **utils.reader_parameters[dataset_name])
    random.shuffle(dataset)
    params = parameters[dataset_name]

    # Learning-curve experiment: cost J on a held-out fold versus number of
    # training samples seen, for the first network architecture only.
    network = params['NETWORKS'][0]
    with open('J_vs_N_' + str(network) + dataset_name + '.csv', 'w') as out:
        folds = fold(10, dataset)
        valids = folds[0]
        train = [i for s in folds[1:] for i in s]
        # Feed the network in mini-batches of 10 samples.
        trains = [train[i:i + 10] for i in range(0, len(train), 10)]
        n = 0
        net = NeuralNetwork(network, ALPHA=params['ALPHA'], LAMBDA=0, K=0,
                            BETA=0.8, STOP=1)
        for i in range(50):
            for batch in trains:
                n += len(batch)
                net.train(batch)
                J = 0
                for valid in valids:
                    J += net.verifyPerformance(valid)
                out.write(str(n) + ',' + str(J) + '\n')

    # Regularization experiment: mean/stdev of F1 versus lambda for every
    # network architecture, using 10-fold cross-validation.
    for network in params['NETWORKS']:
        with open('f1_vs_lambda_' + str(network) + dataset_name + '.csv',
                  'w') as out:
            for LAMBDA in [0, 1, 8, 64, 512]:
                net = NeuralNetwork(network, ALPHA=params['ALPHA'],
                                    LAMBDA=LAMBDA / 1000.0, K=params['K'],
                                    BETA=0.8, STOP=params['STOP'])
                cv = CrossValidator(k=10, classifier=net)
                cv.run(dataset)
                try:
                    out.write(str(LAMBDA / 1000.0) + ',' +
                              str(mean(cv.f1s(1))) + ',' +
                              str(stdev(cv.f1s(1))) + '\n')
                except Exception:
                    # mean/stdev fail when no F1 values were produced. This
                    # was a bare ``except:`` which also swallowed
                    # KeyboardInterrupt/SystemExit — narrowed.
                    out.write(str(LAMBDA / 1000.0) + ',0,0\n')
                out.flush()
def evaluate():
    """Eval SUN3D for a number of steps."""
    with tf.Graph().as_default() as g:
        # Input pipeline for the evaluation split.
        images, depths, transforms = data.read_dataset(eval_data=True)

        # Forward pass; predictions and ground truth are compared in
        # inverse-depth space.
        result, transform = model.inference(images, True)
        result = lss.inverse(result)
        depths = lss.inverse(depths)
        depths = tf.slice(depths, (0, 0, 0, 0),
                          (BATCH_SIZE, IMAGE_SIZE_H, IMAGE_SIZE_W, 1))

        # Evaluate against the exponential moving averages of the weights.
        variable_averages = tf.train.ExponentialMovingAverage(
            model.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(variable_averages.variables_to_restore())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
        config = tf.ConfigProto(gpu_options=gpu_options)
        while True:
            print('Start depth output')
            eval_once(result, depths, config, saver)
            if EVAL_RUN_ONCE:
                print('end of depth output')
                break
            time.sleep(EVAL_INTERVAL_SECS)
def main():
    """Index every document yielded by read_dataset into Elasticsearch."""
    es_conn = Elasticsearch(config.ES_HOSTS, timeout=30)
    # Make sure the target index exists before writing to it.
    create_index(es_conn=es_conn, index_name=config.INDEX_NAME)

    dataset = read_dataset(config.DOCS_PATH)
    counter_read = 0        # documents submitted for indexing
    counter_idx_failed = 0  # documents Elasticsearch did not report as created
    for doc in dataset:     # generator is exhausted exactly as next()/StopIteration did
        res = es_conn.index(index=config.INDEX_NAME,
                            doc_type=config.DOC_TYPE,
                            id=doc.filename,
                            body=doc._asdict())
        counter_read += 1
        if counter_read % config.PRINT_EVERY == 0:
            print('indexed {} documents'.format(counter_read))
        if not res['created']:
            print('indexing `{}` failed'.format(doc.path))
            counter_idx_failed += 1
    print('finished reading docs from `{}`'.format(config.DOCS_PATH))
    print('indexed {} docs to index `{}`, failed to index {} docs'.format(
        counter_read, config.INDEX_NAME, counter_idx_failed))
def train(): tr, va, te = read_dataset('../mnist.pkl.gz') binarizer = LabelBinarizer().fit(range(10)) x = tf.placeholder(tf.float32, [None, 784]) y = tf.placeholder(tf.float32, [None, 10]) keep_prob = tf.placeholder(tf.float32) preds = model.inference(x, keep_prob) loss, total_loss = model.loss(preds, y) acc = model.evaluation(preds, y) # learning rate: 0.1 train_op = model.training(total_loss, 0.1) init = tf.initialize_all_variables() sess = tf.Session() sess.run(init) for i in xrange(10000): batch_xs, batch_ys = tr.next_batch(50) if i % 100 == 0: train_acc = acc.eval(feed_dict={ x:batch_xs, y:binarizer.transform(batch_ys), keep_prob: 1.0}, session=sess) print "step: {0}, training accuracy {1}".format(i, train_acc) validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess) print("Validation accuracy : {0}".format(validation_accuracy)) train_op.run(feed_dict={ x:batch_xs, y:binarizer.transform(batch_ys), keep_prob: 0.5}, session=sess) test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess) print("Test accuracy : ", test_accuracy)
def load_data():
    """Load and return the (train, test, val, eval) splits via the data module."""
    # data.create_datasets() / data.generate_data('val.pkl', 5000) are one-off
    # preprocessing steps, intentionally left disabled here.
    print("loading data...")
    splits = data.read_dataset()
    print("data loaded")
    train_data, test_data, val_data, eval_data = splits
    return train_data, test_data, val_data, eval_data
def evaluate_on_dev(model, filename, batch_size=32):
    """Prints predictions and metrics by model on development dataset."""
    lemmas, tags, inflected_forms = read_dataset(filename)
    predictions = generate_predictions(model, lemmas, tags, batch_size)
    # Dump every prediction, one per line, then a blank separator line.
    for predicted_form in predictions:
        print(predicted_form)
    print()
    acc = accuracy(predictions, inflected_forms)
    dist = average_distance(predictions, inflected_forms)
    print("Accuracy: {}, Average Distance: {}".format(acc, dist))
def evaluate_on_dev(model, filename, batch_size=32):
    """Prints predictions and metrics by model on development dataset.

    Returns (error_report, accuracy, average_distance) where error_report is
    a tab-separated dump of mispredicted rows.
    """
    lemmas, tags, inflected_forms = read_dataset(filename)
    predictions = generate_predictions(model, lemmas, tags, batch_size)
    # Collect only the rows the model got wrong.
    mistakes = [
        '{}\t{}\t{}\t{}\n'.format(lemma, tag, gold, pred)
        for lemma, tag, gold, pred in zip(lemmas, tags, inflected_forms,
                                          predictions)
        if gold != pred
    ]
    result_text = ''.join(mistakes)
    return (result_text,
            accuracy(predictions, inflected_forms),
            average_distance(predictions, inflected_forms))
def main(argv):
    """Train the FCN segmentation model, logging summaries and checkpoints."""
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    image = tf.placeholder(tf.float32,
                           shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3],
                           name="input_image")
    annotation = tf.placeholder(tf.int32,
                                shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 1],
                                name="annotation")

    pred_annotation, logits = inference(image)
    tf.summary.image("input_image", image)
    tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8))
    tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8))

    # Per-pixel cross entropy; the singleton channel axis is squeezed away.
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.squeeze(annotation, squeeze_dims=[3])))
    tf.summary.scalar("loss", loss)

    trainable_var = tf.trainable_variables()
    train_op = train(FLAGS.learning_rate, loss, trainable_var)
    summary_op = tf.summary.merge_all()

    train_records, valid_records = data.read_dataset(FLAGS.data_dir)
    print("Train records:", len(train_records))
    print("Valid records:", len(valid_records))
    train_reader = BatchReader(train_records,
                               {'resize': True, 'resize_size': IMAGE_SIZE})

    sess = tf.Session()
    saver = tf.train.Saver(max_to_keep=10)
    summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.model_dir, "logs"), sess.graph)
    sess.run(tf.global_variables_initializer())

    # Resume from the latest checkpoint when one exists.
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored from", ckpt.model_checkpoint_path)
    else:
        print("initialize new model")

    for itr in xrange(MAX_ITERATION):
        train_images, train_annotations = train_reader.next_batch(FLAGS.batch_size)
        feed_dict = {image: train_images,
                     annotation: train_annotations,
                     keep_prob: 0.8}
        train_loss, _, pred_result, summary_str = sess.run(
            [loss, train_op, pred_annotation, summary_op], feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, itr)
        print("Time: %d, Step: %d, Train loss: %g" % (time.time(), itr, train_loss))
        if itr % 10 == 0 and itr > 0:
            # BUG FIX: the checkpoint path was built as
            # FLAGS.model_dir + "model.ckpt", dropping the path separator
            # unless model_dir ends with '/'. Join properly, matching the
            # summary-writer path above.
            saver.save(sess, os.path.join(FLAGS.model_dir, "model.ckpt"), itr)
            # NOTE(review): in the collapsed original the indentation of this
            # print is ambiguous; placed with the periodic checkpoint here.
            print(pred_result[0])
def train(config, exp_name, data_path):
    """Train a single KB-embedding model; skips if results already exist."""
    results_dir = os.path.join(data_path, exp_name)
    if os.path.exists(results_dir):
        print("{} already exists, no need to train.\n".format(results_dir))
        return
    os.makedirs(results_dir)
    # Persist the exact configuration next to the results for reproducibility.
    json.dump(config, open(os.path.join(results_dir, 'config.json'), 'w'),
              sort_keys=True, separators=(',\n', ': '))

    is_typed = config.get('is_typed', False)
    print("Typed Regularizer {}".format(is_typed))
    data_set = data.read_dataset(data_path, dev_mode=True, is_typed=is_typed)
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    print("Number of training data points {}".format(len(data_set['train'])))
    print("Number of dev data points {}".format(len(data_set['test'])))

    # Model, evaluation and optimization machinery.
    neg_sampler = data.NegativeSampler(data_set['train'])
    model = models.get_model(config, neg_sampler)
    evaluater = RankEvaluater(model, neg_sampler)
    updater = algorithms.Adam()
    typed_data = data_set['typed'] if is_typed else None
    minimizer = optimizer.GradientDescent(
        data_set['train'], data_set['test'], updater, model, evaluater,
        results_dir, 'single', config, is_typed=is_typed,
        typed_data=typed_data)

    print('Training {}...\n'.format(config['model']))
    start = time.time()
    minimizer.minimize()
    end = time.time()
    elapsed = end - start
    hours = elapsed / 3600
    minutes = (elapsed % 3600) / 60
    print("Finished Training! Took {} hours and {} minutes\n".format(
        hours, minutes))
def evaluate():
    """Eval SUN3D for a number of steps."""
    with tf.Graph().as_default() as g:
        # Evaluation input pipeline.
        images, depths, transforms = data.read_dataset(eval_data=True)

        # Forward pass (inference mode); work in inverse-depth space.
        result, resulttransform = model.inference(images, False)
        depths = lss.inverse(depths)
        result = lss.inverse(result)
        depths = tf.slice(depths, (0, 0, 0, 0),
                          (BATCH_SIZE, IMAGE_SIZE_H, IMAGE_SIZE_W, 1))

        # Error metrics between predictions and ground truth.
        scale_inv_error = evalfunct.scinv(result, depths)
        L1_relative_error = evalfunct.L1rel(result, depths)
        L1_inverse_error = evalfunct.L1inv(result, depths)
        L1_transform = tf.reduce_mean(tf.abs(resulttransform - transforms))

        # Restore the moving-average shadow variables for evaluation.
        variable_averages = tf.train.ExponentialMovingAverage(
            model.MOVING_AVERAGE_DECAY)
        saver = tf.train.Saver(variable_averages.variables_to_restore())

        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(TEST_LOG, g)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
        config = tf.ConfigProto(gpu_options=gpu_options)
        while True:
            print('Start evaluation')
            eval_once(result, config, saver, summary_writer, scale_inv_error,
                      L1_relative_error, L1_inverse_error, L1_transform,
                      summary_op)
            if EVAL_RUN_ONCE:
                print('end of evaluation')
                break
            time.sleep(EVAL_INTERVAL_SECS)
def test(config, exp_name, data_path):
    """Evaluate a trained single model on the test split."""
    print("Testing...\n")
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params_single.cpkl')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return
    data_set = data.read_dataset(data_path, dev_mode=is_dev)
    # Negative sampling draws from the union of all three splits.
    pool = copy.copy(data_set['train'])
    pool.extend(data_set['dev'])
    pool.extend(data_set['test'])
    neg_sampler = data.NegativeSampler(pool)
    # Initializing the model changes config.
    model = models.get_model(config, neg_sampler)
    params = data.load_params(params_path, model)
    print("Number of Test Samples {}".format(len(data_set['test'])))
    tester = TestEvaluater(model, neg_sampler, params, is_dev, results_dir)
    evaluate(data_set['test'], tester, results_dir, is_dev)
def test(config, exp_name, data_path):
    """Evaluate a trained TCNN on the test split, using the GPU if present."""
    print("Testing...\n")
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params.torch')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return
    data_set = data.read_dataset(data_path, results_dir, dev_mode=is_dev)
    print("Number of Test Samples {}".format(len(data_set['test'])))
    print("Vocabulary size: {}".format(constants.vocab_size))
    # Initializing the model changes config.
    print("Loading model params")
    model = models.TCNN(constants.vocab_size, config['ent_dim'])
    checkpoint = os.path.join(results_dir, "params.torch")
    if torch.cuda.is_available():
        model.cuda()
        model.load_state_dict(torch.load(checkpoint))
        print("Using GPU {}".format(torch.cuda.current_device()))
    else:
        print("Using CPU")
        torch.set_num_threads(56)
        # Remap any CUDA tensors in the checkpoint onto the CPU.
        model.load_state_dict(
            torch.load(checkpoint, map_location=lambda storage, loc: storage))
    print("Model keys: {}".format(model.state_dict().keys()))
    check_embeddings(model, data_set['test'])
    model.eval()
    loss = evaluate(model, data_set['test'])
    print("Test KL Divergence Loss {}".format(loss))
def train(): tr, va, te = read_dataset('../mnist.pkl.gz') binarizer = LabelBinarizer().fit(range(10)) x = tf.placeholder(tf.float32, [None, 784]) y = tf.placeholder(tf.float32, [None, 10]) keep_prob = tf.placeholder(tf.float32) preds = model.inference(x, keep_prob) loss, total_loss = model.loss(preds, y) acc = model.evaluation(preds, y) # learning rate: 0.1 train_op = model.training(total_loss, 0.1) init = tf.initialize_all_variables() sess = tf.Session() sess.run(init) for i in xrange(10000): batch_xs, batch_ys = tr.next_batch(50) if i % 100 == 0: train_acc = acc.eval(feed_dict={ x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 1.0 }, session=sess) print "step: {0}, training accuracy {1}".format(i, train_acc) validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess) print("Validation accuracy : {0}".format(validation_accuracy)) train_op.run(feed_dict={ x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 0.5 }, session=sess) test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess) print("Test accuracy : ", test_accuracy)
def visualize(config, exp_name, data_path):
    """Dump per-layer conv weights for the dev set to numbered .npy files.

    Fixes two defects in the original:
    - the third debug print logged ``conv_1_weights`` instead of
      ``conv_3_weights`` (copy-paste error);
    - ``np.save`` was handed text-mode (``'w'``) file objects, which fails
      under Python 3 because np.save writes binary data; files are now
      opened ``'wb'`` (and properly closed).
    """
    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params.torch')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return
    data_set = data.read_dataset(data_path, results_dir, dev_mode=True,
                                 gen_neg=False)
    # Initializing the model changes config.
    print("Loading model params for visualization")
    model = models.TCNN(constants.vocab_size, config['ent_dim'])
    model.load_state_dict(
        torch.load(os.path.join(results_dir, "params.torch"),
                   map_location=lambda storage, loc: storage))
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
        print("Using GPU {}".format(torch.cuda.current_device()))
        torch.cuda.manual_seed(241984)
    else:
        print("Using CPU")
        torch.manual_seed(241984)

    count = 1
    # Process the dev set in chunks of 1000 tuples; one output file per
    # (layer, chunk) pair, named like the original: conv_1_weights.<count>.
    for s in util.chunk(data_set['dev'], 1000):
        conv_1_weights, conv_2_weights, conv_3_weights = model.visualize(
            util.get_tuples(s))
        for name, weights in (('conv_1_weights', conv_1_weights),
                              ('conv_2_weights', conv_2_weights),
                              ('conv_3_weights', conv_3_weights)):
            print("Layer: {}".format(weights))
            with open(results_dir + '/' + name + '.' + str(count), 'wb') as f:
                np.save(f, weights)
        count += 1
def train(config, exp_name, data_path):
    """Train a TCNN; refuses to overwrite an existing results directory."""
    results_dir = os.path.join(data_path, exp_name)
    if os.path.exists(results_dir):
        print("{} already exists, no need to train.\n".format(results_dir))
        return
    os.makedirs(results_dir)
    # Save the run configuration alongside the results.
    json.dump(config, open(os.path.join(results_dir, 'config.json'), 'w'),
              sort_keys=True, separators=(',\n', ': '))
    data_set = data.read_dataset(data_path, results_dir, dev_mode=True)
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    print("Number of training data points {}".format(len(data_set['train'])))
    print("Number of dev data points {}".format(len(data_set['dev'])))
    print("Number of test data points {}".format(len(data_set['test'])))
    print("Vocabulary size: {}".format(constants.vocab_size))
    model = models.TCNN(constants.vocab_size, config['ent_dim'])
    if torch.cuda.is_available():
        model.cuda()
        print("Using GPU {}".format(torch.cuda.current_device()))
    else:
        print("Using CPU")
        torch.set_num_threads(56)
    grad_descent = optimizer.GradientDescent(data_set['train'],
                                             data_set['test'], config,
                                             results_dir, model)
    print('Training...\n')
    start = time.time()
    grad_descent.minimize()
    end = time.time()
    elapsed = end - start
    hours = int(elapsed / 3600)
    minutes = (elapsed % 3600) / 60
    print("Finished Training! Took {} hours and {} minutes\n".format(
        hours, minutes))
import os

from constants import TASK1_DATA_PATH
from data import read_dataset
from utils import accuracy, average_distance, mean

if __name__ == "__main__":
    # Compare each prediction file against the matching dev set, keeping
    # per-size (low/medium/high) accuracy and distance aggregates.
    predictions_dir = os.path.join('output', 'dh0p1')
    acc = {'low': [], 'medium': [], 'high': []}
    dist = {'low': [], 'medium': [], 'high': []}
    for filename in sorted(os.listdir(predictions_dir)):
        for dataset in ['low', 'medium', 'high']:
            # The size tag is the second-to-last dash-separated field.
            if dataset in filename and dataset == filename.split('-')[-2]:
                language = '-'.join(filename.split('-')[:-2])
                _, _, predictions = read_dataset(
                    os.path.join(predictions_dir, filename))
                _, _, truth = read_dataset(
                    os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language)))
                lang_acc = accuracy(predictions, truth)
                lang_dist = average_distance(predictions, truth)
                print('{}[task 1/{}]: {:.4f}, {:.4f}'.format(
                    language, dataset, lang_acc, lang_dist))
                acc[dataset].append(lang_acc)
                dist[dataset].append(lang_dist)
    print()
    print()
    for dataset in ['low', 'medium', 'high']:
        print('Average[{}]: {:.4f}, {:.4f}'.format(
            dataset, mean(acc[dataset]), mean(dist[dataset])))
# Paths to the CSV splits.
# NOTE(review): 'traing_dataset.csv' looks like a typo for 'training', but it
# must match the actual file name on disk — confirm before renaming.
train_path = 'resources/traing_dataset.csv'
valid_path = 'resources/valid_dataset.csv'
test_path = 'resources/test_dataset.csv'

# Setting for learning
batch_size = 100
iteration = 10
epochs = 10
valid_size = 5

# Build the AlexNet graph: placeholders, loss, optimizer and merged summaries.
X, Y, is_training, cost, optimizer, accuracy, merged = alexnet()

# Read dataset.
classes = ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
train = read_dataset(train_path)
valid = read_dataset(valid_path)
# Select the per-class label columns (presumably one-hot — verify schema).
# NOTE(review): 'tran_labels' is presumably meant to read 'train_labels'.
tran_labels = train[classes]
valid_labels = valid[classes]

sess = tf.Session()

# For tensorboard
logdir = 'log/'
train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(logdir + '/valid')

# Initialize valuables
init = tf.global_variables_initializer()
sess.run(init)
# NOTE(review): the lines up to the blank line are the tail of a per-epoch
# training loop whose enclosing function/loop header lies outside this chunk;
# reproduced (reformatted) without behavioral change.
total_loss += loss_value
print("\nLoss per sentence: %.3f" % (total_loss/len(traindata)))
print("Example outputs:")
# Sample a few training sentences and show model output vs. the gold form.
for s in traindata[:5]:
    for i,fields in enumerate(s):
        wf, lemma, msd = fields
        # Only report slots the generator is actually asked to inflect,
        # subsampled by SAMPLETRAIN.
        if (iscandidatemsd(msd) or (msd == NONE and lemma != NONE))\
           and random() < SAMPLETRAIN:
            print("INPUT:", s[i][LEMMA], "OUTPUT:", generate(i,s,id2char),
                  "GOLD:",wf)
            break
# Development-set metrics after the epoch.
devacc, devlev = eval(devdata,id2char)
print("Development set accuracy: %.2f" % (100*devacc))
print("Development set avg. Levenshtein distance: %.2f" % devlev)
print()

if __name__=='__main__':
    # Command-line files: argv[1]=train, argv[2]=dev input, argv[3]=dev gold.
    traindata, wf2id, lemma2id, char2id, msd2id = read_dataset(sysargv[1])
    devinputdata, _, _, _, _ = read_dataset(sysargv[2])
    devgolddata, _, _, _, _ = read_dataset(sysargv[3])
    # Inverse character index for decoding generated ids back to characters.
    id2char = {id:char for char,id in char2id.items()}
    init_model(wf2id,lemma2id,char2id,msd2id)
    train(traindata,[devinputdata,devgolddata],
          wf2id,lemma2id,char2id,id2char,msd2id,20)
    # Final generation pass over the dev data, written to "<devfile>-out".
    eval([devinputdata,devgolddata],id2char,generating=1,
         outf=open("%s-out" % sysargv[2],"w"))
def train():
    """Train SUN3D for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Get images and labels for SUN3D.
        images, gtdepths, gttransforms = data.read_dataset(eval_data=False)
        pdepth, ptransforms = model.inference(images, True)
        # Calculate loss.
        loss = model.loss(pdepth, gtdepths, gttransforms, ptransforms)
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = model.train(loss, global_step)
        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=LOG_DEVICE_PLACEMENT,
            intra_op_parallelism_threads=NUM_PREPROCESS_THREADS)
        config.gpu_options.allow_growth = True
        #config.gpu_options.per_process_gpu_memory_fraction = 0.4

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                # Start at -1 so the first before_run sees step 0.
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                # Report loss and throughput every 10 steps.
                if self._step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=3600,
                checkpoint_dir=TRAIN_LOG,
                hooks=[
                    tf.train.StopAtStepHook(last_step=NUM_ITER),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                # NOTE(review): this runs the graph twice per iteration (once
                # to print the loss, once for train_op); with a queue-fed
                # input pipeline each run may consume a separate batch —
                # confirm this is intended.
                print(mon_sess.run(loss))
                mon_sess.run(train_op)
def main(argv):
    """Train the FCN segmentation model, logging summaries and checkpoints."""
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    image = tf.placeholder(
        tf.float32,
        shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3],
        name="input_image")
    annotation = tf.placeholder(
        tf.int32,
        shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 1],
        name="annotation")

    pred_annotation, logits = inference(image)
    tf.summary.image("input_image", image)
    tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8))
    tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8))

    # Per-pixel cross entropy; the singleton channel axis is squeezed away.
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,
            labels=tf.squeeze(annotation, squeeze_dims=[3])))
    tf.summary.scalar("loss", loss)

    trainable_var = tf.trainable_variables()
    train_op = train(FLAGS.learning_rate, loss, trainable_var)
    summary_op = tf.summary.merge_all()

    train_records, valid_records = data.read_dataset(FLAGS.data_dir)
    print("Train records:", len(train_records))
    print("Valid records:", len(valid_records))
    train_reader = BatchReader(train_records, {
        'resize': True,
        'resize_size': IMAGE_SIZE
    })

    sess = tf.Session()
    saver = tf.train.Saver(max_to_keep=10)
    summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.model_dir, "logs"), sess.graph)
    sess.run(tf.global_variables_initializer())

    # Resume from the latest checkpoint when one exists.
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored from", ckpt.model_checkpoint_path)
    else:
        print("initialize new model")

    for itr in xrange(MAX_ITERATION):
        train_images, train_annotations = train_reader.next_batch(
            FLAGS.batch_size)
        feed_dict = {
            image: train_images,
            annotation: train_annotations,
            keep_prob: 0.8
        }
        train_loss, _, pred_result, summary_str = sess.run(
            [loss, train_op, pred_annotation, summary_op],
            feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, itr)
        print("Time: %d, Step: %d, Train loss: %g" %
              (time.time(), itr, train_loss))
        if itr % 10 == 0 and itr > 0:
            # BUG FIX: the checkpoint path was built as
            # FLAGS.model_dir + "model.ckpt", dropping the path separator
            # unless model_dir ends with '/'. Join properly, matching the
            # summary-writer path above.
            saver.save(sess, os.path.join(FLAGS.model_dir, "model.ckpt"), itr)
            # NOTE(review): in the collapsed original the indentation of this
            # print is ambiguous; placed with the periodic checkpoint here.
            print(pred_result[0])
# Optimizers. NOTE: the encoder's learning rate is 0.01x the one used for the
# generator/discriminator.
optimE = optim.Adam(encoder.parameters(),
                    lr=config.getfloat('training', 'lr')*0.01)
optimG = optim.Adam(generator.parameters(),
                    lr=config.getfloat('training', 'lr'))
optimD = optim.Adam(discriminator.parameters(),
                    lr=config.getfloat('training', 'lr'))
'''
Quake_Smart_seq2 = data.read_dataset(_path+"../data/Quake_Smart-seq2/data.h5")
Quake_10x = data.read_dataset(_path+"../data/Quake_10x/data.h5")
merge = {"A":Quake_Smart_seq2, "B":Quake_10x}
mergedexpr, mergedl = data.merge_datasets(merge)
s = mergedexpr.sum(axis=1)
x = (mergedexpr.T/s).T
x = x * 10000
x,y,z,w = data.split_data(x, mergedl, test_size=0.01)
'''
# Load six datasets (presumably scRNA-seq expression matrices given the .h5
# layout — confirm) and merge them into one matrix with shared labels.
Baron_human = data.read_dataset(_path+"../data/Baron_human/data.h5")
Muraro = data.read_dataset(_path+"../data/Muraro/data.h5")
Enge = data.read_dataset(_path+"../data/Enge/data.h5")
Segerstolpe = data.read_dataset(_path+"../data/Segerstolpe/data.h5")
Xin_2016 = data.read_dataset(_path+"../data/Xin_2016/data.h5")
Lawlor = data.read_dataset(_path+"../data/Lawlor/data.h5")
merge = {'Baron_human':Baron_human, 'Muraro':Muraro, 'Enge':Enge,
         'Segerstolpe':Segerstolpe, 'Xin_2016':Xin_2016, 'Lawlor':Lawlor}
mergedexpr, mergedl = data.merge_datasets(merge)
# Row-normalize: each row is scaled to sum to 10000.
s = mergedexpr.sum(axis=1)
x = (mergedexpr.T/s).T
x = x*10000
#x = x[: ,:1000]
whole_set = dataset.Single(x, mergedl)
# NOTE(review): the indented lines below are the tail of a function (the
# `one_one_alignment` used in __main__, judging by the `return alignments`)
# whose `def` header lies outside this chunk; its internal indentation is
# restored here without behavioral change.
    with open(os.path.join(M2M_ALIGNER_PATH, 'output'), 'r',
              encoding='utf8') as file:
        for i, line in enumerate(file):
            # m2m-aligner emits 'NO ALIGNMENT' when it cannot align a pair:
            # mark every target position of that pair as unaligned (-1).
            if 'NO ALIGNMENT' in line:
                alignments.append([-1] * len(tgts[i]))
                continue
            src, tgt = line.split('\t')
            src = src.replace('|', '')
            tgt = tgt.replace('|', '')
            seq_alignment = []
            src_i = 0
            # '_' on the source side marks an epsilon: that target character
            # has no source counterpart and is recorded as -1.
            for token1, token2 in zip(src, tgt):
                if token1 != '_':
                    seq_alignment.append(src_i)
                    src_i += 1
                else:
                    seq_alignment.append(-1)
            alignments.append(seq_alignment)
    return alignments

if __name__ == "__main__":
    # Smoke test: align lemmas to inflected forms for the Hindi high-resource
    # training set and dump the per-character alignment indices.
    lemmas, tags, inflected_forms = read_dataset(
        os.path.join(TASK1_DATA_PATH, 'hindi-train-high'))
    alignments = one_one_alignment([list(word) for word in lemmas],
                                   [list(word) for word in inflected_forms])
    print(alignments)
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from data import read_dataset

# Load the dataset from the local "Data" directory (presumably MNIST-style:
# the placeholders below expect flattened 28x28 = 784 inputs — confirm).
mnist = read_dataset("Data")

# NOTE: a duplicate `import tensorflow as tf` that appeared here has been
# removed; the module is already imported above.
sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=[None, 784])
# NOTE(review): y_ has shape [None, 1] while W/b below produce 50 outputs —
# confirm the intended label encoding.
y_ = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.zeros([784, 50]))
b = tf.Variable(tf.zeros([50]))


def weight_variable(shape):
    """Weight tensor initialized with small truncated-normal noise."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    """Bias tensor initialized to a small positive constant (0.1)."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def conv2d(x, W):
    """2-D convolution with stride 1 and SAME (zero) padding."""
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
# NOTE(review): the indented lines below are the tail of a Dataset class
# (apparently `Single`, judging by __main__) whose class header and the start
# of the enclosing method lie outside this chunk; indentation restored.
        list_of_length.append(i[1] - i[0])
        print("length of all types:", list_of_length)

    def __getitem__(self, index):
        # Triplet sampling: returns (anchor, positive, negative, label).
        # self.st[label] appears to hold the [start, end) index range of
        # samples sharing `label` — the asserts below rely on that.
        index = self.index[index]
        label = self.train_Y[index]
        # r1: random index drawn from inside the anchor label's range
        # (a positive example).
        r1 = random.randint(
            0, self.st[label][1] - self.st[label][0] - 1) + self.st[label][0]
        # r2: random index drawn from outside that range, wrapped modulo the
        # dataset size (a negative example).
        r2 = random.randint(
            0, len(self) - self.st[label][1] + self.st[label][0] - 1)
        r2 = (r2 + self.st[label][1]) % len(self)
        r1 = self.index[r1]
        r2 = self.index[r2]
        # Sanity checks: positive shares the label, negative does not.
        assert self.train_Y[index] == self.train_Y[r1]
        assert self.train_Y[index] != self.train_Y[r2]
        return self.train_X[index], self.train_X[r1], self.train_X[
            r2], self.train_Y[index]


if __name__ == '__main__':
    # Smoke test on the Plasschaert dataset.
    a = data.read_dataset("../data/Plasschaert/data.h5")
    #a.exprs = a.exprs[:,:1000]
    x, y, z, w = data.getdata(a)
    print(x.shape)
    dataset = Single(x, z)
    print(dataset.catagories)
    # Iterate one pass through the loader to exercise __getitem__.
    dl = DataLoader(dataset, batch_size=5)
    for i, j in enumerate(dl):
        pass
def generate_entry(model_name, hyperparameters,
                   datasets=('low', 'medium', 'high'),
                   use_hierarchical_attention=False, use_ptr_gen=True,
                   test_data='test', write_hyperparameter=False,
                   output_folder=None, resume=False):
    """Train one model per (language, dataset-size) pair and write output files.

    For each pair: load data, train with early stopping extended in
    ``epochs_extension``-sized chunks until no validation improvement within
    ``patience`` epochs, then generate predictions for the dev or covered
    test set into ``output_folder``.
    """
    languages = get_languages()
    if output_folder is None:
        output_folder = os.path.join('output', model_name)
    if not resume:
        os.makedirs(output_folder)
    if write_hyperparameter:
        with open(os.path.join(output_folder, 'hyperparameters'), 'w',
                  encoding='utf8') as file:
            # BUG FIX: ``hyperparameters`` is a dict (it is indexed per
            # dataset below); file.write() requires a str, so the original
            # raised TypeError whenever write_hyperparameter was True.
            file.write(str(hyperparameters))
    for language in tqdm(sorted(languages)):
        for dataset in datasets:
            # When resuming, skip pairs whose output file already exists.
            if resume and os.path.exists(
                    os.path.join(output_folder,
                                 '{}-{}-out'.format(language, dataset))):
                continue
            # Per-dataset-size hyperparameters.
            lr = hyperparameters['lr'][dataset]
            embedding_size = hyperparameters['embedding_size'][dataset]
            hidden_size = hyperparameters['hidden_size'][dataset]
            clip = hyperparameters['clip'][dataset]
            dropout_p = hyperparameters['dropout_p'][dataset]
            alpha = hyperparameters['alpha'][dataset]
            beta = hyperparameters['beta'][dataset]
            patience = hyperparameters['patience'][dataset]
            epochs_extension = hyperparameters['epochs_extension'][dataset]
            experiment_name = "{}_{}_{}_lr{}_em{}_hd_{}_clip{}_p{}_a{}_b_{}_{}".format(
                model_name, language, dataset, lr, embedding_size, hidden_size,
                str(clip), dropout_p, alpha, beta, int(time.time()))
            try:
                model_inputs_train, model_inputs_val, labels_train, labels_val, \
                    vocab = package.data.load_data(language, dataset,
                                                   test_data=test_data,
                                                   use_external_val_data=True,
                                                   val_ratio=0.2,
                                                   random_state=42)
            except FileNotFoundError:
                # Some languages lack some dataset sizes; skip those pairs.
                continue
            model = package.net.Model(
                vocab, embedding_size=embedding_size, hidden_size=hidden_size,
                use_hierarchical_attention=use_hierarchical_attention,
                use_ptr_gen=use_ptr_gen, dropout_p=dropout_p).to(device)
            optimizer = optim.Adam(lr=lr, params=model.parameters())
            loss_fn = package.loss.Criterion(vocab, alpha, beta)
            writer = SummaryWriter('runs/' + experiment_name)
            model_save_dir = os.path.join('./saved_models', experiment_name)
            os.makedirs(model_save_dir)
            epochs = hyperparameters['epochs'][dataset]
            train_and_evaluate(model_inputs_train, labels_train,
                               model_inputs_val, labels_val, model, optimizer,
                               loss_fn, epochs=epochs, batch_size=32,
                               model_save_dir=model_save_dir,
                               show_progress=False, writer=writer, clip=clip)
            epochs_trained = epochs
            # Load best performing model on validation set.
            best_state = torch.load(os.path.join(model_save_dir, 'best.model'))
            # Keep extending training until the best epoch is at least
            # ``patience`` epochs behind the total trained.
            while epochs_trained - best_state['epoch_num'] < patience:
                train_and_evaluate(model_inputs_train, labels_train,
                                   model_inputs_val, labels_val, model,
                                   optimizer, loss_fn,
                                   epochs=epochs_extension, batch_size=32,
                                   model_save_dir=model_save_dir,
                                   show_progress=False, writer=writer,
                                   clip=clip,
                                   starting_epoch=epochs_trained + 1,
                                   initial_best_val_acc=best_state['val_acc'])
                epochs_trained += epochs_extension
                best_state = torch.load(
                    os.path.join(model_save_dir, 'best.model'))
            model.load_state_dict(best_state['model_state'])
            # Choose the evaluation split for the final generation pass.
            if test_data == 'dev':
                dev_file = os.path.join(TASK1_DATA_PATH,
                                        '{}-dev'.format(language))
                lemmas_test, tags_test, _ = read_dataset(dev_file)
            elif test_data == 'test':
                test_file = os.path.join(TASK1_DATA_PATH,
                                         '{}-covered-test'.format(language))
                lemmas_test, tags_test = read_covered_dataset(test_file)
            else:
                raise ValueError
            file_path = os.path.join(output_folder,
                                     '{}-{}-out'.format(language, dataset))
            generate_output(model, lemmas_test, tags_test, file_path)
# Build the three adversarial components on the GPU.
encoder = model.get_encoder(config, "M").cuda()
discriminator = model.get_discriminator(config).cuda()
generator = model.get_generator(config).cuda()
# NOTE(review): round-tripping the encoder through .cpu() and back to .cuda()
# looks redundant — confirm whether it works around a device-placement issue.
encoder = encoder.cpu()
encoder = encoder.cuda()
#classifier = model.get_classifier(config).cuda()
#gpu_tracker.track()
#optimC = optim.Adam(classifier.parameters(), lr=config.getfloat('training', 'lr'))
# Optimizers. NOTE: the encoder's learning rate is 0.01x the others'.
optimE = optim.Adam(encoder.parameters(),
                    lr=config.getfloat('training', 'lr') * 0.01)
optimG = optim.Adam(generator.parameters(),
                    lr=config.getfloat('training', 'lr'))
optimD = optim.Adam(discriminator.parameters(),
                    lr=config.getfloat('training', 'lr'))

# Load and merge the two Quake datasets, row-normalize each sample to a total
# of 10000, and hold out 1% as a test split.
Quake_Smart_seq2 = data.read_dataset(_path + "../data/Quake_Smart-seq2/data.h5")
Quake_10x = data.read_dataset(_path + "../data/Quake_10x/data.h5")
merge = {"A": Quake_Smart_seq2, "B": Quake_10x}
mergedexpr, mergedl = data.merge_datasets(merge)
s = mergedexpr.sum(axis=1)
x = (mergedexpr.T / s).T
x = x * 10000
x, y, z, w = data.split_data(x, mergedl, test_size=0.01)
# NOTE(review): the triple-quoted block below is disabled code; its closing
# quotes lie beyond this chunk of the source.
'''
Baron_human = data.read_dataset(_path+"../data/Baron_human/data.h5")
Muraro = data.read_dataset(_path+"../data/Muraro/data.h5")
Enge = data.read_dataset(_path+"../data/Enge/data.h5")
Segerstolpe = data.read_dataset(_path+"../data/Segerstolpe/data.h5")
Xin_2016 = data.read_dataset(_path+"../data/Xin_2016/data.h5")
Lawlor = data.read_dataset(_path+"../data/Lawlor/data.h5")
merge = {'Baron_human':Baron_human, 'Muraro':Muraro, 'Enge':Enge,
         'Segerstolpe':Segerstolpe,
def main():
    """
    Main script for training and evaluating a GSDT.
    """
    args = parse_args()
    # Seed both NumPy and Torch for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = to_device(args.device)
    trn_x, trn_y, test_x, test_y = data.read_dataset(args.data)
    in_features = trn_x.shape[1]
    # Labels are assumed to be 0..C-1, so max+1 is the class count.
    out_classes = trn_y.max() + 1
    model = models.GSDT(in_features, out_classes, args.depth, args.branch,
                        args.cov, args.rank).to(device)
    trn_loader = to_loader(trn_x, trn_y, batch_size=args.batch_size,
                           shuffle=True)
    test_loader = to_loader(test_x, test_y, batch_size=args.batch_size)
    loss_func = models.TreeLoss(args.loss_type, lamda=args.lamda)
    # optimizer1 drives the main training; optimizer2 (fixed lr=1e-2) is only
    # used by fit_leaves halfway through training.
    optimizer1 = optim.Adam(model.parameters(), lr=args.lr)
    optimizer2 = optim.Adam(model.parameters(), lr=1e-2)
    logs = []
    for epoch in range(1, args.epochs + 1):
        model.train()
        loss1_sum, loss2_sum, count = 0, 0, 0
        for x, y in trn_loader:
            x = x.to(device)
            y = y.to(device)
            loss1, loss2 = loss_func(model, x, y)
            optimizer1.zero_grad()
            # Both loss terms are optimized jointly.
            (loss1 + loss2).backward()
            optimizer1.step()
            # Accumulate sample-weighted losses for per-epoch averages.
            loss1_sum += loss1.item() * x.size(0)
            loss2_sum += loss2.item() * x.size(0)
            count += x.size(0)
        # Halfway through training, refit the tree leaves with optimizer2.
        if epoch == args.epochs // 2:
            fit_leaves(model, trn_loader, device, optimizer2)
        trn_acc = evaluate(model, trn_loader, device)
        test_acc = evaluate(model, test_loader, device)
        logs.append(
            (epoch, loss1_sum / count, loss2_sum / count, trn_acc, test_acc))
    if args.save:
        model_path = '{}/{}/{}.model'.format(args.out, args.data, args.seed)
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        torch.save(model.state_dict(), model_path)
    # NOTE(review): in the collapsed original it is ambiguous whether the
    # log/result saving below is also guarded by `if args.save:`; it is left
    # unconditional here — confirm against the project's repository.
    df = pd.DataFrame(logs)
    log_path = '{}/{}/{}.log'.format(args.out, args.data, args.seed)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    df.to_csv(log_path, index=False, sep='\t', header=False,
              float_format='%.4f')
    # Final train/test accuracies saved as a 2-element array.
    trn_acc = evaluate(model, trn_loader, device)
    test_acc = evaluate(model, test_loader, device)
    result = np.array([trn_acc, test_acc])
    np.save('{}/{}/{}'.format(args.out, args.data, args.seed), result)
def load_data(language, dataset, test_data='dev', val_ratio=0.2, random_state=42, use_external_val_data=False):
    """Loads training, validation and test data for one inflection task.

    Args:
        language: Language identifier used to build file names under
            TASK1_DATA_PATH.
        dataset: Training-set size suffix (e.g. 'low'/'medium'/'high' --
            TODO confirm the valid values against the data directory).
        test_data: 'dev' to test on the dev set, 'test' to test on the
            covered test set, anything else for an empty test set.
        val_ratio: Fraction of the training data reserved for validation
            (capped so that at most ~1000 examples are held out).
        random_state: Seed for sampling / splitting.
        use_external_val_data: If True, validate on a separate file (dev or a
            larger training file) instead of splitting the training data.

    Returns:
        (model_inputs_train, model_inputs_val, labels_train, labels_val, vocab)
        where inputs are (lemma_indices, tag_indices) pairs and labels are
        (inflected_form_indices, alignment, p_gens) triples.
    """
    train_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-{}'.format(language, dataset))
    lemmas, tags, inflected_forms = read_dataset(train_dataset)
    train_data_size = len(lemmas)
    # Cap the validation split at roughly 1000 examples.
    if val_ratio*train_data_size > 1000:
        val_ratio = 1000/train_data_size
    val_dataset = None
    if use_external_val_data:
        dev_dataset = os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language))
        high_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-high'.format(language))
        medium_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-medium'.format(language))
        low_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-low'.format(language))
        # Prefer the dev set when it is not also the test set; otherwise fall
        # back to the largest training file that differs from the one we
        # train on.
        if test_data != 'dev':
            val_dataset = dev_dataset
        elif os.path.exists(high_dataset) and train_dataset != high_dataset:
            val_dataset = high_dataset
        elif os.path.exists(medium_dataset) and train_dataset != medium_dataset:
            val_dataset = medium_dataset
        elif os.path.exists(low_dataset) and train_dataset != low_dataset:
            val_dataset = low_dataset
        if val_dataset is not None:
            lemmas_val, tags_val, inflected_forms_val = read_dataset(val_dataset)
    # Use the external validation data only when it is large enough;
    # otherwise carve the validation split out of the training data.
    if val_dataset is not None and len(lemmas_val) >= val_ratio*train_data_size:
        lemmas_train, tags_train, inflected_forms_train = lemmas, tags, inflected_forms
        val_data = list(zip(lemmas_val, tags_val, inflected_forms_val))
        random.seed(random_state)
        # At least 100 examples, at most the whole external file.
        val_data_size = int(min(max(val_ratio*train_data_size, 100), len(lemmas_val)))
        val_data = random.sample(val_data, val_data_size)
        lemmas_val, tags_val, inflected_forms_val = zip(*val_data)
        lemmas_val, tags_val, inflected_forms_val = list(lemmas_val), list(tags_val), list(inflected_forms_val)
    else:
        lemmas_train, lemmas_val, tags_train, tags_val, inflected_forms_train, inflected_forms_val = train_test_split(
            lemmas, tags, inflected_forms, test_size=val_ratio, random_state=random_state)
    train_data_size = len(lemmas_train)
    val_data_size = len(lemmas_val)
    # Test data is used only to extend the vocabulary (lemmas and tags below).
    if test_data == 'dev':
        dev_data = os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language))
        lemmas_test, tags_test, _ = read_dataset(dev_data)
    elif test_data == 'test':
        # NOTE: test_data is rebound here from the mode string to a file path.
        test_data = os.path.join(TASK1_DATA_PATH, '{}-covered-test'.format(language))
        lemmas_test, tags_test = read_covered_dataset(test_data)
    else:
        lemmas_test, tags_test, inflected_forms_test = [], [], []
    # Vocabulary covers train+val+test lemmas/tags but only training targets.
    vocab = Vocab(lemmas_train+lemmas_val+lemmas_test, tags_train+tags_val+tags_test, inflected_forms_train)
    # Character alignments and pointer/generator supervision for training.
    alignments_train = get_alignment(lemmas_train, inflected_forms_train, vocab)
    p_gens_train = get_p_gens([[vocab.START_CHAR] + list(lemma) + [vocab.STOP_CHAR] for lemma in lemmas_train],
                              [list(inflected_form) + [vocab.STOP_CHAR] for inflected_form in inflected_forms_train],
                              alignments_train)
    # Align train+val jointly, then keep only the validation tail --
    # presumably so the aligner is fit on the full data; TODO confirm.
    alignments_val = get_alignment(lemmas_train+lemmas_val, inflected_forms_train+inflected_forms_val, vocab)[train_data_size:]
    p_gens_val = get_p_gens([[vocab.START_CHAR] + list(lemma) + [vocab.STOP_CHAR] for lemma in lemmas_val],
                            [list(inflected_form) + [vocab.STOP_CHAR] for inflected_form in inflected_forms_val],
                            alignments_val)
    # Index train+val together, then slice back apart at train_data_size.
    lemmas_indices = vocab.words_to_indices(lemmas_train+lemmas_val, start_char=True, stop_char=True)
    tags_indices = vocab.tag_to_indices(tags_train+tags_val)
    inflected_forms_indices = vocab.words_to_indices(inflected_forms_train+inflected_forms_val)
    model_inputs_train = list(zip(lemmas_indices[:train_data_size], tags_indices[:train_data_size]))
    labels_train = list(zip(inflected_forms_indices[:train_data_size], alignments_train, p_gens_train))
    model_inputs_val = list(zip(lemmas_indices[train_data_size:], tags_indices[train_data_size:]))
    labels_val = list(zip(inflected_forms_indices[train_data_size:], alignments_val, p_gens_val))
    return model_inputs_train, model_inputs_val, labels_train, labels_val, vocab
from sys import argv
from baseline import eval
from data import read_dataset


if __name__ == "__main__":
    # CLI: argv[1] = system predictions file, argv[2] = gold reference file.
    # read_dataset returns five values; only the word data is needed here.
    predictions, _, _, _, _ = read_dataset(argv[1])
    reference, _, _, _, _ = read_dataset(argv[2])
    # Score the system output against the gold data (generating=0 presumably
    # selects evaluation-only mode -- see baseline.eval).
    accuracy, distance = eval([predictions, reference], id2char={}, generating=0)
    print("Accuracy: %.2f" % (100 * accuracy))
    print("Avg. Levenshtein distance: %.2f" % (distance))
# print(i) dis = [] for j in range(data.shape[0]): if i == j: dis.append(10000000000.0) else: dis.append(np.sqrt(((data[i] - data[j])**2).sum())) index = np.argsort(dis) knn.append(index[:k]) if label != None: for j in range(k): if label[index[j]] == label[i]: gcnt += 1 if label != None: print(gcnt, gcnt / float(len(label) * 10)) return knn if __name__ == '__main__': a = data.read_dataset("../data/Adam/data.h5") #a.exprs = a.exprs[:,:1000] x, y, z, w = data.getdata(a) knn = buildKNN(x, label=z, k=10) with open("./data/Adamknn.pkl", 'wb') as f: pickle.dump(knn, f) np.save()
(lang, devlev)) print() break if __name__ == '__main__': exp_name = str(sysargv[5]) exp_path = 'dumped/' + exp_name global wf2id_dict, lemma2id_dict, char2id_dict, id2char_dict, msd2id_dict, msd2id_split, id2msd_split, languages languages = str(sysargv[1]).split(',') traindata, devinputdata, devgolddata, wf2id_dict, lemma2id_dict, char2id_dict, msd2id_dict, id2char_dict, msd2id_split_monolingual, id2msd_split = {},{},{},{},{},{},{},{},{},{} for lang in languages: traindata[lang], wf2id_dict[lang], lemma2id_dict[lang], char2id_dict[lang], msd2id_dict[lang], \ msd2id_split_monolingual[lang] = read_dataset(sysargv[2].format(lang)) devinputdata[lang], _, _, _, _, _ = read_dataset( sysargv[3].format(lang)) devgolddata[lang], _, _, _, _, _ = read_dataset( sysargv[4].format(lang)) id2char_dict[lang] = { id: char for char, id in char2id_dict[lang].items() } all_msd_splits = [] for i, lang in enumerate(languages): if i == 0: all_msd_splits += msd2id_split_monolingual[lang].keys() else: for m in msd2id_split_monolingual[lang].keys():