# Module-level imports needed by this method:
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

import word2vec


def read_data(self, source):
    """Read a tab-separated ABSA file and return bag-of-words sentence
    embeddings together with integer polarity labels."""
    df = pd.read_csv(source, sep='\t', header=None)
    df.columns = [
        "polarity", "aspect_category", "target_term", "character_offset",
        "sentence"
    ]
    # Map the polarity strings to integer labels.
    df["label"] = df["polarity"].apply(
        lambda x: 1 if x == "positive" else (0 if x == "neutral" else -1))

    # Remove the target term from each sentence using its character offsets.
    sentence_red = [0] * len(df)
    for i in range(len(df)):
        start, end = (int(v) for v in df["character_offset"][i].split(":"))
        sentence_red[i] = df["sentence"][i][:start] + df["sentence"][i][end:]
    df["sentence_red"] = sentence_red

    # Remove stopwords.
    df["sentence_red"] = df["sentence_red"].apply(
        lambda x: self.remove_stopwords(x))

    # word2vec embeddings: download the fastText vectors on first use.
    PATH_TO_DATA = Path(
        'C:/Users/Armand/Desktop/3A/Deep Learning/nlp_project/nlp_project/')
    en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
    if not en_embeddings_path.exists():
        urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
            en_embeddings_path)
    w2vec = word2vec.Word2vec(en_embeddings_path, vocab_size=50000)
    sentence2vec = word2vec.BagOfWords(w2vec)
    sentences_emb = [
        sentence2vec.encode(sentence) for sentence in df["sentence_red"]
    ]
    return sentences_emb, df["label"]
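# `remove_stopwords` is called above but not defined in this snippet. A
# minimal sketch of what it might look like, assuming NLTK's English stopword
# list (requires a one-time nltk.download('stopwords'); the actual helper may
# tokenize differently):
from nltk.corpus import stopwords


def remove_stopwords(self, sentence):
    stop = set(stopwords.words('english'))
    # Keep only the tokens that are not stopwords.
    return " ".join(w for w in sentence.split() if w.lower() not in stop)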
tf.flags.DEFINE_integer("num_epochs", 1,
                        "Number of training epochs (default: 1)")
tf.flags.DEFINE_integer(
    "evaluate_every", 2545,
    "Evaluate model on dev set after this many steps (default: 2545)")
tf.flags.DEFINE_integer("checkpoint_every", 2500,
                        "Save model after this many steps (default: 2500)")
tf.flags.DEFINE_float(
    "dev_sample_percentage", .1,
    "Percentage of the training data to use for validation (default: 1/10)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")
# Eval Parameters
tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# Get the w2v model (the program assumes it has already been constructed and
# will complain if it hasn't).
w2v = word2vec.Word2vec(FLAGS.w2v_path)

# Grid search over the number of filters and the filter sizes.
num_filters = [64, 128, 200, 300, 400, 500]
filter_sizes = [
    "2", "3", "4", "5", "6", "2,3,4", "3,4,5", "4,5,6", "2,3,4,5", "3,4,5,6"
]
for fs in filter_sizes:
    for nf in num_filters:
        FLAGS.filter_sizes = fs
        FLAGS.num_filters = nf
        _, loss, accuracy = train(FLAGS, w2v)
        s = "%s %s %s %s\n" % (FLAGS.num_filters, FLAGS.filter_sizes, loss,
                               accuracy)
        print(s)
        # Append the result of each run to the grid-search log.
        with open('GS.txt', 'a') as f:
            f.write(s)
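# A small helper (not part of the original script) for reading the grid-search
# log written above and reporting the best run; it assumes each line of GS.txt
# has the "num_filters filter_sizes loss accuracy" layout produced above.
def best_grid_search_run(path='GS.txt'):
    best = None
    with open(path) as f:
        for line in f:
            num_filters, sizes, loss, acc = line.split()
            if best is None or float(acc) > float(best[3]):
                best = (num_filters, sizes, loss, acc)
    return best


print(best_grid_search_run())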
def main(argv):
    # todo create map file
    word_to_id, tag_to_id, id_to_tag = data_utils.load_map_file(FLAGS.map_file)
    id_to_word = {v: k for k, v in word_to_id.items()}

    num_dict = data_utils.load_size_file(FLAGS.size_file)
    train_num = num_dict["train_num"]
    dev_num = num_dict["dev_num"]
    test_num = num_dict["test_num"]

    model_config = init_mode_config(len(word_to_id), len(tag_to_id))
    print(model_config)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    with tf.Graph().as_default():
        print("load pre-trained word2vec ...")
        wv = word2vec.Word2vec()
        embed = wv.load_w2v_array(FLAGS.pre_embedding_file, id_to_word)
        word_embedding = tf.constant(embed, dtype=tf.float32)
        model = BiLSTM(model_config, word_embedding)

        train_batcher = SegBatcher(FLAGS.train_file, FLAGS.batch_size,
                                   num_epochs=FLAGS.max_epoch)
        dev_batcher = SegBatcher(FLAGS.dev_file, FLAGS.batch_size,
                                 num_epochs=1)
        test_batcher = SegBatcher(FLAGS.test_file, FLAGS.batch_size,
                                  num_epochs=1)

        # Variable initialization is handled by the Supervisor below.
        sv = tf.train.Supervisor(
            logdir=FLAGS.out_dir,
            save_model_secs=FLAGS.save_model_secs,
        )

        with sv.managed_session(config=tf_config) as sess:
            threads = tf.train.start_queue_runners(sess=sess)
            loss = []

            def run_evaluation(dev_batches, report=False):
                """Evaluates the model on a dev set."""
                preds = []
                true_tags = []
                tmp_x = []
                for x_batch, y_batch, sent_len in dev_batches:
                    feed_dict = {
                        model.char_inputs: x_batch,
                        model.targets: y_batch,
                        model.lengths: sent_len.reshape(-1, ),
                        model.dropout: 1.0
                    }
                    step, loss, logits, lengths, trans = sess.run([
                        model.global_step, model.loss, model.logits,
                        model.lengths, model.trans
                    ], feed_dict)

                    index = 0
                    small = -1000.0
                    # Extra "start" tag: impossible everywhere except at t=0.
                    start = np.asarray(
                        [[small] * model_config["num_tags"] + [0]])
                    for score, length in zip(logits, lengths):
                        score = score[:length]
                        pad = small * np.ones([length, 1])
                        logit = np.concatenate([score, pad], axis=1)
                        logit = np.concatenate([start, logit], axis=0)
                        path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
                        preds.append(path[1:])
                        tmp_x.append(x_batch[index][:length])
                        index += 1

                    for y, length in zip(y_batch, lengths):
                        y = y.tolist()
                        true_tags.append(y[:length])

                if FLAGS.debug and len(tmp_x) > 5:
                    print(tag_to_id)
                    for j in range(5):
                        sent = [id_to_word.get(i, "<OOV>") for i in tmp_x[j]]
                        print("".join(sent))
                        print("pred:", preds[j])
                        print("true:", true_tags[j])

                preds = np.concatenate(preds, axis=0)
                true_tags = np.concatenate(true_tags, axis=0)
                if report:
                    print(classification_report(true_tags, preds))
                acc = accuracy_score(true_tags, preds)
                return acc

            def run_test():
                print("start run test ...")
                test_batches = []
                done = False
                print("load all test batches to memory")
                while not done:
                    try:
                        tags, chars, sent_lens = sess.run(
                            test_batcher.next_batch_op)
                        test_batches.append((chars, tags, sent_lens))
                    except tf.errors.OutOfRangeError:
                        done = True
                test_acc = run_evaluation(test_batches, True)
                print("test acc %f" % test_acc)

            best_acc = 0.0
            dev_batches = []
            done = False
            print("load all dev batches to memory")
            while not done:
                try:
                    tags, chars, sent_lens = sess.run(
                        dev_batcher.next_batch_op)
                    dev_batches.append((chars, tags, sent_lens))
                except tf.errors.OutOfRangeError:
                    done = True

            print("start training ...")
            early_stop = False
            for step in range(FLAGS.max_epoch):
                if sv.should_stop():
                    run_test()
                    break
                examples = 0
                while examples < train_num:
                    if early_stop:
                        break
                    try:
                        batch = sess.run(train_batcher.next_batch_op)
                    except tf.errors.OutOfRangeError:
                        break
                    tags, chars, sent_lens = batch
                    feed_dict = {
                        model.char_inputs: chars,
                        model.targets: tags,
                        model.dropout: FLAGS.dropout,
                        model.lengths: sent_lens.reshape(-1, ),
                    }
                    global_step, batch_loss, _ = sess.run(
                        [model.global_step, model.loss, model.train_op],
                        feed_dict)
                    print("%d iteration %d loss: %f" %
                          (step, global_step, batch_loss))

                    if global_step % FLAGS.eval_step == 0:
                        print("evaluation ...")
                        acc = run_evaluation(dev_batches)
                        print("%d iteration, %d dev acc: %f" %
                              (step, global_step, acc))
                        if best_acc - acc > 0.01:
                            print("stop training early ... best dev acc %f" %
                                  best_acc)
                            early_stop = True
                            break
                        elif best_acc < acc:
                            best_acc = acc
                            sv.saver.save(sess, FLAGS.out_dir + "model",
                                          global_step=global_step)
                            print("%d iteration, %d global step best dev acc: %f" %
                                  (step, global_step, best_acc))

                    loss.append(batch_loss)
                    examples += FLAGS.batch_size

            sv.saver.save(sess, FLAGS.out_dir + "model",
                          global_step=global_step)
            run_test()
            sv.coord.request_stop()
            sv.coord.join(threads)
            sess.close()
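# Standalone illustration (with made-up shapes, not part of the original
# script) of the decode-time trick used in run_evaluation above: a pad column
# makes the artificial "start" tag unreachable mid-sentence, and a prepended
# start row forces Viterbi decoding to begin from that tag.
import numpy as np
import tensorflow as tf  # TF 1.x, for tf.contrib.crf.viterbi_decode

num_tags, length, small = 3, 4, -1000.0
logits = np.random.rand(length, num_tags)           # per-token emission scores
trans = np.random.rand(num_tags + 1, num_tags + 1)  # transitions incl. the start tag

pad = small * np.ones([length, 1])                # start tag scored as impossible mid-sentence
start = np.asarray([[small] * num_tags + [0.0]])  # at t=0 only the start tag is allowed
score = np.concatenate([start, np.concatenate([logits, pad], axis=1)], axis=0)

path, _ = tf.contrib.crf.viterbi_decode(score, trans)
print(path[1:])  # drop the artificial start step to recover the tag sequence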
def main(argv):
    model_class = get_model()
    size_map = load_size_file(FLAGS.size_file)
    vocab_size = size_map.get("vocab_size")
    num_class = size_map.get("num_tag")
    num_train = size_map.get("train_num")

    model_conf = init_config(vocab_size, num_class)
    print(model_conf)
    _, id_to_word = load_vocab(FLAGS.vocab_file)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        wv = word2vec.Word2vec()
        embed = wv.load_w2v_array(FLAGS.embedding_file, id_to_word)
        print("embedding shape:", embed.shape)
        word_embedding = tf.constant(embed, dtype=tf.float32)
        model = model_class(model_conf, word_embedding)

        train_batcher = SegBatcher(FLAGS.train_file, FLAGS.batch_size,
                                   num_epochs=FLAGS.max_epoch)
        test_batcher = SegBatcher(FLAGS.test_file, FLAGS.batch_size,
                                  num_epochs=1)

        print("train_file ====>", FLAGS.train_file)
        print("test_file =====>", FLAGS.test_file)
        print("batch size ====>", FLAGS.batch_size)
        print("max epoch =====>", FLAGS.max_epoch)

        loss_summary = tf.summary.scalar("loss", model.loss_val)
        acc_summary = tf.summary.scalar("accuracy", model.accuracy)
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])

        # Variable initialization is handled by the Supervisor below.
        sv = tf.train.Supervisor(logdir=FLAGS.out_dir,
                                 save_model_secs=0,
                                 save_summaries_secs=0)
        with sv.managed_session(config=session_conf) as sess:
            threads = tf.train.start_queue_runners(sess=sess)

            test_batches = []
            done = False
            print("load all test batches to memory")
            while not done:
                try:
                    words, labels = sess.run(test_batcher.next_batch_op)
                    test_batches.append((words, labels))
                except tf.errors.OutOfRangeError:
                    done = True

            def run_eval(batches):
                print("eval ...")
                true_labels = []
                pred_labels = []
                for words, label in batches:
                    feed_dict = {
                        model.input_x: words,
                        model.input_y: label,
                        model.dropout_keep_prob: 1.0,
                    }
                    predictions, acc = sess.run(
                        [model.predictions, model.accuracy],
                        feed_dict=feed_dict)
                    pred_labels.append(predictions)
                    # Labels are one-hot; recover the class index.
                    label = np.argmax(label, axis=1)
                    true_labels.append(label)
                true_labels = np.concatenate(true_labels, axis=0)
                pred_labels = np.concatenate(pred_labels, axis=0)
                report = classification_report(true_labels, pred_labels)
                print(report)
                acc = accuracy_score(true_labels, pred_labels)
                return acc

            best_acc = 0.0
            for epoch in range(FLAGS.max_epoch):
                if sv.should_stop():
                    # todo test
                    print("stop ...")
                    break
                examples = 0
                while examples < num_train:
                    try:
                        batch = sess.run(train_batcher.next_batch_op)
                    except Exception as e:
                        print(e)
                        exit(0)
                    words, label = batch
                    feed_dict = {
                        model.input_x: words,
                        model.input_y: label,
                        model.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, loss, accuracy, summaries = sess.run([
                        model.train_op, model.global_step, model.loss_val,
                        model.accuracy, train_summary_op
                    ], feed_dict)
                    examples += len(words)
                    time_str = datetime.datetime.now().isoformat()

                    if step % FLAGS.eval_step == 0:
                        acc = run_eval(test_batches)
                        if acc < best_acc:
                            # Accuracy dropped: save the last model and stop early.
                            sv.saver.save(sess,
                                          os.path.join(FLAGS.out_dir, "model"),
                                          global_step=step)
                            print("early stopped ...")
                            sv.stop()
                            break
                        else:
                            best_acc = acc
                            sv.saver.save(
                                sess,
                                os.path.join(FLAGS.out_dir, "model"),
                                global_step=step,
                            )
                        print("{}: test acc {:g}, best acc {:g}".format(
                            time_str, acc, best_acc))

                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

            sv.coord.request_stop()
            sv.coord.join(threads)
            sess.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import logging

import gensim

import utils
import word2vec

logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

dname = '/media/robert/dataThesis/tekst/'
# utils.encodeTextToUTF8(dname)

sentences = gensim.models.word2vec.LineSentence(dname,
                                                max_sentence_length=150,
                                                limit=1000)
w2v = word2vec.Word2vec()
model = w2v.trainModel(sentences, False)

mname = "/home/robert/data/gensimModel.bin"
w2v.saveModel(model, mname)
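# The Word2vec wrapper is defined elsewhere; this is a minimal sketch of what
# trainModel/saveModel plausibly wrap, assuming gensim 3.x and taking the
# second argument as a skip-gram/CBOW toggle (the real wrapper may differ):
class Word2vec(object):
    def trainModel(self, sentences, use_skipgram):
        return gensim.models.Word2Vec(sentences,
                                      sg=1 if use_skipgram else 0,
                                      size=300,
                                      window=5,
                                      min_count=5,
                                      workers=4)

    def saveModel(self, model, path):
        # Save in the binary word2vec format readable by other tools.
        model.wv.save_word2vec_format(path, binary=True)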
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1,
                      "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file",
                       "../twitter-datasets/train_pos.txt",
                       "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file",
                       "../twitter-datasets/train_neg.txt",
                       "Data source for the negative data.")
tf.flags.DEFINE_string("eval_data_file",
                       "../twitter-datasets/test_data.txt",
                       "Data source for the evaluation.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128,
                        "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5",
                       "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 64,
                        "Number of filters per filter size (default: 64)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5,
                      "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0,
                      "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 1,
                        "Number of training epochs (default: 1)")
tf.flags.DEFINE_integer(
    "evaluate_every", 200,
    "Evaluate model on dev set after this many steps (default: 200)")
tf.flags.DEFINE_integer("checkpoint_every", 100,
                        "Save model after this many steps (default: 100)")
tf.flags.DEFINE_string(
    "w2v_path", "",
    "Path to precomputed word2vec vectors (default: none is used)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# Load the precomputed word2vec vectors only when a path is given.
w2v = word2vec.Word2vec(FLAGS.w2v_path) if FLAGS.w2v_path != "" else None
train(FLAGS, w2v)
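# For reference (a guess, since train() is defined elsewhere): the
# comma-separated filter_sizes flag is presumably parsed into integers inside
# train(), along the lines of:
filter_sizes = list(map(int, FLAGS.filter_sizes.split(",")))  # "3,4,5" -> [3, 4, 5]

# Hypothetical example invocation (the script name is assumed):
#   python train.py --w2v_path=./w2v.bin --num_filters=128 --filter_sizes=3,4,5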