Example No. 1
def read_data(self, source):
    df = pd.read_csv(source, sep='\t', header=None)
    df.columns = [
        "polarity", "aspect_category", "target_term", "character_offset",
        "sentence"
    ]
    df["label"] = df["polarity"].apply(lambda x: 1 if x == "positive" else
                                       (0 if x == "neutral" else -1))
    # Remove the target term from each sentence using its "start:end" character offset.
    sentence_red = []
    for i in range(len(df)):
        start, end = (int(x) for x in df["character_offset"][i].split(":"))
        sentence_red.append(df["sentence"][i][:start] + df["sentence"][i][end:])
    df["sentence_red"] = sentence_red
    # Remove stopwords.
    df["sentence_red"] = df["sentence_red"].apply(self.remove_stopwords)
    # word2vec embeddings: download the fastText vectors if they are not cached locally.
    PATH_TO_DATA = Path(
        'C:/Users/Armand/Desktop/3A/Deep Learning/nlp_project/nlp_project/'
    )
    en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
    if not en_embeddings_path.exists():
        urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
            en_embeddings_path)
    w2vec = word2vec.Word2vec(en_embeddings_path, vocab_size=50000)
    sentence2vec = word2vec.BagOfWords(w2vec)
    sentences_emb = [sentence2vec.encode(s) for s in df["sentence_red"]]
    return sentences_emb, df["label"]
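A minimal usage sketch for the method above (hypothetical: Classifier and the file name are placeholders, and the class is assumed to also define remove_stopwords):

clf = Classifier()  # hypothetical class that defines read_data and remove_stopwords
sentences_emb, labels = clf.read_data("traindata.csv")  # tab-separated file with the five columns listed above
print(len(sentences_emb), "bag-of-words sentence embeddings")
print(labels.value_counts())  # distribution of the -1 / 0 / 1 labels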
Example No. 2
tf.flags.DEFINE_integer("num_epochs", 1, "Number of training epochs (default: 1)")
tf.flags.DEFINE_integer("evaluate_every", 2545, "Evaluate model on dev set after this many steps (default: 2545)")
tf.flags.DEFINE_integer("checkpoint_every", 2500, "Save model after this many steps (default: 2500)")
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation (default: 0.1)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
# Eval Parameters
tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# Load the word2vec model (the program assumes it has already been built and will complain if it has not)
w2v = word2vec.Word2vec(FLAGS.w2v_path)
num_filters = [64, 128, 200, 300, 400, 500]
filter_sizes = ["2", "3", "4", "5", "6", "2,3,4", "3,4,5", "4,5,6", "2,3,4,5", "3,4,5,6"]

# Grid search over filter sizes and filter counts; append each result to GS.txt.
for fs in filter_sizes:
    for nf in num_filters:
        FLAGS.filter_sizes = fs
        FLAGS.num_filters = nf
        _, loss, accuracy = train(FLAGS, w2v)
        s = "%s %s %s %s\n" % (FLAGS.num_filters, FLAGS.filter_sizes, loss, accuracy)
        print(s)
        with open('GS.txt', 'a') as f:
            f.write(s)
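For reference, a small sketch for reading the GS.txt log written above and picking the best run; the four space-separated columns (num_filters, filter_sizes, loss, accuracy) come from the loop, everything else is an assumption:

# Scan the grid-search log and keep the line with the highest accuracy.
best = None
with open("GS.txt") as f:
    for line in f:
        parts = line.split()
        if len(parts) != 4:
            continue  # skip blank or malformed lines
        nf, fs, loss, acc = parts
        if best is None or float(acc) > float(best[3]):
            best = (nf, fs, loss, acc)

if best is not None:
    print("best run: num_filters=%s filter_sizes=%s loss=%s accuracy=%s" % best)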

Example No. 3
def main(argv):
    # todo create map file
    word_to_id, tag_to_id, id_to_tag = data_utils.load_map_file(FLAGS.map_file)
    id_to_word = {v: k for k, v in word_to_id.items()}

    num_dict = data_utils.load_size_file(FLAGS.size_file)
    train_num = num_dict["train_num"]
    dev_num = num_dict["dev_num"]
    test_num = num_dict['test_num']

    model_config = init_mode_config(len(word_to_id), len(tag_to_id))
    print(model_config)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    with tf.Graph().as_default():

        print("load pre word2vec ...")
        wv = word2vec.Word2vec()
        embed = wv.load_w2v_array(FLAGS.pre_embedding_file, id_to_word)

        word_embedding = tf.constant(embed, dtype=tf.float32)
        model = BiLSTM(model_config, word_embedding)
        train_batcher = SegBatcher(FLAGS.train_file,
                                   FLAGS.batch_size,
                                   num_epochs=FLAGS.max_epoch)
        dev_batcher = SegBatcher(FLAGS.dev_file,
                                 FLAGS.batch_size,
                                 num_epochs=1)
        test_batcher = SegBatcher(FLAGS.test_file,
                                  FLAGS.batch_size,
                                  num_epochs=1)

        tf.global_variables_initializer()  # not run explicitly; the Supervisor below handles initialization
        sv = tf.train.Supervisor(
            logdir=FLAGS.out_dir,
            save_model_secs=FLAGS.save_model_secs,
        )

        with sv.managed_session(config=tf_config) as sess:
            sess.as_default()
            threads = tf.train.start_queue_runners(sess=sess)
            loss = []

            def run_evaluation(dev_batches, report=False):
                """
                Evaluates model on a dev set
                """
                preds = []
                true_tags = []
                tmp_x = []
                for x_batch, y_batch, sent_len in dev_batches:
                    feed_dict = {
                        model.char_inputs: x_batch,
                        model.targets: y_batch,
                        model.lengths: sent_len.reshape(-1, ),
                        model.dropout: 1.0
                    }

                    step, loss, logits, lengths, trans = sess.run([
                        model.global_step, model.loss, model.logits,
                        model.lengths, model.trans
                    ], feed_dict)

                    index = 0
                    small = -1000.0
                    start = np.asarray([[small] * model_config["num_tags"] +
                                        [0]])

                    for score, length in zip(logits, lengths):
                        score = score[:length]
                        pad = small * np.ones([length, 1])
                        logit = np.concatenate([score, pad], axis=1)
                        logit = np.concatenate([start, logit], axis=0)
                        path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
                        preds.append(path[1:])
                        tmp_x.append(x_batch[index][:length])
                        index += 1

                    for y, length in zip(y_batch, lengths):
                        y = y.tolist()
                        true_tags.append(y[:length])

                if FLAGS.debug and len(tmp_x) > 5:
                    print(tag_to_id)

                    for j in range(5):
                        sent = [id_to_word.get(i, "<OOV>") for i in tmp_x[j]]
                        print("".join(sent))
                        print("pred:", preds[j])
                        print("true:", true_tags[j])

                preds = np.concatenate(preds, axis=0)
                true_tags = np.concatenate(true_tags, axis=0)

                if report:
                    print(classification_report(true_tags, preds))

                acc = accuracy_score(true_tags, preds)
                return acc

            def run_test():
                print("start run test ......")
                test_batches = []
                done = False
                print("load all test batches to memory")

                while not done:
                    try:
                        tags, chars, sent_lens = sess.run(
                            test_batcher.next_batch_op)
                        test_batches.append((chars, tags, sent_lens))
                    except tf.errors.OutOfRangeError:
                        # test queue exhausted
                        done = True
                test_acc = run_evaluation(test_batches, True)
                print("test accc %f" % (test_acc))

            best_acc = 0.0
            dev_batches = []
            done = False
            print("load all dev batches to memory")

            while not done:
                try:
                    tags, chars, sent_lens = sess.run(
                        dev_batcher.next_batch_op)
                    dev_batches.append((chars, tags, sent_lens))
                except tf.errors.OutOfRangeError:
                    # dev queue exhausted
                    done = True

            print("start training ...")
            early_stop = False
            for step in range(FLAGS.max_epoch):
                if sv.should_stop():
                    run_test()
                    break
                examples = 0

                while examples < train_num:
                    if early_stop:
                        break
                    try:
                        batch = sess.run(train_batcher.next_batch_op)
                    except tf.errors.OutOfRangeError:
                        # training queue exhausted for this epoch
                        break

                    tags, chars, sent_lens = batch
                    feed_dict = {
                        model.char_inputs: chars,
                        model.targets: tags,
                        model.dropout: FLAGS.dropout,
                        model.lengths: sent_lens.reshape(-1, ),
                    }
                    global_step, batch_loss, _ = sess.run(
                        [model.global_step, model.loss, model.train_op],
                        feed_dict)

                    print("%d iteration %d loss: %f" %
                          (step, global_step, batch_loss))
                    if global_step % FLAGS.eval_step == 0:
                        print("evaluation .......")
                        acc = run_evaluation(dev_batches)

                        print("%d iteration , %d dev acc: %f " %
                              (step, global_step, acc))

                        if best_acc - acc > 0.01:
                            print("stop training early ... best dev acc %f" %
                                  best_acc)
                            early_stop = True
                            break

                        elif best_acc < acc:
                            best_acc = acc
                            sv.saver.save(sess,
                                          FLAGS.out_dir + "model",
                                          global_step=global_step)
                            print(
                                "%d iteration , %d global step best dev acc: %f "
                                % (step, global_step, best_acc))

                    loss.append(batch_loss)
                    examples += FLAGS.batch_size

            sv.saver.save(sess,
                          FLAGS.out_dir + "model",
                          global_step=global_step)
            run_test()
        sv.coord.request_stop()
        sv.coord.join(threads)
        sess.close()
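Both this example and the next one wrap the pretrained matrix in a tf.constant and hand it to the model; a hedged sketch of how such a model typically turns word-id inputs into vectors (toy data, not the BiLSTM code itself):

import numpy as np
import tensorflow as tf

# Stand-in for wv.load_w2v_array(...): 5 words, 3 dimensions.
embed = np.random.rand(5, 3).astype(np.float32)
word_embedding = tf.constant(embed, dtype=tf.float32)

char_inputs = tf.placeholder(tf.int32, shape=[None, None])      # batch of word-id sequences
embedded = tf.nn.embedding_lookup(word_embedding, char_inputs)  # shape [batch, time, 3]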
Example No. 4
def main(argv):
    model_class = get_model()

    size_map = load_size_file(FLAGS.size_file)
    vocab_size = size_map.get('vocab_size')
    num_class = size_map.get("num_tag")
    num_train = size_map.get("train_num")

    model_conf = init_config(vocab_size, num_class)
    print(model_conf)

    _, id_to_word = load_vocab(FLAGS.vocab_file)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        wv = word2vec.Word2vec()
        embed = wv.load_w2v_array(FLAGS.embedding_file, id_to_word)
        print("embeding shape:", embed.shape)
        word_embedding = tf.constant(embed, dtype=tf.float32)
        model = model_class(model_conf, word_embedding)

        train_batcher = SegBatcher(FLAGS.train_file,
                                   FLAGS.batch_size,
                                   num_epochs=FLAGS.max_epoch)
        test_batcher = SegBatcher(FLAGS.test_file,
                                  FLAGS.batch_size,
                                  num_epochs=1)

        print("train_file ====> ", FLAGS.train_file)
        print("test_file =====>", FLAGS.test_file)
        print("batch size =====>", FLAGS.batch_size)
        print("most epoch ======>", FLAGS.max_epoch)

        loss_summary = tf.summary.scalar("loss", model.loss_val)
        acc_summary = tf.summary.scalar("accuracy", model.accuracy)
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])

        tf.global_variables_initializer()  # not run explicitly; the Supervisor below handles initialization
        sv = tf.train.Supervisor(logdir=FLAGS.out_dir,
                                 save_model_secs=0,
                                 save_summaries_secs=0)
        with sv.managed_session(config=session_conf) as sess:
            threads = tf.train.start_queue_runners(sess=sess)
            test_batches = []
            done = False

            print("load all dev batches to memory")

            while not done:
                try:
                    words, labels = sess.run(test_batcher.next_batch_op)
                    test_batches.append((words, labels))
                except Exception as e:
                    done = True

            def run_eval(batchs):
                print("eval....")
                true_labels = []
                pred_labels = []
                for words, label in batchs:
                    feed_dict = {
                        model.input_x: words,
                        model.input_y: label,
                        model.dropout_keep_prob: 1.0,
                    }
                    predictions, acc = sess.run(
                        [model.predictions, model.accuracy],
                        feed_dict=feed_dict)
                    pred_labels.append(predictions)
                    label = np.argmax(label, axis=1)
                    true_labels.append(label)
                true_labels = np.concatenate(true_labels, axis=0)
                pred_labels = np.concatenate(pred_labels, axis=0)
                report = classification_report(true_labels, pred_labels)
                print(report)
                acc = accuracy_score(true_labels, pred_labels)
                return acc

            best_acc = 0.0
            for epoch in range(FLAGS.max_epoch):
                if sv.should_stop():
                    # todo test
                    print("stop.......")
                    break
                examples = 0
                while examples < num_train:
                    try:
                        batch = sess.run(train_batcher.next_batch_op)
                    except Exception as e:
                        print(e)
                        exit(0)

                    words, label = batch
                    feed_dict = {
                        model.input_x: words,
                        model.input_y: label,
                        model.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }

                    _, step, loss, accuracy, summaries = sess.run([
                        model.train_op, model.global_step, model.loss_val,
                        model.accuracy, train_summary_op
                    ], feed_dict)
                    examples += len(words)

                    time_str = datetime.datetime.now().isoformat()
                    if step % FLAGS.eval_step == 0:
                        # todo evaluate
                        acc = run_eval(test_batches)

                        if acc < best_acc:
                            sv.saver.save(sess,
                                          os.path.join(FLAGS.out_dir, "model"),
                                          global_step=step)
                            # todo finish
                            print("early stopping ...")
                            sv.stop()
                            break
                        else:
                            best_acc = acc
                            sv.saver.save(
                                sess,
                                os.path.join(FLAGS.out_dir, "model"),
                                global_step=step,
                            )
                        print("{}: test acc {:g}, best acc {:g}".format(
                            time_str, acc, best_acc))
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
        sv.coord.request_stop()
        sv.coord.join(threads)
        sess.close()
Example No. 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
import gensim
import utils
import os
import sys
import logging
import word2vec
import word2vecReader

#sys.setdefaultencoding()

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

dir = '/media/robert/dataThesis/tekst/'

# dname (the path to the raw text file under dir) is expected to be defined here;
# its value is not shown in this example.
# utils.encodeTextToUTF8(dname)

sentences = gensim.models.word2vec.LineSentence(dname, max_sentence_length=150, limit=1000)

w2v = word2vec.Word2vec()
model = w2v.trainModel(sentences, False)

mname = "/home/robert/data/gensimModel.bin"
w2v.saveModel(model, mname)
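A short follow-up sketch for using the saved model; it assumes w2v.saveModel wraps gensim's Word2Vec.save, which is not shown in this example:

import gensim

# Reload the trained model and query a few nearest neighbours.
model = gensim.models.Word2Vec.load("/home/robert/data/gensimModel.bin")
print(model.wv.most_similar("data", topn=5))  # raises KeyError if "data" is not in the vocabulary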
Example No. 6
# ==================================================
# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "../twitter-datasets/train_pos.txt", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "../twitter-datasets/train_neg.txt", "Data source for the negative data.")
tf.flags.DEFINE_string("eval_data_file", "../twitter-datasets/test_data.txt", "Data source for the evaluation.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 64, "Number of filters per filter size (default: 64)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 1, "Number of training epochs (default: 1)")
tf.flags.DEFINE_integer("evaluate_every", 200, "Evaluate model on dev set after this many steps (default: 200)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_string("w2v_path", "", "Path to precomputed word2vec vectors (default: empty, i.e. none is used)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

w2v = word2vec.Word2vec(FLAGS.w2v_path) if FLAGS.w2v_path != "" else None
train(FLAGS, w2v)
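As a point of reference, the comma-separated filter_sizes flag is usually split into integers inside train before the CNN is built; a standalone sketch of that conventional parsing step (not code from this repository):

def parse_filter_sizes(filter_sizes_str):
    """Turn a flag value such as "3,4,5" into [3, 4, 5]."""
    return [int(s) for s in filter_sizes_str.split(",")]

print(parse_filter_sizes("3,4,5"))  # -> [3, 4, 5]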