Example #1
def create_model(session, y, vocab, config, path, logger):
    # Create the model and reuse parameters if a checkpoint already exists.
    initializer = tf.random_uniform_initializer(-1 * config.init_scale, 1 * config.init_scale)  # uniform in [-init_scale, init_scale]
    with tf.variable_scope("bi_rnn", reuse=None, initializer=initializer):
        bi_rnn = Bi_Lstm_Model(config=config,
                               num_step=config.num_step,
                               num_classes=1,
                               vocab_size=len(vocab),
                               is_training=0)   # 0 train, 1 valid, 2 predict
    with tf.variable_scope("bi_rnn", reuse=True, initializer=initializer):
        valid_bi_rnn = Bi_Lstm_Model(config=config,
                                     num_step=config.num_step,
                                     num_classes=1,
                                     vocab_size=len(vocab),
                                     is_training=1)
        test_bi_rnn = Bi_Lstm_Model(config=config,
                                    num_step=config.num_step,
                                    num_classes=1,
                                    vocab_size=len(vocab),
                                    is_training=1)
    ckpt = tf.train.get_checkpoint_state(os.path.join(path, "checkpoints")) if path else None
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        bi_rnn.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logger.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        emb_weights = session.run(bi_rnn.embeddings.read_value())
        emb_weights = data_helpers.load_word2vec(config.word2vec_path, vocab, config.embed_dim, emb_weights)
        session.run(bi_rnn.embeddings.assign(emb_weights))
        logger.info("Load pre-trained embedding.")
    return bi_rnn, valid_bi_rnn, test_bi_rnn
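A minimal usage sketch for create_model, assuming TF 1.x and that a config object (with init_scale, num_step, word2vec_path, embed_dim) and a vocab mapping are prepared elsewhere; the wrapper below is illustrative, not part of the original repository.

import logging
import tensorflow as tf

def build_models(config, vocab, checkpoint_path=None):
    # Thin hypothetical wrapper around create_model(); config and vocab are
    # assumed to be built by the surrounding training script.
    logger = logging.getLogger("bi_rnn")
    session = tf.Session()
    train_m, valid_m, test_m = create_model(session, None, vocab, config,
                                            checkpoint_path, logger)
    return session, train_m, valid_m, test_m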
Example #2
    def __init__(self, w2v_file, pooling_type='max_min'):
        """
        Args:
            w2v_file: word2vec text file
            pooling_type: [max_min | avg | all], default 'max_min'
        """
        self.word2vec, self.vec_dim, _ = data_helpers.load_word2vec(w2v_file)
        if pooling_type == 'max_min':
            self.pooling = self.max_min_pooling
        elif pooling_type == 'avg':
            self.pooling = self.average_pooling
        else:
            self.pooling = self.all_pooling
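The max_min_pooling, average_pooling, and all_pooling methods referenced above are not included in this snippet. As a rough sketch of what max-min pooling over a sentence's word vectors typically does (an assumption, not the repository's actual code):

import numpy as np

def max_min_pooling(vectors):
    # vectors: (num_words, vec_dim) word embeddings for one sentence.
    # Concatenate element-wise max and min into a (2 * vec_dim,) sentence vector.
    v = np.asarray(vectors)
    return np.concatenate([v.max(axis=0), v.min(axis=0)])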
Example #3
def make_data(args, max_len, raw_input):
    vocab_file, _, _ = data_helpers.process_train_file(
        data_dir=args.data_dir,
        raw_input=raw_input,
        max_length=max_len,
        min_frequency=args.min_freq,
    )

    w2v, vec_dim, _ = data_helpers.load_word2vec(args.w2v_file)
    data_helpers.make_embedding_matrix(
        data_dir=args.data_dir,
        prefix=os.path.basename(raw_input),
        word2vec=w2v,
        vec_dim=vec_dim,
        vocab_file=vocab_file,
    )
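A hypothetical call site for make_data(); the flag names mirror the attributes used above (data_dir, min_freq, w2v_file), while the defaults and the input file name are placeholders.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", default="./data")
parser.add_argument("--min_freq", type=int, default=1)
parser.add_argument("--w2v_file", default="./data/word2vec.txt")
args = parser.parse_args()

# Build vocab and embedding matrix for one raw input file.
make_data(args, max_len=50, raw_input="./data/train.txt")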
Example #4
    def train(self, sess, x_text, y, split_no, FLAGS):

        vocab_processor = learn.preprocessing.VocabularyProcessor(
            self.max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        # Persist the fitted vocabulary processor so the same mapping can be
        # reloaded at prediction time.
        pickle.dump(vocab_processor,
                    open("vocabproc{}.pickle".format(split_no), "wb"))
        #vocab_processor = pickle.load(open("vocabproc{}.pickle".format(split_no), "rb"))

        topics = pickle.load(
            open("phrase3000_{}.pickle".format(split_no), "rb"))
        y = np.array(y)
        bm25 = BM25()

        t = bm25.relevance(x_text, topics, split_no)

        np.random.seed(10)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x_shuffled = x[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        t_shuffled = t[shuffle_indices]
        # Keep the raw text in the same shuffled order as x_shuffled / y_shuffled

        text_x_shuffled = []
        for index in np.nditer(shuffle_indices):
            text_x_shuffled.append(x_text[index])
        dev_sample_index = -1 * int(
            FLAGS.dev_sample_percentage * float(len(y)))

        x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
            dev_sample_index:]
        y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
            dev_sample_index:]
        t_train, t_dev = t_shuffled[:dev_sample_index], t_shuffled[
            dev_sample_index:]
        del x, y, x_shuffled, y_shuffled

        self.initW = data_helpers.load_word2vec(vocab_processor,
                                                FLAGS.embedding_dim)
        cnn = self.get_cnn(FLAGS, voca_size=len(vocab_processor.vocabulary_))
        self.train_nn(sess, cnn, x_train, x_dev, t_train, y_train, y_dev,
                      t_dev, topics)
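The negative dev_sample_index slicing above (used again in Example #6) carves the last dev_sample_percentage of the shuffled data off as a dev set. A tiny self-contained illustration:

import numpy as np

y = np.arange(10)                      # 10 shuffled samples
dev_sample_percentage = 0.2
dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))   # -2
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
print(len(y_train), len(y_dev))        # 8 2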
Example #5
def main():
    current_time = str(datetime.now().strftime('%Y%m%d_%H%M%S'))
    print(current_time)

    print('Loading data')
    texts, labels = load_data_and_labels(pos_file, neg_file)

    if W2V_file_addr is not None:
        print('Loading Word2Vec')
        # embeddings_index, embedding_dim = load_word2vec_nonbinary(W2V_file_addr)
        embeddings_index, embedding_dim = load_word2vec(W2V_file_addr)
        print('Found %s word vectors.' % len(embeddings_index))
    else:
        embeddings_index = None
        embedding_dim = 300

    checkpoint_dir = os.path.join(keras_checkpoint_dir, file_tag, current_time)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # vectorize the text samples into a 2D integer tensor
    print('Tokenizing the texts')
    tokenizer = Tokenizer(num_words=max_num_words, lower=False)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    tokenizer_addr = os.path.join(checkpoint_dir, 'tokenizer.pickle')
    with open(tokenizer_addr, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Tokenizer is saved as %s' % tokenizer_addr)
    print('Padding Sequences')
    data = pad_sequences(sequences, maxlen=max_sequence_length)

    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    num_validation_samples = int(validation_split * data.shape[0])

    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    # prepare embedding matrix
    num_words = min(max_num_words, len(word_index) + 1)
    if W2V_file_addr is not None:
        embedding_matrix = np.zeros((num_words, embedding_dim))
        for word, i in word_index.items():
            if i >= max_num_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    else:
        # No pre-trained vectors available: the (frozen) embedding matrix stays all zeros.
        embedding_matrix = np.zeros((num_words, embedding_dim))

    # Build model
    model_input = Input(shape=(max_sequence_length, ), dtype='int32')

    z = Embedding(num_words,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_sequence_length,
                  trainable=False)(model_input)
    z = Reshape((max_sequence_length, embedding_dim, 1))(z)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Conv2D(num_filters,
                      kernel_size=(sz, embedding_dim),
                      padding='valid',
                      activation='relu')(z)
        conv = MaxPool2D(pool_size=(max_sequence_length - sz + 1, 1),
                         strides=(1, 1),
                         padding='valid')(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    #z = Dense(hidden_dims, kernel_regularizer=regularizers.l2(0.0001),activity_regularizer=regularizers.l1(0.0001), activation="relu")(z)
    z = Dense(hidden_dims, activation="relu")(z)
    z = Dropout(drop)(z)
    model_output = Dense(units=2, activation='softmax')(z)

    adam = Adam(lr=learning_rate, decay=1e-6)
    # adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.00001)
    # adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0001)

    model = Model(model_input, model_output)
    model.summary()

    model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['mse', 'acc'])  # <==<==biogpu12-1
    #model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    #model.compile(optimizer=adam, loss='mean_squared_error', metrics=['mse', 'acc']) # <== best <==biogpu12-2

    # Log Dir
    log_dir = os.path.join(checkpoint_dir, '../')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    f_log = open(os.path.join(log_dir, 'result_logs_' + file_tag + '.txt'),
                 'a')
    print(
        'Log File: %s' %
        os.path.join(os.getcwd(), log_dir, 'result_logs_' + file_tag + '.txt'))
    f_log.write("\n")
    f_log.write(current_time)
    f_log.write("\n")
    f_log.write('pos_file: %s\n' % pos_file)
    f_log.write('neg_file: %s\n' % neg_file)
    f_log.write('checkpoint_folder: %s\n' % checkpoint_dir)
    f_log.write('embedding_dim = %s\n' % embedding_dim)
    f_log.write('W2V_file_addr = %s\n' % W2V_file_addr)
    f_log.write('filter_sizes = %s\n' % filter_sizes)
    f_log.write('num_filters = %s\n' % num_filters)
    f_log.write('hidden_dims = %s\n' % hidden_dims)
    f_log.write('drop = %s\n' % drop)
    f_log.write('validation_split = %s\n' % validation_split)
    f_log.write('learning_rate = %s\n' % learning_rate)
    f_log.write('epochs = %s\n' % epochs)
    f_log.write('batch_size = %s\n' % batch_size)
    f_log.write('max_num_words = %s\n' % max_num_words)
    f_log.write('max_sequence_length = %s\n' % max_sequence_length)
    f_log.flush()

    checkpoint = ModelCheckpoint(os.path.join(
        checkpoint_dir, file_tag + "_" + current_time +
        '_weights.{epoch:03d}-{val_acc:.4f}.hdf5'),
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='auto')

    print("Traning Model...")
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=2,
              callbacks=[checkpoint],
              validation_data=(x_val, y_val))  # starts training
    pred_results = model.predict(x_val)
    f1_res = f1_score(y_val.argmax(axis=1), pred_results.argmax(axis=1))
    precision_res = precision_score(y_val.argmax(axis=1),
                                    pred_results.argmax(axis=1))
    recall_res = recall_score(y_val.argmax(axis=1),
                              pred_results.argmax(axis=1))

    print("\n")
    print("F1:\t%s" % f1_res)
    print("Precision:\t%s" % precision_res)
    print("recall:\t%s" % recall_res)

    f_log.write("F1:\t%s\n" % f1_res)
    f_log.write("Precision:\t%s\n" % precision_res)
    f_log.write("recall:\t%s\n" % recall_res)
    f_log.write("\n\n")
    f_log.flush()
    f_log.close()

    model.save(os.path.join(
        checkpoint_dir, 'final_model.h5'))  # creates an HDF5 file 'final_model.h5'
    del model  # deletes the existing model

    print("For evaluation, please use the following checkpoint : %s " %
          checkpoint_dir)
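A hedged sketch of reloading the saved artifacts for inference; the file names (tokenizer.pickle, final_model.h5) match what main() writes above, while the helper itself and its arguments are assumptions.

import os
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

def predict_texts(checkpoint_dir, texts, max_sequence_length):
    # Reload the fitted tokenizer and the final saved model, then classify texts.
    with open(os.path.join(checkpoint_dir, 'tokenizer.pickle'), 'rb') as handle:
        tokenizer = pickle.load(handle)
    model = load_model(os.path.join(checkpoint_dir, 'final_model.h5'))
    data = pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=max_sequence_length)
    return model.predict(data).argmax(axis=1)   # predicted class (0/1) per text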
Example #6

        # Split train/test set
        # TODO: This is very crude, should use cross-validation
        dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
        x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
        y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

        x_dev_text = text_x_shuffled[dev_sample_index:]
        pickle.dump(x_dev_text, open("dev_x_text.pickle","wb"))
        del x, y, x_shuffled, y_shuffled

        #print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
        #print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

        initW = data_helpers.load_word2vec(vocab_processor, FLAGS.embedding_dim)

        # Training
        # ==================================================

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
              allow_soft_placement=FLAGS.allow_soft_placement,
              log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                cnn = TextCNN(
                    sequence_length=x_train.shape[1],
                    num_classes=y_train.shape[1],
                    vocab_size=len(vocab_processor.vocabulary_),
                    batch_size=FLAGS.batch_size,
Example #7
import sys
import pickle
import data_helpers

if __name__ == "__main__":
    w2v_file = sys.argv[1]

    print("Loading data ...")
    pos_file, neg_file = "data/rt-polarity.pos", "data/rt-polarity.neg"
    x_tokenized, y, vocab = data_helpers.load_data(pos_file, neg_file)
    print("Data loaded!")
    print("Vocabulary Size: {}".format(len(vocab)))
    print("Number of Samples: {}".format(len(y)))

    print("Load word2vec ...")
    w2v = data_helpers.load_word2vec(w2v_file, vocab)
    print("Word2vec loaded!")

    print("Add unknown word...")
    data_helpers.add_unknown_words(w2v, vocab)
    print("Unkown word loaded!")

    print("Build pretrained embedding filter...")
    word2index, pretrained_embedding_filter = data_helpers.get_pretrained_embedding_filter(
        w2v)
    x = data_helpers.index_data(x_tokenized, word2index)
    print("Pretrained embedding filter built!")

    pickle.dump([x, y, pretrained_embedding_filter, word2index],
                open("data.p", "wb"))
Example #8
    freply3 = "de_replies.txt"
    freply4 = "tfidf_replies.txt"
    freply5 = "true.txt"
    # Path to word2vec weights
    fqword2vec = 'GoogleNews-vectors-negative300.txt'
    frword2vec = 'GoogleNews-vectors-negative300.txt'

    print("Processing training files")
    process_train_file(processed_data_dir, fquery, query_max_length)
    process_train_file(processed_data_dir, freply1, reply_max_length)
    process_train_file(processed_data_dir, freply2, reply_max_length)
    process_train_file(processed_data_dir, freply3, reply_max_length)
    process_train_file(processed_data_dir, freply4, reply_max_length)
    process_train_file(processed_data_dir, freply5, reply_max_length)

    fqvocab = '%s.vocab%d'%(fquery, query_max_length)
    frvocab1 = '%s.vocab%d'%(freply1, reply_max_length)
    frvocab2 = '%s.vocab%d'%(freply2, reply_max_length)
    frvocab3 = '%s.vocab%d'%(freply3, reply_max_length)
    frvocab4 = '%s.vocab%d'%(freply4, reply_max_length)
    frvocab5 = '%s.vocab%d'%(freply5, reply_max_length)
    word2vec, vec_dim, _ = load_word2vec(word2vec_dir, fqword2vec)
    make_embedding_matrix(processed_data_dir, fquery, word2vec, vec_dim, fqvocab)

    make_embedding_matrix(processed_data_dir, freply1, word2vec, vec_dim, frvocab1)
    make_embedding_matrix(processed_data_dir, freply2, word2vec, vec_dim, frvocab2)
    make_embedding_matrix(processed_data_dir, freply3, word2vec, vec_dim, frvocab3)
    make_embedding_matrix(processed_data_dir, freply4, word2vec, vec_dim, frvocab4)
    make_embedding_matrix(processed_data_dir, freply5, word2vec, vec_dim, frvocab5)
    pass
Example #9
    #make sure embed and vocab file paths are correct
    raw_data_dir = "./data"
    process_train_file(processed_train_dir, fquery_train, query_max_length)
    process_train_file(processed_train_dir, sub_query, query_max_length)

    process_train_file(processed_train_dir, freply_train, reply_max_length)
    process_train_file(processed_train_dir, true_reply, reply_max_length)
    process_train_file(processed_train_dir, sub_reply, reply_max_length)

    fqvocab = '%s.vocab%d' % (fquery_train, query_max_length)
    fqsvocab = '%s.vocab%d' % (sub_query, query_max_length)

    frvocab = '%s.vocab%d' % (freply_train, reply_max_length)
    frtvocab = '%s.vocab%d' % (true_reply, reply_max_length)
    frsvocab = '%s.vocab%d' % (sub_reply, reply_max_length)

    word2vec, vec_dim, _ = load_word2vec(raw_data_dir, fqword2vec)
    make_embedding_matrix(processed_train_dir, fquery_train, word2vec, vec_dim,
                          fqvocab)
    make_embedding_matrix(processed_train_dir, sub_query, word2vec, vec_dim,
                          fqsvocab)

    make_embedding_matrix(processed_train_dir, freply_train, word2vec, vec_dim,
                          frvocab)
    make_embedding_matrix(processed_train_dir, true_reply, word2vec, vec_dim,
                          frtvocab)
    make_embedding_matrix(processed_train_dir, sub_reply, word2vec, vec_dim,
                          frsvocab)

    pass
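In Examples #8 and #9, load_word2vec returns a (word2vec, vec_dim, _) triple. Below is a minimal sketch of a text-format loader consistent with that return contract; it is an assumption about the helper's behaviour, not its actual source.

import os
import numpy as np

def load_word2vec_txt(data_dir, fname):
    # Text format: one line per word, "word v1 v2 ... vN".
    word2vec, vec_dim = {}, 0
    with open(os.path.join(data_dir, fname), encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) < 3:          # skip a possible "count dim" header line
                continue
            word2vec[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
            vec_dim = len(parts) - 1
    return word2vec, vec_dim, len(word2vec)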