Example #1
def process():
    x_text1, x_text2, y_train = datahelper.load_data(FLAGS.en_train,
                                                     FLAGS.sp_train)
    x_text = np.concatenate([x_text1, x_text2], axis=0)
    word2index, index2word = datahelper.create_vocabulary(x_text)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding()

    max_len = max([len(x.split(" ")) for x in x_text])

    x_text1_int = []
    x_text2_int = []
    for line in x_text1:
        line_list = datahelper.text_to_wordlist(line)
        line_list = line_list.split(" ")
        text = [word2index.get(x, UNK_ID) for x in line_list]
        x_text1_int.append(text)

    for line in x_text2:
        line_list = datahelper.text_to_wordlist(line)
        line_list = line_list.split(" ")
        text = [word2index.get(x, UNK_ID) for x in line_list]
        x_text2_int.append(text)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    dev_sample_index = -1 * int(
        FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[
        dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
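The call to pad_sequences above has no visible import; it is presumably keras.preprocessing.sequence.pad_sequences or a small local helper. For reference, a minimal sketch of such a helper, assuming zero post-padding and truncation to max_len (not necessarily the original implementation):

import numpy as np


def pad_sequences(sequences, max_len, pad_value=0):
    # Pad (or truncate) each list of word ids to a fixed length.
    padded = np.full((len(sequences), max_len), pad_value, dtype=np.int32)
    for i, seq in enumerate(sequences):
        trimmed = seq[:max_len]              # truncate overly long sequences
        padded[i, :len(trimmed)] = trimmed   # left-align tokens, zero-pad the rest
    return padded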
Example #2
def process():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train,
                                                 filepath_sp_train)

    # stop_word = list(open(file_stop_word, "r", encoding='UTF-8').readlines())
    # stop_word_list = [
    #     line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "").replace("¿", "").replace("!",
    #                                                                                                        "").replace(
    #         "¡", "").lower() for
    #     line in
    #     stop_word]
    train_data = np.concatenate([x_train1, x_train2], axis=0)
    d2c_list = []
    for line in train_data:
        # line_list = [x for x in line if x not in stop_word_list]
        line = datahelper.text_to_wordlist(line,
                                           remove_stop_words=True,
                                           stem_words=False).split(" ")
        d2c_list.append(line)

    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for doc_id, record in enumerate(d2c_list):
        qid = 'SENT_%s' % doc_id
        words = record
        words_text = " ".join(words)
        words = gensim.utils.simple_preprocess(words_text)
        tags = [qid]
        alldocuments.append(analyzedDocument(words, tags))
    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    model_4 = gensim.models.Doc2Vec(alldocuments,
                                    dm=1,
                                    dm_concat=1,
                                    vector_size=300,
                                    window=5,
                                    min_count=2,
                                    epochs=100)
    model_4.save("%s" % (saved_model_name))
    print("model training completed : %s" % (saved_model_name))
def makeFeature():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train,
                                                 filepath_sp_train)

    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vector')
    train = pd.DataFrame()
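    # NOTE: doc2vec_model and model below are assumed to be a helper module and the
    # trained Doc2Vec model loaded elsewhere; neither is defined in this snippet.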
    train['doc2vec_train1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train1
    ]
    train['doc2vec_train2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train2
    ]
    print('get six kinds of coefficient about vector')

    train['cosine1'] = train.apply(
        lambda x: Cosine(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['manhatton1'] = train.apply(
        lambda x: Manhatton(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['euclidean1'] = train.apply(
        lambda x: Euclidean(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['pearson1'] = train.apply(
        lambda x: PearsonSimilar(x['doc2vec_train1'], x['doc2vec_train2']),
        axis=1)
    train['spearman1'] = train.apply(
        lambda x: SpearmanSimilar(x['doc2vec_train1'], x['doc2vec_train2']),
        axis=1)
    train['kendall1'] = train.apply(
        lambda x: KendallSimilar(x['doc2vec_train1'], x['doc2vec_train2']),
        axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)
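Cosine, Manhatton, Euclidean, PearsonSimilar, SpearmanSimilar and KendallSimilar are project helpers that are not shown here; the sketch below gives plausible scipy-based equivalents (the originals may differ in detail, e.g. returning similarities rather than distances):

import numpy as np
from scipy.spatial.distance import cosine, cityblock, euclidean
from scipy.stats import pearsonr, spearmanr, kendalltau


def Cosine(v1, v2):
    return cosine(np.asarray(v1), np.asarray(v2))


def Manhatton(v1, v2):
    return cityblock(np.asarray(v1), np.asarray(v2))


def Euclidean(v1, v2):
    return euclidean(np.asarray(v1), np.asarray(v2))


def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]


def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]


def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]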
Example #4
from text_match.en.data_utils import datahelper
import pandas as pd
import hashlib

# filepath_unlabel = "E:\\CIKM2018\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
# w2v_pah = "E:\\CIKM2018\\w2v.model.bin"
# fast_path = "E:\\CIKM2018\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
# file_stop_word = "E:\\CIKM2018\\spanish_stop_word.txt"
filepath_en_train = "I:\\CIKM\\cikm_english_train_20180516\\cikm_english_train_20180516.txt"
filepath_sp_train = "I:\\CIKM\\cikm_spanish_train_20180516.txt"
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_pah = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"

x_train1, x_train2, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)

train = pd.DataFrame()
test = pd.DataFrame()

train['question1'] = x_train1
train['question2'] = x_train2


# Generating a graph of Questions and their neighbors
qid_graph = {}  # question hash -> list of neighboring question hashes


def generate_qid_graph_table(row):
    hash_key1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    hash_key2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()

    qid_graph.setdefault(hash_key1, []).append(hash_key2)
    qid_graph.setdefault(hash_key2, []).append(hash_key1)
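Only the graph-building step is defined above; a sketch of how such a graph is typically consumed, assuming the usual "magic feature" recipe of counting common neighbors per question pair (get_neighbor_intersection is a hypothetical helper name):

def get_neighbor_intersection(row):
    # Number of questions that appear as neighbors of both members of the pair.
    h1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    h2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
    return len(set(qid_graph[h1]).intersection(qid_graph[h2]))


# Build the graph first, then derive the intersection feature per row.
train.apply(generate_qid_graph_table, axis=1)
train['q_neighbor_intersection'] = train.apply(get_neighbor_intersection, axis=1)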
Example #5
    "I:\\nlp_semantics\\text_match\\en\\features\\word2ve\\train_weight_tfidf.csv"
)
train_w2v.drop(['question1', 'question2', 'q1_unique', 'q2_unique'],
               axis=1,
               inplace=True)

train_doc2vec = pd.read_csv(
    "I:\\nlp_semantics\\text_match\\en\\features\\doc2vec\\train_doc2vec1.csv")
train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True)

train = pd.concat([
    train_bag, train_magic1, train_magic2, train_freq, train_ngram,
    train_simple, train_weight, train_page, train_w2v, train_doc2vec
],
                  axis=1)
_, _, y_train = datahelper.load_data(filepath_en_train, filepath_sp_train)
import numpy as np

sums = np.sum(y_train, axis=0)

print(sums / len(y_train))

from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(train.values,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=0)
import lightgbm as lgb

train_input = lgb.Dataset(x_train, y_train)
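The snippet ends after building the LightGBM Dataset; a minimal sketch of the likely next step, assuming a standard binary-classification setup (the parameter values are illustrative, not taken from the original code):

dev_input = lgb.Dataset(x_dev, y_dev, reference=train_input)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
}

booster = lgb.train(params,
                    train_input,
                    num_boost_round=1000,
                    valid_sets=[dev_input])

dev_pred = booster.predict(x_dev)  # predicted probabilities on the held-out split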