import datetime
import time
from collections import namedtuple

import gensim

from CIKM.datautils import datahelper


def process():
    _, _, _, _, w2v_list, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    # Build the Spanish stop-word list: strip punctuation and lower-case each entry.
    stop_word = list(open(file_stop_word, "r", encoding='UTF-8').readlines())
    stop_word_list = [
        line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
            .replace("¿", "").replace("!", "").replace("¡", "").lower()
        for line in stop_word
    ]
    d2c_list = []
    for line in w2v_list:
        # Optional stop-word filtering, currently disabled:
        # line = [x for x in line if x not in stop_word_list]
        d2c_list.append(line)
    # Wrap each sentence as a tagged document ("SENT_<i>") for Doc2Vec.
    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for idx, record in enumerate(d2c_list):
        qid = 'SENT_%s' % idx
        words = gensim.utils.simple_preprocess(" ".join(record))
        alldocuments.append(analyzedDocument(words, [qid]))
    print("Start Training Doc2Vec Time : %s" % str(datetime.datetime.now()))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1, vector_size=300,
                                    window=5, min_count=2, epochs=100)
    model_4.save(saved_model_name)
    print("model training completed : %s" % saved_model_name)
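# Usage sketch for the model trained above. The saved file name carries a
# timestamp, so "doc_2_vec_1528000000" is a placeholder, not the real name.
# infer_vector() is the standard gensim call for embedding an unseen sentence.
import gensim

model = gensim.models.Doc2Vec.load("doc_2_vec_1528000000")  # placeholder file name
tokens = gensim.utils.simple_preprocess("como estas hoy")
vector = model.infer_vector(tokens)  # 300-dim, matching vector_size above
print(vector[:5])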
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # assumed source of pad_sequences


def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max(len(x.split(" ")) for x in x_train_reshape)
    test1, test2 = datahelper.load_testdata(filepath_test)

    def to_index(lines):
        # Map each space-separated token to its vocabulary id (UNK_ID if unseen).
        return [[word2index.get(x, UNK_ID) for x in line.split(" ")] for line in lines]

    x_text1_int = to_index(x_text1)
    x_text2_int = to_index(x_text2)
    test1_int = to_index(test1)
    test2_int = to_index(test2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)
    x_test1 = pad_sequences(test1_int, max_len)
    x_test2 = pad_sequences(test2_int, max_len)

    # Shuffle, then hold out the last dev_sample_percentage of the data as a dev set.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x_text1, x_text2, x_text1_int, x_text2_int
    # Note: this variant returns the full shuffled question sets for training;
    # of the dev slices computed above, only x_dev2 is passed back.
    return (x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size, x_test1, x_test2)
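# Usage sketch (assumption: FLAGS, filepath_test and UNK_ID are configured at
# module level, as elsewhere in this repo). Shows how a caller unpacks process().
(x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev,
 word_embedding, max_len, vocab_size, x_test1, x_test2) = process()
print("train pairs: %d  max_len: %d  vocab: %d" % (len(y_shuffled), max_len, vocab_size))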
def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max(len(x.split(" ")) for x in x_train_reshape)

    # Build the Spanish stop-word list: strip punctuation and lower-case each entry.
    stop_word = list(open(FLAGS.stop_word, "r", encoding='UTF-8').readlines())
    stop_word_list = [
        line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
            .replace("¿", "").replace("!", "").replace("¡", "").lower()
        for line in stop_word
    ]

    def to_index(lines):
        # Drop stop words, then map tokens to vocabulary ids (UNK_ID if unseen).
        out = []
        for line in lines:
            line_list = [x for x in line.split(" ") if x not in stop_word_list]
            out.append([word2index.get(x, UNK_ID) for x in line_list])
        return out

    x_text1_int = to_index(x_text1)
    x_text2_int = to_index(x_text2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    # Shuffle, then hold out the last dev_sample_percentage of the data as a dev set.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x_text1, x_text2, x_text1_int, x_text2_int
    return (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size)
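# Minimal sketch of what datahelper.create_vocabulary is assumed to do (the real
# implementation lives in CIKM.datautils.datahelper and may differ): build
# word<->id maps over the reshaped training text, reserving id 0 for unknowns.
def create_vocabulary_sketch(texts, unk_token="_UNK"):
    index2word = [unk_token]  # id 0 is reserved for unknown words (UNK_ID = 0)
    seen = set(index2word)
    for line in texts:
        for word in line.split(" "):
            if word not in seen:
                seen.add(word)
                index2word.append(word)
    word2index = {w: i for i, w in enumerate(index2word)}
    return word2index, index2word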
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Assumption: train_doc2vec is loaded the same way as test_doc2vec below;
# makeFeature() writes train_doc2vec1.csv alongside test_doc2vec1.csv.
train_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\train_doc2vec1.csv")
test_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\test_doc2vec1.csv")

# Drop the raw vector columns; only the derived similarity features are kept.
train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True)
test_doc2vec.drop(['doc2vec_test1', 'doc2vec_test2'], axis=1, inplace=True)

# Concatenate every feature family column-wise into the final design matrices.
train = pd.concat([
    train_bag, train_magic1, train_magic2, train_freq, train_ngram,
    train_simple, train_weight, train_page, train_w2v, train_doc2vec
], axis=1)
test = pd.concat([
    test_bag, test_magic1, test_magic2, test_freq, test_ngram,
    test_simple, test_weight, test_page, test_w2v, test_doc2vec
], axis=1)

_, _, _, y_train, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
# Class-balance check: fraction of positive labels.
sums = np.sum(y_train, axis=0)
print(sums / len(y_train))

x_train, x_dev, y_train, y_dev = train_test_split(
    train.values, y_train, test_size=0.1, random_state=0)
train_input = lgb.Dataset(x_train, y_train)
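# Training sketch for the LightGBM dataset built above. The hyperparameters are
# illustrative assumptions, not the competition settings; the binary objective
# matches the 0/1 duplicate label. early_stopping_rounds is the pre-4.0
# LightGBM API (newer versions use callbacks=[lgb.early_stopping(50)]).
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 63,
}
dev_input = lgb.Dataset(x_dev, y_dev, reference=train_input)
gbm = lgb.train(params, train_input, num_boost_round=2000,
                valid_sets=[dev_input], early_stopping_rounds=50)
pred = gbm.predict(test.values, num_iteration=gbm.best_iteration)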
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_pah = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"

import datetime

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
from scipy.stats import skew, kurtosis

from CIKM.datautils import datahelper

# filepath_en_train / filepath_sp_train are defined alongside the paths above
# (elided in this snippet).
x_train1, x_train2, _, _, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)

train = pd.DataFrame()
test = pd.DataFrame()
train['question1'] = x_train1
train['question2'] = x_train2
test['question1'] = x_test1
test['question2'] = x_test2

# Build the TF-IDF corpus over all train and test questions.
tfidf_txt = (train['question1'].tolist() + train['question2'].tolist()
             + test['question1'].tolist() + test['question2'].tolist())
train_qs = pd.Series(tfidf_txt).astype(str)
dictionary = Dictionary(x.split(" ") for x in tfidf_txt)
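# Continuation sketch (assumption: the feature code that follows would use
# these objects). Builds a TfidfModel over the bag-of-words corpus derived from
# `dictionary`, the standard gensim pipeline implied by the imports above.
corpus = [dictionary.doc2bow(x.split(" ")) for x in tfidf_txt]
tfidf = TfidfModel(corpus)


def to_tfidf(text):
    # TF-IDF weighted bag-of-words for one question.
    return tfidf[dictionary.doc2bow(text.split(" "))]


def cos_sim(text1, text2):
    # Cosine similarity between two questions via a one-document similarity index.
    index = MatrixSimilarity([to_tfidf(text1)], num_features=len(dictionary))
    return float(index[to_tfidf(text2)][0])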
import gensim


def train_w2v():
    # Train a 300-dim Word2Vec model over the tokenized corpus and save it in
    # binary word2vec format. Note: `size` is the pre-4.0 gensim keyword;
    # gensim >= 4.0 renamed it to `vector_size`.
    _, _, _, _, data, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    model = gensim.models.Word2Vec(data, size=300, min_count=1)
    model.wv.save_word2vec_format('w2v.model.bin', binary=True)
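# Loading sketch: the saved binary file can be read back with KeyedVectors
# (standard gensim API; the path matches w2v_pah defined earlier).
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('w2v.model.bin', binary=True)
print(w2v.most_similar('hola', topn=3))  # assumes 'hola' is in the vocabulary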
import datetime

import pandas as pd


def makeFeature():
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)
    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))

    print('get sentence vector')
    # `model` and `doc2vec_model` are expected at module level: a trained
    # Doc2Vec model and the helper module that embeds a question with it.
    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [doc2vec_model.get_question_vector(x, model) for x in x_train1]
    train['doc2vec_train2'] = [doc2vec_model.get_question_vector(x, model) for x in x_train2]
    test['doc2vec_test1'] = [doc2vec_model.get_question_vector(x, model) for x in x_test1]
    test['doc2vec_test2'] = [doc2vec_model.get_question_vector(x, model) for x in x_test2]

    print('get six kinds of coefficient about vector')
    # Six pairwise similarity/distance features between the two question vectors.
    train['cosine1'] = train.apply(lambda x: Cosine(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['manhatton1'] = train.apply(lambda x: Manhatton(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['euclidean1'] = train.apply(lambda x: Euclidean(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['pearson1'] = train.apply(lambda x: PearsonSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['spearman1'] = train.apply(lambda x: SpearmanSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['kendall1'] = train.apply(lambda x: KendallSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)

    test['cosine1'] = test.apply(lambda x: Cosine(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['manhatton1'] = test.apply(lambda x: Manhatton(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['euclidean1'] = test.apply(lambda x: Euclidean(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['pearson1'] = test.apply(lambda x: PearsonSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['spearman1'] = test.apply(lambda x: SpearmanSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['kendall1'] = test.apply(lambda x: KendallSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test.to_csv('test_doc2vec1.csv', index=False)
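# Minimal sketches of the six helpers used above (assumption: the real
# implementations live elsewhere in the repo and may differ). Built on
# scipy.spatial.distance and scipy.stats; "Manhatton" keeps the repo's spelling.
from scipy import spatial
from scipy.stats import kendalltau, pearsonr, spearmanr


def Cosine(v1, v2):
    return 1.0 - spatial.distance.cosine(v1, v2)


def Manhatton(v1, v2):
    return spatial.distance.cityblock(v1, v2)


def Euclidean(v1, v2):
    return spatial.distance.euclidean(v1, v2)


def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]


def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]


def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]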