Example #1
0
def process_data(inputs_data):
    x_test = []
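    # Tokenise each raw line into a cleaned list of words.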
    for line in inputs_data:
        line_list = datahelper.text_to_wordlist(line,
                                                remove_stop_words=True,
                                                stem_words=False).split(" ")
        x_test.append(line_list)
    return x_test
Example #2
0
def text_to_wordlist(text,
                     remove_stop_words=True,
                     stem_words=False,
                     lemma=True):
    # Forward the caller's options instead of hard-coding them;
    # the lemma flag is accepted but not passed on to the helper.
    text = datahelper.text_to_wordlist(text,
                                       remove_stop_words=remove_stop_words,
                                       stem_words=stem_words)
    return text
Example #3
0
def process():
    x_text1, x_text2, y_train = datahelper.load_data(FLAGS.en_train,
                                                     FLAGS.sp_train)
    x_text = np.concatenate([x_text1, x_text2], axis=0)
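    # Build the word<->index vocabulary and load pre-trained word embeddings.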
    word2index, index2word = datahelper.create_vocabulary(x_text)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding()

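    # The longest question (in whitespace tokens) determines the padding length.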
    max_len = max([len(x.split(" ")) for x in x_text])

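    # Convert each question into a sequence of vocabulary indices
    # (out-of-vocabulary words fall back to UNK_ID).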
    x_text1_int = []
    x_text2_int = []
    for line in x_text1:
        line_list = datahelper.text_to_wordlist(line)
        line_list = line_list.split(" ")
        text = [word2index.get(x, UNK_ID) for x in line_list]
        x_text1_int.append(text)

    for line in x_text2:
        line_list = datahelper.text_to_wordlist(line)
        line_list = line_list.split(" ")
        text = [word2index.get(x, UNK_ID) for x in line_list]
        x_text2_int.append(text)

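    # Pad every index sequence to max_len.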
    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

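    # Shuffle with a fixed seed so the train/dev split is reproducible.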
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

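    # Hold out the last dev_sample_percentage fraction as the dev set.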
    dev_sample_index = -1 * int(
        FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
Example #4
def process_data(inputs_data):
    d2c_list = []
    for line in inputs_data:
        # line_list = [x for x in line if x not in stop_word_list]
        line = datahelper.text_to_wordlist(line,
                                           remove_stop_words=True,
                                           stem_words=False).split(" ")
        d2c_list.append(line)
    return d2c_list
Example #5
0
def process():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train,
                                                 filepath_sp_train)

    # stop_word = list(open(file_stop_word, "r", encoding='UTF-8').readlines())
    # stop_word_list = [
    #     line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
    #         .replace("¿", "").replace("!", "").replace("¡", "").lower()
    #     for line in stop_word]
    train_data = np.concatenate([x_train1, x_train2], axis=0)
    d2c_list = []
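    # Tokenise every training sentence into a word list for Doc2Vec.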
    for line in train_data:
        # line_list = [x for x in line if x not in stop_word_list]
        line = datahelper.text_to_wordlist(line,
                                           remove_stop_words=True,
                                           stem_words=False).split(" ")
        d2c_list.append(line)

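    # Wrap each word list in a (words, tags) record; Doc2Vec needs a unique tag per document.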
    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for doc_id, record in enumerate(d2c_list):
        qid = 'SENT_%s' % doc_id
        words = gensim.utils.simple_preprocess(" ".join(record))
        tags = [qid]
        alldocuments.append(analyzedDocument(words, tags))
    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
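    # Train a distributed-memory Doc2Vec model (dm=1, concatenated context) with 300-dim vectors.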
    model_4 = gensim.models.Doc2Vec(alldocuments,
                                    dm=1,
                                    dm_concat=1,
                                    vector_size=300,
                                    window=5,
                                    min_count=2,
                                    epochs=100)
    model_4.save(saved_model_name)
    print("model training completed : %s" % (saved_model_name))