Exemplo n.º 1
0
        return None, None
    with open(dict_filename, 'rb') as d_f, open(embd_filename, 'rb') as n_f:
        print('Loading pickled embedding.')
        word_dict = pickle.load(d_f)
        embedding = np.load(n_f)
    return word_dict, embedding


if __name__ == '__main__':
    # Manual smoke test: build a corpus-filtered embedding from the 6B GloVe
    # release and hold on to the resulting (word -> id, embedding) pair.
    import os

    from config import DATA_DIR
    from util.vocab_stat import get_dict

    # Pre-built GloVe vocabulary sizes:
    #   glove.840B.300d -> 2196018 tokens
    #   glove.6B.300d   -> 400001 tokens
    corpus_dict = get_dict()
    glove_path = os.path.join(DATA_DIR, 'Glove/glove.6B.300d.zip')
    word2id, embedding = fine_selected_embedding(glove_path,
                                                 corpus_dict,
                                                 pre_vocab_size=400001)

# 15291 11862 16640 64611 122665, 140648 138700
import os

import numpy as np
import pandas as pd

from config import DATA_DIR
from etc.preprocessor import build_new_sets
from util.dict_builder import build_voc, fine_selected_embedding
from util.vocab_stat import get_dict

# Raw training sentences, pipe-separated; quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the text are kept literally.
# NOTE(review): absolute user-specific path — only resolves on the original
# author's machine; the other inputs below are rooted at DATA_DIR.
phrase_train_df = pd.read_csv('/Users/Eason/RA/landOfflol/datasets/Diy/sst/intermediates/phase_train.txt', sep='|',
                              quoting=3, encoding='utf-8')
# 3185582
# print(phrase_train_df.dropna(axis=0))
# SST dictionary: has a 'sentence' column and a 'phrase ids' column
# (inferred from the merge keys used below).
phase_map_df = pd.read_csv(DATA_DIR + '/SST/dictionary.txt', sep='|', quoting=3, encoding='utf-8')
# 239242
# Sentiment score per phrase, keyed by 'phrase ids'.
sentiment_labels_df = pd.read_csv(DATA_DIR + '/SST/sentiment_labels.txt', sep='|', quoting=3, encoding='utf-8')

# Attach a sentiment label to every phrase, then to every training sentence.
phases_with_labels = phase_map_df.merge(sentiment_labels_df, on='phrase ids')
phrase_train_df = phrase_train_df.merge(phases_with_labels, on='sentence')

# Shuffle the rows by permuting the index.
phrase_train_df = phrase_train_df.reindex(np.random.permutation(phrase_train_df.index))

# De-duplicate case-insensitively: assigning the shorter, de-duplicated Series
# back aligns on the index, so duplicate rows get NaN in 'sentence', and the
# following dropna removes exactly those rows.
lower_df = phrase_train_df['sentence'].map(lambda x: x.lower())
phrase_train_df['sentence'] = lower_df.drop_duplicates()
phrase_train_df = phrase_train_df.dropna(axis=0)

# Word -> id mapping plus embedding matrix restricted to the corpus vocabulary.
# 2196018 is presumably the glove.840B vocabulary size (see the paired
# constants elsewhere in this file) — confirm against the archive.
fine_dict = get_dict()
word2id, word_embedding = fine_selected_embedding(os.path.join(DATA_DIR, 'Glove/glove.840B.300d.zip'),
                                                  fine_dict, pre_vocab_size=2196018)

# Serialize the processed training set with tokens mapped through word2id.
build_new_sets(phrase_train_df, word2id, os.path.join(DATA_DIR, 'Diy/sst/fs/p_train_data.txt'))
Exemplo n.º 3
0
from datetime import datetime
from models.snli.lstm_variant import biLSTM
import config
from util.dict_builder import fine_selected_embedding
from tests.snli.test_wrapper import wrapper
from util.vocab_stat import get_dict
import argparse

if __name__ == '__main__':
    # CLI: accept an optional free-form message that is attached to the run.
    arg_parse = argparse.ArgumentParser(description='Test for some variant of LSTM model.')
    arg_parse.add_argument('-m', '--message', metavar='message', dest='message', action='store',
                           help='Specify a message for this test')

    args = arg_parse.parse_args()

    # Maximum sequence length the LSTM unrolls over.
    max_length = 80

    # GloVe vocabulary sizes: 2196018 (840B release), 400001 (6B release).
    voc_size = 2196018

    dict_ = get_dict()
    word2id, word_embedding = fine_selected_embedding(config.GLOVE_840B_PATH, dict_, pre_vocab_size=voc_size)

    # datetime.__format__ forwards the spec to strftime, so the surrounding
    # parentheses appear literally in the timestamp.
    timestamp = '{0:(%Y-%m-%d-%H:%M:%S)}'.format(datetime.now())

    # FIX: the original did `biLSTM = biLSTM(...)`, rebinding the imported
    # class name to its own instance — a shadowing bug that makes any second
    # instantiation impossible. Use a distinct local name for the instance.
    model = biLSTM(lstm_step=max_length, embedding=word_embedding, hidden_d=100, vocab_size=voc_size, Time=timestamp,
                   Model_Name='BiLSTM', Message=args.message)
    model.setup(embedding=word_embedding)
    wrapper(model_name='snli-biLSTM', model=model, max_length=max_length, benchmark=0.70)
Exemplo n.º 4
0
# NOTE(review): incomplete fragment — it reads `pd`, `np`, `DATA_DIR`,
# `get_dict`, `fine_selected_embedding`, `build_new_sets`, and an
# already-loaded `phrase_train_df`; the import/setup lines are not part of
# this excerpt. It appears to be a reformatted copy of the pipeline above.
# 3185582
# print(phrase_train_df.dropna(axis=0))
# SST dictionary; quoting=3 is csv.QUOTE_NONE, so quote characters in the
# text are kept literally.
phase_map_df = pd.read_csv(DATA_DIR + '/SST/dictionary.txt',
                           sep='|',
                           quoting=3,
                           encoding='utf-8')
# 239242
# Sentiment score per phrase, keyed by 'phrase ids'.
sentiment_labels_df = pd.read_csv(DATA_DIR + '/SST/sentiment_labels.txt',
                                  sep='|',
                                  quoting=3,
                                  encoding='utf-8')

# Attach a sentiment label to every phrase, then to every training sentence.
phases_with_labels = phase_map_df.merge(sentiment_labels_df, on='phrase ids')
phrase_train_df = phrase_train_df.merge(phases_with_labels, on='sentence')

# Shuffle the rows by permuting the index.
phrase_train_df = phrase_train_df.reindex(
    np.random.permutation(phrase_train_df.index))

# De-duplicate case-insensitively: the de-duplicated Series aligns on index,
# leaving NaN on duplicate rows, which the dropna below removes.
lower_df = phrase_train_df['sentence'].map(lambda x: x.lower())
phrase_train_df['sentence'] = lower_df.drop_duplicates()
phrase_train_df = phrase_train_df.dropna(axis=0)

# Word -> id mapping plus embedding matrix restricted to the corpus
# vocabulary; 2196018 is presumably the glove.840B vocabulary size.
fine_dict = get_dict()
word2id, word_embedding = fine_selected_embedding(os.path.join(
    DATA_DIR, 'Glove/glove.840B.300d.zip'),
                                                  fine_dict,
                                                  pre_vocab_size=2196018)

# Serialize the processed training set with tokens mapped through word2id.
build_new_sets(phrase_train_df, word2id,
               os.path.join(DATA_DIR, 'Diy/sst/fs/p_train_data.txt'))