# NOTE(review): fragment — tail of an embedding-loading function whose `def`
# header (and the condition guarding the early `return None, None`) lies
# outside this view. Indentation below is reconstructed; TODO confirm against
# the original file.
        return None, None
    # Load the previously pickled word->id dict and the saved numpy embedding
    # matrix together; callers get (word_dict, embedding).
    with open(dict_filename, 'rb') as d_f, open(embd_filename, 'rb') as n_f:
        print('Loading pickled embedding.')
        word_dict = pickle.load(d_f)
        embedding = np.load(n_f)
    return word_dict, embedding


if __name__ == '__main__':
    # Smoke test: build a vocabulary-filtered embedding from the small GloVe
    # file. The commented-out lines below are earlier experiments with the
    # 840B-token GloVe release (2196018 rows) vs the 6B release (400001 rows).
    from config import DATA_DIR
    import os
    from util.vocab_stat import get_dict
    # 2196018
    # 400001
    # file_path = os.path.join(DATA_DIR, 'Glove/glove.840B.300d.zip')
    # di_1, embd_1 = build_voc(file_path, 2196018)
    dict_ = get_dict()
    file_path = os.path.join(DATA_DIR, 'Glove/glove.6B.300d.zip')
    di_1, embd_1 = fine_selected_embedding(file_path, dict_, pre_vocab_size=400001)
    # print(len(di_1))
    # print(embd_1)
    # pprint(embd_1.shape)
    # di_2, embd_2 = voc_builder(os.path.join(DATA_DIR, 'Glove/glove.6B.300d.txt'))
    # print(embd_1[-1, :] - embd_2[-1, :])
    # 15291 11862 16640 64611 122665, 140648 138700
"""Build the fine-selected SST phrase training set.

Joins the SST phrase dictionary with its sentiment labels, merges the result
onto the raw phrase-train split, shuffles, lower-cases and de-duplicates
sentences, builds a vocabulary-filtered GloVe embedding, and writes the
tokenized training set out via build_new_sets.
"""
import os

import numpy as np  # was missing: np.random.permutation is used below
import pandas as pd

from config import DATA_DIR
from etc.preprocessor import build_new_sets
from util.dict_builder import build_voc, fine_selected_embedding
from util.vocab_stat import get_dict

# TODO(review): user-specific absolute path — should be derived from DATA_DIR.
phrase_train_df = pd.read_csv('/Users/Eason/RA/landOfflol/datasets/Diy/sst/intermediates/phase_train.txt', sep='|', quoting=3, encoding='utf-8')
# 3185582
# print(phrase_train_df.dropna(axis=0))

# phrase-text -> 'phrase ids' mapping, then ids -> sentiment values.
phase_map_df = pd.read_csv(DATA_DIR + '/SST/dictionary.txt', sep='|', quoting=3, encoding='utf-8')
# 239242
sentiment_labels_df = pd.read_csv(DATA_DIR + '/SST/sentiment_labels.txt', sep='|', quoting=3, encoding='utf-8')
phases_with_labels = phase_map_df.merge(sentiment_labels_df, on='phrase ids')
phrase_train_df = phrase_train_df.merge(phases_with_labels, on='sentence')

# Shuffle rows by reindexing with a random permutation of the index.
phrase_train_df = phrase_train_df.reindex(np.random.permutation(phrase_train_df.index))

# Lower-case sentences; drop_duplicates leaves NaN on duplicate rows, which
# the subsequent dropna removes — i.e. duplicates are discarded entirely.
lower_df = phrase_train_df['sentence'].map(lambda x: x.lower())
phrase_train_df['sentence'] = lower_df.drop_duplicates()
phrase_train_df = phrase_train_df.dropna(axis=0)

# Restrict the 840B GloVe vocabulary (2196018 rows) to words seen in the data.
fine_dict = get_dict()
word2id, word_embedding = fine_selected_embedding(os.path.join(DATA_DIR, 'Glove/glove.840B.300d.zip'), fine_dict, pre_vocab_size=2196018)

build_new_sets(phrase_train_df, word2id, os.path.join(DATA_DIR, 'Diy/sst/fs/p_train_data.txt'))
"""Driver script: build and evaluate a biLSTM variant on SNLI."""
import argparse
from datetime import datetime

import config
from models.snli.lstm_variant import biLSTM
from tests.snli.test_wrapper import wrapper
from util.dict_builder import fine_selected_embedding
from util.vocab_stat import get_dict

if __name__ == '__main__':
    arg_parse = argparse.ArgumentParser(description='Test for some variant of LSTM model.')
    arg_parse.add_argument('-m', '--message', metavar='message', dest='message', action='store', help='Specify a message for this test')
    args = arg_parse.parse_args()

    max_length = 80  # maximum sequence length fed to the LSTM
    # 2196018
    # 400001
    voc_size = 2196018  # row count of the 840B GloVe vocabulary

    # Vocabulary-filtered GloVe embedding for the SNLI word set.
    dict_ = get_dict()
    word2id, word_embedding = fine_selected_embedding(config.GLOVE_840B_PATH, dict_, pre_vocab_size=voc_size)

    # e.g. '(2017-01-01-12:00:00)' — literal parentheses around the timestamp.
    timestamp = '{0:(%Y-%m-%d-%H:%M:%S)}'.format(datetime.now())

    # Fixed: the instance previously rebound the imported class name `biLSTM`;
    # use a distinct local name so the class stays importable/usable.
    model = biLSTM(lstm_step=max_length, embedding=word_embedding, hidden_d=100, vocab_size=voc_size, Time=timestamp, Model_Name='BiLSTM', Message=args.message)
    model.setup(embedding=word_embedding)

    wrapper(model_name='snli-biLSTM', model=model, max_length=max_length, benchmark=0.70)
# 3185582 # print(phrase_train_df.dropna(axis=0)) phase_map_df = pd.read_csv(DATA_DIR + '/SST/dictionary.txt', sep='|', quoting=3, encoding='utf-8') # 239242 sentiment_labels_df = pd.read_csv(DATA_DIR + '/SST/sentiment_labels.txt', sep='|', quoting=3, encoding='utf-8') phases_with_labels = phase_map_df.merge(sentiment_labels_df, on='phrase ids') phrase_train_df = phrase_train_df.merge(phases_with_labels, on='sentence') phrase_train_df = phrase_train_df.reindex( np.random.permutation(phrase_train_df.index)) lower_df = phrase_train_df['sentence'].map(lambda x: x.lower()) phrase_train_df['sentence'] = lower_df.drop_duplicates() phrase_train_df = phrase_train_df.dropna(axis=0) fine_dict = get_dict() word2id, word_embedding = fine_selected_embedding(os.path.join( DATA_DIR, 'Glove/glove.840B.300d.zip'), fine_dict, pre_vocab_size=2196018) build_new_sets(phrase_train_df, word2id, os.path.join(DATA_DIR, 'Diy/sst/fs/p_train_data.txt'))