Example #1
query_maxlen = max(map(len, (x for x, _ in train_data + test_data)))

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Query max length:', query_maxlen, 'words')
print('Number of training data:', len(train_data))
print('Number of test data:', len(test_data))
print('-')
print('Here\'s what a "data" tuple looks like (query, answer):')
print(train_data[0])
print('-')
print('Vectorizing the word sequences...')

print('Number of entities', len(entities))

queries_train, answers_train = vectorize(train_data, w2i, query_maxlen,
                                         w2i_label)
queries_test, answers_test = vectorize(test_data, w2i, query_maxlen, w2i_label)
# queries_dev, answers_dev = vectorize(dev_data, w2i, query_maxlen, w2i_label)

print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, len(w2i_label))')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

# max_mem_len = 3
mem_key_len = 2  # ['Blade Runner', 'directed_by']
mem_val_len = 1  # ['Ridley Scott']
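
The vectorize helper used above is not shown in this snippet. A minimal sketch of what it is assumed to do, padding each query to query_maxlen and one-hot encoding the answer over the label vocabulary w2i_label, could look like this; the behaviour is an assumption, not the original implementation:

import numpy as np

def vectorize(data, w2i, query_maxlen, w2i_label):
    # Hypothetical sketch: turn (query, answer) tuples into a padded integer
    # matrix of word indices and a one-hot matrix over the answer labels.
    queries = np.zeros((len(data), query_maxlen), dtype='int32')
    answers = np.zeros((len(data), len(w2i_label)), dtype='int32')
    for i, (query, answer) in enumerate(data):
        idx = [w2i.get(w, 0) for w in query][:query_maxlen]
        queries[i, :len(idx)] = idx          # left-aligned, zero-padded
        answers[i, w2i_label[answer]] = 1    # one-hot answer vector
    return queries, answers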
Example #2
from sklearn import svm
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import time

import process_data  # project-local preprocessing helpers used below

#TODO: fix up preprocessing data code and upload it as a separate file to WMD as well
#TODO: debug low dimensional approximation
#TODO: run more extensive tests on performance of mSDA with and without low dimensional approximation

#fetch training documents from 20 newsgroups dataset in random order
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
all_20news = fetch_20newsgroups(subset='all',categories=categories, shuffle=True, random_state=42)
all_raw_data = all_20news.data #all the data
all_data_stringsList = process_data.createWordLists(process_data.unicodeToString(all_raw_data))
all_data_words = process_data.preprocess_by_word(all_data_stringsList)
all_labels = all_20news.target #all the labels
all_full_data = process_data.vectorize(all_data_words) #convert to bag of words
all_full_data = all_full_data.transpose() #so rows are data, columns are features (format we predominantly use)
num_mostCommon = 800
all_mostCommonFeatures_data = process_data.getMostCommonFeatures(all_full_data, num_mostCommon)
train_data, train_labels, test_data, test_labels = process_data.splitTrainTest(all_mostCommonFeatures_data, all_labels)

print "Shape of training data: ", train_data.shape
print "Shape of test data: ", test_data.shape

#classify with linear SVM
#transpose because sklearn requires (#data x #features)
clf_baseline = svm.SVC(kernel='linear').fit(train_data.transpose(), train_labels)
baseline_preds = clf_baseline.predict(test_data.transpose())
base_accuracy = np.mean(baseline_preds == test_labels)
print "Accuracy with linear SVM on basic representation: ", base_accuracy
Example #3
from sklearn import svm
from sklearn.datasets import fetch_20newsgroups

import process_data  # project-local preprocessing helpers used below

#TODO: run more extensive tests on performance of mSDA with and without low dimensional approximation

#fetch training documents from 20 newsgroups dataset in random order
categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]
all_20news = fetch_20newsgroups(subset='all',
                                categories=categories,
                                shuffle=True,
                                random_state=42)
all_raw_data = all_20news.data  #all the data
all_data_stringsList = process_data.createWordLists(
    process_data.unicodeToString(all_raw_data))
all_data_words = process_data.preprocess_by_word(all_data_stringsList)
all_labels = all_20news.target  #all the labels
all_full_data = process_data.vectorize(
    all_data_words)  #convert to bag of words
all_full_data = all_full_data.transpose(
)  #so rows are data, columns are features (format we predominantly use)
num_mostCommon = 800
all_mostCommonFeatures_data = process_data.getMostCommonFeatures(
    all_full_data, num_mostCommon)
train_data, train_labels, test_data, test_labels = process_data.splitTrainTest(
    all_mostCommonFeatures_data, all_labels)

print "Shape of training data: ", train_data.shape
print "Shape of test data: ", test_data.shape

#classify with linear SVM
#transpose because sklearn requires (#data x #features)
clf_baseline = svm.SVC(kernel='linear').fit(train_data.transpose(), train_labels)
baseline_preds = clf_baseline.predict(test_data.transpose())
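
process_data.getMostCommonFeatures is likewise not defined in these snippets. Assuming the matrix is a dense array laid out as (samples x features) at that point, a hypothetical sketch is:

import numpy as np

def getMostCommonFeatures(data, num_most_common):
    # Hypothetical sketch: keep the num_most_common columns (features) with
    # the highest total counts across all documents.
    totals = np.asarray(data.sum(axis=0)).ravel()       # per-feature frequency
    keep = np.argsort(totals)[::-1][:num_most_common]   # most frequent first
    return data[:, keep]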
Example #4
    def __init__(self, max_mem_size, batch_size, device):
        '''max_mem_size means how many memories can be visited for one query
        '''
        self.device = device

        train_data = load_pickle('pickle/mov_task1_qa_pipe_train.pickle')
        test_data = load_pickle('pickle/mov_task1_qa_pipe_test.pickle')
        dev_data = load_pickle('pickle/mov_task1_qa_pipe_dev.pickle')
        kv_pairs = load_pickle('pickle/mov_kv_pairs.pickle')
        train_k = np.array(load_pickle('pickle/mov_train_k.pickle'))
        train_v = np.array(load_pickle('pickle/mov_train_v.pickle'))
        test_k = np.array(load_pickle('pickle/mov_test_k.pickle'))
        test_v = np.array(load_pickle('pickle/mov_test_v.pickle'))
        dev_k = np.array(load_pickle('pickle/mov_dev_k.pickle'))
        dev_v = np.array(load_pickle('pickle/mov_dev_v.pickle'))
        entities = load_pickle('pickle/mov_entities.pickle')
        entity_size = len(entities)

        # TODO
        vocab = load_pickle('pickle/mov_vocab.pickle')
        self.vocab_size = len(vocab)

        stopwords = load_pickle('pickle/mov_stopwords.pickle')
        w2i = load_pickle('pickle/mov_w2i.pickle')
        i2w = load_pickle('pickle/mov_i2w.pickle')

        w2i_label = load_pickle('pickle/mov_w2i_label.pickle')
        i2w_label = load_pickle('pickle/mov_i2w_label.pickle')

        print('before filter:', len(train_data), len(test_data))
        train_data, train_k, train_v = filter_data(train_data, train_k,
                                                   train_v, 0, 100)
        test_data, test_k, test_v = filter_data(test_data, test_k, test_v, 0,
                                                100)
        dev_data, dev_k, dev_v = filter_data(dev_data, dev_k, dev_v, 0, 100)
        print('after filter:', len(train_data), len(test_data))

        query_maxlen = max(map(len, (x for x, _ in train_data + test_data)))

        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('Query max length:', query_maxlen, 'words')
        print('Number of training data:', len(train_data))
        print('Number of test data:', len(test_data))
        print('-')
        print('Here\'s what a "data" tuple looks like (query, answer):')
        print(train_data[0])
        print('-')
        print('Vectorizing the word sequences...')

        print('Number of entities', len(entities))

        # TODO, change the vectorize function
        self.queries_train, self.answers_train = vectorize(
            train_data, w2i, query_maxlen, w2i_label)
        self.queries_test, self.answers_test = vectorize(
            test_data, w2i, query_maxlen, w2i_label)
        # queries_dev, answers_dev = vectorize(dev_data, w2i, query_maxlen, w2i_label)

        print('-')
        print('queries: integer tensor of shape (samples, max_length)')
        print('queries_train shape:', self.queries_train.shape)
        print('queries_test shape:', self.queries_test.shape)
        print('-')
        print(
            'answers: binary (1 or 0) tensor of shape (samples, len(w2i_label))'
        )
        print('answers_train shape:', self.answers_train.shape)
        print('answers_test shape:', self.answers_test.shape)

        # max_mem_len = 3
        self.query_max_len = query_maxlen
        self.max_mem_size = max_mem_size
        self.mem_key_len = 2  # ['Blade Runner', 'directed_by']
        self.mem_val_len = 1  # ['Ridley Scott']
        self.vec_train_k = vectorize_kv(train_k, self.mem_key_len,
                                        self.max_mem_size, w2i)
        self.vec_train_v = vectorize_kv(train_v, self.mem_val_len,
                                        self.max_mem_size, w2i)
        self.vec_test_k = vectorize_kv(test_k, self.mem_key_len,
                                       self.max_mem_size, w2i)
        self.vec_test_v = vectorize_kv(test_v, self.mem_val_len,
                                       self.max_mem_size, w2i)
        print('vec_k', self.vec_train_k.shape)
        print('vec_v', self.vec_train_v.shape)

        assert len(self.vec_train_k) == len(self.queries_train)
        assert len(self.vec_test_k) == len(self.queries_test)

        self.batch_size = batch_size
        print("The batch size is %d." % self.batch_size)
        self.num_qa_pairs_train = len(self.vec_train_k)
        self.num_steps = int(self.num_qa_pairs_train /
                             self.batch_size)  # drop the tail
        self.current_step = -1  # current_step should be in [0, num_steps-1]
        print("There's %d steps in one epoch" % self.num_steps)
        self.answer_set_size = len(w2i_label)
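
The vectorize_kv helper is not shown either. Judging from the calls above, it turns each sample's list of key (or value) memories into an integer tensor of shape (samples, max_mem_size, mem_len); a hedged sketch of one possible implementation:

import numpy as np

def vectorize_kv(memories, mem_len, max_mem_size, w2i):
    # Hypothetical sketch: map each memory's words to indices, zero-padding
    # both the memory slots (up to max_mem_size) and the words inside each
    # slot (up to mem_len).
    out = np.zeros((len(memories), max_mem_size, mem_len), dtype='int32')
    for i, mems in enumerate(memories):
        for j, mem in enumerate(mems[:max_mem_size]):
            idx = [w2i.get(w, 0) for w in mem][:mem_len]
            out[i, j, :len(idx)] = idx
    return out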
Example #5
print('Build dictionary...')
word_dict = build_dict(train_d + train_q)
entity_markers = list(
    set([w for w in word_dict.keys() if w.startswith('@entity')] + train_a))
entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
print('Entity markers: %d' % len(entity_dict))
num_labels = len(entity_dict)

doc_maxlen = max(map(len, (d for d in train_d)))
query_maxlen = max(map(len, (q for q in train_q)))
print('doc_maxlen:', doc_maxlen, ', q_maxlen:', query_maxlen)

v_train_d, v_train_q, v_train_y, _ = vectorize(train_d, train_q, train_a,
                                               word_dict, entity_dict,
                                               doc_maxlen, query_maxlen)
v_dev_d, v_dev_q, v_dev_y, _ = vectorize(dev_d, dev_q, dev_a, word_dict,
                                         entity_dict, doc_maxlen, query_maxlen)
print('vectorized shape')
print(v_train_d.shape, v_train_q.shape, v_train_y.shape)
print(v_dev_d.shape, v_dev_q.shape, v_dev_y.shape)

vocab_size = max(word_dict.values()) + 1
print('vocab_size:', vocab_size)
embd_size = 100
rnn_half_hidden_size = 64
glove_embd_w = load_glove_weights('./dataset', 100, vocab_size, word_dict)
model = Net(vocab_size, embd_size, rnn_half_hidden_size, glove_embd_w,
            doc_maxlen, query_maxlen, len(entity_dict))
print(model.summary())
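
build_dict and load_glove_weights come from elsewhere in the project. Based on how word_dict is used here (max(word_dict.values()) + 1 as the vocabulary size), a minimal sketch of build_dict, with max_words as an assumed parameter, could be:

from collections import Counter

def build_dict(sentences, max_words=50000):
    # Hypothetical sketch: build a word -> index map from tokenized sentences,
    # most frequent words first; index 0 is left free for padding/unknowns.
    counts = Counter(w for sent in sentences for w in sent)
    return {w: i + 1 for i, (w, _) in enumerate(counts.most_common(max_words))}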
Example #6
test_k = np.array(load_pickle('pickle/mov_test_k.pickle'))
test_v = np.array(load_pickle('pickle/mov_test_v.pickle'))

# filter data which have zero KV or too many KVs
print('before filter:', len(test_data))
test_data, test_k, test_v = filter_data(test_data, test_k, test_v, 0, 100)
print('after filter:', len(test_data))

vocab = load_pickle('pickle/mov_vocab.pickle')
vocab_size = len(vocab)
w2i = load_pickle('pickle/mov_w2i.pickle')
i2w = load_pickle('pickle/mov_i2w.pickle')
w2i_label = load_pickle('pickle/mov_w2i_label.pickle')
i2w_label = load_pickle('pickle/mov_i2w_label.pickle')

queries_test, answers_test = vectorize(test_data, w2i, max_query_len,
                                       w2i_label, True)
vec_test_k = vectorize_kv(test_k, 2, max_mem_size, w2i)
vec_test_v = vectorize_kv(test_v, 1, max_mem_size, w2i)

model = load_model(model_name)
# ret = model.evaluate([vec_test_k, vec_test_v, queries_test], answers_test, verbose=1)
# print('=====result=====')
# print('loss: {:.5f}, acc: {:.5f}'.format(ret[0], ret[1]))

print('=====wrong examples=====')
pred = model.predict([vec_test_k, vec_test_v, queries_test],
                     batch_size=32,
                     verbose=1)
wrong_ct = 0
for i, (p, a) in enumerate(zip(pred, answers_test)):
    p_id = np.argmax(p)