def embedding_test_master(self, input_file, embedding_file, block_size=10000):
    """Run the embedding-model test on worker threads and print hit counts.

    The embedding model and the samples are loaded with ``load_bigger`` and
    stored on ``self.word2vec`` / ``self.origin_sample``. One thread per
    index block is created with ``self.embedding_test_agent`` as target;
    all threads are started and then joined. Afterwards the per-block
    results are read from ``self.wordresult`` (in sorted key order),
    concatenated, and scored in consecutive groups of ten values.

    A group counts as a top-1 hit when its largest value is at position 0,
    and as a top-3 hit when position 0 is among its three largest values.

    Args:
        input_file: Path handed to ``load_bigger`` for the samples.
        embedding_file: Path handed to ``load_bigger`` for the embeddings.
        block_size: Step used to lay out the ``start``/``end`` indices that
            are passed to each worker thread.

    Prints, in order: top-1 hits, top-3 hits, number of groups, the value
    of ``spend_time(version)``, and both hit ratios truncated to five
    characters. Returns ``None``.
    """
    version = begin_time()
    self.word2vec = load_bigger(embedding_file)
    self.origin_sample = load_bigger(input_file)

    sample_count = len(self.origin_sample)
    workers = []
    start = 0
    end = min(block_size, sample_count - 1)
    # Each block starts right after the previous block's end index.
    # NOTE(review): this layout suggests the agent treats `end` as
    # inclusive -- confirm against embedding_test_agent.
    for block in range(int(sample_count / block_size) + 1):
        workers.append(threading.Thread(
            target=self.embedding_test_agent, args=(start, end, block)))
        start = end + 1
        end = min(sample_count - 1, block_size * (block + 2))

    # Start every worker first, then wait for all of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Flatten the per-block result lists in block order (linear time,
    # unlike repeated list concatenation via sum(..., [])).
    scores = [score
              for key in sorted(self.wordresult)
              for score in self.wordresult[key]]

    group_size = 10
    group_count = len(scores) // group_size
    top1_hits = 0
    top3_hits = 0
    for group_index in range(group_count):
        lower = group_index * group_size
        group = np.array(scores[lower:lower + group_size])
        if group.argmax() == 0:
            top1_hits += 1
        if 0 in group.argsort()[-3:]:
            top3_hits += 1

    # Guard against fewer than `group_size` scores, which previously
    # raised ZeroDivisionError while formatting the ratios.
    top1_ratio = top1_hits / group_count if group_count else 0.0
    top3_ratio = top3_hits / group_count if group_count else 0.0
    print(top1_hits, top3_hits, group_count, spend_time(version),
          str(top1_ratio)[:5], str(top3_ratio)[:5])
    end_time(version)
def test_model(dataset_file='SMN/data/datasets_test11.pkl',
               pre_file='SMN/data/smn_test11.pkl',
               model_name='SMN/data/model.bin',
               result_file='SMN/data/result_test11.txt'):
    """Run ``predict`` on a test dataset and return the evaluation result.

    Args:
        dataset_file: Path handed to ``load_bigger`` for the test datasets.
        pre_file: Pickle file whose element at index 1 exposes a ``W``
            attribute that is passed to ``predict`` as the word vectors.
        model_name: Model path forwarded to ``predict``.
        result_file: Path forwarded to ``predict`` and then to
            ``SampleConduct.calculate_test``.

    Returns:
        Whatever ``SampleConduct().calculate_test(result_file)`` returns
        (described as the accuracy by the original docstring).
    """
    version = begin_time()
    datasets = load_bigger(dataset_file)

    # NOTE: unpickling can execute arbitrary code; pre_file must be trusted.
    # The context manager closes the handle, which the bare open() did not.
    with open(pre_file, "rb") as pre_handle:
        pre = pickle.load(pre_handle)
    wordvecs = pre[1]

    predict(datasets, wordvecs.W, batch_size=200, max_l=50, hidden_size=200,
            word_embedding_size=200, model_name=model_name,
            result_file=result_file)

    sample_conduct = SampleConduct()
    end_time(version)
    return sample_conduct.calculate_test(result_file)
def word2ids(self, input_file, embedding_file,
             output1_file='SMN/data/weibo/word2id.pkl',
             output2_file='SMN/data/weibo/word_embedding.pkl',
             output3_file='SMN/data/weibo/word2id'):
    """Build a word-to-id table and a matching embedding list from a corpus.

    Every whitespace-separated token of ``input_file`` is passed through
    ``LCS`` and counted. Words seen more than twice that are present in
    the loaded embedding get consecutive ids starting at 2; ids 0 and 1
    are reserved for ``_OOV_`` and ``_EOS_`` with all-zero vectors of
    length 200.

    NOTE(review): a second ``word2ids`` (with an n-gram fallback) is
    defined later in this file; if both live in the same scope, the later
    definition replaces this one.

    Args:
        input_file: UTF-8 text file with the corpus.
        embedding_file: Path handed to ``load_bigger``; the result must
            support ``word in embedding`` and ``embedding[word]``.
        output1_file: Destination of the pickled ``word -> id`` dict.
        output2_file: Destination of the pickled embedding list.
        output3_file: Destination of the ``"word id"`` entries rendered
            by ``list2str``.
    """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        origin_sample = f.readlines()
    word_embedding = load_bigger(embedding_file)

    word_map = {'_OOV_': 0, '_EOS_': 1}
    embedding_lists = [[0] * 200, [0] * 200]
    word2id = ['_OOV_ 0', '_EOS_ 1']

    # str.split() with no argument already discards the trailing line
    # break and yields nothing for blank lines.
    word_counts = Counter(
        LCS(token) for line in origin_sample for token in line.split())
    frequent_words = [word for word in word_counts if word_counts[word] > 2]

    print('Step 2: Begin')
    index_num = 2
    for word in frequent_words:
        if word not in word_embedding or word in word_map:
            continue
        word_map[word] = index_num
        index_num += 1
        word2id.append(word + ' ' + str(word_map[word]))
        embedding_lists.append(list(word_embedding[word].astype('float16')))
        # A disabled experiment used to live here: for words missing from
        # the embedding it fell back to the 3-, 2- and 1-character prefix
        # (index[:3], index[:2], index[:1]) and shared that prefix's id.

    print(index_num)
    with open(output3_file, 'w') as f:
        f.write(list2str(word2id))
    print('Step 2: Over')

    # Context managers make sure the pickles are flushed and closed.
    with open(output2_file, "wb") as embedding_handle:
        pickle.dump(embedding_lists, embedding_handle)
    with open(output1_file, "wb") as word_map_handle:
        pickle.dump(word_map, word_map_handle)
    end_time(version)
def word2ids(self, input_file, embedding_file,
             output1_file='SMN/data/weibo/word2id.pkl',
             output2_file='SMN/data/weibo/word_embedding.pkl',
             output3_file='SMN/data/weibo/word2id', min_n=1, max_n=3):
    """Build a word-to-id table and embedding list with an n-gram fallback.

    Every whitespace-separated token of ``input_file`` is passed through
    ``LCS``; each distinct result is processed once, in first-seen order.
    Ids 0 and 1 are reserved for ``_OOV_`` and ``_EOS_`` with all-zero
    vectors.

    * A word found in the model vocabulary keeps the model's vector
      (cast to float32).
    * Otherwise its n-grams from ``compute_ngrams`` are looked up: the
      vectors of all multi-character n-grams in the vocabulary are
      averaged; only if none is found are the single-character n-grams
      used instead. A word whose resulting vector is all zeros is skipped.

    NOTE(review): an earlier definition named ``word2ids`` exists in this
    file; if both share a scope, this one is the effective definition.

    Args:
        input_file: UTF-8 text file with the corpus.
        embedding_file: Path handed to ``load_bigger``; the result is used
            through ``.wv.syn0``, ``.wv.vocab`` and ``model[word]``.
        output1_file: Destination of the pickled ``word -> id`` dict.
        output2_file: Destination of the pickled embedding list.
        output3_file: Destination of the ``"word id"`` entries rendered
            by ``list2str``.
        min_n: Forwarded to ``compute_ngrams``.
        max_n: Forwarded to ``compute_ngrams``.
    """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        origin_sample = f.readlines()
    word_embedding = load_bigger(embedding_file)
    vocab = word_embedding.wv.vocab
    word_size = word_embedding.wv.syn0[0].shape[0]

    word_map = {'_OOV_': 0, '_EOS_': 1}
    # Placeholder rows follow the model's vector size (previously fixed at
    # 200, which produced ragged rows for any other dimensionality).
    embedding_lists = [[0] * word_size, [0] * word_size]
    word2id = ['_OOV_ 0', '_EOS_ 1']

    # str.split() with no argument already discards the trailing line
    # break and yields nothing for blank lines.
    distinct_words = list(Counter(
        LCS(token) for line in origin_sample for token in line.split()))

    print('Step 2: Begin')
    index_num = 2
    for word in distinct_words:
        if word in word_map:
            continue

        if word in vocab:
            word_vec = word_embedding[word].astype('float32')
        else:
            ngrams = compute_ngrams(word, min_n=min_n, max_n=max_n)
            multi_char = [ng for ng in ngrams if len(ng) > 1]
            single_char = [ng for ng in ngrams if len(ng) == 1]

            # Prefer multi-character n-grams; fall back to single
            # characters only when none of the longer ones is known.
            known = [ng for ng in multi_char if ng in vocab]
            if not known:
                known = [ng for ng in single_char if ng in vocab]

            word_vec = np.zeros(word_size, dtype=np.float32)
            for ngram in known:
                word_vec += word_embedding[ngram]
            if not word_vec.any():
                # No usable n-gram vector: leave the word unmapped.
                continue
            word_vec /= max(1, len(known))

        word_map[word] = index_num
        index_num += 1
        word2id.append(word + ' ' + str(word_map[word]))
        embedding_lists.append(word_vec)

    print(index_num)
    with open(output3_file, 'w') as f:
        f.write(list2str(word2id))
    print('Step 2: Over')

    # Context managers make sure the pickles are flushed and closed.
    with open(output2_file, "wb") as embedding_handle:
        pickle.dump(embedding_lists, embedding_handle)
    with open(output1_file, "wb") as word_map_handle:
        pickle.dump(word_map, word_map_handle)
    end_time(version)