def get_id2cpapers(self):
    """Build a mapping from paper id (as a string) to its clean-paper record.

    Loads both the train and test splits of the clean papers and merges
    them into one dict keyed by the stringified ``id`` field (the records
    themselves are mutated so ``paper['id']`` is always a string).
    """
    train_papers = data_utils.load_json_lines(self.file_dir, 'clean-papers-train.dat')
    test_papers = data_utils.load_json_lines(self.file_dir, 'clean-papers-test.dat')
    id2paper = {}
    for record in train_papers + test_papers:
        # Normalize ids to strings so lookups are consistent everywhere.
        record['id'] = str(record['id'])
        id2paper[record['id']] = record
    # data_utils.dump_json(id2paper, self.file_dir, 'clean-id2paper.json')
    return id2paper
def dump_dst_hash_tables(self):
    """Group clean-paper ids by the encoding of their binary codes and dump to JSON.

    The i-th destination binary code corresponds to the i-th clean paper
    (train split followed by test split); the resulting table maps each
    encoded code to the list of paper ids that share it.
    """
    # Source codes are not needed here; only the destination side is indexed.
    _, dst_binary_codes = self.two_domain_title_vectors_to_binary_codes()
    train_papers = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'clean-papers-train.dat')
    test_papers = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'clean-papers-test.dat')
    all_papers = train_papers + test_papers
    hash_to_dst_idx = dd(list)
    for idx, code in enumerate(dst_binary_codes):
        key = feature_utils.encode_binary_codes(code)
        hash_to_dst_idx[key].append(str(all_papers[idx]['id']))
    data_utils.dump_json(hash_to_dst_idx, settings.OUT_PAPER_DIR, 'hash_to_dst_paper_id.json')
def eval_hash_table(self):
    """Evaluate hash-table retrieval on the noisy test papers.

    Each noisy test title's binary code is looked up in the precomputed
    hash table; a prediction is made only when the bucket contains exactly
    one candidate, otherwise ``None`` is recorded. Logs precision/recall/F1
    and the total vs. prediction-only wall-clock time.
    """
    t_start = time.time()
    src_codes_test, _ = self.two_domain_title_vectors_to_binary_codes()
    noisy_test = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'noisy-papers-test.dat')
    labels = [str(item['id']) for item in noisy_test]
    table = data_utils.load_json(settings.OUT_PAPER_DIR, 'hash_to_dst_paper_id.json')
    t_loop = time.time()
    preds = []
    for code in src_codes_test:
        key = feature_utils.encode_binary_codes(code)
        bucket = table.get(key, [])
        # Only an unambiguous (single-candidate) bucket yields a prediction.
        preds.append(bucket[0] if len(bucket) == 1 else None)
    t_end = time.time()
    pred_time = t_end - t_loop
    test_time = t_end - t_start
    r = eval_utils.eval_prec_rec_f1_ir(preds, labels)
    logger.info('eval results: Prec. %.4f, Rec. %.4f, F1. %.4f', r[0], r[1], r[2])
    logger.info('test time %.2fs, predict time %.2fs', test_time, pred_time)
def prepare_paper_title_to_vectors(self):
    """Return ``(src_vectors_test, dst_vectors)`` doc2vec title vectors.

    Source vectors come from the noisy test titles, destination vectors
    from all clean titles (train + test). Results are cached as pickles
    under ``settings.OUT_PAPER_DIR``; when both cache files exist they are
    loaded instead of being recomputed.
    """
    if self.model is None:
        self.load_model()
    src_fname = 'src-titles-doc2vec-test.pkl'
    dst_fname = 'dst-titles-doc2vec.pkl'
    have_src = os.path.isfile(join(settings.OUT_PAPER_DIR, src_fname))
    have_dst = os.path.isfile(join(settings.OUT_PAPER_DIR, dst_fname))
    if have_src and have_dst:
        src_vectors_test = data_utils.load_large_obj(settings.OUT_PAPER_DIR, src_fname)
        dst_vectors = data_utils.load_large_obj(settings.OUT_PAPER_DIR, dst_fname)
        return src_vectors_test, dst_vectors
    clean_train = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'clean-papers-train.dat')
    clean_test = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'clean-papers-test.dat')
    clean_titles = [paper['title'].lower() for paper in clean_train + clean_test]
    noisy_test = data_utils.load_json_lines(settings.PAPER_DATA_DIR, 'noisy-papers-test.dat')
    noisy_titles = [paper['title'].lower() for paper in noisy_test]
    src_vectors_test = self.titles2vec(noisy_titles)
    src_vectors_test = feature_utils.scale_matrix(src_vectors_test)  # useful
    dst_vectors = self.titles2vec(clean_titles)
    dst_vectors = feature_utils.scale_matrix(dst_vectors)
    data_utils.dump_large_obj(src_vectors_test, settings.OUT_PAPER_DIR, src_fname)
    data_utils.dump_large_obj(dst_vectors, settings.OUT_PAPER_DIR, dst_fname)
    return src_vectors_test, dst_vectors
def build_cpapers_inverted_index(self):
    """Build an inverted index from title words to clean-paper ids.

    Words are extracted from the lower-cased title via
    ``feature_utils.get_words`` (window controlled by
    ``self.build_index_window``); each word maps to the de-duplicated list
    of paper ids whose title produced it. Returns the index (a defaultdict).
    """
    logger.info('build inverted index for cpapers')
    train_papers = data_utils.load_json_lines(self.file_dir, 'clean-papers-train.dat')
    test_papers = data_utils.load_json_lines(self.file_dir, 'clean-papers-test.dat')
    word2ids = dd(list)
    for paper in train_papers + test_papers:
        pid = str(paper['id'])
        tokens = feature_utils.get_words(paper['title'].lower(), window=self.build_index_window)
        for token in tokens:
            word2ids[token].append(pid)
    # De-duplicate postings in place (keeps the defaultdict return type).
    for token in word2ids:
        word2ids[token] = list(set(word2ids[token]))
    # data_utils.dump_json(word2ids, self.file_dir, 'clean-papers-inverted-index.json')
    logger.info('building inverted index completed')
    return word2ids
def get_noisy_papers_test(self):
    """Load and return the noisy-papers test split (line-delimited JSON)."""
    fname = 'noisy-papers-test.dat'
    return data_utils.load_json_lines(self.file_dir, fname)