示例#1
0
 def get_id2cpapers(self):
     """Load the clean train and test papers and index them by string id.

     Each record's ``'id'`` field is replaced in place by its ``str`` form,
     so the returned records carry the same string id they are keyed by.
     The train file is read before the test file; if an id occurs more
     than once, the record seen last is the one kept.

     Returns:
         dict: string paper id -> paper record.
     """
     load = data_utils.load_json_lines
     train_papers = load(self.file_dir, 'clean-papers-train.dat')
     test_papers = load(self.file_dir, 'clean-papers-test.dat')

     id2paper = {}
     for record in train_papers + test_papers:
         key = str(record['id'])
         record['id'] = key  # normalise the id stored on the record too
         id2paper[key] = record
     return id2paper
示例#2
0
文件: hash.py 项目: whuscity/OAG
 def dump_dst_hash_tables(self):
     """Group clean-paper ids by encoded binary code and dump them as JSON.

     The i-th destination binary code is paired with the i-th record of
     the concatenated clean train + test papers. The resulting mapping
     (encoded code -> list of string paper ids) is written to
     'hash_to_dst_paper_id.json' under ``settings.OUT_PAPER_DIR``.
     """
     # Only the destination-side codes are needed to build the table.
     _, dst_codes = self.two_domain_title_vectors_to_binary_codes()

     train_papers = data_utils.load_json_lines(settings.PAPER_DATA_DIR,
                                               'clean-papers-train.dat')
     test_papers = data_utils.load_json_lines(settings.PAPER_DATA_DIR,
                                              'clean-papers-test.dat')
     papers = train_papers + test_papers

     buckets = dd(list)
     for idx, code in enumerate(dst_codes):
         bucket_key = feature_utils.encode_binary_codes(code)
         # Positional lookup: a code without a matching paper raises.
         paper_id = str(papers[idx]['id'])
         buckets[bucket_key].append(paper_id)

     data_utils.dump_json(buckets, settings.OUT_PAPER_DIR,
                          'hash_to_dst_paper_id.json')
示例#3
0
文件: hash.py 项目: whuscity/OAG
 def eval_hash_table(self):
     """Evaluate hash-table lookups for the noisy test papers.

     Each source (test) binary code is encoded and looked up in the table
     loaded from 'hash_to_dst_paper_id.json'. A prediction is made only
     when the bucket holds exactly one paper id; otherwise the prediction
     is ``None``. Predictions are scored against the string ids of the
     noisy test papers, and the metrics and timings are logged.
     """
     t_start = time.time()
     # The destination codes are not used during evaluation.
     src_codes, _ = self.two_domain_title_vectors_to_binary_codes()

     noisy_papers = data_utils.load_json_lines(settings.PAPER_DATA_DIR,
                                               'noisy-papers-test.dat')
     labels = [str(paper['id']) for paper in noisy_papers]
     table = data_utils.load_json(settings.OUT_PAPER_DIR,
                                  'hash_to_dst_paper_id.json')

     preds = []
     t_loop = time.time()
     for code in src_codes:
         key = feature_utils.encode_binary_codes(code)
         bucket = table[key] if key in table else ()
         # Ambiguous (several ids) or missing buckets yield no prediction.
         preds.append(bucket[0] if len(bucket) == 1 else None)
     t_end = time.time()

     metrics = eval_utils.eval_prec_rec_f1_ir(preds, labels)
     logger.info('eval results: Prec. %.4f, Rec. %.4f, F1. %.4f',
                 metrics[0], metrics[1], metrics[2])
     # "test time" spans the whole method up to the end of the loop;
     # "predict time" covers the lookup loop only.
     logger.info('test time %.2fs, predict time %.2fs',
                 t_end - t_start, t_end - t_loop)
示例#4
0
文件: title2vec.py 项目: whuscity/OAG
    def prepare_paper_title_to_vectors(self):
        """Return scaled title vectors for noisy test and clean papers.

        The model is loaded first if ``self.model`` is ``None``. If both
        vector files already exist under ``settings.OUT_PAPER_DIR`` they
        are loaded and returned. Otherwise the lower-cased titles of the
        noisy test papers (source) and of the clean train + test papers
        (destination) are passed through ``self.titles2vec``, scaled with
        ``feature_utils.scale_matrix``, dumped to those files and returned.

        Returns:
            tuple: ``(src_vectors_test, dst_vectors)``.
        """
        if self.model is None:
            self.load_model()

        out_dir = settings.OUT_PAPER_DIR
        src_vectors_fname = '{}-titles-doc2vec-test.pkl'.format('src')
        dst_vectors_fname = '{}-titles-doc2vec.pkl'.format('dst')

        # Reuse previously dumped vectors only when both files are present.
        cached = all(
            os.path.isfile(join(out_dir, name))
            for name in (src_vectors_fname, dst_vectors_fname))
        if cached:
            src_vectors_test = data_utils.load_large_obj(out_dir,
                                                         src_vectors_fname)
            dst_vectors = data_utils.load_large_obj(out_dir,
                                                    dst_vectors_fname)
            return src_vectors_test, dst_vectors

        data_dir = settings.PAPER_DATA_DIR
        paper_fname = '{}-papers-{}.dat'

        clean_train = data_utils.load_json_lines(
            data_dir, paper_fname.format('clean', 'train'))
        clean_test = data_utils.load_json_lines(
            data_dir, paper_fname.format('clean', 'test'))
        clean_titles = [
            record['title'].lower() for record in clean_train + clean_test
        ]

        noisy_test = data_utils.load_json_lines(
            data_dir, paper_fname.format('noisy', 'test'))
        noisy_titles = [record['title'].lower() for record in noisy_test]

        # Source vectors are computed before destination vectors, as in
        # the original; the original author marked the scaling as useful.
        src_vectors_test = feature_utils.scale_matrix(
            self.titles2vec(noisy_titles))
        dst_vectors = feature_utils.scale_matrix(
            self.titles2vec(clean_titles))

        data_utils.dump_large_obj(src_vectors_test, out_dir,
                                  src_vectors_fname)
        data_utils.dump_large_obj(dst_vectors, out_dir, dst_vectors_fname)
        return src_vectors_test, dst_vectors
示例#5
0
 def build_cpapers_inverted_index(self):
     """Build an inverted index from title words to clean-paper ids.

     Titles of the clean train + test papers are lower-cased and split by
     ``feature_utils.get_words`` (with ``window=self.build_index_window``).
     Every word returned is mapped to the string ids of the papers whose
     title produced it.

     Returns:
         defaultdict(list): word -> list of distinct string paper ids, in
         order of first occurrence (train papers before test papers).
     """
     logger.info('build inverted index for cpapers')
     cpapers_train = data_utils.load_json_lines(self.file_dir,
                                                'clean-papers-train.dat')
     cpapers_test = data_utils.load_json_lines(self.file_dir,
                                               'clean-papers-test.dat')

     word2ids = dd(list)
     for paper in cpapers_train + cpapers_test:
         pid = str(paper['id'])
         words = feature_utils.get_words(paper['title'].lower(),
                                         window=self.build_index_window)
         for word in words:
             word2ids[word].append(pid)

     # Deduplicate while keeping first-seen order. The previous
     # list(set(...)) produced an order that depends on string hashing,
     # which is randomised per process, so the index was not reproducible
     # between runs. dict preserves insertion order on Python 3.7+.
     for word, ids in word2ids.items():
         word2ids[word] = list(dict.fromkeys(ids))

     logger.info('building inverted index completed')
     return word2ids
示例#6
0
 def get_noisy_papers_test(self):
     """Return the records loaded from 'noisy-papers-test.dat'.

     The file is read from ``self.file_dir`` via
     ``data_utils.load_json_lines``.
     """
     noisy_papers = data_utils.load_json_lines(self.file_dir,
                                               'noisy-papers-test.dat')
     return noisy_papers