예제 #1
0
    def titles2matrix(self, title1, title2):
        """Build a square word-match matrix for a pair of titles.

        Both titles are tokenized with ``data_utils.get_words`` (stop words
        kept) and truncated to ``self.title_mat_size`` tokens. Cell ``(i, j)``
        is 1 when token ``i`` of ``title1`` equals token ``j`` of ``title2``;
        every other cell, including the padding beyond the tokens, is -1.

        Returns:
            Float array of shape ``(title_mat_size, title_mat_size)``.
        """
        size = self.title_mat_size
        tokens_a = data_utils.get_words(title1, remove_stopwords=False)[:size]
        tokens_b = data_utils.get_words(title2, remove_stopwords=False)[:size]

        # Start from "no match" everywhere so only the hits need writing.
        match_grid = np.full((size, size), -1.0)
        for row, tok_a in enumerate(tokens_a):
            for col, tok_b in enumerate(tokens_b):
                if tok_a == tok_b:
                    match_grid[row, col] = 1
        return match_grid
예제 #2
0
 def model2vec(self, model, titles, d):  # doc2vec model
     """Infer one ``d``-dimensional vector per title with ``model``.

     Each title is tokenized with ``data_utils.get_words`` and the tokens
     are passed to ``model.infer_vector``; the result is stored as one row.

     Args:
         model: object exposing ``infer_vector(words)``.
         titles: sized sequence of titles.
         d: length of the vectors produced by ``model``.

     Returns:
         ``np.double`` array of shape ``(len(titles), d)``.
     """
     embeddings = np.zeros((len(titles), d), dtype=np.double)
     for row, title in enumerate(titles):
         embeddings[row, :] = model.infer_vector(data_utils.get_words(title))
     return embeddings
예제 #3
0
 def get_candidates_by_ii(self, npaper, word2ids):
     """Look up candidate ids for a paper through an inverted index.

     The paper's ``'title'`` is tokenized with ``data_utils.get_words``
     using ``self.ii_window``; the ids stored in ``word2ids`` under each
     token that is present are merged and de-duplicated.

     Args:
         npaper: mapping that contains a ``'title'`` entry.
         word2ids: mapping from token to an iterable of ids.

     Returns:
         List of distinct ids (order is whatever ``set`` iteration gives).
     """
     tokens = data_utils.get_words(npaper['title'], window=self.ii_window)
     # Flatten the postings of every indexed token into one list.
     matched_ids = [
         cid
         for token in tokens
         if token in word2ids
         for cid in word2ids[token]
     ]
     return list(set(matched_ids))
예제 #4
0
파일: Hash.py 프로젝트: sangcheng0615/OAG
def authors2b(authors):
    """Encode the initials of the words in ``authors`` as a flag vector.

    Each author name is lower-cased and split with ``data_utils.get_words``.
    For every resulting word, the offset of its first character from ``'a'``
    selects a position in the vector, which is set to 1. Offsets outside
    ``[0, other_bit)`` are ignored.

    Args:
        authors: iterable of author-name strings.

    Returns:
        1-D float array of length ``other_bit`` holding 0/1 flags.
    """
    # np.zeros allocates ``other_bit`` slots; np.array((other_bit, )) would
    # build a one-element array whose only value is ``other_bit`` and make
    # every index above 0 raise IndexError.
    matrix = np.zeros((other_bit, ))
    for author in authors:
        for name in data_utils.get_words(author.lower()):
            if not name:
                # An empty word has no initial to encode.
                continue
            bit_order = ord(name[0]) - ord('a')
            if 0 <= bit_order < other_bit:
                matrix[bit_order] = 1
    return matrix
예제 #5
0
파일: Hash.py 프로젝트: sangcheng0615/OAG
 def authors2binary_matrix(self, authors_list):
     """Encode the word initials of each author list as a row of flags.

     Row ``i`` corresponds to ``authors_list[i]``. Every author name is
     lower-cased and split with ``data_utils.get_words``; the offset of each
     word's first character from ``'a'`` picks the column to set to 1.
     Offsets outside ``[0, self.other_bit)`` are skipped.

     Args:
         authors_list: sized sequence whose items are iterables of
             author-name strings.

     Returns:
         Float array of shape ``(len(authors_list), self.other_bit)``
         containing 0/1 flags.
     """
     n_bits = self.other_bit
     base = ord('a')
     flags = np.zeros((len(authors_list), n_bits))
     for row, authors in enumerate(authors_list):
         for author in authors:
             for token in data_utils.get_words(author.lower()):
                 offset = ord(token[0]) - base
                 if offset < 0 or offset >= n_bits:
                     continue
                 flags[row, offset] = 1
     return flags
예제 #6
0
 def prepare_corpus(self):
     """Load the training texts and wrap each one as a tagged document.

     The corpus is read with ``data_utils.load_json`` from
     ``self.train_data_dir`` / ``self.train_data_fname``. Each text is
     tokenized with ``data_utils.get_words`` and paired with a single tag,
     its position in the corpus. Progress is printed every 10000 texts.

     Returns:
         List of ``AnalyzedDocument(words, tags)`` named tuples.
     """
     AnalyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
     raw_docs = data_utils.load_json(self.train_data_dir, self.train_data_fname)
     print('training documents loaded')
     print('documents number: {}'.format(len(raw_docs)))

     analyzed = []
     for idx, text in enumerate(raw_docs):
         if not idx % 10000:
             print(idx)
         tokens = data_utils.get_words(text)
         analyzed.append(AnalyzedDocument(tokens, [idx]))
     return analyzed
예제 #7
0
 def build_inverted_index(self, fold):
     """Build and persist a token -> paper-id inverted index for one fold.

     Papers are read with ``data_utils.load_json_lines`` from
     ``clean-papers-test-<fold>.dat`` in ``self.paper_dir``. Each paper's
     ``'title'`` is tokenized with ``data_utils.get_words`` using
     ``self.ii_window`` and the stringified ``'id'`` is recorded under every
     token. The de-duplicated index is written with ``data_utils.dump_json``
     to ``clean-papers-test-ii-<fold>.json`` in ``self.inverted_index_dir``.

     Returns:
         The index, a ``defaultdict(list)`` mapping token to distinct ids.
     """
     print('build inverted index for cpapers: fold', fold)
     source_name = 'clean-papers-test-{}.dat'.format(fold)
     target_name = 'clean-papers-test-ii-{}.json'.format(fold)

     word2ids = dd(list)
     for paper in data_utils.load_json_lines(self.paper_dir, source_name):
         pid = str(paper['id'])
         for token in data_utils.get_words(paper['title'], window=self.ii_window):
             word2ids[token].append(pid)

     # Re-assigning existing keys does not resize the dict, so this is safe
     # to do while iterating over it.
     for token, pids in word2ids.items():
         word2ids[token] = list(set(pids))

     data_utils.dump_json(word2ids, self.inverted_index_dir, target_name)
     print('complete building II')
     return word2ids