def titles2matrix(self, title1, title2):
    """Build a word-match matrix between two titles.

    Cell (i, j) is 1 when the i-th word of title1 equals the j-th word
    of title2, and -1 otherwise; cells beyond either title's length stay
    at the -1 initialization. Both titles are truncated to the first
    self.title_mat_size words (stopwords kept).
    """
    size = self.title_mat_size
    words_a = data_utils.get_words(title1, remove_stopwords=False)[:size]
    words_b = data_utils.get_words(title2, remove_stopwords=False)[:size]
    matrix = -np.ones((size, size))
    for row, wa in enumerate(words_a):
        for col, wb in enumerate(words_b):
            if wa == wb:
                matrix[row][col] = 1
            else:
                matrix[row][col] = -1
    return matrix
def model2vec(self, model, titles, d):  # doc2vec model
    """Infer a d-dimensional doc2vec vector for every title.

    Returns an (len(titles), d) float64 array whose i-th row is
    model.infer_vector applied to the tokenized i-th title.
    """
    vectors = np.zeros((len(titles), d), dtype=np.double)
    for row, title in enumerate(titles):
        tokens = data_utils.get_words(title)
        vectors[row, :] = model.infer_vector(tokens)
    return vectors
def get_candidates_by_ii(self, npaper, word2ids):
    """Look up candidate ids via the inverted index.

    Returns the deduplicated list of ids associated with any word that
    appears in the paper's title (tokenized with window self.ii_window).
    """
    words = data_utils.get_words(npaper['title'], window=self.ii_window)
    candidates = set()
    for word in words:
        if word in word2ids:
            candidates.update(word2ids[word])
    return list(candidates)
def authors2b(authors):
    """Encode a list of author names as a first-letter bit vector.

    Bit k is set to 1 when any word of any author's (lowercased) name
    starts with the k-th lowercase letter ('a' -> 0, 'b' -> 1, ...);
    letters outside [0, other_bit) are ignored. Mirrors the per-row
    logic of authors2binary_matrix.
    """
    # BUG FIX: the original `np.array((other_bit,))` created a 1-element
    # array containing the VALUE other_bit, so bit 0 started out wrong and
    # any bit_order >= 1 raised IndexError. The intent (see
    # authors2binary_matrix) is a zero vector of length other_bit.
    matrix = np.zeros((other_bit,))
    for author in authors:
        author = author.lower()
        name_words = data_utils.get_words(author)
        for name in name_words:
            bit_order = ord(name[0]) - ord('a')
            if 0 <= bit_order < other_bit:
                matrix[bit_order] = 1
    return matrix
def authors2binary_matrix(self, authors_list):
    """Encode each author list as a row of first-letter bits.

    Row i gets bit k set to 1 when some word of some author name in
    authors_list[i] (lowercased) starts with the k-th lowercase letter;
    letters outside [0, self.other_bit) are skipped.
    """
    matrix = np.zeros((len(authors_list), self.other_bit))
    for row, authors in enumerate(authors_list):
        for author in authors:
            for word in data_utils.get_words(author.lower()):
                bit = ord(word[0]) - ord('a')
                if 0 <= bit < self.other_bit:
                    matrix[row, bit] = 1
    return matrix
def prepare_corpus(self):
    """Load the training corpus and wrap every document for doc2vec.

    Each document becomes an AnalyzedDocument(words=<tokens>, tags=[i])
    where i is its position in the corpus; returns the full list.
    """
    AnalyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    train_corpus = data_utils.load_json(self.train_data_dir, self.train_data_fname)
    print('training documents loaded')
    print('documents number: {}'.format(len(train_corpus)))
    analyzed = []
    for idx, text in enumerate(train_corpus):
        if idx % 10000 == 0:
            print(idx)  # progress heartbeat
        doc = AnalyzedDocument(words=data_utils.get_words(text), tags=[idx])
        analyzed.append(doc)
    return analyzed
def build_inverted_index(self, fold):
    """Build, persist, and return a word -> paper-id inverted index.

    Reads the clean test papers for the given fold, maps every title
    word (tokenized with window self.ii_window) to the list of paper ids
    containing it, dedupes each postings list, and dumps the result as
    JSON under self.inverted_index_dir.
    """
    print('build inverted index for cpapers: fold', fold)
    fname = 'clean-papers-test-{}.dat'.format(fold)
    papers = data_utils.load_json_lines(self.paper_dir, fname)
    word2ids = dd(list)
    for paper in papers:
        pid = str(paper['id'])
        for word in data_utils.get_words(paper['title'], window=self.ii_window):
            word2ids[word].append(pid)
    # Deduplicate each postings list in place.
    for word in word2ids:
        word2ids[word] = list(set(word2ids[word]))
    data_utils.dump_json(word2ids, self.inverted_index_dir,
                         'clean-papers-test-ii-{}.json'.format(fold))
    print('complete building II')
    return word2ids