def query_index(self, query, n_results=5):
    # Vectorize the query with the same normalize/tokenize pipeline used for
    # documents; indexing=False marks this as a query-time call.
    query_vector = self._vectorize(tokenize(normalize(query)), indexing=False)
    # Ask Annoy for approximate nearest neighbours; a larger search_k inspects
    # more nodes and improves recall at the cost of query time.
    results, distances = self.index.get_nns_by_vector(
        query_vector, n=n_results, include_distances=True,
        search_k=10 * len(self.documents))
    return [(distance, self.documents[result])
            for result, distance in zip(results, distances)]

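# A minimal, self-contained sketch of the Annoy calls the method above relies
# on (it assumes the `annoy` package is installed; the dimensionality and
# vectors are made up for illustration, not taken from the original project).
from annoy import AnnoyIndex

dim = 3
ann = AnnoyIndex(dim, 'angular')                     # angular distance ~ cosine
for item_id, vector in enumerate([[1.0, 0.0, 0.0],
                                  [0.9, 0.1, 0.0],
                                  [0.0, 1.0, 0.0]]):
    ann.add_item(item_id, vector)
ann.build(n_trees=10)                                # more trees -> better recall

# include_distances=True returns (ids, distances), nearest first; smaller
# distance means a closer match, unlike the cosine-similarity variant below
# where larger scores are better.
ids, dists = ann.get_nns_by_vector([1.0, 0.05, 0.0], n=2,
                                   include_distances=True, search_k=100)
print(list(zip(ids, dists)))
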
def query_index(self, query, n_results=5):
    # Keep only query tokens that actually occur in the inverted index.
    query_tokens = [word for word in tokenize(normalize(query))
                    if word in self.index]
    query_norm = self._norm(query_tokens)
    query_vector = self._vectorize(query_tokens)
    processed = set()
    # Fixed-size list of (score, doc_id), kept sorted in ascending score order;
    # a plain list is fine under the assumption that n_results is small.
    top_hits = [(0, None)] * n_results
    for token in set(query_tokens):
        for document_id in self.index[token]:
            if document_id in processed:
                continue
            processed.add(document_id)
            document_vector = self.doc_vectors[document_id]
            document_norm = self.doc_norms[document_id]
            # Cosine similarity: unnormalized dot product divided by the norms.
            similarity = self._similarity_unnorm(
                unweighted_vector=query_vector,
                weighted_vector=document_vector)
            similarity /= (query_norm * document_norm)
            # Only documents beating the current worst hit are inserted.
            if similarity > top_hits[0][0]:
                del top_hits[0]
                insert_location = 0
                for score, _ in top_hits:
                    if similarity < score:
                        break
                    insert_location += 1
                top_hits.insert(insert_location, (similarity, document_id))
    # Reverse so the best-scoring documents come first; drop unused slots.
    return [(score, self.documents[doc_id])
            for (score, doc_id) in reversed(top_hits)
            if doc_id is not None]

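# Hedged alternative sketch (not from the original code): the fixed-size
# sorted list above costs O(n_results) per insertion, which is fine for small
# n_results; heapq.nlargest from the standard library yields the same
# best-first top-k ordering. Scores and ids below are made up for illustration.
import heapq

scored = [(0.42, 3), (0.87, 0), (0.10, 7), (0.65, 2)]   # (similarity, doc_id)
top = heapq.nlargest(2, scored)                          # best first
print(top)   # [(0.87, 0), (0.65, 2)]
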
def write_vocabulary(collection, parsed_args):
    os.makedirs(os.path.join(parsed_args.out_dir, collection.name), exist_ok=True)
    # Collect the union of all tokens over documents and queries.
    vocabulary = set()
    texts = list(collection.documents.values()) + list(collection.queries.values())
    for text in texts:
        vocabulary |= set(tokenize(normalize(text)))
    with open(os.path.join(parsed_args.out_dir,
                           collection.name + '-vocabulary.txt'), 'w') as f:
        f.write('\n'.join(vocabulary))

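# The collection objects passed to write_vocabulary and to
# write_to_standard_form below only need to expose .name, .documents,
# .queries and .relevance (the last three keyed by id). A minimal stand-in
# for experimenting with these helpers might look like the namedtuple here;
# the field contents are illustrative assumptions, not from the original code.
from collections import namedtuple

Collection = namedtuple('Collection', ['name', 'documents', 'queries', 'relevance'])

toy = Collection(
    name='toy',
    documents={1: 'Information retrieval with inverted indexes.',
               2: 'Approximate nearest neighbour search with Annoy.'},
    queries={1: 'inverted index'},
    relevance={1: [1]},
)
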
def write_to_standard_form(collection, parsed_args):
    def write_to_file(name, dictionary, transform=lambda i: i):
        out_path = os.path.join(parsed_args.out_dir, collection.name,
                                '{}-{}.txt'.format(collection.name, name))
        with open(out_path, 'w') as out_file:
            out_file.write('.count.{} {}\n\n'.format(name, len(dictionary)))
            for item_id, item in dictionary.items():
                out_file.write('.id.{} {}\n{}\n\n'.format(
                    name[0], item_id, transform(item)))

    os.makedirs(os.path.join(parsed_args.out_dir, collection.name), exist_ok=True)
    if parsed_args.phrases:
        # Learn multi-word phrases from the document texts (the dict values,
        # not the ids), then merge them into single tokens before writing.
        phrases = detect_phrases([normalize(doc)
                                  for doc in collection.documents.values()])
        transform_f = lambda l: ' '.join(
            tokenize(replace_phrases([normalize(l)], phrases=phrases)))
    else:
        # Drop single-character tokens; the unfiltered variant would be:
        # transform_f = lambda l: ' '.join(tokenize(normalize(l)))
        transform_f = lambda l: ' '.join(
            filter(lambda s: len(s) > 1, tokenize(normalize(l))))
    write_to_file('documents', collection.documents, transform=transform_f)
    write_to_file('queries', collection.queries, transform=transform_f)
    write_to_file('relevance', collection.relevance,
                  transform=lambda l: ' '.join(map(str, l)))

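# For the toy collection above, write_to_file('documents', ...) produces a
# file laid out as sketched below (ids and transformed text are illustrative):
# a '.count.' header with the number of items, then one '.id.<first letter of
# name>' line per item followed by its transformed text and a blank line.
#
#   .count.documents 2
#
#   .id.d 1
#   information retrieval with inverted indexes
#
#   .id.d 2
#   approximate nearest neighbour search with annoy
#
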
def index_documents(self, documents):
    doc_tokens = []
    for document in documents:
        self.documents.append(document)
        doc_tokens.append(tokenize(normalize(document)))
    # Word weights and stopwords are fitted on the full corpus before any
    # single document is vectorized.
    self._init_word_weights_stopwords(doc_tokens, **self.word_weight_options)
    for i, tokens in enumerate(doc_tokens):
        self.index.add_item(i, self._vectorize(tokens=tokens, indexing=True))
    self.index.build(n_trees=10)

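# _init_word_weights_stopwords is not shown here; a common choice for
# corpus-level word weights is inverse document frequency, so the sketch below
# is a hypothetical stand-in for that step, not the project's implementation.
import math

def idf_weights(doc_tokens):
    # doc_tokens: list of token lists, one per document
    n_docs = len(doc_tokens)
    document_frequency = {}
    for tokens in doc_tokens:
        for token in set(tokens):
            document_frequency[token] = document_frequency.get(token, 0) + 1
    return {token: math.log(n_docs / df)
            for token, df in document_frequency.items()}

print(idf_weights([['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]))
# {'a': 0.0, 'b': 0.405..., 'c': 0.405...}
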
def index_documents(self, documents):
    self.documents = list(documents)
    doc_tokens = []
    # Iterate over the stored copy so that generator inputs are not exhausted.
    for document in self.documents:
        doc_tokens.append(tokenize(normalize(document)))
    self._init_word_weights_stopwords(doc_tokens, **self.word_weight_options)
    # Per-document token lists with stopwords removed, plus precomputed
    # weighted vectors; norms are computed over the unfiltered token lists.
    self.doc_tokens = [[token for token in document
                        if token not in self.stopwords]
                       for document in doc_tokens]
    self.doc_vectors = [self._vectorize(tokens, weighted=True)
                        for tokens in self.doc_tokens]
    self.doc_norms = [self._norm(tokens) for tokens in doc_tokens]
    # Build the inverted index: token -> list of document ids. Documents are
    # processed in increasing id order, so checking the last entry of a
    # posting list is enough to avoid duplicate ids.
    for i, document in enumerate(self.doc_tokens):
        for token in document:
            if token not in self.index:
                self.index[token] = [i]
            elif self.index[token][-1] != i:
                self.index[token].append(i)

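# Self-contained illustration of the posting-list construction used above;
# the token lists are made up for illustration.
toy_docs = [['apple', 'banana', 'apple'], ['banana', 'cherry']]
inverted = {}
for doc_id, tokens in enumerate(toy_docs):
    for token in tokens:
        if token not in inverted:
            inverted[token] = [doc_id]
        elif inverted[token][-1] != doc_id:
            inverted[token].append(doc_id)
print(inverted)   # {'apple': [0], 'banana': [0, 1], 'cherry': [1]}
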
def texts_to_tokens(texts):
    tokens = []
    for text in texts:
        tokens += tokenize(normalize(text))
    return tokens

def to_fasttext(line):
    # One whitespace-separated example per line, with single-character tokens dropped.
    return ' '.join(filter(lambda s: len(s) > 1, tokenize(normalize(line))))

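# Hypothetical usage sketch: normalize and tokenize are defined elsewhere in
# the project, so simple stand-ins are used here to show the effect of the
# len > 1 filter (single-character tokens such as stray punctuation or 'a'
# are dropped before writing fastText training lines).
def _normalize(line):       # stand-in, not the project's normalize
    return line.lower()

def _tokenize(line):        # stand-in, not the project's tokenize
    return line.split()

def _to_fasttext(line):
    return ' '.join(filter(lambda s: len(s) > 1, _tokenize(_normalize(line))))

print(_to_fasttext('A quick brown fox !'))   # 'quick brown fox'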