Example #1
    def query_index(self, query, n_results=5):
        # Embed the query with the same pipeline used at indexing time.
        query_vector = self._vectorize(tokenize(normalize(query)), indexing=False)
        # Approximate nearest-neighbour lookup; self.index exposes Annoy's
        # get_nns_by_vector API (see Example #5 for how it is built).
        results, distances = self.index.get_nns_by_vector(query_vector,
                                                          n=n_results,
                                                          include_distances=True,
                                                          search_k=10 * len(self.documents))
        return [(distance, self.documents[result])
                for result, distance in zip(results, distances)]
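
query_index above assumes an Annoy index that was populated elsewhere (see Example #5). A minimal self-contained sketch of the same build-then-query pattern, assuming a toy hash-based embed() and a 32-dimensional angular index (neither is part of the original code):

import hashlib
from annoy import AnnoyIndex

DIM = 32

def embed(text):
    # Toy bag-of-hashed-words vector, only here to make the sketch runnable;
    # it stands in for the _vectorize() of the example.
    vector = [0.0] * DIM
    for token in text.lower().split():
        vector[int(hashlib.md5(token.encode()).hexdigest(), 16) % DIM] += 1.0
    return vector

documents = ["an example document", "another example document"]
index = AnnoyIndex(DIM, 'angular')
for i, doc in enumerate(documents):
    index.add_item(i, embed(doc))
index.build(n_trees=10)

hits, distances = index.get_nns_by_vector(embed("example"), 2,
                                          search_k=10 * len(documents),
                                          include_distances=True)
print(list(zip(distances, (documents[i] for i in hits))))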
Example #2
    def query_index(self, query, n_results=5):
        # Keep only the query terms that actually occur in the inverted index.
        query_tokens = [
            word for word in tokenize(normalize(query)) if word in self.index
        ]
        query_norm = self._norm(query_tokens)
        query_vector = self._vectorize(query_tokens)
        processed = set()
        # Simple fixed-size array, kept sorted in ascending score order,
        # under the assumption that n_results is small.
        top_hits = [(0, None)] * n_results
        for token in set(query_tokens):
            for document_id in self.index[token]:
                if document_id in processed:
                    continue
                processed.add(document_id)
                document_tokens = self.doc_tokens[document_id]
                document_norm = self.doc_norms[document_id]
                document_vector = self.doc_vectors[document_id]
                # Raw similarity, then normalised by both norms (cosine-style score).
                similarity = self._similarity_unnorm(
                    unweighted_vector=query_vector,
                    weighted_vector=document_vector)
                similarity /= (query_norm * document_norm)
                if similarity > top_hits[0][0]:
                    # Drop the current minimum and insert at the sorted position.
                    del top_hits[0]
                    insert_location = 0
                    for score, _ in top_hits:
                        if similarity < score:
                            break
                        insert_location += 1
                    top_hits.insert(insert_location, (similarity, document_id))
        # Best hits sit at the end of top_hits, so reverse for descending order.
        return [(score, self.documents[doc_id])
                for (score, doc_id) in reversed(top_hits)
                if doc_id is not None]
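
The hand-maintained sorted array above is one way to keep the best n_results hits; the standard-library heapq does the same bookkeeping in one call. A sketch with illustrative names, not the original implementation:

import heapq

def top_k(scored_docs, k):
    # scored_docs: iterable of (similarity, document_id) pairs.
    # heapq.nlargest returns the k highest-scoring pairs, best first,
    # which is what the manual insert-into-sorted-array loop computes.
    return heapq.nlargest(k, scored_docs, key=lambda pair: pair[0])

print(top_k([(0.2, 1), (0.9, 2), (0.5, 3)], 2))   # [(0.9, 2), (0.5, 3)]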
Example #3
def write_vocabulary(collection, parsed_args):
    os.makedirs(os.path.join(parsed_args.out_dir, collection.name), exist_ok=True)

    # Union of all tokens appearing in either the documents or the queries.
    vocabulary = set()
    texts = list(collection.documents.values()) + list(collection.queries.values())
    for text in texts:
        vocabulary |= set(tokenize(normalize(text)))

    # One token per line; set iteration makes the line order arbitrary.
    with open(os.path.join(parsed_args.out_dir, collection.name + '-vocabulary.txt'), 'w') as f:
        f.write('\n'.join(vocabulary))
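
Because the vocabulary is held in a set, the line order of the output file varies between runs. If a reproducible file is wanted, sorting before writing is a common tweak; a minimal sketch, not taken from the original code:

vocabulary = {'banana', 'apple', 'cherry'}
with open('toy-vocabulary.txt', 'w') as f:
    f.write('\n'.join(sorted(vocabulary)) + '\n')   # deterministic line order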
Example #4
def write_to_standard_form(collection, parsed_args):
    def write_to_file(name, dictionary, transform=lambda i: i):
        out_path = os.path.join(parsed_args.out_dir, collection.name,
                                '{}-{}.txt'.format(collection.name, name))
        with open(out_path, 'w') as out_file:
            # Header record with the item count, then one record per item:
            # ".id.<first letter of name> <item id>" followed by the item body.
            out_file.write('.count.{} {}\n\n'.format(name, len(dictionary)))
            for item_id, item in dictionary.items():
                out_file.write('.id.{} {}\n{}\n\n'.format(name[0], item_id, transform(item)))

    os.makedirs(os.path.join(parsed_args.out_dir, collection.name), exist_ok=True)

    if parsed_args.phrases:
        # Learn phrases from the document texts (not the document ids).
        phrases = detect_phrases([normalize(doc) for doc in collection.documents.values()])
        transform_f = lambda l: ' '.join(tokenize(replace_phrases([normalize(l)], phrases=phrases)))
    else:
        # Drop single-character tokens; the unfiltered variant would be
        # ' '.join(tokenize(normalize(l))).
        transform_f = lambda l: ' '.join(filter(lambda s: len(s) > 1, tokenize(normalize(l))))

    write_to_file('documents', collection.documents, transform=transform_f)
    write_to_file('queries', collection.queries, transform=transform_f)
    write_to_file('relevance', collection.relevance, transform=lambda l: ' '.join(map(str, l)))
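
For reference, the record layout emitted by write_to_file can be replayed on a toy in-memory dictionary; the sketch below only mirrors the format strings above, and the ids and texts are made up:

import io

toy_documents = {'17': 'first document text', '42': 'second document text'}
buffer = io.StringIO()
buffer.write('.count.{} {}\n\n'.format('documents', len(toy_documents)))
for item_id, text in toy_documents.items():
    # 'documents'[0] == 'd', matching name[0] in write_to_file.
    buffer.write('.id.{} {}\n{}\n\n'.format('documents'[0], item_id, text))
print(buffer.getvalue())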
Example #5
    def index_documents(self, documents):
        doc_tokens = []
        for document in documents:
            self.documents.append(document)
            doc_tokens.append(tokenize(normalize(document)))

        # Derive term weights and the stopword list from the whole corpus first.
        self._init_word_weights_stopwords(doc_tokens, **self.word_weight_options)

        # Then vectorize every document and build the approximate-NN (Annoy) index.
        for i, tokens in enumerate(doc_tokens):
            self.index.add_item(i, self._vectorize(tokens=tokens, indexing=True))
        self.index.build(n_trees=10)
Example #6
    def index_documents(self, documents):
        self.documents = list(documents)
        doc_tokens = []
        for document in self.documents:
            doc_tokens.append(tokenize(normalize(document)))
        self._init_word_weights_stopwords(doc_tokens,
                                          **self.word_weight_options)
        # Per-document token lists with stopwords removed, plus their
        # weighted vectors and norms, used for scoring at query time.
        self.doc_tokens = [
            [token for token in document if token not in self.stopwords]
            for document in doc_tokens
        ]
        self.doc_vectors = [
            self._vectorize(tokens, weighted=True)
            for tokens in self.doc_tokens
        ]
        self.doc_norms = [self._norm(tokens) for tokens in doc_tokens]

        # Inverted index: token -> list of ids of the documents containing it.
        for i, document in enumerate(self.doc_tokens):
            for token in document:
                if token not in self.index:
                    self.index[token] = [i]
                elif self.index[token][-1] != i:
                    self.index[token].append(i)
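
To make the postings-list construction concrete, the same loop run on toy token lists outside the class looks like this; the elif guard keeps each document id at most once per token, relying on ids arriving in increasing order:

doc_tokens = [['apple', 'banana', 'apple'], ['banana', 'cherry'], ['banana']]
index = {}
for i, tokens in enumerate(doc_tokens):
    for token in tokens:
        if token not in index:
            index[token] = [i]
        elif index[token][-1] != i:
            index[token].append(i)
print(index)   # {'apple': [0], 'banana': [0, 1, 2], 'cherry': [1]}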
Example #7
def texts_to_tokens(texts):
    tokens = []
    for text in texts:
        tokens += tokenize(normalize(text))
    return tokens
Example #8
def to_fasttext(line):
    return ' '.join(filter(lambda s: len(s) > 1, tokenize(normalize(line))))
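
The length filter is the only difference from a plain join of the tokens; a stand-in demo where lower()/split() replace the project's normalize/tokenize just to keep the line self-contained:

line = 'A cat & a dog'
print(' '.join(filter(lambda s: len(s) > 1, line.lower().split())))   # -> cat dog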