예제 #1
0
    def get_posting_lists(self,
                          filepath: str) -> Iterator[Tuple[str, List[int]]]:
        """Return a lazy iterator of ``(term, documents)`` pairs.

        Documents come from ``read_docs(filepath)``. Each one is paired with
        its terms in sorted order and handed to ``self.make_term_doc_pair``.
        The per-document streams are merged on ``get_fst``, grouped on the
        same key, and every group is reduced to the sorted values extracted
        with ``get_snd`` after passing them through ``unique``.

        NOTE(review): presumably ``get_fst``/``get_snd`` select the first and
        second tuple element and ``make_term_doc_pair`` yields
        ``(term, doc_id)`` tuples in term order -- confirm against their
        definitions.
        """
        # heapq.merge only produces an ordered stream when every input is
        # already ordered, hence the per-document sort of the terms.
        docs_with_terms = (
            (document, sorted(get_terms(document)))
            for document in read_docs(filepath)
        )
        per_doc_streams = map(self.make_term_doc_pair, docs_with_terms)
        # The star-unpacking drains the document stream right here; only the
        # merge itself and everything after it stay lazy.
        merged_stream = heapq.merge(*per_doc_streams, key=get_fst)

        def to_posting(group: Tuple[str, Iterator[Tuple[str, int]]]):
            # The group iterator must be consumed before groupby advances,
            # which sorted() does immediately.
            term, entries = group
            ordered_docs = sorted(get_snd(entry) for entry in entries)
            return term, list(unique(ordered_docs))

        return map(to_posting, groupby(merged_stream, get_fst))
예제 #2
0
    def get_posting_lists(
            self,
            filepath: str) -> Iterator[Tuple[str, List[Tuple[int, int]]]]:
        """Return a lazy iterator of ``(term, [(document, count), ...])``.

        Documents come from ``read_docs(filepath)``. Each one is paired with
        the output of ``get_terms_freqs`` sorted on ``get_fst`` and handed to
        ``self.make_term_doc_pair``. The per-document streams are merged and
        grouped on ``get_fst``; within a group the ``(doc, tf)`` pairs are
        sorted on ``get_fst`` and combined by ``groupbyfst`` with ``op.add``
        and an initial value of ``0``.

        NOTE(review): presumably ``make_term_doc_pair`` yields
        ``(term, doc, tf)`` triples in term order and ``groupbyfst`` sums the
        counts of entries sharing a document -- confirm against their
        definitions.
        """
        # heapq.merge needs every input ordered by the merge key, hence the
        # per-document sort.
        docs_with_freqs = (
            (document, sorted(get_terms_freqs(document), key=get_fst))
            for document in read_docs(filepath)
        )
        per_doc_streams = map(self.make_term_doc_pair, docs_with_freqs)
        # The star-unpacking drains the document stream right here; only the
        # merge itself and everything after it stay lazy.
        merged_stream = heapq.merge(*per_doc_streams, key=get_fst)

        def to_posting(group: Tuple[str, Iterator[Tuple[str, int, int]]]):
            term, entries = group
            # Drop the repeated term and keep (doc, tf); the list is built at
            # once so the group iterator is consumed before groupby advances.
            doc_tf_pairs = [(doc, tf) for _, doc, tf in entries]
            doc_tf_pairs.sort(key=get_fst)
            return term, list(groupbyfst(doc_tf_pairs, op.add, 0))

        return map(to_posting, groupby(merged_stream, get_fst))
예제 #3
0
def get_docs_text(filepath: str):
    """Return a generator of ``get_terms(doc.raw)`` for each document.

    The documents read from ``filepath`` are sorted by their ``id`` attribute
    up front; the term extraction itself runs lazily as the generator is
    consumed.
    """
    docs_by_id = sorted(read_docs(filepath),
                        key=lambda document: document.id)
    return (get_terms(document.raw) for document in docs_by_id)
예제 #4
0
    merged = []
    i, j = 0, 0
    while i < len(xs) and j < len(ys):
        x, y = xs[i], ys[j]
        if x == y:
            i += 1
            j += 1
        elif x > y:
            merged.append(y)
            j += 1
        else:
            merged.append(x)
            i += 1
    extra = xs[i:] if i < len(xs) else ys[j:] if j < len(ys) else []
    return merged + extra


# Build the index once from the whole corpus.
docs = list(read_docs(docs_path))
inv_index = InvertedIndex(docs)

# Queries are reported in ascending id order for both merge strategies.
queries = read_queries(queries_path)
sorted_queries = sorted(queries, key=lambda query: query.id)

# Run the same report once per merge strategy, each under its own banner.
for banner, merge in (
        ("merge with conjunction ---------------", merge_and),
        ("merge with disjunction --------------", merge_or),
):
    print(banner)
    for q in sorted_queries:
        docs = inv_index.find_docs(q, merge)
        print(q.id, docs)