# Example no. 1
0
def compute_tf(query_terms: List[str], index_reader: IndexReader,
               doc_id: str) -> np.ndarray:
    """Return the term frequency of each query term within one document.

    Args:
        query_terms: Terms to look up. They are matched verbatim against the
            keys of the document vector.
            NOTE(review): presumably these must already be in the index's
            analyzed (e.g. stemmed) form -- confirm against the caller.
        index_reader: Object exposing ``get_document_vector(doc_id)``, which
            returns a term -> frequency mapping, or ``None`` when no vector
            is available for ``doc_id``.
        doc_id: Identifier of the document whose vector is read.

    Returns:
        A 1-D float array with ``len(query_terms)`` entries, where entry ``i``
        is the frequency of ``query_terms[i]`` in the document (0 when the
        term is absent). If the reader has no vector for ``doc_id``, every
        entry is 0.
    """
    query_tf = np.zeros(len(query_terms))
    doc_vector = index_reader.get_document_vector(doc_id)

    # A missing document vector means no term can occur in it; return the
    # zero vector rather than failing with AttributeError on ``None.get``.
    if doc_vector is None:
        return query_tf

    for i, term in enumerate(query_terms):
        query_tf[i] = doc_vector.get(term, 0)

    return query_tf
# Retrieve a document using its docid.
# `hits` is defined outside this section -- presumably the result list of an
# earlier search; here the docid of its first entry is used.
# Alternative: hard-code a known docid instead.
#id = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
id = hits[0].docid

# Fetch the document through the reader and keep its raw form.
# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(id).raw()
#print(doc)

# Get the analyzed form of the raw text (tokenized, stemmed, stopwords removed).
analyzed = reader.analyze(doc)
#print(analyzed)

# The document VECTOR is also stored in the index and can be read by docid.
# NOTE(review): utils.top_n_words is defined elsewhere -- presumably it shows
# the 10 highest-frequency terms of the vector; confirm in utils.
doc_vector = reader.get_document_vector(id)
utils.top_n_words(doc_vector, 10)

# ----------------
# Topics
# ----------------
# Load the topic set registered under the name 'core18'.
from pyserini.search import get_topics
topics = get_topics('core18')

# Get some information on all topics.
utils.print_topic(topics)

# More detailed info on a single topic: id 336, the black bear example.
utils.print_topic(topics, id=336)

# ----------------