from typing import List

import numpy as np
from pyserini.index import IndexReader


def compute_tf(query_terms: List[str], index_reader: IndexReader, doc_id: str) -> np.ndarray:
    """Return the frequency of each (analyzed) query term in the given document."""
    query_tf = np.zeros(len(query_terms))
    doc_vector = index_reader.get_document_vector(doc_id)
    for i, term in enumerate(query_terms):
        query_tf[i] = doc_vector.get(term, 0)
    return query_tf
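# A minimal usage sketch for compute_tf. It assumes `reader` (an IndexReader)
# and `hits` from earlier in the notebook, and an index built with document
# vectors stored (-storeDocvectors); the query string is only illustrative.
query_terms = reader.analyze('black bear attacks')  # analyze to match indexed terms
tf = compute_tf(query_terms, reader, hits[0].docid)
print(dict(zip(query_terms, tf)))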
# Retrieve a document using its docid
#docid = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
docid = hits[0].docid

# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(docid).raw()
#print(doc)

# Get the analyzed form (tokenized, stemmed, stopwords removed)
analyzed = reader.analyze(doc)
#print(analyzed)

# The raw document vector (term -> frequency) is also stored
doc_vector = reader.get_document_vector(docid)
utils.top_n_words(doc_vector, 10)

# ----------------
# Topics
# ----------------
from pyserini.search import get_topics

topics = get_topics('core18')

# Get some information on all topics
utils.print_topic(topics)

# More detailed info on the black bear example
utils.print_topic(topics, id=336)
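# Tying the pieces together -- a minimal sketch that computes term frequencies
# of the black bear topic's title terms in its own top-ranked document.
# Assumes `searcher` and `reader` from earlier in the notebook.
title = topics[336]['title']
query_terms = reader.analyze(title)  # same analyzer as the index
topic_hits = searcher.search(title)
tf = compute_tf(query_terms, reader, topic_hits[0].docid)
for term, freq in zip(query_terms, tf):
    print(f'{term}: {int(freq)}')

# ----------------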