async def get_related(request: Request, uid: str, page_number: int = 1, query_id: str = None):
    if not settings.related_search:
        raise HTTPException(status_code=404, detail="Related search not enabled")

    searcher = request.app.state.searcher
    related_searcher = request.app.state.related_searcher

    # Invalid uid -> 404.
    if uid not in related_searcher.uid_set:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # HNSW parameters.
    k = 20 * page_number
    # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
    # ef needs to be between k and dataset.size().
    ef = 2 * k
    related_searcher.hnsw.set_ef(ef)

    # Retrieve documents from HNSW.
    labels, distances = related_searcher.hnsw.knn_query(source_vector, k=k)

    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid is not overwritten before logging.
        related_uid = related_searcher.index_to_uid[index]
        hit = searcher.doc(related_uid)
        if hit.lucene_document() is None:
            continue
        result = build_related_result(hit, dist)
        related_results.append(result)

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(
        json.dumps({
            "query_id": query_id,
            "uid": uid,
            "page_number": page_number,
            "request_ip": get_request_ip(request),
            "timestamp": datetime.utcnow().isoformat(),
            "response": [r.json() for r in related_results],
        }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
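
# The handler above assumes an app-state related_searcher exposing `uid_set`,
# `embedding`, `index_to_uid`, and an hnswlib index as `hnsw`. The class below is a
# minimal sketch of how that state could be loaded with hnswlib; the class name,
# file paths, pickle format, and embedding dimension are assumptions for
# illustration, not the project's actual loader.
import pickle

import hnswlib


class HnswRelatedSearcher:

    def __init__(self, index_path: str, embedding_path: str, dim: int = 768):
        # uid -> embedding vector, plus the uid <-> HNSW-label mappings used above.
        with open(embedding_path, "rb") as f:
            self.embedding = pickle.load(f)
        self.uid_set = set(self.embedding.keys())
        self.index_to_uid = {i: uid for i, uid in enumerate(self.embedding.keys())}

        # Load a prebuilt hnswlib index; set_ef/knn_query are the calls used above.
        self.hnsw = hnswlib.Index(space="ip", dim=dim)
        self.hnsw.load_index(index_path, max_elements=len(self.embedding))
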
async def get_related(request: Request, uid: str, page_number: int = 1, query_id: str = None):
    searcher = request.app.state.searcher
    related_searcher = request.app.state.related_searcher

    # Invalid uid -> 404.
    if uid not in related_searcher.uid_set:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # Milvus expects native Python floats.
    source_vector = [float(x) for x in source_vector]
    parameters = {'ef': 101}

    # Retrieve the top 100 nearest neighbors from Milvus.
    status, results = related_searcher.milvus.search(
        collection_name=related_searcher.collection_name,
        query_records=[source_vector],
        top_k=100,
        params=parameters)
    print("Milvus search.", status)

    labels = results.id_array
    distances = results.distance_array

    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid is not overwritten before logging.
        related_uid = related_searcher.index_to_uid[index]
        hit = searcher.doc(related_uid, SearchVertical.cord19)
        if hit.lucene_document() is None:
            continue
        result = build_related_result(hit, related_uid, dist)
        related_results.append(result)

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(
        json.dumps({
            'query_id': query_id,
            'uid': uid,
            'page_number': page_number,
            'request_ip': get_request_ip(request),
            'timestamp': datetime.utcnow().isoformat(),
            'response': [r.json() for r in related_results],
        }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
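
# The Milvus-backed handler above assumes the collection was already populated with
# the same embeddings, keyed so that vector id i corresponds to index_to_uid[i]. The
# sketch below shows one way that setup could look, assuming the legacy pymilvus
# client whose search() signature matches the call above; the host, port, collection
# name, dimension, index parameters, and the stand-in `embedding` dict are all
# assumptions, not the project's actual configuration.
from milvus import IndexType, Milvus, MetricType

# Stand-in data: uid -> 768-dim vector (the real data comes from precomputed
# document embeddings).
embedding = {'uid-0': [0.0] * 768, 'uid-1': [0.1] * 768}
index_to_uid = {i: uid for i, uid in enumerate(embedding)}
vectors = [[float(x) for x in embedding[index_to_uid[i]]]
           for i in range(len(embedding))]

client = Milvus(host='localhost', port='19530')

collection_name = 'related_embeddings'  # hypothetical name
client.create_collection({
    'collection_name': collection_name,
    'dimension': 768,          # must match the stored embedding vectors
    'index_file_size': 1024,
    'metric_type': MetricType.L2,
})

# Insert vectors with explicit integer ids matching index_to_uid.
status, ids = client.insert(collection_name=collection_name,
                            records=vectors,
                            ids=list(range(len(vectors))))

# An HNSW index on the collection is what makes the 'ef' search parameter above
# meaningful; the M/efConstruction values here are placeholders.
client.create_index(collection_name, IndexType.HNSW,
                    {'M': 16, 'efConstruction': 200})
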
async def get_search(request: Request, query: str):
    # Build a Lucene index over the sample collection and point the searcher at it.
    # subprocess.call needs shell=True (or an argument list) to run a command string.
    rc = subprocess.call(
        "python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator "
        "-threads 1 -input integrations/resources/sample_collection_jsonl "
        "-index indexes/sample_collection_jsonl -storePositions -storeDocvectors -storeRaw",
        shell=True)
    request.app.state.searcher.searcher = SimpleSearcher(
        'indexes/sample_collection_jsonl')

    # Get search results from Lucene index.
    try:
        searcher_hits = request.app.state.searcher.search(query)
    except Exception:
        # Sometimes errors out due to encoding bugs.
        searcher_hits = []

    # Get paragraph or abstract if original document was retrieved.
    paragraphs = [hit.contents.split("\n")[-1] for hit in searcher_hits]

    # Get predictions from T5.
    if settings.neural_ranking:
        t5_scores = request.app.state.ranker.rerank(query, paragraphs)
        # Sort results by T5 scores.
        results = list(zip(searcher_hits, t5_scores))
        results.sort(key=lambda x: x[1], reverse=True)
    else:
        searcher_scores = [hit.score for hit in searcher_hits]
        results = list(zip(searcher_hits, searcher_scores))

    # Group paragraphs from same document by id in sorted order.
    grouped_results = OrderedDict()
    for result in results:
        base_docid = result[0].docid.split(".")[0]
        if base_docid not in grouped_results:
            grouped_results[base_docid] = [result]
        elif len(grouped_results[base_docid]) < settings.max_paragraphs_per_doc:
            # Append paragraph until we reach the configured maximum.
            grouped_results[base_docid].append(result)

    # Take top N paragraphs from each result to highlight and build article object.
    ranked_results = []
    for base_docid, doc_results in grouped_results.items():
        top_hit, top_score = doc_results[0]
        paragraphs = []
        highlighted_abstract = False

        for (hit, _) in doc_results:
            paragraph_number = (int(hit.docid.split(".")[-1])
                                if hit.docid != base_docid else -1)
            if paragraph_number == -1:
                highlighted_abstract = True
            paragraphs.append((hit.contents.split("\n")[-1], paragraph_number))

        # Sort top paragraphs by order of appearance in actual text.
        paragraphs.sort(key=lambda x: x[1])
        paragraphs = [text for text, _ in paragraphs]

        # Add full article to results.
        article = build_article(top_hit, top_score, paragraphs,
                                highlighted_abstract)
        ranked_results.append(article)

    if settings.highlight:
        # Highlight the paragraphs.
        highlight_time = time.time()

        paragraphs = []
        for result in ranked_results:
            paragraphs.extend(result.paragraphs)

        total_paragraphs = len(paragraphs)
        paragraphs = paragraphs[:settings.highlight_max_paragraphs]
        all_highlights = request.app.state.highlighter.highlight_paragraphs(
            query=query, paragraphs=paragraphs)
        # Pad with empty highlight lists for paragraphs beyond the cap.
        all_highlights.extend(
            [[] for _ in range(total_paragraphs - settings.highlight_max_paragraphs)])

        # Update results with highlights.
        highlight_idx = 0
        for result in ranked_results:
            num_paragraphs = len(result.paragraphs)
            result.highlights = all_highlights[highlight_idx:highlight_idx +
                                               num_paragraphs]
            highlight_idx += num_paragraphs
            if highlight_idx >= len(all_highlights):
                break

        print(f"Time to highlight: {time.time() - highlight_time}")

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    search_logger.info(
        json.dumps({
            "query_id": query_id,
            "type": SearchLogType.query,
            "query": query,
            "request_ip": get_request_ip(request),
            "timestamp": datetime.utcnow().isoformat(),
            "response": [r.json() for r in ranked_results],
        }))

    return SearchQueryResponse(query_id=query_id, response=ranked_results)
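
# The call to request.app.state.ranker.rerank(query, paragraphs) above assumes a thin
# wrapper around a pointwise T5 reranker that returns one score per paragraph, in
# input order. Below is a minimal sketch of such a wrapper, assuming pygaggle's
# MonoT5; the class name T5Ranker and the default checkpoint choice are assumptions,
# not necessarily what the project ships.
from typing import List

from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5


class T5Ranker:

    def __init__(self):
        # Loads the default monoT5 checkpoint.
        self.reranker = MonoT5()

    def rerank(self, query: str, paragraphs: List[str]) -> List[float]:
        # Score each paragraph against the query; scores are returned in the same
        # order as the input paragraphs, so the caller can zip them with its hits.
        texts = [Text(p, {}, 0) for p in paragraphs]
        return [result.score for result in self.reranker.rerank(Query(query), texts)]
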
async def get_related(request: Request, uid: str, page_number: int = 1, query_id: str = None):
    related_searcher = request.app.state.related_searcher

    # Invalid uid -> 404.
    if uid not in related_searcher.index_to_uid:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # HNSW parameters.
    k = 20 * page_number
    # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
    # ef needs to be between k and dataset.size().
    ef = 2 * k
    related_searcher.HNSW.set_ef(ef)

    # Retrieve documents from HNSW.
    labels, distances = related_searcher.HNSW.knn_query(source_vector, k=k)

    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid is not overwritten before logging.
        related_uid = related_searcher.index_to_uid[index]
        related_results.append({
            'id': related_uid,
            'abstract': gen_metadata_from_uid(related_uid, 'abstract'),
            'authors': get_authors_from_uid(related_uid),
            'distance': str(dist),
            'journal': gen_metadata_from_uid(related_uid, 'journal'),
            'publish_time': gen_metadata_from_uid(related_uid, 'publish_time'),
            'source': gen_metadata_from_uid(related_uid, 'source_x'),
            'title': gen_metadata_from_uid(related_uid, 'title'),
            'url': gen_metadata_from_uid(related_uid, 'url'),
        })

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(
        json.dumps({
            'query_id': query_id,
            'uid': uid,
            'page_number': page_number,
            'request_ip': get_request_ip(request),
            'timestamp': datetime.utcnow().isoformat(),
            'response': related_results,
        }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
async def get_search(request: Request, query: str, vertical: SearchVertical):
    # Get search results from Lucene index.
    try:
        searcher_hits = request.app.state.searcher.search(query, vertical)
    except Exception:
        # Sometimes errors out due to encoding bugs.
        searcher_hits = []

    # Get paragraph or abstract if original document was retrieved.
    paragraphs = [hit.contents.split('\n')[-1] for hit in searcher_hits]

    # Get predictions from T5.
    t5_scores = request.app.state.ranker.rerank(query, paragraphs)

    # Sort results by T5 scores.
    results = list(zip(searcher_hits, t5_scores))
    results.sort(key=lambda x: x[1], reverse=True)

    # Group paragraphs from same document by id in sorted order.
    grouped_results = OrderedDict()
    for result in results:
        base_docid = result[0].docid.split('.')[0]
        if base_docid not in grouped_results:
            grouped_results[base_docid] = [result]
        elif len(grouped_results[base_docid]) < settings.max_paragraphs_per_doc:
            # Append paragraph until we reach the configured maximum.
            grouped_results[base_docid].append(result)

    # Take top N paragraphs from each result to highlight and build article object.
    ranked_results = []
    for base_docid, doc_results in grouped_results.items():
        top_hit, top_score = doc_results[0]
        paragraphs = []
        highlighted_abstract = False

        for (hit, _) in doc_results:
            paragraph_number = (int(hit.docid.split('.')[-1])
                                if hit.docid != base_docid else -1)
            if paragraph_number == -1:
                highlighted_abstract = True
            paragraphs.append((hit.contents.split('\n')[-1], paragraph_number))

        # Sort top paragraphs by order of appearance in actual text.
        paragraphs.sort(key=lambda x: x[1])
        paragraphs = [text for text, _ in paragraphs]

        # Add full article to results.
        article = build_article(top_hit, base_docid, top_score, paragraphs,
                                highlighted_abstract, vertical)
        ranked_results.append(article)

    if settings.highlight:
        # Highlight the paragraphs.
        highlight_time = time.time()

        paragraphs = []
        for result in ranked_results:
            paragraphs.extend(result.paragraphs)

        total_paragraphs = len(paragraphs)
        paragraphs = paragraphs[:settings.highlight_max_paragraphs]
        all_highlights = request.app.state.highlighter.highlight_paragraphs(
            query=query, paragraphs=paragraphs)
        # Pad with empty highlight lists for paragraphs beyond the cap.
        all_highlights.extend(
            [[] for _ in range(total_paragraphs - settings.highlight_max_paragraphs)])

        # Update results with highlights.
        highlight_idx = 0
        for result in ranked_results:
            num_paragraphs = len(result.paragraphs)
            result.highlights = all_highlights[highlight_idx:highlight_idx +
                                               num_paragraphs]
            highlight_idx += num_paragraphs
            if highlight_idx >= len(all_highlights):
                break

        print(f'Time to highlight: {time.time() - highlight_time}')

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    search_logger.info(
        json.dumps({
            'query_id': query_id,
            'type': SearchLogType.query,
            'vertical': vertical,
            'query': query,
            'request_ip': get_request_ip(request),
            'timestamp': datetime.utcnow().isoformat(),
            'response': [r.json() for r in ranked_results],
        }))

    return SearchQueryResponse(query_id=query_id, response=ranked_results)
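
# The handlers above read feature flags and limits from a module-level `settings`
# object. A minimal sketch, assuming pydantic's BaseSettings; the field names mirror
# the attributes referenced above, but the defaults and the use of pydantic itself
# are assumptions.
from pydantic import BaseSettings


class Settings(BaseSettings):
    # Toggles for neural (T5) reranking, highlighting, and the related-article endpoint.
    neural_ranking: bool = True
    highlight: bool = True
    related_search: bool = True

    # Limits on paragraphs grouped per document and paragraphs highlighted per query.
    max_paragraphs_per_doc: int = 3
    highlight_max_paragraphs: int = 30


settings = Settings()
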