Example #1
async def get_related(request: Request,
                      uid: str,
                      page_number: int = 1,
                      query_id: str = None):
    if not settings.related_search:
        raise HTTPException(status_code=404,
                            detail="Related search not enabled")

    searcher = request.app.state.searcher
    related_searcher = request.app.state.related_searcher

    # Invalid uid -> 404
    if uid not in related_searcher.uid_set:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # HNSW parameters.
    k = 20 * page_number
    # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
    # ef needs to be between k and dataset.size()
    ef = 2 * k
    related_searcher.hnsw.set_ef(ef)

    # Retrieve documents from HNSW.
    labels, distances = related_searcher.hnsw.knn_query(source_vector, k=k)
    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid logged below is not overwritten.
        related_uid = related_searcher.index_to_uid[index]
        hit = searcher.doc(related_uid)
        if hit is None or hit.lucene_document() is None:
            continue
        result = build_related_result(hit, dist)
        related_results.append(result)

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(
        json.dumps({
            "query_id": query_id,
            "uid": uid,
            "page_number": page_number,
            "request_ip": get_request_ip(request),
            "timestamp": datetime.utcnow().isoformat(),
            "response": [r.json() for r in related_results],
        }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
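
The handler above assumes related_searcher.hnsw is an hnswlib index built over the same vectors stored in related_searcher.embedding, with index_to_uid mapping internal ids back to document uids. Below is a minimal, hypothetical sketch of how such an index could be built and queried; the embedding size, distance space, and construction parameters are illustrative assumptions, not values taken from the handler.

# Sketch only: embedding size, space, and construction parameters are assumed.
import numpy as np
import hnswlib

dim = 768                                     # assumed embedding size
uids = ['uid-0', 'uid-1', 'uid-2']            # hypothetical document uids
vectors = np.random.rand(len(uids), dim).astype(np.float32)

index = hnswlib.Index(space='cosine', dim=dim)
index.init_index(max_elements=len(uids), ef_construction=200, M=16)
index.add_items(vectors, ids=list(range(len(uids))))
index_to_uid = {i: uid for i, uid in enumerate(uids)}

# Query the way the handler does: ef must be at least k.
k = 2
index.set_ef(2 * k)
labels, distances = index.knn_query(vectors[0], k=k)
print([index_to_uid[int(i)] for i in labels[0]], distances[0])
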
Example #2
async def get_related(request: Request,
                      uid: str,
                      page_number: int = 1,
                      query_id: str = None):
    searcher = request.app.state.searcher
    related_searcher = request.app.state.related_searcher

    # Invalid uid -> 404
    if uid not in related_searcher.uid_set:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # Ensure the query vector is a plain list of Python floats.
    source_vector = [float(value) for value in source_vector]

    # HNSW search parameter; ef should be at least top_k.
    parameters = {'ef': 101}
    status, results = related_searcher.milvus.search(
        collection_name=related_searcher.collection_name,
        query_records=[source_vector],
        top_k=100,
        params=parameters)
    print("Milvus search.", status)
    labels = results.id_array
    distances = results.distance_array

    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid logged below is not overwritten.
        related_uid = related_searcher.index_to_uid[index]
        hit = searcher.doc(related_uid, SearchVertical.cord19)
        if hit is None or hit.lucene_document() is None:
            continue
        result = build_related_result(hit, related_uid, dist)
        related_results.append(result)

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(json.dumps({
        'query_id': query_id,
        'uid': uid,
        'page_number': page_number,
        'request_ip': get_request_ip(request),
        'timestamp': datetime.utcnow().isoformat(),
        'response': [r.json() for r in related_results],
    }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
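
Example #2 swaps the in-process hnswlib index for a Milvus collection. The call pattern used above (search returning a status object plus a result whose id_array and distance_array hold one list per query vector) matches the older pre-2.0 pymilvus client; the sketch below shows that pattern in isolation. The host, port, collection name, and embedding size are assumptions for illustration only.

# Sketch against the older pymilvus client; connection details are assumed.
from milvus import Milvus

client = Milvus(host='localhost', port='19530')   # assumed Milvus endpoint
collection_name = 'related_embeddings'            # hypothetical collection

query_vector = [0.0] * 768                        # assumed embedding size
status, results = client.search(collection_name=collection_name,
                                query_records=[query_vector],
                                top_k=100,
                                params={'ef': 101})
if status.OK():
    labels = results.id_array            # one list of ids per query vector
    distances = results.distance_array   # parallel lists of distances
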
Example #3
async def get_search(request: Request, query: str):
    # Get search results from Lucene index.

    rc = subprocess.call(
        "python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator "
        "-threads 1 -input integrations/resources/sample_collection_jsonl "
        "-index indexes/sample_collection_jsonl -storePositions -storeDocvectors -storeRaw",
        shell=True)  # shell=True so the single command string is parsed correctly

    request.app.state.searcher.searcher = SimpleSearcher(
        'indexes/sample_collection_jsonl')

    try:
        searcher_hits = request.app.state.searcher.search(query)
    except Exception:
        # Sometimes errors out due to encoding bugs.
        searcher_hits = []

    # Get paragraph or abstract if original document was retrieved.
    paragraphs = [hit.contents.split("\n")[-1] for hit in searcher_hits]

    # Get predictions from T5.
    if settings.neural_ranking:
        t5_scores = request.app.state.ranker.rerank(query, paragraphs)
        # Sort results by T5 scores.
        results = list(zip(searcher_hits, t5_scores))
        results.sort(key=lambda x: x[1], reverse=True)
    else:
        searcher_scores = [hit.score for hit in searcher_hits]
        results = list(zip(searcher_hits, searcher_scores))

    # Group paragraphs from same document by id in sorted order.
    grouped_results = OrderedDict()
    for result in results:
        base_docid = result[0].docid.split(".")[0]
        if base_docid not in grouped_results:
            grouped_results[base_docid] = [result]
        elif len(
                grouped_results[base_docid]) < settings.max_paragraphs_per_doc:
            # Append paragraph until we reach the configured maximum.
            grouped_results[base_docid].append(result)

    # Take top N paragraphs from each result to highlight and build article object.
    ranked_results = []
    for base_docid, doc_results in grouped_results.items():
        top_hit, top_score = doc_results[0]
        paragraphs = []
        highlighted_abstract = False

        for (hit, _) in doc_results:
            paragraph_number = (int(hit.docid.split(".")[-1])
                                if hit.docid != base_docid else -1)
            if paragraph_number == -1:
                highlighted_abstract = True
            paragraphs.append((hit.contents.split("\n")[-1], paragraph_number))

        # Sort top paragraphs by order of appearance in actual text.
        paragraphs.sort(key=lambda x: x[1])
        paragraphs = [text for text, _ in paragraphs]

        # Add full article to results.
        article = build_article(top_hit, top_score, paragraphs,
                                highlighted_abstract)
        ranked_results.append(article)

    if settings.highlight:
        # Highlights the paragraphs.
        highlight_time = time.time()
        paragraphs = []
        for result in ranked_results:
            paragraphs.extend(result.paragraphs)
        total_paragraphs = len(paragraphs)
        paragraphs = paragraphs[:settings.highlight_max_paragraphs]

        all_highlights = request.app.state.highlighter.highlight_paragraphs(
            query=query, paragraphs=paragraphs)
        all_highlights.extend(
            [[] for _ in range(total_paragraphs -
                               settings.highlight_max_paragraphs)])

        # Update results with highlights.
        highlight_idx = 0
        for result in ranked_results:
            num_paragraphs = len(result.paragraphs)
            result.highlights = all_highlights[highlight_idx:highlight_idx +
                                               num_paragraphs]
            highlight_idx += num_paragraphs
            if highlight_idx >= len(all_highlights):
                break

        print(f"Time to highlight: {time.time() - highlight_time}")

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    search_logger.info(
        json.dumps({
            "query_id": query_id,
            "type": SearchLogType.query,
            "query": query,
            "request_ip": get_request_ip(request),
            "timestamp": datetime.utcnow().isoformat(),
            "response": [r.json() for r in ranked_results],
        }))

    return SearchQueryResponse(query_id=query_id, response=ranked_results)
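
Example #3 groups paragraph-level hits back into per-document results: paragraph docids have the form <base>.<n>, while the abstract keeps the bare <base> docid, so splitting on '.' recovers the parent document. The toy snippet below (with made-up docids and scores) illustrates that grouping and the per-document paragraph cap.

# Toy illustration of the docid grouping above; docids and scores are made up.
from collections import OrderedDict

MAX_PARAGRAPHS_PER_DOC = 2    # stands in for settings.max_paragraphs_per_doc
scored_hits = [('doc1.3', 9.1), ('doc1', 8.7), ('doc2.0', 8.2), ('doc1.0', 7.9)]

grouped = OrderedDict()
for docid, score in scored_hits:
    base_docid = docid.split('.')[0]
    if base_docid not in grouped:
        grouped[base_docid] = [(docid, score)]
    elif len(grouped[base_docid]) < MAX_PARAGRAPHS_PER_DOC:
        grouped[base_docid].append((docid, score))

# doc1 keeps only its two highest-scoring entries; the bare docid is the abstract.
print(dict(grouped))
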
Example #4
async def get_related(request: Request,
                      uid: str,
                      page_number: int = 1,
                      query_id: str = None):
    # Invalid uid -> 404
    if uid not in related_searcher.index_to_uid:
        raise HTTPException(status_code=404, detail="Item not found")

    source_vector = related_searcher.embedding[uid]
    related_results = []

    # HNSW parameters.
    k = 20 * page_number
    # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
    # ef needs to be between k and dataset.size()
    ef = 2 * k
    related_searcher.HNSW.set_ef(ef)

    # Retrieve documents from HNSW.
    labels, distances = related_searcher.HNSW.knn_query(source_vector, k=k)
    start_idx = (page_number - 1) * 20
    end_idx = start_idx + 20
    for index, dist in zip(labels[0][start_idx:end_idx],
                           distances[0][start_idx:end_idx]):
        # Use a separate name so the query uid logged below is not overwritten.
        related_uid = related_searcher.index_to_uid[index]
        related_results.append({
            'id': related_uid,
            'abstract': gen_metadata_from_uid(related_uid, 'abstract'),
            'authors': get_authors_from_uid(related_uid),
            'distance': str(dist),
            'journal': gen_metadata_from_uid(related_uid, 'journal'),
            'publish_time': gen_metadata_from_uid(related_uid, 'publish_time'),
            'source': gen_metadata_from_uid(related_uid, 'source_x'),
            'title': gen_metadata_from_uid(related_uid, 'title'),
            'url': gen_metadata_from_uid(related_uid, 'url'),
        })

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    related_logger.info(
        json.dumps({
            'query_id': query_id,
            'uid': uid,
            'page_number': page_number,
            'request_ip': get_request_ip(request),
            'timestamp': datetime.utcnow().isoformat(),
            'response': related_results,
        }))

    return RelatedQueryResponse(query_id=query_id, response=related_results)
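
The related handlers page through neighbours by over-fetching: for page p they ask HNSW for k = 20 * p neighbours with ef = 2 * k, then slice out positions (p - 1) * 20 to p * 20, so each page request recomputes the earlier pages as well. A quick check of that arithmetic:

# Quick check of the paging arithmetic used by the related handlers above.
PAGE_SIZE = 20
for page_number in (1, 2, 3):
    k = PAGE_SIZE * page_number      # neighbours requested from HNSW
    ef = 2 * k                       # keeps ef >= k, as hnswlib requires
    start_idx = (page_number - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    assert end_idx == k              # the slice ends exactly at the k-th neighbour
    print(page_number, k, ef, (start_idx, end_idx))
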
Example #5
async def get_search(request: Request, query: str, vertical: SearchVertical):
    # Get search results from Lucene index.
    try:
        searcher_hits = request.app.state.searcher.search(query, vertical)
    except Exception:
        # Sometimes errors out due to encoding bugs.
        searcher_hits = []

    # Get paragraph or abstract if original document was retrieved.
    paragraphs = [hit.contents.split('\n')[-1] for hit in searcher_hits]

    # Get predictions from T5.
    t5_scores = request.app.state.ranker.rerank(query, paragraphs)

    # Sort results by T5 scores.
    results = list(zip(searcher_hits, t5_scores))
    results.sort(key=lambda x: x[1], reverse=True)

    # Group paragraphs from same document by id in sorted order.
    grouped_results = OrderedDict()
    for result in results:
        base_docid = result[0].docid.split('.')[0]
        if base_docid not in grouped_results:
            grouped_results[base_docid] = [result]
        elif len(
                grouped_results[base_docid]) < settings.max_paragraphs_per_doc:
            # Append paragraph until we reach the configured maximum.
            grouped_results[base_docid].append(result)

    # Take top N paragraphs from each result to highlight and build article object.
    ranked_results = []
    for base_docid, doc_results in grouped_results.items():
        top_hit, top_score = doc_results[0]
        paragraphs = []
        highlighted_abstract = False

        for (hit, score) in doc_results:
            paragraph_number = int(
                hit.docid.split('.')[-1]) if hit.docid != base_docid else -1
            if paragraph_number == -1:
                highlighted_abstract = True
            paragraphs.append((hit.contents.split('\n')[-1], paragraph_number))

        # Sort top paragraphs by order of appearance in actual text.
        paragraphs.sort(key=lambda x: x[1])
        paragraphs = [text for text, number in paragraphs]

        # Add full article to results.
        article = build_article(top_hit, base_docid, top_score, paragraphs,
                                highlighted_abstract, vertical)
        ranked_results.append(article)

    if settings.highlight:
        # Highlights the paragraphs.
        highlight_time = time.time()
        paragraphs = []
        for result in ranked_results:
            paragraphs.extend(result.paragraphs)
        total_paragraphs = len(paragraphs)
        paragraphs = paragraphs[:settings.highlight_max_paragraphs]

        all_highlights = request.app.state.highlighter.highlight_paragraphs(
            query=query, paragraphs=paragraphs)
        all_highlights.extend(
            [[] for _ in range(total_paragraphs -
                               settings.highlight_max_paragraphs)])

        # Update results with highlights.
        highlight_idx = 0
        for result in ranked_results:
            num_paragraphs = len(result.paragraphs)
            result.highlights = all_highlights[highlight_idx:highlight_idx +
                                               num_paragraphs]
            highlight_idx += num_paragraphs
            if highlight_idx >= len(all_highlights):
                break

        print(f'Time to highlight: {time.time() - highlight_time}')

    # Generate UUID for query.
    query_id = str(uuid4())

    # Log query and results.
    search_logger.info(
        json.dumps({
            'query_id': query_id,
            'type': SearchLogType.query,
            'vertical': vertical,
            'query': query,
            'request_ip': get_request_ip(request),
            'timestamp': datetime.utcnow().isoformat(),
            'response': [r.json() for r in ranked_results]
        }))

    return SearchQueryResponse(query_id=query_id, response=ranked_results)
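
The highlighting step above flattens every result's paragraphs into one list, truncates it to settings.highlight_max_paragraphs, pads the returned highlight list with empty entries so it lines up with the untruncated count, and then slices the flat list back onto each result. The self-contained sketch below mimics that bookkeeping with dummy data and a stub highlighter.

# Dummy data mimicking the highlight bookkeeping above; the highlighter is stubbed.
ranked_paragraph_counts = [3, 2, 4]   # paragraphs per ranked result
HIGHLIGHT_MAX_PARAGRAPHS = 5          # stands in for settings.highlight_max_paragraphs

flat_paragraphs = [f'p{i}' for i in range(sum(ranked_paragraph_counts))]
total_paragraphs = len(flat_paragraphs)
flat_paragraphs = flat_paragraphs[:HIGHLIGHT_MAX_PARAGRAPHS]

# Stub highlighter: pretend each surviving paragraph got one highlight span.
all_highlights = [[(0, 2)] for _ in flat_paragraphs]
# Pad so the highlight list lines up with the untruncated paragraph count.
all_highlights.extend([[] for _ in range(total_paragraphs - HIGHLIGHT_MAX_PARAGRAPHS)])

# Slice the flat highlight list back onto the per-result paragraph counts.
highlight_idx = 0
per_result_highlights = []
for num_paragraphs in ranked_paragraph_counts:
    per_result_highlights.append(
        all_highlights[highlight_idx:highlight_idx + num_paragraphs])
    highlight_idx += num_paragraphs

print(per_result_highlights)
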