from datetime import datetime, timezone
from os import environ
from typing import cast
from unittest import mock

import missive
import pytest
from sqlalchemy import func

# The local names used below (indexing, FullText, IndexingError, PickleMessage,
# IndexRequested, get_session, icon_message_if_necessary, publish_message and
# the make_crawl_with_response helper) are assumed to come from the project and
# its test fixtures; their exact module paths are not shown in this excerpt.


def test_indexing_non_html(session, mock_s3):
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)
    crawl_resp.headers["content-type"] = "application/pdf"  # type: ignore

    indexing.index(session, crawl_req.crawl_uuid)

    # Non-HTML responses should not be indexed at all
    fulltext_count = (
        session.query(FullText)
        .filter(FullText.crawl_uuid == crawl_req.crawl_uuid)
        .count()
    )
    assert fulltext_count == 0
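# make_crawl_with_response itself is not shown here.  Judging from its use in
# these tests, it inserts a URL row plus a crawl request/response pair into the
# session and stores an HTML body for the crawl in the mocked S3 bucket,
# returning all three objects.  That description is inferred from the
# assertions below, not from its actual implementation.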
def test_indexing_for_fresh(session, mock_s3):
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)

    indexing.index(session, crawl_req.crawl_uuid)

    fulltext_obj = session.query(FullText).get(sqla_url.url_uuid)
    assert fulltext_obj.url_uuid == sqla_url.url_uuid
    assert fulltext_obj.crawl_uuid == crawl_req.crawl_uuid
    assert fulltext_obj.inserted == datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert len(fulltext_obj.tsvector.split(" ")) == 10
    assert len(fulltext_obj.full_text) > 0
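# The tsvector assertion above leans on PostgreSQL's text representation of a
# tsvector: space-separated lexemes with positions, e.g.
# "'brown':3 'fox':4 'jump':5", so splitting on spaces counts distinct
# lexemes.  The canned page stored by make_crawl_with_response evidently
# yields ten.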
# The problem headers exercised here are illustrative guesses; the exact
# parametrization used by the original suite is not shown in this excerpt.
@pytest.mark.parametrize(
    "headers",
    [
        {},  # no content-type header at all
        {"content-type": "nonsense"},  # an unparsable content-type
    ],
)
def test_indexing_with_content_type_problems(session, mock_s3, headers):
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)
    crawl_resp.headers = headers

    indexing.index(session, crawl_req.crawl_uuid)

    # Indexing still proceeds despite the broken content-type
    fulltext_obj = session.query(FullText).get(sqla_url.url_uuid)
    assert fulltext_obj.url_uuid == sqla_url.url_uuid
    assert fulltext_obj.crawl_uuid == crawl_req.crawl_uuid
    assert fulltext_obj.inserted == datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert len(fulltext_obj.tsvector.split(" ")) == 10
    assert len(fulltext_obj.full_text) > 0
def test_index_throws_an_error(session, mock_s3):
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)
    session.commit()

    def count_errors():
        return (
            session.query(IndexingError)
            .filter(IndexingError.crawl_uuid == crawl_req.crawl_uuid)
            .count()
        )

    # First time, the error is thrown and recorded
    with mock.patch.object(indexing, "extract_metadata_from_html") as mock_gmd:
        mock_gmd.side_effect = RuntimeError
        indexing.index(session, crawl_req.crawl_uuid)
    assert count_errors() == 1

    # Second time, the crawl is skipped and no further error is recorded
    indexing.index(session, crawl_req.crawl_uuid)
    assert count_errors() == 1
def on_index_requested(message: PickleMessage, ctx: missive.HandlingContext):
    event = cast(IndexRequested, message.get_obj())
    session = get_session(ctx)
    metadata = indexing.index(session, event.crawl_uuid)
    if metadata:
        icon_message = icon_message_if_necessary(session, metadata)
    else:
        icon_message = None
    # Commit the indexing work before acknowledging the message: a crash
    # between the two at worst causes a (harmless, idempotent) reindex rather
    # than losing the work.
    session.commit()
    ctx.ack()
    # Any follow-up icon fetch is published only after the ack
    if icon_message:
        publish_message(icon_message, environ["QM_RABBITMQ_BG_WORKER_TOPIC"])
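# How this handler is wired up is not shown here.  A registration sketch,
# assuming missive's usual Processor/handle_for pattern (the matcher and the
# processor setup are assumptions, not the project's actual code):
#
#     processor = missive.Processor[PickleMessage]()
#
#     @processor.handle_for(
#         lambda message: isinstance(message.get_obj(), IndexRequested)
#     )
#     def on_index_requested(message, ctx): ...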
def test_indexing_idempotent(session, mock_s3):
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)
    fulltext = FullText(
        url_uuid=sqla_url.url_uuid,
        crawl_uuid=crawl_req.crawl_uuid,
        inserted=datetime(2018, 1, 3, tzinfo=timezone.utc),
        full_text="hello world",
        tsvector=func.to_tsvector("hello world"),
    )
    session.add(fulltext)
    session.commit()

    indexing.index(session, crawl_req.crawl_uuid)

    # The pre-existing index entry is left alone...
    fulltext_count = (
        session.query(FullText)
        .filter(FullText.url_uuid == sqla_url.url_uuid)
        .count()
    )
    assert fulltext_count == 1

    # ...and reindexing an already-indexed crawl is not recorded as an error
    error_count = (
        session.query(IndexingError)
        .filter(IndexingError.crawl_uuid == crawl_req.crawl_uuid)
        .count()
    )
    assert error_count == 0