Exemplo n.º 1
0
 def test_should_index_document(self):
     """Check that running the extraction indexes a document at least once."""
     dependencies = (
         self.database_mock,
         self.storage_mock,
         self.index_mock,
         self.text_extraction_function,
     )
     extract_text_pending_gazettes(*dependencies)

     index_document = self.index_mock.index_document
     index_document.assert_called()
Exemplo n.º 2
0
 def test_database_call(self):
     """Check that pending gazettes are requested from the database once."""
     dependencies = (
         self.database_mock,
         self.storage_mock,
         self.index_mock,
         self.text_extraction_function,
     )
     extract_text_pending_gazettes(*dependencies)

     get_pending = self.database_mock.get_pending_gazettes
     get_pending.assert_called_once()
Exemplo n.º 3
0
    def test_set_gazette_as_processed(self):
        """Check that exactly one gazette is flagged as processed.

        NOTE(review): the expected arguments (``1`` and the checksum string)
        presumably match the fixture gazette built in ``setUp`` -- confirm
        against the fixture, which is not visible from this method.
        """
        dependencies = (
            self.database_mock,
            self.storage_mock,
            self.index_mock,
            self.text_extraction_function,
        )
        extract_text_pending_gazettes(*dependencies)

        mark_processed = self.database_mock.set_gazette_as_processed
        mark_processed.assert_called_once_with(
            1,
            "972aca2e-1174-11eb-b2d5-a86daaca905e",
        )
def start_to_process_pending_gazettes():
    """
    Entry point that wires up and starts the gazette text extraction.

    Calls ``enable_debug_if_necessary`` first, then builds the database,
    storage, index and text extraction objects through their factory
    functions (in that order) and passes them to
    ``extract_text_pending_gazettes``.
    """
    enable_debug_if_necessary()

    # Factories are invoked in the same order as the positional arguments
    # expected by extract_text_pending_gazettes.
    gazette_database = create_database_interface()
    file_storage = create_storage_interface()
    search_index = create_index_interface()
    tika_extractor = create_apache_tika_text_extraction()

    extract_text_pending_gazettes(
        gazette_database,
        file_storage,
        search_index,
        tika_extractor,
    )
Exemplo n.º 5
0
    def test_gazette_url(self):
        """Check the indexed document carries a URL built from its file path.

        The expected document is a copy of the first fixture entry with
        ``url`` set to ``http://test.com/`` followed by that entry's
        ``file_path``.
        """
        expected_data = self.data[0].copy()
        expected_data["url"] = f"http://test.com/{expected_data['file_path']}"

        dependencies = (
            self.database_mock,
            self.storage_mock,
            self.index_mock,
            self.text_extraction_function,
        )
        extract_text_pending_gazettes(*dependencies)

        index_document = self.index_mock.index_document
        index_document.assert_called_once_with(expected_data)
Exemplo n.º 6
0
    def test_text_extraction_function_call(self):
        """Check the extractor is called once with one positional string."""
        dependencies = (
            self.database_mock,
            self.storage_mock,
            self.index_mock,
            self.text_extraction_function,
        )
        extract_text_pending_gazettes(*dependencies)

        extract_text = self.text_extraction_function.extract_text
        extract_text.assert_called_once()

        # Inspect the positional arguments of the recorded call.
        positional_args = extract_text.call_args.args
        self.assertEqual(len(positional_args), 1)
        self.assertIsInstance(positional_args[0], str)
Exemplo n.º 7
0
    def test_storage_call_to_get_file(self):
        """Check how the storage is asked for the gazette file.

        ``get_file`` must be called once; its first positional argument must
        equal the first fixture entry's ``file_path`` and its second must be
        a ``tempfile._TemporaryFileWrapper`` instance.
        """
        dependencies = (
            self.database_mock,
            self.storage_mock,
            self.index_mock,
            self.text_extraction_function,
        )
        extract_text_pending_gazettes(*dependencies)

        get_file = self.storage_mock.get_file
        get_file.assert_called_once()

        positional_args = get_file.call_args.args
        self.assertEqual(positional_args[0], self.data[0]["file_path"])
        self.assertIsInstance(
            positional_args[1],
            tempfile._TemporaryFileWrapper,
        )
Exemplo n.º 8
0
    def test_invalid_file_type_should_be_skipped(self):
        """Check that a gazette whose text extraction raises is skipped.

        The extractor mock raises ``Exception("Unsupported file type")``.
        The test then expects one storage fetch and one database query,
        nothing marked as processed, nothing indexed, and
        ``file_should_not_exist`` to pass for the argument that was handed
        to the extractor.
        """
        failing_extractor = MagicMock(spec=TextExtractorInterface)
        failing_extractor.extract_text.side_effect = Exception(
            "Unsupported file type"
        )

        dependencies = (
            self.database_mock,
            self.storage_mock,
            self.index_mock,
            failing_extractor,
        )
        extract_text_pending_gazettes(*dependencies)

        self.storage_mock.get_file.assert_called_once()
        self.database_mock.get_pending_gazettes.assert_called_once()
        self.database_mock.set_gazette_as_processed.assert_not_called()
        self.index_mock.index_document.assert_not_called()

        extractor_input = failing_extractor.extract_text.call_args.args[0]
        self.file_should_not_exist(extractor_input)
Exemplo n.º 9
0
    def test_indexed_document_should_contain_gazette_content(self):
        """Check the indexed document holds the text returned by the extractor.

        A single pending gazette with an empty ``source_text`` is served by a
        local database mock. The extractor mock returns the content of
        ``tests/data/fake_gazette.txt``, and the document passed to
        ``index_document`` must equal the gazette with ``source_text`` set to
        that content.
        """
        pending_gazettes = [
            {
                "id": 1,
                "source_text": "",
                "date": date(2020, 10, 18),
                "edition_number": "1",
                "is_extra_edition": False,
                "power": "executive",
                "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e",
                "file_path": "tests/data/fake_gazette.txt",
                "file_url": "www.querido-diario.org",
                "scraped_at": datetime.now(),
                "created_at": datetime.now(),
                "territory_id": "3550308",
                "processed": False,
                "state_code": "SC",
                "territory_name": "Gaspar",
                "url": "http://test.com/tests/data/fake_gazette.txt",
                "file_raw_txt": "http://test.com/tests/data/fake_gazette.txt",
            },
        ]

        # Expected document: the same gazette, with the file content as text.
        expected_document = pending_gazettes[0].copy()
        with open("tests/data/fake_gazette.txt", "r") as gazette_file:
            expected_document["source_text"] = gazette_file.read()

        database_mock = MagicMock()
        database_mock.get_pending_gazettes = MagicMock(
            return_value=pending_gazettes
        )
        database_mock.set_gazette_as_processed = MagicMock()

        # NOTE(review): the returned value is not used again in this test; the
        # call and its binding are kept because the helper's side effects are
        # not visible from here.
        tmp_gazette_file = self.copy_file_to_temporary_file(
            "tests/data/fake_gazette.txt"
        )

        extractor_mock = MagicMock(spec=TextExtractorInterface)
        extractor_mock.extract_text.return_value = expected_document[
            "source_text"
        ]

        dependencies = (
            database_mock,
            self.storage_mock,
            self.index_mock,
            extractor_mock,
        )
        extract_text_pending_gazettes(*dependencies)

        index_document = self.index_mock.index_document
        index_document.assert_called_once_with(expected_document)
Exemplo n.º 10
0
    def test_invalid_file_type_should_be_skipped_and_valid_should_be_processed(
        self,
    ):
        """Check that one failing gazette does not block the next one.

        Two pending gazettes are served by a local database mock. The
        extractor mock raises ``Exception("Unsupported file type")`` on its
        first call and returns the content of ``tests/data/fake_gazette.txt``
        on its second. The test expects two storage fetches and two
        extraction calls, but only one gazette marked as processed and one
        document indexed.
        """
        first_gazette = {
            "id": 1,
            "source_text": "",
            "date": date(2020, 10, 18),
            "edition_number": "1",
            "is_extra_edition": False,
            "power": "executive",
            "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e",
            "file_path": "sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf",
            "file_url": "www.querido-diario.org",
            "scraped_at": datetime.now(),
            "created_at": datetime.now(),
            "territory_id": "3550308",
            "processed": False,
            "state_code": "SC",
            "territory_name": "Gaspar",
            "url": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf",
            "file_raw_txt": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.txt",
        }
        second_gazette = {
            "id": 2,
            "source_text": "",
            "date": date(2020, 10, 19),
            "edition_number": "1",
            "is_extra_edition": False,
            "power": "executive",
            "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e",
            "file_path": "sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf",
            "file_url": "www.querido-diario.org",
            "scraped_at": datetime.now(),
            "created_at": datetime.now(),
            "territory_id": "3550308",
            "processed": False,
            "state_code": "SC",
            "territory_name": "Gaspar",
            "url": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf",
            "file_raw_txt": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.txt",
        }
        pending_gazettes = [first_gazette, second_gazette]

        database_mock = MagicMock()
        database_mock.get_pending_gazettes = MagicMock(
            return_value=pending_gazettes
        )
        database_mock.set_gazette_as_processed = MagicMock()

        with open("tests/data/fake_gazette.txt", "r") as gazette_file:
            extracted_text = gazette_file.read()

        # First extraction call fails, second one succeeds.
        extractor_mock = MagicMock(spec=TextExtractorInterface)
        extractor_mock.extract_text.side_effect = [
            Exception("Unsupported file type"),
            extracted_text,
        ]

        dependencies = (
            database_mock,
            self.storage_mock,
            self.index_mock,
            extractor_mock,
        )
        extract_text_pending_gazettes(*dependencies)

        database_mock.get_pending_gazettes.assert_called_once()
        self.assert_called_twice(self.storage_mock.get_file)
        self.assert_called_twice(extractor_mock.extract_text)
        database_mock.set_gazette_as_processed.assert_called_once()
        self.index_mock.index_document.assert_called_once()

        # call_args refers to the most recent extract_text call.
        last_extractor_input = extractor_mock.extract_text.call_args.args[0]
        self.file_should_not_exist(last_extractor_input)