예제 #1
0
    def test_export_collection_missing_warc(self, mock_api_client_cls):
        """Run an export whose message names both a collection and a seed.

        Asserts that the API client class is instantiated exactly once with
        the base URL, that the export result is unsuccessful, and that the
        first reported error carries ``CODE_BAD_REQUEST``.

        NOTE(review): the method name mentions a missing WARC, but the
        assertions check for a bad-request error -- presumably because the
        message specifies a collection and seeds together; confirm.
        """
        api_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [api_client]

        # The collection and the seed deliberately share one identifier.
        shared_id = "005b131f5f854402afa2b08a4b7ba960"
        collection = {"id": shared_id}
        seeds = [{"id": shared_id, "uid": "uid1"}]
        request = {
            "id": "test3",
            "type": "test_user",
            "collection": collection,
            "seeds": seeds,
            "format": "csv",
            "segment_size": None,
            "path": self.export_path
        }

        exporter = BaseExporter(
            "http://test", None, None, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = request
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")

        outcome = exporter.result
        self.assertFalse(outcome.success)
        self.assertEqual(CODE_BAD_REQUEST, outcome.errors[0].code)
예제 #2
0
    def test_export_full_json(self):
        """Check a full-JSON export of two items written to a single file.

        A mocked WARC iterator yields two items. ``_full_json_export`` is
        called with ``None`` as its last argument (presumably the segment
        size -- confirm against the exporter). The test then asserts how the
        iterator class and its ``iter`` method were called, that
        ``<export path>/test_001.json`` exists with two lines, and that the
        first line decodes to the first item's dictionary.
        """
        warc_iter = MagicMock()
        warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {
                "key%d" % col: "k%dv%d" % (col, row) for col in (1, 2, 3)
            })
            for row in (1, 2)
        ]
        warc_iter_cls = MagicMock()
        warc_iter_cls.side_effect = [warc_iter]

        base_path = os.path.join(self.export_path, "test")
        started = datetime_now()
        uids = [11, 14]

        exporter = BaseExporter(
            None, warc_iter_cls, None, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter._full_json_export(
            self.warcs, base_path, True, started, None, uids, None)

        warc_iter_cls.assert_called_once_with(self.warcs, uids)
        warc_iter.iter.assert_called_once_with(
            dedupe=True,
            item_date_start=started,
            item_date_end=None,
            limit_item_types=None)

        json_path = base_path + '_001.json'
        self.assertTrue(os.path.exists(json_path))
        with open(json_path, "r") as json_file:
            rows = list(json_file)
        self.assertEqual(2, len(rows))

        expected_first = {"key1": "k1v1", "key2": "k2v1", "key3": "k3v1"}
        self.assertDictEqual(expected_first, json.loads(rows[0]))
예제 #3
0
    def test_export_collection(self, mock_producer, mock_api_client_cls):
        """Export a collection to CSV with date filters and check the result.

        The API client, table class, connection and exchange are all mocked.
        After ``on_message`` runs, the test asserts:

        * the API client was built once and ``warcs`` was queried with the
          collection id, no seed ids and the harvest date strings;
        * the table class received the WARC file paths, the dedupe flag and
          the parsed item datetimes;
        * the export succeeded and ``test1_001.csv`` holds three lines;
        * producer calls 1 and 3 carry a "running" and a "completed success"
          status message respectively.
        """
        collection_id = "005b131f5f854402afa2b08a4b7ba960"
        status_routing_key = "export.status.test.test_user"

        # Table double whose iteration yields one chunk of three row tuples.
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([
            [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")],
        ]))
        table_cls = MagicMock()
        table_cls.side_effect = [table]
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"

        # Date filters as sent in the message; the item dates are also
        # parsed because the table class is expected to receive datetimes.
        item_start_text = "2007-01-25T12:00:00Z"
        item_start = iso8601.parse_date(item_start_text)
        item_end_text = "2008-02-25T12:00:00Z"
        item_end = iso8601.parse_date(item_end_text)
        harvest_start_text = "2007-03-25T12:00:00Z"
        harvest_end_text = "2008-04-25T12:00:00Z"

        request = {
            "id": "test1",
            "type": "test_user",
            "collection": {"id": collection_id},
            "format": "csv",
            "segment_size": None,
            "path": self.export_path,
            "dedupe": True,
            "item_date_start": item_start_text,
            "item_date_end": item_end_text,
            "harvest_date_start": harvest_start_text,
            "harvest_date_end": harvest_end_text,
        }

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = request
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        api_client.warcs.assert_called_once_with(
            collection_id=collection_id,
            seed_ids=[],
            harvest_date_start=harvest_start_text,
            harvest_date_end=harvest_end_text)
        table_cls.assert_called_once_with(
            self.warc_filepaths, True, item_start, item_end, [], None)

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test1_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            rows = list(csv_file)
        self.assertEqual(3, len(rows))

        # (producer call index, expected status, whether date_ended is set)
        expectations = (
            (1, "running", False),
            (3, "completed success", True),
        )
        for call_index, expected_status, has_end_date in expectations:
            _, _, call_kwargs = mock_producer.mock_calls[call_index]
            self.assertEqual(status_routing_key, call_kwargs["routing_key"])
            status_message = call_kwargs["body"]
            self.assertEqual(expected_status, status_message["status"])
            self.assertTrue(
                iso8601.parse_date(status_message["date_started"]))
            if has_end_date:
                self.assertTrue(
                    iso8601.parse_date(status_message["date_ended"]))
            self.assertEqual("test1", status_message["id"])
            self.assertEqual("Base Exporter", status_message["service"])
            self.assertEqual("testhost", status_message["host"])
            self.assertTrue(status_message["instance"])
예제 #4
0
    def test_export_full_json_segment(self):
        """Check a full-JSON export of seven items split across three files.

        A mocked WARC iterator yields seven items and ``_full_json_export``
        is called with ``3`` as its last argument (presumably the segment
        size -- confirm against the exporter). The test asserts that files
        ``test_001.json`` .. ``test_003.json`` exist with 3, 3 and 1 lines,
        and that each file's first line decodes to item 1, 4 and 7.
        """
        warc_iter = MagicMock()
        warc_iter.iter.return_value = [
            IterItem(None, None, None, None, {
                "key%d" % col: "k%dv%d" % (col, row) for col in (1, 2, 3)
            })
            for row in range(1, 8)
        ]
        warc_iter_cls = MagicMock()
        warc_iter_cls.side_effect = [warc_iter]

        base_path = os.path.join(self.export_path, "test")
        started = datetime_now()
        uids = [11, 14]

        exporter = BaseExporter(
            None, warc_iter_cls, None, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter._full_json_export(
            self.warcs, base_path, True, started, None, uids, 3)

        warc_iter_cls.assert_called_once_with(self.warcs, uids)
        warc_iter.iter.assert_called_once_with(
            dedupe=True,
            item_date_start=started,
            item_date_end=None,
            limit_item_types=None)

        # Segments test_001.json and test_002.json hold three rows each;
        # test_003.json holds the single remaining row.
        for segment_no, expected_rows in enumerate((3, 3, 1), start=1):
            segment_path = "{}_{:03d}.json".format(base_path, segment_no)
            self.assertTrue(os.path.exists(segment_path))
            with open(segment_path, "r") as segment_file:
                rows = list(segment_file)
            self.assertEqual(expected_rows, len(rows))

            # Number of the item expected on the segment's first line.
            first_item = str(3 * segment_no - 2)
            self.assertDictEqual(
                {
                    "key1": "k1v" + first_item,
                    "key2": "k2v" + first_item,
                    "key3": "k3v" + first_item
                }, json.loads(rows[0]))
예제 #5
0
    def test_export_collection_and_seeds(self, mock_producer,
                                         mock_api_client_cls):
        """Run a collection export that is expected to end in failure.

        The mocked API client returns a single WARC record. After
        ``on_message`` runs, the test asserts that ``warcs`` was queried with
        the collection id and no seeds or harvest dates, that the export
        result is unsuccessful, and that the fourth producer call carries a
        "completed failure" status message whose first error has one of the
        WARC-related codes.

        NOTE(review): presumably the failure arises because the WARC path
        returned by the API does not exist under ``warc_base_path``; the
        method name (collection *and seeds*) does not match the message,
        which contains no seeds -- confirm.
        """
        mock_api_client = MagicMock(spec=ApiClient)
        mock_api_client_cls.side_effect = [mock_api_client]
        warcs = [{
            "warc_id": "9dc0b9c3a93a49eb8f713330b43f954c",
            "path":
            "xtest_1-20151202165907873-00000-306-60892de9dfc6-8001.warc.gz",
            "sha1": "000ffb3371eadb507d77d181ca3f0c5d3c74a2fc",
            "bytes": 460518,
            "date_created": "2016-02-22T14:49:07Z"
        }]
        mock_api_client.warcs.side_effect = [warcs]

        mock_connection = MagicMock(spec=Connection)
        mock_exchange = MagicMock(spec=Exchange)
        mock_exchange.name = "test exchange"

        export_message = {
            "id": "test2",
            "type": "test_user",
            "collection": {
                "id": "005b131f5f854402afa2b08a4b7ba960"
            },
            "format": "csv",
            "segment_size": None,
            "path": self.export_path
        }

        exporter = BaseExporter("http://test",
                                None,
                                None,
                                self.working_path,
                                warc_base_path=self.warc_base_path,
                                host="testhost")
        exporter.mq_config = True
        exporter._producer_connection = mock_connection
        exporter.exchange = mock_exchange

        exporter.routing_key = "export.start.test.test_user"
        exporter.message = export_message
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        mock_api_client.warcs.assert_called_once_with(
            collection_id="005b131f5f854402afa2b08a4b7ba960",
            seed_ids=[],
            harvest_date_end=None,
            harvest_date_start=None)

        self.assertFalse(exporter.result.success)

        # Only the keyword arguments of the fourth producer call are needed.
        _, _, kwargs = mock_producer.mock_calls[3]
        self.assertEqual("export.status.test.test_user", kwargs["routing_key"])
        export_status_message = kwargs["body"]
        self.assertEqual("completed failure", export_status_message["status"])
        self.assertTrue(
            iso8601.parse_date(export_status_message["date_started"]))
        self.assertTrue(iso8601.parse_date(
            export_status_message["date_ended"]))
        self.assertEqual("test2", export_status_message["id"])

        # The previous ``assertTrue(CODE, actual)`` calls treated ``actual``
        # as the failure message and therefore never compared anything.
        # Both of them targeted the first error, so verify that its code is
        # one of the two WARC-related codes they named.
        first_error_code = export_status_message["errors"][0]["code"]
        self.assertIn(first_error_code, (CODE_WARC_MISSING, CODE_NO_WARCS))
예제 #6
0
    def test_export_seeds(self, mock_api_client_cls):
        """Export two seeds (no collection) to CSV and check the result.

        The API client and table class are mocked. After ``on_message`` runs,
        the test asserts that ``warcs`` was queried with ``collection_id``
        of ``None`` and both seed ids, that the table class received the
        WARC file paths together with both seed uids, that the export
        succeeded, and that ``test2_001.csv`` holds three lines.
        """
        first_seed_id = "005b131f5f854402afa2b08a4b7ba960"
        second_seed_id = "105b131f5f854402afa2b08a4b7ba960"

        # Table double whose iteration yields one chunk of three row tuples.
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([
            [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")],
        ]))
        table_cls = MagicMock()
        table_cls.side_effect = [table]
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        request = {
            "id": "test2",
            "type": "test_user",
            "seeds": [
                {"id": first_seed_id, "uid": "uid1"},
                {"id": second_seed_id, "uid": "uid2"},
            ],
            "format": "csv",
            "segment_size": None,
            "path": self.export_path,
        }

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = request
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        api_client.warcs.assert_called_once_with(
            collection_id=None,
            seed_ids=[first_seed_id, second_seed_id],
            harvest_date_start=None,
            harvest_date_end=None)
        table_cls.assert_called_once_with(
            self.warc_filepaths, False, None, None, ["uid1", "uid2"], None)

        self.assertTrue(exporter.result.success)
        csv_path = os.path.join(self.export_path, "test2_001.csv")
        self.assertTrue(os.path.exists(csv_path))
        with open(csv_path, "r") as csv_file:
            rows = list(csv_file)
        self.assertEqual(3, len(rows))
예제 #7
0
    def test_export_dehydrate(self, mock_producer, mock_api_client_cls):
        """Export a collection in "dehydrate" format and check the id file.

        The mocked table reports ``"key2"`` as its id field. After
        ``on_message`` runs, the test asserts how the API client and table
        class were called, that the export succeeded, that
        ``test1_001.txt`` holds two lines starting with ``"k2v1"``, and that
        the fourth producer call carries a "completed success" status.
        """
        collection_id = "005b131f5f854402afa2b08a4b7ba960"

        # Table double whose iteration yields one chunk of three row tuples.
        table = MagicMock(spec=BaseTable)
        table.__iter__ = Mock(return_value=iter([
            [("key1", "key2"), ("k1v1", "k2v1"), ("k1v2", "k2v2")],
        ]))
        table.id_field.return_value = "key2"
        table_cls = MagicMock()
        table_cls.side_effect = [table]
        warc_iter_cls = MagicMock()

        api_client = MagicMock(spec=ApiClient)
        api_client.warcs.side_effect = [self.warcs]
        mock_api_client_cls.side_effect = [api_client]

        connection = MagicMock(spec=Connection)
        exchange = MagicMock(spec=Exchange)
        exchange.name = "test exchange"

        request = {
            "id": "test1",
            "type": "test_user",
            "collection": {"id": collection_id},
            "format": "dehydrate",
            "segment_size": None,
            "path": self.export_path,
        }

        exporter = BaseExporter(
            "http://test", warc_iter_cls, table_cls, self.working_path,
            warc_base_path=self.warc_base_path, host="testhost")
        exporter.mq_config = True
        exporter._producer_connection = connection
        exporter.exchange = exchange
        exporter.routing_key = "export.start.test.test_user"
        exporter.message = request
        exporter.on_message()

        mock_api_client_cls.assert_called_once_with("http://test")
        api_client.warcs.assert_called_once_with(
            collection_id=collection_id,
            seed_ids=[],
            harvest_date_end=None,
            harvest_date_start=None)
        table_cls.assert_called_once_with(
            self.warc_filepaths, False, None, None, [], None)

        self.assertTrue(exporter.result.success)
        txt_path = os.path.join(self.export_path, "test1_001.txt")
        self.assertTrue(os.path.exists(txt_path))
        with open(txt_path, "r") as txt_file:
            rows = list(txt_file)
        self.assertEqual(2, len(rows))
        self.assertEqual("k2v1\n", rows[0])

        # Only the keyword arguments of the fourth producer call are needed.
        _, _, call_kwargs = mock_producer.mock_calls[3]
        self.assertEqual(
            "export.status.test.test_user", call_kwargs["routing_key"])
        status_message = call_kwargs["body"]
        self.assertEqual("completed success", status_message["status"])
        self.assertEqual("test1", status_message["id"])