예제 #1
0
    def process_item(self, item, spider):
        """
        Replace a scraped item with the single value selected by the spider's pluck pointer.

        If ``spider.pluck`` is falsy, the item is returned unchanged. Otherwise:

        -  If ``spider.package_pointer`` is set, it is resolved against the package returned by ``_get_package``.
           If ``_get_package`` raises ``NotImplementedError``, the value is ``error: <exception message>``.
        -  Otherwise, ``spider.release_pointer`` is resolved against each release, and the maximum is used. For
           record data types, only the first record is inspected: its ``releases`` if that array is non-empty,
           otherwise its ``compiledRelease``.

        The value stays ``None`` if the data type is not recognized or if there is nothing to resolve against.
        If ``spider.truncate`` is set, a truthy value is sliced to that length.

        :param item: the scraped item; ``item['data_type']`` is read when plucking with a release pointer
        :param spider: the spider; ``pluck``, ``package_pointer``, ``release_pointer`` and ``truncate`` are read
        :returns: ``item`` if not plucking, otherwise a ``PluckedItem`` with a ``value`` key
        """
        if not spider.pluck:
            return item

        value = None
        if spider.package_pointer:
            try:
                package = _get_package(item)
            except NotImplementedError as e:
                # Report the failure as the plucked value, instead of stopping the crawl.
                value = f'error: {e}'
            else:
                value = _resolve_pointer(package, spider.package_pointer)
        else:  # spider.release_pointer
            data_type = item['data_type']
            if data_type in ('release_package', 'release_package_list', 'release_package_list_in_results',
                             'release_list', 'release'):
                releases = _get_releases(item)
                if releases:
                    value = max(_resolve_pointer(release, spider.release_pointer) for release in releases)
            elif data_type in ('record_package', 'record_package_list', 'record_package_list_in_results',
                               'record'):
                records = _get_records(item)
                if records:
                    # This assumes that the first record in the record package has the desired value.
                    record = records[0]
                    # max() raises ValueError on an empty sequence, so an empty "releases" array is treated
                    # like a missing one, and the compiled release is tried instead.
                    if 'releases' in record and record['releases']:
                        value = max(
                            _resolve_pointer(release, spider.release_pointer) for release in record['releases']
                        )
                    elif 'compiledRelease' in record:
                        value = _resolve_pointer(record['compiledRelease'], spider.release_pointer)

        if value and spider.truncate:
            value = value[:spider.truncate]

        return PluckedItem({'value': value})
예제 #2
0
def test_disabled():
    """
    No ``pluck*.csv`` file is written when the spider is created with only the ``KINGFISHER_PLUCK_PATH`` setting
    (that is, without any pluck pointer argument).
    """
    with TemporaryDirectory() as output_dir:
        crawler_settings = {'KINGFISHER_PLUCK_PATH': output_dir}
        spider = spider_with_crawler(settings=crawler_settings)
        plucker = KingfisherPluck.from_crawler(spider.crawler)
        plucked = PluckedItem({'value': '2020-10-01'})

        # Run the extension through one scraped item and the end of the crawl.
        plucker.item_scraped(plucked, spider)
        plucker.spider_closed(spider, 'itemcount')

        csv_files = glob(os.path.join(output_dir, 'pluck*.csv'))
        assert not csv_files
예제 #3
0
def test_process_item_package_pointer(data_type, data):
    """
    With the package pointer ``/publishedDate``, the pipeline returns a ``PluckedItem`` whose value is
    ``2000-01-01T00:00:00Z`` for the given data type and data (presumably supplied by pytest parametrization).
    """
    spider = spider_with_crawler(package_pointer='/publishedDate')
    payload = {
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    }

    plucked = Pluck().process_item(File(payload), spider)

    assert plucked == PluckedItem({'value': '2000-01-01T00:00:00Z'})
예제 #4
0
def test_process_item_release_pointer(data_type, data):
    """
    With the release pointer ``/date`` and ``truncate=10``, the pipeline returns a ``PluckedItem`` whose value is
    ``2020-10-01`` for the given data type and data (presumably supplied by pytest parametrization).
    """
    spider = spider_with_crawler(release_pointer='/date', truncate=10)
    payload = {
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    }

    plucked = Pluck().process_item(File(payload), spider)

    assert plucked == PluckedItem({'value': '2020-10-01'})
예제 #5
0
def test_process_item_non_package_data_type():
    """
    Plucking the package pointer ``/publishedDate`` from an item whose data type is ``release`` yields the
    value ``error: /publishedDate not found``.
    """
    spider = spider_with_crawler(package_pointer='/publishedDate')
    payload = {
        'file_name': 'test',
        'data': json.dumps(releases[0]),
        'data_type': 'release',
        'url': 'http://test.com',
    }

    plucked = Pluck().process_item(File(payload), spider)

    assert plucked == PluckedItem({'value': 'error: /publishedDate not found'})
예제 #6
0
def test_process_item_nonexistent_pointer(kwargs):
    """
    For a release package item and a spider created with ``kwargs``, the pipeline returns the value
    ``error: /nonexistent not found``.

    ``kwargs`` is passed to ``spider_with_crawler`` as keyword arguments (presumably a pointer argument set to
    ``/nonexistent``, supplied by pytest parametrization).
    """
    spider = spider_with_crawler(**kwargs)
    payload = {
        'file_name': 'test',
        'data': json.dumps(release_package),
        'data_type': 'release_package',
        'url': 'http://test.com',
    }

    plucked = Pluck().process_item(File(payload), spider)

    assert plucked == PluckedItem({'value': 'error: /nonexistent not found'})
예제 #7
0
def test_spider_closed_with_items():
    """
    With the release pointer ``/date``, after one item is scraped and the spider is closed, the file
    ``pluck-release-date.csv`` in the ``KINGFISHER_PLUCK_PATH`` directory contains ``2020-10-01,test``.
    """
    with TemporaryDirectory() as output_dir:
        crawler_settings = {'KINGFISHER_PLUCK_PATH': output_dir}
        spider = spider_with_crawler(settings=crawler_settings, release_pointer='/date')
        plucker = KingfisherPluck.from_crawler(spider.crawler)
        plucked = PluckedItem({'value': '2020-10-01'})

        # Run the extension through one scraped item and the end of the crawl.
        plucker.item_scraped(plucked, spider)
        plucker.spider_closed(spider, 'itemcount')

        csv_path = os.path.join(output_dir, 'pluck-release-date.csv')
        with open(csv_path) as f:
            content = f.read()
        assert content == '2020-10-01,test\n'
예제 #8
0
    def process_item(self, item, spider):
        """
        Replace a scraped item with the single value selected by the spider's pluck pointer.

        If ``spider.pluck`` is falsy, the item is returned unchanged. Otherwise:

        -  If ``spider.package_pointer`` is set, it is resolved against ``item["data"]``: directly if the data is a
           ``dict``, otherwise by streaming the data through ``ijson``. If the pointer isn't found, or if the JSON
           text ends before the pointer is reached, the value is an ``error: ...`` message.
        -  Otherwise, ``item["data"]`` is used as-is if it is a ``dict`` and is parsed with ``json.loads`` if not.
           ``spider.release_pointer`` is then resolved against each release, and the maximum is used. For data
           types starting with "record", only the first record is inspected: its ``releases`` if that array is
           non-empty, otherwise its ``compiledRelease``.

        The value stays ``None`` if the data type starts with neither "release" nor "record", or if there is
        nothing to resolve against. If ``spider.truncate`` is set, a truthy value is sliced to that length.

        :param item: the scraped item; its ``data`` and ``data_type`` keys are read
        :param spider: the spider; ``pluck``, ``package_pointer``, ``release_pointer`` and ``truncate`` are read
        :returns: ``item`` if not plucking, otherwise a ``PluckedItem`` with a ``value`` key
        :raises ijson.common.IncompleteJSONError: if the JSON text is invalid for a reason other than the
          recognized truncation messages
        """
        if not spider.pluck:
            return item

        value = None
        if spider.package_pointer:
            pointer = spider.package_pointer
            if isinstance(item["data"], dict):
                value = _resolve_pointer(item["data"], pointer)
            else:
                try:
                    # Convert the JSON Pointer to an ijson prefix: "/a/b" becomes "a.b".
                    value = next(
                        ijson.items(item["data"], pointer[1:].replace("/", "."))
                    )
                except StopIteration:
                    value = f"error: {pointer} not found"
                except ijson.common.IncompleteJSONError as e:
                    # Only the first line of the exception message is compared.
                    message = str(e).split("\n", 1)[0]
                    if message.endswith(
                        (
                            # The JSON text can be truncated by a `bytes_received` handler.
                            "premature EOF",
                            # These messages occur if the JSON text is truncated at `"\\u` or `"\\`.
                            r"lexical error: invalid (non-hex) character occurs after '\u' inside string.",
                            r"lexical error: inside a string, '\' occurs before a character which it may not.",
                        )
                    ):
                        value = f"error: {pointer} not found within initial bytes"
                    else:
                        raise
        else:  # spider.release_pointer
            if isinstance(item["data"], dict):
                data = item["data"]
            else:
                data = json.loads(item["data"])

            if item["data_type"].startswith("release"):
                releases = data["releases"]
                if releases:
                    value = max(
                        _resolve_pointer(r, spider.release_pointer) for r in releases
                    )
            elif item["data_type"].startswith("record"):
                records = data["records"]
                if records:
                    # This assumes that the first record in the record package has the desired value.
                    record = records[0]
                    # max() raises ValueError on an empty sequence, so an empty "releases" array is treated
                    # like a missing one, and the compiled release is tried instead.
                    if "releases" in record and record["releases"]:
                        value = max(
                            _resolve_pointer(r, spider.release_pointer)
                            for r in record["releases"]
                        )
                    elif "compiledRelease" in record:
                        value = _resolve_pointer(
                            record["compiledRelease"], spider.release_pointer
                        )

        if value and spider.truncate:
            value = value[: spider.truncate]

        return PluckedItem({"value": value})