def test_rebuild_for_passim():
    """Rebuild a sample of luxwort 1848 issues into the passim format."""
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons', 'data/rebuilt-passim'
    )

    input_issues = read_s3_issues("luxwort", "1848", input_bucket_name)

    # limit the rebuild to the first 50 issues and keep only French content
    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='passim',
        filter_language=['fr']
    )
    logger.info(f'{issue_key}: {json_files}')
def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    """Rebuild the first `limit` issues of a given newspaper and year in solr format."""
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons', 'data/rebuilt'
    )

    input_issues = read_s3_issues(newspaper_id, year, input_bucket_name)
    print(f'{newspaper_id}/{year}: {len(input_issues)} issues to rebuild')
    print(f'limiting test rebuild to first {limit} issues.')

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:limit],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
def test_rebuild_JDG2():
    """Rebuild all JDG 1862 issues in solr format."""
    input_bucket_name = "s3://original-canonical-fixed"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons', 'data/rebuilt'
    )

    input_issues = read_s3_issues("JDG", "1862", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
def test_rebuild_indeplux():
    """Rebuild the first 50 indeplux 1905 issues in solr format, keeping French content only."""
    input_bucket_name = "s3://TRANSFER"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons', 'data/rebuilt'
    )

    input_issues = read_s3_issues("indeplux", "1905", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr',
        filter_language=['fr']
    )
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
def test_rebuild_NZZ():
    """Rebuild the NZZ issues under the 1784/12 prefix in solr format."""
    input_bucket_name = "original-canonical-data"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons', 'data/rebuilt'
    )

    # iterate over the bucket directly instead of using read_s3_issues
    input_issues = impresso_iter_bucket(
        input_bucket_name,
        prefix="NZZ/1784/12/",
        item_type="issue"
    )

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None