def test_reports_timeout_errors():
    """A network timeout while calling the S2 API surfaces as S2ApiException."""
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        # Simulate the HTTP layer timing out on every GET.
        mock_requests.get.side_effect = requests.exceptions.Timeout()
        cmd = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiException):
            run_command(cmd)
def run_commands_for_arxiv_ids(
    CommandClasses: CommandList,
    arxiv_id_list: List[str],
    pipeline_args: Namespace,
) -> PipelineDigest:
    """
    Run a sequence of pipeline commands for a list of arXiv IDs.

    Each command class is instantiated with its own parser defaults, which are
    then overridden with the relevant pipeline-level arguments, and the command
    is run. Any exception from a command is logged (with traceback) and
    re-raised.

    Returns a digest mapping each arXiv ID to a summary of its processing
    results.
    """
    for CommandCls in CommandClasses:
        # Initialize arguments for each command to defaults.
        command_args_parser = ArgumentParser()
        CommandCls.init_parser(command_args_parser)
        command_args = command_args_parser.parse_known_args("")[0]

        # Pass pipeline arguments to command.
        command_args.arxiv_ids = arxiv_id_list
        command_args.arxiv_ids_file = None
        command_args.v = pipeline_args.v
        command_args.source = pipeline_args.source
        command_args.batch_size = pipeline_args.entity_batch_size
        command_args.keep_intermediate_files = pipeline_args.keep_intermediate_files
        command_args.log_names = [log_filename]
        command_args.schema = pipeline_args.database_schema
        command_args.create_tables = pipeline_args.database_create_tables
        command_args.data_version = pipeline_args.data_version
        if CommandCls == FetchArxivSources:
            command_args.s3_bucket = pipeline_args.s3_arxiv_sources_bucket
        if CommandCls in [StorePipelineLog, StoreResults]:
            command_args.s3_bucket = pipeline_args.s3_output_bucket

        if CommandCls == StorePipelineLog:
            # Ensure everything logged so far is on disk before uploading it.
            logging.debug("Flushing file log before storing pipeline logs.")
            file_log_handler.flush()

        logging.debug(
            "Creating command %s with args %s",
            CommandCls.get_name(),
            vars(command_args),
        )
        command = CommandCls(command_args)
        logging.info("Launching command %s", CommandCls.get_name())
        try:
            run_command(command)
        # Catch-all for unexpected errors from running commands. With the amount of networking
        # and subprocess calls in the commands, it is simply unlikely that we can predict and
        # write exceptions for every possible exception that could be thrown.
        except Exception:  # pylint: disable=broad-except
            # Fix: the original passed the exception object as an extra
            # positional logging argument with no matching format placeholder,
            # which makes the logging module fail to render the record and
            # drops the traceback. logging.exception records the message with
            # the current traceback attached.
            logging.exception(
                "Unexpected exception processing papers: %s", arxiv_id_list
            )
            # Bare 'raise' preserves the original traceback exactly.
            raise
        logging.info("Finished running command %s", CommandCls.get_name())

    # Create digest describing the result of running these commands for these papers.
    processing_summary: PipelineDigest = {
        id_: make_paper_digest(entity_pipelines, id_) for id_ in arxiv_id_list
    }
    return processing_summary
def test_reports_rate_limiting():
    """An HTTP 429 from the S2 API raises S2ApiRateLimitingException."""
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        response = Mock()
        response.ok = False
        response.status_code = 429
        mock_requests.get.return_value = response
        cmd = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiRateLimitingException):
            run_command(cmd)
def test_no_s2_paper_references_raises_exeption():
    """A paper record with an empty reference list raises S2ReferencesNotFoundException."""
    # NOTE(review): "exeption" in the name is a typo, kept for test-ID stability.
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        response = Mock()
        response.ok = True
        response.json.return_value = {"references": []}
        mock_requests.get.return_value = response
        cmd = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ReferencesNotFoundException):
            run_command(cmd)
def test_no_s2_paper_raises_exception():
    """An HTTP 404 from the S2 API raises S2PaperNotFoundException."""
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        response = Mock()
        response.ok = False
        response.status_code = 404
        mock_requests.get.return_value = response
        cmd = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2PaperNotFoundException):
            run_command(cmd)
def test_reports_generic_exception_for_unhandled_non_2xxs():
    """A non-2xx status with no dedicated handler falls back to S2ApiException."""
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        response = Mock()
        response.ok = False
        response.status_code = 499  # no specific exception mapped to this code
        mock_requests.get.return_value = response
        cmd = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiException):
            run_command(cmd)
def test_breaks_retry_loop_as_soon_as_successful_fetch_from_arxiv():
    """When the first fetch succeeds, no further attempts are made (one sleep, one fetch)."""
    with patch("common.commands.fetch_arxiv_sources.time.sleep") as mock_sleep:
        with patch(
            "common.commands.fetch_arxiv_sources.fetch_from_arxiv"
        ) as mock_fetch:
            mock_fetch.return_value = "Some result"
            run_command(
                FetchArxivSources(
                    Args(
                        arxiv_ids=["fakeid"],
                        arxiv_ids_file=None,
                        source="arxiv",
                        s3_bucket=None,
                    )
                )
            )
            assert mock_fetch.call_count == 1
            assert mock_sleep.call_count == 1
def test_makes_up_to_k_attempts_to_fetch_from_arxiv():
    """A persistently failing fetch is retried MAX_FETCH_ATTEMPTS times with backoff, then re-raised."""
    with patch("common.commands.fetch_arxiv_sources.time.sleep") as mock_sleep:
        with patch(
            "common.commands.fetch_arxiv_sources.fetch_from_arxiv"
        ) as mock_fetch:
            mock_fetch.side_effect = FetchFromArxivException()
            cmd = FetchArxivSources(
                Args(
                    arxiv_ids=["fakeid"],
                    arxiv_ids_file=None,
                    source="arxiv",
                    s3_bucket=None,
                )
            )
            with pytest.raises(FetchFromArxivException):
                run_command(cmd)
            assert mock_fetch.call_count == MAX_FETCH_ATTEMPTS
            # Every attempt is followed by the same fixed backoff delay.
            assert mock_sleep.call_args_list == [
                call(BACKOFF_FETCH_DELAY),
                call(BACKOFF_FETCH_DELAY),
                call(BACKOFF_FETCH_DELAY),
            ]
# Fetch pipeline config (includes credentials for accessing services). if args.config: fetch_config(args.config) # Load arXiv IDs from arguments or by fetching recent arXiv IDs from a database. arxiv_ids = load_arxiv_ids_using_args(args) if arxiv_ids is None and args.days is not None: logging.debug("Fetching new arXiv IDs for the last %d day(s).", args.days) arxiv_ids_path = "arxiv_ids.txt" fetch_command_args = create_args(v=args.v, days=args.days, output_file=arxiv_ids_path) fetch_arxiv_ids_command = FetchNewArxivIds(fetch_command_args) run_command(fetch_arxiv_ids_command) arxiv_ids = read_arxiv_ids_from_file(arxiv_ids_path) # Load options for the job from. Command line options for jobs take precedence over properties # defined in the job spec downloaded from S3. s3_job_spec = load_job_from_s3( args.s3_job_spec) if args.s3_job_spec else None arxiv_ids = s3_job_spec.arxiv_ids if ( not arxiv_ids) and s3_job_spec else arxiv_ids # If the list of arXiv IDs still hasn't been defined, set it to an empty list. This will # allow the rest of this script to run, and provide hopefully useful debugging messages. if arxiv_ids is None: arxiv_ids = [] if len(arxiv_ids) == 0: logging.warning( # pylint: disable=logging-not-lazy
def test_no_s2_paper_raises_exception():
    """An HTTP 404 from the S2 API raises S2PaperNotFoundException.

    Fix: this duplicate definition previously ran without patching the
    `requests` module, so it performed a real network call against the S2 API
    (slow, flaky, and environment-dependent). It now stubs the HTTP layer the
    same way the other tests in this file do.
    """
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_resp = Mock()
        mock_resp.ok = False
        mock_resp.status_code = 404
        mock_requests.get.return_value = mock_resp
        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2PaperNotFoundException):
            run_command(command)