Example #1
def test_reports_timeout_errors():
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_requests.get.side_effect = requests.exceptions.Timeout()

        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiException):
            run_command(command)
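
These tests build command arguments with an Args helper that is not shown in these excerpts. A minimal sketch of what such a helper might look like, assuming it simply exposes keyword arguments as attributes (the name Args comes from the tests; the body is hypothetical):

from argparse import Namespace

def Args(**fields):
    # Hypothetical helper: expose keyword arguments as attributes, mimicking
    # the Namespace that ArgumentParser.parse_args() would produce.
    return Namespace(**fields)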
Example #2
def run_commands_for_arxiv_ids(
    CommandClasses: CommandList,
    arxiv_id_list: List[str],
    pipeline_args: Namespace,
) -> PipelineDigest:
    " Run a sequence of pipeline commands for a list of arXiv IDs. "

    for CommandCls in CommandClasses:

        # Initialize arguments for each command to defaults.
        command_args_parser = ArgumentParser()
        CommandCls.init_parser(command_args_parser)
        command_args = command_args_parser.parse_known_args("")[0]

        # Pass pipeline arguments to command.
        command_args.arxiv_ids = arxiv_id_list
        command_args.arxiv_ids_file = None
        command_args.v = pipeline_args.v
        command_args.source = pipeline_args.source
        command_args.batch_size = pipeline_args.entity_batch_size
        command_args.keep_intermediate_files = pipeline_args.keep_intermediate_files
        command_args.log_names = [log_filename]
        command_args.schema = pipeline_args.database_schema
        command_args.create_tables = pipeline_args.database_create_tables
        command_args.data_version = pipeline_args.data_version
        if CommandCls == FetchArxivSources:
            command_args.s3_bucket = pipeline_args.s3_arxiv_sources_bucket
        if CommandCls in [StorePipelineLog, StoreResults]:
            command_args.s3_bucket = pipeline_args.s3_output_bucket

        if CommandCls == StorePipelineLog:
            logging.debug("Flushing file log before storing pipeline logs.")
            file_log_handler.flush()

        logging.debug(
            "Creating command %s with args %s",
            CommandCls.get_name(),
            vars(command_args),
        )
        command = CommandCls(command_args)
        logging.info("Launching command %s", CommandCls.get_name())
        try:
            run_command(command)
        # Catch-all for unexpected errors from running commands. With the amount of
        # networking and subprocess calls in the commands, we cannot realistically
        # anticipate and handle every exception that could be raised.
        except Exception:  # pylint: disable=broad-except
            logging.exception(
                "Unexpected exception processing papers: %s", arxiv_id_list
            )
            raise

        logging.info("Finished running command %s", CommandCls.get_name())

    # Create a digest describing the result of running these commands for these papers.
    processing_summary: PipelineDigest = {}
    for id_ in arxiv_id_list:
        processing_summary[id_] = make_paper_digest(entity_pipelines, id_)
    return processing_summary
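
Note the trick used to initialize each command's arguments: parsing an empty argument list yields a Namespace populated entirely with defaults, which the pipeline then overrides field by field. A small standalone demonstration of the same technique:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--batch-size", type=int, default=100)
parser.add_argument("-v", action="store_true")

# Parsing an empty argument list ("" or []) returns a Namespace filled with
# defaults; individual fields can then be overwritten programmatically.
args = parser.parse_known_args([])[0]
assert args.batch_size == 100 and args.v is False
args.batch_size = 25  # override, as run_commands_for_arxiv_ids does above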
Example #3
def test_reports_rate_limiting():
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_resp = Mock()
        mock_requests.get.return_value = mock_resp
        mock_resp.ok = False
        mock_resp.status_code = 429

        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiRateLimitingException):
            run_command(command)
Example #4
def test_no_s2_paper_references_raises_exception():
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_resp = Mock()
        mock_requests.get.return_value = mock_resp
        mock_resp.ok = True
        mock_resp.json.return_value = {"references": []}

        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ReferencesNotFoundException):
            run_command(command)
Example #5
def test_no_s2_paper_raises_exception():
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_resp = Mock()
        mock_requests.get.return_value = mock_resp
        mock_resp.ok = False
        mock_resp.status_code = 404

        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2PaperNotFoundException):
            run_command(command)
Example #6
def test_reports_generic_exception_for_unhandled_non_2xxs():
    with patch("common.commands.fetch_s2_data.requests") as mock_requests:
        mock_resp = Mock()
        mock_requests.get.return_value = mock_resp
        mock_resp.ok = False
        mock_resp.status_code = 499

        command = FetchS2Metadata(Args(arxiv_ids=['fakeid']))
        with pytest.raises(S2ApiException):
            run_command(command)
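
Taken together, Examples #1 and #3-#6 pin down the error handling that FetchS2Metadata applies to S2 API responses. A minimal sketch of the dispatch they imply; the exception names come from the tests, while the function name, URL, and exception hierarchy are assumptions:

import requests

class S2ApiException(Exception): pass                    # assumed hierarchy
class S2ApiRateLimitingException(S2ApiException): pass
class S2PaperNotFoundException(S2ApiException): pass
class S2ReferencesNotFoundException(S2ApiException): pass

def fetch_s2_metadata(arxiv_id):
    # Illustrative endpoint; the real URL is not shown in these excerpts.
    url = "https://api.semanticscholar.org/v1/paper/arXiv:" + arxiv_id
    try:
        resp = requests.get(url)
    except requests.exceptions.Timeout:
        raise S2ApiException("Timed out waiting for the S2 API")  # Example #1
    if not resp.ok:
        if resp.status_code == 404:
            raise S2PaperNotFoundException(arxiv_id)              # Example #5
        if resp.status_code == 429:
            raise S2ApiRateLimitingException()                    # Example #3
        # Any other non-2xx status falls through to the generic error (Example #6).
        raise S2ApiException("Unexpected status %d" % resp.status_code)
    data = resp.json()
    if not data.get("references"):
        raise S2ReferencesNotFoundException(arxiv_id)             # Example #4
    return data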
Example #7
def test_breaks_retry_loop_as_soon_as_successful_fetch_from_arxiv():
    with patch("common.commands.fetch_arxiv_sources.time.sleep") as mock_sleep:
        with patch("common.commands.fetch_arxiv_sources.fetch_from_arxiv"
                   ) as mock_fetch:
            mock_fetch.return_value = "Some result"
            args = Args(arxiv_ids=["fakeid"],
                        arxiv_ids_file=None,
                        source="arxiv",
                        s3_bucket=None)
            command = FetchArxivSources(args)

            run_command(command)

            assert mock_fetch.call_count == 1
            assert mock_sleep.call_count == 1
Example #8
def test_makes_up_to_k_attempts_to_fetch_from_arxiv():
    with patch("common.commands.fetch_arxiv_sources.time.sleep") as mock_sleep:
        with patch("common.commands.fetch_arxiv_sources.fetch_from_arxiv"
                   ) as mock_fetch:
            mock_fetch.side_effect = FetchFromArxivException()
            args = Args(arxiv_ids=["fakeid"],
                        arxiv_ids_file=None,
                        source="arxiv",
                        s3_bucket=None)
            command = FetchArxivSources(args)

            with pytest.raises(FetchFromArxivException):
                run_command(command)

            assert mock_fetch.call_count == MAX_FETCH_ATTEMPTS
            assert mock_sleep.call_args_list == [
                call(BACKOFF_FETCH_DELAY),
                call(BACKOFF_FETCH_DELAY),
                call(BACKOFF_FETCH_DELAY),
            ]
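
Examples #7 and #8 together imply a bounded retry loop in FetchArxivSources: time.sleep runs before every attempt (hence one sleep even on an immediately successful fetch), and the three expected sleep calls suggest MAX_FETCH_ATTEMPTS is 3 here. A sketch under those assumptions; the loop shape is inferred from the tests, not taken from the source:

import time

MAX_FETCH_ATTEMPTS = 3    # inferred from the three expected sleep calls
BACKOFF_FETCH_DELAY = 5   # illustrative value; the real constant is not shown

class FetchFromArxivException(Exception):
    pass

def fetch_from_arxiv(arxiv_id):
    # Stub standing in for the real helper that the tests patch.
    raise FetchFromArxivException()

def fetch_with_retries(arxiv_id):
    # Sleep precedes each attempt; return on the first success; re-raise the
    # exception from the final failed attempt.
    for attempt in range(MAX_FETCH_ATTEMPTS):
        time.sleep(BACKOFF_FETCH_DELAY)
        try:
            return fetch_from_arxiv(arxiv_id)
        except FetchFromArxivException:
            if attempt == MAX_FETCH_ATTEMPTS - 1:
                raise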
Example #9
    # Fetch pipeline config (includes credentials for accessing services).
    if args.config:
        fetch_config(args.config)

    # Load arXiv IDs from arguments or by fetching recent arXiv IDs from a database.
    arxiv_ids = load_arxiv_ids_using_args(args)
    if arxiv_ids is None and args.days is not None:
        logging.debug("Fetching new arXiv IDs for the last %d day(s).",
                      args.days)
        arxiv_ids_path = "arxiv_ids.txt"
        fetch_command_args = create_args(v=args.v,
                                         days=args.days,
                                         output_file=arxiv_ids_path)
        fetch_arxiv_ids_command = FetchNewArxivIds(fetch_command_args)
        run_command(fetch_arxiv_ids_command)
        arxiv_ids = read_arxiv_ids_from_file(arxiv_ids_path)

    # Load options for the job from the job spec. Command-line options for jobs take
    # precedence over properties defined in the job spec downloaded from S3.
    s3_job_spec = load_job_from_s3(args.s3_job_spec) if args.s3_job_spec else None
    if not arxiv_ids and s3_job_spec:
        arxiv_ids = s3_job_spec.arxiv_ids

    # If the list of arXiv IDs still hasn't been defined, set it to an empty list. This will
    # allow the rest of this script to run, and provide hopefully useful debugging messages.
    if arxiv_ids is None:
        arxiv_ids = []
    if len(arxiv_ids) == 0:
        logging.warning(  # pylint: disable=logging-not-lazy
Example #10
def test_no_s2_paper_raises_exception():
    # Unlike Example #5, this variant does not patch requests, so it exercises
    # the live S2 API and requires network access.
    command = FetchS2Metadata(Args(arxiv_ids=["fakeid"]))
    with pytest.raises(S2PaperNotFoundException):
        run_command(command)