def too_soon_response(too_soon_urls):
    """Yield a RequestsMock wired for a save request made too recently.

    Registers the save-page POST plus a single job-status GET whose body
    comes from the "job-save-too-soon" fixture directory.
    """
    url = too_soon_urls[0]
    fixture_dir = FIXTURES_DIR.joinpath("job-save-too-soon")
    submit_html = fixture_dir.joinpath("submit-response.html").read_text()

    with responses.RequestsMock() as mocked:
        mocked.add(
            "POST",
            wb.url_for_savepage(url),
            body=submit_html,
            status=200,
            match=[
                responses.urlencoded_params_matcher({"url": url, "capture_all": "on"})
            ],
        )
        mocked.add(
            "GET",
            wb.url_for_jobstatus(wb.extract_job_id(submit_html)),
            body=fixture_dir.joinpath("status-0.json").read_text(),
            status=200,
        )
        yield mocked
def test_snapshot_too_soon():
    """A repeat capture inside the rate-limit window reuses the prior snapshot."""
    fixture_dir = FIXTURES_DIR.joinpath("job-save-too-soon")
    url = "https://plainlanguage.gov/"
    submit_html = fixture_dir.joinpath("submit-response.html").read_text()

    # mock the submission POST and the single job-status poll
    responses.add(
        "POST",
        wb.url_for_savepage(url),
        body=submit_html,
        status=200,
        match=[
            responses.urlencoded_params_matcher({"url": url, "capture_all": "on"})
        ],
    )
    responses.add(
        "GET",
        wb.url_for_jobstatus(wb.extract_job_id(submit_html)),
        body=fixture_dir.joinpath("status-0.json").read_text(),
        status=200,
    )

    answer, meta = wb.snapshot(url)

    assert answer == meta.snapshot_url
    assert meta.subcommand == "snapshot"
    assert meta.was_new_snapshot_created() is False
    expected_msg = (
        "The same snapshot had been made 4 minutes and 18 seconds ago. "
        "We only allow new captures of the same URL every 20 minutes."
    )
    assert meta.too_soon() == expected_msg
def save_success_response(success_urls):
    """Yield a RequestsMock wired for a successful save job.

    Registers the save-page POST and a job-status GET callback that serves
    the status fixture files in sequence (pending -> success) so that
    polling code can be exercised end to end.
    """
    srcdir = FIXTURES_DIR.joinpath("job-save-success")
    target_url = success_urls[0]
    submit_resptext = srcdir.joinpath("submit-response.html").read_text()
    # FIX: the job-status URL must be built from the extracted job id, not
    # from the raw submit-response HTML. This matches how too_soon_response
    # and test_snapshot_successful construct the same URL.
    expected_job_url = wb.url_for_jobstatus(wb.extract_job_id(submit_resptext))
    status_paths = iter(
        [
            srcdir.joinpath("status-0.json"),
            srcdir.joinpath("status-1.json"),
            srcdir.joinpath("status-9.json"),
            srcdir.joinpath("status-10.json"),
        ]
    )
    with responses.RequestsMock() as rsps:
        rsps.add(
            "POST",
            wb.url_for_savepage(target_url),
            body=submit_resptext,
            status=200,
            match=[
                responses.urlencoded_params_matcher(
                    {"url": target_url, "capture_all": "on"}
                )
            ],
        )
        rsps.add_callback(
            "GET",
            expected_job_url,
            callback=lambda req: (
                200,
                {},
                next(status_paths).read_text(),
            ),  # 2nd arg is a headers dict
        )
        yield rsps
def test_snapshot_submit_request(session):
    """Submitting a save request returns the save-page confirmation HTML."""
    url = "https://plainlanguage.gov/"
    endpoint = wb.url_for_savepage(url)
    body_text = FIXTURES_DIR.joinpath(
        "job-save-success/submit-response.html"
    ).read_text()

    responses.add(
        "POST",
        endpoint,
        body=body_text,
        status=200,
        match=[
            responses.urlencoded_params_matcher({"url": url, "capture_all": "on"})
        ],
    )

    resp = wb.submit_snapshot_request(session, url, headers={})
    assert f'<h2 id="spn-title">Saving page {url}</h2>' in resp.text
def test_snapshot_too_many_for_period():
    """When the daily capture limit is reached, snapshot() falls back to the
    availability API and returns the closest existing snapshot."""
    srcdir = FIXTURES_DIR.joinpath("job-save-too-many-today")
    submit_resptext = srcdir.joinpath("submit-response.html").read_text()
    target_url = "https://nytimes.com/"
    responses.add(
        "POST",
        wb.url_for_savepage(target_url),
        body=submit_resptext,
        status=200,
        match=[
            responses.urlencoded_params_matcher(
                {"url": target_url, "capture_all": "on"}
            )
        ],
    )
    # mock request for availability URL
    responses.add(
        "GET",
        wb.url_for_availability(target_url),
        body=srcdir.joinpath("check-availability.json").read_text(),
    )

    answer, meta = wb.snapshot(target_url)
    assert answer == meta.snapshot_url
    assert meta.subcommand == "snapshot"
    # FIX: identity comparison for booleans (was `== False`), consistent with
    # the other tests in this module.
    assert meta.was_new_snapshot_created() is False
    assert (
        meta.too_many_during_period()
        == """This URL has been already captured 10 times today. Please email us at "*****@*****.**" if you would like to discuss this more."""
    )
    # server payload is the payload returned by availability API response
    assert meta.server_payload["archived_snapshots"]["closest"]["available"] is True
def test_snapshot_submit_request_not_ok(session):
    """A non-OK (503) submission response raises ServerStatusError.

    (Original note: not sure when this would happen, when server is down?)
    """
    url = "https://plainlanguage.gov/"
    endpoint = wb.url_for_savepage(url)
    html = FIXTURES_DIR.joinpath(
        "job-save-success/submit-response.html"
    ).read_text()

    responses.add(
        "POST",
        endpoint,
        body=html,
        status=503,
        match=[
            responses.urlencoded_params_matcher({"url": url, "capture_all": "on"})
        ],
    )

    with pytest.raises(ServerStatusError) as err:
        wb.submit_snapshot_request(session, url, headers={})

    expected = f"Server status was NOT OK; returned 503 for: {endpoint}"
    assert expected in err.value.args[0]
def test_url_for_savepage():
    """The save-page URL is the Wayback save endpoint with the target appended."""
    expected = "http://web.archive.org/save/https://example.com/foo"
    assert wb.url_for_savepage("https://example.com/foo") == expected
def test_snapshot_successful(success_status_paths):
    """End-to-end happy path: submit, poll job status to completion, then
    verify the returned snapshot URL and TaskMeta contents."""
    # fixture setup (todo: refactor?)
    fixture_dir = FIXTURES_DIR.joinpath("job-save-success")
    url = "https://plainlanguage.gov/"
    submit_html = fixture_dir.joinpath("submit-response.html").read_text()
    job_id = wb.extract_job_id(submit_html)
    job_url = wb.url_for_jobstatus(job_id)

    # mock responses
    responses.add(
        "POST",
        wb.url_for_savepage(url),
        body=submit_html,
        status=200,
        match=[
            responses.urlencoded_params_matcher({"url": url, "capture_all": "on"})
        ],
    )
    responses.add_callback(
        "GET",
        job_url,
        # callback returns (status, headers dict, body)
        callback=lambda req: (200, {}, next(success_status_paths).read_text()),
    )

    answer, meta = wb.snapshot(url, user_agent="guy incognito", poll_interval=0)

    # make sure snapshot, as expected by the setup, exhausted the
    # success_status_paths iterator
    assert next(success_status_paths, False) is False

    # return values
    assert type(answer) is str
    assert type(meta) is wb.TaskMeta
    assert meta.subcommand == "snapshot"
    assert meta.target_url == url
    assert meta.created_at.strftime("%Y-%m-%d %H:%M:%S%z") == "2020-09-01 14:30:55+0000"

    data = meta.to_dict()

    # the answer is the snapshot url
    expected_url = (
        wb.BASE_DOMAIN + "/web/" + data["server_payload"]["timestamp"] + "/" + url
    )
    assert answer == expected_url

    # data response
    assert data["subcommand"] == "snapshot"
    assert data["was_new_snapshot_created"] is True
    assert data["snapshot_url"] == answer
    assert data["request_meta"]["user_agent"] == "guy incognito"

    issues = data["issues"]
    assert issues["too_soon"] is False
    assert issues["too_many_during_period"] is False

    payload = data["server_payload"]
    assert payload["status"] == "success"
    assert payload["timestamp"] in data["snapshot_url"]
    # not sure if this is always the case...what happens if there's a redirect?
    assert payload["original_url"] == url