def test_input_filtering():
    '''Test internal functions to filter, deduplicate and blacklist URLs.'''
    resources_dir = os.path.join(TEST_DIR, 'resources')

    # load dictionary from file: without a blacklist both paths survive,
    # with a blacklist entry the matching path is dropped
    inputdict = cli.load_input_dict(
        os.path.join(resources_dir, 'list-process.txt'), set())
    assert inputdict['https://httpbin.org'] == ['/status/200', '/status/404']
    inputdict = cli.load_input_dict(
        os.path.join(resources_dir, 'list-process.txt'),
        {'httpbin.org/status/404'})
    assert inputdict['https://httpbin.org'] == ['/status/200']

    # deduplication and filtering: duplicates collapse to one entry,
    # blacklisted URLs disappear, order of the survivors is preserved
    myinput = [
        'https://example.org/1', 'https://example.org/2',
        'https://example.org/2', 'https://example.org/3',
        'https://example.org/4', 'https://example.org/5',
        'https://example.org/6',
    ]
    myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
    inputdict = cli_utils.convert_inputlist(myblacklist, myinput, None, None)
    assert inputdict['https://example.org'] == ['/2', '/4', '/6']

    # every URL present in the blacklist -> empty result dictionary
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    my_blacklist = cli_utils.load_blacklist(
        os.path.join(resources_dir, 'list-discard.txt'))
    inputdict = cli_utils.convert_inputlist(my_blacklist, my_urls, None, None)
    assert len(inputdict) == 0

    # URL substring filter: only URLs containing one of the filter strings
    # are kept; the list is reloaded before each call for a clean state
    # (cli_utils is used consistently here; the cli module exposes the
    # same convert_inputlist helper)
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(cli_utils.convert_inputlist(None, my_urls, ['status'], None)) == 1
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(cli_utils.convert_inputlist(None, my_urls, ['teststring'], None)) == 0
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(cli_utils.convert_inputlist(
        None, my_urls, ['status', 'teststring'], None)) == 1

    # malformed URLs are silently discarded; pass an empty set (not an
    # empty dict) as the blacklist, consistent with the other call sites
    inputdict = cli_utils.convert_inputlist(
        set(), ['123345', 'https://www.example.org/1'], None, None)
    assert len(inputdict) == 1
def test_cli_pipeline():
    '''Test the command-line processing pipeline end to end.'''
    # straight command-line input (currently disabled):
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0

    # URL listing mode: the pipeline returns nothing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None

    # conversion and storage: an unsupported scheme (ftps) is discarded,
    # a valid https URL goes through the pipeline
    # (cli_utils used consistently; cli exposes the same helper)
    inputdict = cli_utils.convert_inputlist(
        None, ['ftps://www.example.org/'], None, None)
    assert inputdict == dict()
    inputdict = cli_utils.convert_inputlist(
        None, ['https://www.example.org/'], None, None)
    assert cli_utils.url_processing_pipeline(args, inputdict) is None

    # input list + blacklist options
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt'),
        '--blacklist', os.path.join(RESOURCES_DIR, 'list-discard.txt'),
        '--archived',
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None

    # backoff between requests to the same domain: processing two URLs
    # on one host must take more than 2 seconds
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None, None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2

    # loaded blacklist filters everything out -> empty dict is handled
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None, None)
    cli_utils.url_processing_pipeline(args, inputdict)

    # backup of downloaded content
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)

    # date-based exclusion: metadata extraction on the sample fails -> None
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None

    # JSON output succeeds on the same document
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None

    # dry-run file processing pipeline on an empty directory,
    # then on the resources directory
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)

    # sitemap listing mode prints nothing for this URL
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0

    # custom configuration file: loading it must not break extraction
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    # smoke-test that the settings file parses
    config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None

    # combined CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)