def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags, TEI output, table exclusion and a target URL
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: tables are excluded when the flag is given
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    # map_args() derives the output format from the individual flags
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    # re-mapping after toggling the format flags by hand
    args.xml = False
    args.csv = True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv = False
    args.json = True
    args = cli.map_args(args)
    assert args.output_format == 'json'
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage: unsupported scheme yields an empty dict
    inputdict = add_to_compressed_dict(['ftps://www.example.org/'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict(['https://www.example.org/'])
    # CLI args
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    # --list mode only prints URLs, so the pipeline returns nothing
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = dict()
    domain_dict['https://httpbin.org'] = deque([
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    ])
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    # all 6 URLs are attempted; second tuple element (counter) stays None
    results = download_queue_processing(domain_dict, args, None, config)
    assert len(results[0]) == 6 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['http://test.org'] = deque(['/1'])
    # drained domain is removed: both dicts come back empty
    assert draw_backoff_url(testdict, backoffdict, 0, set()) == ('http://test.org/1', dict(), dict(), 'http://test.org')
    testdict['http://test.org'] = deque(['/1'])
    # a timestamp far in the past does not block the domain
    backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert draw_backoff_url(testdict, backoffdict, 0, set()) == ('http://test.org/1', dict(), dict(), 'http://test.org')
    # code hangs, logical:
    #testdict['http://test.org'] = deque(['/1'])
    #backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 0)
    # download buffer: one URL drawn per distinct domain
    domain_dict = {
        'https://test.org': deque(['/1', '/2', '/3']),
        'https://test2.org': deque(['/1', '/2', '/3']),
        'https://test3.org': deque(['/1', '/2', '/3']),
        'https://test4.org': deque(['/1', '/2', '/3']),
        'https://test5.org': deque(['/1', '/2', '/3']),
        'https://test6.org': deque(['/1', '/2', '/3'])
    }
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), 0, threads=1)
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), 0, threads=2)
    assert len(bufferlist) == 6
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    testargs = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(
        args, args.outputdir, '')
    # generated file name is a slug of sufficient length plus extension
    assert len(filepath) >= 10 and filepath.endswith('.csv')
    assert destdir == '/root/forbidden/'
    # /root/ is not writable for the test user
    assert cli_utils.check_outputdir_status(args.outputdir) is False
    testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.check_outputdir_status(args.outputdir) is True
    # test fileslug for name
    filepath, destdir = cli_utils.determine_output_path(
        args, args.outputdir, '', new_filename='AAZZ')
    assert filepath.endswith('AAZZ.xml')
    # test json output
    # FIX: the previous `args2 = args` was a plain alias, not a copy, and the
    # follow-up call even used `args` again — mutate `args` directly instead
    # of keeping the misleading second name
    args.xml, args.json = False, True
    args = cli.map_args(args)
    filepath2, destdir2 = cli_utils.determine_output_path(
        args, args.outputdir, '', new_filename='AAZZ')
    assert filepath2.endswith('AAZZ.json')
    # test directory counter
    assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
    # test file writing
    testargs = ['', '--csv', '-o', '/dev/null/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    result = 'DADIDA'
    cli_utils.write_result(result, args)
    # process with no counter
    assert cli_utils.process_result('DADIDA', args, None, None, DEFAULT_CONFIG) is None
    # test keeping dir structure
    testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(
        args, 'testfile.txt', '')
    assert filepath == 'test/testfile.txt'
    # test hash as output file name
    assert args.hash_as_name is False
    args.hash_as_name = True
    assert args.keep_dirs is True
    args.keep_dirs = False
    filepath, destdir = cli_utils.determine_output_path(
        args, 'testfile.txt', '')
    # hash of the empty content string in URL-safe base64 — presumably SHA1,
    # TODO confirm against determine_output_path implementation
    assert filepath == 'test/2jmj7l5rSw0yVb-vlWAYkK-YBwk.txt'
def test_parser():
    '''test argument parsing for the command-line interface'''
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: tables are excluded when the flag is given
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = True
    args.blacklist = os.path.join(TEST_DIR, 'resources/list-discard.txt')
    cli.process_args(args)
    # the blacklist path is loaded into a collection of 2 entries
    assert len(args.blacklist) == 2
    # filter
    testargs = [
        '', '-i', 'resources/list-discard.txt', '--url-filter', 'test1',
        'test2'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.inputfile == 'resources/list-discard.txt'
    assert args.url_filter == ['test1', 'test2']
    resources_dir = os.path.join(TEST_DIR, 'resources')
    args.inputfile = os.path.join(resources_dir, 'list-discard.txt')
    # BUG FIX: was `args.blacklist == os.path.join(...)` — a no-op comparison
    # instead of an assignment, so the blacklist was never set before
    # process_args() ran
    args.blacklist = os.path.join(resources_dir, 'list-discard.txt')
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    # every input URL is blacklisted, so nothing is printed
    assert len(f.getvalue()) == 0
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    testargs = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filename = cli_utils.determine_filename(args)
    # generated name is a slug of sufficient length plus format extension
    assert len(filename) >= 10 and filename.endswith('.csv')
    # /root/ is not writable for the test user
    assert cli_utils.check_outputdir_status(args.outputdir) is False
    testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.check_outputdir_status(args.outputdir) is True
    assert cli_utils.determine_filename(args).endswith('.xml')
def test_input_filtering():
    '''test internal functions to filter urls'''
    testargs = ['']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # load dictionary: URLs are compressed into domain -> deque of paths
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200', '/status/404'])
    # loading again with a blacklist removes the matching URL
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    args.blacklist = {'httpbin.org/status/404'}
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200'])
    # deduplication and filtering
    myinput = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
    myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
    inputdict = add_to_compressed_dict(myinput, myblacklist)
    # duplicate /2 collapsed, blacklisted paths dropped
    assert inputdict['https://example.org'] == deque(['/2', '/4', '/6'])
    # URL in blacklist: discard list covers all input URLs
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
    my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
    inputdict = add_to_compressed_dict(my_urls, my_blacklist)
    assert len(inputdict) == 0
    # URL filter: substring filters select which URLs are kept
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
    assert len(add_to_compressed_dict(my_urls, None, ['status'], None)) == 1
    assert len(add_to_compressed_dict(my_urls, None, ['teststring'], None)) == 0
    assert len(add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)) == 1
    # malformed URLs are silently discarded
    inputdict = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
    assert len(inputdict) == 1
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # whitespace-only and oversized inputs are rejected
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #assert utils.fetch_url('https://httpbin.org/status/404') is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None # too small
    #assert cli.examine(teststring, args, url) is None
    # NOTE: live network requests to httpbin.org follow
    url = 'https://httpbin.org/links/2/2'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    # page with links only: no extractable text expected
    assert cli.examine(teststring, args, url) is None
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # multiprocessing
    domain_dict = dict()
    domain_dict['httpbin.org'] = [
        'https://httpbin.org/status/301', 'https://httpbin.org/status/304',
        'https://httpbin.org/status/200', 'https://httpbin.org/status/300',
        'https://httpbin.org/status/400', 'https://httpbin.org/status/500'
    ]
    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25, None) is None
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing: --list mode returns nothing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    resources_dir = os.path.join(TEST_DIR, 'resources')
    #testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt'), '--blacklist', os.path.join(resources_dir, 'list-discard.txt')]
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #print(args.blacklist)
    #assert args.blacklist is not None
    # test backoff between domain requests: a 2-second sleep time must
    # make the pipeline take longer than 2 seconds overall
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
    # test date-based exclusion: sample page fails the metadata requirement
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test timeout
    testargs = ['', '-out', 'xml', '--timeout']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # file processing pipeline (dry run on an empty input dir)
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # None, whitespace-only and oversized inputs are rejected
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None # too small
    #assert cli.examine(teststring, args, url) is None
    #url = 'https://httpbin.org/links/2/2'
    #teststring = utils.fetch_url(url)
    #assert teststring is not None
    #assert cli.examine(teststring, args, url) is None
    # NOTE: live network request to httpbin.org
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # single/multiprocessing
    domain_dict = dict()
    domain_dict['https://httpbin.org'] = [
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    ]
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    results = cli_utils.download_queue_processing(domain_dict, args, None, config)
    # only 5 of the 6 URLs end up in the results — presumably the redirect
    # is excluded; TODO confirm against download_queue_processing
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['http://test.org'] = ['/1']
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 0) == ('http://test.org/1', dict(), dict(), 0)
    testdict['http://test.org'] = ['/1']
    # timestamp far in the past does not block the domain
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 0) == ('http://test.org/1', dict(), dict(), 0)
    testdict['http://test.org'] = ['/1']
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 3)
    testdict['http://test.org'] = ['/1']
    # timestamp in the future resets the counter to 0
    backoffdict['test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 0)
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # None, whitespace-only and oversized inputs are rejected
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None # too small
    #assert cli.examine(teststring, args, url) is None
    # NOTE: live network requests to httpbin.org follow
    url = 'https://httpbin.org/links/2/2'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    # page with links only: no extractable text expected
    assert cli.examine(teststring, args, url) is None
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # multiprocessing
    domain_dict = dict()
    domain_dict['httpbin.org'] = [
        'https://httpbin.org/status/301', 'https://httpbin.org/status/304',
        'https://httpbin.org/status/200', 'https://httpbin.org/status/300',
        'https://httpbin.org/status/400', 'https://httpbin.org/status/505'
    ]
    # only the /301 URL is returned — TODO confirm the selection criterion
    assert cli_utils.multi_threaded_processing(
        domain_dict, args, 0.25,
        None) == (['https://httpbin.org/status/301'], None)
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['test.org'] = ['http://test.org/1']
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 0) == ('http://test.org/1', dict(), dict(), 0)
    testdict['test.org'] = ['http://test.org/1']
    # timestamp far in the past does not block the domain
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 0) == ('http://test.org/1', dict(), dict(), 0)
    testdict['test.org'] = ['http://test.org/1']
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 3)
    testdict['test.org'] = ['http://test.org/1']
    # timestamp in the future resets the counter to 0
    backoffdict['test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 0)
def test_input_type():
    '''test input type errors'''
    cli_args = ['', '-v']
    with patch.object(sys, 'argv', cli_args):
        parsed_args = cli.parse_args(cli_args)
    # binary input: a GIF is no valid HTML document
    with open('docs/trafilatura-demo.gif', 'rb') as inputfh:
        binary_chunk = inputfh.read(1024)
    assert cli.examine(binary_chunk, parsed_args) is None
    # plain-text input: reST source is no valid HTML document either
    with open('docs/index.rst', 'r') as inputfh:
        text_content = inputfh.read()
    assert cli.examine(text_content, parsed_args) is None
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags, TEI output, table exclusion and a target URL
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: tables are excluded when the flag is given
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = True
    args.blacklist = os.path.join(TEST_DIR, 'resources/list-discard.txt')
    cli.process_args(args)
    # the blacklist path is loaded into a collection of 4 entries
    assert len(args.blacklist) == 4
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing: --list mode returns nothing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    # test backoff between domain requests: a 2-second sleep time must
    # make the pipeline take longer than 2 seconds overall
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # URL in blacklist: the discard list covers all input URLs
    testargs = [
        '', '-i', os.path.join(resources_dir, 'list-process.txt'), '-b',
        os.path.join(resources_dir, 'list-discard.txt')
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # FIX: removed the leftover debug print() that needlessly invoked
    # url_processing_checks() a second time
    assert len(cli_utils.url_processing_checks(args, my_urls)) == 0
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    # FIX: removed leftover debug print() calls that cluttered test output
    testargs = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(args, args.outputdir)
    # generated name is a slug of sufficient length plus format extension
    assert len(filepath) >= 10 and filepath.endswith('.csv')
    assert destdir == '/root/forbidden/'
    # /root/ is not writable for the test user
    assert cli_utils.check_outputdir_status(args.outputdir) is False
    testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.check_outputdir_status(args.outputdir) is True
    # test fileslug for name
    filepath, destdir = cli_utils.determine_output_path(args, args.outputdir, new_filename='AAZZ')
    assert filepath.endswith('AAZZ.xml')
    # test directory counter
    assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
    # test file writing
    testargs = ['', '--csv', '-o', '/dev/null/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    result = 'DADIDA'
    cli_utils.write_result(result, args)
    # process with no counter
    assert cli_utils.process_result('DADIDA', args, None, None) is None
    # test keeping dir structure
    testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt')
    assert filepath == 'test/testfile.txt'
def test_input_type():
    '''test input type errors'''
    testfile = 'docs/trafilatura-demo.gif'
    # an invalid URL makes main() bail out without raising
    testargs = ['', '-u', 'http']
    with patch.object(sys, 'argv', testargs):
        assert cli.main() is None
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # binary input: a GIF is no valid HTML document
    with open(testfile, 'rb') as f:
        teststring = f.read(1024)
    assert cli.examine(teststring, args) is None
    # plain-text input: reST source is no valid HTML document either
    testfile = 'docs/usage.rst'
    with open(testfile, 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test file list: resources dir holds between 10 and 20 files
    assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 20
def test_input_type():
    '''test input type errors'''
    testfile = 'docs/trafilatura-demo.gif'
    # an invalid URL makes main() bail out without raising
    testargs = ['', '-u', 'http']
    with patch.object(sys, 'argv', testargs):
        assert cli.main() is None
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # binary input: a GIF is no valid HTML document
    with open(testfile, 'rb') as f:
        teststring = f.read(1024)
    assert cli.examine(teststring, args) is None
    # plain-text input: reST source is no valid HTML document either
    testfile = 'docs/index.rst'
    with open(testfile, 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test file list
    assert cli_utils.generate_filelist(os.path.join(TEST_DIR, 'resources')) is not None
def test_download():
    '''test page download and command-line interface'''
    cli_args = ['', '-v']
    with patch.object(sys, 'argv', cli_args):
        parsed_args = cli.parse_args(cli_args)
    # degenerate inputs must all be rejected
    for bogus_input in (None, ' ', '0' * int(10e7)):
        assert cli.examine(bogus_input, parsed_args) is None
    #url = 'https://httpbin.org/status/200'
    #teststring = fetch_url(url)
    #assert teststring is None # too small
    #assert cli.examine(teststring, args, url) is None
    #url = 'https://httpbin.org/links/2/2'
    #teststring = fetch_url(url)
    #assert teststring is not None
    #assert cli.examine(teststring, args, url) is None
    # a real HTML page fetched over the network yields extracted content
    target = 'https://httpbin.org/html'
    page_content = fetch_url(target)
    assert page_content is not None
    assert cli.examine(page_content, parsed_args, target) is not None
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags (-vv counts verbosity), TEI output, no tables
    testargs = [
        '', '-fvv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose == 2
    # both the legacy and the new table flag store False when set
    assert args.notables is False and args.no_tables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = [
        '', '-out', 'csv', '--no-tables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose == 0
    assert args.output_format == 'csv'
    assert args.no_tables is False
    # test args mapping
    testargs = ['', '--xml', '--nocomments', '--precision', '--recall']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml' and args.no_comments is False
    # combination possible (?)
    assert args.precision is True and args.recall is True
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    testargs = ['', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    # deprecated --with-metadata maps to only_with_metadata
    assert args.only_with_metadata is True
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = 1
    args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    cli.process_args(args)
    # the blacklist path is loaded into a collection of 2 entries
    assert len(args.blacklist) == 2
    # filter
    testargs = [
        '', '-i', 'resources/list-discard.txt', '--url-filter', 'test1',
        'test2'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.inputfile == 'resources/list-discard.txt'
    assert args.url_filter == ['test1', 'test2']
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    # every input URL is blacklisted, so nothing is printed
    assert len(f.getvalue()) == 0
    # version: --version exits with code 0 and prints the version string
    testargs = ['', '--version']
    with pytest.raises(SystemExit) as e, redirect_stdout(f):
        with patch.object(sys, 'argv', testargs):
            args = cli.parse_args(testargs)
    assert e.type == SystemExit
    assert e.value.code == 0
    assert re.match(
        r'Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]',
        f.getvalue())
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing
    # Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
    os.environ['PYTHONIOENCODING'] = "utf-8"
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt'),
        '--blacklist', os.path.join(RESOURCES_DIR, 'list-discard.txt'),
        '--archived'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests: must take longer than 2 seconds
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test blacklist and empty dict
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # test date-based exclusion: sample fails the metadata requirement
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    testargs = ['', '-out', 'xml', '--only-with-metadata', '--precision']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json', '--recall']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)
    # sitemaps: no sitemap at this address, so nothing is printed
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
    # config file: strict settings make extraction fail on the sample
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    # config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    #with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
    #    teststring = f.read()
    #result = cli.examine(teststring, args)
    #assert '[link](testlink.html)' in result # and 'test.jpg' in result
    # Crawling: a page without internal links prints nothing
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert len(f.getvalue()) == 0
    # links permitted
    testargs = [
        '', '--crawl', 'https://httpbin.org/links/1/1', '--list',
        '--parallel', '1'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
    # 0 links permitted
    args.crawl = 'https://httpbin.org/links/4/4'
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args, n=0)
    # print(f.getvalue())
    # 4 URLs plus trailing newline -> 5 chunks after split
    assert len(f.getvalue().split('\n')) == 5
    # Exploration (Sitemap + Crawl)
    testargs = ['', '--explore', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage: unsupported scheme and malformed URL
    # both yield an empty dict
    inputdict = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict(['https://www.example.org/'])
    # CLI args
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    # --list mode only prints URLs, so the pipeline returns nothing
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = {
        'https://httpbin.org': deque(
            [
                '/status/301',
                '/status/304',
                '/status/200',
                '/status/300',
                '/status/400',
                '/status/505',
            ]
        )
    }
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    config['DEFAULT']['SLEEP_TIME'] = '0.2'
    results = download_queue_processing(domain_dict, args, None, config)
    ## fixed: /301 missing, probably for a good reason...
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    backoffdict = {}
    testdict = {'http://test.org': deque(['/1'])}
    assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    testdict['http://test.org'] = deque(['/1'])
    # a timestamp far in the past does not block the domain
    backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    # concurrent domains
    testdict = {}
    backoffdict = {}
    testdict['http://test.org'] = deque(['/1'])
    testdict['http://example.org'] = deque(['/1'])
    # simulate recent request
    backoffdict['http://test.org'] = datetime.now()
    # must return the other domain
    test = draw_backoff_url(testdict, backoffdict, 5)
    # BUG FIX: `assert test[0], test[1] == (...)` only asserted test[0] and
    # used the comparison as the assert *message*; compare the tuple instead
    assert (test[0], test[1]) == ('http://example.org/1', {'http://test.org': deque(['/1'])})
    assert test[2] != {}
    # sleeps and returns the rest
    assert draw_backoff_url(testdict, backoffdict, 1) == ('http://test.org/1', {}, {})
    # code hangs, logical:
    #testdict['http://test.org'] = deque(['/1'])
    #backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    # download buffer: one URL drawn per distinct domain
    domain_dict = {'https://test.org': deque(['/1', '/2', '/3']),
                   'https://test2.org': deque(['/1', '/2', '/3']),
                   'https://test3.org': deque(['/1', '/2', '/3']),
                   'https://test4.org': deque(['/1', '/2', '/3']),
                   'https://test5.org': deque(['/1', '/2', '/3']),
                   'https://test6.org': deque(['/1', '/2', '/3'])}
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=1)
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=2)
    assert len(bufferlist) == 6
def test_cli_pipeline(): '''test command-line processing pipeline''' # straight command-line input #testargs = ['', '<html><body>Text</body></html>'] #with patch.object(sys, 'argv', testargs): # args = cli.parse_args(testargs) #f = io.StringIO() #with redirect_stdout(f): # cli.process_args(args) #assert len(f.getvalue()) == 0 # test URL listing testargs = ['', '--list'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) assert cli_utils.url_processing_pipeline(args, dict()) is None # test conversion and storage inputdict = cli.convert_inputlist(None, ['ftps://www.example.org/'], None, None) assert inputdict == dict() inputdict = cli.convert_inputlist(None, ['https://www.example.org/'], None, None) assert cli_utils.url_processing_pipeline(args, inputdict) is None # test inputlist + blacklist testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) my_urls = cli_utils.load_input_urls(args.inputfile) assert my_urls is not None and len(my_urls) == 2 testargs = [ '', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist', os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived' ] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) assert args.blacklist is not None # test backoff between domain requests inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None, None) reftime = datetime.now() cli_utils.url_processing_pipeline(args, inputdict) delta = (datetime.now() - reftime).total_seconds() assert delta > 2 # test blacklist and empty dict args.blacklist = cli_utils.load_blacklist(args.blacklist) assert len(args.blacklist) == 2 inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None, None) cli_utils.url_processing_pipeline(args, inputdict) # test backup testargs = ['', '--backup-dir', '/tmp/'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) 
cli_utils.archive_html('00Test', args) # test date-based exclusion testargs = ['', '-out', 'xml', '--with-metadata'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f: teststring = f.read() assert cli.examine(teststring, args) is None # test JSON output testargs = ['', '-out', 'json'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f: teststring = f.read() assert cli.examine(teststring, args) is not None # dry-run file processing pipeline testargs = ['', '--parallel', '1', '--inputdir', '/dev/null'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) cli_utils.file_processing_pipeline(args) # file processing pipeline on resources/ args.inputdir = RESOURCES_DIR cli_utils.file_processing_pipeline(args) # sitemaps testargs = ['', '--sitemap', 'https://httpbin.org/', '--list'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) f = io.StringIO() with redirect_stdout(f): cli.process_args(args) assert len(f.getvalue()) == 0 # config file testargs = [ '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg' ] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f: teststring = f.read() args.config_file = os.path.join(RESOURCES_DIR, args.config_file) config = use_config(filename=args.config_file) assert cli.examine(teststring, args) is None # CLI options testargs = ['', '--links', '--images'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs)