def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
    """Download the full site for a link using wget.

    Skips the download when the domain folder and the expected output
    file already exist.  Returns {'cmd': ..., 'output': ...} where
    output is either the relative path to the saved index file or the
    exception that aborted the download.
    """
    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    CMD = [
        # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
        *'wget -N -E -np -x -H -k -K -S --restrict-file-names=unix'.split(' '),
        # BUGFIX: honor the requisites parameter instead of always reading
        # the module-level FETCH_WGET_REQUISITES default
        *(('-p',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        # +1s so wget's own --timeout handling can fire before we kill it
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        # Check for common failure cases
        if result.returncode > 0:
            print('        got wget response code {}:'.format(result.returncode))
            # returncode 8 (server error) is common and noisy; skip the log dump
            if result.returncode != 8:
                print('\n'.join(
                    '          ' + line
                    for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:]
                    if line.strip()
                ))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            if result.returncode == 4:
                raise Exception('Failed wget download')
    except Exception as e:
        end()
        print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""
    # PDF and image links are saved verbatim by wget; nothing to print here
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    pdf_path = os.path.join(link_dir, 'output.pdf')
    if os.path.exists(pdf_path):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [*chrome_headless(user_data_dir=user_data_dir), '--print-to-pdf', link['url']]

    end = progress(timeout, prefix='      ')
    try:
        proc = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
        end()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""
    # PDF and image links are saved verbatim by wget; nothing to screenshot
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),  # chrome expects milliseconds
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        # BUGFIX: grant 1s of grace beyond chrome's own --timeout before
        # killing the subprocess (was timeout, racing chrome's shutdown;
        # timeout + 1 matches fetch_pdf)
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # screenshot.png
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-html"""
    # PDF and image links are saved verbatim by wget; no DOM to dump
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')
    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),  # chrome expects milliseconds
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        # chrome writes the DOM to stdout; stream it straight into the file
        with open(output_path, 'w+') as f:
            # BUGFIX: grant 1s of grace beyond chrome's own --timeout before
            # killing the subprocess (was timeout; timeout + 1 matches fetch_pdf)
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.html
        end()
        if result.returncode:
            print('     ', (result.stderr).decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION, firefox_profile=FIREFOX_PROFILE):
    """take screenshot of site using chrome --headless"""
    # PDF and image links are saved verbatim by wget; nothing to screenshot
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    screenshot_file = os.path.join(link_dir, 'screenshot.png')
    if os.path.exists(screenshot_file):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [*firefox_headless(profile=firefox_profile), '--screenshot', link['url']]

    end = progress(timeout, prefix='      ')
    try:
        proc = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # sreenshot.png
        end()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """Download the full site for a link using wget (optionally recording a WARC).

    Skips the download when the domain folder and the expected output
    file already exist.  Returns {'cmd': ..., 'output': ...} where
    output is either the relative path to the saved file or the
    exception that aborted the download.
    """
    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        # timestamped WARC filename so repeated pulls don't clobber each other
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        'wget',
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        # --timestamping is incompatible with WARC output
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        # BUGFIX: honor the requisites parameter instead of always reading
        # the module-level FETCH_WGET_REQUISITES default
        *(('--page-requisites',) if requisites else ()),
        # BUGFIX: no literal quotes around the UA -- args are passed to wget
        # directly (no shell), so the quotes became part of the UA string
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        # Check for common failure cases
        if result.returncode > 0:
            print('        Got wget response code {}:'.format(result.returncode))
            print('\n'.join(
                '          ' + line
                for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
                if line.strip()
            ))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        print('        {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """Download the full site for a link using wget (optionally recording a WARC).

    Skips the download when the domain folder and the expected output
    file already exist.  A non-zero wget exit is tolerated when at least
    one file was downloaded (partial success).  Returns
    {'cmd': ..., 'output': ...} where output is either the relative path
    to the saved file or the exception that aborted the download.
    """
    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        # timestamped WARC filename so repeated pulls don't clobber each other
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        # --timestamping is incompatible with WARC output
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        # BUGFIX: honor the requisites parameter instead of always reading
        # the module-level FETCH_WGET_REQUISITES default
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [
            '          ' + line
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        # BUGFIX: guard against empty output before indexing the last line
        files_downloaded = (int(output_tail[-1].strip().split(' ', 2)[1] or 0)
                            if output_tail and 'Downloaded:' in output_tail[-1]
                            else 0)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print('        Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        # to let the user copy-paste the command and run it safely we have
        # to quote some of the arguments that could have spaces in them
        quoted_cmd = ' '.join(CMD)
        # BUGFIX: only substitute when the value is non-empty --
        # str.replace('', '""') would insert quotes between every character
        if WGET_USER_AGENT:
            quoted_cmd = quoted_cmd.replace(WGET_USER_AGENT, '"{}"'.format(WGET_USER_AGENT))
        if COOKIES_FILE:
            quoted_cmd = quoted_cmd.replace(COOKIES_FILE, '"{}"'.format(COOKIES_FILE))
        print('        {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(quoted_cmd))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }