def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
    """Download the full site for a link using wget.

    Args:
        link_dir: Directory wget is run in (passed as ``cwd``). It is also
            checked for an existing ``link['domain']`` entry to decide
            whether the download can be skipped.
        link: Mapping that provides at least the ``'domain'`` and ``'url'``
            keys; it is also handed to ``html_appended_url``.
        requisites: When truthy, ``--page-requisites`` and
            ``--convert-links`` are added to the wget command line.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.

    Returns:
        ``{'output': ..., 'status': 'skipped'}`` when
        ``link_dir/link['domain']`` already exists. Otherwise
        ``{'cmd': CMD, 'output': output}``, where ``output`` is the result
        of ``html_appended_url(link)`` if wget ran to completion, or the
        caught exception instance if running it raised.
    """
    import shlex  # local import: only used to render the copy-paste hint

    if os.path.exists(os.path.join(link_dir, link['domain'])):
        return {'output': html_appended_url(link), 'status': 'skipped'}

    # Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = ['wget', '--timestamping', '--adjust-extension', '--no-parent']
    # Honour the caller-supplied flag rather than the module-level default.
    if requisites:
        CMD += ['--page-requisites', '--convert-links']
    if WGET_USER_AGENT:
        # The command is run without a shell, so the value must not be
        # wrapped in quotes: they would be sent as part of the User-Agent.
        CMD.append('--user-agent={}'.format(WGET_USER_AGENT))
    if not CHECK_SSL_VALIDITY:
        CMD.append('--no-check-certificate')
    CMD.append(link['url'])

    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)
        end()
        output = html_appended_url(link)
        if result.returncode > 0:
            # A non-zero exit status is reported but deliberately not
            # treated as a failure; the output is still returned.
            print(' got wget response code {}:'.format(result.returncode))
            raw_output = result.stderr or result.stdout
            # errors='replace' keeps undecodable bytes from raising here.
            last_lines = raw_output.decode(errors='replace').rsplit('\n', 10)[-10:]
            print('\n'.join(' ' + line for line in last_lines if line.strip()))
    except Exception as e:
        # NOTE(review): end() may already have run above if the failure
        # happened after the subprocess returned -- assumes the callback
        # returned by progress() tolerates a second call; confirm.
        end()
        # Quote every argument so the hint can be pasted into a shell even
        # when the URL or user agent contains spaces, '&', '?', etc.
        shell_cmd = ' '.join(shlex.quote(arg) for arg in CMD)
        print(' Run to see full output:', 'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """Take a screenshot of a link's URL using chrome --headless.

    Args:
        link_dir: Directory chrome is run in (passed as ``cwd``). It is also
            checked for an existing ``screenshot.png``.
        link: Mapping that provides at least the ``'type'`` and ``'url'``
            keys; it is also handed to ``html_appended_url`` for PDF/image
            links.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.
        user_data_dir: Forwarded to ``chrome_headless(user_data_dir=...)``.
        resolution: Value formatted into chrome's ``--window-size=`` flag.

    Returns:
        ``{'output': html_appended_url(link)}`` when ``link['type']`` is
        ``'PDF'`` or ``'image'``;
        ``{'output': 'screenshot.png', 'status': 'skipped'}`` when the file
        already exists in ``link_dir``; otherwise
        ``{'cmd': CMD, 'output': output}``, where ``output`` is
        ``'screenshot.png'`` on success or the caught exception instance on
        failure.
    """
    import shlex  # local import: only used to render the copy-paste hint

    if link['type'] in ('PDF', 'image'):
        return {'output': html_appended_url(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)
        end()
        if result.returncode:
            # errors='replace' keeps undecodable bytes from raising a
            # UnicodeDecodeError that would hide the real failure below.
            print(' ', (result.stderr or result.stdout).decode(errors='replace'))
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        # NOTE(review): end() runs a second time here when the failure was
        # raised after the subprocess returned -- assumes the callback
        # returned by progress() tolerates repeated calls; confirm.
        end()
        # Quote every argument so the hint can be pasted into a shell even
        # when the URL contains '&', '?', spaces, etc.
        shell_cmd = ' '.join(shlex.quote(arg) for arg in CMD)
        print(' Run to see full output:', 'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """Print a link's URL to a PDF file using chrome --headless.

    Args:
        link_dir: Directory chrome is run in (passed as ``cwd``). It is also
            checked for an existing ``output.pdf``.
        link: Mapping that provides at least the ``'type'`` and ``'url'``
            keys; it is also handed to ``html_appended_url`` for PDF/image
            links.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.
        user_data_dir: Forwarded to ``chrome_headless(user_data_dir=...)``.

    Returns:
        ``{'output': html_appended_url(link)}`` when ``link['type']`` is
        ``'PDF'`` or ``'image'``;
        ``{'output': 'output.pdf', 'status': 'skipped'}`` when the file
        already exists in ``link_dir``; otherwise
        ``{'cmd': CMD, 'output': output}``, where ``output`` is
        ``'output.pdf'`` on success or the caught exception instance on
        failure.
    """
    import shlex  # local import: only used to render the copy-paste hint

    if link['type'] in ('PDF', 'image'):
        return {'output': html_appended_url(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)
        end()
        if result.returncode:
            # errors='replace' keeps undecodable bytes from raising a
            # UnicodeDecodeError that would hide the real failure below.
            print(' ', (result.stderr or result.stdout).decode(errors='replace'))
            raise Exception('Failed to print PDF')
        # NOTE(review): fetch_screenshot chmods its output file after a
        # successful run; output.pdf is left untouched here -- confirm
        # whether that difference is intentional.
        output = 'output.pdf'
    except Exception as e:
        # NOTE(review): end() runs a second time here when the failure was
        # raised after the subprocess returned -- assumes the callback
        # returned by progress() tolerates repeated calls; confirm.
        end()
        # Quote every argument so the hint can be pasted into a shell even
        # when the URL contains '&', '?', spaces, etc.
        shell_cmd = ' '.join(shlex.quote(arg) for arg in CMD)
        print(' Run to see full output:', 'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }