def fetch_action(namespace):

    # Are we resuming?
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file, namespace.column)

    filename_pos = input_headers.index(namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Output headers
    output_headers = (list(input_headers) if not selected_pos else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)
    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading the existing report to know what needs to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(
            resuming_reader,
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]
            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, line, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):

        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index,
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (
            pair for pair in target_iterator
            if not already_done.stateful_contains(pair[0])
        )

    multithreaded_iterator = multithreaded_fetch(
        target_iterator,
        key=url_key,
        request_args=request_args,
        threads=namespace.threads,
        throttle=namespace.throttle
    )

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:
            write_output(line_index, line)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line)
                        )
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(encoding if encoding is not None else 'utf-8', errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
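
# A minimal sketch (hypothetical helpers, not minet's actual implementation) of
# the behaviour the header loops above assume: parse_http_header is expected to
# split a raw "Name: value" string into a (key, value) pair, and each pair is
# stored under its own key in the global_headers dict.

def _parse_http_header_sketch(header):
    # Split on the first colon only, so values containing ':' are preserved
    key, _, value = header.partition(':')
    return key.strip(), value.strip()


def _build_global_headers_sketch(raw_headers):
    # e.g. ['User-Agent: minet', 'Accept: text/html']
    # -> {'User-Agent': 'minet', 'Accept': 'text/html'}
    headers = {}

    for raw in raw_headers:
        k, v = _parse_http_header_sketch(raw)
        headers[k] = v

    return headers
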
def fetch_action(cli_args, resolve=False, defer=None):

    # If we are hitting a single url we enable contents_in_report by default
    if not resolve and isinstance(cli_args.file, StringIO) and cli_args.contents_in_report is None:
        cli_args.contents_in_report = True

    if not resolve and cli_args.contents_in_report and cli_args.compress:
        raise InvalidArgumentsError('Cannot both --compress and output --contents-in-report!')

    # HTTP method
    http_method = cli_args.method

    # Cookie grabber
    get_cookie = None
    if cli_args.grab_cookies:
        get_cookie = grab_cookies(cli_args.grab_cookies)

    # Global headers
    global_headers = None
    if cli_args.headers:
        global_headers = {}

        for header in cli_args.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Resume listener
    skipped_rows = 0
    resuming_reader_loading = None

    if cli_args.resume and cli_args.output.can_resume():
        resuming_reader_loading = LoadingBar(
            desc='Resuming',
            unit='line'
        )

        def output_read_listener(event, row):
            nonlocal skipped_rows

            if event != 'output.row':
                return

            skipped_rows += 1
            resuming_reader_loading.update()

        cli_args.output.listener = output_read_listener

    if resolve:
        additional_headers = RESOLVE_ADDITIONAL_HEADERS
    else:
        additional_headers = FETCH_ADDITIONAL_HEADERS

    if cli_args.contents_in_report:
        additional_headers = additional_headers + ['raw_contents']

    # Enricher
    multiplex = None
    if cli_args.separator is not None:
        multiplex = (cli_args.column, cli_args.separator)

    enricher = casanova.threadsafe_enricher(
        cli_args.file,
        cli_args.output,
        add=additional_headers,
        keep=cli_args.select,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES,
        multiplex=multiplex
    )

    if resuming_reader_loading is not None:
        resuming_reader_loading.close()

    if cli_args.column not in enricher.headers:
        raise InvalidArgumentsError(
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        )

    url_pos = enricher.headers[cli_args.column]

    filename_pos = None

    if not resolve and cli_args.filename is not None:
        if cli_args.filename not in enricher.headers:
            raise InvalidArgumentsError(
                'Could not find the "%s" column containing the filenames in the given CSV file.' % cli_args.filename
            )

        filename_pos = enricher.headers[cli_args.filename]

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching pages',
        total=enricher.total,
        unit='url',
        initial=skipped_rows
    )

    # NOTE: with multithreaded execution it could be dangerous not to close it ourselves
    defer(loading_bar.close)

    def update_loading_bar(result):
        nonlocal errors

        if result.error is not None:
            errors += 1
        else:
            if resolve:
                status = result.stack[-1].status
            else:
                status = result.response.status

            if status >= 400:
                status_codes[status] += 1

        stats = {'errors': errors}

        for code, count in status_codes.most_common(1):
            stats[str(code)] = count

        loading_bar.update_stats(**stats)
        loading_bar.update()

    only_shortened = getattr(cli_args, 'only_shortened', False)

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        if only_shortened and not is_shortened_url(url):
            return

        # Url templating
        if cli_args.url_template:
            return cli_args.url_template.format(value=url)

        return url

    def request_args(domain, url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    # Worker callback internals
    filename_builder = None
    files_writer = None

    if not resolve:
        try:
            filename_builder = FilenameBuilder(
                folder_strategy=cli_args.folder_strategy,
                template=cli_args.filename_template
            )
        except TypeError:
            die([
                'Invalid "%s" --folder-strategy!' % cli_args.folder_strategy,
                'Check the list at the end of the command help:',
                ' $ minet fetch -h'
            ])

        files_writer = ThreadSafeFilesWriter(cli_args.output_dir)

    def worker_callback(result):

        # NOTE: at this point the callback is only fired on success
        row = result.item[1]
        response = result.response
        meta = result.meta

        if not cli_args.keep_failed_contents and response.status != 200:
            return

        # First we need to build a filename
        filename_cell = row[filename_pos] if filename_pos is not None else None

        formatter_kwargs = {}

        if cli_args.filename_template and 'line' in cli_args.filename_template:
            formatter_kwargs['line'] = enricher.wrap(row)

        try:
            filename = filename_builder(
                result.resolved,
                filename=filename_cell,
                ext=meta.get('ext'),
                formatter_kwargs=formatter_kwargs,
                compressed=cli_args.compress
            )
        except FilenameFormattingError as e:
            result.error = e
            return

        meta['filename'] = filename

        # Decoding the response data?
        is_text = meta.get('is_text', False)
        original_encoding = meta.get('encoding', 'utf-8')

        data = response.data
        binary = True

        if is_text and (cli_args.standardize_encoding or cli_args.contents_in_report):
            data = data.decode(original_encoding, errors='replace')
            binary = False

            if cli_args.contents_in_report:
                meta['decoded_contents'] = data

        # Writing the file?
        # TODO: specify what should happen when contents are empty (e.g. POST queries)
        if data and not cli_args.contents_in_report:
            files_writer.write(
                filename,
                data,
                binary=binary,
                compress=cli_args.compress
            )

    def write_fetch_output(index, row, resolved=None, status=None, error=None,
                           filename=None, encoding=None, mimetype=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            mimetype or '',
            encoding or ''
        ]

        if cli_args.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    def write_resolve_output(index, row, resolved=None, status=None, error=None,
                             redirects=None, chain=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            redirects or '',
            chain or ''
        ]

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    common_kwargs = {
        'key': url_key,
        'insecure': cli_args.insecure,
        'threads': cli_args.threads,
        'throttle': cli_args.throttle,
        'domain_parallelism': cli_args.domain_parallelism,
        'max_redirects': cli_args.max_redirects,
        'wait': False,
        'daemonic': True
    }

    if cli_args.timeout is not None:
        common_kwargs['timeout'] = cli_args.timeout

    # Normal fetch
    if not resolve:

        multithreaded_iterator = multithreaded_fetch(
            enricher,
            request_args=request_args,
            callback=worker_callback,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:
                write_fetch_output(index, row)
                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:
                meta = result.meta

                # Final url target
                resolved_url = result.resolved

                if resolved_url == result.url:
                    resolved_url = None

                # Reporting in output
                write_fetch_output(
                    index,
                    row,
                    resolved=resolved_url,
                    status=result.response.status,
                    filename=meta.get('filename'),
                    encoding=meta.get('encoding'),
                    mimetype=meta.get('mimetype'),
                    data=meta.get('decoded_contents')
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                resolved = None

                if isinstance(result.error, InvalidURLError):
                    resolved = result.error.url

                if isinstance(result.error, FilenameFormattingError):
                    loading_bar.print(report_filename_formatting_error(result.error))

                write_fetch_output(
                    index,
                    row,
                    error=error_code,
                    resolved=resolved
                )

    # Resolve
    else:

        multithreaded_iterator = multithreaded_resolve(
            enricher,
            resolve_args=request_args,
            follow_meta_refresh=cli_args.follow_meta_refresh,
            follow_js_relocation=cli_args.follow_js_relocation,
            infer_redirection=cli_args.infer_redirection,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:
                write_resolve_output(index, row)
                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:

                # Reporting in output
                last = result.stack[-1]

                write_resolve_output(
                    index,
                    row,
                    resolved=last.url,
                    status=last.status,
                    redirects=len(result.stack) - 1,
                    chain='|'.join(step.type for step in result.stack)
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                write_resolve_output(
                    index,
                    row,
                    error=error_code,
                    redirects=(len(result.stack) - 1) if result.stack else None,
                    chain='|'.join(step.type for step in result.stack) if result.stack else None
                )
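
# A minimal sketch of the contract the cli_args-based fetch_action above relies
# on: ThreadSafeFilesWriter is called from several worker threads at once via
# worker_callback, so write(filename, data, binary=..., compress=...) must be
# safe to invoke concurrently. This stand-in is a hypothetical illustration,
# not minet's actual class.

import gzip
import os
import threading
from os.path import dirname, join


class _ThreadSafeFilesWriterSketch(object):
    def __init__(self, root_directory):
        self.root_directory = root_directory
        self.lock = threading.Lock()

    def write(self, filename, data, binary=True, compress=False):
        path = join(self.root_directory, filename)

        if compress:
            # gzip.compress expects bytes, so decoded text is re-encoded first
            data = gzip.compress(data if binary else data.encode('utf-8'))
            binary = True

        with self.lock:
            directory = dirname(path)

            if directory:
                os.makedirs(directory, exist_ok=True)

            with open(path, 'wb' if binary else 'w') as f:
                f.write(data)
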
def fetch_action(namespace):

    # Are we resuming?
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS + (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.' % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, row, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(
        enricher,
        key=url_key,
        request_args=request_args,
        **fetch_kwargs
    )

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:
            write_output(index, row)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row)
                        )
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(encoding if encoding is not None else 'utf-8', errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()
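
# Note on the encoding branches above: the explicit parentheses in
# `data and (standardize_encoding or contents_in_report)` matter, because
# Python would otherwise parse `data and standardize_encoding or
# contents_in_report` as `(data and standardize_encoding) or contents_in_report`
# and enter the branch even when there is no data to decode. A small
# self-contained illustration (hypothetical helper, not minet code):

def _should_decode(data, standardize_encoding, contents_in_report):
    # Only decode when there actually is response data
    return bool(data and (standardize_encoding or contents_in_report))


assert _should_decode(b'<html></html>', False, True) is True
assert _should_decode(None, False, True) is False
assert ((None and False) or True) is True  # the unparenthesized grouping
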