def grab_facebook_cookie(source):
    """Resolve a Facebook authentication cookie.

    `source` is either a browser name ('firefox' or 'chrome'), in which case
    the cookie is extracted from that browser's store for FACEBOOK_URL, or the
    raw cookie string itself. Returns the fixed cookie, or None when no usable
    cookie could be obtained.
    """
    if source in ('firefox', 'chrome'):
        cookie_getter = grab_cookies(source)

        # Browser store could not be read at all
        if cookie_getter is None:
            return None

        cookie = cookie_getter(FACEBOOK_URL + '/')
    else:
        # The user passed the cookie directly
        cookie = source.strip()

    return fix_cookie(cookie) if cookie else None
def grab_facebook_cookie(namespace):
    """Resolve a Facebook authentication cookie from CLI arguments.

    `namespace.cookie` is either a browser name ('firefox' or 'chrome') from
    which to extract the cookie, or the raw cookie string itself. Dies with an
    explanatory message when no usable cookie can be obtained; otherwise
    returns the fixed cookie.
    """
    target = namespace.cookie

    if target in ('firefox', 'chrome'):
        cookie_getter = grab_cookies(target)

        # Browser store could not be read at all
        if cookie_getter is None:
            die('Could not extract cookies from %s.' % target)

        cookie = cookie_getter(FACEBOOK_URL + '/')
    else:
        # The user passed the cookie directly
        cookie = target.strip()

    if not cookie:
        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook pages.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    return fix_cookie(cookie)
def fetch_action(namespace):
    """Fetch every url of a CSV file multithreaded and write a report.

    Reads urls from `namespace.file`/`namespace.column`, fetches them with
    `multithreaded_fetch`, optionally writes each response body to disk under
    `namespace.output_dir`, and appends fetch metadata (and optionally the raw
    content) to the output CSV report. Supports resuming a previous run via
    `--resume` when an output file is given.
    """

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file, namespace.column)

    filename_pos = input_headers.index(namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)

            # FIX: was `global_headers = v`, which clobbered the dict with
            # the header's value string instead of accumulating k -> v pairs.
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers)
                      if not selected_pos
                      else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    # 'r+' so we can read the existing report back before appending to it
    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading report to know what need to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(
            resuming_reader,
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]
            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        """Extract (and optionally template) the url from an enumerated line."""
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        """Build per-request kwargs (method, cookie, headers) for a url."""
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None
        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, line, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        """Append one result row (input fields + fetch metadata) to the report."""
        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index,
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (
            pair for pair in target_iterator
            if not already_done.stateful_contains(pair[0])
        )

    multithreaded_iterator = multithreaded_fetch(
        target_iterator,
        key=url_key,
        request_args=request_args,
        threads=namespace.threads,
        throttle=namespace.throttle
    )

    for result in multithreaded_iterator:
        line_index, line = result.item

        # Lines without a usable url are reported empty and skipped
        if not result.url:
            write_output(line_index, line)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line)
                        )
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            # FIX: parenthesized the condition. It previously parsed as
            # `(data and standardize_encoding) or contents_in_report`, so with
            # contents_in_report set and no data the branch was entered and
            # `data.decode` could be called on None.
            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace'
                    )
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)
            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
def fetch_action(namespace):
    """Fetch every url of a CSV file multithreaded and write a report.

    Uses a casanova threadsafe enricher to read input rows and append fetch
    metadata (and optionally the raw content) to the output report, fetching
    with `multithreaded_fetch`. Response bodies may be written to disk under
    `namespace.output_dir`. Supports resuming a previous run via `--resume`
    when an output file is given.
    """

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)

            # FIX: was `global_headers = v`, which clobbered the dict with
            # the header's value string instead of accumulating k -> v pairs.
            global_headers[k] = v

    # 'r+' so the enricher can read the existing report back before appending
    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        def listener(event, row):
            """Track progress of the enricher's resume phase."""
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS + (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.' % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    # Loading bar
    # FIX: created before `enricher.resume()` because the resume listener
    # above references `loading_bar` on 'resume.input' events; previously it
    # was created afterwards, causing a NameError during resume.
    total = namespace.total

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    def url_key(item):
        """Extract (and optionally template) the url from an (index, row) pair."""
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        """Build per-request kwargs (method, cookie, headers) for a url."""
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None
        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, row, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        """Append one result row's fetch metadata to the report."""
        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(
        enricher,
        key=url_key,
        request_args=request_args,
        **fetch_kwargs
    )

    for result in multithreaded_iterator:
        index, row = result.item

        # Rows without a usable url are reported empty and skipped
        if not result.url:
            write_output(index, row)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row)
                        )
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            # FIX: parenthesized the condition. It previously parsed as
            # `(data and standardize_encoding) or contents_in_report`, so with
            # contents_in_report set and no data the branch was entered and
            # `data.decode` could be called on None.
            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace'
                    )
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)
            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()