def process(self, item):
    digests = {}
    input_filename = "%(item_dir)s/%(warc_file_base)s.warc" % item
    output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item
    with open(input_filename, 'rb') as f_in, \
            open(output_filename, 'wb') as f_out:
        writer = WARCWriter(filebuf=f_out, gzip=True)
        for record in ArchiveIterator(f_in):
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url is not None and url.startswith('<'):
                url = re.search('^<(.+)>$', url).group(1)
                record.rec_headers.replace_header('WARC-Target-URI', url)
            if record.rec_headers.get_header('WARC-Type') == 'response':
                digest = record.rec_headers.get_header('WARC-Payload-Digest')
                if digest in digests:
                    writer.write_record(
                        self._record_response_to_revisit(writer, record,
                                                         digests[digest])
                    )
                else:
                    digests[digest] = (
                        record.rec_headers.get_header('WARC-Record-ID'),
                        record.rec_headers.get_header('WARC-Date'),
                        record.rec_headers.get_header('WARC-Target-URI')
                    )
                    writer.write_record(record)
            elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                record.rec_headers.replace_header('WARC-Filename', output_filename)
                writer.write_record(record)
            else:
                writer.write_record(record)
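# A minimal sketch of what the _record_response_to_revisit helper referenced
# above might look like (its body is not shown in this snippet). It assumes
# warcio's create_revisit_record and the (record_id, date, uri) tuple stored
# in `digests`, mirroring how mergeWarc builds revisit records further below.
def _record_response_to_revisit(self, writer, record, original):
    orig_id, orig_date, orig_uri = original
    digest = record.rec_headers.get_header('WARC-Payload-Digest')
    revisit = writer.create_revisit_record(
        record.rec_headers.get_header('WARC-Target-URI'),
        digest=digest,
        refers_to_uri=orig_uri,
        refers_to_date=orig_date,
        http_headers=record.http_headers)
    revisit.rec_headers.add_header('WARC-Refers-To', orig_id)
    return revisit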
def run(self):
    with open(self.warcfile, 'ab') as output:
        while True:
            self.lock.acquire()
            data = self.out_queue.get()
            writer = WARCWriter(output, gzip=False)
            headers_list = data[0]
            http_headers = StatusAndHeaders('{} {}'.format(data[3], data[4]),
                                            headers_list, protocol='HTTP/1.0')
            record = writer.create_warc_record(data[2], 'response',
                                               payload=data[1],
                                               http_headers=http_headers)
            h = hashlib.sha1()
            h.update(record.raw_stream.read(BLOCK_SIZE))
            if self.dedup.lookup(h.hexdigest()):
                record = writer.create_warc_record(data[2], 'revisit',
                                                   http_headers=http_headers)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()
            else:
                self.dedup.save(h.hexdigest(), data[2])
                record.raw_stream.seek(0)
                writer.write_record(record)
                self.out_queue.task_done()
                self.lock.release()
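# Hypothetical producer side for the out_queue consumed by run() above; the
# (headers_list, payload, url, status_code, reason) tuple layout matches the
# data[0..4] indexing in run(). The function name and the requests-based fetch
# are illustrative assumptions, not part of the original code.
from io import BytesIO
import requests

def enqueue_response(out_queue, url):
    resp = requests.get(url, headers={'Accept-Encoding': 'identity'}, stream=True)
    out_queue.put((resp.raw.headers.items(), BytesIO(resp.content),
                   url, resp.status_code, resp.reason))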
resp = requests.get('http://example.com/',
                    headers={'Accept-Encoding': 'identity'}, stream=True)

# get raw headers from urllib3
headers_list = resp.raw.headers.items()
http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
print(resp.raw)

record = writer.create_warc_record('http://example.com/', 'response',
                                   payload=resp.raw,
                                   http_headers=http_headers)
writer.write_record(record)
# quit()

all_posts = []
for post in facebook_scraper.get_posts(442978589179108, extra_info=True,
                                       pages=1, timeout=20):
    print(post['text'][:40])
    all_posts.append(post)
print(all_posts)
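# A quick hedged sanity check for files produced by the snippet above: re-open
# the archive with warcio's ArchiveIterator and list record types and target
# URIs. 'example.warc.gz' stands in for whatever file the writer was opened on.
from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as stream:
    for rec in ArchiveIterator(stream):
        print(rec.rec_type, rec.rec_headers.get_header('WARC-Target-URI'))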
class CCWARCWriter:
    def __init__(self, prefix, max_size, subprefix=None, gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
        self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self, version, warcheader_version, ip,
                            description=None, creator=None, operator=None):
        '''
        creator:  # person, organization, service
        operator: # person, if creator is an organization
        isPartOf: # name of the crawl
        '''
        info = OrderedDict()
        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None, to minimize open
        filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        # write it out even if empty
        # TODO: we filter the addresses early; should we warc the unfiltered
        # dns response?
        # the response object doesn't contain the query type 'A' or 'AAAA',
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6
        ttl = int(ttl)
        host = url.hostname
        if self.writer is None:
            self.open()
        payload = timestamp_now() + '\r\n'
        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                # pass the values as logging arguments, not as extra
                # positional args, so the message formats correctly
                LOGGER.info('problem converting dns reply for warcing: %s %s %s',
                            host, r, e)
        payload = payload.encode('utf-8')
        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                # always emit a Content-Length that matches the actual body
                ret.append((b'Content-Length', str(body_len)))
            elif hl == b'content-encoding':
                if decompressed:
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            elif hl == b'transfer-encoding':
                if v.lower() == b'chunked':
                    # aiohttp always undoes chunking
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self, url, ip, req_headers, resp_headers,
                                    is_truncated, payload, digest=None,
                                    decompressed=False):
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)
        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers, len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK', fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url, 'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
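# Illustrative-only usage of the CCWARCWriter defined above; the prefix, size
# limit, version strings, and IP are made-up values, not cocrawler defaults.
# Note that _fake_resp_headers compares bytes header names (b'content-length'),
# so the crawler presumably feeds bytes pairs; plain str pairs, as used here,
# simply pass through untouched.
w = CCWARCWriter('CC-TEST', max_size=10**9)
w.create_default_info('0.1', '1', '203.0.113.7', description='test crawl')
w.write_request_response_pair(
    'http://example.com/', '203.0.113.7',
    [('User-Agent', 'test')], [('Content-Type', 'text/html')],
    False, b'<html></html>')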
cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                  page_structure=False, safe_attrs_only=False)

if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output
fo.write_record(
    fo.create_warcinfo_record(filename=filename,
                              info={
                                  'software': 'bitextor/bitextor-warc2htmlwarc.py',
                                  'format': 'WARC File Format 1.0'
                              }))

for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    if (record.rec_headers.get_header('WARC-Target-URI')[0] == '<'
            and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>'):
        url = record.rec_headers.get_header('WARC-Target-URI')[1:-1]
    else:
        url = record.rec_headers.get_header('WARC-Target-URI')
    if url == "unknown":
class WarcDownloader:
    """
    Download URL with HTTP GET, save to a WARC file and return the decoded text
    """

    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0',
                 user_agent=None, overwrite_warc=True, err_threshold=10,
                 warcinfo_record_data=None, known_bad_urls=None,
                 max_no_of_calls_in_period=2, limit_period=1, proxy_url=None,
                 allow_cookies=False):
        if known_bad_urls is not None:
            # Set up the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find the next nonexistent WARC filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz
                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archive file: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}
        self._session = Session()  # Set up a session to speed up downloads

        if proxy_url is not None:  # Set SOCKS proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Set up rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(
            limits(calls=max_no_of_calls_in_period,
                   period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        # Set the error threshold which causes aborting, to prevent denial of service
        self._error_threshold = err_threshold

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD: some custom information about the WARC writer
            # program and its settings
            info_headers = {'software': program_name,
                            'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo':
                                'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate the custom headers, else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v)
                                     for k, v in warcinfo_record_data[1].items()).encode('UTF-8')
            info_record = self._writer.create_warc_record(
                '', 'warcinfo',
                warc_headers=warcinfo_record_data[0],
                payload=BytesIO(custom_headers),
                length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        # If the program opened a file, it should close it gracefully on exit!
        if hasattr(self, '_output_file'):
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
        Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many errors happened! Threshold exceeded! See log for details!')

    def download_url(self, url):
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety, URL-encode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))
        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.
                                           format(resp.status_code, resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders(
            'GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)), proto),
            reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request',
                                                      http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3

        # Must get peer_name before the content is read.
        # There is no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with Windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]
        except AttributeError:  # On Windows the socket class has no getpeername() attribute...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed, could not determine peer name...

        try:
            data = resp.raw.read()  # To be able to return decoded text and also write the WARC
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        enc = resp.encoding  # Get or detect the encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the WARC file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding, because it is not stored any other way...
        resp_record = self._writer.create_warc_record(
            url, 'response', payload=data_stream, http_headers=resp_http_headers,
            warc_headers_dict={'WARC-IP-Address': peer_name,
                               'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        self._writer.write_record(record)
class CCWARCWriter:
    def __init__(self, prefix, max_size, subprefix=None, gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
        self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self, version, ip, description=None, creator=None,
                            operator=None):
        '''
        creator:  # person, organization, service
        operator: # person, if creator is an organization
        isPartOf: # name of the crawl
        '''
        info = OrderedDict()
        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; minimizes open
        filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        # write it out even if empty
        # TODO: we filter the addresses early; should we warc the unfiltered
        # dns response?
        # the response object doesn't contain the query type 'A' or 'AAAA',
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6
        ttl = int(expires - time.time())
        host = url.hostname
        if self.writer is None:
            self.open()
        payload = timestamp_now() + '\r\n'
        for r in dns:
            try:
                payload += (host + '.\t' + str(ttl) + '\tIN\t' + kind + '\t' +
                            r['host'] + '\r\n')
            except Exception as e:
                # pass the values as logging arguments, not as extra
                # positional args, so the message formats correctly
                LOGGER.info('problem converting dns reply for warcing: %s %s %s',
                            host, r, e)
        payload = payload.encode('utf-8')
        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self, url, req_headers, resp_headers,
                                    is_truncated, payload, digest=None):
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by
        # Common Crawl) (how expensive?)
        req_http_headers = StatusAndHeaders('GET / HTTP/1.1',
                                            headers_to_str_headers(req_headers))
        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders('200 OK',
                                             headers_to_str_headers(resp_headers),
                                             protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url, 'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
class SiteCrawler(object):
    def __init__(self, priority, multi_site_crawler, seed_urls, domain, config,
                 scout=None):
        # Multi-site crawler object that manages the current crawler
        self.multi_site_crawler = multi_site_crawler
        # Concurrency lock to ensure that only one process accesses the URL
        # lists (pending, visited and attempts)
        self.url_list_concurrency_lock = Lock()
        # Concurrency lock to ensure that only one process writes the status
        # and the output WARC file
        self.file_write_concurrency_lock = Lock()

        # If verbose is True, the debugging level is set to INFO; otherwise it is ERROR
        logging.basicConfig(
            level=logging.INFO if config["verbose"] else logging.ERROR)

        # Domain corresponding to the seed URLs to be crawled
        self.domain = domain
        # Accepted TLDs in the crawl
        self.tlds = config["accepted_tlds"]
        # Set of URLs that have already been crawled
        self.visited = set()
        # Map that counts the number of times a URL was visited but could not
        # be accessed
        self.attempts = {}
        # Links that must not be re-crawled until some time has passed
        self.asleep_links = {}
        # Maximum number of attempts to visit a website and receive an error
        # before it is discarded
        self.max_attempts = config["max_attempts"]
        # Maximum depth of the folder tree in a URL
        self.max_folder_tree_depth = config["max_folder_tree_depth"]
        # Accepted content type (for example: text/html)
        self.accepted_content_type = config["accepted_content"]
        # List of regular expressions to discard URLs
        self.url_blacklist_re = config["url_blacklist"]
        # If interrupt is set to True, crawling stops
        self.interrupt = False
        self.sleep_thread = None
        # Variable that keeps the current size of the crawl
        self.crawl_size = 0.0
        # Priority of the process when added to the queue that manages all the
        # crawlers in MultiSiteCrawler
        self.priority = priority
        # Path to the file that stores the crawling state dump
        self.dumpfile = config["output_dir"] + "/" + self.domain + ".state"
        # If resuming is enabled, the previous crawling status is restored to
        # resume crawling
        if config["resume_crawling"]:
            self.load_status(pickle.load(open(self.dumpfile, 'rb')))

        # Path to the file where the WARC is written
        output_file_name = config["output_dir"] + "/" + self.domain + ".warc.gz"
        metadata_output_file_name = (config["output_dir"] + "/" + self.domain +
                                     ".metadata.gz")
        name_counter = 1
        while os.path.isfile(output_file_name):
            output_file_name = (config["output_dir"] + "/" + self.domain + "." +
                                str(name_counter) + ".warc.gz")
            metadata_output_file_name = (config["output_dir"] + "/" + self.domain + "."
                                         + str(name_counter) + ".metadata.gz")
            name_counter += 1
        f_out = open(output_file_name, 'wb')
        self.writer = WARCWriter(f_out, gzip=True)
        self.metadata_writer = gzip.open(metadata_output_file_name, "wb")

        # Scout object that will determine if the website is promising and if
        # crawling should be interrupted
        self.scout = scout
        # The user will only keep documents in these languages
        self.langs_of_interest = config["langs_of_interest"]
        # User agent of the crawl
        self.user_agent = config["user_agent"]
        # Connection timeout
        self.conn_timeout = config["connection_timeout"]
        # Setting the default crawling delay
        self.default_delay = config["crawl_delay"]

        # Init the list of pending URLs from the seed URLs; every URL is
        # checked to confirm that it can be visited
        self.pending_urls = []
        # Robots parser: it is initialised from the first valid seed URL found
        self.robots = SiteRobots(self.user_agent, self.default_delay,
                                 self.conn_timeout)
        self.url_list_concurrency_lock.acquire()
        for url in seed_urls:
            if url.is_valid():
                self.add_url_to_list(url)
        self.url_list_concurrency_lock.release()

        # Maximum crawling size for this site
        if "max_size_per_site" not in config:
            self.max_size = None
        else:
            self.max_size = config["max_size_per_site"]
        # Maximum crawling time for this site
        if "max_time_per_site" not in config:
            self.max_time = None
        else:
            self.max_time = config["max_time_per_site"]
        # Starting time of the crawl; it is used to decide when max_time is reached
        self.starts = int(time.time())
        # Time of the last connection; it is used to make sure that the delay
        # is fulfilled
        self.last_connection = self.starts - self.default_delay

    def extend_url_list(self, url_list):
        self.url_list_concurrency_lock.acquire()
        for u in url_list:
            self.add_url_to_list(u)
        self.url_list_concurrency_lock.release()

    # Adds a URL to the list of URLs to be visited during crawling; before
    # doing so, checks if it was already visited or if it infringes TLD
    # restrictions
    def add_url_to_list(self, url):
        if not url.is_valid():
            logging.info('"%s" is not a valid URL', url.get_norm_url())
        if url.get_norm_url() in self.visited or url in self.pending_urls:
            logging.info('"%s" already used before (it may be pending crawling)',
                         url.get_norm_url())
        else:
            logging.info('"%s" added to pending URLs', url.get_norm_url())
            self.pending_urls.append(url)

    def get_pending_url(self):
        url = None
        try:
            self.url_list_concurrency_lock.acquire()
            sleeping_urls = []
            while len(self.pending_urls) > 0 and url is None:
                # The next URL is picked from the list of pending URLs and is
                # added to the list of visited URLs
                tmp_url = self.pending_urls.pop()
                if (tmp_url.wait_until is not None
                        and tmp_url.wait_until > time.time()):
                    # was append(url), which would have appended None since
                    # url is still None at this point
                    sleeping_urls.append(tmp_url)
                else:
                    self.visited.add(tmp_url.get_norm_url())
                    url = tmp_url
            self.pending_urls.extend(sleeping_urls)
        finally:
            self.url_list_concurrency_lock.release()
        # threading.current_thread().name = "crawling: " + url.get_norm_url()
        return url

    def _process_link(self, link, url):
        logging.debug("\t\t" + threading.current_thread().name +
                      "--- going to process " + link.get_norm_url())
        # URLs longer than the limit set by the standard RFC 7230 are discarded
        if not link.is_valid():
            return None
        # Filter the URL using the blacklist regular expressions
        for f in self.url_blacklist_re:
            if re.search(f, link.get_norm_url()):
                return None
        if self.domain == link.get_domain():
            logging.debug("\t\t" + threading.current_thread().name +
                          "--- adding URL to list " + link.get_norm_url())
            self.url_list_concurrency_lock.acquire()
            self.add_url_to_list(link)
            self.url_list_concurrency_lock.release()
            return link
        elif link.get_tld() in self.tlds:
            self.url_list_concurrency_lock.acquire()
            if link.get_norm_url() in self.visited:
                logging.info('"%s" already used to extend the list of seed URLs',
                             link.get_norm_url())
                self.url_list_concurrency_lock.release()
            else:
                logging.info('"%s" used to extend the list of seed URLs',
                             link.get_norm_url())
                self.visited.add(link.get_norm_url())
                self.url_list_concurrency_lock.release()
                self.multi_site_crawler.extend_seed_urls(link)
            return link
        else:
            logging.info('"%s" discarded: not in the same TLD',
                         link.get_norm_url())
            return None

    def _calc_depth(self, url):
        # calculate url depth
        return len(url.replace('https', 'http').replace(self.root_url, '')
                   .rstrip('/').split('/')) - 1

    def connect_to_server(self, url):
        res = None
        conn = None  # make sure conn is defined even if the connection attempt fails
        try:
            logging.info('Connecting to: %s', url.get_norm_url())
            self.last_connection = time.time()
            # Connections are done with a delay to avoid blocking the server
            if url.get_url_parts().scheme == 'http':
                try:
                    conn = http.client.HTTPConnection(url.get_url_parts().netloc,
                                                      timeout=self.conn_timeout)
                except Exception:
                    conn = http.client.HTTPSConnection(url.get_url_parts().netloc,
                                                       timeout=self.conn_timeout)
            else:
                conn = http.client.HTTPSConnection(url.get_url_parts().netloc,
                                                   timeout=self.conn_timeout)
            logging.info('Connection obtained: %s', url.get_norm_url())
            conn.request('GET', quote(url.get_url_parts().path, '?=&%/'),
                         headers={'User-Agent': self.user_agent})
            logging.info('GET request sent: %s', url.get_norm_url())
            res = conn.getresponse()
            logging.info('Response obtained from: %s', url.get_norm_url())
        except (http.client.HTTPException, EnvironmentError):
            logging.info("HTTPException!")
            conn = None
            self.process_failed_url(url)
        except socket.timeout:
            logging.info("Socket timeout!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ssl.CertificateError:
            logging.info("CertificateError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ConnectionResetError:
            logging.info("ConnectionResetError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except Exception as ex:
            logging.info(str(ex))
            if conn is not None:
                conn.close()
        if conn is None:
            logging.info('Connection is closed')
        else:
            logging.info('Connection is correct')
        return conn, res

    # The method returns True if the response status is 2XX and the document
    # should be processed; otherwise it takes the corresponding action (manage
    # redirects or errors)
    def deal_with_response_status(self, url, response):
        if 200 <= response.status <= 226:
            return True
        elif 301 <= response.status <= 308:
            rlink = self._process_link(Link(response.getheader('location')), url)
            if rlink is not None:
                logging.info('%s Redirect: %s -> %s',
                             threading.current_thread().name,
                             url.get_norm_url(), rlink.get_norm_url())
        elif (400 <= response.status <= 407 or 409 <= response.status <= 412
              or 414 <= response.status <= 427 or 431 <= response.status):
            self.process_failed_url(url, retry=False)
        elif response.status == 408:
            self.process_failed_url(url, retry=True)
        elif response.status == 413 or response.status == 428:
            waiting_time = response.getheader('Retry-After')
            if waiting_time is None:
                url.wait_until = time.time() + 500
            else:
                url.wait_until = time.time() + int(waiting_time)
            self.process_failed_url(url, retry=True)
        else:
            self.process_failed_url(url, retry=False)
        return False

    def crawl_one_page(self):
        self.multi_site_crawler.new_running_crawler()
        url = self.get_pending_url()
        if not self.interrupt and url is not None:
            if not self.robots.fetch(url, self.max_attempts, self.domain):
                logging.info("robots.txt forbids crawling URL: %s",
                             url.get_norm_url())
                return
            logging.debug("\t" + threading.current_thread().name +
                          " >>>> Connecting " + url.get_norm_url() + "...")
            connection, server_response = self.connect_to_server(url)
            logging.debug("\t" + threading.current_thread().name +
                          "<<<< Connected " + url.get_norm_url())
            # If the response is 2XX, the web page is processed
            if server_response is not None and self.deal_with_response_status(
                    url, server_response):
                # Check the content type
                content_type = server_response.getheader('Content-Type')
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Content type: " + str(content_type))
                doc = None
                if content_type is not None and not re.search(
                        self.accepted_content_type, content_type):
                    logging.info("%s discarded: wrong file type",
                                 url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  ">>>> Extracting doc from " + url.get_norm_url())
                    doc = WebDocument(server_response, url, self.max_attempts)
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document extracted " + url.get_norm_url())
                connection.close()
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection closed: " + url.get_norm_url())
                if doc is not None:
                    if doc.utf_text:
                        links_set = doc.get_link_set()
                        # We could shuffle the links to avoid being biased by
                        # the structure of the site
                        # random.shuffle(links_set)
                        listoflinks = []
                        for li in links_set:
                            listoflinks.append(li.get_norm_url())
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Processing " + str(len(links_set)) +
                                      " links... " + url.get_norm_url() + "... " +
                                      " ".join(listoflinks))
                        for link in links_set:
                            self._process_link(link, doc.url)
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Links processed " + url.get_norm_url())
                        if doc.get_lang() is None or not doc.get_lang().is_reliable:
                            logging.info(
                                "%s discarded: language detection is not reliable",
                                url.get_norm_url())
                        elif doc.get_lang().language not in self.langs_of_interest:
                            logging.info(
                                "%s discarded: language not among languages of "
                                "interest (detected=%s)",
                                url.get_norm_url(), doc.get_lang().language)
                        else:
                            logging.debug("\t" + threading.current_thread().name +
                                          ">>>> Running scout " + url.get_norm_url())
                            self.run_scout(doc)
                            logging.debug("\t" + threading.current_thread().name +
                                          "<<<< Scout run " + url.get_norm_url())
                            # The document is written to the WARC
                            logging.debug("\t" + threading.current_thread().name +
                                          ">>>> Write document " + url.get_norm_url())
                            self.write_document(doc)
                            logging.debug("\t" + threading.current_thread().name +
                                          "<<<< Document saved " + url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document was none: " + url.get_norm_url())
            else:
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection was none")
                if connection is not None:
                    connection.close()
            if self.max_size is not None and self.crawl_size > self.max_size:
                self.interrupt_crawl()
            elif (self.max_time is not None
                  and time.time() - self.starts > self.max_time):
                # was self.crawlstarts, which is never defined; __init__ sets
                # the start time as self.starts
                self.interrupt_crawl()
            elif len(self.pending_urls) == 0:
                self.interrupt = True
        # If the crawler is allowed to continue crawling, wait until the delay
        # has passed and continue
        if not self.interrupt:
            self.sleep_thread = Thread(target=self._wait_and_queue)
            self.sleep_thread.daemon = False
            self.sleep_thread.name = self.sleep_thread.name + "_sleep"
            self.sleep_thread.start()
        else:
            self.multi_site_crawler.new_done_crawler()

    def _wait_and_queue(self):
        sleeptime = self.robots.get_delay() - (time.time() - self.last_connection)
        if sleeptime > 0:
            time.sleep(sleeptime)
        self.multi_site_crawler.crawler_ready(self)
        self.multi_site_crawler.new_done_crawler()

    # The scout is run until its recommendation is ready; once it is, the
    # scout object is deleted
    def run_scout(self, doc):
        if self.scout is not None:
            self.scout.step(doc)
            if self.scout.recommendation_ready():
                if not self.scout.recommendation_keep_crawling():
                    logging.info(
                        "Website discarded after crawling %s due to "
                        "infringement of scout rule", doc.url.get_norm_url())
                    self.interrupt = True
                else:
                    logging.info(
                        "Scout recommends to keep crawling the website after "
                        "downloading %s; langs of interest found: %s",
                        doc.url.get_norm_url(), str(self.scout.lang_evidence))
                self.scout = None

    def process_failed_url(self, url, retry=True):
        if not retry:
            self.url_list_concurrency_lock.acquire()
            self.visited.add(url.get_norm_url())
            self.url_list_concurrency_lock.release()
            logging.info('%s: the URL does not exist', url.get_norm_url())
        else:
            if url.get_norm_url() not in self.attempts:
                self.url_list_concurrency_lock.acquire()
                self.add_url_to_list(url)
                self.attempts[url.get_norm_url()] = 1
                self.visited.remove(url.get_norm_url())
                self.url_list_concurrency_lock.release()
                logging.info('%s: retrying (attempt 1)', url.get_norm_url())
            else:
                if self.attempts[url.get_norm_url()] <= self.max_attempts:
                    logging.info('%s: retrying (attempt %s)', url,
                                 str(self.attempts[url.get_norm_url()]))
                    self.url_list_concurrency_lock.acquire()
                    self.add_url_to_list(url)
                    self.attempts[url.get_norm_url()] += 1
                    self.visited.remove(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                else:
                    self.url_list_concurrency_lock.acquire()
                    del self.attempts[url.get_norm_url()]
                    self.visited.add(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                    logging.info('%s: given up after %s attempts',
                                 url.get_norm_url(), str(self.max_attempts))

    def write_document(self, doc):
        self.file_write_concurrency_lock.acquire()
        try:
            headers_list = doc.response.getheaders()
            http_headers = StatusAndHeaders('200 OK', headers_list,
                                            protocol='HTTP/1.0')
            norm_url = doc.url.get_norm_url()
            record = self.writer.create_warc_record(norm_url, 'response',
                                                    payload=io.BytesIO(doc.text),
                                                    http_headers=http_headers)
            self.writer.write_record(record)
            self.crawl_size += sys.getsizeof(doc.text) / 1000000.0
            if self.metadata_writer is not None:
                self.metadata_writer.write(
                    ("%s\t%s\t%s\n" % (doc.url.get_norm_url(),
                                       str(doc.encoding),
                                       str(doc.get_lang()))).encode())
                self.metadata_writer.flush()
        finally:
            self.file_write_concurrency_lock.release()

    def get_status_object(self):
        targets = []
        for u in self.pending_urls:
            targets.append(u.get_norm_url())
        return {
            'visited': self.visited,
            'pendingurls': targets,
            'attempts': self.attempts
        }

    def load_status(self, status_obj):
        try:
            self.file_write_concurrency_lock.acquire()
            self.visited = status_obj['visited']
            self.pending_urls = []
            for u in status_obj['pendingurls']:
                self.pending_urls.append(Link(u))
            self.attempts = status_obj['attempts']
        finally:
            self.file_write_concurrency_lock.release()

    def save_status(self):
        try:
            self.file_write_concurrency_lock.acquire()
            if self.dumpfile is not None:
                pickle.dump(self.get_status_object(), open(self.dumpfile, 'wb'))
        finally:
            self.file_write_concurrency_lock.release()

    def interrupt_crawl(self):
        try:
            self.url_list_concurrency_lock.acquire()
            self.interrupt = True
            self.save_status()
            self.metadata_writer.close()
        finally:
            self.url_list_concurrency_lock.release()

    def __hash__(self):
        return hash(self.domain)

    def one_thread_less(self):
        self.threads += 1
def mergeWarc(files, output):
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    payloadMap = {}
    writer = WARCWriter(output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #     “A ‘warcinfo’ record describes the records that follow it […]
    #     until next ‘warcinfo’”
    # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have been written by a different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
        'software': getSoftwareInfo(),
        'tool': 'crocoite-merge',  # not the name of the cli tool
        'parameters': {'inputs': files},
    }
    payload = BytesIO(json.dumps(warcinfo, indent=2).encode('utf-8'))
    record = writer.create_warc_record(
        packageUrl('warcinfo'), 'warcinfo',
        payload=payload,
        warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
    writer.write_record(record)

    for l in files:
        with open(l, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int(headers.get_header('Content-Length'))
                    dup = payloadMap.get(csum, None)
                    if dup is None:
                        payloadMap[csum] = {
                            'uri': headers.get_header('WARC-Target-URI'),
                            'id': rid,
                            'date': headers.get_header('WARC-Date')
                        }
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug(f'Record {rid} is duplicate of {dup["id"]}')
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record(
                            headers.get_header('WARC-Target-URI'),
                            digest=csum,
                            refers_to_uri=dup['uri'],
                            refers_to_date=dup['date'],
                            http_headers=record.http_headers)
                        record.rec_headers.add_header('WARC-Truncated', 'length')
                        record.rec_headers.add_header('WARC-Refers-To', dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record(record)
    json.dump(dict(
        unique=dict(records=unique, bytes=uniqueLength),
        revisit=dict(records=revisit, bytes=revisitLength),
        ratio=dict(records=unique / (unique + revisit),
                   bytes=uniqueLength / (uniqueLength + revisitLength)),
    ), sys.stdout, cls=StrJsonEncoder)
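# A minimal, assumed invocation of mergeWarc above: merge two input WARCs into
# one deduplicated archive; the input and output file names are placeholders.
# The record/byte statistics are printed to stdout as JSON.
with open('merged.warc.gz', 'wb') as out:
    mergeWarc(['crawl-a.warc.gz', 'crawl-b.warc.gz'], out)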
def facebook_user_ads(self, username, nsid, iso2c, access_token):
    assert username or nsid
    limit_per_page = 500
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)
    if nsid and access_token and iso2c:
        # start scraping
        request_url = "https://graph.facebook.com/v5.0/ads_archive"
        request_params = {
            "access_token": access_token,
            "limit": limit_per_page,
            "search_page_ids": str(nsid),
            "ad_active_status": "ALL",
            "ad_reached_countries": iso2c,  # todo
            "fields": "page_name, page_id, funding_entity, ad_creation_time, "
                      "ad_delivery_start_time, ad_delivery_stop_time, "
                      "ad_creative_body, ad_creative_link_caption, "
                      "ad_creative_link_description, ad_creative_link_title, "
                      "ad_snapshot_url, demographic_distribution, "
                      "region_distribution, impressions, spend, currency"
        }
        api_result = requests.get(request_url, params=request_params)
        print(api_result.text)

        random_token = ''.join(
            random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = (safe_string(self.message["id"]) + "-" +
                     warcprox.timestamp17() + "-" + serial_no + "-" +
                     random_token)
        # write to warc
        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                  "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s",
                     self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps"""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(api_result.json(),
                                      default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(
                "https://m.facebook.com/" + username, 'metadata',
                payload=BytesIO(json_payload),
                warc_content_type="application/json")
            writer.write_record(record)
            log.info("Writing scraped results to %s", self.warc_temp_dir)
        time.sleep(1.2)  # sleep to avoid getting blocked by the api
    else:
        log.debug("Something went wrong. Is some information missing? "
                  "Access token is: %s, iso2c is: %s",
                  str(access_token), str(iso2c))
def facebook_user_bio(self, username):
    """Scrapes a Facebook bio and returns info on the information contained on
    the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)
    @param username: Facebook username
    @return: a dictionary of account attributes
    """
    user_email_fb = self.message['credentials']['user_email_fb']
    user_password_fb = self.message['credentials']['user_password_fb']
    # ensure the username is clean and can be accessed
    if username.startswith("https://www.facebook.com/") or username.startswith(
            "http://www.facebook.com/"):
        username = re.sub(r'^.+facebook\.com\/', '', username)
        # possibly also remove a trailing /
        username = re.sub(r'\/$', '', username)
    # created at field
    fb_general = base_fb_url + username
    # bio info
    fb_about = base_fb_url + username + "/about/?ref=page_internal"
    # site transparency (e.g. admins)
    m_fb_general = "http://m.facebook.com/" + username
    # request the html
    r = requests.get(fb_general)
    # ensure no 404's
    if not r:
        log.debug("Couldn't access profile site: %s", fb_general)
        return
    soup = BeautifulSoup(r.content, "html.parser")
    # scrape the creation date
    created_at = soup.find('div', {"class": "_3qn7"})
    created_at = created_at.select_one("span").text
    # strip the "Seite erstellt" label (German for "page created")
    created_at = re.sub(r"(Seite erstellt)", "", created_at)
    created_at = created_at[3:]
    # scrape the number of likes:
    # find the span with the like number
    spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
    # isolate the likes via regex
    likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()
    bio_dict = {
        "username": fb_general,
        "n_likes": likes,
        "created_at": created_at
    }
    # request the about html
    r_about = requests.get(fb_about)
    # ensure no 404's
    if not r_about:
        log.debug("Couldn't access username/about site: %s", fb_about)
        return
    about_soup = BeautifulSoup(r_about.content, "html.parser")
    mission_text = about_soup.find_all('div', {'class': "_4bl9"})
    for divs in mission_text:
        describing_div = divs.find('div', {'class': '_50f4'})
        content_div = divs.find('div', {'class': '_3-8w'})
        if describing_div and content_div:
            bio_dict[describing_div.text] = content_div.text
    # photos
    # Retrieves the profile and cover photo of a public facebook page by going
    # to the 'about' page, parsing the html and getting the links to photos
    # from the script tag; these can then be passed to harvest_media.
    # This is not affected by the harvest_media options but will always happen
    all_scripts = about_soup.find_all('script')
    for js in all_scripts:
        for content in js.contents:
            if 'cover_photo' in content:
                # isolate the relevant links
                links = re.findall(r'https\:\\/\\/scontent[^"]*', content)
                # remove escaped front slashes
                for val, link in enumerate(links):
                    links[val] = re.sub(r'\\', "", link)
                    self._harvest_media_url(links[val])
    if m_fb_general:
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36')
        site_transparency_class_selector = ("._a58._a5o._9_7._2rgt._1j-g._2rgt"
                                            "._86-3._2rgt._1j-g._2rgt")
        site_transparency_detail_id = "u_0_d"
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('headless')
        chrome_options.add_argument('start-maximized')  # was 'start-maximised'; Chrome expects the US spelling
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--window-size=1200x800')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f"user-agent={user_agent}")
        # this will connect to the selenium container to start scraping
        driver = webdriver.Remote("host.docker.internal:4444/wd/hub",
                                  {'browserName': 'chrome'})
        driver.get("http://m.facebook.com")
        driver.maximize_window()
        # accept cookies
        cookies = \
            driver.find_element_by_id('accept-cookie-banner-label')
        # more or less random wait to replicate user behavior and ensure politeness
        time.sleep(random.uniform(3, 9))
        cookies.click()
        # Search & enter the Email or Phone field & enter the password
        username_fb = driver.find_element_by_id("m_login_email")
        password_fb = driver.find_element_by_id("m_login_password")
        submit = driver.find_element_by_css_selector("._56b_")
        # send keys and make sure the fields are not prepopulated;
        # 2fa has to be deactivated
        username_fb.clear()
        password_fb.clear()
        username_fb.send_keys(user_email_fb)
        password_fb.send_keys(user_password_fb)
        time.sleep(random.uniform(3, 9))
        # Step 4) Click Login
        submit.click()
        time.sleep(random.uniform(3, 9))
        # navigate to the site
        driver.get(m_fb_general)
        time.sleep(random.uniform(3, 9))
        driver.execute_script("window.scrollTo(0, 800)")  # site info only loads on scroll
        # use the class name and div content (todo)
        time.sleep(random.uniform(20, 25))
        element = WebDriverWait(driver, 20).until(
            ec.presence_of_element_located(
                (By.CSS_SELECTOR, site_transparency_class_selector)))
        site_transparency = driver.find_elements_by_css_selector(
            site_transparency_class_selector)
        # site transparency should always be below about
        site_transparency[1].click()
        time.sleep(random.uniform(15, 20))  # was uniform(20, 15); the bounds were reversed
        # simply get the whole text of the transparency box of the site;
        # the exact info can be extracted ex-post
        element = WebDriverWait(driver, 20).until(
            ec.presence_of_element_located((By.ID, site_transparency_detail_id)))
        time.sleep(random.uniform(3, 9))
        site_transparency_text = driver.find_element_by_id(
            site_transparency_detail_id).text
        time.sleep(random.uniform(3, 9))
        driver.close()
        log.info("Finished scraping transparency box")
        bio_dict['transparency_text'] = site_transparency_text

    # ensure that a warc will only be written if sites were found,
    # else nothing will happen
    if r_about or r:
        # the filename will later be converted to a path, replicating the pattern from
        # https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # create a random token for the filename
        random_token = ''.join(
            random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = (safe_string(self.message["id"]) + "-" +
                     warcprox.timestamp17() + "-" + serial_no + "-" +
                     random_token)

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                  "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s",
                     self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps"""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(bio_dict, default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(
                "https://m.facebook.com/" + username, 'metadata',
                payload=BytesIO(json_payload),
                warc_content_type="application/json")
            writer.write_record(record)
            log.info("Writing scraped results to %s", self.warc_temp_dir)
def facebook_user_timeline(self, seed_id, username, nsid):
    """This function will scrape the user timeline"""
    log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
    # make sure either username or nsid is present to start scraping
    assert username or nsid
    # Possibly look up the username
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)

    if nsid:
        # report back whether the user id was found
        log.info("FB userid %s", nsid)
        # todo - need to add a timeout and what to do if blocked
        # todo - post ids will sometimes be empty; account for that for incremental
        incremental = self.message.get("options", {}).get("incremental", False)
        harvest_media = self.message.get("options", {}).get("harvest_media", False)
        if incremental:
            # search for the since_id of the post
            since_id = self.state_store.get_state(
                __name__, u"timeline.{}.since_id".format(nsid))
        scrape_result = []
        for post in facebook_scraper.get_posts(nsid, pages=self.pages,
                                               extra_info=True, timeout=20):
            scrape_result.append(post)
            self.result.harvest_counter["posts"] += 1
            self.result.increment_stats("posts")
            # the second condition avoids parsing empty lists (i.e. no media)
            if harvest_media and post['images']:
                log.info("Harvesting media from post")
                # get media content from links - should automatically be caught
                # within the warc stream. All photos on fb are jpgs, so the list
                # comprehension checks whether this is the case for the stream;
                # if not (e.g. video), it will not harvest
                [self._harvest_media_url(media_url)
                 for media_url in post['images'] if 'jpg' in media_url]
            if incremental and post["post_id"] == since_id:
                log.info("Stopping, found last post that was previously "
                         "harvested with id: %s", post["post_id"])
                break
        # the filename will later be converted to a path, replicating the pattern from
        # https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # create a random token for the filename
        random_token = ''.join(
            random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = (safe_string(self.message["id"]) + "-" +
                     warcprox.timestamp17() + "-" + serial_no + "-" +
                     random_token)

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                  "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s", self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps"""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(scrape_result,
                                      default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(username, 'metadata',
                                               payload=BytesIO(json_payload),
                                               warc_content_type="application/json")
            writer.write_record(record)
            log.info("Writing scraped results to %s", self.warc_temp_dir)

        # write to the state store
        incremental = self.message.get("options", {}).get("incremental", False)
        key = "timeline.{}.since_id".format(nsid)
        max_post_time = scrape_result[0].get("time")
        max_post_id = scrape_result[0].get("post_id")
        assert max_post_time and max_post_id
        if incremental:
            # the trailing `if incremental else None` in the original was
            # redundant inside this branch
            self.state_store.set_state(__name__, key, max_post_id)
            log.info("Wrote first scraped post to state_store")
    else:
        msg = "NSID not found for user {}".format(username)
        log.exception(msg)
        self.result.warnings.append(Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)

    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait

    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""

    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""

    warcoption = ""
    # note: this slice runs before the None check below, so the function
    # effectively assumes warcfilename is given and ends in ".warc.gz"
    warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
    if warcfilename is not None:
        warcoption = "--warc-file \"" + warcfilebasename + "\""
        if check_wget_compression("wget --help | grep 'no-warc-compression'"):
            warcoption += " --no-warc-compression"

    cmd += ("wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} "
            "-P {DOWNLOAD_PATH} {AGENT} {WARC}").format(
        WAIT=waitoption, FILETYPES=filetypesoption, URL=url,
        DOWNLOAD_PATH=out_path, AGENT=agentoption, WARC=warcoption)
    # print("cmd", cmd)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError as grepexc:
        sys.stderr.write("Warning: Some files could not be downloaded with wget\n")

    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        if record.http_headers.get_header('Transfer-Encoding') == "chunked":
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if a header is non-ascii, create a new header
                            # with the status code only; content length and
                            # content type will be filled in before writing
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                    uri = record.rec_headers.get_header('WARC-Target-URI')
                    # ignore metadata records
                    if not uri or uri.startswith('metadata://gnu.org/software/wget/warc/'):
                        continue
                    record.length = None
                    writer.write_record(record)
            except Exception as e:
                print(e, file=sys.stderr)
    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
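# A hedged example call of run() above: mirror a site with wget under a GNU
# timeout and recompress the resulting WARC. All argument values are
# placeholders; wget must be on PATH, and warcfilename must end in ".warc.gz"
# for the basename computation to work.
run('http://example.com/', './downloads', time_limit='30m', agent='mybot/1.0',
    filetypes=None, warcfilename='./downloads/example.warc.gz', wait='1')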
class WarcHandler(EventHandler):
    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log',
                 'maxLogSize', 'logEncoding', 'warcinfoRecordId')

    def __init__(self, fd, logger, maxBodySize=defaultSettings.maxBodySize):
        self.logger = logger
        self.writer = WARCWriter(fd, gzip=True)
        self.maxBodySize = maxBodySize
        self.logEncoding = 'utf-8'
        self.log = BytesIO()
        # max log buffer size (bytes)
        self.maxLogSize = 500 * 1024
        # maps document urls to WARC record ids, required for DomSnapshotEvent
        # and ScreenshotEvent
        self.documentRecords = {}
        # record id of the warcinfo record
        self.warcinfoRecordId = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._flushLogEntries()

    def writeRecord(self, url, kind, payload, warc_headers_dict=None,
                    http_headers=None):
        """
        Thin wrapper around writer.create_warc_record and writer.write_record.

        Adds default WARC headers.
        """
        d = {}
        if self.warcinfoRecordId:
            d['WARC-Warcinfo-ID'] = self.warcinfoRecordId
        if warc_headers_dict:  # guard against the None default
            d.update(warc_headers_dict)
        warc_headers_dict = d

        record = self.writer.create_warc_record(
            url, kind, payload=payload,
            warc_headers_dict=warc_headers_dict,
            http_headers=http_headers)
        self.writer.write_record(record)

        return record

    def _writeRequest(self, item):
        logger = self.logger.bind(reqId=item.id)

        req = item.request
        resp = item.response
        url = urlsplit(resp['url'])

        path = url.path
        if url.query:
            path += '?' + url.query
        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format(req['method'], path),
                                       item.requestHeaders,
                                       protocol='HTTP/1.1', is_http_request=True)
        initiator = item.initiator
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(
                datetime.utcfromtimestamp(item.chromeRequest['wallTime'])),
        }

        try:
            bodyTruncated = None
            payload, payloadBase64Encoded = item.requestBody
        except ValueError:
            # oops, don’t know what went wrong here
            bodyTruncated = 'unspecified'
            logger.error('requestBody missing',
                         uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')

        if bodyTruncated:
            warcHeaders['WARC-Truncated'] = bodyTruncated
            payload = None

        if payload:
            payload = BytesIO(payload)
            warcHeaders['X-Chrome-Base64Body'] = str(payloadBase64Encoded)
        record = self.writeRecord(req['url'], 'request',
                                  payload=payload, http_headers=httpHeaders,
                                  warc_headers_dict=warcHeaders)
        return record.rec_headers['WARC-Record-ID']

    def _writeResponse(self, item, concurrentTo):
        # fetch the body
        reqId = item.id
        rawBody = None
        base64Encoded = False
        bodyTruncated = None
        if item.isRedirect:
            # redirects reuse the same request, thus we cannot safely retrieve
            # the body (i.e. getResponseBody may return the new location’s
            # body).
            bodyTruncated = 'unspecified'
        elif item.encodedDataLength > self.maxBodySize:
            # check the body size first, since we’re loading everything into memory
            bodyTruncated = 'length'
            self.logger.error('body for {} too large {} vs {}'.format(
                reqId, item.encodedDataLength, self.maxBodySize))
        else:
            try:
                rawBody, base64Encoded = item.body
            except ValueError:
                # oops, don’t know what went wrong here
                bodyTruncated = 'unspecified'

        # now the response
        resp = item.response
        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'WARC-IP-Address': resp.get('remoteIPAddress', ''),
            'X-Chrome-Protocol': resp.get('protocol', ''),
            'X-Chrome-FromDiskCache': str(resp.get('fromDiskCache')),
            'X-Chrome-ConnectionReused': str(resp.get('connectionReused')),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(
                datetime.utcfromtimestamp(
                    item.chromeRequest['wallTime'] +
                    (item.chromeResponse['timestamp'] -
                     item.chromeRequest['timestamp']))),
        }
        if bodyTruncated:
            warcHeaders['WARC-Truncated'] = bodyTruncated
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(base64Encoded)

        httpHeaders = StatusAndHeaders('{} {}'.format(resp['status'],
                                                      item.statusText),
                                       item.responseHeaders,
                                       protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document’s <meta>, thus we can
        # easily override those.
        contentType = resp.get('mimeType')
        if contentType:
            if not base64Encoded:
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('content-type', contentType)

        if rawBody is not None:
            httpHeaders.replace_header('content-length',
                                       '{:d}'.format(len(rawBody)))
            bodyIo = BytesIO(rawBody)
        else:
            bodyIo = BytesIO()

        record = self.writeRecord(resp['url'], 'response',
                                  warc_headers_dict=warcHeaders,
                                  payload=bodyIo, http_headers=httpHeaders)

        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')

    def _writeScript(self, item):
        writer = self.writer
        encoding = 'utf-8'
        self.writeRecord(
            packageUrl('script/{}'.format(item.path)), 'metadata',
            payload=BytesIO(str(item).encode(encoding)),
            warc_headers_dict={
                'Content-Type': 'application/javascript; charset={}'.format(encoding)
            })

    def _writeItem(self, item):
        if item.failed:
            # should have been handled by the logger already
            return
        concurrentTo = self._writeRequest(item)
        self._writeResponse(item, concurrentTo)

    def _addRefersTo(self, headers, url):
        refersTo = self.documentRecords.get(url)
        if refersTo:
            headers['WARC-Refers-To'] = refersTo
        else:
            self.logger.error('No document record found for {}'.format(url))
        return headers

    def _writeDomSnapshot(self, item):
        writer = self.writer

        warcHeaders = {
            'X-DOM-Snapshot': str(True),
            'X-Chrome-Viewport': item.viewport,
            'Content-Type': 'text/html; charset=utf-8',
        }
        self._addRefersTo(warcHeaders, item.url)

        self.writeRecord(item.url, 'conversion',
                         payload=BytesIO(item.document),
                         warc_headers_dict=warcHeaders)

    def _writeScreenshot(self, item):
        writer = self.writer
        warcHeaders = {
            'Content-Type': 'image/png',
            'X-Crocoite-Screenshot-Y-Offset': str(item.yoff)
        }
        self._addRefersTo(warcHeaders, item.url)
        self.writeRecord(item.url, 'conversion', payload=BytesIO(item.data),
                         warc_headers_dict=warcHeaders)

    def _writeControllerStart(self, item):
        payload = BytesIO(json.dumps(item.payload, indent=2).encode('utf-8'))

        writer = self.writer
        warcinfo = self.writeRecord(
            packageUrl('warcinfo'),
            'warcinfo',
            warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
            payload=payload)
        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']

    def _flushLogEntries(self):
        writer = self.writer
        self.log.seek(0)
        # XXX: we should use the type continuation here
        self.writeRecord(packageUrl('log'), 'resource', payload=self.log,
                         warc_headers_dict={
                             'Content-Type':
                                 'text/plain; encoding={}'.format(self.logEncoding)
                         })
        self.log = BytesIO()

    def _writeLog(self, item):
        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
        self.log.write(item.encode(self.logEncoding))
        self.log.write(b'\n')
        # instead of locking, check we’re running in the main thread
        if (self.log.tell() > self.maxLogSize
                and threading.current_thread() is threading.main_thread()):
            self._flushLogEntries()

    route = {
        Script: _writeScript,
        Item: _writeItem,
        DomSnapshotEvent: _writeDomSnapshot,
        ScreenshotEvent: _writeScreenshot,
        ControllerStart: _writeControllerStart,
    }

    def push(self, item):
        processed = False
        for k, v in self.route.items():
            if isinstance(item, k):
                v(self, item)
                processed = True
                break
        if not processed:
            self.logger.debug('unknown event {}'.format(repr(item)))
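# Sketch of wiring up the WarcHandler above. crocoite constructs this object
# internally; `logger` here is a stand-in for its structured logger, and
# `events` is an assumed iterable of Script/Item/DomSnapshotEvent/
# ScreenshotEvent/ControllerStart objects fed through push().
with open('site.warc.gz', 'wb') as fd:
    with WarcHandler(fd, logger) as handler:
        for event in events:
            handler.push(event)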