def test_load_http2_warc_convert_protocol(self):
    filename = self.get_test_file('http2.github.io.har')
    temp_filename = os.path.join(tempfile.gettempdir(), tempfile.gettempprefix() + '-http2.warc')

    try:
        # write then read same file
        with open(temp_filename, 'w+b') as fh:
            har2warc(filename, fh)

            fh.seek(0)

            ai = ArchiveIterator(fh, verify_http=True)

            record = next(ai)
            assert record.rec_type == 'warcinfo'

            record = next(ai)
            assert record.rec_type == 'response'

            # ensure protocol converted to HTTP/1.1
            assert record.http_headers.protocol == 'HTTP/1.1'
    finally:
        os.remove(temp_filename)

def write_from_wet(wet_url):
    print(wet_url)
    t = Timer()
    r = requests.get(wet_url, stream=True)
    records = ArchiveIterator(r.raw)
    n_documents = 0

    file_path = wet_url.replace("https://", "")
    file_path = file_path.replace('/', '.')
    file_path = "./processed_wet/" + file_path.replace(".gz", ".xml")
    f = open(file_path, 'w')

    for i, record in enumerate(records):
        url, title, doc, lang = read_doc_wet(record)
        if not doc or not url or len(doc) < 1000:
            continue
        if record.rec_type == "conversion" and lang == 'en':
            f.write("<doc url=" + url + " title=" + title.replace(' ', '_') + " lang=" + lang + ">" + '\n')
            f.write(doc + '\n')
            f.write("</doc>" + '\n')
            n_documents += 1
            if n_documents % 100 == 0:
                print(i, "documents processed", n_documents, "documents added...")
                sleep(0.01)

    f.close()
    return file_path

def parse_news_file(self, output_path):
    archive_date = self.args.path.split('/')[1]
    rest = '_'.join(self.args.path.split('/')[2:])

    allow_all = self.args.allow_all
    if allow_all == False:
        allow_all = None
    print('Allow all', allow_all == True)

    out_prefix = 'propaganda-' if self.args.propaganda else ''
    out_key = '{}{}/{}.jsonl'.format(out_prefix, archive_date, rest)

    with TemporaryFile(mode='w+b', dir=output_path) as warctemp:
        self.s3client.download_fileobj('commoncrawl', self.args.path, warctemp)
        warctemp.seek(0)

        with NamedTemporaryFile(mode='w', dir=output_path) as f:
            for record in tqdm(ArchiveIterator(warctemp, no_record_parse=False)):
                for parsed_record in parse_record(record,
                                                  propaganda=self.args.propaganda,
                                                  allow_all=allow_all):
                    f.write(json.dumps(parsed_record) + '\n')

            self.s3client.upload_file(f.name, self.args.bucket_name, out_key)

    print("I guess I'm done now")
    return rest

def _extract_html_from_cc_for_warc_path(warc_path):
    logger = _get_logger()
    logger.info(f'Extracting websites from common crawl for warc path {warc_path}.')

    html = []
    url = []
    s3link = []

    s3 = get_s3_client()
    obj = s3.Object(bucket_name='commoncrawl', key=warc_path)
    response = obj.get()
    data = response['Body']

    logger.info('Start iterating over extracted common crawl websites...')
    for record in ArchiveIterator(data):
        if record.rec_type == 'response':
            if record.http_headers.get_header('Content-Type') == 'text/html':
                html.append(record.content_stream().read())
                url.append(record.rec_headers.get_header('WARC-Target-URI'))
                s3link.append(warc_path)

    logger.info(f'Successfully extracted {len(html)} websites')

    # make dataset
    d = {'url': url, 'html_full': html, 's3Link': s3link}
    df = pd.DataFrame(data=d)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    _persist_html_df(df, file_suffix=f'_{timestamp}_extracted')
    logger.info(f'Successfully persisted html pickle file with timestamp {timestamp}')

def metadata_by_ext(self, fromfile, file_types=SUPPORTED_FILE_TYPES, fields=None, output='metadata.jsonl'):
    """Reads and returns all metadata for the given list of file types inside the selected file container"""
    if file_types is None:
        file_types = SUPPORTED_FILE_TYPES
    if output is None:
        output = 'metadata.jsonl'

    logging.debug('Preparing %s' % fromfile)

    file_mimes = {}
    for mime, ext in MIME_MAP.items():
        if ext in file_types:
            file_mimes[mime] = ext

    resp = open(fromfile, 'rb')
    out = open(output, 'w', encoding='utf8')
    for record in ArchiveIterator(resp, arc2warc=True):
        matched = False
        if record.rec_type == 'response':
            h = record.http_headers.get_header('content-type')
            url = record.rec_headers.get_header('WARC-Target-URI')
            filename = url.rsplit('?', 1)[0].rsplit('/', 1)[-1].lower()
            ext = filename.rsplit('.', 1)[-1]
            if h and h in file_mimes:
                matched = True
            else:
                if len(ext) in [3, 4] and ext in file_types:
                    matched = True
            if matched:
                result = processWarcRecord(record, url, filename, mime=h)
                result['source'] = os.path.basename(fromfile)
                out.write(json.dumps(result, ensure_ascii=False) + '\n')
    out.close()

def test_validate_json_metadata(self):
    first = True
    with open('test-transc2.warc.gz', 'rb') as fh:
        for record in ArchiveIterator(fh):
            if record.rec_type == 'resource':
                # skip first, which is original
                if first:
                    first = False
                    continue

                assert record.rec_headers['Content-Type'] == 'application/vnd.youtube-dl_formats+json'
                data = record.raw_stream.read()

                assert record.rec_headers.get('WARC-Date') == '2019-01-03T02:00:00Z'
                assert record.rec_headers.get('WARC-Creation-Date') > record.rec_headers.get('WARC-Date')

                metadata = json.loads(data.decode('utf-8'))
                assert len(metadata['formats']) == 5
                assert metadata['webpage_url'] == 'http://www.example.com/containing/page.html'
                assert metadata['webpage_timestamp'] == '20190103020000'
                assert metadata['selector'] == 'object, embed'

                formats = ['png', 'webm', 'mp4', 'mkv', 'flv']
                assert [format_['ext'] for format_ in metadata['formats']] == formats

def iter_warc_records(inputs):
    """iter warc records, including appending request data to matching response"""
    for filename in iter_file_or_dir(inputs):
        with open(filename, "rb") as fh:
            for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
                if record.rec_type in ("resource", "response", "revisit"):
                    yield record

def get_records(wet_url):
    # wet_url = warc_url.replace('/warc/', '/wet/').replace('warc.gz', 'warc.wet.gz')  # derive the WET URL by substituting parts of the WARC URL
    r = requests.get(wet_url, stream=True)
    records = ArchiveIterator(r.raw)
    record = next(records)  # the first record is the archive's warcinfo summary
    assert record.rec_type == 'warcinfo'
    return records

def analyze(self, fromfile, mode='mimes'):
    """Reads data from WARC file and provides analysis"""
    logging.debug('Preparing %s' % fromfile)
    resp = open(fromfile, 'rb')
    total = 0
    mimes = {}
    exts = {}
    n = 0
    for record in ArchiveIterator(resp, arc2warc=False):
        # logging.debug('Processing record %d' % (n))
        if record.rec_type == 'response':
            n += 1
            if n % 10000 == 0:
                logging.info('Processed %d records' % (n))
            h = record.http_headers.get_header('content-type')
            url = record.rec_headers.get_header('WARC-Target-URI')
            filename = url.rsplit('?', 1)[0].rsplit('/', 1)[-1].lower()
            total += 1
            if mode == 'mimes':
                if h is not None:
                    h = h.split(';', 1)[0]
                v = mimes.get(h, {'total': 0, 'size': 0})
                v['total'] += 1
                v['size'] += record.length
                mimes[h] = v
            elif mode == 'exts':
                if filename.find('.') > -1:
                    ext = filename.rsplit('.', 1)[-1]
                else:
                    ext = ''
                v = exts.get(ext, {'total': 0, 'size': 0})
                v['total'] += 1
                v['size'] += record.length
                exts[ext] = v

    table = []
    total = ['#total', 0, 0, 100]
    records = mimes if mode == 'mimes' else exts
    # first pass: accumulate overall totals
    for fd in sorted(records.items(), key=lambda item: item[1]['size'], reverse=True):
        total[1] += fd[1]['total']
        total[2] += fd[1]['size']
        total[3] = 100
    # second pass: build per-entry rows with each entry's share of the total size
    for fd in sorted(records.items(), key=lambda item: item[1]['size'], reverse=True):
        record = [fd[0], fd[1]['total'], fd[1]['size'], fd[1]['size'] * 100.0 / total[2]]
        table.append(record)
    table.append(total)
    headers = [mode, 'files', 'size', 'share']
    print(tabulate(table, headers=headers))

def __next__(self):
    """
    Returns the next entry in the warc IO archive.

    :return: int(), str() - offset, text
    """
    __archive = open(self.source, "rb")
    __archive_stream = ArchiveIterator(__archive)
    wrong_encoding_list = list()
    current_element = 0

    for record in __archive_stream:
        # Extracts the responses
        if record.rec_type == 'response' and record.http_headers.get_header('Content-Type') in self.utf_8:
            soup = BeautifulSoup(record.content_stream(), 'lxml', from_encoding='utf-8')
            for script in soup(["script", "style"]):
                script.extract()
            try:
                text = soup.body.get_text(separator=' ')
                text = "\n".join([line.strip() for line in text.split("\n") if line.strip() != ""])
            except AttributeError:
                wrong_encoding_list.append(__archive_stream.get_record_offset())
            else:
                current_element += 1
                # Prints the current element to command line if it is divisible by 100
                if current_element % 100 == 0:
                    self.__clear_line()
                    print("{} elements hashed ...".format(current_element))
                # Checks if the maximum number of elements has passed and stops if so
                if current_element >= self.max_elements:
                    self.__clear_line()
                    print("{} elements hashed ...".format(current_element))
                    print("Wrong encoding found in offsets {}".format(wrong_encoding_list))
                    return __archive_stream.get_record_offset(), text  # returns offset and text
                else:
                    yield __archive_stream.get_record_offset(), text  # yields offset and text

    print('Wrong Encoding at offsets {}'.format(wrong_encoding_list))

def test_no_brotli(self):
    res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D',
                           headers={'Accept-Encoding': 'gzip, deflate, br'})
    assert '"C": "D"' in res.text

    with open(self.warc_name, 'rb') as fh:
        for record in ArchiveIterator(fh):
            last_record = record

    assert record.http_headers['Accept-Encoding'] == 'gzip, deflate'

def test_single_warc_record(self):
    dir_name = os.path.join(self.root_dir, '_test_colls', 'test-dedup', 'archive')
    files = os.listdir(dir_name)
    assert len(files) == 1

    records = []

    with open(os.path.join(dir_name, files[0]), 'rb') as fh:
        for record in ArchiveIterator(fh):
            records.append(record.rec_type)

    # ensure only one response/request pair written
    assert records == ['response', 'request']

def test_read_warcinfo(self):
    self.warc.seek(0)
    metadata = []

    for record in ArchiveIterator(self.warc):
        if record.rec_type == 'warcinfo':
            stream = record.content_stream()
            warcinfo = {}

            while True:
                line = stream.readline().decode('utf-8')
                if not line:
                    break

                parts = line.split(': ', 1)
                warcinfo[parts[0].strip()] = parts[1].strip()

            assert set(warcinfo.keys()) == {'software', 'format', 'creator', 'isPartOf', 'json-metadata'}
            assert warcinfo['software'].startswith('Webrecorder Platform ')
            assert warcinfo['format'] == 'WARC File Format 1.0'
            assert warcinfo['creator'] == 'test'
            assert warcinfo['isPartOf'] in ('default-collection', 'default-collection/rec-sesh', 'default-collection/another-sesh')

            metadata.append(json.loads(warcinfo['json-metadata']))

    assert len(metadata) == 3

    assert metadata[0]['type'] == 'collection'
    assert set(metadata[0].keys()) == {'created_at', 'updated_at', 'title', 'desc', 'type', 'size', 'lists', 'public', 'public_index'}
    assert metadata[0]['title'] == 'Default Collection'
    assert 'This is your first' in metadata[0]['desc']

    assert metadata[1]['type'] == 'recording'
    assert set(metadata[1].keys()) == {'created_at', 'updated_at', 'recorded_at', 'title', 'desc', 'type', 'size', 'pages'}
    assert metadata[0]['created_at'] <= metadata[0]['updated_at']

    for metadata_item in metadata:
        for field in TestUpload.timestamps.keys():
            if field == 'recorded_at' and metadata_item['type'] == 'collection':
                continue
            TestUpload.timestamps[field][metadata_item['title']] = RedisUniqueComponent.to_iso_date(metadata_item[field])

    assert set(TestUpload.timestamps['created_at'].keys()) == {'rec-sesh', 'another-sesh', 'Default Collection'}

def process_warcs(i_, iterator):
    try:
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        no_sign_request = botocore.client.Config(signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            if uri.startswith('s3://'):
                s3match = s3pattern.match(uri)
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b')

                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    print('Failed to download from s3', exception)
                    warctemp.close()
                    continue

                warctemp.seek(0)
                stream = warctemp

            elif uri.startswith('file:'):
                uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    print("Failed to read data from local", exception)
                    continue
            else:
                print("Unknown file system")
                # skip unknown schemes so a stale/unbound stream is never reused
                continue

            try:
                for record in ArchiveIterator(stream):
                    processed = process_record(record)
                    if processed:
                        yield processed
                        continue
            except ArchiveLoadFailed as exception:
                print('Invalid WARC', exception)
            finally:
                stream.close()
    except:
        print("URL invalid")

def test_user_agent():
    """Test that the mobile user agent was used in WARC request records, with the custom Zimit and email suffix"""
    found = False
    for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"):
        with open(warc, "rb") as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type == "request":
                    print(record.http_headers)
                    ua = record.http_headers.get_header("User-Agent")
                    if ua:
                        assert "iPhone" in ua
                        assert ua.endswith(" +Zimit [email protected]")
                        found = True

    # should find at least one
    assert found

s3client = boto3.client('s3')

parser = argparse.ArgumentParser()
parser.add_argument('-path', type=str,
                    default='crawl-data/CC-MAIN-2017-13/segments/1490218186353.38/warc/CC-MAIN-20170322212946-00000-ip-10-233-31-227.ec2.internal.warc.gz',
                    help='in path')
parser.add_argument('-bucket_name', type=str, help='out path')
parser.add_argument('-propaganda', action='store_true',
                    help='Download some propaganda instead of real news')
args = parser.parse_args()

archive_date = args.path.split('/')[1]
rest = '_'.join(args.path.split('/')[2:])
out_prefix = 'propaganda-' if args.propaganda else ''
out_key = '{}{}/{}.jsonl'.format(out_prefix, args.path.split('/')[1], rest)

with TemporaryFile(mode='w+b', dir='/home/ubuntu/temp/') as warctemp:
    s3client.download_fileobj('commoncrawl', args.path, warctemp)
    warctemp.seek(0)

    with NamedTemporaryFile(mode='w', dir='/home/ubuntu/temp/') as f:
        for record in tqdm(ArchiveIterator(warctemp, no_record_parse=False)):
            for parsed_record in parse_record(record, propaganda=args.propaganda):
                f.write(json.dumps(parsed_record) + '\n')

        s3client.upload_file(f.name, args.bucket_name, out_key)

print("I guess I'm done now")

import requests
from warcio import ArchiveIterator
import xml.etree.ElementTree as ET

f = open('test_wet.txt', 'w')
f.close()

# wet_url = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/wet/CC-MAIN-20210128134124-20210128164124-00799.warc.wet.gz'
wet_url = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2021-04/segments/1610703495901.0/wet/CC-MAIN-20210115134101-20210115164101-00001.warc.wet.gz'

r = requests.get(wet_url, stream=True)
records = ArchiveIterator(r.raw)
record = next(records)
assert record.rec_type == 'warcinfo'

i = 0
add = ET.Element('add')
with open('file.xml', 'w') as f:
    f.write('<?xml version="1.0"?>')
    ET.ElementTree(add).write(f, encoding="unicode")

while i <= 2:
    # if next(records, 'STOP') == 'STOP':
    #     break
    # else:
    record = next(records, 'STOP')
    URL = record.rec_headers.get_header('WARC-Target-URI')
    text = record.content_stream().read().decode('utf-8')
    doc = ET.SubElement(add, 'doc')

def verify_warc_and_zim(self, warcfile, zimfile):
    assert os.path.isfile(warcfile)
    assert os.path.isfile(zimfile)

    # autoescape=False to allow injecting html entities from translated text
    env = Environment(
        loader=PackageLoader("warc2zim", "templates"),
        extensions=["jinja2.ext.i18n"],
        autoescape=False,
    )

    head_insert = env.get_template("sw_check.html").render().encode("utf-8")

    # track to avoid checking duplicates, which are not written to ZIM
    warc_urls = set()

    zim_fh = Archive(zimfile)
    for record in iter_warc_records([warcfile]):
        url = get_record_url(record)
        if not url:
            continue

        if url in warc_urls:
            continue

        if record.rec_type not in ("response", "resource", "revisit"):
            continue

        # ignore revisit records that are to the same url
        if (record.rec_type == "revisit"
                and record.rec_headers["WARC-Refers-To-Target-URI"] == url):
            continue

        # parse headers as record, ensure headers match
        url_no_scheme = url.split("//", 2)[1]
        print(url_no_scheme)
        parsed_record = next(
            ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme))))

        assert record.rec_headers == parsed_record.rec_headers
        assert record.http_headers == parsed_record.http_headers

        # ensure payloads match
        try:
            payload = zim_fh.get_item("A/" + url_no_scheme)
        except KeyError:
            payload = None

        if record.rec_type == "revisit" or (
                record.http_headers
                and record.http_headers.get("Content-Length") == "0"):
            assert not payload
        else:
            payload_content = payload.content.tobytes()

            # if HTML_RAW, still need to account for the head insert, otherwise should have an exact match
            if payload.mimetype == HTML_RAW:
                assert head_insert in payload_content
                assert (payload_content.replace(head_insert, b"")
                        == record.buffered_stream.read())
            else:
                assert payload_content == record.buffered_stream.read()

        warc_urls.add(url)

def _generate_docs(index, filename, nlp, counter):
    """
    Generate Elasticsearch index docs.

    :param index: Elasticsearch index
    :param filename: WARC file name
    :param nlp: SpaCy language model
    :param counter: Spark counter
    :return: Generator of index doc actions
    """
    email_regex = re.compile(
        r'((?:[a-zA-Z0-9_\-./+]+)@(?:(?:\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|' +
        r'(?:(?:[a-zA-Z0-9\-]+\.)+))(?:[a-zA-Z]{2,}|[0-9]{1,3})(?:\]?))')

    def split_header(header_name, header_dict, split_regex=','):
        headers = [
            re.sub(r'\s+', ' ', h).strip()
            for h in re.split(split_regex, header_dict.get(header_name, ''))
            if h.strip()
        ]
        if not headers:
            return None
        return headers if len(headers) > 1 else headers[0]

    with open(filename, 'rb') as f:
        iterator = ArchiveIterator(f)
        for record in iterator:
            warc_headers = record.rec_headers
            body = record.content_stream().read()
            mail = email.message_from_bytes(body)
            doc_id = warc_headers.get_header('WARC-Record-ID')

            mail_text = '\n'.join(
                util.decode_message_part(p) for p in mail.walk()
                if p.get_content_type() == 'text/plain').strip()
            mail_html = '\n'.join(
                util.decode_message_part(p) for p in mail.walk()
                if p.get_content_type() == 'text/html').strip()

            mail_headers = {h.lower(): str(mail[h]) for h in mail}
            from_header = mail_headers.get('from', '')
            from_email = re.search(email_regex, from_header)

            try:
                d = email.utils.parsedate_to_datetime(mail_headers.get('date'))
                if not d.tzinfo or d.tzinfo.utcoffset(d) is None:
                    d = pytz.utc.localize(d)
                # Convert offsets outside +/-18:00 to UTC+0, since they would throw errors in Java's DateTime parser.
                if abs(d.utcoffset().total_seconds()) > 18 * 60 * 60:
                    d = d.astimezone(pytz.utc)
                mail_date = str(d)
            except TypeError:
                mail_date = None

            try:
                lang = nlp(mail_text[:nlp.max_length])._.language['language']
            except Exception as e:
                lang = 'UNKNOWN'
                logger.error(e)

            counter.add(1)

            yield {
                "_index": index,
                "_type": "message",
                "_id": doc_id,
                "_op_type": "update",
                "scripted_upsert": True,
                "script": {
                    "source": """
                        if (ctx._source.containsKey("lang")) {
                            params.doc.remove("lang");
                        }
                        ctx._source.putAll(params.doc);
                    """,
                    "params": {
                        "doc": {
                            "modified": int(time() * 1000),
                            "id_hash": hash(doc_id),
                            "group": os.path.basename(os.path.dirname(filename)),
                            "warc_file": os.path.join(
                                os.path.basename(os.path.dirname(filename)),
                                os.path.basename(filename)),
                            "warc_offset": iterator.offset,
                            "warc_id": doc_id,
                            "news_url": warc_headers.get_header("WARC-News-URL"),
                            "headers": {
                                "date": mail_date,
                                "message_id": mail_headers.get("message-id"),
                                "from": from_header,
                                "from_email": from_email.group(0) if from_email is not None else "",
                                "subject": mail_headers.get("subject"),
                                "to": split_header("to", mail_headers),
                                "cc": split_header("cc", mail_headers),
                                "in_reply_to": split_header("in-reply-to", mail_headers),
                                "references": split_header("references", mail_headers, split_regex=r"\s"),
                                "list_id": mail_headers.get("list-id")
                            },
                            "lang": lang,
                            "text_plain": mail_text,
                            "text_html": mail_html
                        }
                    }
                },
                "upsert": {}
            }