Example #1
    def test_load_http2_warc_convert_protocol(self):
        filename = self.get_test_file('http2.github.io.har')

        temp_filename = os.path.join(tempfile.gettempdir(), tempfile.gettempprefix() + '-http2.warc')

        try:
            # write then read same file
            with open(temp_filename, 'w+b') as fh:
                har2warc(filename, fh)

                fh.seek(0)

                ai = ArchiveIterator(fh, verify_http=True)

                record = next(ai)
                assert record.rec_type == 'warcinfo'

                record = next(ai)
                assert record.rec_type == 'response'

                # ensure protocol converted to HTTP/1.1
                assert record.http_headers.protocol == 'HTTP/1.1'

        finally:
            os.remove(temp_filename)
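
A general note: every snippet on this page follows the same basic reading loop; a minimal sketch of that pattern (the file name 'example.warc.gz' is hypothetical) looks like this:

from warcio.archiveiterator import ArchiveIterator

# Open the (possibly gzipped) WARC in binary mode and iterate its records,
# printing the target URI and payload size of each HTTP response.
with open('example.warc.gz', 'rb') as fh:
    for record in ArchiveIterator(fh):
        if record.rec_type == 'response':
            url = record.rec_headers.get_header('WARC-Target-URI')
            body = record.content_stream().read()
            print(url, len(body))

Example #2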
def write_from_wet(wet_url):
    print(wet_url)
    t = Timer()
    r = requests.get(wet_url, stream=True)
    records = ArchiveIterator(r.raw)

    n_documents = 0

    file_path = wet_url.replace("https://", "")
    file_path = file_path.replace('/', '.')
    file_path = "./processed_wet/" + file_path.replace(".gz", ".xml")
    f = open(file_path, 'w')
    for i, record in enumerate(records):
        url, title, doc, lang = read_doc_wet(record)
        if not doc or not url or len(doc) < 1000:
            continue

        if record.rec_type == "conversion" and lang == 'en':
            f.write("<doc url=" + url + " title=" + title.replace(' ', '_') +
                    " lang=" + lang + ">" + '\n')
            f.write(doc + '\n')
            f.write("</doc>" + '\n')
            n_documents += 1
        if n_documents % 100 == 0:
            print(i, "documents processed", n_documents, "documents added...")
        sleep(0.01)
    f.close()
    return file_path
Example #3
    def parse_news_file(self, output_path):
        archive_date = self.args.path.split('/')[1]
        rest = '_'.join(self.args.path.split('/')[2:])
        allow_all = self.args.allow_all
        if allow_all == False:
            allow_all = None
        print('Allow all', allow_all == True)

        out_prefix = 'propaganda-' if self.args.propaganda else ''
        out_key = '{}{}/{}.jsonl'.format(out_prefix, archive_date, rest)

        with TemporaryFile(mode='w+b', dir=output_path) as warctemp:
            self.s3client.download_fileobj('commoncrawl', self.args.path,
                                           warctemp)
            warctemp.seek(0)

            with NamedTemporaryFile(mode='w', dir=output_path) as f:
                for record in tqdm(
                        ArchiveIterator(warctemp, no_record_parse=False)):
                    for parsed_record in parse_record(
                            record,
                            propaganda=self.args.propaganda,
                            allow_all=allow_all):
                        f.write(json.dumps(parsed_record) + '\n')

                self.s3client.upload_file(f.name, self.args.bucket_name,
                                          out_key)

            print("I guess I'm done now")
        return rest
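Example #4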
def _extract_html_from_cc_for_warc_path(warc_path):
    logger = _get_logger()
    logger.info(
        f'Extracting websites from common crawl for warc path {warc_path}.')

    html = []
    url = []
    s3link = []
    s3 = get_s3_client()
    obj = s3.Object(bucket_name='commoncrawl', key=warc_path)
    response = obj.get()
    data = response['Body']

    logger.info(f'Start iterating over extracted common crawl websites...')
    for record in ArchiveIterator(data):
        if record.rec_type == 'response':
            if record.http_headers.get_header('Content-Type') == 'text/html':
                html.append(record.content_stream().read())
                url.append(record.rec_headers.get_header('WARC-Target-URI'))
                s3link.append(warc_path)

    logger.info(f'Successfully extracted {len(html)} websites')

    # make dataset
    d = {'url': url, 'html_full': html, 's3Link': s3link}
    df = pd.DataFrame(data=d)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    _persist_html_df(df, file_suffix=f'_{timestamp}_extracted')
    logger.info(
        f'Successfully persisted html pickle file with timestamp {timestamp}')
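
The exact Content-Type comparison above skips responses whose header carries parameters, such as 'text/html; charset=UTF-8'. A more tolerant check, shown here only as a hypothetical helper and not as part of the original project, could strip the parameters before comparing:

def _is_html_response(record):
    # Hypothetical helper: treat any response whose media type is text/html
    # as HTML, ignoring parameters such as charset.
    if record.rec_type != 'response':
        return False
    content_type = record.http_headers.get_header('Content-Type') or ''
    return content_type.split(';', 1)[0].strip().lower() == 'text/html'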
Example #5
    def metadata_by_ext(self,
                        fromfile,
                        file_types=SUPPORTED_FILE_TYPES,
                        fields=None,
                        output='metadata.jsonl'):
        """Reads and returns all metadata from list of file types inside selected file container"""
        if file_types is None:
            file_types = SUPPORTED_FILE_TYPES
        if output is None:
            output = 'metadata.jsonl'
        logging.debug('Preparing %s' % fromfile)
        file_mimes = {}
        for mime, ext in MIME_MAP.items():
            if ext in file_types:
                file_mimes[mime] = ext
        resp = open(fromfile, 'rb')
        out = open(output, 'w', encoding='utf8')
        for record in ArchiveIterator(resp, arc2warc=True):
            matched = False
            if record.rec_type == 'response':
                h = record.http_headers.get_header('content-type')
                url = record.rec_headers.get_header('WARC-Target-URI')
                filename = url.rsplit('?', 1)[0].rsplit('/', 1)[-1].lower()
                ext = filename.rsplit('.', 1)[-1]
                if h and h in file_mimes:
                    matched = True
                else:
                    if len(ext) in [3, 4] and ext in file_types:
                        matched = True
                if matched:
                    result = processWarcRecord(record, url, filename, mime=h)
                    result['source'] = os.path.basename(fromfile)
                    out.write(json.dumps(result, ensure_ascii=False) + '\n')
        out.close()
Example #6
    def test_validate_json_metadata(self):
        first = True
        with open('test-transc2.warc.gz', 'rb') as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type == 'resource':
                    # skip first, which is original
                    if first:
                        first = False
                        continue

                    assert record.rec_headers['Content-Type'] == 'application/vnd.youtube-dl_formats+json'
                    data = record.raw_stream.read()

        assert record.rec_headers.get('WARC-Date') == '2019-01-03T02:00:00Z'

        assert record.rec_headers.get('WARC-Creation-Date') > record.rec_headers.get('WARC-Date')

        metadata = json.loads(data.decode('utf-8'))

        assert len(metadata['formats']) == 5

        assert metadata['webpage_url'] == 'http://www.example.com/containing/page.html'
        assert metadata['webpage_timestamp'] == '20190103020000'
        assert metadata['selector'] == 'object, embed'

        formats = ['png', 'webm', 'mp4', 'mkv', 'flv']
        assert [format_['ext'] for format_ in metadata['formats']] == formats
Example #7
def iter_warc_records(inputs):
    """iter warc records, including appending request data to matching response"""
    for filename in iter_file_or_dir(inputs):
        with open(filename, "rb") as fh:
            for record in buffering_record_iter(ArchiveIterator(fh),
                                                post_append=True):
                if record.rec_type in ("resource", "response", "revisit"):
                    yield record
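
A minimal usage sketch for the generator above (the input path is hypothetical), counting how many records of each type it yields:

from collections import Counter

counts = Counter()
for record in iter_warc_records(['example.warc.gz']):
    counts[record.rec_type] += 1
print(dict(counts))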
Example #8
def get_records(wet_url):
    # wet_url = warc_url.replace('/warc/', '/wet/').replace('warc.gz', 'warc.wet.gz')  # derived from the WARC URL by string replacement
    r = requests.get(wet_url, stream=True)
    records = ArchiveIterator(r.raw)
    record = next(records)
    # the first record is the warcinfo summary for the archive
    assert record.rec_type == 'warcinfo'
    return records
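
The returned iterator is already positioned past the warcinfo record, so what remains are the 'conversion' records holding the extracted plain text. A minimal consumer sketch (the WET URL is hypothetical):

records = get_records('https://data.commoncrawl.org/.../example.warc.wet.gz')
for record in records:
    if record.rec_type != 'conversion':
        continue
    url = record.rec_headers.get_header('WARC-Target-URI')
    text = record.content_stream().read().decode('utf-8', errors='replace')
    print(url, len(text))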
Example #9
    def analyze(self, fromfile, mode='mimes'):
        """Reads data from WARC file and provides analysis"""
        logging.debug('Preparing %s' % fromfile)
        resp = open(fromfile, 'rb')
        total = 0
        mimes = {}
        exts = {}
        n = 0
        for record in ArchiveIterator(resp, arc2warc=False):
            #            logging.debug('Processing record %d' % (n))
            if record.rec_type == 'response':
                n += 1
                if n % 10000 == 0:
                    logging.info('Processed %d records' % (n))
                h = record.http_headers.get_header('content-type')
                url = record.rec_headers.get_header('WARC-Target-URI')
                filename = url.rsplit('?', 1)[0].rsplit('/', 1)[-1].lower()
                total += 1
                if mode == 'mimes':
                    if h is not None:
                        h = h.split(';', 1)[0]

                    v = mimes.get(h, {'total': 0, 'size': 0})
                    v['total'] += 1
                    v['size'] += record.length
                    mimes[h] = v
                elif mode == 'exts':
                    if filename.find('.') > -1:
                        ext = filename.rsplit('.', 1)[-1]
                    else:
                        ext = ''
                    v = exts.get(ext, {'total': 0, 'size': 0})
                    v['total'] += 1
                    v['size'] += record.length
                    exts[ext] = v

        table = []
        total = ['#total', 0, 0, 100]
        records = mimes if mode == 'mimes' else exts
        for fd in sorted(records.items(),
                         key=lambda item: item[1]['size'],
                         reverse=True):
            total[1] += fd[1]['total']
            total[2] += fd[1]['size']
        total[3] = 100
        for fd in sorted(records.items(),
                         key=lambda item: item[1]['size'],
                         reverse=True):
            record = [
                fd[0], fd[1]['total'], fd[1]['size'],
                fd[1]['size'] * 100.0 / total[2]
            ]
            table.append(record)
        table.append(total)

        headers = [mode, 'files', 'size', 'share']
        print(tabulate(table, headers=headers))
Example #10
    def __next__(self):
        """
        Returns the next entry in the warc IO archive.

        :return: int(), str() - offset, text
        """
        __archive = open(self.source, "rb")
        __archive_stream = ArchiveIterator(__archive)

        wrong_encoding_list = list()

        current_element = 0
        for record in __archive_stream:

            # Extracts the responses
            if record.rec_type == 'response' and record.http_headers.get_header('Content-Type') in self.utf_8:
                soup = BeautifulSoup(record.content_stream(), 'lxml', from_encoding='utf-8')
                for script in soup(["script", "style"]):
                    script.extract()

                try:
                    text = soup.body.get_text(separator=' ')
                    text = "\n".join([line.strip() for line in text.split("\n") if line.strip() != ""])
                except AttributeError:
                    wrong_encoding_list.append(__archive_stream.get_record_offset())
                else:
                    current_element += 1

                    # Prints the current element to command line if it is divisible by 100
                    if current_element % 100 == 0:
                        self.__clear_line()
                        print("{} elements hashed ...".format(current_element))

                    # Checks if the maximum number of elements has passed and stops if so
                    if current_element >= self.max_elements:
                        self.__clear_line()
                        print("{} elements hashed ...".format(current_element))
                        print("Wrong encoding found in offsets {}".format(wrong_encoding_list))
                        return __archive_stream.get_record_offset(), text  # returns offset and text
                    else:
                        yield __archive_stream.get_record_offset(), text  # yields offset and text

        print('Wrong Encoding at offsets {}'.format(wrong_encoding_list))
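
Because this __next__ contains yield statements, calling next() on the object returns a fresh generator rather than the next (offset, text) pair. A sketch of an __iter__-based variant, assuming the same self.source, self.utf_8 and self.max_elements attributes as above:

    def __iter__(self):
        # Yields (offset, text) pairs until self.max_elements responses
        # have been processed.
        with open(self.source, "rb") as archive:
            stream = ArchiveIterator(archive)
            processed = 0
            for record in stream:
                if record.rec_type != 'response':
                    continue
                if record.http_headers.get_header('Content-Type') not in self.utf_8:
                    continue
                soup = BeautifulSoup(record.content_stream(), 'lxml', from_encoding='utf-8')
                for script in soup(["script", "style"]):
                    script.extract()
                if soup.body is None:
                    continue
                text = soup.body.get_text(separator=' ')
                text = "\n".join(line.strip() for line in text.split("\n") if line.strip())
                yield stream.get_record_offset(), text
                processed += 1
                if processed >= self.max_elements:
                    break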
Example #11
    def test_no_brotli(self):
        res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D',
                               headers={'Accept-Encoding': 'gzip, deflate, br'})
        assert '"C": "D"' in res.text

        with open(self.warc_name, 'rb') as fh:
            for record in ArchiveIterator(fh):
                last_record = record

        assert last_record.http_headers['Accept-Encoding'] == 'gzip, deflate'
Example #12
    def test_single_warc_record(self):
        dir_name = os.path.join(self.root_dir, '_test_colls', 'test-dedup', 'archive')
        files = os.listdir(dir_name)
        assert len(files) == 1

        records = []

        with open(os.path.join(dir_name, files[0]), 'rb') as fh:
            for record in ArchiveIterator(fh):
                records.append(record.rec_type)

        # ensure only one response/request pair written
        assert records == ['response', 'request']
Example #13
    def test_read_warcinfo(self):
        self.warc.seek(0)
        metadata = []

        for record in ArchiveIterator(self.warc):
            if record.rec_type == 'warcinfo':
                stream = record.content_stream()
                warcinfo = {}

                while True:
                    line = stream.readline().decode('utf-8')
                    if not line:
                        break

                    parts = line.split(': ', 1)
                    warcinfo[parts[0].strip()] = parts[1].strip()

                assert set(warcinfo.keys()) == {'software', 'format', 'creator', 'isPartOf', 'json-metadata'}
                assert warcinfo['software'].startswith('Webrecorder Platform ')
                assert warcinfo['format'] == 'WARC File Format 1.0'
                assert warcinfo['creator'] == 'test'
                assert warcinfo['isPartOf'] in ('default-collection', 'default-collection/rec-sesh', 'default-collection/another-sesh')

                metadata.append(json.loads(warcinfo['json-metadata']))

        assert len(metadata) == 3
        assert metadata[0]['type'] == 'collection'
        assert set(metadata[0].keys()) == {'created_at', 'updated_at',
                                           'title', 'desc', 'type', 'size',
                                           'lists', 'public', 'public_index'}

        assert metadata[0]['title'] == 'Default Collection'
        assert 'This is your first' in metadata[0]['desc']

        assert metadata[1]['type'] == 'recording'
        assert set(metadata[1].keys()) == {'created_at', 'updated_at', 'recorded_at',
                                           'title', 'desc', 'type', 'size',
                                           'pages'}

        assert metadata[0]['created_at'] <= metadata[0]['updated_at']

        for metadata_item in metadata:
            for field in TestUpload.timestamps.keys():
                if field == 'recorded_at' and metadata_item['type'] == 'collection':
                    continue

                TestUpload.timestamps[field][metadata_item['title']] = RedisUniqueComponent.to_iso_date(metadata_item[field])

        assert set(TestUpload.timestamps['created_at'].keys()) == {'rec-sesh', 'another-sesh', 'Default Collection'}
Example #14
def process_warcs(i_, iterator):
    try:

        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            if uri.startswith('s3://'):
                s3match = s3pattern.match(uri)
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b')
            
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    print('Failed to download from s3', exception)
                    warctemp.close()
                    continue
                warctemp.seek(0)
                stream = warctemp

            elif uri.startswith('file:'):
                uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    print("Failed to read data from local", exception)
                    continue
            else:
                print("Unknown file system")
                continue

            try:
                for record in ArchiveIterator(stream):
                    processed = process_record(record)
                    if processed:
                        yield processed
                    continue
            except ArchiveLoadFailed as exception:
                print('Invalid WARC', exception)
            finally:
                stream.close()
    except Exception as exception:
        print("Failed to process WARC input", exception)
Example #15
def test_user_agent():
    """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""

    found = False
    for warc in glob.glob(
            "/output/.tmp*/collections/capture/archive/*.warc.gz"):
        with open(warc, "rb") as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type == "request":
                    print(record.http_headers)
                    ua = record.http_headers.get_header("User-Agent")
                    if ua:
                        assert "iPhone" in ua
                        assert ua.endswith(" +Zimit [email protected]")
                        found = True

    # should find at least one
    assert found
Example #16
File: process_ccrawl.py Project: m1-1/g1
s3client = boto3.client('s3')

parser = argparse.ArgumentParser()
parser.add_argument('-path', type=str,
                    default='crawl-data/CC-MAIN-2017-13/segments/1490218186353.38/warc/CC-MAIN-20170322212946-00000-ip-10-233-31-227.ec2.internal.warc.gz',
                    help='in path')
parser.add_argument('-bucket_name', type=str,
                    help='out path')
parser.add_argument('-propaganda', action='store_true',
                    help='Download some propaganda instead of real news')
args = parser.parse_args()

archive_date = args.path.split('/')[1]
rest = '_'.join(args.path.split('/')[2:])
out_prefix = 'propaganda-' if args.propaganda else ''

out_key = '{}{}/{}.jsonl'.format(out_prefix, args.path.split('/')[1], rest)

with TemporaryFile(mode='w+b', dir='/home/ubuntu/temp/') as warctemp:
    s3client.download_fileobj('commoncrawl', args.path, warctemp)
    warctemp.seek(0)

    with NamedTemporaryFile(mode='w', dir='/home/ubuntu/temp/') as f:
        for record in tqdm(ArchiveIterator(warctemp, no_record_parse=False)):
            for parsed_record in parse_record(record, propaganda=args.propaganda):
                f.write(json.dumps(parsed_record) + '\n')

        s3client.upload_file(f.name, args.bucket_name, out_key)

    print("I guess I'm done now")
Example #17
import requests
from warcio import ArchiveIterator
import xml.etree.ElementTree as ET

f = open('test_wet.txt', 'w')
f.close()

# wet_url = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/wet/CC-MAIN-20210128134124-20210128164124-00799.warc.wet.gz'
wet_url = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2021-04/segments/1610703495901.0/wet/CC-MAIN-20210115134101-20210115164101-00001.warc.wet.gz'
r = requests.get(wet_url, stream=True)
records = ArchiveIterator(r.raw)

record = next(records)
assert record.rec_type == 'warcinfo'

i = 0
add = ET.Element('add')

with open('file.xml', 'w') as f:
    f.write('<?xml version="1.0"?>')
    ET.ElementTree(add).write(f, encoding="unicode")

while i <= 2:
    record = next(records, 'STOP')
    if record == 'STOP':
        break
    URL = record.rec_headers.get_header('WARC-Target-URI')
    text = record.content_stream().read().decode('utf-8')

    doc = ET.SubElement(add, 'doc')
Example #18
    def verify_warc_and_zim(self, warcfile, zimfile):
        assert os.path.isfile(warcfile)
        assert os.path.isfile(zimfile)

        # autoescape=False to allow injecting html entities from translated text
        env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )

        head_insert = env.get_template("sw_check.html").render().encode(
            "utf-8")

        # track to avoid checking duplicates, which are not written to ZIM
        warc_urls = set()

        zim_fh = Archive(zimfile)
        for record in iter_warc_records([warcfile]):
            url = get_record_url(record)
            if not url:
                continue

            if url in warc_urls:
                continue

            if record.rec_type not in ("response", "resource", "revisit"):
                continue

            # ignore revisit records that are to the same url
            if (record.rec_type == "revisit" and
                    record.rec_headers["WARC-Refers-To-Target-URI"] == url):
                continue

            # parse headers as record, ensure headers match
            url_no_scheme = url.split("//", 2)[1]
            print(url_no_scheme)
            parsed_record = next(
                ArchiveIterator(
                    BytesIO(zim_fh.get_content("H/" + url_no_scheme))))

            assert record.rec_headers == parsed_record.rec_headers
            assert record.http_headers == parsed_record.http_headers

            # ensure payloads match
            try:
                payload = zim_fh.get_item("A/" + url_no_scheme)
            except KeyError:
                payload = None

            if record.rec_type == "revisit" or (
                    record.http_headers
                    and record.http_headers.get("Content-Length") == "0"):
                assert not payload
            else:
                payload_content = payload.content.tobytes()

                # if HTML_RAW, still need to account for the head insert, otherwise should have exact match
                if payload.mimetype == HTML_RAW:
                    assert head_insert in payload_content
                    assert (payload_content.replace(
                        head_insert, b"") == record.buffered_stream.read())
                else:
                    assert payload_content == record.buffered_stream.read()

            warc_urls.add(url)
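Example #19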
def _generate_docs(index, filename, nlp, counter):
    """
    Generate Elasticsearch index docs.

    :param index: Elasticsearch index
    :param filename: WARC file name
    :param nlp: SpaCy language model
    :param counter: Spark counter
    :return: Generator of index doc actions
    """
    email_regex = re.compile(
        r'((?:[a-zA-Z0-9_\-./+]+)@(?:(?:\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|'
        + r'(?:(?:[a-zA-Z0-9\-]+\.)+))(?:[a-zA-Z]{2,}|[0-9]{1,3})(?:\]?))')

    def split_header(header_name, header_dict, split_regex=','):
        headers = [
            re.sub(r'\s+', ' ', h).strip()
            for h in re.split(split_regex, header_dict.get(header_name, ''))
            if h.strip()
        ]
        if not headers:
            return None
        return headers if len(headers) > 1 else headers[0]

    with open(filename, 'rb') as f:
        iterator = ArchiveIterator(f)
        for record in iterator:
            warc_headers = record.rec_headers
            body = record.content_stream().read()
            mail = email.message_from_bytes(body)
            doc_id = warc_headers.get_header('WARC-Record-ID')

            mail_text = '\n'.join(
                util.decode_message_part(p) for p in mail.walk()
                if p.get_content_type() == 'text/plain').strip()
            mail_html = '\n'.join(
                util.decode_message_part(p) for p in mail.walk()
                if p.get_content_type() == 'text/html').strip()

            mail_headers = {h.lower(): str(mail[h]) for h in mail}
            from_header = mail_headers.get('from', '')
            from_email = re.search(email_regex, from_header)

            try:
                d = email.utils.parsedate_to_datetime(mail_headers.get('date'))
                if not d.tzinfo or d.tzinfo.utcoffset(d) is None:
                    d = pytz.utc.localize(d)
                # Convert offset outside +/-18:00 to UTC+0, since they would throw errors in Java's DateTime parser.
                if abs(d.utcoffset().total_seconds()) > 18 * 60 * 60:
                    d = d.astimezone(pytz.utc)
                mail_date = str(d)
            except TypeError:
                mail_date = None

            try:
                lang = nlp(mail_text[:nlp.max_length])._.language['language']
            except Exception as e:
                lang = 'UNKNOWN'
                logger.error(e)

            counter.add(1)

            yield {
                "_index": index,
                "_type": "message",
                "_id": doc_id,
                "_op_type": "update",
                "scripted_upsert": True,
                "script": {
                    "source": """
                        if (ctx._source.containsKey("lang")) {
                            params.doc.remove("lang");
                        }
                        ctx._source.putAll(params.doc);
                    """,
                    "params": {
                        "doc": {
                            "modified":
                            int(time() * 1000),
                            "id_hash":
                            hash(doc_id),
                            "group":
                            os.path.basename(os.path.dirname(filename)),
                            "warc_file":
                            os.path.join(
                                os.path.basename(os.path.dirname(filename)),
                                os.path.basename(filename)),
                            "warc_offset":
                            iterator.offset,
                            "warc_id":
                            doc_id,
                            "news_url":
                            warc_headers.get_header("WARC-News-URL"),
                            "headers": {
                                "date":
                                mail_date,
                                "message_id":
                                mail_headers.get("message-id"),
                                "from":
                                from_header,
                                "from_email":
                                from_email.group(0)
                                if from_email is not None else "",
                                "subject":
                                mail_headers.get("subject"),
                                "to":
                                split_header("to", mail_headers),
                                "cc":
                                split_header("cc", mail_headers),
                                "in_reply_to":
                                split_header("in-reply-to", mail_headers),
                                "references":
                                split_header("references",
                                             mail_headers,
                                             split_regex=r"\s"),
                                "list_id":
                                mail_headers.get("list-id")
                            },
                            "lang":
                            lang,
                            "text_plain":
                            mail_text,
                            "text_html":
                            mail_html
                        }
                    }
                },
                "upsert": {}
            }