Exemplo n.º 1
0
 def write_warcinfo_record(self, warc):
     """Write the initial warcinfo record to ``warc`` and flush it.

     The record body lists ``self.software``, ``self.hostname`` and
     ``self.ip`` as newline-separated ``key=value`` lines, followed by
     ``self.description`` when it is not None.

     Arguments:
     warc -- writable file object the record is written to; it is
             flushed before returning.

     """
     headers = [
         (WarcRecord.TYPE, WarcRecord.WARCINFO),
         # WARC-Date is a UTC timestamp, so take the current time in UTC
         # instead of local wall-clock time.
         (WarcRecord.DATE, warc_datetime_str(datetime.utcnow())),
         (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
     ]
     data = "software=%s\nhostname=%s\nip=%s" % (self.software,
                                                 self.hostname, self.ip)
     if self.description is not None:
         data += "\ndescription=%s" % self.description
     record = WarcRecord(headers=headers,
                         content=("application/warc-fields", data))
     record.write_to(warc, gzip=self.gzip)
     warc.flush()
Exemplo n.º 2
0
def warcinfo_record(warc_filename):
    """Build the warcinfo WarcRecord for ``warc_filename``.

    A warcinfo record has to be written at the beginning of a WARC file.
    The record ID is derived from the record body and its timestamp.
    """
    timestamp = warc_datetime_str(datetime.utcnow())
    fields = (
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf",
    )
    body = "\r\n".join(fields)

    record_headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, b'application/warc-fields'),
        (WarcRecord.ID, warc_uuid(body + timestamp)),
        (WarcRecord.DATE, timestamp),
        (WarcRecord.FILENAME, warc_filename),
    ]
    record_content = (b'application/warc-fields', body + "\r\n")
    return WarcRecord(headers=record_headers,
                      content=record_content,
                      version=b"WARC/1.0")
Exemplo n.º 3
0
    def write_record(self, headers, mime, data):
        """Write one WARC record to a WARC file taken from the pool.

        Arguments:
        headers -- Array of WARC headers.
        mime -- MIME type of the data.
        data -- the data block.

        After writing, the file's name is put back into the pool unless
        ``warc_reached_max_size`` reports that the file is full.

        """
        # Build the record before taking a name out of the pool.
        warc_record = WarcRecord(headers=headers, content=(mime, data))

        logger.debug("Getting WARC: %s" % str(self.warcs.keys()))
        warc_name = self.pool.get()
        logger.debug("Writing to: %s" % warc_name)

        handle = self.warcs[warc_name]
        warc_record.write_to(handle, gzip=self.gzip)
        handle.flush()

        if self.warc_reached_max_size(warc_name):
            # A full file is not returned to the pool.
            return
        logger.debug("%s undersized; adding back to the pool." % warc_name)
        self.pool.put(warc_name)
Exemplo n.º 4
0
def create_metadata_record_bytes(
    url='http://example.com/',
    content_type='image/png',
    date='2016-08-03T10:49:41Z',
    content=b'',
    include_block_digest=True):
    """Build the gzip-compressed bytes of a WARC metadata record.

    Arguments:
    url -- value of the ``WarcRecord.URL`` header (text, UTF-8 encoded).
    content_type -- value of the ``WarcRecord.CONTENT_TYPE`` header (text).
    date -- value of the ``WarcRecord.DATE`` header (text).
    content -- record block, as bytes.
    include_block_digest -- when true, add a ``WarcRecord.BLOCK_DIGEST``
        header of the form ``sha1:<base32 SHA-1 of content>``.

    Returns the gzip-compressed record as bytes.
    """
    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8')
        }
    if include_block_digest:
        block_digest = base64.b32encode(hashlib.sha1(content).digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # XXX - The record is serialized by hand instead of through WarcRecord:
    # the current implementation of WarcRecord.write_to() ignores the
    # Warc-Block-Digest passed in and writes out a hex-encoded SHA256
    # calculated from the content.
    out = io.BytesIO()
    # The context manager closes the gzip stream even if a write fails.
    with GzipFile(fileobj=out, mode='wb') as z:
        z.write(b'WARC/1.0\r\n')
        for k, v in headers.items():
            z.write(b''.join((k, b': ', v, b'\r\n')))
        z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
        z.write(b'\r\n')
        z.write(content)
        z.write(b'\r\n\r\n')
        # Kept so the compressed output matches the previous implementation.
        z.flush()
    return out.getvalue()
Exemplo n.º 5
0
    def _init_file(self):
        """Write the leading warcinfo record through ``write_record``.

        The headers carry a random record ID, the current UTC time, the
        base name of ``self._file_name`` and ``self._main_url``; the body
        is a set of CRLF-joined ``name: value`` fields.
        """
        headers = [
            (WarcRecord.TYPE, WarcRecord.WARCINFO),
            (WarcRecord.ID, WarcRecord.random_warc_uuid()),
            (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())),
            (WarcRecord.FILENAME, os.path.basename(self._file_name)),
            (Warc.MAIN_URL, self._main_url),
        ]

        field_lines = [
            "software: bardo",
            "format: WARC File Format 1.0",
            "conformsTo: " + CONFORMS_TO,
            "robots: unknown",
        ]
        body = "\r\n".join(field_lines)

        record = WarcRecord(
            headers=headers,
            content=("application/warc-fields", body),
        )
        self.write_record(record)
Exemplo n.º 6
0
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return a resource WarcRecord for it.

    Arguments:
    tweet_json -- JSON text of one tweet.

    Returns the WarcRecord, or None when the input is skipped: a message
    without a 'user' key (deleted tweet), or input that cannot be parsed.
    Parse failures, including a missing or non-numeric 'timestamp_ms',
    are logged rather than raised.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        if 'user' not in tweet:
            return None
        url = "https://twitter.com/%s/status/%s" % (
            tweet['user']['screen_name'], tweet['id'])
        # Read the timestamp inside the guarded region so that a tweet
        # without a usable 'timestamp_ms' is skipped like any other
        # malformed input instead of raising to the caller.
        created_at = datetime.utcfromtimestamp(
            float(tweet['timestamp_ms']) / 1000.0)
    except Exception:
        # Deliberately broad: any malformed input is logged and skipped.
        logging.error('error in tweet_warc_record', exc_info=1)
        return None

    warc_date = warc_datetime_str(created_at)
    return WarcRecord(headers=[(WarcRecord.TYPE, WarcRecord.RESOURCE),
                               (WarcRecord.CONTENT_TYPE, b'application/json'),
                               (WarcRecord.ID, warc_uuid(url + warc_date)),
                               (WarcRecord.URL, url),
                               (WarcRecord.DATE, warc_date)],
                      content=(b'application/json', tweet_json + "\r\n"),
                      version=b"WARC/1.0")
Exemplo n.º 7
0
def _convert_arc_records(fh, out, use_gzip):
    """Write every record read from the ARC archive ``fh`` to ``out`` as WARC/1.0.

    A 'filedesc' record becomes a warcinfo record followed by a metadata
    record holding the raw ARC record; every other record becomes a
    response record that refers back to the most recent warcinfo ID.
    """
    version = "WARC/1.0"
    warcinfo_id = None
    for record in fh:
        warc_id = make_warc_uuid(record.url + record.date)
        headers = [
            (WarcRecord.ID, warc_id),
        ]
        if record.date:
            date = datetime.datetime.strptime(record.date, '%Y%m%d%H%M%S')
            headers.append((WarcRecord.DATE, warc_datetime_str(date)))

        if record.type == 'filedesc':
            warcinfo_id = warc_id

            warcinfo_headers = list(headers)
            # NOTE(review): the [11:] slice presumably strips the
            # 'filedesc://' prefix (11 characters) -- confirm.
            warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
            warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))

            warcinfo_content = ('application/warc-fields',
                                'software: hanzo.arc2warc\r\n')

            warcrecord = WarcRecord(headers=warcinfo_headers,
                                    content=warcinfo_content,
                                    version=version)
            warcrecord.write_to(out, gzip=use_gzip)

            warc_id = make_warc_uuid(record.url + record.date + "-meta")
            warcmeta_headers = [
                (WarcRecord.TYPE, WarcRecord.METADATA),
                (WarcRecord.CONCURRENT_TO, warcinfo_id),
                (WarcRecord.ID, warc_id),
                (WarcRecord.URL, record.url),
                (WarcRecord.DATE, warcrecord.date),
                (WarcRecord.WARCINFO_ID, warcinfo_id),
            ]
            warcmeta_content = ('application/arc', record.raw())

            warcrecord = WarcRecord(headers=warcmeta_headers,
                                    content=warcmeta_content,
                                    version=version)
            warcrecord.write_to(out, gzip=use_gzip)
        else:
            content_type, content = record.content
            if record.url.startswith('http'):
                # don't promote content-types for http urls,
                # they contain headers in the body.
                content_type = "application/http;msgtype=response"

            headers.extend([
                (WarcRecord.TYPE, WarcRecord.RESPONSE),
                (WarcRecord.URL, record.url),
                (WarcRecord.WARCINFO_ID, warcinfo_id),
            ])

            warcrecord = WarcRecord(headers=headers,
                                    content=(content_type, content),
                                    version=version)

            warcrecord.write_to(out, gzip=use_gzip)


def main(argv):
    """Convert the ARC files named on the command line to WARC records.

    Arguments:
    argv -- full argument vector; argv[0] is skipped when parsing.

    Records are written to ``options.output`` (opened in binary append
    mode, with gzip forced on for a '.gz' name) or to stdout when no
    output is given.  Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate the arguments before opening the output, so that a usage
    # error does not leave a newly created output file behind.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                _convert_arc_records(fh, out, options.gzip)
            finally:
                # Close the archive even when conversion fails part-way.
                fh.close()
    finally:
        # Only close what this function opened; stdout stays open.
        if out is not sys.stdout:
            out.close()

    return 0