示例#1
0
    def process(self, infn, outfn, delete=False):
        """Process a WARC at a given infn, producing plain text via Tika
        where suitable, and writing a new WARC file to outfn."""
        # These are objects of type RecordStream (or a subclass), unlike with
        # the IA library
        inwf = WarcRecord.open_archive(infn, mode='rb')
        outf = open(outfn, 'wb')
        self._openfiles.add(outfn)
#        try:
#            fcntl.lockf(inwf.file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            fcntl.lockf(outf, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            # Get locks on both files
#        except IOError:
#            print ("Unable to get file locks processing", infn, "so will "
#                   "try later")
#            return False
        print "Processing", infn
        for record in inwf:
            try:
                if record.type == WarcRecord.WARCINFO:
                    self.add_description_to_warcinfo(record)
                elif (record.type == WarcRecord.RESPONSE
                      or record.type == WarcRecord.RESOURCE):
                    if record.get_header('WARC-Segment-Number'):
                        raise WarcTikaException("Segmented response/resource "
                                                "record. Not processing.")
                    else:
                        record = self.generate_new_record(record)
                # If 'metadata', 'request', 'revisit', 'continuation',
                # 'conversion' or something exotic, we can't do anything more
                # interesting than immediately re-writing it to the new file

                newrecord = WarcRecord(headers=record.headers,
                        content=record.content)

            except Exception as e:
                print ("Warning: WARCTikaProcessor.process() failed on "+
                       record.url+": "+str(e.message)+
                       "\n\tWriting old record to new WARC.")
                traceback.print_exc()
                newrecord = record
            finally:
                newrecord.write_to(outf, gzip=outfn.endswith('.gz'))
        print "****Finished file. Tika status codes:", self.tikacodes.items()
        self.tikacodes = defaultdict(int)
        inwf.close()
        outf.close()
        self._openfiles.remove(outfn)

        # Check that the file has written correctly - for an excess of caution
        validrc = os.system("warcvalid "+outfn)

        if validrc:
            print "New file", outfn, "appears not to be valid. Deleting it." 
            os.unlink(outfn)
        if delete and not validrc:
            print "Deleting", infn
            os.unlink(infn)
        return True
示例#2
0
 def write_warcinfo_record(self, warc):
     """Writes the initial warcinfo record.

     The record body is an "application/warc-fields" block listing
     self.software, self.hostname and self.ip, plus self.description when
     that is not None. The record is written to `warc` (gzipped according
     to self.gzip) and `warc` is flushed afterwards.

     Arguments:
     warc -- writable file object for the WARC being started.

     """
     # WARC-Date is defined as a UTC timestamp, so take the current time in
     # UTC rather than naive local time.
     # NOTE(review): assumes warc_datetime_str() labels its argument as UTC
     # without converting it -- confirm against its definition.
     headers = [
         (WarcRecord.TYPE, WarcRecord.WARCINFO),
         (WarcRecord.DATE, warc_datetime_str(datetime.utcnow())),
         (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
     ]
     data = "software=%s\nhostname=%s\nip=%s" % (self.software, self.hostname, self.ip)
     if self.description is not None:
         data += "\ndescription=%s" % self.description
     record = WarcRecord(headers=headers, content=("application/warc-fields", data))
     record.write_to(warc, gzip=self.gzip)
     # Push the record out immediately rather than leaving it buffered.
     warc.flush()
示例#3
0
 def write_warcinfo_record(self, warc):
     """Writes the initial warcinfo record.

     Builds an "application/warc-fields" body from self.software,
     self.hostname, self.ip and (when not None) self.description, writes
     the record to `warc` using the self.gzip setting, then flushes `warc`.

     Arguments:
     warc -- writable file object for the WARC being started.

     """
     # The WARC-Date header is specified in UTC; a naive local timestamp
     # would be wrong on any host not running in UTC.
     # NOTE(review): assumes warc_datetime_str() formats the value as-is and
     # marks it as UTC -- confirm against its definition.
     created = datetime.utcnow()
     headers = [
         (WarcRecord.TYPE, WarcRecord.WARCINFO),
         (WarcRecord.DATE, warc_datetime_str(created)),
         (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
     ]
     data = "software=%s\nhostname=%s\nip=%s" % (self.software,
                                                 self.hostname, self.ip)
     if self.description is not None:
         data += "\ndescription=%s" % self.description
     record = WarcRecord(headers=headers,
                         content=("application/warc-fields", data))
     record.write_to(warc, gzip=self.gzip)
     # Make sure the record reaches the file before anything else is written.
     warc.flush()
示例#4
0
    def write_record(self, headers, mime, data):
        """Writes a WARC record to one of the pooled WARC files.

        A file name is taken from self.pool, the record is written to the
        matching handle in self.warcs and the handle is flushed. The name is
        handed back to the pool only while warc_reached_max_size() reports
        false for it.

        Arguments:
        headers -- Array of WARC headers.
        mime -- MIME type of the data.
        data -- the data block.

        """
        # Build the record first so a construction error cannot strand a
        # name that has already been taken out of the pool.
        warc_record = WarcRecord(headers=headers, content=(mime, data))

        logger.debug("Getting WARC: %s" % str(self.warcs.keys()))
        warc_name = self.pool.get()
        logger.debug("Writing to: %s" % warc_name)

        handle = self.warcs[warc_name]
        warc_record.write_to(handle, gzip=self.gzip)
        handle.flush()

        if self.warc_reached_max_size(warc_name):
            # Full files are deliberately not returned to the pool.
            return
        logger.debug("%s undersized; adding back to the pool." % warc_name)
        self.pool.put(warc_name)
示例#5
0
def create_metadata_record_bytes(
    url='http://example.com/',
    content_type='image/png',
    date='2016-08-03T10:49:41Z',
    content=b'',
    include_block_digest=True):
    """Build WARC metadata record bits.

    Returns the gzip-compressed bytes of a single, hand-assembled WARC/1.0
    metadata record.

    Arguments:
    url -- text value for the URL header; UTF-8 encoded before writing.
    content_type -- text value for the content-type header; UTF-8 encoded.
    date -- text value for the date header; UTF-8 encoded.
    content -- record payload as bytes.
    include_block_digest -- when true, add a block-digest header holding
        b'sha1:' followed by the base32-encoded SHA-1 of `content`.
    """
    # Header names and values are joined with bytes below, so the WarcRecord
    # header constants are expected to be byte strings.
    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8')
        }
    if include_block_digest:
        block_digest = base64.b32encode(hashlib.sha1(content).digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # XXX - I wish I could use WarcRecord. Current implementation of
    # WarcRecord.write_to() ignores Warc-Block-Digest passed and writes out
    # hex-encoded SHA256 calculated from the content.
    # The record is therefore serialized by hand.
    out = io.BytesIO()
    # `with` closes the gzip stream (writing its trailer) even if a write
    # fails; `out` itself stays open so its contents can be read back.
    with GzipFile(fileobj=out, mode='wb') as z:
        z.write(b'WARC/1.0\r\n')
        for k, v in headers.items():
            z.write(b''.join((k, b': ', v, b'\r\n')))
        z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
        z.write(b'\r\n')
        z.write(content)
        z.write(b'\r\n\r\n')
        # Explicit flush retained so the compressed byte stream is laid out
        # exactly as before.
        z.flush()
    return out.getvalue()
示例#6
0
    def write_record(self, headers, mime, data):
        """Writes a WARC record.

        Takes a WARC name from self.pool, writes the record to that name's
        handle in self.warcs, flushes it, and puts the name back into the
        pool unless warc_reached_max_size() is true for it.

        Arguments:
        headers -- Array of WARC headers.
        mime -- MIME type of the data.
        data -- the data block.

        """
        content = (mime, data)
        new_record = WarcRecord(headers=headers, content=content)

        logger.debug("Getting WARC: %s" % str(self.warcs.keys()))
        target = self.pool.get()
        logger.debug("Writing to: %s" % target)

        out_fh = self.warcs[target]
        new_record.write_to(out_fh, gzip=self.gzip)
        out_fh.flush()

        # A WARC that has reached its maximum size is left out of the pool.
        still_has_room = not self.warc_reached_max_size(target)
        if still_has_room:
            logger.debug("%s undersized; adding back to the pool." % target)
            self.pool.put(target)
示例#7
0
def main(argv):
    """Convert the ARC files named on the command line to WARC records.

    Options and input file names are parsed from argv[1:] with the
    module-level `parser`. Records are written to options.output when it is
    set (gzip is switched on for names ending in '.gz'), otherwise to
    sys.stdout.

    For each input file, a 'filedesc' record yields a warcinfo record
    followed by a metadata record containing the raw ARC record; every other
    record is written as a WARC response record that refers to the id of the
    last warcinfo record seen in that file (None if there was none).

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Reject a bad command line before touching the output path: opening it
    # first would truncate an existing file even though nothing is converted.
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url+record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,
                                                          '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # Drops the first 11 characters of the URL, which is
                        # the length of a 'filedesc://' prefix.
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))

                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC file description as a
                        # metadata record tied to the warcinfo record.
                        warc_id = make_warc_uuid(
                            record.url+record.date+"-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)

                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Release the archive handle even if a record fails.
                fh.close()
    finally:
        # Only close what this function opened; sys.stdout is left alone.
        if out is not sys.stdout:
            out.close()

    return 0
示例#8
0
def main(argv):
    """Convert the ARC files named on the command line to WARC records.

    Options and input file names are parsed from argv[1:] with the
    module-level `parser`. Records are appended to options.output when it is
    set (gzip is switched on for names ending in '.gz'), otherwise they are
    written to sys.stdout.

    For each input file, a 'filedesc' record yields a warcinfo record
    followed by a metadata record containing the raw ARC record; every other
    record is written as a WARC response record that refers to the id of the
    last warcinfo record seen in that file (None if there was none).

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate the command line before opening the output, so a usage error
    # does not leave a newly created, empty output file behind.
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url + record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,
                                                          '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # Drops the first 11 characters of the URL, which is
                        # the length of a 'filedesc://' prefix.
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))

                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC file description as a
                        # metadata record tied to the warcinfo record.
                        warc_id = make_warc_uuid(
                            record.url + record.date + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)

                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Release the archive handle even if a record fails.
                fh.close()
    finally:
        # Only close what this function opened; sys.stdout is left alone.
        if out is not sys.stdout:
            out.close()

    return 0
示例#9
0
    webbase_header = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>"
    content = ""
    headers = [("WARC-Filename", filename), ("WARC-Type", "response")]
    finished_headers = False
    first_line = fh.readline()
    assert first_line.startswith(webbase_header)
    for line in fh:
        if line.startswith(webbase_header):
            yield headers, ("text/html", content)
            content = ""
        else:
            if finished_headers:
                content += line
            elif "" == line.strip():
                finished_headers = True
            else:
                add_header(headers, line)


# Write every record yielded by get_wb_record("2pages") to out.warc.gz,
# printing a running index and a dump of each record as it goes.
i = 0
# Binary mode: the gzipped records are bytes, and text mode would corrupt
# them on platforms that translate line endings. `with` also guarantees the
# file is flushed and closed once the loop ends or fails.
with open("out.warc.gz", "wb") as warc_out:
    for headers, content in get_wb_record("2pages"):
        print(i)
        i += 1
        record = WarcRecord(headers=headers, content=content)
        record.write_to(warc_out, gzip=True)
        record.dump()
        print("_" * 80)