def process(self, infn, outfn, delete=False):
    """Process a WARC at a given infn, producing plain text via Tika where
    suitable, and writing a new WARC file to outfn.

    Arguments:
    infn -- path of the WARC file to read.
    outfn -- path of the WARC file to write; records are gzipped when the
             name ends in '.gz'.
    delete -- when true, remove infn once the new file has validated.

    Every record of the input is written to the output. warcinfo records
    are passed to add_description_to_warcinfo(), response/resource records
    are replaced by the result of generate_new_record(), and all other
    record types are copied across. If handling a record fails, a warning
    and traceback are printed and the record is written as it stands.

    After writing, the output is checked with the external ``warcvalid``
    command; an output that does not validate is deleted, and infn is then
    kept regardless of ``delete``.

    Returns True.
    """
    # Local import: the rest of the file is not visible from this block.
    import subprocess

    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb')
    try:
        outf = open(outfn, 'wb')
    except Exception:
        # Don't leak the input handle if the output can't be created.
        inwf.close()
        raise
    self._openfiles.add(outfn)
    use_gzip = outfn.endswith('.gz')

    print("Processing %s" % infn)
    try:
        for record in inwf:
            try:
                if record.type == WarcRecord.WARCINFO:
                    self.add_description_to_warcinfo(record)
                elif record.type in (WarcRecord.RESPONSE,
                                     WarcRecord.RESOURCE):
                    if record.get_header('WARC-Segment-Number'):
                        raise WarcTikaException(
                            "Segmented response/resource "
                            "record. Not processing.")
                    record = self.generate_new_record(record)
                # If 'metadata', 'request', 'revisit', 'continuation',
                # 'conversion' or something exotic, we can't do anything
                # more interesting than immediately re-writing it to the
                # new file
                newrecord = WarcRecord(headers=record.headers,
                                       content=record.content)
            except Exception as e:
                # %-formatting copes with a record whose url is None,
                # which string concatenation would not.
                print("Warning: WARCTikaProcessor.process() failed on %s: %s"
                      "\n\tWriting old record to new WARC."
                      % (record.url, e))
                traceback.print_exc()
                newrecord = record
            # Written outside any ``finally`` so that an interrupt
            # (KeyboardInterrupt/SystemExit) does not write a stale or
            # unbound ``newrecord``.
            newrecord.write_to(outf, gzip=use_gzip)
    finally:
        # Release both files and the bookkeeping entry even on failure.
        inwf.close()
        outf.close()
        self._openfiles.remove(outfn)

    print("****Finished file. Tika status codes: %s"
          % (list(self.tikacodes.items()),))
    self.tikacodes = defaultdict(int)

    # Check that the file has written correctly - for an excess of caution.
    # An argument list avoids handing the filename to a shell.
    try:
        validrc = subprocess.call(["warcvalid", outfn])
    except OSError:
        # Validator could not be started: treat as a failed validation,
        # as a non-zero os.system() status was treated before.
        validrc = 127
    if validrc:
        print("New file %s appears not to be valid. Deleting it." % outfn)
        os.unlink(outfn)
    if delete and not validrc:
        print("Deleting %s" % infn)
        os.unlink(infn)
    return True
def write_warcinfo_record(self, warc):
    """Writes the initial warcinfo record.

    Arguments:
    warc -- open file object the record is written to; it is flushed
            afterwards.

    The record body lists self.software, self.hostname and self.ip, plus
    self.description when that is not None.
    """
    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        # WARC-Date is specified as a UTC timestamp, so take the current
        # time in UTC rather than local time.
        # NOTE(review): assumes warc_datetime_str() formats the naive
        # datetime as given, without a timezone conversion - confirm.
        (WarcRecord.DATE, warc_datetime_str(datetime.utcnow())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
    ]
    data = "software=%s\nhostname=%s\nip=%s" % (
        self.software, self.hostname, self.ip)
    if self.description is not None:
        data += "\ndescription=%s" % self.description
    record = WarcRecord(headers=headers,
                        content=("application/warc-fields", data))
    record.write_to(warc, gzip=self.gzip)
    warc.flush()
def write_record(self, headers, mime, data):
    """Build a WARC record and write it to a WARC taken from the pool.

    Arguments:
    headers -- Array of WARC headers.
    mime -- MIME type of the data.
    data -- the data block.

    A name is taken from self.pool, the record is written to the matching
    handle in self.warcs, and the handle is flushed. The name is put back
    into the pool unless warc_reached_max_size() reports true for it.
    """
    warc_record = WarcRecord(headers=headers, content=(mime, data))

    logger.debug("Getting WARC: %s" % str(self.warcs.keys()))
    warc_name = self.pool.get()
    logger.debug("Writing to: %s" % warc_name)

    handle = self.warcs[warc_name]
    warc_record.write_to(handle, gzip=self.gzip)
    handle.flush()

    if self.warc_reached_max_size(warc_name):
        # Not put back into the pool here.
        return
    logger.debug("%s undersized; adding back to the pool." % warc_name)
    self.pool.put(warc_name)
def create_metadata_record_bytes(
        url='http://example.com/',
        content_type='image/png',
        date='2016-08-03T10:49:41Z',
        content=b'',
        include_block_digest=True):
    """Build WARC metadata record bits.

    Arguments:
    url -- value for the URL header (text, encoded as UTF-8).
    content_type -- value for the Content-Type header (text, UTF-8).
    date -- value for the date header (text, UTF-8).
    content -- record body as bytes.
    include_block_digest -- when true, add a block-digest header holding
        ``sha1:`` followed by the base32-encoded SHA-1 of ``content``.

    Returns the bytes of a single gzip member containing the record: a
    ``WARC/1.0`` line, the headers, ``Content-Length``, a blank line, the
    content, and a trailing ``\\r\\n\\r\\n``.
    """
    headers = {
        WarcRecord.TYPE: WarcRecord.METADATA,
        WarcRecord.URL: url.encode('utf-8'),
        WarcRecord.CONTENT_TYPE: content_type.encode('utf-8'),
        WarcRecord.DATE: date.encode('utf-8'),
    }
    if include_block_digest:
        block_digest = base64.b32encode(hashlib.sha1(content).digest())
        headers[WarcRecord.BLOCK_DIGEST] = b'sha1:' + block_digest

    # XXX - I wish I could use WarcRecord. Current implementation of
    # WarcRecord.write_to() ignores Warc-Block-Digest passed and writes out
    # hex-encoded SHA256 calculated from the content. The record is
    # therefore serialized by hand.
    out = io.BytesIO()
    # ``with`` closes the gzip stream even if a write raises.
    with GzipFile(fileobj=out, mode='wb') as z:
        z.write(b'WARC/1.0\r\n')
        for k, v in headers.items():
            z.write(b''.join((k, b': ', v, b'\r\n')))
        z.write('Content-Length: {}\r\n'.format(len(content)).encode('ascii'))
        z.write(b'\r\n')
        z.write(content)
        z.write(b'\r\n\r\n')
        # Explicit flush retained from the original so the emitted
        # compressed stream is unchanged.
        z.flush()
    return out.getvalue()
def main(argv):
    """Convert the ARC files named on the command line into WARC records.

    Arguments:
    argv -- full argument vector; argv[0] is skipped and the rest is
            parsed with the module-level ``parser``.

    Records are written to ``options.output`` (opened for binary writing,
    replacing any existing file) or to standard output when no output is
    given. An output name ending in '.gz' switches gzip on.

    For each ARC 'filedesc' record a warcinfo record is written, followed
    by a metadata record carrying the raw ARC record. Every other ARC
    record becomes a WARC response record that refers to the most recent
    warcinfo id.

    Returns 0. Exits through parser.error() when no input is named.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Check usage before opening the output, so that a usage error does
    # not create or truncate the output file.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    # A missing date must not break id generation; the
                    # date header is only added when a date is present.
                    date_text = record.date or ""
                    warc_id = make_warc_uuid(record.url + date_text)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(
                            record.date, '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # The first 11 characters of the url are dropped
                        # to obtain the filename.
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))
                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')
                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC filedesc record as metadata
                        # attached to the warcinfo record just written.
                        warc_id = make_warc_uuid(
                            record.url + date_text + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())
                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])
                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                fh.close()
    finally:
        # Only close what this function opened; leave stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0
def main(argv):
    """Convert the ARC files named on the command line into WARC records.

    Arguments:
    argv -- full argument vector; argv[0] is skipped and the rest is
            parsed with the module-level ``parser``.

    Records are appended to ``options.output`` (opened in binary append
    mode) or written to standard output when no output is given. An
    output name ending in '.gz' switches gzip on.

    For each ARC 'filedesc' record a warcinfo record is written, followed
    by a metadata record carrying the raw ARC record. Every other ARC
    record becomes a WARC response record that refers to the most recent
    warcinfo id.

    Returns 0. Exits through parser.error() when no input is named.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Check usage before opening the output, so that a usage error does
    # not create an output file as a side effect.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    # A missing date must not break id generation; the
                    # date header is only added when a date is present.
                    date_text = record.date or ""
                    warc_id = make_warc_uuid(record.url + date_text)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(
                            record.date, '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # The first 11 characters of the url are dropped
                        # to obtain the filename.
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))
                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')
                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC filedesc record as metadata
                        # attached to the warcinfo record just written.
                        warc_id = make_warc_uuid(
                            record.url + date_text + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())
                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])
                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                fh.close()
    finally:
        # Only close what this function opened; leave stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0
webbase_header = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>" content = "" headers = [("WARC-Filename", filename), ("WARC-Type", "response")] finished_headers = False first_line = fh.readline() assert first_line.startswith(webbase_header) for line in fh: if line.startswith(webbase_header): yield headers, ("text/html", content) content = "" else: if finished_headers: content += line elif "" == line.strip(): finished_headers = True else: add_header(headers, line) i = 0 warc_out = open("out.warc.gz", "w") for headers, content in get_wb_record("2pages"): print i i += 1 # print headers # print content record = WarcRecord(headers=headers, content=content) record.write_to(warc_out, gzip=True) record.dump() print "_" * 80