def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")
    with open(options.output, "wb") as out:
        if len(input_files) < 1:
            fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb")
            try:
                previous_record = None
                for record in fh:
                    process(record, previous_record, out, options)
                    previous_record = record
            finally:
                fh.close()
        else:
            for name in input_files:
                previous_record = None
                fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
                try:
                    for record in fh:
                        process(record, previous_record, out, options)
                        previous_record = record
                finally:
                    fh.close()
    return 0
def dump_payload_from_file(filename, offset=None, length=None, output_filename="/tmp/warc_dump"):
    # The original contained leftover debug prints and a second, leaked
    # open_archive() call; both removed here.
    with closing(WarcRecord.open_archive(filename=filename, gzip="auto",
                                         offset=offset, length=length)) as fh:
        return dump_payload_from_stream(fh)
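# For reference, a minimal sketch of the two ways the snippets in this
# collection open an archive: by filename (seekable, so gzip can be guessed)
# or by an already-open file handle such as sys.stdin. The path below is a
# placeholder, not taken from any of the original code.
import sys
from hanzo.warctools import WarcRecord

fh = WarcRecord.open_archive("example.warc.gz", gzip="auto")      # by filename
# fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)  # by handle
for record in fh:
    print(record.type, record.url)
fh.close()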
def __init__(self, url_or_io, bytes_range=None):
    if isinstance(url_or_io, str):
        self.archive = WarcRecord.open_archive(
            file_handle=response_as_file(url_or_io, bytes_range))
    elif isinstance(url_or_io, IterContentAsFile):
        self.archive = WarcRecord.open_archive(file_handle=url_or_io)
    else:
        self.archive = WarcRecord.open_archive(
            file_handle=stream_as_file("upload.warc.gz", url_or_io))
    self.path_types = {}
    self.files = {}
    self.errors = []
    self.offset = 0
    self.buffer = []
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no input warc file(s)")
    total = 0
    # print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in expand_files(input_files):
        fh = WarcRecord.open_archive(name, gzip="auto")
        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
                total += record.content_length
            elif errors:
                pass  # ignore
            else:
                pass  # no errors at tail
        fh.close()
    print total
    return 0
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()
    collisions = 0
    if len(args) < 1:
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir,
                                         options.default_name, log_file, options.wayback)
    else:
        for filename in args:
            log_file = os.path.join(output_dir, os.path.basename(filename) + '.index.txt') \
                if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions += unpack_records(filename, fh, output_dir,
                                                 options.default_name, log_file, options.wayback)
            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
def build_from_warcs(self, warcs):
    for warc in warcs:
        fh = WarcRecord.open_archive(warc, gzip="auto")
        try:
            for (offset, record, errors) in fh.read_records(limit=None):
                if record:
                    if record.type == WarcRecord.METADATA:
                        for line in StringIO(record.content[1]):
                            if line.startswith("outlink: "):
                                outlink = line.strip().split()[1]
                                self.inverted_index[outlink] = record.url
                    if record.type == WarcRecord.RESPONSE:
                        f = FileHTTPResponse(record.content_file)
                        f.begin()
                        if f.status == 200 and record.url.startswith("http"):
                            self.crawled_uris.append((record.url,
                                                      f.getheader("content-type"),
                                                      record.date,
                                                      record.content_length))
                elif errors:
                    pass
                else:
                    pass
        finally:
            fh.close()
def process(self, infn, outfn, delete=False):
    """Process a WARC at a given infn, producing plain text via Tika where
    suitable, and writing a new WARC file to outfn."""
    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb')
    outf = open(outfn, 'wb')
    self._openfiles.add(outfn)
    # try:
    #     fcntl.lockf(inwf.file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
    #     fcntl.lockf(outf, fcntl.LOCK_EX | fcntl.LOCK_NB)
    #     # Get locks on both files
    # except IOError:
    #     print ("Unable to get file locks processing", infn, "so will "
    #            "try later")
    #     return False
    print "Processing", infn
    for record in inwf:
        try:
            if record.type == WarcRecord.WARCINFO:
                self.add_description_to_warcinfo(record)
            elif (record.type == WarcRecord.RESPONSE
                  or record.type == WarcRecord.RESOURCE):
                if record.get_header('WARC-Segment-Number'):
                    raise WarcTikaException("Segmented response/resource "
                                            "record. Not processing.")
                else:
                    record = self.generate_new_record(record)
            # If 'metadata', 'request', 'revisit', 'continuation',
            # 'conversion' or something exotic, we can't do anything more
            # interesting than immediately re-writing it to the new file
            newrecord = WarcRecord(headers=record.headers,
                                   content=record.content)
        except Exception as e:
            print ("Warning: WARCTikaProcessor.process() failed on " +
                   record.url + ": " + str(e.message) +
                   "\n\tWriting old record to new WARC.")
            traceback.print_exc()
            newrecord = record
        finally:
            newrecord.write_to(outf, gzip=outfn.endswith('.gz'))
    print "****Finished file. Tika status codes:", self.tikacodes.items()
    self.tikacodes = defaultdict(int)
    inwf.close()
    outf.close()
    self._openfiles.remove(outfn)
    # Check that the file has written correctly - for an excess of caution
    validrc = os.system("warcvalid " + outfn)
    if validrc:
        print "New file", outfn, "appears not to be valid. Deleting it."
        os.unlink(outfn)
    if delete and not validrc:
        print "Deleting", infn
        os.unlink(infn)
    return True
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")
    if options.json_hrefs_file and not options.decode_http:
        raise RuntimeError("--json-hrefs-file requires --decode_http")
    if options.json_hrefs_file:
        found_hrefs = set()
    else:
        found_hrefs = None
    with open(options.output, "wb") as out:
        if len(input_files) < 1:
            fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb")
            try:
                previous_record = None
                for record in fh:
                    process(record, previous_record, out, options, found_hrefs)
                    previous_record = record
            finally:
                fh.close()
        else:
            for name in input_files:
                previous_record = None
                fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
                try:
                    for record in fh:
                        process(record, previous_record, out, options, found_hrefs)
                        previous_record = record
                finally:
                    fh.close()
    if found_hrefs is not None:
        fh = bz2.BZ2File(options.json_hrefs_file, "wb")
        try:
            fh.write("\n".join(sorted(found_hrefs)) + "\n")
        finally:
            fh.close()
    return 0
def run(self):
    path = self.path
    idx_file = "%s.idx" % path
    records = None
    if os.path.exists(idx_file) and os.path.getmtime(idx_file) >= os.path.getmtime(path):
        print "Loading " + path + " from cache"
        self.status = "loading-cache"
        with open(idx_file, "rb") as f:
            def update_progress():
                self.bytes_read = f.tell()
            f_pr = IOWithProgress(f, update_progress)
            data = cPickle.load(f_pr)
            self.bytes_read = self.bytes_total
        if "version" in data and data["version"] == 1:
            records = data["records"]
    if not records:
        self.status = "indexing"
        self.bytes_total = os.path.getsize(self.path)
        print "Loading " + path
        records = OrderedDict()
        warc = WarcRecord.open_archive(path, gzip="auto")
        for (offset, record, errors) in warc.read_records(limit=None):
            if self.cancel:
                raise Exception("Loading " + path + " canceled")
            if record and \
               re.sub(r"[^a-z;=/]+", "", record.type) == WarcRecord.RESPONSE and \
               re.sub(r"[^a-z;=/]+", "", record.content[0]) == ResponseMessage.CONTENT_TYPE:
                http_response = parse_http_response(record)
                records[canonicalize_url(record.url)] = {
                    "offset": offset,
                    "code": http_response[0],
                    "type": http_response[1]
                }
            self.bytes_read = offset
        warc.close()
        with open(idx_file, "wb") as f:
            cPickle.dump({"version": 1, "records": records}, f)
    if self.cancel:
        raise Exception("Loading " + path + " canceled")
    print "Indexed " + path + ". Found " + str(len(records)) + " URLs"
    self.status = "indexed"
    self.records = records
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for record in fh:
            process(record, out, options)
    else:
        for name in input_files:
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)
            fh.close()
    return 0
def _load_warc_info(self):
    self._warc_file_read.seek(0)
    wrs = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                  gzip="record")
    # read_records() is a generator yielding (offset, record, errors)
    # tuples, so materialise the first item and unpack the record; the
    # original indexed the generator directly, which raises TypeError.
    temp = list(wrs.read_records(limit=1))
    if not temp or temp[0][1] is None or temp[0][1].type != WarcRecord.WARCINFO:
        raise ValueError("WARC info not found")
    return temp[0][1]
def readRecord(filename, offset):
    """
    :type filename: str
    :type offset: int
    :rtype: WarcRecord
    """
    w = WarcRecord.open_archive(filename, offset=offset)
    g = w.read_records(limit=1)
    r = g.next()[1]
    w.close()
    return r
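# A hypothetical usage of readRecord above, assuming some index (e.g. CDX)
# has already supplied the byte offset; the filename and offset are
# placeholders, not values from the original code.
rec = readRecord("crawl-00000.warc.gz", 0)
print rec.type, rec.url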
def loadWarcFileRecords(name):
    """ Generator function for records from the file 'name' """
    f = WarcRecord.open_archive(name, gzip="auto")
    for (offset, r, err) in f.read_records(limit=None):
        if err:
            print "warc errors at %s:%d" % (name, offset or 0)
            for e in err:
                print '\t', e
        if r:
            yield (r, offset)
    f.close()
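# A short sketch of consuming the generator above; 'example.warc.gz' is a
# placeholder, not a filename from the original code.
for r, offset in loadWarcFileRecords("example.warc.gz"):
    print "%10d %s %s" % (offset or 0, r.type, r.url)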
def find_record(self, url):
    self._warc_file_read.seek(0)
    wrs = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                  gzip="record")
    for (offset, record, errors) in wrs.read_records(limit=None):
        if record and (record.type == WarcRecord.RESPONSE) \
                and (record.content[0] == ResponseMessage.CONTENT_TYPE) \
                and (record.url == url):
            return record
    return None
def warc_record_for_uri(self, uri):
    found = False
    for (path, uris) in self.indices.iteritems():
        if uri in uris:
            warc = WarcRecord.open_archive(path, gzip="auto")
            warc.seek(uris[uri]["offset"])
            # read_records() yields (offset, record, errors) tuples; the
            # original passed the byte offset as the boolean 'offsets'
            # flag and yielded the whole tuple instead of the record.
            for (offset, record, errors) in warc.read_records(limit=1):
                found = True
                yield record
            warc.close()
    if not found:
        yield None
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    try:  # python3
        out = sys.stdout.buffer
    except AttributeError:  # python2
        out = sys.stdout
    if len(input_files) < 1:
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for record in fh:
            process(record, out, options)
    else:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)
            fh.close()
    return 0
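# WARC records must be written as bytes: on Python 3 the binary buffer
# underlying sys.stdout is needed, while Python 2's sys.stdout is already
# byte-oriented. A minimal sketch of the same idiom in isolation:
import sys
try:
    out = sys.stdout.buffer   # Python 3: binary stream
except AttributeError:
    out = sys.stdout          # Python 2: already binary
out.write(b"WARC/1.0\r\n")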
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
                     name="-", offsets=False)
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)
            fh.close()
    return 0
def doc_from_warc(infn, gzip='auto'):
    """Generator to process a WARC at a given infn."""
    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb', gzip=gzip)
    sys.stderr.write("Processing " + str(infn) + "\n")
    for record in inwf:
        # print "\nStarting record: "+str(record.url)
        try:
            if record.get_header('WARC-Segment-Number'):
                raise Exception("Segmented response/resource record "
                                "for " + record.url + ". Not processing.")
            # We can process resource records (and conversion records,
            # which we assume are all of resource type (contain a document
            # rather than an HTTP transaction with nested document). This
            # may be unsafe, but conversion records are almost unknown in
            # the wild. The only ones we'll be handling here are those
            # output from WarcTika, which are in that format.
            # TODO: generalise this.
            # We also handle HTTP response records.
            if (record.type == WarcRecord.RESPONSE
                    and record.url.startswith('http')):
                httpcode, mimetype, charset, body = \
                    parse_http_response_charset(record)
            elif (record.type == WarcRecord.RESOURCE
                    or record.type == WarcRecord.CONVERSION):
                mimetype, body = record.content
                httpcode = 200  # "Success" for stored content
                charset = None  # Not recorded
            # If 'metadata', 'request', 'revisit', 'continuation',
            # or something exotic, we can't do anything interesting
            elif (record.type == WarcRecord.METADATA
                    or record.type == WarcRecord.WARCINFO
                    or record.type == WarcRecord.REQUEST):
                continue
            else:
                sys.stderr.write("Can't handle " + str(record.type) + ", "
                                 + str(record.url))
                continue  # nothing parsed, so nothing to yield
            yield (record.url, mimetype, body, httpcode, charset)
        except Exception:
            # General catch to avoid multiprocessing taking down the whole
            # job for one bogus record
            sys.stderr.write("\n\n***** Uncaught exception reading "
                             + record.url + " from file " + infn + ":\n")
            traceback.print_exc()
            sys.stderr.write("Continuing.\n\n\n")
    inwf.close()
def process_file(self, filename):
    f = WarcRecord.open_archive(filename, gzip="auto")
    for (offset, record, errors) in f.read_records(limit=None):
        if record:
            if record.type == "response":
                self._process_response(record)
            elif record.type == "request":
                self._process_request(record)
            elif record.type == "resource":
                self._process_resource(record)
        elif errors:
            raise WarcException("Cannot decode WARC: %s" % errors)
    self.current_request = None
    f.close()
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)
    else:
        filename = args[0]
        zipfilename = args[1]
        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename,
                                                    gzip="auto")) as fh:
                dump_record(fh, outzip)
    return 0
def read_record(path, num_pages=10):
    warcr = WarcRecord.open_archive(path, gzip='auto')
    i = 0
    documents = []
    urls = []
    for record in warcr:
        if i >= num_pages:
            break
        if record.type == b'response' and \
           record.content[0] == b'application/http; msgtype=response':
            url = ""
            for (h, v) in record.headers:
                if h == b'WARC-Target-URI':
                    url = str(v, errors="ignore")
            # domain = re.sub(r'^(www\.)?', '', urlparse(url.decode("ISO-8859-1"))[1].lower())
            # urls.append(url.decode("ISO-8859-1").lower())
            urls.append(url)
            # documents.append(extract_text(record.content[1].decode("ISO-8859-1")))
            documents.append(extract_text(str(record.content[1], errors="ignore")))
            i += 1
    return documents, urls
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0
        with closing(ArchiveRecord.open_archive(filename=filename,
                                                gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)
    return 0
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    # prepare regular expressions
    link_ignore_expressions = prepare_link_ignore_re(options.ignore_links)
    print "parsing WARC archives"
    all_urls = []
    for filename in expand_files(input_files):
        print "WARC: " + filename
        link_cache_filename = filename + '.urls'
        if options.persist_links and os.path.exists(link_cache_filename):
            url_fh = open(link_cache_filename, 'r')
            urls = pickle.load(url_fh)
            url_fh.close()
            all_urls += urls
        else:
            urls = []
            fh = WarcRecord.open_archive(filename, gzip="auto")
            for record in fh:  # record is an ArchiveRecord; the original's
                               # inline """@type""" string was a syntax error
                if not record.is_response():
                    continue
                urls.append({
                    'url': record.url,
                    'content-type': record.content_content_type
                })
            # urls.sort(cmp=url_cmp)
            if options.persist_links:
                url_fh = open(link_cache_filename, 'w+')
                pickle.dump(urls, url_fh)
                url_fh.close()
            fh.close()
            all_urls += urls
    if options.dump_links is not None:
        f = open(options.dump_links, 'w+')
        all_urls.sort()
        for url in all_urls:
            # skip ignorable links
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            if not skip_addition:
                f.write(url['url'])
                f.write('\n')
        f.close()
    if options.web_start is not False:
        urltree = UrlTree()
        for url in all_urls:
            # skip filtered links via regex
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            # skip links filtered by content_type filter
            if options.content_type:
                if not url['content-type'].startswith(options.content_type):
                    skip_addition = True
            if options.content_type_not:
                if url['content-type'].startswith(options.content_type_not):
                    skip_addition = True
            if not skip_addition:
                urltree.add_url(url['url'])
        print "Total urls: " + str(urltree.childcount)
        webserver.run(urltree)
from __future__ import print_function
import sys
from hanzo.warctools import WarcRecord
import argparse

parser = argparse.ArgumentParser(description='Attempt to fix WARC files with '
    'a broken gzipped record. Most WARC tools use the iterator reader, which '
    'fails if any one of the gzip records is damaged.')
parser.add_argument('infn', help='Input gzipped WARC filename.')
parser.add_argument('outfn', help='Output gzipped WARC filename.')
args = parser.parse_args()

inwf = WarcRecord.open_archive(args.infn, gzip="auto")
outwf = open(args.outfn, 'wb')

for (offset, record, errors) in inwf.read_records(limit=None):
    # Generates an offset (or None) plus *either* a valid record (and an
    # empty list for errors), *or* a list of errors (and None for record).
    if errors:
        print("warc errors at %s:%d" % (args.infn, offset), file=sys.stderr)
        print(errors, file=sys.stderr)
        break
    elif record is not None and record.validate():  # ugh name; returns errors
        print("warc errors at %s:%d" % (args.infn, offset), file=sys.stderr)
        print(record.validate(), file=sys.stderr)
        break
    elif record is not None:
        record.write_to(outwf, gzip=True)
outwf.close()
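# Hypothetical invocation of the fixer above, assuming it is saved as
# fix_gzipped_warc.py; both filenames are placeholders:
#
#   python fix_gzipped_warc.py broken.warc.gz fixed.warc.gz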
uuidsexcluded = set()
exclist = parse_exc_args(args.pattern)

# In theory this could be agnostic as to whether the stream is compressed or
# not. In practice, the gzip guessing code reads the stream for marker bytes
# and then attempts to rewind, which fails for stdin unless an elaborate
# stream wrapping class is set up.
gzi = 'auto'
if args.gzipped_input:
    gzi = 'record'
elif args.plain_input:
    gzi = False

if args.in_filename is None:
    inwf = WarcRecord.open_archive(file_handle=sys.stdin, mode='rb', gzip=gzi)
else:
    inwf = WarcRecord.open_archive(filename=args.in_filename, mode='rb', gzip=gzi)

#####
# MAIN
#####
outf = sys.stdout
if args.out_filename is not None:
    outf = open(args.out_filename, 'wb')

for record in inwf:
    # How many matches constitute failure?
    write = len(exclist)
def __init__(self, warc):
    self.warc = warc
    logger.debug("Mounting %s" % self.warc)
    self.fh = WarcRecord.open_archive(warc, gzip="auto", mode="rb")
    self.tree = Tree()
    self._get_records()
r.seed(1818118181)  # Arbitrary
content = []
rejects = defaultdict(int)

# Load all the objects into memory first
try:
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname + '/' + fn, mode='rb')
        try:
            print fn
            for record in wf:
                if not record.type in [WarcRecord.RESPONSE,
                                       WarcRecord.RESOURCE,
                                       WarcRecord.CONVERSION]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    ccode = None
                    cmime = record.content[0]