def main(argv):
	"""CLI entry point: copy/filter WARC records into options.output.

	Reads from stdin when no input files are given, otherwise from each
	named archive. Raises RuntimeError when --strip-404s is used without
	--decode_http. Returns 0 on success.
	"""
	options, input_files = parser.parse_args(args=argv[1:])

	if options.strip_404s and not options.decode_http:
		raise RuntimeError("--strip-404s requires --decode_http")

	def _pump(archive, sink):
		# Feed each record, paired with its predecessor, through process();
		# always close the archive, even on error.
		prev = None
		try:
			for rec in archive:
				process(rec, prev, sink, options)
				prev = rec
		finally:
			archive.close()

	with open(options.output, "wb") as out:
		if not input_files:
			_pump(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb"), out)
		else:
			for name in input_files:
				_pump(WarcRecord.open_archive(name, gzip="auto", mode="rb"), out)

	return 0
Exemplo n.º 2
0
def dump_payload_from_file(filename, offset=None, length=None, output_filename="/tmp/warc_dump"):
    """Dump the payload of a WARC record read from *filename*.

    *offset*/*length* optionally restrict reading to a single record
    region. *output_filename* is kept for interface compatibility; the
    actual destination is decided by dump_payload_from_stream.
    """
    # Fix: the original opened the archive twice -- the first handle (fp)
    # was never closed (resource leak) -- and left debug print()s behind.
    with closing(WarcRecord.open_archive(filename=filename, gzip="auto",
                                         offset=offset, length=length)) as fh:
        return dump_payload_from_stream(fh)
Exemplo n.º 3
0
  def __init__(self, url_or_io, bytes_range=None):
    """Open a WARC archive from a URL string, an IterContentAsFile, or a raw stream.

    bytes_range is only used for the URL case (passed to response_as_file).
    """
    if isinstance(url_or_io, str):
      # Treat a string as a URL to fetch (optionally a byte range of it).
      self.archive = WarcRecord.open_archive(file_handle=response_as_file(url_or_io, bytes_range))
    elif isinstance(url_or_io, IterContentAsFile):
      self.archive = WarcRecord.open_archive(file_handle=url_or_io)
    else:
      # Anything else is assumed to be a readable stream; wrap it.
      self.archive = WarcRecord.open_archive(file_handle=stream_as_file("upload.warc.gz", url_or_io))

    # Per-archive scan state, filled in while reading records.
    self.path_types = {}

    self.files = {}
    self.errors = []

    self.offset = 0
    self.buffer = []
Exemplo n.º 4
0
def main(argv):
    """Print one index line per WARC record (filename, offset, type, url,
    id, content-type, content-length), then the total content length.

    Python 2 code (print statements). Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    total = 0        
#    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in expand_files(input_files):
        fh = WarcRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                # Space-separated index columns for this record.
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
                total += record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail




        fh.close()
    print total


    return 0
Exemplo n.º 5
0
def main(argv):
    """Unpack records from stdin or the given WARC files into an output
    directory, writing an index log per input (Python 2 code)."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir =  options.output
    else:
        output_dir  = os.getcwd()

    collisions = 0


    if len(args) < 1:
        # No filenames: read an uncompressed WARC stream from stdin.
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)
        
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
        
    else:
        for filename in args:
            
            # One index log per input file unless --log-file was given.
            # NOTE(review): log_file is opened here but never closed.
            log_file = os.path.join(output_dir, os.path.basename(filename)+ '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                # NOTE(review): uses ArchiveRecord here but WarcRecord for
                # the stdin branch above -- confirm this is intentional.
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions+=unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)

            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
Exemplo n.º 6
0
def main(argv):
    """Print one index line per WARC record, then the total content
    length (Python 2 code; duplicate of the earlier index main)."""
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    total = 0
    #    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in expand_files(input_files):
        fh = WarcRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                # Space-separated index columns for this record.
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
                total += record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail

        fh.close()
    print total

    return 0
def main(argv):
    """Unpack records from stdin or the given WARC files into an output
    directory (Python 2 code; duplicate of the earlier unpack main)."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir =  options.output
    else:
        output_dir  = os.getcwd()

    collisions = 0


    if len(args) < 1:
        # No filenames: read an uncompressed WARC stream from stdin.
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)
        
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
        
    else:
        for filename in args:
            
            # NOTE(review): log_file is opened here but never closed, and
            # ArchiveRecord is used where the stdin branch uses WarcRecord.
            log_file = os.path.join(output_dir, os.path.basename(filename)+ '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions+=unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)

            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
 def build_from_warcs(self, warcs):
     """Populate self.inverted_index and self.crawled_uris from WARC files.

     Metadata records contribute "outlink -> source url" entries; HTTP 200
     response records contribute (url, content-type, date, length) tuples.
     """
     for warc in warcs:
         fh = WarcRecord.open_archive(warc, gzip="auto")
         try:
             for (offset, record, errors) in fh.read_records(limit=None):
                 if record:
                     if record.type == WarcRecord.METADATA:
                         # Metadata bodies list outlinks one per line,
                         # formatted as "outlink: <url> ...".
                         for line in StringIO(record.content[1]):
                             if line.startswith("outlink: "):
                                 outlink = line.strip().split()[1]
                                 self.inverted_index[outlink] = record.url
                     if record.type == WarcRecord.RESPONSE:
                         # Parse the embedded HTTP response headers.
                         f = FileHTTPResponse(record.content_file)
                         f.begin()
                         if f.status == 200 and record.url.startswith(
                                 "http"):
                             self.crawled_uris.append(
                                 (record.url, f.getheader("content-type"),
                                  record.date, record.content_length))
                 elif errors:
                     pass
                 else:
                     pass
         finally:
             fh.close()
Exemplo n.º 9
0
    def process(self, infn, outfn, delete=False):
        """Process a WARC at a given infn, producing plain text via Tika
        where suitable, and writing a new WARC file to outfn.

        Python 2 code. Returns True when processing completes; the output
        is validated with the external `warcvalid` tool and deleted if
        invalid. When *delete* is set and the output validated, the input
        file is removed.
        """
        # These are objects of type RecordStream (or a subclass), unlike with
        # the IA library
        inwf = WarcRecord.open_archive(infn, mode='rb')
        outf = open(outfn, 'wb')
        self._openfiles.add(outfn)
#        try:
#            fcntl.lockf(inwf.file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            fcntl.lockf(outf, fcntl.LOCK_EX | fcntl.LOCK_NB)
#            # Get locks on both files
#        except IOError:
#            print ("Unable to get file locks processing", infn, "so will "
#                   "try later")
#            return False
        print "Processing", infn
        for record in inwf:
            try:
                if record.type == WarcRecord.WARCINFO:
                    self.add_description_to_warcinfo(record)
                elif (record.type == WarcRecord.RESPONSE
                      or record.type == WarcRecord.RESOURCE):
                    if record.get_header('WARC-Segment-Number'):
                        raise WarcTikaException("Segmented response/resource "
                                                "record. Not processing.")
                    else:
                        record = self.generate_new_record(record)
                # If 'metadata', 'request', 'revisit', 'continuation',
                # 'conversion' or something exotic, we can't do anything more
                # interesting than immediately re-writing it to the new file

                newrecord = WarcRecord(headers=record.headers,
                        content=record.content)

            except Exception as e:
                # On any failure fall back to copying the original record.
                print ("Warning: WARCTikaProcessor.process() failed on "+
                       record.url+": "+str(e.message)+
                       "\n\tWriting old record to new WARC.")
                traceback.print_exc()
                newrecord = record
            finally:
                # Every record -- converted or original -- is written out.
                newrecord.write_to(outf, gzip=outfn.endswith('.gz'))
        print "****Finished file. Tika status codes:", self.tikacodes.items()
        self.tikacodes = defaultdict(int)
        inwf.close()
        outf.close()
        self._openfiles.remove(outfn)

        # Check that the file has written correctly - for an excess of caution
        validrc = os.system("warcvalid "+outfn)

        if validrc:
            print "New file", outfn, "appears not to be valid. Deleting it." 
            os.unlink(outfn)
        if delete and not validrc:
            print "Deleting", infn
            os.unlink(infn)
        return True
Exemplo n.º 10
0
    def __init__(self, url_or_io, bytes_range=None):
        """Open a WARC archive from a URL string, an IterContentAsFile,
        or a raw readable stream.

        bytes_range only applies to the URL case.
        """
        if isinstance(url_or_io, str):
            # A string is a URL to fetch (optionally a byte range of it).
            source = response_as_file(url_or_io, bytes_range)
        elif isinstance(url_or_io, IterContentAsFile):
            source = url_or_io
        else:
            # Anything else is assumed to be a readable stream; wrap it.
            source = stream_as_file("upload.warc.gz", url_or_io)
        self.archive = WarcRecord.open_archive(file_handle=source)

        # Per-archive scan state, filled in while reading records.
        self.path_types = {}
        self.files = {}
        self.errors = []
        self.offset = 0
        self.buffer = []
Exemplo n.º 11
0
def main(argv):
    """CLI entry point: copy/filter WARC records into options.output,
    optionally collecting JSON hrefs into a bz2 file.

    Raises RuntimeError for option combinations that require
    --decode_http. Returns 0 on success.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if options.strip_404s and not options.decode_http:
        raise RuntimeError("--strip-404s requires --decode_http")

    if options.json_hrefs_file and not options.decode_http:
        raise RuntimeError("--json-hrefs-file requires --decode_http")

    # found_hrefs doubles as the "collect hrefs?" flag: a set when
    # collecting, None otherwise.
    if options.json_hrefs_file:
        found_hrefs = set()
    else:
        found_hrefs = None

    with open(options.output, "wb") as out:
        if len(input_files) < 1:
            # No inputs: read an uncompressed stream from stdin.
            fh = WarcRecord.open_archive(file_handle=sys.stdin,
                                         gzip=None,
                                         mode="rb")
            try:
                # Each record is processed together with its predecessor.
                previous_record = None
                for record in fh:
                    process(record, previous_record, out, options, found_hrefs)
                    previous_record = record
            finally:
                fh.close()
        else:
            for name in input_files:
                # previous_record does not carry over between files.
                previous_record = None
                fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
                try:
                    for record in fh:
                        process(record, previous_record, out, options,
                                found_hrefs)
                        previous_record = record
                finally:
                    fh.close()

    if found_hrefs is not None:
        # Write the collected hrefs, sorted, one per line, bz2-compressed.
        fh = bz2.BZ2File(options.json_hrefs_file, "wb")
        try:
            fh.write("\n".join(sorted(found_hrefs)) + "\n")
        finally:
            fh.close()

    return 0
Exemplo n.º 12
0
    def run(self):
        """Index a WARC file into self.records (url -> offset/code/type),
        using a pickled "<path>.idx" cache when it is newer than the WARC.

        Python 2 code. Sets self.status / self.bytes_read along the way and
        raises if self.cancel is set.
        """
        path = self.path
        idx_file = "%s.idx" % path

        records = None

        # Reuse the cached index only when it is at least as new as the WARC.
        if os.path.exists(idx_file) and os.path.getmtime(
                idx_file) >= os.path.getmtime(path):
            print "Loading " + path + " from cache"
            self.status = "loading-cache"
            with open(idx_file, "rb") as f:

                def update_progress():
                    # Progress callback: report how far into the idx we are.
                    self.bytes_read = f.tell()

                f_pr = IOWithProgress(f, update_progress)
                data = cPickle.load(f_pr)
            self.bytes_read = self.bytes_total

            if "version" in data and data["version"] == 1:
                records = data["records"]

        if not records:
            # Cache miss (or wrong version): scan the WARC itself.
            self.status = "indexing"
            self.bytes_total = os.path.getsize(self.path)

            print "Loading " + path
            records = OrderedDict()
            warc = WarcRecord.open_archive(path, gzip="auto")
            for (offset, record, errors) in warc.read_records(limit=None):
                if self.cancel:
                    raise Exception("Loading " + path + " canceled")

                # The re.sub calls strip stray characters before comparing
                # the record type / content-type strings.
                if record and re.sub(
                        r"[^a-z;=/]+", "",
                        record.type) == WarcRecord.RESPONSE and re.sub(
                            r"[^a-z;=/]+", "",
                            record.content[0]) == ResponseMessage.CONTENT_TYPE:
                    http_response = parse_http_response(record)
                    records[canonicalize_url(record.url)] = {
                        "offset": offset,
                        "code": http_response[0],
                        "type": http_response[1]
                    }

                self.bytes_read = offset

            warc.close()

            # Persist the freshly built index for next time.
            with open(idx_file, "wb") as f:
                cPickle.dump({"version": 1, "records": records}, f)

        if self.cancel:
            raise Exception("Loading " + path + " canceled")

        print "Indexed " + path + ". Found " + str(len(records)) + " URLs"
        self.status = "indexed"
        self.records = records
Exemplo n.º 13
0
def main(argv):
    """Run process() over every WARC record from stdin or the input files.

    Returns 0 on success.
    """
    options, input_files = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if not input_files:
        # Uncompressed stream from stdin; the handle is left open, as in
        # the other stdin branches in this file.
        archive = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for rec in archive:
            process(rec, out, options)
    else:
        for name in input_files:
            archive = WarcRecord.open_archive(name, gzip="auto")
            for rec in archive:
                process(rec, out, options)
            archive.close()

    return 0
Exemplo n.º 14
0
def main(argv):
	"""CLI entry point: copy/filter WARC records into options.output,
	optionally collecting JSON hrefs into a bz2 file.

	Tab-indented duplicate of the earlier json-hrefs main. Returns 0.
	"""
	(options, input_files) = parser.parse_args(args=argv[1:])

	if options.strip_404s and not options.decode_http:
		raise RuntimeError("--strip-404s requires --decode_http")

	if options.json_hrefs_file and not options.decode_http:
		raise RuntimeError("--json-hrefs-file requires --decode_http")

	# found_hrefs is a set when collecting hrefs, otherwise None.
	if options.json_hrefs_file:
		found_hrefs = set()
	else:
		found_hrefs = None

	with open(options.output, "wb") as out:
		if len(input_files) < 1:
			fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None, mode="rb")
			try:
				previous_record = None
				for record in fh:
					process(record, previous_record, out, options, found_hrefs)
					previous_record = record
			finally:
				fh.close()
		else:
			for name in input_files:
				# previous_record does not carry over between files.
				previous_record = None
				fh = WarcRecord.open_archive(name, gzip="auto", mode="rb")
				try:
					for record in fh:
						process(record, previous_record, out, options, found_hrefs)
						previous_record = record
				finally:
					fh.close()

	if found_hrefs is not None:
		# Write the collected hrefs, sorted, one per line, bz2-compressed.
		fh = bz2.BZ2File(options.json_hrefs_file, "wb")
		try:
			fh.write("\n".join(sorted(found_hrefs)) + "\n")
		finally:
			fh.close()

	return 0
Exemplo n.º 15
0
    def _load_warc_info(self):
        """Return the leading warcinfo record of the archive.

        Raises ValueError if the stream does not start with one.
        """
        self._warc_file_read.seek(0)
        wrs = WarcRecord.open_archive(file_handle=self._warc_file_read, \
                gzip="record")
        # NOTE(review): other snippets in this file iterate read_records()
        # as (offset, record, errors) tuples; temp[0].type here assumes it
        # returns records directly -- confirm against the warctools version.
        temp = wrs.read_records(limit=1)

        if not temp or (temp[0].type != WarcRecord.WARCINFO):
            raise ValueError("WARC info not found")

        return temp[0]
Exemplo n.º 16
0
    def _load_warc_info(self):
        """Return the leading warcinfo record of the archive.

        Raises ValueError if the stream does not start with one.
        """
        self._warc_file_read.seek(0)
        wrs = WarcRecord.open_archive(file_handle=self._warc_file_read, \
                gzip="record")
        # NOTE(review): read_records() yields (offset, record, errors)
        # tuples elsewhere in this file; temp[0].type assumes bare records
        # -- confirm against the warctools version in use.
        temp = wrs.read_records(limit=1)

        if not temp or (temp[0].type != WarcRecord.WARCINFO):
            raise ValueError("WARC info not found")

        return temp[0]
Exemplo n.º 17
0
 def readRecord(filename, offset):
     """
     Read exactly one WARC record at the given byte offset.

     :type filename: str
     :type offset: int
     :rtype : WarcRecord
     """
     w = WarcRecord.open_archive(filename, offset=offset)
     g = w.read_records(limit=1)
     # Python 2 generator protocol; each item is (offset, record, errors).
     r = g.next()[1]
     w.close()
     return r
Exemplo n.º 18
0
 def loadWarcFileRecords(name):
     """ Generator function for records from the file 'name' """
     # Yields (record, offset) pairs; parse errors are printed (Python 2)
     # and skipped rather than raised.
     f = WarcRecord.open_archive(name, gzip="auto")
     for (offset, r, err) in f.read_records(limit=None):
         if err:
             print "warc errors at %s:%d" % (name, offset or 0)
             for e in err:
                 print '\t', e
         if r:
             yield (r, offset)
     f.close()
Exemplo n.º 19
0
 def loadWarcFileRecords(name):
     """ Generator function for records from the file 'name' """
     # Duplicate of the earlier loader: yields (record, offset) pairs and
     # prints (Python 2) rather than raises on parse errors.
     f = WarcRecord.open_archive(name, gzip="auto")
     for (offset, r, err) in f.read_records(limit=None):
         if err:
             print "warc errors at %s:%d" % (name, offset or 0)
             for e in err:
                 print '\t', e
         if r:
             yield (r, offset)
     f.close()
Exemplo n.º 20
0
 def readRecord(filename, offset):
     """
     Read exactly one WARC record at the given byte offset.

     :type filename: str
     :type offset: int
     :rtype : WarcRecord
     """
     w = WarcRecord.open_archive(filename, offset=offset)
     g = w.read_records(limit=1)
     # Python 2 generator protocol; each item is (offset, record, errors).
     r = g.next()[1]
     w.close()
     return r
Exemplo n.º 21
0
def main(argv):
    """Run process() over every WARC record from stdin or the input files.

    Returns 0 on success.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        # Uncompressed stream from stdin; the handle is left open.
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)

        for record in fh:
            process(record, out, options)
    else:
        for name in input_files:
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)

            fh.close()



    return 0
Exemplo n.º 22
0
    def find_record(self, url):
        """Return the first HTTP response record matching *url*, or None."""
        self._warc_file_read.seek(0)
        archive = WarcRecord.open_archive(file_handle=self._warc_file_read,
                gzip="record")

        for offset, record, errors in archive.read_records(limit=None):
            if not record:
                continue
            matches = (record.type == WarcRecord.RESPONSE
                       and record.content[0] == ResponseMessage.CONTENT_TYPE
                       and record.url == url)
            if matches:
                return record

        return None
Exemplo n.º 23
0
    def find_record(self, url):
        """Return the first HTTP response record matching *url*, or None.

        Rewinds the underlying file and scans the whole archive each call.
        """
        self._warc_file_read.seek(0)
        wrs = WarcRecord.open_archive(file_handle=self._warc_file_read, \
                gzip="record")

        for (offset, record, errors) in wrs.read_records(limit=None):
            if record and (record.type == WarcRecord.RESPONSE) \
                    and (record.content[0] == ResponseMessage.CONTENT_TYPE) \
                    and (record.url == url):
                return record

        return None
Exemplo n.º 24
0
  def warc_record_for_uri(self, uri):
    """Generator: yield indexed record(s) for *uri*, or a single None.

    Looks the uri up in every per-file index (self.indices) and reads the
    matching record at its stored offset. Python 2 (`iteritems`).
    """
    found = False
    for (path, uris) in self.indices.iteritems():
      if uri in uris:
        warc = WarcRecord.open_archive(path, gzip="auto")
        warc.seek(uris[uri]["offset"])

        for record in warc.read_records(limit=1, offsets=uris[uri]["offset"]):
          found = True
          yield record

        warc.close()

    # Callers always get at least one item; None signals "not indexed".
    if not found:
      yield None
Exemplo n.º 25
0
def main(argv):
    """Run process() over every WARC record from stdin or the input files.

    Writes binary output where possible (python3's stdout.buffer), falling
    back to the plain text stream on python2. Returns 0.
    """
    options, input_files = parser.parse_args(args=argv[1:])

    # Equivalent to try/except AttributeError on sys.stdout.buffer.
    out = getattr(sys.stdout, "buffer", sys.stdout)

    if not input_files:
        archive = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)
        for rec in archive:
            process(rec, out, options)
    else:
        for name in expand_files(input_files):
            archive = WarcRecord.open_archive(name, gzip="auto")
            for rec in archive:
                process(rec, out, options)
            archive.close()

    return 0
Exemplo n.º 26
0
def main(argv):
    """Dump each input archive (or stdin) via dump_archive(). Returns 0."""
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        # NOTE(review): the stdin archive handle is never closed here,
        # unlike the per-file branch below.
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False)
        
    else:
        for name in input_files:
            # NOTE(review): uses ArchiveRecord here but WarcRecord for the
            # stdin branch -- confirm this is intentional.
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh,name)

            fh.close()


    return 0
Exemplo n.º 27
0
  def run(self):
    """Index a WARC file into self.records (url -> offset/code/type),
    using a pickled "<path>.idx" cache when it is newer than the WARC.

    Python 2 code; 2-space duplicate of the earlier run(). Raises if
    self.cancel is set.
    """
    path = self.path
    idx_file = "%s.idx" % path

    records = None

    # Reuse the cached index only when it is at least as new as the WARC.
    if os.path.exists(idx_file) and os.path.getmtime(idx_file) >= os.path.getmtime(path):
      print "Loading " + path + " from cache"
      self.status = "loading-cache"
      with open(idx_file, "rb") as f:
        def update_progress():
          # Progress callback: report how far into the idx we are.
          self.bytes_read = f.tell()
        f_pr = IOWithProgress(f, update_progress)
        data = cPickle.load(f_pr)
      self.bytes_read = self.bytes_total

      if "version" in data and data["version"] == 1:
        records = data["records"]
    
    if not records:
      # Cache miss (or wrong version): scan the WARC itself.
      self.status = "indexing"
      self.bytes_total = os.path.getsize(self.path)

      print "Loading " + path
      records = OrderedDict()
      warc = WarcRecord.open_archive(path, gzip="auto")
      for (offset, record, errors) in warc.read_records(limit=None):
        if self.cancel:
          raise Exception("Loading " + path + " canceled")

        # The re.sub calls strip stray characters before comparing the
        # record type / content-type strings.
        if record and re.sub(r"[^a-z;=/]+", "", record.type) == WarcRecord.RESPONSE and re.sub(r"[^a-z;=/]+", "", record.content[0]) == ResponseMessage.CONTENT_TYPE:
          http_response = parse_http_response(record)
          records[canonicalize_url(record.url)] = { "offset":offset, "code":http_response[0], "type":http_response[1] }

        self.bytes_read = offset

      warc.close()

      # Persist the freshly built index for next time.
      with open(idx_file, "wb") as f:
        cPickle.dump({ "version": 1, "records": records }, f)

    if self.cancel:
      raise Exception("Loading " + path + " canceled")

    print "Indexed "+path+". Found "+str(len(records))+" URLs"
    self.status = "indexed"
    self.records = records
Exemplo n.º 28
0
def doc_from_warc(infn, gzip='auto'):
    """Generator to process a WARC at a given infn.

    Yields (url, mimetype, body, httpcode, charset) for response, resource
    and conversion records; metadata/warcinfo/request records and anything
    unrecognised are skipped. Exceptions per record are logged and the scan
    continues.
    """
    # These are objects of type RecordStream (or a subclass), unlike with
    # the IA library
    inwf = WarcRecord.open_archive(infn, mode='rb', gzip=gzip)
    sys.stderr.write("Processing "+str(infn)+"\n")
    for record in inwf:
        try:
            if record.get_header('WARC-Segment-Number'):
                raise Exception("Segmented response/resource record "
                                "for "+record.url+". Not processing.")
            # We can process resource records (and conversion records,
            # which we assume are all of resource type (contain a document
            # rather than an HTTP transaction with nested document). This
            # may be unsafe, but conversion records are almost unknown in
            # the wild. The only ones we'll be handling here are those
            # output from WarcTika, which are in that format.
            # TODO: generalise this.
            # We also handle HTTP response records.
            if (record.type == WarcRecord.RESPONSE and
                  record.url.startswith('http')):
                httpcode, mimetype, charset, body = parse_http_response_charset(record)

            elif (record.type == WarcRecord.RESOURCE
                  or record.type == WarcRecord.CONVERSION):
                mimetype, body = record.content
                httpcode = 200 # "Success" for stored content
                charset = None # Not recorded

            # If 'metadata', 'request', 'revisit', 'continuation',
            # or something exotic, we can't do anything interesting
            elif (record.type == WarcRecord.METADATA
                  or record.type == WarcRecord.WARCINFO
                  or record.type == WarcRecord.REQUEST):
                continue
            else:
                sys.stderr.write("Can't handle"+str(record.type)+", "+str(record.url))
                # Bug fix: without this continue, the yield below ran with
                # mimetype/body/httpcode/charset unbound (first iteration)
                # or stale from the previous record, emitting the previous
                # record's body under this record's URL.
                continue
            yield (record.url, mimetype, body, httpcode, charset)
        except Exception:
            # General catch to avoid multiprocessing taking down the whole job
            # for one bogus record
            sys.stderr.write("\n\n***** Uncaught exception reading "+record.url
                             +" from file "+infn+":\n")
            traceback.print_exc()
            sys.stderr.write("Continuing.\n\n\n")
    inwf.close()
Exemplo n.º 29
0
def main(argv):
    """Dump each input archive (or stdin) via dump_archive(). Returns 0."""
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        # NOTE(review): the stdin archive handle is never closed here,
        # unlike the per-file branch below.
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
                     name="-",
                     offsets=False)

    else:
        for name in input_files:
            # NOTE(review): uses ArchiveRecord here but WarcRecord for the
            # stdin branch -- confirm this is intentional.
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)

            fh.close()

    return 0
  def process_file(self, filename):
    """Dispatch every record in *filename* to the matching _process_*
    handler (response/request/resource); other record types are skipped.

    Python 2 code (old-style raise). Raises WarcException on decode errors
    and resets self.current_request when done.
    """
    f = WarcRecord.open_archive(filename, gzip="auto")

    for (offset, record, errors) in f.read_records(limit=None):
      if record:
        if record.type=="response":
          self._process_response(record)
        elif record.type=="request":
          self._process_request(record)
        elif record.type=="resource":
          self._process_resource(record)
      elif errors:
        raise WarcException, "Cannot decode WARC: %s" % errors

    self.current_request = None

    f.close()
Exemplo n.º 31
0
def main(argv):
    """Dump the first record from stdin, or dump records from args[0]
    into the zip archive named by args[1]. Returns 0."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                dump_record(fh, outzip)


    return 0
Exemplo n.º 32
0
def main(argv):
    """Dump the first record from stdin, or dump records from args[0]
    into the zip archive named by args[1]. Returns 0."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(
                    ArchiveRecord.open_archive(filename=filename,
                                               gzip="auto")) as fh:
                dump_record(fh, outzip)

    return 0
Exemplo n.º 33
0
def read_record(path, num_pages=10):
    """Collect up to *num_pages* HTTP response payloads from a WARC file.

    Returns (documents, urls): extracted text bodies and their
    WARC-Target-URI values, in archive order.
    """
    archive = WarcRecord.open_archive(path, gzip='auto')
    documents = []
    urls = []
    collected = 0
    for record in archive:
        if collected >= num_pages:
            break
        is_http_response = (record.type == b'response'
                            and record.content[0] == b'application/http; msgtype=response')
        if not is_http_response:
            continue
        # Pull the target URI out of the raw WARC headers (bytes keys).
        target_uri = ""
        for header_name, header_value in record.headers:
            if header_name == b'WARC-Target-URI':
                target_uri = str(header_value, errors="ignore")
        # domain = re.sub(r'^(www\.)?','',urlparse(url.decode("ISO-8859-1"))[1].lower())
        # urls.append(url.decode("ISO-8859-1").lower())
        urls.append(target_uri)
        # documents.append(extract_text(record.content[1].decode("ISO-8859-1")))
        documents.append(extract_text(str(record.content[1], errors="ignore")))
        collected += 1
    return documents, urls
Exemplo n.º 34
0
 def build_from_warcs(self, warcs):
     """Populate self.inverted_index and self.crawled_uris from WARC files.

     Metadata records contribute "outlink -> source url" entries; HTTP 200
     response records contribute (url, content-type, date, length) tuples.
     """
     for warc in warcs:
         fh = WarcRecord.open_archive(warc, gzip="auto")
         try:
             for (offset, record, errors) in fh.read_records(limit=None):
                 if record:
                     if record.type == WarcRecord.METADATA:
                         # Metadata bodies list outlinks one per line,
                         # formatted as "outlink: <url> ...".
                         for line in StringIO(record.content[1]):
                             if line.startswith("outlink: "):
                                 outlink = line.strip().split()[1]
                                 self.inverted_index[outlink] = record.url
                     if record.type == WarcRecord.RESPONSE:
                         # Parse the embedded HTTP response headers.
                         f = FileHTTPResponse(record.content_file)
                         f.begin()
                         if f.status == 200 and record.url.startswith("http"):
                             self.crawled_uris.append((record.url, f.getheader("content-type"), record.date, record.content_length))
                 elif errors:
                     pass
                 else:
                     pass
         finally:
             fh.close()
Exemplo n.º 35
0
def main(argv):
    """Dump the first record from stdin, or the record at an optional
    byte offset (args[1], default 0) within args[0]. Returns 0."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)


    return 0
Exemplo n.º 36
0
def main(argv):
    """Dump the first record from stdin, or the record at an optional
    byte offset (args[1], default 0) within args[0]. Returns 0."""
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename,
                                                gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)

    return 0
Exemplo n.º 37
0
def main(argv):
    # Python 2 script entry point: scan the given WARC archives, collect the
    # URL + content-type of every HTTP response record, optionally persist /
    # dump the collected link list, and optionally serve it as a URL tree.
    (options, input_files) = parser.parse_args(args=argv[1:])

    # prepare regular expressions
    # (one compiled pattern per --ignore-links entry; matching URLs are
    # dropped from both the dump and the web tree below)
    link_ignore_expressions = prepare_link_ignore_re(options.ignore_links)

    print "parsing WARC archives"

    all_urls = []

    for filename in expand_files(input_files):

        print "WARC: "+filename

        # Per-archive URL cache sitting next to the WARC file itself.
        link_cache_filename = filename+'.urls'
        if options.persist_links and os.path.exists(link_cache_filename):
            # Cache hit: reuse the pickled URL list instead of re-parsing.
            # NOTE(review): opened in text mode ('r') — presumably pickle
            # protocol 0 under Python 2; confirm if porting to Python 3.
            url_fh = open(link_cache_filename, 'r')
            urls = pickle.load(url_fh)
            url_fh.close()
            all_urls += urls
        else:
            # Cache miss (or persistence disabled): parse the archive.
            urls = []
            fh = WarcRecord.open_archive(filename, gzip="auto")
            for record in fh:

                record = record
                """@type : ArchiveRecord """

                # Only HTTP response records contribute URLs.
                if not record.is_response():
                    continue

                urls.append({
                    'url': record.url,
                    'content-type': record.content_content_type
                })

            # urls.sort(cmp=url_cmp)
            if options.persist_links:
                # Write the freshly-parsed list back to the cache file.
                url_fh = open(link_cache_filename, 'w+')
                pickle.dump(urls, url_fh)
                url_fh.close()

            fh.close()
            all_urls += urls

    if options.dump_links is not None:

        # Write the sorted, filtered URL list (one per line) to the dump file.
        f = open(options.dump_links, 'w+')
        all_urls.sort()
        for url in all_urls:
            # skip ignorable links
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            if not skip_addition:
                f.write(url['url'])
                f.write('\n')
        f.close()

    if options.web_start is not False:
        # Build the URL tree for the web UI, applying the same regex filter
        # plus the positive/negative content-type prefix filters.
        urltree = UrlTree()
        for url in all_urls:
            # skip filtered links via regex
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            # skip links filtered by content_type filter
            if options.content_type:
                if not url['content-type'].startswith(options.content_type):
                        skip_addition = True
            if options.content_type_not:
                if url['content-type'].startswith(options.content_type_not):
                        skip_addition = True

            if not skip_addition:
                urltree.add_url(url['url'])
        print "Total urls: "+str(urltree.childcount)
        # Blocks serving the tree; the function does not return a value here.
        webserver.run(urltree)
Exemplo n.º 38
0
r.seed(1818118181)  # Arbitrary
# Fixed seed so the random sample selection is reproducible across runs.

content = []
rejects = defaultdict(int)  # per-reason counters for skipped records

#Load all the objects into memory first
try:
    # Fast path: a previously selected sample was pickled to disk; reuse it.
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    # No pickle yet: walk the directory and parse every .warc.gz archive.
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname + '/' + fn, mode='rb')
        try:
            print fn
            for record in wf:
                # Only record types that carry a payload are of interest.
                if not record.type in [
                        WarcRecord.RESPONSE, WarcRecord.RESOURCE,
                        WarcRecord.CONVERSION
                ]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    # HTTP responses: parse out status/mime/body and keep
                    # only the configured success status codes.
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    # Non-HTTP records have no status code.
                    ccode = None
                    # NOTE(review): snippet is truncated here — the rest of
                    # the loop body and the try's finally/except are missing.
Exemplo n.º 39
0
from __future__ import print_function

import sys
from hanzo.warctools import WarcRecord
import argparse

# Command-line tool: copy readable records out of a WARC whose gzip stream
# is damaged, re-gzipping each surviving record into a fresh output file.
parser = argparse.ArgumentParser(description='Attempt to fix WARC files with '
    'a broken gzipped record. Most WARC tools use the iterator reader, which '
    'fails if any one of the gzip records is damaged.')
parser.add_argument('infn', help='Input gzipped WARC filename.')
parser.add_argument('outfn', help='Output gzipped WARC filename.')

args = parser.parse_args()

inwf = WarcRecord.open_archive(args.infn, gzip="auto")
outwf = open(args.outfn, 'wb')
for (offset, record, errors) in inwf.read_records(limit=None):
    # Generates an offset (or None) plus *either* a valid record (and empty
    # list for errors, *or* a list of errors (and None for record).
    if errors:
        print("warc errors at %s:%d"%(args.infn, offset), file=sys.stderr)
        print(errors, file=sys.stderr)
        # NOTE(review): break stops at the first damaged record instead of
        # skipping it — arguably at odds with the "fix" purpose; confirm.
        break
    elif record is not None and record.validate(): # ugh name, returns errorsa
        print("warc errors at %s:%d"%(args.infn, offset), file=sys.stderr)
        print(record.validate(), file=sys.stderr)
        break
    try:
        # NOTE(review): validate() was already called in the elif above;
        # this second call is redundant. The try's handler is truncated
        # out of this snippet.
        record.validate()
        record.write_to(outwf, gzip=True)
Exemplo n.º 40
0
# Record IDs already excluded (used to drop records matched by a pattern).
uuidsexcluded = set()

# Compile the user-supplied exclusion patterns into a match list.
exclist = parse_exc_args(args.pattern)

# In theory this could be agnostic as to whether the stream is compressed or
# not. In practice, the gzip guessing code reads the stream for marker bytes
# and then attempts to rewind, which fails for stdin unless an elaborate
# stream wrapping class is set up.
gzi = 'auto'
if args.gzipped_input:
    gzi = 'record'
elif args.plain_input:
    gzi = False

if args.in_filename is None:
    # No input file given: read the WARC stream from stdin.
    inwf = WarcRecord.open_archive(file_handle=sys.stdin,
                                   mode='rb', gzip=gzi)
else:
    inwf = WarcRecord.open_archive(filename=args.in_filename,
                                   mode='rb', gzip=gzi)

#####
#MAIN
#####

# Output defaults to stdout; -o/--out redirects to a binary file.
outf = sys.stdout
if args.out_filename is not None:
    outf = open(args.out_filename, 'wb')

for record in inwf:
    # How many matches constitutes failure?
    write = len(exclist)
    # NOTE(review): snippet is truncated here — the rest of the per-record
    # exclusion logic is missing from this view.
Exemplo n.º 41
0
	def __init__( self, warc ):
		"""Remember the WARC path, open it read-only via warctools, and
		populate self.tree from its records (via _get_records)."""
		self.warc = warc
		logger.debug( "Mounting %s" % self.warc )
		# "auto" lets warctools sniff gzipped vs plain archives.
		self.fh = WarcRecord.open_archive( warc, gzip="auto", mode="rb" )
		self.tree = Tree()
		# NOTE(review): _get_records is defined elsewhere — presumably it
		# walks self.fh and fills self.tree; confirm in the class body.
		self._get_records()
r.seed(1818118181) # Arbitrary

content = []
rejects = defaultdict(int)

#Load all the objects into memory first
try:
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname+'/'+fn, mode='rb')
        try:
            print fn
            for record in wf:
                if not record.type in [WarcRecord.RESPONSE,
                                       WarcRecord.RESOURCE,
                                       WarcRecord.CONVERSION]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    ccode = None
                    cmime = record.content[0]