Пример #1
0
  def iter_zip(self):
    """Generate the chunks of a zip file holding the archive's HTTP responses.

    ``self`` is passed to ``ZipFile`` as the output file object, and whatever
    has accumulated in ``self.buffer`` is yielded and cleared after each
    stored response and once more after the zip file has been closed
    (presumably the buffer is filled by ``ZipFile`` writing to ``self`` --
    confirm against the class's ``write`` method).

    Every HTTP response record is stored under
    ``self.url_to_filename(record.url)`` with the record date as timestamp.
    A ``files.txt`` listing and, if any errors were collected, an
    ``errors.txt`` are added at the end.
    """
    # Slice bounds of the six fields ZipInfo expects (year, month, day, hour,
    # minute, second); assumes record.date looks like YYYY-MM-DDThh:mm:ss.
    date_slices = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))

    with ZipFile(self, "w") as archive_zip:
      for (offset, record, errors) in self.archive.read_records(limit=None):
        is_http_response = (
            record
            and record.type == WarcRecord.RESPONSE
            and re.sub(r'\s+', '', record.content[0]) == ResponseMessage.CONTENT_TYPE)

        if is_http_response:
          response = ResponseMessage(RequestMessage())
          response.feed(record.content[1])
          response.close()

          filename = self.url_to_filename(record.url)
          stamp = record.date
          modified = tuple(int(stamp[lo:hi]) for lo, hi in date_slices)

          archive_zip.writestr(ZipInfo(filename, modified), response.get_body())
          self.files[filename] = record.url

          # Hand out what has been buffered so far, then start afresh.
          for chunk in self.buffer:
            yield chunk
          self.buffer = []

        elif errors:
          # NOTE(review): 'name' is not bound anywhere in this method;
          # presumably a module-level global -- confirm, otherwise this line
          # raises NameError.
          self.errors.append("warc errors at %s:%d"%(name, offset or 0))
          self.errors.extend(errors)

      listing = ["%s -> %s" % (url, stored_as) for stored_as, url in self.files.iteritems()]
      archive_zip.writestr("files.txt", "\n".join(listing))
      if self.errors:
        archive_zip.writestr("errors.txt", "\n".join(self.errors))

    # Closing the ZipFile may have produced more output; emit it as well.
    for chunk in self.buffer:
      yield chunk

    self.buffer = []
Пример #2
0
def process(record, out, options):
    """Write ``record`` to ``out``, optionally decoding its HTTP payload first.

    When ``options.decode_http`` is set and the record is a WARC response,
    the payload is parsed as an HTTP message.  A completely parsed message
    replaces the record content with ``get_decoded_message()``; otherwise a
    diagnostic is printed to stderr and the content is left untouched.  The
    record is written with ``gzip=options.gzip`` in every case.

    ``options.wget_workaround`` selects ``WGET_IGNORE_HEADERS`` as the
    ``ignore_headers`` argument of the HTTP parsers.
    """
    skipped_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, payload = record.content

        # Strictly, parsing a response requires the matching request, since
        # replies to HEAD carry no body.  An empty request is used on the
        # assumption that HEAD responses are not stored.
        parser = None
        if content_type == ResponseMessage.CONTENT_TYPE:
            parser = ResponseMessage(RequestMessage(),
                                     ignore_headers=skipped_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            parser = RequestMessage(ignore_headers=skipped_headers)

        if parser:
            unparsed = parser.feed(payload)
            parser.close()
            if unparsed or not parser.complete():
                problems = []
                if unparsed:
                    problems.append("%d bytes unparsed" % len(unparsed))
                if not parser.complete():
                    problems.append("incomplete message (at %s, %s)" %
                                    (parser.mode, parser.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(
                    problems)
            else:
                record.content = content_type, parser.get_decoded_message()

    record.write_to(out, gzip=options.gzip)
Пример #3
0
def dump_record(fh, outzip):
    for (offset, record, errors) in fh.read_records(limit=None):
        if record and record.type == WarcRecord.RESPONSE and record.content[0] == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage())
            leftover = message.feed(record.content[1])
            message.close()

            outzip.writestr(re.sub(r'^https?://', '', record.url), message.get_body())
            print(record.url)
        elif errors:
            print >> sys.stderr, "warc errors at %s:%d"%(name, offset if offset else 0)
            for e in errors:
                print '\t', e
Пример #4
0
def process(record, out, options):
    """Emit ``record`` on ``out`` after optionally decoding its HTTP message.

    With ``options.decode_http`` enabled, the payload of a WARC response
    record is run through the HTTP parser.  If the whole payload parses
    cleanly the record content is swapped for the decoded message; if not,
    the problems are printed to stderr and the record is left as it was.
    Regardless of the outcome the record is written using
    ``gzip=options.gzip``.

    ``WGET_IGNORE_HEADERS`` is passed to the parsers as ``ignore_headers``
    when ``options.wget_workaround`` is set.
    """
    to_ignore = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        kind, raw = record.content

        http = None
        if kind == ResponseMessage.CONTENT_TYPE:
            # A response can only be parsed exactly when its request is
            # known (replies to HEAD have no body); we assume no HEAD
            # responses were stored and use an empty request.
            http = ResponseMessage(RequestMessage(), ignore_headers=to_ignore)
        if kind == RequestMessage.CONTENT_TYPE:
            http = RequestMessage(ignore_headers=to_ignore)

        if http:
            rest = http.feed(raw)
            http.close()
            if rest or not http.complete():
                complaints = []
                if rest:
                    complaints.append("%d bytes unparsed"%len(rest))
                if not http.complete():
                    complaints.append("incomplete message (at %s, %s)"%(http.mode, http.header.mode))
                print >> sys.stderr, 'errors decoding http in record', record.id, ",".join(complaints)
            else:
                record.content = kind, http.get_decoded_message()

    record.write_to(out, gzip=options.gzip)
Пример #5
0
def process(record, previous_record, out, options, found_hrefs):
    """Filter one WARC record and write it (and possibly its request) to ``out``.

    Args:
        record: The record being processed.
        previous_record: The record read just before ``record``.  With
            ``options.strip_404s`` it must be the request that belongs to a
            response which is kept.
        out: Destination handed to ``write_to``.
        options: Object providing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: ``None``, or a collection with an ``update()`` method
            (presumably a set) that receives the hrefs found by
            ``JSON_HREF_RE`` in decoded 200 responses.

    Raises:
        RuntimeError: With ``options.strip_404s``, when a response that is
            kept is not directly preceded by a request record.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # The parsed message is needed for decoding *and* for the 404 check, so
    # parse whenever either option asks for it.  It used to be bound only
    # under decode_http, which made strip_404s on its own fail with an
    # unbound local.
    message = None
    if ((options.decode_http or options.strip_404s)
            and record.type == WarcRecord.RESPONSE):
        content_type, content = record.content

        if content_type == ResponseMessage.CONTENT_TYPE:
            # technically, a http request needs to know the request to be parsed
            # because responses to head requests don't have a body.
            # we assume we don't store 'head' responses, and plough on
            message = ResponseMessage(RequestMessage(), ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()

            if leftover or not message.complete():
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)" % (message.mode, message.header.mode))
                sys.stderr.write('errors decoding http in record %s %s\n'
                                 % (record.id, ",".join(error)))
            elif options.decode_http:
                # Only rewrite the payload when decoding was requested.
                content = message.get_decoded_message()

                if found_hrefs is not None and message.header.code == 200:
                    # The slice trims 12 leading and 2 trailing characters
                    # from each match; presumably the JSON key and quotes
                    # around the href -- confirm against JSON_HREF_RE.
                    found_hrefs.update(match[12:-2] for match in JSON_HREF_RE.findall(content))

                record.content = content_type, content

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    # message is None when the payload was not an HTTP message; such a
    # response cannot be identified as a 404 and is kept.
    status = getattr(message.header, "code", None) if message else None
    if status == 404:
        # If 404, don't write out either the request or the response
        return

    if previous_record is None:
        raise RuntimeError("Need to write out previous record as well, but it isn't present")
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" % (previous_record.type,))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)
Пример #6
0
def parse_http_response_charset(record):
    """Parse the payload of an HTTP 'response' record.

    Returns a ``(code, mime_type, charset, body)`` tuple.  ``mime_type`` is
    the Content-Type value up to the first ``;`` and ``charset`` is the
    lower-cased ``charset`` parameter without surrounding quotes; both are
    ``None`` when the header or parameter is absent.  Trailing or truncated
    data is only reported on stdout; the result is still returned.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder:
        print('trailing data in http response for %s' % (record.url,))
    if not message.complete():
        print('truncated http response for %s' % (record.url,))
    header = message.header

    content_types = [v for k, v in header.headers if k.lower() == b'content-type']
    mime_type = None
    charset = None
    if content_types:
        value = content_types[0]
        # Stop at ';', quotes and whitespace so that `charset="utf-8"` and
        # `charset=utf-8; boundary=x` both yield the bare charset name
        # (the previous `\S+` kept the quotes and any trailing parameters).
        # NOTE(review): the str pattern assumes header values are str, as on
        # Python 2; bytes values would need a bytes pattern.
        match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', value, re.I)
        if match:
            charset = match.group(1).lower()
        mime_type = value.split(b';')[0]

    return header.code, mime_type, charset, message.get_body()
Пример #7
0
    def writeRecordToTransport(r, t):
        """Replay the HTTP response stored in record ``r`` onto transport ``t``.

        Connection- and cache-related headers of the archived response are
        left out and a fresh ``Content-Length`` and ``Connection: keep-alive``
        are appended; every archived header is additionally echoed with an
        ``X-Archive-Orig-`` prefix.  The status line, the headers, a blank
        line and the body are sent as four separate ``t.write`` calls.
        """
        # Headers that are not copied over from the archived response.
        dropped = ("connection", "content-length",
                   "cache-control", "accept-ranges", "etag",
                   "last-modified", "transfer-encoding")

        response = ResponseMessage(RequestMessage())
        response.feed(r.content[1])
        response.close()
        payload = response.get_body()

        kept = []
        echoed = []
        for key, value in response.header.headers:
            echoed.append(("X-Archive-Orig-%s" % key, value))
            if key.lower() not in dropped:
                kept.append((key, value))

        kept += [("Content-Length", "%d" % len(payload)),
                 ("Connection", "keep-alive")]

        head = response.header
        status_line = "%s %d %s\r\n" % (head.version, head.code, head.phrase)
        header_lines = "\r\n".join("%s: %s" % pair for pair in kept + echoed)

        t.write(status_line)
        t.write(header_lines)
        t.write("\r\n\r\n")
        t.write(payload)
Пример #8
0
    def __call__(self, request):
        """Called by HTTPServer to execute the request.

        URIs matching ``WEB_RE`` or ``WEB_VIA_PROXY_RE`` are rewritten and
        passed to ``self.web_handler``.  Any other URI is looked up through
        ``self.proxy_handler`` and answered either with the archived response
        or with a fixed 404 message, after which the request is finished.
        """
        web_match = (re.match(self.WEB_RE, request.uri)
                     or re.match(self.WEB_VIA_PROXY_RE, request.uri))

        if web_match:
            request.host = "warc"
            request.uri = web_match.group("uri")
            request.path, _sep, _query = request.uri.partition("?")
            self.web_handler.__call__(request)
            return

        # Headers that are not copied over from the archived response.
        dropped = ("connection", "content-length",
                   "cache-control", "accept-ranges",
                   "etag", "last-modified",
                   "transfer-encoding")

        with self.proxy_handler.warc_record_for_uri(
                canonicalize_url(request.uri)) as record:
            if not record:
                print("Could not find %s in WARC" % request.uri)
                request.write(
                    "HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n"
                )
            else:
                print("Serving %s from WARC" % request.uri)

                # parse the archived response
                archived = ResponseMessage(RequestMessage())
                archived.feed(record[1].content[1])
                archived.close()
                payload = archived.get_body()

                # build the replayed header set plus the prefixed originals
                kept = []
                echoed = []
                for key, value in archived.header.headers:
                    echoed.append(("X-Archive-Orig-%s" % key, value))
                    if key.lower() not in dropped:
                        kept.append((key, value))
                kept += [("Content-Length", "%d" % len(payload)),
                         ("Connection", "keep-alive")]

                # send status line, headers, blank line and body
                head = archived.header
                request.write("%s %d %s\r\n" %
                              (head.version, head.code, head.phrase))
                request.write("\r\n".join(
                    "%s: %s" % pair for pair in kept + echoed))
                request.write("\r\n\r\n")
                request.write(payload)
        request.finish()
Пример #9
0
 def writeRecordToTransport(r, t):
     """Send the HTTP response archived in ``r`` over transport ``t``.

     The archived headers are forwarded except for a fixed set of
     connection/caching headers, followed by a recomputed ``Content-Length``
     and ``Connection: keep-alive``.  All archived headers are repeated with
     an ``X-Archive-Orig-`` prefix.  Output is written as status line,
     header block, blank line and body.
     """
     # Archived headers with these names are not forwarded.
     omitted = ("connection", "content-length",
                "cache-control", "accept-ranges", "etag",
                "last-modified", "transfer-encoding")

     parsed = ResponseMessage(RequestMessage())
     parsed.feed(r.content[1])
     parsed.close()
     body = parsed.get_body()

     forwarded = []
     originals = []
     for name, value in parsed.header.headers:
         originals.append(("X-Archive-Orig-%s" % name, value))
         if name.lower() not in omitted:
             forwarded.append((name, value))

     forwarded.append(("Content-Length", "%d" % len(body)))
     forwarded.append(("Connection", "keep-alive"))

     status = parsed.header
     first_line = "%s %d %s\r\n" % (status.version, status.code, status.phrase)
     header_block = "\r\n".join(
         ["%s: %s" % (name, value) for name, value in forwarded + originals])

     t.write(first_line)
     t.write(header_block)
     t.write("\r\n\r\n")
     t.write(body)
Пример #10
0
    def _init_from_warc_record(self, warc_record):
        """Populate this reply from the HTTP response stored in ``warc_record``.

        Opens the reply read-only and unbuffered, copies the URL, raw
        headers, status code and reason phrase from the parsed response,
        stores the body in ``self._data`` and schedules the
        ``metaDataChanged``, ``readyRead`` and ``finished`` signals through
        zero-delay timers.
        """
        self._warc_record = warc_record
        self.open(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
        self.setUrl(QUrl(str_to_qstring(self._warc_record.url)))

        # NOTE(review): the parser is not close()d before get_body() is
        # called below -- confirm the body is complete without it.
        response = ResponseMessage(RequestMessage())
        response.feed(self._warc_record.content[1])

        for header_name, header_value in response.header.headers:
            self.setRawHeader(header_name, header_value)

        status_code = response.header.code
        self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute, status_code)
        self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                          response.header.phrase)

        self._check_for_redirect(response.header.code)

        # Signals are emitted from timer callbacks rather than directly.
        QTimer.singleShot(0, lambda: self.metaDataChanged.emit())

        self._data = response.get_body()

        QTimer.singleShot(0, lambda: self.readyRead.emit())
        QTimer.singleShot(0, lambda: self.finished.emit())
Пример #11
0
    def _init_from_warc_record(self, warc_record):
        """Set this reply up from the archived HTTP response in ``warc_record``.

        The reply is opened read-only/unbuffered and given the record URL.
        Headers, status code and reason phrase come from the parsed
        response, the body ends up in ``self._data``, and
        ``metaDataChanged``, ``readyRead`` and ``finished`` are emitted from
        zero-delay ``QTimer.singleShot`` callbacks.
        """
        self._warc_record = warc_record
        open_mode = QNetworkReply.ReadOnly | QNetworkReply.Unbuffered
        self.open(open_mode)
        self.setUrl(QUrl(str_to_qstring(self._warc_record.url)))

        # NOTE(review): close() is never called on the parser before its
        # body is read -- confirm that this is intended.
        parsed = ResponseMessage(RequestMessage())
        parsed.feed(self._warc_record.content[1])

        for raw_name, raw_value in parsed.header.headers:
            self.setRawHeader(raw_name, raw_value)

        self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute,
                          parsed.header.code)
        self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                          parsed.header.phrase)

        self._check_for_redirect(parsed.header.code)

        QTimer.singleShot(0, lambda: self.metaDataChanged.emit())

        self._data = parsed.get_body()

        QTimer.singleShot(0, lambda: self.readyRead.emit())
        QTimer.singleShot(0, lambda: self.finished.emit())
Пример #12
0
    def extractPayload(record):
        """Return the HTTP body of ``record``, gunzipped when possible.

        :type record: WarcRecord
        """
        response = ResponseMessage(RequestMessage())
        response.feed(record.content[1])
        response.close()
        body = response.get_body()

        # wbits of 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        gunzip = zlib.decompressobj(16 + zlib.MAX_WBITS)
        try:
            return gunzip.decompress(body)
        except zlib.error:
            # Not gzip data: hand the body back untouched.
            return body
Пример #13
0
    def extractPayload(record):
        """Extract the HTTP body from ``record`` and try to gunzip it.

        If the body cannot be decompressed it is returned as stored.

        :type record: WarcRecord
        """
        parsed = ResponseMessage(RequestMessage())
        parsed.feed(record.content[1])
        parsed.close()
        raw_body = parsed.get_body()

        # 16 + MAX_WBITS selects gzip framing for the decompressor.
        inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
        try:
            result = inflater.decompress(raw_body)
        except zlib.error:
            # Decompression failed; fall back to the stored bytes.
            result = raw_body
        return result
Пример #14
0
def parse_http_response(record):
    """Parse the HTTP response held in ``record``.

    Returns ``(status code, mime type, message)``.  The mime type is the
    Content-Type value before the first ``;`` (``None`` without such a
    header) and ``message`` is the parsed ``ResponseMessage``.  Trailing or
    truncated data only produces a warning on stderr.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()

    if remainder:
        print >> sys.stderr, 'warning: trailing data in http response for', record.url
    if not message.complete():
        print >> sys.stderr, 'warning: truncated http response for', record.url

    header = message.header
    content_types = [value for key, value in header.headers
                     if key.lower() == 'content-type']
    mime_type = content_types[0].split(';')[0] if content_types else None

    return header.code, mime_type, message
Пример #15
0
def parse_http_response(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            print 'trailing data in http response for', record.url
        if not message.complete():
            print 'truncated http response for', record.url
    header = message.header

    mime_type = [v for k, v in header.headers if k.lower() == b'content-type']
    if mime_type:
        mime_type = mime_type[0].split(b';')[0]
    else:
        mime_type = None

    return header.code, mime_type, message.get_body()
Пример #16
0
def dump_record(fh, outzip):
    for (offset, record, errors) in fh.read_records(limit=None):
        if record and record.type == WarcRecord.RESPONSE and record.content[
                0] == ResponseMessage.CONTENT_TYPE:
            message = ResponseMessage(RequestMessage())
            leftover = message.feed(record.content[1])
            message.close()

            outzip.writestr(re.sub(r'^https?://', '', record.url),
                            message.get_body())
            print(record.url)
        elif errors:
            print >> sys.stderr, "warc errors at %s:%d" % (name, offset
                                                           if offset else 0)
            for e in errors:
                print '\t', e
Пример #17
0
  def __call__(self, request):
    """Called by HTTPServer to execute the request.

    A URI matching ``WEB_RE`` or ``WEB_VIA_PROXY_RE`` is rewritten and
    delegated to ``self.web_handler``.  Every other URI is looked up via
    ``self.proxy_handler``; the archived response (or a fixed 404 message)
    is written and the request is finished.
    """
    web_match = (re.match(self.WEB_RE, request.uri)
                 or re.match(self.WEB_VIA_PROXY_RE, request.uri))

    if web_match:
      request.host = "warc"
      request.uri = web_match.group("uri")
      request.path, _sep, _query = request.uri.partition("?")
      self.web_handler.__call__(request)
      return

    # Archived headers with these names are not forwarded.
    omitted = ("connection", "content-length", "cache-control", "accept-ranges", "etag", "last-modified", "transfer-encoding")

    with self.proxy_handler.warc_record_for_uri(canonicalize_url(request.uri)) as record:
      if not record:
        print("Could not find %s in WARC" % request.uri)
        request.write("HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n")
      else:
        print("Serving %s from WARC" % request.uri)

        # parse the archived response
        stored = ResponseMessage(RequestMessage())
        stored.feed(record[1].content[1])
        stored.close()
        body = stored.get_body()

        # forwarded headers plus the prefixed copies of all originals
        forwarded = []
        originals = []
        for name, value in stored.header.headers:
          originals.append(("X-Archive-Orig-%s" % name, value))
          if name.lower() not in omitted:
            forwarded.append((name, value))
        forwarded.append(("Content-Length", "%d" % len(body)))
        forwarded.append(("Connection", "keep-alive"))

        # write status line, header block, blank line and body
        status = stored.header
        request.write("%s %d %s\r\n" % (status.version, status.code, status.phrase))
        request.write("\r\n".join("%s: %s" % pair for pair in forwarded + originals))
        request.write("\r\n\r\n")
        request.write(body)
    request.finish()
Пример #18
0
def parse_http_response(record):
  """Parse the HTTP response stored in ``record``.

  Returns ``(status code, mime type, message)``; the mime type is the
  Content-Type value before the first ``;`` or ``None`` when the header is
  missing, and ``message`` is the parsed ``ResponseMessage``.  Trailing or
  truncated data is reported on stdout only.
  """
  parsed = ResponseMessage(RequestMessage())
  extra = parsed.feed(record.content[1])
  parsed.close()

  if extra:
    print('trailing data in http response for %s'% record.url)
  if not parsed.complete():
    print('truncated http response for %s'%record.url)

  header = parsed.header
  declared = [value for name, value in header.headers if name.lower() =='content-type']
  mime_type = declared[0].split(';')[0] if declared else None

  return header.code, mime_type, parsed
Пример #19
0
def parse_http_response_charset(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    The content type is the Content-Type value up to the first ``;`` (any
    parameters such as ``charset`` are discarded), or ``None`` when the
    header is missing.  A truncated response is only reported on stdout.

    Raises:
        Exception: If data is left over after the HTTP message.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder:
        raise Exception('trailing data in http response for ' + str(record.url))
    if not message.complete():
        # Truncation is tolerated: report it and return what was parsed.
        print('truncated http response for ' + str(record.url))
    header = message.header

    content_types = [v for k, v in header.headers if k.lower() == b'content-type']
    if content_types:
        # Take the part before the first ';'.  The previous two-way unpack
        # of split(b';') raised ValueError unless exactly one ';' was
        # present (e.g. a plain "text/html").
        mime_type = content_types[0].split(b';')[0]
    else:
        mime_type = None

    return header.code, mime_type, message.get_body()
Пример #20
0
    def iter_zip(self):
        """Yield a zip file, chunk by chunk, built from the archived responses.

        ``ZipFile`` is given ``self`` as its output file object.  After each
        stored response, and once more after the zip file has been closed,
        the contents of ``self.buffer`` are yielded and the buffer is reset
        (presumably ``ZipFile`` fills it by writing to ``self`` -- confirm
        against the class's ``write`` method).

        Response bodies are stored under ``self.url_to_filename(record.url)``
        and stamped with the record date.  The zip ends with a ``files.txt``
        listing and, when errors were collected, an ``errors.txt``.
        """
        # Positions of year, month, day, hour, minute and second inside
        # record.date; assumes a YYYY-MM-DDThh:mm:ss style timestamp.
        field_bounds = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))

        with ZipFile(self, "w") as zip_out:
            for (offset, record,
                 errors) in self.archive.read_records(limit=None):
                storable = (
                    record
                    and record.type == WarcRecord.RESPONSE
                    and re.sub(r'\s+', '', record.content[0])
                    == ResponseMessage.CONTENT_TYPE)

                if storable:
                    http = ResponseMessage(RequestMessage())
                    http.feed(record.content[1])
                    http.close()

                    filename = self.url_to_filename(record.url)
                    raw_date = record.date
                    when = tuple(
                        int(raw_date[begin:end]) for begin, end in field_bounds)

                    zip_out.writestr(ZipInfo(filename, when), http.get_body())
                    self.files[filename] = record.url

                    # Pass on everything buffered so far, then reset.
                    for chunk in self.buffer:
                        yield chunk
                    self.buffer = []

                elif errors:
                    # NOTE(review): 'name' is not bound anywhere in this
                    # method; presumably a module-level global -- confirm,
                    # otherwise this line raises NameError.
                    self.errors.append("warc errors at %s:%d" %
                                       (name, offset or 0))
                    self.errors.extend(errors)

            mapping = [
                "%s -> %s" % (url, stored_as)
                for stored_as, url in self.files.iteritems()
            ]
            zip_out.writestr("files.txt", "\n".join(mapping))
            if self.errors:
                zip_out.writestr("errors.txt", "\n".join(self.errors))

        # Closing the ZipFile may have produced more output; emit it too.
        for chunk in self.buffer:
            yield chunk

        self.buffer = []
Пример #21
0
def process(record, previous_record, out, options, found_hrefs):
    """Filter one WARC record and write it (and possibly its request) to ``out``.

    Args:
        record: The record being processed.
        previous_record: The record read just before ``record``.  With
            ``options.strip_404s`` it must be the request that belongs to a
            response which is kept.
        out: Destination handed to ``write_to``.
        options: Object providing ``wget_workaround``, ``decode_http``,
            ``strip_404s`` and ``gzip``.
        found_hrefs: ``None``, or a collection with an ``update()`` method
            (presumably a set) that receives the hrefs found by
            ``JSON_HREF_RE`` in decoded 200 responses.

    Raises:
        RuntimeError: With ``options.strip_404s``, when a response that is
            kept is not directly preceded by a request record.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Both decoding and the 404 check need the parsed message, so parse
    # when either option is set.  Binding it only under decode_http made
    # strip_404s on its own fail with an unbound local.
    message = None
    needs_parse = options.decode_http or options.strip_404s
    if needs_parse and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content

        if content_type == ResponseMessage.CONTENT_TYPE:
            # technically, a http request needs to know the request to be parsed
            # because responses to head requests don't have a body.
            # we assume we don't store 'head' responses, and plough on
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()

            if leftover or not message.complete():
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)" %
                                 (message.mode, message.header.mode))
                sys.stderr.write('errors decoding http in record %s %s\n' %
                                 (record.id, ",".join(error)))
            elif options.decode_http:
                # The payload is only rewritten when decoding was requested.
                content = message.get_decoded_message()

                if found_hrefs is not None and message.header.code == 200:
                    # The slice trims 12 leading and 2 trailing characters
                    # from each match; presumably the JSON key and quotes
                    # around the href -- confirm against JSON_HREF_RE.
                    found_hrefs.update(
                        match[12:-2]
                        for match in JSON_HREF_RE.findall(content))

                record.content = content_type, content

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    # message stays None when the payload was not an HTTP message; such a
    # response cannot be identified as a 404 and is kept.
    status = getattr(message.header, "code", None) if message else None
    if status == 404:
        # If 404, don't write out either the request or the response
        return

    if previous_record is None:
        raise RuntimeError(
            "Need to write out previous record as well, but it isn't present"
        )
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" %
                           (previous_record.type, ))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)