def warcinfo_record(warc_filename):
    """Build the ``warcinfo`` WarcRecord for ``warc_filename``.

    This record is required at the beginning of a WARC file. Its body is a
    small ``application/warc-fields`` block, and its record ID is derived
    (through ``warc_uuid``) from that body plus the current UTC timestamp.
    """
    timestamp = warc_datetime_str(datetime.utcnow())

    field_lines = (
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf",
    )
    fields = "\r\n".join(field_lines)
    media_type = b'application/warc-fields'

    record_headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, media_type),
        (WarcRecord.ID, warc_uuid(fields + timestamp)),
        (WarcRecord.DATE, timestamp),
        (WarcRecord.FILENAME, warc_filename),
    ]
    # The body is the field block terminated by a final CRLF.
    return WarcRecord(
        headers=record_headers,
        content=(media_type, fields + "\r\n"),
        version=b"WARC/1.0",
    )
def _init_file(self):
    """Write a ``warcinfo`` record describing this file via ``write_record``.

    The headers carry a random record ID, the current UTC time, the base
    name of ``self._file_name`` and ``self._main_url``; the body is an
    ``application/warc-fields`` block.
    """
    headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.ID, WarcRecord.random_warc_uuid()),
        (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())),
        (WarcRecord.FILENAME, os.path.basename(self._file_name)),
        (Warc.MAIN_URL, self._main_url),
    ]

    field_lines = (
        "software: bardo",
        "format: WARC File Format 1.0",
        "conformsTo: " + CONFORMS_TO,
        "robots: unknown",
    )
    body = "\r\n".join(field_lines)

    record = WarcRecord(headers=headers,
                        content=("application/warc-fields", body))
    self.write_record(record)
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return a ``resource`` WarcRecord for it.

    The record URL is ``https://twitter.com/<screen_name>/status/<id>`` and
    the record date is taken from the tweet's ``timestamp_ms`` field
    (milliseconds since the epoch, interpreted as UTC).

    Returns None when the tweet has no ``user`` key (deleted tweet), or when
    the JSON cannot be parsed or lacks/garbles a required field; in the
    latter case the error is logged.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        if 'user' not in tweet:
            return None
        url = "https://twitter.com/%s/status/%s" % (
            tweet['user']['screen_name'], tweet['id'])
        # Read the timestamp inside the guarded section so that a missing or
        # malformed ``timestamp_ms`` is logged and skipped like any other
        # malformed tweet instead of raising to the caller.
        tweet_time = datetime.utcfromtimestamp(
            float(tweet['timestamp_ms']) / 1000.0)
    except Exception:
        # Deliberately best-effort: one bad tweet must not abort the caller.
        logging.error('error in tweet_warc_record', exc_info=1)
        return None

    warc_date = warc_datetime_str(tweet_time)
    return WarcRecord(
        headers=[
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.CONTENT_TYPE, b'application/json'),
            (WarcRecord.ID, warc_uuid(url + warc_date)),
            (WarcRecord.URL, url),
            (WarcRecord.DATE, warc_date),
        ],
        content=(b'application/json', tweet_json + "\r\n"),
        version=b"WARC/1.0")
def write(self, response, fh):
    """Write ``response`` and the request that produced it to ``fh``.

    Two WARC records are written, the request first and then the response.
    Each record is given the other's ID as the last argument of
    ``warc.make_request`` / ``warc.make_response`` (presumably the
    concurrent-to link -- confirm against the ``warc`` module).

    ``response`` must expose ``request``, ``status_code``, ``headers``,
    ``content`` and ``url``; the request must expose ``method``,
    ``full_url``, ``url``, ``headers`` and ``_enc_data``.
    """
    request = response.request

    # WARC record IDs are URIs; the URN namespace for UUIDs is "urn:uuid:".
    request_id = "<urn:uuid:%s>" % uuid4()
    response_id = "<urn:uuid:%s>" % uuid4()
    date = warc.warc_datetime_str(datetime.utcnow())

    # Rebuild the raw HTTP request: request line, headers, blank line, body.
    request_raw = ["%s %s HTTP/1.1" % (request.method, request.full_url)]
    request_raw.extend(
        "%s: %s" % (k, v) for k, v in request.headers.iteritems())
    content = request._enc_data
    request_raw.extend([("Content-Length: %d" % len(content)), "", content])
    request_raw = "\r\n".join(str(s) for s in request_raw)

    # Rebuild the raw HTTP response the same way; the reason phrase is not
    # taken from ``response``, so "-" is written in its place.
    response_raw = ["HTTP/1.1 %d -" % (response.status_code)]
    response_raw.extend(
        "%s: %s" % (k, v) for k, v in response.headers.iteritems())
    content = response.content
    response_raw.extend([("Content-Length: %d" % len(content)), "", content])
    response_raw = "\r\n".join(str(s) for s in response_raw)

    requestw = warc.make_request(
        request_id, date, request.url,
        ('application/http;msgtype=request', request_raw), response_id)
    responsew = warc.make_response(
        response_id, date, response.url,
        ('application/http;msgtype=response', response_raw), request_id)

    requestw.write_to(fh)
    responsew.write_to(fh)
def main(argv):
    """Convert the ARC files named in ``argv`` into WARC records.

    Records are written to ``options.output`` (opened for binary writing)
    or to standard output when no output file is given. An output name
    ending in ``.gz`` turns gzip output on.

    For each ARC ``filedesc`` record a ``warcinfo`` record and a
    ``metadata`` record holding the raw ARC record are written; every other
    ARC record becomes a ``response`` record.

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate the arguments before touching the output file so that a usage
    # error does not create or truncate it.
    if not input_files:
        parser.error("no imput warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url + record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(
                            record.date, '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        # Later records of this archive point back to this
                        # warcinfo record through WARCINFO_ID.
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # presumably drops the 11-character "filedesc://"
                        # prefix from the URL -- confirm against ARC input
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))
                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC filedesc record as metadata.
                        warc_id = make_warc_uuid(
                            record.url + record.date + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Close the input archive even if conversion fails part-way.
                fh.close()
    finally:
        # Only close what this function opened; leave sys.stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0
def main(argv):
    """Convert the ARC files named in ``argv`` and append them as WARC records.

    Records are appended to ``options.output`` (opened in binary append
    mode) or written to standard output when no output file is given. An
    output name ending in ``.gz`` turns gzip output on.

    For each ARC ``filedesc`` record a ``warcinfo`` record and a
    ``metadata`` record holding the raw ARC record are written; every other
    ARC record becomes a ``response`` record.

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate the arguments before touching the output file so that a usage
    # error does not create it.
    if not input_files:
        parser.error("no imput warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"
    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url + record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(
                            record.date, '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        # Later records of this archive point back to this
                        # warcinfo record through WARCINFO_ID.
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # presumably drops the 11-character "filedesc://"
                        # prefix from the URL -- confirm against ARC input
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))
                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Keep the original ARC filedesc record as metadata.
                        warc_id = make_warc_uuid(
                            record.url + record.date + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(
                            headers=headers,
                            content=(content_type, content),
                            version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Close the input archive even if conversion fails part-way.
                fh.close()
    finally:
        # Only close what this function opened; leave sys.stdout alone.
        if out is not sys.stdout:
            out.close()

    return 0
def _reply_finished(self):
    """Handle the end of the network reply and archive it as a WARC response.

    Disconnects this object's slots from the reply's signals. If the reply
    carries no valid HTTP status code, the buffered data is discarded and
    ``finished`` is emitted from a zero-delay timer. Otherwise the status
    line, headers and buffered body are assembled into a raw HTTP response,
    wrapped in a WARC response record, written to the manager's current
    WARC, and used to initialise this object through
    ``_init_from_warc_record``.
    """
    self._network_reply.readyRead.disconnect(self._reply_ready_read)
    self._network_reply.finished.disconnect(self._reply_finished)
    self._network_reply.error.disconnect(self._reply_error)

    status_code = self._network_reply.attribute(
        QNetworkRequest.HttpStatusCodeAttribute)
    if not status_code.isValid():
        # No HTTP status available: drop the buffered data and only signal
        # completion.
        self._temp_data.close()
        self._temp_data = None
        self._network_reply = None
        QTimer.singleShot(0, lambda: self.finished.emit())
        return

    headers = dict()
    for header in self._network_reply.rawHeaderList():
        temp = str(self._network_reply.rawHeader(header))
        # Replace every whitespace character (including embedded line
        # breaks) with a plain space so each header stays on one line.
        headers[str(header)] = re.sub(r"\s", " ", temp)

    elements = [name + ": " + value
                for name, value in headers.iteritems()]
    # The trailing empty element yields the CRLF that ends the last header.
    elements.append("")

    url = qstring_to_str(self._network_reply.url().toString())
    status_msg = self._network_reply.attribute(
        QNetworkRequest.HttpReasonPhraseAttribute)
    assert(status_msg.isValid())

    self._temp_data.seek(0)

    # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
    h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
        + str(status_msg.toString())
    content_data = h_status + "\r\n" \
        + "\r\n".join(elements) + "\r\n" \
        + self._temp_data.read()
    content_type = ResponseMessage.CONTENT_TYPE
    content = (content_type, content_data)

    wr = warc.make_response(WarcRecord.random_warc_uuid(),
                            warc.warc_datetime_str(datetime.utcnow()),
                            url, content, None)

    self._temp_data.close()
    self._temp_data = None

    self.manager().current_warc.write_record(wr)
    self._init_from_warc_record(wr)
    self._network_reply = None
def _reply_finished(self):
    """Handle the end of the network reply and archive it as a WARC response.

    Disconnects this object's slots from the reply's signals. If the reply
    carries no valid HTTP status code, the buffered data is discarded and
    ``finished`` is emitted from a zero-delay timer. Otherwise the status
    line, headers and buffered body are assembled into a raw HTTP response,
    wrapped in a WARC response record, written to the manager's current
    WARC, and used to initialise this object through
    ``_init_from_warc_record``.
    """
    self._network_reply.readyRead.disconnect(self._reply_ready_read)
    self._network_reply.finished.disconnect(self._reply_finished)
    self._network_reply.error.disconnect(self._reply_error)

    status_code = self._network_reply.attribute(
        QNetworkRequest.HttpStatusCodeAttribute)
    if not status_code.isValid():
        # No HTTP status available: drop the buffered data and only signal
        # completion.
        self._temp_data.close()
        self._temp_data = None
        self._network_reply = None
        QTimer.singleShot(0, lambda: self.finished.emit())
        return

    headers = dict()
    for header in self._network_reply.rawHeaderList():
        temp = str(self._network_reply.rawHeader(header))
        # Replace every whitespace character (including embedded line
        # breaks) with a plain space so each header stays on one line.
        headers[str(header)] = re.sub(r"\s", " ", temp)

    elements = [name + ": " + value
                for name, value in headers.iteritems()]
    # The trailing empty element yields the CRLF that ends the last header.
    elements.append("")

    url = qstring_to_str(self._network_reply.url().toString())
    status_msg = self._network_reply.attribute(
        QNetworkRequest.HttpReasonPhraseAttribute)
    assert (status_msg.isValid())

    self._temp_data.seek(0)

    # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
    h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
        + str(status_msg.toString())
    content_data = h_status + "\r\n" \
        + "\r\n".join(elements) + "\r\n" \
        + self._temp_data.read()
    content_type = ResponseMessage.CONTENT_TYPE
    content = (content_type, content_data)

    wr = warc.make_response(WarcRecord.random_warc_uuid(),
                            warc.warc_datetime_str(datetime.utcnow()),
                            url, content, None)

    self._temp_data.close()
    self._temp_data = None

    self.manager().current_warc.write_record(wr)
    self._init_from_warc_record(wr)
    self._network_reply = None