def warcinfo_record(warc_filename):
    """Build the warcinfo WarcRecord for ``warc_filename``.

    This record is meant to be written at the beginning of a WARC file.
    Its record ID is derived (via ``warc_uuid``) from the metadata text
    concatenated with the creation date string, and its body is the
    metadata text followed by a CRLF.
    """
    created = warc_datetime_str(datetime.utcnow())
    field_lines = [
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf",
    ]
    fields_text = "\r\n".join(field_lines)

    record_headers = [
        (WarcRecord.TYPE, WarcRecord.WARCINFO),
        (WarcRecord.CONTENT_TYPE, b'application/warc-fields'),
        (WarcRecord.ID, warc_uuid(fields_text + created)),
        (WarcRecord.DATE, created),
        (WarcRecord.FILENAME, warc_filename),
    ]
    payload = (b'application/warc-fields', fields_text + "\r\n")

    return WarcRecord(headers=record_headers,
                      content=payload,
                      version=b"WARC/1.0")
示例#2
0
    def _init_file(self):
        """Write the opening ``warcinfo`` record of this WARC file.

        The record carries a random record ID, the current UTC time, the
        base name of ``self._file_name`` and ``self._main_url``; its body
        is a CRLF-joined ``application/warc-fields`` block. The finished
        record is handed to ``self.write_record``.
        """
        record_id = WarcRecord.random_warc_uuid()
        created = warc.warc_datetime_str(datetime.utcnow())
        base_name = os.path.basename(self._file_name)

        headers = [
            (WarcRecord.TYPE, WarcRecord.WARCINFO),
            (WarcRecord.ID, record_id),
            (WarcRecord.DATE, created),
            (WarcRecord.FILENAME, base_name),
            (Warc.MAIN_URL, self._main_url),
        ]

        field_lines = (
            "software: bardo",
            "format: WARC File Format 1.0",
            "conformsTo: " + CONFORMS_TO,
            "robots: unknown",
        )
        body = "\r\n".join(field_lines)

        record = WarcRecord(headers=headers,
                            content=("application/warc-fields", body))

        self.write_record(record)
示例#3
0
    def _init_file(self):
        """Emit the ``warcinfo`` record that starts this WARC file.

        Headers: record type, a random record ID, the current UTC date,
        the base name of ``self._file_name`` and ``self._main_url``.
        Body: four ``application/warc-fields`` lines joined with CRLF.
        The record is written through ``self.write_record``.
        """
        info_headers = []
        info_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))
        info_headers.append((WarcRecord.ID, WarcRecord.random_warc_uuid()))
        info_headers.append(
            (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())))
        info_headers.append(
            (WarcRecord.FILENAME, os.path.basename(self._file_name)))
        info_headers.append((Warc.MAIN_URL, self._main_url))

        info_fields = "\r\n".join((
            "software: bardo",
            "format: WARC File Format 1.0",
            "conformsTo: " + CONFORMS_TO,
            "robots: unknown",
        ))

        info_record = WarcRecord(
            headers=info_headers,
            content=("application/warc-fields", info_fields),
        )
        self.write_record(info_record)
示例#4
0
def tweet_warc_record(tweet_json):
    """Parse Tweet JSON and return a ``resource`` WarcRecord for it.

    The record's target URL is
    ``https://twitter.com/<screen_name>/status/<id>`` and its date is taken
    from the tweet's ``timestamp_ms`` field (divided by 1000 and read as a
    UTC timestamp). The record ID is derived via ``warc_uuid`` from the URL
    plus the date string, and the payload is ``tweet_json`` followed by CRLF.

    Returns ``None`` when the parsed JSON has no ``user`` key (treated as a
    deleted tweet), or when the JSON cannot be parsed or lacks the fields
    needed for the URL or the timestamp; in that second case the error is
    logged with its traceback.
    """
    try:
        tweet = json.loads(tweet_json)
        # skip deleted tweet
        if 'user' not in tweet:
            return None
        url = "https://twitter.com/%s/status/%s" % (
            tweet['user']['screen_name'], tweet['id'])
        # Parsed inside the guarded section so a missing or malformed
        # timestamp is reported like any other bad input instead of raising.
        tweet_time = datetime.utcfromtimestamp(
            float(tweet['timestamp_ms']) / 1000.0)
    except Exception:
        # Deliberately broad: this is a best-effort input boundary, so any
        # malformed tweet is logged and skipped rather than aborting the run.
        logging.error('error in tweet_warc_record', exc_info=1)
        return None

    warc_date = warc_datetime_str(tweet_time)
    return WarcRecord(headers=[(WarcRecord.TYPE, WarcRecord.RESOURCE),
                               (WarcRecord.CONTENT_TYPE, b'application/json'),
                               (WarcRecord.ID, warc_uuid(url + warc_date)),
                               (WarcRecord.URL, url),
                               (WarcRecord.DATE, warc_date)],
                      content=(b'application/json', tweet_json + "\r\n"),
                      version=b"WARC/1.0")
示例#5
0
文件: crawler.py 项目: ersi/crawler
    def write(self, response, fh):
        """Write *response* and the request that produced it to *fh* as WARC records.

        Both HTTP messages are re-serialised as text (start line, headers, a
        ``Content-Length`` computed from the body, a blank line, then the
        body). They are wrapped in a WARC request record and a WARC response
        record, each created with the other's record ID, and written to *fh*
        with the request first.

        response -- object exposing ``request``, ``status_code``, ``headers``,
                    ``content`` and ``url``.
        fh       -- output handle passed to each record's ``write_to``.
        """
        def serialise(start_line, headers, body):
            # One HTTP message: start line, "name: value" header lines, a
            # recomputed Content-Length, an empty line, then the body.
            parts = [start_line]
            parts.extend("%s: %s" % (k, v) for k, v in headers.iteritems())
            parts.extend([("Content-Length: %d" % len(body)), "", body])
            return "\r\n".join(str(s) for s in parts)

        request = response.request
        # WARC record IDs are URIs; the UUID URN namespace is "urn:uuid:"
        # (the previous "uin:uuid:" spelling was a typo).
        request_id = "<urn:uuid:%s>" % uuid4()
        response_id = "<urn:uuid:%s>" % uuid4()
        date = warc.warc_datetime_str(datetime.utcnow())

        request_raw = serialise(
            "%s %s HTTP/1.1" % (request.method, request.full_url),
            request.headers,
            request._enc_data)

        # The reason phrase is not taken from the response; "-" is written
        # in its place.
        response_raw = serialise(
            "HTTP/1.1 %d -" % (response.status_code),
            response.headers,
            response.content)

        requestw = warc.make_request(
            request_id, date, request.url,
            ('application/http;msgtype=request', request_raw), response_id)
        responsew = warc.make_response(
            response_id, date, response.url,
            ('application/http;msgtype=response', response_raw), request_id)

        requestw.write_to(fh)
        responsew.write_to(fh)
示例#6
0
def main(argv):
    """Convert the ARC archives named in *argv* into WARC records.

    Options and input file names are read from ``argv[1:]`` through the
    module-level ``parser``. Records go to ``options.output`` when it is set
    (gzip is forced on for names ending in ``.gz``), otherwise to
    ``sys.stdout``.

    For every record of every input archive:

    * a ``filedesc`` record becomes a ``warcinfo`` record followed by a
      ``metadata`` record that carries the raw ARC record;
    * any other record becomes a ``response`` record referring to the
      ``warcinfo`` ID last seen in the same input (``None`` if there was
      none yet).

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate before touching the output path: parser.error() is expected
    # to exit (optparse behaviour), and opening with 'wb' first would already
    # have truncated an existing output file.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'wb')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url+record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,'%Y%m%d%H%M%S')
                        headers.append((WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # Drops the first 11 characters of the URL, i.e. the
                        # length of a "filedesc://" prefix.
                        warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))

                        warcinfo_content = ('application/warc-fields', 'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Second record for the same filedesc entry: the raw
                        # ARC record preserved as WARC metadata, dated like
                        # the warcinfo record just written.
                        warc_id = make_warc_uuid(record.url+record.date+"-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=version)

                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Release the archive handle even if a record fails to convert.
                fh.close()
    finally:
        # Only close what this function opened; never close sys.stdout.
        if out is not sys.stdout:
            out.close()

    return 0
示例#7
0
def main(argv):
    """Convert the ARC archives named in *argv* and append them as WARC records.

    Options and input file names are read from ``argv[1:]`` through the
    module-level ``parser``. Records are appended to ``options.output`` when
    it is set (gzip is forced on for names ending in ``.gz``), otherwise they
    are written to ``sys.stdout``.

    For every record of every input archive:

    * a ``filedesc`` record becomes a ``warcinfo`` record followed by a
      ``metadata`` record that carries the raw ARC record;
    * any other record becomes a ``response`` record referring to the
      ``warcinfo`` ID last seen in the same input (``None`` if there was
      none yet).

    Returns 0.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    # Validate before touching the output path: parser.error() is expected
    # to exit (optparse behaviour), and opening first would leave behind a
    # freshly created, empty output file.
    if not input_files:
        parser.error("no input warc file(s)")

    out = sys.stdout
    if options.output:
        out = open(options.output, 'ab')
        if options.output.endswith('.gz'):
            options.gzip = True

    version = "WARC/1.0"

    try:
        for name in input_files:
            fh = ArcRecord.open_archive(name, gzip="auto")
            try:
                warcinfo_id = None
                for record in fh:
                    warc_id = make_warc_uuid(record.url + record.date)
                    headers = [
                        (WarcRecord.ID, warc_id),
                    ]
                    if record.date:
                        date = datetime.datetime.strptime(record.date,
                                                          '%Y%m%d%H%M%S')
                        headers.append(
                            (WarcRecord.DATE, warc_datetime_str(date)))

                    if record.type == 'filedesc':
                        warcinfo_id = warc_id

                        warcinfo_headers = list(headers)
                        # Drops the first 11 characters of the URL, i.e. the
                        # length of a "filedesc://" prefix.
                        warcinfo_headers.append(
                            (WarcRecord.FILENAME, record.url[11:]))
                        warcinfo_headers.append(
                            (WarcRecord.TYPE, WarcRecord.WARCINFO))

                        warcinfo_content = ('application/warc-fields',
                                            'software: hanzo.arc2warc\r\n')

                        warcrecord = WarcRecord(headers=warcinfo_headers,
                                                content=warcinfo_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)

                        # Second record for the same filedesc entry: the raw
                        # ARC record preserved as WARC metadata, dated like
                        # the warcinfo record just written.
                        warc_id = make_warc_uuid(
                            record.url + record.date + "-meta")
                        warcmeta_headers = [
                            (WarcRecord.TYPE, WarcRecord.METADATA),
                            (WarcRecord.CONCURRENT_TO, warcinfo_id),
                            (WarcRecord.ID, warc_id),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.DATE, warcrecord.date),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ]
                        warcmeta_content = ('application/arc', record.raw())

                        warcrecord = WarcRecord(headers=warcmeta_headers,
                                                content=warcmeta_content,
                                                version=version)
                        warcrecord.write_to(out, gzip=options.gzip)
                    else:
                        content_type, content = record.content
                        if record.url.startswith('http'):
                            # don't promote content-types for http urls,
                            # they contain headers in the body.
                            content_type = "application/http;msgtype=response"

                        headers.extend([
                            (WarcRecord.TYPE, WarcRecord.RESPONSE),
                            (WarcRecord.URL, record.url),
                            (WarcRecord.WARCINFO_ID, warcinfo_id),
                        ])

                        warcrecord = WarcRecord(headers=headers,
                                                content=(content_type, content),
                                                version=version)

                        warcrecord.write_to(out, gzip=options.gzip)
            finally:
                # Release the archive handle even if a record fails to convert.
                fh.close()
    finally:
        # Only close what this function opened; never close sys.stdout.
        if out is not sys.stdout:
            out.close()

    return 0
示例#8
0
    def _reply_finished(self):
        """Handle completion of the network reply and archive it as a WARC response.

        Disconnects this object's slots from the reply's signals. If the
        reply has no valid HTTP status code, the buffered data is discarded
        and ``finished`` is emitted from a zero-delay timer. Otherwise an
        HTTP response message (status line, headers, blank line, buffered
        body) is rebuilt, wrapped in a WARC response record with a random
        record ID and the current UTC date, written to the manager's current
        WARC, and used to initialise this object via
        ``_init_from_warc_record``.
        """
        reply = self._network_reply
        reply.readyRead.disconnect(self._reply_ready_read)
        reply.finished.disconnect(self._reply_finished)
        reply.error.disconnect(self._reply_error)

        status_code = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)

        if not status_code.isValid():
            # No HTTP status available: drop the buffer and just signal
            # completion.
            self._temp_data.close()
            self._temp_data = None
            self._network_reply = None

            QTimer.singleShot(0, lambda: self.finished.emit())

            return

        # Every whitespace character in a header value is replaced by a plain
        # space, so each header occupies exactly one line of the rebuilt
        # message.
        headers = dict()
        for header in reply.rawHeaderList():
            temp = str(reply.rawHeader(header))
            headers[str(header)] = re.sub(r"\s", " ", temp)

        elements = [name + ": " + value
                    for name, value in headers.iteritems()]
        # The trailing empty element makes the joined header block end in CRLF.
        elements.append("")

        url = qstring_to_str(reply.url().toString())

        status_msg = reply.attribute(QNetworkRequest.HttpReasonPhraseAttribute)

        assert(status_msg.isValid())

        self._temp_data.seek(0)

        # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
        h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
                + str(status_msg.toString())

        content_data = h_status + "\r\n" \
                + "\r\n".join(elements) + "\r\n" \
                + self._temp_data.read()

        content = (ResponseMessage.CONTENT_TYPE, content_data)

        wr = warc.make_response(WarcRecord.random_warc_uuid(),
                warc.warc_datetime_str(datetime.utcnow()), url, content, None)

        self._temp_data.close()
        self._temp_data = None

        self.manager().current_warc.write_record(wr)

        self._init_from_warc_record(wr)

        self._network_reply = None
示例#9
0
    def _reply_finished(self):
        """Handle completion of the network reply and archive it as a WARC response.

        Disconnects this object's slots from the reply's signals. If the
        reply has no valid HTTP status code, the buffered data is discarded
        and ``finished`` is emitted from a zero-delay timer. Otherwise an
        HTTP response message (status line, headers, blank line, buffered
        body) is rebuilt, wrapped in a WARC response record with a random
        record ID and the current UTC date, written to the manager's current
        WARC, and used to initialise this object via
        ``_init_from_warc_record``.
        """
        reply = self._network_reply
        reply.readyRead.disconnect(self._reply_ready_read)
        reply.finished.disconnect(self._reply_finished)
        reply.error.disconnect(self._reply_error)

        status_code = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)

        if not status_code.isValid():
            # No HTTP status available: drop the buffer and just signal
            # completion.
            self._temp_data.close()
            self._temp_data = None
            self._network_reply = None

            QTimer.singleShot(0, lambda: self.finished.emit())

            return

        # Every whitespace character in a header value is replaced by a plain
        # space, so each header occupies exactly one line of the rebuilt
        # message.
        headers = dict()
        for header in reply.rawHeaderList():
            temp = str(reply.rawHeader(header))
            headers[str(header)] = re.sub(r"\s", " ", temp)

        elements = [name + ": " + value
                    for name, value in headers.iteritems()]
        # The trailing empty element makes the joined header block end in CRLF.
        elements.append("")

        url = qstring_to_str(reply.url().toString())

        status_msg = reply.attribute(QNetworkRequest.HttpReasonPhraseAttribute)

        assert (status_msg.isValid())

        self._temp_data.seek(0)

        # XXX: we can't get HTTP version from Qt webkit, assumes 1.1
        h_status = "HTTP/1.1 " + str(status_code.toString()) + " " \
                + str(status_msg.toString())

        content_data = h_status + "\r\n" \
                + "\r\n".join(elements) + "\r\n" \
                + self._temp_data.read()

        content = (ResponseMessage.CONTENT_TYPE, content_data)

        wr = warc.make_response(WarcRecord.random_warc_uuid(),
                                warc.warc_datetime_str(datetime.utcnow()), url,
                                content, None)

        self._temp_data.close()
        self._temp_data = None

        self.manager().current_warc.write_record(wr)

        self._init_from_warc_record(wr)

        self._network_reply = None