Python WARCFile示例

编程语言: Python

命名空间/包名称: warc

类/类型: WARCFile

hotexamples.com的示例: 7

Python WARCFile - 已找到 7 个示例。这些是从开源项目中提取的、最受好评的 warc.WARCFile 真实 Python 代码示例。您可以对示例进行评分，以帮助我们提高示例质量。

常用方法

显示隐藏

WARCFile(5)

close(2)

write_record(2)

read_record(1)

示例#1

显示文件

文件： dump.py 项目： EHRI/resync

    def write_warc(self, resources=None, dumpfile=None):
        """Write a WARC dump file.

        WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
        in this library for experimentation.

        Arguments:
            resources: iterable of resource objects; each one must provide
                ``uri``, ``lastmod`` and ``path`` attributes.
            dumpfile: path of the WARC file to create.

        Raises:
            DumpError: if the ``warc`` library cannot be imported.
        """
        # Load library late as we want to be able to run rest of code
        # without this installed
        try:
            from warc import WARCFile, WARCHeader, WARCRecord
        except ImportError:
            # Catch only the import failure; a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            raise DumpError("Failed to load WARC library")
        wf = WARCFile(dumpfile, mode="w", compress=self.compress)
        try:
            # Add all files in the resources
            for resource in resources:
                wh = WARCHeader({})
                wh.url = resource.uri
                wh.ip_address = None
                wh.date = resource.lastmod
                wh.content_type = 'text/plain'
                wh.result_code = 200
                # NOTE(review): fixed placeholder value, not computed from
                # the resource content.
                wh.checksum = 'aabbcc'
                wh.location = self.archive_path(resource.path)
                wf.write_record(WARCRecord(header=wh, payload=resource.path))
        finally:
            # Close the dump file even when writing a record fails, so the
            # file handle is never leaked.
            wf.close()
        warcsize = os.path.getsize(dumpfile)
        self.logging.info("Wrote WARC file dump %s with size %d bytes" % (dumpfile,warcsize))

示例#2

显示文件

文件： commoncrawl.py 项目： rodricios/sift

    def parse_warc_content(buf):
        """Yield ``(url, body)`` for the first WARC record held in *buf*.

        Only the first record is read. A pair is yielded only when the
        record payload begins with an HTTP status line of the form
        ``HTTP/x.y 200 OK`` and contains a blank line separating the HTTP
        headers from the body; otherwise the generator yields nothing.

        Arguments:
            buf: string containing serialized WARC data.

        Yields:
            Tuple of the record URL and the payload text that follows the
            first ``\\r\\n\\r\\n`` separator.
        """
        wf = WARCFile(fileobj=StringIO(buf))
        record = wf.read_record()
        if record is None:
            # No record could be read (e.g. empty buffer): produce nothing
            # instead of failing with AttributeError on ``None.payload``.
            return
        payload = record.payload.read()
        # "HTTP/1.1 200 OK" is exactly 15 characters, so the status line is
        # checked by looking at both ends of this prefix.
        top = payload[:15]

        if top.startswith('HTTP/') and top.endswith('200 OK'):
            content_start = payload.find('\r\n\r\n')
            if content_start != -1:
                # Skip the 4-character separator itself.
                yield record.url, payload[content_start + 4:]

示例#3

显示文件

 def read_warc(self, key):
     """Yield the 'response' records of a gzip-compressed WARC file on S3.

     The object is opened from the ``aws-publicdatasets`` bucket under
     *key*. For each record whose type is ``'response'`` the 'match'
     counter (grouped under this class's name) is incremented by one
     before the record is yielded; other records are skipped.
     """
     s3_url = 's3://aws-publicdatasets/{key}'.format(key=key)
     with self.s3.open(s3_url, 'rb') as stream:
         archive = WARCFile(fileobj=stream, compress='gzip')
         for entry in archive.reader:
             # Skip everything that is not a response record.
             if record_is_not_response(entry):
                 continue
             self.increment_counter(self.__class__.__name__, 'match', 1)
             yield entry


 def record_is_not_response(entry):
     """Return True when *entry* is not a WARC 'response' record."""
     return not (entry.type == 'response')

示例#4

显示文件

 def read_warc(self, key):
     """Yield every record of a gzip-compressed WARC file on S3.

     The object is opened from the ``commoncrawl`` bucket under *key* and
     decompressed through ``gzip.GzipFile`` before being handed to
     ``WARCFile``. The 'match' counter (grouped under this class's name)
     is incremented by one for each record before it is yielded.
     """
     s3_url = 's3://commoncrawl/{key}'.format(key=key)
     with self.s3.open(s3_url, 'rb') as stream:
         unzipped = gzip.GzipFile(fileobj=stream)
         archive = WARCFile(fileobj=unzipped)
         # No filtering on record type here: every record is counted
         # and yielded, not just 'response' records.
         for entry in archive:
             self.increment_counter(self.__class__.__name__, 'match', 1)
             yield entry

示例#5

显示文件

    def write_warc(self, resources=None, dumpfile=None):
        """Write a WARC dump file.

        WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
        in this library for experimentation.

        Arguments:
            resources: iterable of resource objects; each one must provide
                ``uri``, ``lastmod`` and ``path`` attributes.
            dumpfile: path of the WARC file to create.

        Raises:
            DumpError: if the ``warc`` library cannot be imported.
        """
        # Load library late as we want to be able to run rest of code
        # without this installed
        try:
            from warc import WARCFile, WARCHeader, WARCRecord
        except ImportError:
            # Catch only the import failure; a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            raise DumpError("Failed to load WARC library")
        wf = WARCFile(dumpfile, mode="w", compress=self.compress)
        try:
            # Add all files in the resources
            for resource in resources:
                wh = WARCHeader({})
                wh.url = resource.uri
                wh.ip_address = None
                wh.date = resource.lastmod
                wh.content_type = 'text/plain'
                wh.result_code = 200
                # NOTE(review): fixed placeholder value, not computed from
                # the resource content.
                wh.checksum = 'aabbcc'
                wh.location = self.archive_path(resource.path)
                wf.write_record(WARCRecord(header=wh, payload=resource.path))
        finally:
            # Close the dump file even when writing a record fails, so the
            # file handle is never leaked.
            wf.close()
        warcsize = os.path.getsize(dumpfile)
        self.logging.info("Wrote WARC file dump %s with size %d bytes" %
                          (dumpfile, warcsize))

示例#6

显示文件

文件： dump.py 项目： pedak/resdbp

 def write_warc(self, inventory=None, dumpfile=None):
     """Write a WARC dump file.

     One WARC record is written per resource in *inventory*, then the
     resulting file size is printed to standard output.

     Arguments:
         inventory: iterable of resource objects; each one must provide
             ``uri``, ``lastmod`` and ``file`` attributes.
         dumpfile: path of the WARC file to create.

     Raises:
         DumpError: if the ``warc`` library cannot be imported.
     """
     # Load library late as we want to be able to run rest of code
     # without this installed
     try:
         from warc import WARCFile, WARCHeader, WARCRecord
     except ImportError:
         # Catch only the import failure; a bare except would also
         # swallow KeyboardInterrupt and SystemExit.
         raise DumpError("Failed to load WARC library")
     wf = WARCFile(dumpfile, mode="w", compress=self.compress)
     try:
         # Add all files in the inventory
         for resource in inventory:
             wh = WARCHeader({})
             wh.url = resource.uri
             wh.ip_address = None
             wh.date = resource.lastmod
             wh.content_type = 'text/plain'
             wh.result_code = 200
             # NOTE(review): checksum and location are fixed placeholder
             # values, not derived from the resource.
             wh.checksum = 'aabbcc'
             wh.location = 'loc'
             wf.write_record(WARCRecord(header=wh, payload=resource.file))
     finally:
         # Close the dump file even when writing a record fails, so the
         # file handle is never leaked.
         wf.close()
     warcsize = os.path.getsize(dumpfile)
     # Single-argument call form prints the same text on Python 2 and 3.
     print("Wrote WARC file dump %s with size %d bytes" % (dumpfile, warcsize))

示例#7

显示文件

文件： dump.py 项目： pombredanne/sync-oai

 def write_warc(self, inventory=None, dumpfile=None):
     """Write a WARC dump file.

     One WARC record is written per resource in *inventory*, then the
     resulting file size is printed to standard output.

     Arguments:
         inventory: iterable of resource objects; each one must provide
             ``uri``, ``lastmod`` and ``file`` attributes.
         dumpfile: path of the WARC file to create.

     Raises:
         DumpError: if the ``warc`` library cannot be imported.
     """
     # Load library late as we want to be able to run rest of code
     # without this installed
     try:
         from warc import WARCFile, WARCHeader, WARCRecord
     except ImportError:
         # Catch only the import failure; a bare except would also
         # swallow KeyboardInterrupt and SystemExit.
         raise DumpError("Failed to load WARC library")
     wf = WARCFile(dumpfile, mode="w", compress=self.compress)
     try:
         # Add all files in the inventory
         for resource in inventory:
             wh = WARCHeader({})
             wh.url = resource.uri
             wh.ip_address = None
             wh.date = resource.lastmod
             wh.content_type = 'text/plain'
             wh.result_code = 200
             # NOTE(review): checksum and location are fixed placeholder
             # values, not derived from the resource.
             wh.checksum = 'aabbcc'
             wh.location = 'loc'
             wf.write_record(WARCRecord(header=wh, payload=resource.file))
     finally:
         # Close the dump file even when writing a record fails, so the
         # file handle is never leaked.
         wf.close()
     warcsize = os.path.getsize(dumpfile)
     # Single-argument call form prints the same text on Python 2 and 3.
     print("Wrote WARC file dump %s with size %d bytes" % (dumpfile, warcsize))