示例#1
0
文件: dump.py 项目: EHRI/resync
    def write_warc(self, resources=None, dumpfile=None):
        """Write a WARC dump file.

        WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
        in this library for experimentation.

        One record is written for each entry in ``resources``. The record URL
        and date are taken from ``resource.uri`` and ``resource.lastmod``, the
        location from ``self.archive_path(resource.path)`` and the payload is
        the value of ``resource.path``. The content type, result code and
        checksum headers are hard-coded values.

        Arguments:
            resources: iterable of objects providing ``uri``, ``lastmod`` and
                ``path`` attributes.
            dumpfile: name of the WARC file to write.

        Raises:
            DumpError: if the ``warc`` library cannot be imported.
        """
        # Load library late as we want to be able to run rest of code
        # without this installed
        try:
            from warc import WARCFile, WARCHeader, WARCRecord
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt and SystemExit are not turned into DumpError
            raise DumpError("Failed to load WARC library")
        wf = WARCFile(dumpfile, mode="w", compress=self.compress)
        try:
            # Add all files in the resources
            for resource in resources:
                wh = WARCHeader({})
                wh.url = resource.uri
                wh.ip_address = None
                wh.date = resource.lastmod
                wh.content_type = 'text/plain'
                wh.result_code = 200
                wh.checksum = 'aabbcc'
                wh.location = self.archive_path(resource.path)
                wf.write_record(WARCRecord(header=wh, payload=resource.path))
        finally:
            # Always release the file handle, even if a record fails
            wf.close()
        warcsize = os.path.getsize(dumpfile)
        self.logging.info("Wrote WARC file dump %s with size %d bytes" %
                          (dumpfile, warcsize))
示例#2
0
    def parse_warc_content(buf):
        """Yield ``(url, body)`` for the first WARC record held in ``buf``.

        Only the first record is read. Nothing is yielded unless the first
        15 characters of its payload start with 'HTTP/' and end with
        '200 OK', and the payload contains a blank line (CRLF CRLF). The
        body is everything after that blank line.
        """
        warc_stream = WARCFile(fileobj=StringIO(buf))
        first_record = warc_stream.read_record()
        data = first_record.payload.read()
        status_line = data[:15]

        if not (status_line.startswith('HTTP/') and status_line.endswith('200 OK')):
            return

        header_end = data.find('\r\n\r\n')
        if header_end == -1:
            return

        # Skip the four separator characters to reach the start of the body
        yield first_record.url, data[header_end + 4:]
示例#3
0
 def read_warc(self, key):
     """Yield the 'response' records of a WARC object in aws-publicdatasets.

     The object ``s3://aws-publicdatasets/<key>`` is opened through
     ``self.s3`` and handed to WARCFile with ``compress='gzip'``. For every
     record whose ``type`` is 'response', the 'match' counter for this class
     is incremented by one and the record is yielded.
     """
     counter_group = self.__class__.__name__
     with self.s3.open('s3://aws-publicdatasets/{key}'.format(key=key), 'rb') as stream:
         archive = WARCFile(fileobj=stream, compress='gzip')
         for rec in archive.reader:
             # Skip everything that is not a response record
             if rec.type != 'response':
                 continue
             self.increment_counter(counter_group, 'match', 1)
             yield rec
示例#4
0
 def read_warc(self, key):
     """Yield every record of a gzipped WARC object in the commoncrawl bucket.

     The object ``s3://commoncrawl/<key>`` is opened through ``self.s3``,
     wrapped in a ``gzip.GzipFile`` and iterated as a WARCFile. The 'match'
     counter for this class is incremented by one for each record before it
     is yielded.
     """
     counter_group = self.__class__.__name__
     with self.s3.open('s3://commoncrawl/{key}'.format(key=key), 'rb') as raw:
         gz_stream = gzip.GzipFile(fileobj=raw)
         archive = WARCFile(fileobj=gz_stream)
         # No filtering on record.type: all records are counted and yielded
         for rec in archive:
             self.increment_counter(counter_group, 'match', 1)
             yield rec
示例#5
0
    def write_warc(self, resources=None, dumpfile=None):
        """Write a WARC dump file.

        WARC support is not part of ResourceSync v1.0 (Z39.99 2014) but is left
        in this library for experimentation.

        One record is written for each entry in ``resources``. The record URL
        and date are taken from ``resource.uri`` and ``resource.lastmod``, the
        location from ``self.archive_path(resource.path)`` and the payload is
        the value of ``resource.path``. The content type, result code and
        checksum headers are hard-coded values.

        Arguments:
            resources: iterable of objects providing ``uri``, ``lastmod`` and
                ``path`` attributes.
            dumpfile: name of the WARC file to write.

        Raises:
            DumpError: if the ``warc`` library cannot be imported.
        """
        # Load library late as we want to be able to run rest of code
        # without this installed
        try:
            from warc import WARCFile, WARCHeader, WARCRecord
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt and SystemExit are not turned into DumpError
            raise DumpError("Failed to load WARC library")
        wf = WARCFile(dumpfile, mode="w", compress=self.compress)
        try:
            # Add all files in the resources
            for resource in resources:
                wh = WARCHeader({})
                wh.url = resource.uri
                wh.ip_address = None
                wh.date = resource.lastmod
                wh.content_type = 'text/plain'
                wh.result_code = 200
                wh.checksum = 'aabbcc'
                wh.location = self.archive_path(resource.path)
                wf.write_record(WARCRecord(header=wh, payload=resource.path))
        finally:
            # Always release the file handle, even if a record fails
            wf.close()
        warcsize = os.path.getsize(dumpfile)
        self.logging.info("Wrote WARC file dump %s with size %d bytes" %
                          (dumpfile, warcsize))
示例#6
0
文件: dump.py 项目: pedak/resdbp
 def write_warc(self, inventory=None, dumpfile=None):
     """Write a WARC dump file.

     One record is written for each entry in ``inventory``. The record URL
     and date are taken from ``resource.uri`` and ``resource.lastmod`` and
     the payload is the value of ``resource.file``. The content type, result
     code, checksum and location headers are hard-coded values. A summary
     line with the resulting file size is printed when done.

     Arguments:
         inventory: iterable of objects providing ``uri``, ``lastmod`` and
             ``file`` attributes.
         dumpfile: name of the WARC file to write.

     Raises:
         DumpError: if the ``warc`` library cannot be imported.
     """
     # Load library late as we want to be able to run rest of code
     # without this installed
     try:
         from warc import WARCFile, WARCHeader, WARCRecord
     except Exception:
         # Catch Exception rather than using a bare except so that
         # KeyboardInterrupt and SystemExit are not turned into DumpError
         raise DumpError("Failed to load WARC library")
     wf = WARCFile(dumpfile, mode="w", compress=self.compress)
     try:
         # Add all files in the inventory
         for resource in inventory:
             wh = WARCHeader({})
             wh.url = resource.uri
             wh.ip_address = None
             wh.date = resource.lastmod
             wh.content_type = 'text/plain'
             wh.result_code = 200
             wh.checksum = 'aabbcc'
             wh.location = 'loc'
             wf.write_record(WARCRecord(header=wh, payload=resource.file))
     finally:
         # Always release the file handle, even if a record fails
         wf.close()
     warcsize = os.path.getsize(dumpfile)
     # Single parenthesised argument: same output under the Python 2 print
     # statement and the Python 3 print function
     print("Wrote WARC file dump %s with size %d bytes" % (dumpfile, warcsize))
示例#7
0
 def write_warc(self, inventory=None, dumpfile=None):
     """Write a WARC dump file.

     One record is written for each entry in ``inventory``. The record URL
     and date are taken from ``resource.uri`` and ``resource.lastmod`` and
     the payload is the value of ``resource.file``. The content type, result
     code, checksum and location headers are hard-coded values. A summary
     line with the resulting file size is printed when done.

     Arguments:
         inventory: iterable of objects providing ``uri``, ``lastmod`` and
             ``file`` attributes.
         dumpfile: name of the WARC file to write.

     Raises:
         DumpError: if the ``warc`` library cannot be imported.
     """
     # Load library late as we want to be able to run rest of code
     # without this installed
     try:
         from warc import WARCFile, WARCHeader, WARCRecord
     except Exception:
         # Catch Exception rather than using a bare except so that
         # KeyboardInterrupt and SystemExit are not turned into DumpError
         raise DumpError("Failed to load WARC library")
     wf = WARCFile(dumpfile, mode="w", compress=self.compress)
     try:
         # Add all files in the inventory
         for resource in inventory:
             wh = WARCHeader({})
             wh.url = resource.uri
             wh.ip_address = None
             wh.date = resource.lastmod
             wh.content_type = 'text/plain'
             wh.result_code = 200
             wh.checksum = 'aabbcc'
             wh.location = 'loc'
             wf.write_record(WARCRecord(header=wh, payload=resource.file))
     finally:
         # Always release the file handle, even if a record fails
         wf.close()
     warcsize = os.path.getsize(dumpfile)
     # Single parenthesised argument: same output under the Python 2 print
     # statement and the Python 3 print function
     print("Wrote WARC file dump %s with size %d bytes" % (dumpfile, warcsize))