Exemplo n.º 1
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.

        Every file matching ``<path prefix>*.info.json`` is written as its
        own ``metadata`` record. If no file matches, a warning is logged and
        nothing is written.
        '''
        uri = 'metadata://{}{}'.format(self._url_item.url_info.authority,
                                       self._url_item.url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(__(
                _('Could not find external process metadata file: {filename}'),
                filename=glob_pattern
            ))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields('metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # The context manager closes the file even when writing the
            # record raises, so the handle is never leaked.
            with open(filename, 'rb') as in_file:
                record.block_file = in_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Exemplo n.º 2
0
    def _add_warc_snapshot(self, filename, content_type, url):
        '''Write the snapshot file to the WARC as a ``resource`` record.'''
        _logger.debug('Adding snapshot record.')

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        quoted_url = wpull.url.quote(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(quoted_url)

        # The file must stay open until the record has been written.
        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file
            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Exemplo n.º 3
0
    def _start_new_warc_file(self, meta=False):
        '''Pick the current WARC filename and write its warcinfo record.

        Args:
            meta: When true and a maximum size is configured, the ``-meta``
                suffix is used instead of the sequence number.
        '''
        if self._params.max_size is None:
            suffix = ''
        else:
            suffix = '-meta' if meta else '-{0:05d}'.format(self._sequence_num)

        file_ext = 'warc.gz' if self._params.compress else 'warc'

        self._warc_filename = '{0}{1}.{2}'.format(
            self._prefix_filename, suffix, file_ext)

        _logger.debug(__('WARC file at {0}', self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        # Every WARC file starts with its own warcinfo record.
        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Exemplo n.º 4
0
 def pre_request(self, request):
     '''Remember the request and prepare its WARC request record.'''
     self._request = request

     request_record = WARCRecord()
     self._request_record = request_record
     request_record.set_common_fields(
         WARCRecord.REQUEST, WARCRecord.TYPE_REQUEST)

     target_url = request.url_info.url
     request_record.fields['WARC-Target-URI'] = target_url
     ip_address = request.address[0]
     request_record.fields['WARC-IP-Address'] = ip_address

     # The record's block data is collected in a fresh temporary file.
     request_record.block_file = self._new_temp_file()
Exemplo n.º 5
0
 def pre_response(self, response):
     '''Prepare the WARC response record linked to the current request.'''
     response_record = WARCRecord()
     self._response_record = response_record
     response_record.set_common_fields(
         WARCRecord.RESPONSE, WARCRecord.TYPE_RESPONSE)

     current_request = self._request
     response_record.fields['WARC-Target-URI'] = current_request.url_info.url
     response_record.fields['WARC-IP-Address'] = current_request.address[0]

     # Tie the response to the request record written for the same exchange.
     request_record_id = self._request_record.fields[
         WARCRecord.WARC_RECORD_ID]
     response_record.fields['WARC-Concurrent-To'] = request_record_id

     response_record.block_file = self._response_temp_file
Exemplo n.º 6
0
    def _add_warc_snapshot(self, filename, url):
        '''Add the snapshot to the WARC file.

        The content type is chosen from the file extension; an extension
        that is not in the table raises ``KeyError``.
        '''
        _logger.debug('Adding snapshot record.')

        mime_types_by_extension = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif',
        }
        _, file_extension = os.path.splitext(filename)
        content_type = mime_types_by_extension[file_extension]

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        encoded_url = wpull.url.percent_encode_query_value(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)

        # Link the snapshot to the action log record when one exists.
        action_record = self._action_warc_record
        if action_record:
            snapshot_record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file
            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Exemplo n.º 7
0
    def __init__(self, filename, params=None):
        '''Set up the recorder and start the first WARC file.

        Args:
            filename: Filename prefix stored for building the WARC
                filenames.
            params: Recorder parameters. When omitted (or falsy), a default
                ``WARCRecorderParams()`` is used.
        '''
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_record = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        # Read the resolved parameters, not the raw argument: ``params`` may
        # be None, in which case ``params.log`` raises AttributeError.
        if self._params.log:
            self._log_record = WARCRecord()
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()
Exemplo n.º 8
0
Arquivo: warc.py Projeto: asergi/wpull
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log temp file is present, its content is written to the WARC
        as a ``resource`` record with target URI ``urn:X-wpull:log``, the
        temp file is removed, and the logging handler is detached from the
        root logger and closed. If a maximum size is configured, the log
        record goes into a new WARC file started with ``meta=True``. When
        ``move_to`` is configured, finished files are passed to
        ``_move_file_to_dest_dir``.
        '''
        if self._log_temp_file:
            # Flush pending output and detach the handler from the root
            # logger before closing its stream.
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            # The temp file is read back through gzip; presumably the log
            # setup (not visible here) wrote it gzip-compressed.
            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name
            )
            log_record.set_common_fields('resource', 'text/plain')

            log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            # With size-limited WARCs, move the current file away (if
            # configured) and start a separate file for the log record.
            if self._params.max_size is not None:
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(log_record)
            self.write_record(log_record)

            log_record.block_file.close()

            # A failure to delete the temp file is logged, not raised.
            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            # NOTE(review): this move sits inside the log branch, so the
            # WARC file is not moved here when no log temp file exists.
            # Confirm that is intended.
            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Exemplo n.º 9
0
    def pre_request(self, request):
        '''Remember the request and prepare its WARC request record.

        Asserts that the request address is an IP address rather than a
        hostname.
        '''
        ip_address = request.address[0]
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'
        assert re.match(ip_pattern, ip_address), \
            'IP address needed, got {}'.format(ip_address)

        self._request = request

        request_record = WARCRecord()
        self._request_record = request_record
        request_record.set_common_fields(
            WARCRecord.REQUEST, WARCRecord.TYPE_REQUEST)
        request_record.fields['WARC-Target-URI'] = request.url_info.url
        request_record.fields['WARC-IP-Address'] = request.address[0]

        # The record's block data is collected in a fresh temporary file.
        request_record.block_file = self._new_temp_file(hint='warcsesreq')
Exemplo n.º 10
0
    def _add_warc_snapshot(self, filename, url):
        '''Add the snapshot to the WARC file.

        The content type is chosen from the file extension; an extension
        that is not in the table raises ``KeyError``.
        '''
        _logger.debug('Adding snapshot record.')

        mime_types_by_extension = {
            '.pdf': 'application/pdf',
            '.html': 'text/html',
            '.png': 'image/png',
            '.gif': 'image/gif',
        }
        _, file_extension = os.path.splitext(filename)
        content_type = mime_types_by_extension[file_extension]

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        encoded_url = wpull.url.percent_encode_query_value(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)

        # Link the snapshot to the action log record when one exists.
        action_record = self._action_warc_record
        if action_record:
            snapshot_record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file
            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Exemplo n.º 11
0
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.

        Every file matching ``<path prefix>*.info.json`` is written as its
        own ``metadata`` record. If no file matches, a warning is logged and
        nothing is written.
        '''
        uri = 'metadata://{}{}'.format(self._url_item.url_info.authority,
                                       self._url_item.url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        filenames = glob.glob(glob_pattern)

        if not filenames:
            _logger.warning(
                __(_(
                    'Could not find external process metadata file: {filename}'
                ),
                   filename=glob_pattern))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields(
                'metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri

            # The context manager closes the file even when writing the
            # record raises, so the handle is never leaked.
            with open(filename, 'rb') as in_file:
                record.block_file = in_file

                self._warc_recorder.set_length_and_maybe_checksums(record)
                self._warc_recorder.write_record(record)
Exemplo n.º 12
0
    def _start_new_warc_file(self, meta=False):
        '''Create and set as current WARC file.

        When appending to size-limited WARCs (and not writing the meta
        file), sequence numbers whose file already exists are skipped.
        '''
        skip_existing = (
            self._params.max_size and not meta and self._params.appending)

        if skip_existing:
            self._warc_filename = self._generate_warc_filename()

            # Advance the sequence number until an unused filename is found.
            while os.path.exists(self._warc_filename):
                _logger.debug(__('Skip {0}', self._warc_filename))
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()
        else:
            self._warc_filename = self._generate_warc_filename(meta=meta)

        _logger.debug(__('WARC file at {0}', self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        # Every WARC file starts with its own warcinfo record.
        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Exemplo n.º 13
0
    def pre_response(self, response):
        '''Log the data connection and prepare the response record.'''
        data_host, data_port = response.data_address
        event_text = 'Opened data connection to {hostname}:{port}'.format(
            hostname=data_host, port=data_port)
        self._write_control_event(event_text)

        response_record = WARCRecord()
        self._response_record = response_record
        response_record.set_common_fields(
            'resource', 'application/octet-stream')

        current_request = self._request
        response_record.fields['WARC-Target-URI'] = \
            current_request.url_info.url
        response_record.fields['WARC-IP-Address'] = current_request.address[0]

        # Tie the downloaded data to the control conversation record.
        control_record_id = self._control_record.fields[
            WARCRecord.WARC_RECORD_ID]
        response_record.fields['WARC-Concurrent-To'] = control_record_id

        response_record.block_file = self._new_temp_file('warcresp')
Exemplo n.º 14
0
    def pre_response(self, response):
        '''Prepare the WARC response record linked to the current request.

        Asserts that the stored request address is an IP address rather
        than a hostname.
        '''
        ip_address = self._request.address[0]
        ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-f0-9:.]+)$'
        assert re.match(ip_pattern, ip_address), \
            'IP address needed, got {}'.format(ip_address)

        response_record = WARCRecord()
        self._response_record = response_record
        response_record.set_common_fields(
            WARCRecord.RESPONSE, WARCRecord.TYPE_RESPONSE)
        response_record.fields['WARC-Target-URI'] = self._request.url_info.url
        response_record.fields['WARC-IP-Address'] = self._request.address[0]

        # Tie the response to the request record of the same exchange.
        request_record_id = self._request_record.fields[
            WARCRecord.WARC_RECORD_ID]
        response_record.fields['WARC-Concurrent-To'] = request_record_id

        response_record.block_file = self._response_temp_file
Exemplo n.º 15
0
    def __init__(self, filename, compress=True, extra_fields=None,
                 temp_dir=None, log=True, appending=False, digests=True,
                 cdx_filename=None):
        '''Store the settings and write the initial warcinfo record.'''
        # Plain configuration values.
        self._filename = filename
        self._cdx_filename = cdx_filename
        self._temp_dir = temp_dir
        self._gzip_enabled = compress
        self._digests_enabled = digests

        # Record and logging state.
        self._warcinfo_record = WARCRecord()
        self._log_record = None
        self._log_handler = None

        if not appending:
            self._truncate_existing_file()

        self._populate_warcinfo(extra_fields)

        if log:
            self._log_record = WARCRecord()
            self._setup_log()

        if self._cdx_filename:
            self._write_cdx_header()

        # The warcinfo record is the first record written to the file.
        self.write_record(self._warcinfo_record)
Exemplo n.º 16
0
    def _start_new_warc_file(self, meta=False):
        '''Pick the current WARC filename and write its warcinfo record.

        Args:
            meta: When true and a maximum size is configured, the ``-meta``
                suffix is used instead of the sequence number.
        '''
        if self._params.max_size is None:
            suffix = ''
        else:
            suffix = '-meta' if meta else '-{0:05d}'.format(self._sequence_num)

        file_ext = 'warc.gz' if self._params.compress else 'warc'

        self._warc_filename = '{0}{1}.{2}'.format(
            self._prefix_filename, suffix, file_ext)

        debug_message = 'WARC file at {0}'.format(self._warc_filename)
        _logger.debug(debug_message)

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        # Every WARC file starts with its own warcinfo record.
        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Exemplo n.º 17
0
    def _add_warc_action_log(self, url):
        '''Add the action log to the WARC file.

        The actions held in ``self._actions`` are serialised as JSON and
        written as a ``metadata`` record, which is kept in
        ``self._action_warc_record``.
        '''
        _logger.debug('Adding action log record.')

        document = {'actions': self._actions}
        serialized = json.dumps(document, indent=4)
        log_data = serialized.encode('utf-8')

        action_record = WARCRecord()
        self._action_warc_record = action_record
        action_record.set_common_fields('metadata', 'application/json')

        quoted_url = wpull.url.quote(url)
        action_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(quoted_url)
        action_record.block_file = io.BytesIO(log_data)

        recorder = self._warc_recorder
        recorder.set_length_and_maybe_checksums(action_record)
        recorder.write_record(action_record)
Exemplo n.º 18
0
    def __init__(self, filename, params=None):
        '''Set up the recorder and start the first WARC file.

        Args:
            filename: Filename prefix stored for building the WARC
                filenames.
            params: Recorder parameters. When omitted (or falsy), a default
                ``WARCRecorderParams()`` is used.
        '''
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_record = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        # Read the resolved parameters, not the raw argument: ``params`` may
        # be None, in which case ``params.log`` raises AttributeError.
        if self._params.log:
            self._log_record = WARCRecord()
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()
Exemplo n.º 19
0
    def begin_control(self, request, connection_reused=False):
        '''Start the FTP control conversation record for a request.

        Args:
            request: The request whose URL and address are recorded.
            connection_reused: Selects the "Reusing" instead of the
                "Opening" wording for the first control event.
        '''
        self._request = request

        control_record = WARCRecord()
        self._control_record = control_record
        control_record.set_common_fields(
            'metadata', 'text/x-ftp-control-conversation')
        control_record.fields['WARC-Target-URI'] = request.url_info.url
        control_record.fields['WARC-IP-Address'] = request.address[0]
        control_record.block_file = self._new_temp_file('warcctrl')

        hostname, port = self._request_hostname_port()

        event_template = (
            'Reusing control connection to {hostname}:{port}'
            if connection_reused else
            'Opening control connection to {hostname}:{port}'
        )
        event_text = event_template.format(hostname=hostname, port=port)
        self._write_control_event(event_text)
Exemplo n.º 20
0
    def _add_warc_snapshot(self, filename, content_type, url):
        '''Add the snapshot to the WARC file as a ``resource`` record.'''
        _logger.debug('Adding snapshot record.')

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        quoted_url = wpull.url.quote(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(quoted_url)

        # Link the snapshot to the action log record when one exists.
        action_record = self._action_warc_record
        if action_record:
            snapshot_record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file
            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Exemplo n.º 21
0
Arquivo: warc.py Projeto: asergi/wpull
    def _start_new_warc_file(self, meta=False):
        '''Create and set as current WARC file.

        When appending to size-limited WARCs (and not writing the meta
        file), sequence numbers whose file already exists are skipped.
        '''
        skip_existing = (
            self._params.max_size and not meta and self._params.appending)

        if skip_existing:
            self._warc_filename = self._generate_warc_filename()

            # Advance the sequence number until an unused filename is found.
            while os.path.exists(self._warc_filename):
                _logger.debug(__('Skip {0}', self._warc_filename))
                self._sequence_num += 1
                self._warc_filename = self._generate_warc_filename()
        else:
            self._warc_filename = self._generate_warc_filename(meta=meta)

        _logger.debug(__('WARC file at {0}', self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        # Every WARC file starts with its own warcinfo record.
        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Exemplo n.º 22
0
    def _add_warc_snapshot(self, filename, content_type, url):
        '''Add the snapshot to the WARC file as a ``resource`` record.'''
        _logger.debug('Adding snapshot record.')

        snapshot_record = WARCRecord()
        snapshot_record.set_common_fields('resource', content_type)

        quoted_url = wpull.url.quote(url)
        snapshot_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(quoted_url)

        # Link the snapshot to the action log record when one exists.
        action_record = self._action_warc_record
        if action_record:
            snapshot_record.fields['WARC-Concurrent-To'] = \
                action_record.fields[WARCRecord.WARC_RECORD_ID]

        with open(filename, 'rb') as snapshot_file:
            snapshot_record.block_file = snapshot_file
            recorder = self._warc_recorder
            recorder.set_length_and_maybe_checksums(snapshot_record)
            recorder.write_record(snapshot_record)
Exemplo n.º 23
0
    def _add_warc_action_log(self, path, url):
        '''Add the action log to the WARC file.

        Each line of the file at ``path`` is parsed as one JSON document.
        The collected actions are written as a JSON ``metadata`` record,
        which is kept in ``self._action_warc_record``.
        '''
        _logger.debug('Adding action log record.')

        with open(path, 'r', encoding='utf-8', errors='replace') as file:
            actions = [json.loads(line) for line in file]

        document = {'actions': actions}
        log_data = json.dumps(document, indent=4).encode('utf-8')

        action_record = WARCRecord()
        self._action_warc_record = action_record
        action_record.set_common_fields('metadata', 'application/json')

        encoded_url = wpull.url.percent_encode_query_value(url)
        action_record.fields['WARC-Target-URI'] = \
            'urn:X-wpull:snapshot?url={0}'.format(encoded_url)
        action_record.block_file = io.BytesIO(log_data)

        recorder = self._warc_recorder
        recorder.set_length_and_maybe_checksums(action_record)
        recorder.write_record(action_record)
Exemplo n.º 24
0
    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When a log temp file is present, its content is written to the WARC
        as a ``resource`` record with target URI ``urn:X-wpull:log``, the
        temp file is removed, and the logging handler is detached from the
        root logger and closed. If a maximum size is configured, the log
        record goes into a new WARC file started with ``meta=True``. When
        ``move_to`` is configured, finished files are passed to
        ``_move_file_to_dest_dir``.
        '''
        if self._log_temp_file:
            # Flush pending output and detach the handler from the root
            # logger before closing its stream.
            self._log_handler.flush()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler.stream.close()

            # The temp file is read back through gzip; presumably the log
            # setup (not visible here) wrote it gzip-compressed.
            log_record = WARCRecord()
            log_record.block_file = gzip.GzipFile(
                filename=self._log_temp_file.name)
            log_record.set_common_fields('resource', 'text/plain')

            log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            # With size-limited WARCs, move the current file away (if
            # configured) and start a separate file for the log record.
            if self._params.max_size is not None:
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(log_record)
            self.write_record(log_record)

            log_record.block_file.close()

            # A failure to delete the temp file is logged, not raised.
            try:
                os.remove(self._log_temp_file.name)
            except OSError:
                _logger.exception('Could not close log temp file.')

            self._log_temp_file = None

            self._log_handler.close()
            self._log_handler = None

            # NOTE(review): this move sits inside the log branch, so the
            # WARC file is not moved here when no log temp file exists.
            # Confirm that is intended.
            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

        if self._cdx_filename and self._params.move_to is not None:
            self._move_file_to_dest_dir(self._cdx_filename)
Exemplo n.º 25
0
class WARCRecorder(BaseRecorder):
    '''Record to WARC file.

    Args:
        filename (str): The filename (including the extension).
        compress (bool): If True, files will be compressed with gzip
        extra_fields (list): A list of key-value pairs containing extra
            metadata fields
        temp_dir (str): Directory to use for temporary files
        log (bool): Include the program logging messages in the WARC file
        appending (bool): If True, the file is not overwritten upon opening
        digests (bool): If True, the SHA1 hash digests will be written.
        cdx_filename (str): If given, a CDX file will be written.
    '''
    # Separator placed between the columns of a CDX line.
    CDX_DELIMINATOR = ' '

    def __init__(self, filename, compress=True, extra_fields=None,
                 temp_dir=None, log=True, appending=False, digests=True,
                 cdx_filename=None):
        self._filename = filename
        self._gzip_enabled = compress
        self._temp_dir = temp_dir
        self._warcinfo_record = WARCRecord()
        self._log_record = None
        self._log_handler = None
        self._digests_enabled = digests
        self._cdx_filename = cdx_filename

        if not appending:
            self._truncate_existing_file()

        self._populate_warcinfo(extra_fields)

        if log:
            self._log_record = WARCRecord()
            self._setup_log()

        if self._cdx_filename:
            self._write_cdx_header()

        # The warcinfo record is the first record written to the file.
        self.write_record(self._warcinfo_record)

    def _truncate_existing_file(self):
        '''Truncate existing WARC and CDX file if it exists.'''
        # Opening in 'wb' mode empties the file; nothing is written.
        if os.path.exists(self._filename):
            with open(self._filename, 'wb'):
                pass

        if self._cdx_filename and os.path.exists(self._cdx_filename):
            with open(self._cdx_filename, 'wb'):
                pass

    def _populate_warcinfo(self, extra_fields=None):
        '''Add the metadata to the Warcinfo record.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs that
                are appended to the built-in info fields.
        '''
        self._warcinfo_record.set_common_fields(
            WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        info_fields = NameValueRecord()
        info_fields['Software'] = 'Wpull/{0} Python/{1}'.format(
            wpull.version.__version__, wpull.util.python_version())
        info_fields['format'] = 'WARC File Format 1.0'
        info_fields['conformsTo'] = \
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

        if extra_fields:
            for name, value in extra_fields:
                info_fields.add(name, value)

        self._warcinfo_record.block_file = io.BytesIO(
            bytes(info_fields) + b'\r\n')
        self._warcinfo_record.compute_checksum()

    def _setup_log(self):
        '''Set up the logging file.

        A temporary file becomes the block file of the log record, and a
        ``logging.FileHandler`` writing to that file is attached to the
        root logger.
        '''
        logger = logging.getLogger()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self._log_record.block_file = NamedTemporaryFile(
            prefix='wpull-warc-',
            dir=self._temp_dir,
            suffix='.log',
        )
        self._log_handler = handler = logging.FileHandler(
            self._log_record.block_file.name, encoding='utf-8')

        logger.setLevel(logging.DEBUG)
        logger.debug('Wpull needs the root logger level set to DEBUG.')

        handler.setFormatter(formatter)
        logger.addHandler(handler)
        handler.setLevel(logging.INFO)

    @contextlib.contextmanager
    def session(self):
        '''Yield a ``WARCRecorderSession`` bound to this recorder.'''
        recorder_session = WARCRecorderSession(self, self._temp_dir)
        yield recorder_session

    def set_length_and_maybe_checksums(self, record, payload_offset=None):
        '''Set the content length and possibly the checksums.'''
        if self._digests_enabled:
            record.compute_checksum(payload_offset)
        else:
            record.set_content_length()

    def write_record(self, record):
        '''Append the record to the WARC file.

        If writing fails with ``OSError``/``IOError``, the file is cut back
        to the length it had before the write and the error is re-raised.
        When a CDX file is configured, the record is also passed to
        ``_write_cdx_field``.
        '''
        # FIXME: probably not a good idea to modifiy arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug('Writing WARC record {0}.'.format(
            record.fields['WARC-Type']))

        if self._gzip_enabled:
            open_func = wpull.backport.gzip.GzipFile
        else:
            open_func = open

        if os.path.exists(self._filename):
            before_offset = os.path.getsize(self._filename)
        else:
            before_offset = 0

        try:
            with open_func(self._filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError):
            _logger.info(
                _('Rolling back file {filename} to length {length}.')\
                .format(filename=self._filename, length=before_offset)
            )
            # 'r+b' keeps the existing content. Mode 'wb' would empty the
            # file first, and truncate() would then pad it with NUL bytes,
            # destroying every record written earlier.
            if os.path.exists(self._filename):
                with open(self._filename, mode='r+b') as out_file:
                    out_file.truncate(before_offset)
            raise

        after_offset = os.path.getsize(self._filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(
                record, raw_file_record_size, raw_file_offset
            )

    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When logging was enabled, the collected log is written as a
        ``resource`` record with target URI ``urn:X-wpull:log``. Calling
        this method again afterwards does nothing.
        '''
        if self._log_record:
            self._log_handler.flush()
            self._log_handler.close()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler = None

            # Rewind so the record is built from the start of the log.
            self._log_record.block_file.seek(0)
            self._log_record.set_common_fields('resource', 'text/plain')

            self._log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            self.set_length_and_maybe_checksums(self._log_record)
            self.write_record(self._log_record)

            self._log_record.block_file.close()
            # Drop the record so a second close() does not touch the
            # handler that has already been cleared.
            self._log_record = None

    def _write_cdx_header(self):
        '''Write the CDX header.

        It writes the fields:

        1. a: original URL
        2. b: UNIX timestamp
        3. m: MIME Type from the HTTP Content-type
        4. s: response code
        5. k: new style checksum
        6. S: raw file record size
        7. V: offset in raw file
        8. g: filename of raw file
        9. u: record ID
        '''
        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            # The header line starts with the delimiter itself.
            out_file.write(self.CDX_DELIMINATOR)
            out_file.write(self.CDX_DELIMINATOR.join((
                'CDX',
                'a', 'b', 'm', 's',
                'k', 'S', 'V', 'g',
                'u'
            )))
            out_file.write('\n')

    def _write_cdx_field(self, record, raw_file_record_size, raw_file_offset):
        '''Write the CDX field if needed.

        Only ``response`` records whose content type is an HTTP response
        message produce a CDX line; all other records are ignored.
        '''
        is_http_response = (
            record.fields[WARCRecord.WARC_TYPE] == WARCRecord.RESPONSE
            and re.match(r'application/http; *msgtype *= *response',
                         record.fields[WARCRecord.CONTENT_TYPE])
        )

        if not is_http_response:
            return

        url = record.fields['WARC-Target-URI']

        _logger.debug('Writing CDX record {0}.'.format(url))

        http_header = record.get_http_header()

        if http_header:
            mime_type = self.parse_mimetype(
                http_header.fields.get('Content-Type', '')
            ) or '-'
            response_code = str(http_header.status_code)
        else:
            mime_type = '-'
            response_code = '-'

        timestamp = str(int(
            wpull.util.parse_iso8601_str(record.fields[WARCRecord.WARC_DATE])
        ))

        checksum = record.fields.get('WARC-Payload-Digest', '')

        # Only SHA-1 digests are written; anything else becomes '-'.
        if checksum.startswith('sha1:'):
            checksum = checksum.replace('sha1:', '', 1)
        else:
            checksum = '-'

        raw_file_record_size_str = str(raw_file_record_size)
        raw_file_offset_str = str(raw_file_offset)
        filename = os.path.basename(self._filename)
        record_id = record.fields[WARCRecord.WARC_RECORD_ID]
        fields_strs = (
            url,
            timestamp,
            mime_type,
            response_code,
            checksum,
            raw_file_record_size_str,
            raw_file_offset_str,
            filename,
            record_id
        )

        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR.join(fields_strs))
            out_file.write('\n')

    @classmethod
    def parse_mimetype(cls, value):
        '''Return the MIME type from a Content-Type string.

        Returns:
            str, None: A string in the form ``type/subtype`` or None.
        '''
        match = re.match(r'([a-zA-Z0-9-]+/[a-zA-Z0-9-]+)', value)

        if match:
            return match.group(1)

        return None
Exemplo n.º 26
0
class WARCRecorder(BaseRecorder):
    '''Record to WARC file.

    Args:
        filename (str): The filename (without the extension).
        params (:class:`WARCRecorderParams`): Parameters. When omitted
            (or falsy), a default :class:`WARCRecorderParams` is used.
    '''
    CDX_DELIMINATOR = ' '
    '''Default CDX delimiter.'''
    DEFAULT_SOFTWARE_STRING = 'Wpull/{0} Python/{1}'.format(
        wpull.version.__version__, wpull.util.python_version()
    )
    '''Default software string.'''

    def __init__(self, filename, params=None):
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_record = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        # Use the resolved parameters here: ``params`` itself may be None.
        if self._params.log:
            self._log_record = WARCRecord()
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()

    def _start_new_warc_file(self, meta=False):
        '''Select the next WARC filename and write its warcinfo record.

        The filename is the prefix, an optional sequence part and the
        extension. The sequence part is empty when no maximum size is
        configured, ``-meta`` when ``meta`` is true, and a zero-padded
        five digit sequence number otherwise.

        Args:
            meta (bool): Whether the file is the ``-meta`` file.
        '''
        if self._params.max_size is None:
            sequence_name = ''
        elif meta:
            sequence_name = '-meta'
        else:
            sequence_name = '-{0:05d}'.format(self._sequence_num)

        if self._params.compress:
            extension = 'warc.gz'
        else:
            extension = 'warc'

        self._warc_filename = '{0}{1}.{2}'.format(
            self._prefix_filename, sequence_name, extension
        )

        _logger.debug(__('WARC file at {0}', self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)

    def _start_new_cdx_file(self):
        '''Set the CDX filename and write the CDX header when needed.

        The header is written when the file is freshly truncated (not
        appending) or when appending to a file that does not exist yet.
        '''
        self._cdx_filename = '{0}.cdx'.format(self._prefix_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._cdx_filename)
            self._write_cdx_header()
        elif not os.path.exists(self._cdx_filename):
            self._write_cdx_header()

    def _populate_warcinfo(self, extra_fields=None):
        '''Add the metadata to the Warcinfo record.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs
                appended to the warcinfo fields.
        '''
        self._warcinfo_record.set_common_fields(
            WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        info_fields = NameValueRecord()
        info_fields['Software'] = self._params.software_string \
            or self.DEFAULT_SOFTWARE_STRING
        info_fields['format'] = 'WARC File Format 1.0'
        info_fields['conformsTo'] = \
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

        if extra_fields:
            for name, value in extra_fields:
                info_fields.add(name, value)

        self._warcinfo_record.block_file = io.BytesIO(
            bytes(info_fields) + b'\r\n')
        self._warcinfo_record.compute_checksum()

    def _setup_log(self):
        '''Set up the logging file.

        A named temporary file becomes the block of the log record and a
        ``logging.FileHandler`` writing to that file (at INFO level) is
        attached to the root logger. The root logger level is set to
        DEBUG as a side effect.
        '''
        logger = logging.getLogger()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self._log_record.block_file = NamedTemporaryFile(
            prefix='wpull-warc-',
            dir=self._params.temp_dir,
            suffix='.log',
        )
        self._log_handler = handler = logging.FileHandler(
            self._log_record.block_file.name, encoding='utf-8')

        logger.setLevel(logging.DEBUG)
        logger.debug('Wpull needs the root logger level set to DEBUG.')

        handler.setFormatter(formatter)
        logger.addHandler(handler)
        handler.setLevel(logging.INFO)

    @contextlib.contextmanager
    def session(self):
        '''Yield a :class:`WARCRecorderSession` bound to this recorder.

        After the ``with`` body finishes without raising, the current
        WARC file is checked against the configured maximum size. If it
        is larger, the sequence number is advanced, the file is moved to
        the ``move_to`` directory (when configured) and a new WARC file
        is started.
        '''
        recorder_session = WARCRecorderSession(
            self,
            temp_dir=self._params.temp_dir, url_table=self._params.url_table
        )
        yield recorder_session

        if self._params.max_size is not None \
           and os.path.getsize(self._warc_filename) > self._params.max_size:
            self._sequence_num += 1

            if self._params.move_to is not None:
                self._move_file_to_dest_dir(self._warc_filename)

            _logger.debug('Starting new warc file due to max size.')
            self._start_new_warc_file()

    def _move_file_to_dest_dir(self, filename):
        '''Move the file to the ``move_to`` directory.

        If ``move_to`` is not an existing directory, an error is logged
        and the file is left in place.

        Args:
            filename (str): Path of the file to move.
        '''
        assert self._params.move_to

        if os.path.isdir(self._params.move_to):
            shutil.move(filename, self._params.move_to)
            # Log after the move succeeded, naming the file actually moved.
            _logger.debug('Moved %s to %s.', filename,
                          self._params.move_to)
        else:
            _logger.error('%s is not a directory; not moving %s.',
                          self._params.move_to, filename)

    def set_length_and_maybe_checksums(self, record, payload_offset=None):
        '''Set the content length and possibly the checksums.

        When digests are enabled in the parameters,
        ``record.compute_checksum(payload_offset)`` is called; otherwise
        only ``record.set_content_length()`` is called.
        '''
        if self._params.digests:
            record.compute_checksum(payload_offset)
        else:
            record.set_content_length()

    def write_record(self, record):
        '''Append the record to the WARC file.

        The record's ``WARC-Warcinfo-ID`` field is set to the ID of the
        current warcinfo record before writing. If CDX output is
        enabled, a CDX line is written using the file size before and
        after the write as offset and record size.

        Raises:
            OSError, IOError: If writing fails. Before re-raising, the
                WARC file is truncated back to the length it had before
                the write was attempted.
        '''
        # FIXME: probably not a good idea to modify arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug(__('Writing WARC record {0}.',
                         record.fields['WARC-Type']))

        if self._params.compress:
            open_func = wpull.backport.gzip.GzipFile
        else:
            open_func = open

        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError):
            _logger.info(__(
                _('Rolling back file {filename} to length {length}.'),
                filename=self._warc_filename, length=before_offset
            ))
            # Open for update ('r+b'): mode 'wb' would empty the file
            # first and destroy every record written before this one.
            if os.path.exists(self._warc_filename):
                with open(self._warc_filename, mode='r+b') as out_file:
                    out_file.truncate(before_offset)
            raise

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(
                record, raw_file_record_size, raw_file_offset
            )

    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When logging is enabled, the log handler is detached and the
        collected log is written as a ``resource`` record with target
        URI ``urn:X-wpull:log``; if a maximum size is configured, the
        record goes into a separate ``-meta`` WARC file. When ``move_to``
        is configured, the final WARC file and the CDX file (if any) are
        moved there, whether or not logging is enabled.
        '''
        if self._log_record:
            self._log_handler.flush()
            self._log_handler.close()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler = None

            self._log_record.block_file.seek(0)
            self._log_record.set_common_fields('resource', 'text/plain')

            self._log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            if self._params.max_size is not None:
                # Move the last data file before switching to the meta file.
                if self._params.move_to is not None:
                    self._move_file_to_dest_dir(self._warc_filename)

                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(self._log_record)
            self.write_record(self._log_record)

            self._log_record.block_file.close()

        if self._params.move_to is not None:
            # Done outside the log branch so the final WARC file is moved
            # even when logging is disabled.
            self._move_file_to_dest_dir(self._warc_filename)

            if self._cdx_filename:
                self._move_file_to_dest_dir(self._cdx_filename)

    def _write_cdx_header(self):
        '''Write the CDX header.

        It writes the fields:

        1. a: original URL
        2. b: UNIX timestamp
        3. m: MIME Type from the HTTP Content-type
        4. s: response code
        5. k: new style checksum
        6. S: raw file record size
        7. V: offset in raw file
        8. g: filename of raw file
        9. u: record ID
        '''
        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            # The header line starts with the delimiter itself.
            out_file.write(self.CDX_DELIMINATOR)
            out_file.write(self.CDX_DELIMINATOR.join((
                'CDX',
                'a', 'b', 'm', 's',
                'k', 'S', 'V', 'g',
                'u'
            )))
            out_file.write('\n')

    def _write_cdx_field(self, record, raw_file_record_size, raw_file_offset):
        '''Write the CDX field if needed.

        Only ``response`` records whose content type is an HTTP response
        message produce a CDX line; all other records are ignored.

        Args:
            record (:class:`WARCRecord`): The record that was written.
            raw_file_record_size (int): Size of the record in the file.
            raw_file_offset (int): Offset of the record in the file.
        '''
        if record.fields[WARCRecord.WARC_TYPE] != WARCRecord.RESPONSE \
           or not re.match(r'application/http; *msgtype *= *response',
                           record.fields[WARCRecord.CONTENT_TYPE]):
            return

        url = record.fields['WARC-Target-URI']

        _logger.debug(__('Writing CDX record {0}.', url))

        http_header = record.get_http_header()

        if http_header:
            mime_type = self.parse_mimetype(
                http_header.fields.get('Content-Type', '')
            ) or '-'
            response_code = str(http_header.status_code)
        else:
            mime_type = '-'
            response_code = '-'

        timestamp = str(int(
            wpull.util.parse_iso8601_str(record.fields[WARCRecord.WARC_DATE])
        ))

        checksum = record.fields.get('WARC-Payload-Digest', '')

        # Only SHA-1 payload digests are emitted; anything else is '-'.
        if checksum.startswith('sha1:'):
            checksum = checksum.replace('sha1:', '', 1)
        else:
            checksum = '-'

        raw_file_record_size_str = str(raw_file_record_size)
        raw_file_offset_str = str(raw_file_offset)
        filename = os.path.basename(self._warc_filename)
        record_id = record.fields[WARCRecord.WARC_RECORD_ID]
        fields_strs = (
            url,
            timestamp,
            mime_type,
            response_code,
            checksum,
            raw_file_record_size_str,
            raw_file_offset_str,
            filename,
            record_id
        )

        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR.join(fields_strs))
            out_file.write('\n')

    @classmethod
    def parse_mimetype(cls, value):
        '''Return the MIME type from a Content-Type string.

        Only the leading ``type/subtype`` token is returned; parameters
        such as ``; charset=utf-8`` are dropped. The accepted characters
        are the RFC 6838 "restricted-name" characters, so subtypes such
        as ``xhtml+xml`` or ``vnd.ms-excel`` are returned whole.

        Returns:
            str, None: A string in the form ``type/subtype`` or None.
        '''
        match = re.match(
            r'([a-zA-Z0-9!#$&^_.+-]+/[a-zA-Z0-9!#$&^_.+-]+)', value
        )

        if match:
            return match.group(1)

        return None
Exemplo n.º 27
0
class WARCRecorder(BaseRecorder):
    '''Record to WARC file.

    Args:
        filename (str): The filename (without the extension).
        params (:class:`WARCRecorderParams`): Parameters. When omitted
            (or falsy), a default :class:`WARCRecorderParams` is used.
    '''
    CDX_DELIMINATOR = ' '
    '''Default CDX delimiter.'''
    DEFAULT_SOFTWARE_STRING = 'Wpull/{0} Python/{1}'.format(
        wpull.version.__version__, wpull.util.python_version())
    '''Default software string.'''

    def __init__(self, filename, params=None):
        self._prefix_filename = filename
        self._params = params or WARCRecorderParams()
        self._warcinfo_record = None
        self._sequence_num = 0
        self._log_record = None
        self._log_handler = None
        self._warc_filename = None
        self._cdx_filename = None

        # Use the resolved parameters here: ``params`` itself may be None.
        if self._params.log:
            self._log_record = WARCRecord()
            self._setup_log()

        self._start_new_warc_file()

        if self._params.cdx:
            self._start_new_cdx_file()

    def _start_new_warc_file(self, meta=False):
        '''Select the next WARC filename and write its warcinfo record.

        The filename is the prefix, an optional sequence part and the
        extension. The sequence part is empty when no maximum size is
        configured, ``-meta`` when ``meta`` is true, and a zero-padded
        five digit sequence number otherwise.

        Args:
            meta (bool): Whether the file is the ``-meta`` file.
        '''
        if self._params.max_size is None:
            sequence_name = ''
        elif meta:
            sequence_name = '-meta'
        else:
            sequence_name = '-{0:05d}'.format(self._sequence_num)

        if self._params.compress:
            extension = 'warc.gz'
        else:
            extension = 'warc'

        self._warc_filename = '{0}{1}.{2}'.format(self._prefix_filename,
                                                  sequence_name, extension)

        _logger.debug('WARC file at {0}'.format(self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)

    def _start_new_cdx_file(self):
        '''Set the CDX filename and write the CDX header when needed.

        The header is written when the file is freshly truncated (not
        appending) or when appending to a file that does not exist yet.
        '''
        self._cdx_filename = '{0}.cdx'.format(self._prefix_filename)

        if not self._params.appending:
            wpull.util.truncate_file(self._cdx_filename)
            self._write_cdx_header()
        elif not os.path.exists(self._cdx_filename):
            self._write_cdx_header()

    def _populate_warcinfo(self, extra_fields=None):
        '''Add the metadata to the Warcinfo record.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs
                appended to the warcinfo fields.
        '''
        self._warcinfo_record.set_common_fields(WARCRecord.WARCINFO,
                                                WARCRecord.WARC_FIELDS)

        info_fields = NameValueRecord()
        info_fields['Software'] = self._params.software_string \
            or self.DEFAULT_SOFTWARE_STRING
        info_fields['format'] = 'WARC File Format 1.0'
        info_fields['conformsTo'] = \
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

        if extra_fields:
            for name, value in extra_fields:
                info_fields.add(name, value)

        self._warcinfo_record.block_file = io.BytesIO(
            bytes(info_fields) + b'\r\n')
        self._warcinfo_record.compute_checksum()

    def _setup_log(self):
        '''Set up the logging file.

        A named temporary file becomes the block of the log record and a
        ``logging.FileHandler`` writing to that file (at INFO level) is
        attached to the root logger. The root logger level is set to
        DEBUG as a side effect.
        '''
        logger = logging.getLogger()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self._log_record.block_file = NamedTemporaryFile(
            prefix='wpull-warc-',
            dir=self._params.temp_dir,
            suffix='.log',
        )
        self._log_handler = handler = logging.FileHandler(
            self._log_record.block_file.name, encoding='utf-8')

        logger.setLevel(logging.DEBUG)
        logger.debug('Wpull needs the root logger level set to DEBUG.')

        handler.setFormatter(formatter)
        logger.addHandler(handler)
        handler.setLevel(logging.INFO)

    @contextlib.contextmanager
    def session(self):
        '''Yield a :class:`WARCRecorderSession` bound to this recorder.

        After the ``with`` body finishes without raising, the current
        WARC file is checked against the configured maximum size. If it
        is larger, the sequence number is advanced and a new WARC file
        is started.
        '''
        recorder_session = WARCRecorderSession(
            self,
            temp_dir=self._params.temp_dir,
            url_table=self._params.url_table)
        yield recorder_session

        if self._params.max_size is not None \
           and os.path.getsize(self._warc_filename) > self._params.max_size:
            self._sequence_num += 1

            _logger.debug('Starting new warc file due to max size.')
            self._start_new_warc_file()

    def set_length_and_maybe_checksums(self, record, payload_offset=None):
        '''Set the content length and possibly the checksums.

        When digests are enabled in the parameters,
        ``record.compute_checksum(payload_offset)`` is called; otherwise
        only ``record.set_content_length()`` is called.
        '''
        if self._params.digests:
            record.compute_checksum(payload_offset)
        else:
            record.set_content_length()

    def write_record(self, record):
        '''Append the record to the WARC file.

        The record's ``WARC-Warcinfo-ID`` field is set to the ID of the
        current warcinfo record before writing. If CDX output is
        enabled, a CDX line is written using the file size before and
        after the write as offset and record size.

        Raises:
            OSError, IOError: If writing fails. Before re-raising, the
                WARC file is truncated back to the length it had before
                the write was attempted.
        '''
        # FIXME: probably not a good idea to modify arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug('Writing WARC record {0}.'.format(
            record.fields['WARC-Type']))

        if self._params.compress:
            open_func = wpull.backport.gzip.GzipFile
        else:
            open_func = open

        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError):
            _logger.info(
                _('Rolling back file {filename} to length {length}.')
                .format(filename=self._warc_filename, length=before_offset)
            )
            # Open for update ('r+b'): mode 'wb' would empty the file
            # first and destroy every record written before this one.
            if os.path.exists(self._warc_filename):
                with open(self._warc_filename, mode='r+b') as out_file:
                    out_file.truncate(before_offset)
            raise

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(record, raw_file_record_size,
                                  raw_file_offset)

    def close(self):
        '''Close the WARC file and clean up any logging handlers.

        When logging is enabled, the log handler is detached and the
        collected log is written as a ``resource`` record with target
        URI ``urn:X-wpull:log``; if a maximum size is configured, the
        record goes into a separate ``-meta`` WARC file.
        '''
        if self._log_record:
            self._log_handler.flush()
            self._log_handler.close()

            logger = logging.getLogger()
            logger.removeHandler(self._log_handler)
            self._log_handler = None

            self._log_record.block_file.seek(0)
            self._log_record.set_common_fields('resource', 'text/plain')

            self._log_record.fields['WARC-Target-URI'] = \
                'urn:X-wpull:log'

            if self._params.max_size is not None:
                self._start_new_warc_file(meta=True)

            self.set_length_and_maybe_checksums(self._log_record)
            self.write_record(self._log_record)

            self._log_record.block_file.close()

    def _write_cdx_header(self):
        '''Write the CDX header.

        It writes the fields:

        1. a: original URL
        2. b: UNIX timestamp
        3. m: MIME Type from the HTTP Content-type
        4. s: response code
        5. k: new style checksum
        6. S: raw file record size
        7. V: offset in raw file
        8. g: filename of raw file
        9. u: record ID
        '''
        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            # The header line starts with the delimiter itself.
            out_file.write(self.CDX_DELIMINATOR)
            out_file.write(
                self.CDX_DELIMINATOR.join(
                    ('CDX', 'a', 'b', 'm', 's', 'k', 'S', 'V', 'g', 'u')))
            out_file.write('\n')

    def _write_cdx_field(self, record, raw_file_record_size, raw_file_offset):
        '''Write the CDX field if needed.

        Only ``response`` records whose content type is an HTTP response
        message produce a CDX line; all other records are ignored.

        Args:
            record (:class:`WARCRecord`): The record that was written.
            raw_file_record_size (int): Size of the record in the file.
            raw_file_offset (int): Offset of the record in the file.
        '''
        if record.fields[WARCRecord.WARC_TYPE] != WARCRecord.RESPONSE \
           or not re.match(r'application/http; *msgtype *= *response',
                           record.fields[WARCRecord.CONTENT_TYPE]):
            return

        url = record.fields['WARC-Target-URI']

        _logger.debug('Writing CDX record {0}.'.format(url))

        http_header = record.get_http_header()

        if http_header:
            mime_type = self.parse_mimetype(
                http_header.fields.get('Content-Type', '')) or '-'
            response_code = str(http_header.status_code)
        else:
            mime_type = '-'
            response_code = '-'

        timestamp = str(int(
            wpull.util.parse_iso8601_str(record.fields[WARCRecord.WARC_DATE])
        ))

        checksum = record.fields.get('WARC-Payload-Digest', '')

        # Only SHA-1 payload digests are emitted; anything else is '-'.
        if checksum.startswith('sha1:'):
            checksum = checksum.replace('sha1:', '', 1)
        else:
            checksum = '-'

        raw_file_record_size_str = str(raw_file_record_size)
        raw_file_offset_str = str(raw_file_offset)
        filename = os.path.basename(self._warc_filename)
        record_id = record.fields[WARCRecord.WARC_RECORD_ID]
        fields_strs = (url, timestamp, mime_type, response_code, checksum,
                       raw_file_record_size_str, raw_file_offset_str, filename,
                       record_id)

        with open(self._cdx_filename, mode='a', encoding='utf-8') as out_file:
            out_file.write(self.CDX_DELIMINATOR.join(fields_strs))
            out_file.write('\n')

    @classmethod
    def parse_mimetype(cls, value):
        '''Return the MIME type from a Content-Type string.

        Only the leading ``type/subtype`` token is returned; parameters
        such as ``; charset=utf-8`` are dropped. The accepted characters
        are the RFC 6838 "restricted-name" characters, so subtypes such
        as ``xhtml+xml`` or ``vnd.ms-excel`` are returned whole.

        Returns:
            str, None: A string in the form ``type/subtype`` or None.
        '''
        match = re.match(
            r'([a-zA-Z0-9!#$&^_.+-]+/[a-zA-Z0-9!#$&^_.+-]+)', value
        )

        if match:
            return match.group(1)

        return None