Exemplo n.º 1
0
    def test_split_url_to_filename(self):
        """Check URL-to-path splitting and rejection of dot path segments."""
        expected_parts = ['example.com', 'index.php_article=Main_Page']
        actual_parts = util.split_url_to_filename(
            'http://example.com/index.php?article=Main_Page')
        self.assertEqual(expected_parts, actual_parts)

        # A '..' path segment must be rejected with ValueError.
        with self.assertRaises(ValueError):
            util.split_url_to_filename('http://example.com/../system')

        # A '.' path segment must be rejected with ValueError as well.
        with self.assertRaises(ValueError):
            util.split_url_to_filename('http://example.com/./system')
Exemplo n.º 2
0
    def test_split_url_to_filename(self):
        """Verify the split of a normal URL and that dot segments raise."""
        parts = util.split_url_to_filename(
            'http://example.com/index.php?article=Main_Page')
        self.assertEqual(['example.com', 'index.php_article=Main_Page'], parts)

        # Both '..' and '.' path segments are expected to raise ValueError;
        # they are checked in the same order as before.
        rejected_urls = (
            'http://example.com/../system',
            'http://example.com/./system',
        )
        for bad_url in rejected_urls:
            self.assertRaises(ValueError, util.split_url_to_filename, bad_url)
Exemplo n.º 3
0
    def action(self, record):
        """Extract the HTTP payload of a WARC response record to a file.

        The record is skipped silently unless it is a ``response`` record
        whose content block is a ``model.BlockWithPayload`` carrying
        ``model.HTTPHeaders`` with a 200 (OK) status code.

        The destination path is built under ``self.out_dir`` from the
        record's ``WARC-Target-URI``. If the response has a parseable
        ``Last-Modified`` header, that time is applied as the file's mtime.

        :param record: the WARC record to examine and, if eligible, extract.
        """
        # Function-scope import: only the mtime conversion below needs it.
        import calendar

        if record.warc_type != 'response':
            return
        if not isinstance(record.content_block, model.BlockWithPayload):
            return
        if not isinstance(record.content_block.fields, model.HTTPHeaders):
            return
        if record.content_block.fields.status_code != http.client.OK:
            return

        url = record.header.fields['WARC-Target-URI']
        binary_block = record.content_block.binary_block
        file_obj = binary_block.get_file()
        data = file_obj.read(binary_block.length)
        response = util.parse_http_response(data)
        path_list = util.split_url_to_filename(url)
        path_list = util.truncate_filename_parts(path_list)
        path = os.path.join(self.out_dir, *path_list)
        # The parent directory is taken from the path before any index
        # filename is appended below.
        dir_path = os.path.dirname(path)

        if os.path.isdir(path):
            path = util.append_index_filename(path)

        _logger.debug('Extracting %s to %s', record.record_id, path)
        util.rename_filename_dirs(path)
        os.makedirs(dir_path, exist_ok=True)

        try:
            with open(path, 'wb') as f:
                shutil.copyfileobj(response, f)
        except http.client.IncompleteRead as error:
            _logger.warning('Malformed HTTP response: %s', error)

            # Best effort: rewrite the file with whatever was received.
            with open(path, 'wb') as f:
                f.write(error.partial)

        last_modified_str = response.getheader('Last-Modified')

        if last_modified_str:
            try:
                last_modified = util.parse_http_date(last_modified_str)
            except ValueError:
                # An unparseable date is ignored; the file keeps its mtime.
                pass
            else:
                # utctimetuple() yields a UTC time tuple, so it must be
                # converted with calendar.timegm(); time.mktime() would
                # interpret it as local time and shift the result by the
                # machine's UTC offset.
                timestamp = calendar.timegm(last_modified.utctimetuple())
                os.utime(path, (time.time(), timestamp))
                _logger.debug('Apply mtime %d to %s', timestamp, path)

        _logger.info('Extracted %s to %s', record.record_id, path)
Exemplo n.º 4
0
Arquivo: tool.py Projeto: chfoo/warcat
    def action(self, record):
        """Write the body of an HTTP 200 WARC response record to disk.

        Nothing happens unless the record is a ``response`` record, its
        content block is a ``model.BlockWithPayload``, the block's fields
        are ``model.HTTPHeaders``, and the status code is 200 (OK).

        The output path lives under ``self.out_dir`` and is derived from
        the record's ``WARC-Target-URI``. A parseable ``Last-Modified``
        response header is applied as the extracted file's mtime.

        :param record: the WARC record to examine and, if eligible, extract.
        """
        # Function-scope import: only the mtime conversion below needs it.
        import calendar

        if record.warc_type != 'response':
            return
        if not isinstance(record.content_block, model.BlockWithPayload):
            return
        if not isinstance(record.content_block.fields, model.HTTPHeaders):
            return
        if record.content_block.fields.status_code != http.client.OK:
            return

        url = record.header.fields['WARC-Target-URI']
        binary_block = record.content_block.binary_block
        file_obj = binary_block.get_file()
        data = file_obj.read(binary_block.length)
        response = util.parse_http_response(data)
        path_list = util.split_url_to_filename(url)
        path_list = util.truncate_filename_parts(path_list)
        path = os.path.join(self.out_dir, *path_list)
        # The parent directory is computed before an index filename may be
        # appended to the path below.
        dir_path = os.path.dirname(path)

        if os.path.isdir(path):
            path = util.append_index_filename(path)

        _logger.debug('Extracting %s to %s', record.record_id, path)
        util.rename_filename_dirs(path)
        os.makedirs(dir_path, exist_ok=True)

        try:
            with open(path, 'wb') as f:
                shutil.copyfileobj(response, f)
        except http.client.IncompleteRead as error:
            _logger.warning('Malformed HTTP response: %s', error)

            # Best effort: overwrite the file with the partial body.
            with open(path, 'wb') as f:
                f.write(error.partial)

        last_modified_str = response.getheader('Last-Modified')

        if last_modified_str:
            try:
                last_modified = util.parse_http_date(last_modified_str)
            except ValueError:
                # An unparseable date is ignored; the file keeps its mtime.
                pass
            else:
                # The tuple from utctimetuple() is in UTC, so it is turned
                # into an epoch timestamp with calendar.timegm().
                # time.mktime() expects local time and would skew the
                # mtime by the machine's UTC offset.
                timestamp = calendar.timegm(last_modified.utctimetuple())
                os.utime(path, (time.time(), timestamp))
                _logger.debug('Apply mtime %d to %s', timestamp, path)

        _logger.info('Extracted %s to %s', record.record_id, path)
Exemplo n.º 5
0
    def test_split_url_to_filename(self):
        """Exercise split_url_to_filename on a valid URL and two bad ones."""
        split_result = util.split_url_to_filename(
            "http://example.com/index.php?article=Main_Page"
        )
        self.assertEqual(
            ["example.com", "index.php_article=Main_Page"], split_result
        )

        # Parent-directory segment: expected to raise ValueError.
        self.assertRaises(
            ValueError,
            util.split_url_to_filename,
            "http://example.com/../system",
        )

        # Current-directory segment: expected to raise ValueError.
        self.assertRaises(
            ValueError,
            util.split_url_to_filename,
            "http://example.com/./system",
        )
Exemplo n.º 6
0
 def f2():
     """Call util.split_url_to_filename on a URL with a './' segment."""
     dotted_url = 'http://example.com/./system'
     util.split_url_to_filename(dotted_url)
Exemplo n.º 7
0
 def f2():
     """Invoke the URL splitter on a URL whose path contains '.'."""
     url_with_dot_segment = 'http://example.com/./system'
     util.split_url_to_filename(url_with_dot_segment)
Exemplo n.º 8
0
 def f1():
     """Call util.split_url_to_filename on a URL with a '../' segment."""
     parent_ref_url = "http://example.com/../system"
     util.split_url_to_filename(parent_ref_url)