예제 #1
0
    def test_name_value_fallback(self):
        # The record is created without an explicit encoding, so the KOI8-R
        # bytes are expected to come back as if decoded as Latin-1.
        raw_bytes = 'Name: Кракозябры'.encode('koi8-r')
        expected = 'Кракозябры'.encode('koi8-r').decode('latin1')

        parsed = NameValueRecord()
        parsed.parse(raw_bytes)

        self.assertEqual(expected, parsed['name'])
예제 #2
0
 def test_name_value_str_format(self):
     # str() of the parsed record is expected to render the fields as
     # CRLF-terminated lines.
     expected_text = ''.join([
         'Entry:\r\n',
         'Who: Gilbert, W.S. | Sullivan, Arthur\r\n',
         'What: The Yeomen of the Guard\r\n',
         'When/Created: 1888\r\n',
     ])

     parsed = NameValueRecord()
     parsed.parse(self.RECORD_STR_1)

     self.assertEqual(expected_text, str(parsed))
예제 #3
0
    def _build_phantomjs_coprocessor(cls, session: AppSession,
                                     proxy_port: int):
        '''Build the PhantomJS coprocessor.

        Args:
            session: Application session supplying the parsed arguments
                and the object factory.
            proxy_port: Port combined with
                ``session.args.proxy_server_address`` to form the value of
                PhantomJS's ``--proxy`` option.

        Returns:
            The ``PhantomJSCoprocessor`` created by the session factory.
        '''
        args = session.args
        header_record = NameValueRecord()

        for raw_header in args.header:
            header_record.parse(raw_header)

        # PhantomJS only accepts a one-to-one header mapping. The defaults
        # are added last because NameValueRecord.items() will use only the
        # first value added for each key.
        header_record.add('Accept-Language', '*')

        if not args.http_compression:
            header_record.add('Accept-Encoding', 'identity')

        custom_headers = dict(header_record.items())

        page_settings = {}

        if args.read_timeout:
            # NOTE(review): presumably seconds -> milliseconds; confirm.
            page_settings['resourceTimeout'] = args.read_timeout * 1000

        page_settings['userAgent'] = (
            args.user_agent or session.default_user_agent)

        # Test for the executable early, before anything is built around it.
        wpull.driver.phantomjs.get_version(args.phantomjs_exe)

        params = PhantomJSParams(
            wait_time=args.phantomjs_wait,
            num_scrolls=args.phantomjs_scroll,
            smart_scroll=args.phantomjs_smart_scroll,
            snapshot=args.phantomjs_snapshot,
            custom_headers=custom_headers,
            page_settings=page_settings,
            load_time=args.phantomjs_max_time,
        )

        proxy_address = '{}:{}'.format(args.proxy_server_address, proxy_port)
        extra_args = ['--proxy', proxy_address, '--ignore-ssl-errors=true']

        driver_factory = functools.partial(
            session.factory.class_map['PhantomJSDriver'],
            exe_path=args.phantomjs_exe,
            extra_args=extra_args,
        )

        return session.factory.new(
            'PhantomJSCoprocessor',
            driver_factory,
            session.factory['ProcessingRule'],
            params,
            root_path=args.directory_prefix,
            warc_recorder=session.factory.get('WARCRecorder'),
        )
예제 #4
0
파일: download.py 프로젝트: Super-Rad/wpull
    def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int):
        '''Build the PhantomJS coprocessor.

        Args:
            session: Application session supplying the parsed arguments
                and the object factory.
            proxy_port: Port combined with
                ``session.args.proxy_server_address`` to form the value of
                PhantomJS's ``--proxy`` option.

        Returns:
            The ``PhantomJSCoprocessor`` created by the session factory.
        '''
        options = session.args
        factory = session.factory

        headers = NameValueRecord()

        for header_line in options.header:
            headers.parse(header_line)

        # Only a one-to-one mapping can be handed to PhantomJS, so the
        # defaults go in last: NameValueRecord.items() will use only the
        # first value added for each key.
        headers.add('Accept-Language', '*')

        if not options.http_compression:
            headers.add('Accept-Encoding', 'identity')

        header_map = dict(headers.items())

        page_settings = {}

        if options.read_timeout:
            # NOTE(review): presumably seconds -> milliseconds; confirm.
            page_settings['resourceTimeout'] = options.read_timeout * 1000

        page_settings['userAgent'] = (
            options.user_agent or session.default_user_agent)

        # Test for the executable early, before anything is built around it.
        wpull.driver.phantomjs.get_version(options.phantomjs_exe)

        phantomjs_params = PhantomJSParams(
            wait_time=options.phantomjs_wait,
            num_scrolls=options.phantomjs_scroll,
            smart_scroll=options.phantomjs_smart_scroll,
            snapshot=options.phantomjs_snapshot,
            custom_headers=header_map,
            page_settings=page_settings,
            load_time=options.phantomjs_max_time,
        )

        driver_factory = functools.partial(
            factory.class_map['PhantomJSDriver'],
            exe_path=options.phantomjs_exe,
            extra_args=[
                '--proxy',
                '{}:{}'.format(options.proxy_server_address, proxy_port),
                '--ignore-ssl-errors=true',
            ],
        )

        return factory.new(
            'PhantomJSCoprocessor',
            driver_factory,
            factory['ProcessingRule'],
            phantomjs_params,
            root_path=options.directory_prefix,
            warc_recorder=factory.get('WARCRecorder'),
        )
예제 #5
0
    def test_name_value_fallback(self):
        # No encoding is passed to the record; the expected value is the
        # KOI8-R byte sequence reinterpreted as Latin-1 text.
        koi8_payload = 'Name: Кракозябры'.encode('koi8-r')

        fallback_record = NameValueRecord()
        fallback_record.parse(koi8_payload)

        latin1_view = 'Кракозябры'.encode('koi8-r').decode('latin1')
        self.assertEqual(latin1_view, fallback_record['name'])
예제 #6
0
    def test_name_value_encoding(self):
        # With the matching encoding supplied, the KOI8-R bytes are expected
        # to decode back to the original Cyrillic text.
        koi8_payload = 'Name: Кракозябры'.encode('koi8-r')

        koi8_record = NameValueRecord(encoding='koi8-r')
        koi8_record.parse(koi8_payload)

        self.assertEqual('Кракозябры', koi8_record['name'])
예제 #7
0
 def test_name_value_str_format(self):
     # The string form of the parsed record is compared against the
     # expected CRLF-terminated field lines.
     parsed = NameValueRecord()
     parsed.parse(self.RECORD_STR_1)
     rendered = str(parsed)

     expected_lines = (
         'Entry:\r\n',
         'Who: Gilbert, W.S. | Sullivan, Arthur\r\n',
         'What: The Yeomen of the Guard\r\n',
         'When/Created: 1888\r\n',
     )
     self.assertEqual(''.join(expected_lines), rendered)
예제 #8
0
파일: builder.py 프로젝트: mback2k/wpull
    def _build_phantomjs_controller(self):
        '''Build the proxy server, the PhantomJS client and its controller.

        Returns:
            The ``PhantomJSController`` created by the factory, or ``None``
            when the ``phantomjs`` argument is not set.
        '''
        args = self._args

        if not args.phantomjs:
            return None

        proxy_server = self._factory.new(
            'HTTPProxyServer', self.factory['Client'])
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()
        proxy_server.add_socket(proxy_socket)

        header_record = NameValueRecord()

        for raw_header in args.header:
            header_record.parse(raw_header)

        # PhantomJS only accepts a one-to-one header mapping. The defaults
        # are added last because NameValueRecord.items() will use only the
        # first value added for each key.
        header_record.add('Accept-Language', '*')

        if not args.http_compression:
            header_record.add('Accept-Encoding', 'identity')

        header_map = dict(header_record.items())

        page_settings = {}

        if args.read_timeout:
            # NOTE(review): presumably seconds -> milliseconds; confirm.
            page_settings['resourceTimeout'] = args.read_timeout * 1000

        page_settings['userAgent'] = (
            args.user_agent or self.default_user_agent)

        client = self._factory.new(
            'PhantomJSClient',
            'localhost:{0}'.format(proxy_port),
            page_settings=page_settings,
            default_headers=header_map,
            exe_path=args.phantomjs_exe
        )
        # Check the client executable before building the controller.
        client.test_client_exe()

        return self._factory.new(
            'PhantomJSController',
            client,
            wait_time=args.phantomjs_wait,
            num_scrolls=args.phantomjs_scroll,
            warc_recorder=self.factory.get('WARCRecorder'),
            smart_scroll=args.phantomjs_smart_scroll,
            snapshot=args.phantomjs_snapshot,
        )
예제 #9
0
    def test_missing_colon(self):
        malformed = 'text:hello\nhi\n'

        # Parsing without strict=False must reject the line lacking a colon.
        strict_record = NameValueRecord()
        with self.assertRaises(ValueError):
            strict_record.parse(malformed)

        # Non-strict parsing keeps the valid field and drops the bad line.
        lenient_record = NameValueRecord()
        lenient_record.parse(malformed, strict=False)

        self.assertEqual('hello', lenient_record['text'])
        self.assertNotIn('hi', lenient_record)
예제 #10
0
    def test_missing_colon(self):
        text_without_colon = 'text:hello\nhi\n'

        # Default parsing is expected to raise on the colon-less "hi" line.
        with self.assertRaises(ValueError):
            NameValueRecord().parse(text_without_colon)

        # With strict=False the same input parses; only "text" is stored.
        tolerant = NameValueRecord()
        tolerant.parse(text_without_colon, strict=False)

        self.assertEqual('hello', tolerant['text'])
        self.assertNotIn('hi', tolerant)
예제 #11
0
    def _build_phantomjs_controller(self):
        '''Build the proxy server, the PhantomJS client and its controller.

        Returns:
            The ``PhantomJSController`` created by the factory, or ``None``
            when the ``phantomjs`` argument is not set.
        '''
        options = self._args

        if not options.phantomjs:
            return None

        proxy_server = self._factory.new(
            'HTTPProxyServer',
            self.factory['Client'],
        )
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()
        proxy_server.add_socket(proxy_socket)

        headers = NameValueRecord()

        for header_line in options.header:
            headers.parse(header_line)

        # Only a one-to-one mapping can be handed to PhantomJS, so the
        # defaults go in last: NameValueRecord.items() will use only the
        # first value added for each key.
        headers.add('Accept-Language', '*')

        if not options.http_compression:
            headers.add('Accept-Encoding', 'identity')

        header_map = dict(headers.items())

        page_settings = {}

        if options.read_timeout:
            # NOTE(review): presumably seconds -> milliseconds; confirm.
            page_settings['resourceTimeout'] = options.read_timeout * 1000

        page_settings['userAgent'] = (
            options.user_agent or self.default_user_agent)

        client = self._factory.new(
            'PhantomJSClient',
            'localhost:{0}'.format(proxy_port),
            page_settings=page_settings,
            default_headers=header_map,
        )
        # Check the client executable before building the controller.
        client.test_client_exe()

        return self._factory.new(
            'PhantomJSController',
            client,
            wait_time=options.phantomjs_wait,
            num_scrolls=options.phantomjs_scroll,
            warc_recorder=self.factory.get('WARCRecorder'),
            smart_scroll=options.phantomjs_smart_scroll,
        )
예제 #12
0
    def test_mixed_line_ending(self):
        parsed = NameValueRecord()
        parsed.parse(self.MIXED_LINE_ENDING_STR_1)

        # (field name, expected value), checked in this order so the first
        # failing field is reported just as with individual assertions.
        expected_pairs = (
            ('dog', 'woof'),
            ('cat', 'meow'),
            ('bird', 'tweet'),
            ('mouse', 'squeak'),
            ('cow', 'moo'),
            ('frog', 'croak'),
            ('elephant', 'toot'),
            ('duck', 'quack'),
            ('fish', 'blub'),
            ('seal', 'ow ow ow'),
            ('fox', '???'),
        )

        for field_name, expected_value in expected_pairs:
            self.assertEqual(expected_value, parsed[field_name])
예제 #13
0
 def test_name_value_record_parsing(self):
     # The lower-case key 'who' is used for both the membership check
     # and the value lookup.
     parsed = NameValueRecord()
     parsed.parse(self.RECORD_STR_1)

     expected_who = 'Gilbert, W.S. | Sullivan, Arthur'

     self.assertIn('who', parsed)
     self.assertEqual(expected_who, parsed['who'])
예제 #14
0
    def test_name_value_utf8(self):
        # A str input containing a non-ASCII character should come back
        # unchanged as the field value.
        source_text = 'Name: dogé'

        parsed = NameValueRecord()
        parsed.parse(source_text)

        self.assertEqual('dogé', parsed['name'])
예제 #15
0
    def test_name_value_utf8(self):
        # The accented value is given as str and is expected to be stored
        # as-is under the 'name' key.
        expected_value = 'dogé'

        utf8_record = NameValueRecord()
        utf8_record.parse('Name: dogé')

        self.assertEqual(expected_value, utf8_record['name'])
예제 #16
0
 def test_name_value_record_parsing(self):
     # After parsing, 'who' must be present and carry the full value.
     who_record = NameValueRecord()
     who_record.parse(self.RECORD_STR_1)

     self.assertIn('who', who_record)

     actual_who = who_record['who']
     self.assertEqual('Gilbert, W.S. | Sullivan, Arthur', actual_who)
예제 #17
0
class Response(BaseResponse, SerializableMixin, DictableMixin):
    '''Represents the HTTP response.

    Attributes:
        status_code (int): The status code in the status line.
        reason (str): The status reason string in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.1``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP headers (and trailer, if present).
        body (:class:`.body.Body`, file-like, None): The optional payload
            (without any transfer or content encoding). It is not assigned
            in this class's ``__init__``; presumably a base class provides
            it -- confirm.
        request: The corresponding request.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self,
                 status_code=None,
                 reason=None,
                 version='HTTP/1.1',
                 request=None):
        '''Create a response.

        Args:
            status_code (int, None): The status code. When given, ``reason``
                must be given as well.
            reason (str, None): The status reason string.
            version (str): The HTTP version string.
            request: The corresponding request.
        '''
        super().__init__()

        if status_code is not None:
            assert isinstance(status_code, int), \
                'Expect int, got {}'.format(type(status_code))
            assert reason is not None

        self.status_code = status_code
        self.reason = reason
        self.version = version
        self.fields = NameValueRecord(encoding='latin-1')
        self.request = request
        self.encoding = 'latin-1'

    @property
    def protocol(self):
        '''The protocol name, always ``http``.'''
        return 'http'

    def to_dict(self):
        '''Return a dict describing this response.

        ``response_code`` and ``response_message`` repeat ``status_code``
        and ``reason``.
        '''
        return {
            'protocol': 'http',
            'status_code': self.status_code,
            'reason': self.reason,
            'response_code': self.status_code,
            'response_message': self.reason,
            'version': self.version,
            'fields': list(self.fields.get_all()),
            'body': self.call_to_dict_or_none(self.body),
            'request': self.request.to_dict() if self.request else None,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the status line and the fields.

        Returns:
            bytes: The encoded status line and the serialized fields, each
            followed by CRLF.
        '''
        assert self.version
        assert self.status_code is not None
        assert self.reason is not None

        status = '{0} {1} {2}'.format(self.version, self.status_code,
                                      self.reason).encode(self.encoding)
        fields = self.fields.to_bytes(errors='replace')

        # The trailing empty item makes the join end with a final CRLF.
        return b'\r\n'.join([status, fields, b''])

    def parse(self, data):
        '''Parse header bytes into this response.

        When ``status_code`` is still unset, everything up to the first
        ``\\n`` is consumed as the status line. The remaining bytes are
        passed to the fields in non-strict mode.

        Args:
            data (bytes): The header bytes.

        Raises:
            ValueError: The status line is needed but ``data`` contains
                no ``\\n``.
            ProtocolError: The status line cannot be parsed.
        '''
        if self.status_code is None:
            line, data = data.split(b'\n', 1)
            self.version, self.status_code, self.reason = \
                self.parse_status_line(line)

        self.fields.parse(data, strict=False)

    @classmethod
    def parse_status_line(cls, data):
        '''Parse the status line bytes.

        Args:
            data (bytes): The status line.

        Returns:
            tuple: A tuple representing the version, code, and reason.

        Raises:
            ProtocolError: The line does not look like a status line.
        '''
        match = re.match(br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
                         data)

        if not match:
            raise ProtocolError(
                'Error parsing status line {line}.'.format(line=ascii(data)))

        # The pattern has exactly three groups, so this unpacking cannot
        # fail once the match succeeded.
        version, code, reason = match.groups()

        return wpull.string.to_str(
            (version, int(code), reason),
            encoding='latin-1',
        )

    def __repr__(self):
        return '<Response({version}, {code}, {reason})>'.format(
            version=ascii(self.version),
            code=self.status_code,
            reason=ascii(self.reason))

    def __str__(self):
        # The serialized bytes are decoded as UTF-8 with replacement of
        # invalid sequences, then made printable while keeping newlines.
        return wpull.string.printable_str(
            self.to_bytes().decode('utf-8', 'replace'),
            keep_newlines=True)

    def response_code(self):
        '''Return the status code.'''
        return self.status_code

    def response_message(self):
        '''Return the status reason string.'''
        return self.reason
예제 #18
0
파일: request.py 프로젝트: Willianvdv/wpull
class RawRequest(SerializableMixin, DictableMixin):
    '''An HTTP request: its first line plus the header fields.

    Attributes:
        method (str): The HTTP method in the status line. For example, ``GET``,
            ``POST``.
        resource_path (str): The URL or "path" in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.0``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP header.
        body (:class:`.body.Body`, file-like, None): An optional payload.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self, method=None, resource_path=None, version='HTTP/1.1'):
        super().__init__()
        self.encoding = 'latin-1'
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.method = method
        self.resource_path = resource_path
        self.version = version

    def to_dict(self):
        '''Return a dict describing this request.'''
        field_pairs = list(self.fields.get_all())
        body_dict = self.call_to_dict_or_none(self.body)

        return {
            'protocol': 'http',
            'method': self.method,
            'version': self.version,
            'resource_path': self.resource_path,
            'fields': field_pairs,
            'body': body_dict,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the first line and the fields, each followed by CRLF.'''
        assert self.method
        assert self.resource_path
        assert self.version

        first_line = '{0} {1} {2}'.format(
            self.method, self.resource_path, self.version)
        encoded_line = first_line.encode(self.encoding)
        encoded_fields = self.fields.to_bytes(errors='replace')

        # The trailing empty item makes the join end with a final CRLF.
        return b'\r\n'.join((encoded_line, encoded_fields, b''))

    def parse(self, data):
        '''Parse header bytes into this request.

        While ``resource_path`` is unset, everything up to the first
        ``\\n`` is consumed as the first line. The remaining bytes go to
        the fields in non-strict mode.
        '''
        if not self.resource_path:
            first_line, data = data.split(b'\n', 1)
            parsed_line = self.parse_status_line(first_line)
            self.method, self.resource_path, self.version = parsed_line

        self.fields.parse(data, strict=False)

    def parse_status_line(self, data):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the method, URI, and
            version.

        Raises:
            ProtocolError: The line does not match the expected form.
        '''
        matched = re.match(
            br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)', data)

        if not matched:
            raise ProtocolError('Error parsing status line.')

        # The pattern has exactly three groups: method, URI and version.
        return wpull.string.to_str(matched.groups(), encoding=self.encoding)

    def __repr__(self):
        template = '<Request({method}, {url}, {version})>'
        return template.format(
            method=self.method,
            url=self.resource_path,
            version=self.version,
        )

    def copy(self):
        '''Return a deep copy of this request.'''
        return copy.deepcopy(self)

    def set_continue(self, offset):
        '''Turn the request into a range request starting at ``offset``.'''
        assert offset >= 0, offset
        range_value = 'bytes={0}-'.format(offset)
        self.fields['Range'] = range_value
예제 #19
0
파일: request.py 프로젝트: Willianvdv/wpull
class Response(SerializableMixin, DictableMixin, ProtocolResponseMixin):
    '''Represents the HTTP response.

    Attributes:
        status_code (int): The status code in the status line.
        reason (str): The status reason string in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.1``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP headers (and trailer, if present).
        body (:class:`.body.Body`, file-like, None): The optional payload
            (without any transfer or content encoding).
        request: The corresponding request.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self, status_code=None, reason=None, version='HTTP/1.1', request=None):
        '''Create a response.

        Args:
            status_code (int, None): The status code. When given, ``reason``
                must be given as well.
            reason (str, None): The status reason string.
            version (str): The HTTP version string.
            request: The corresponding request.
        '''
        if status_code is not None:
            assert isinstance(status_code, int), \
                'Expect int, got {}'.format(type(status_code))
            assert reason is not None

        self.status_code = status_code
        self.reason = reason
        self.version = version
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.request = request
        self.encoding = 'latin-1'

    @property
    def protocol(self):
        '''The protocol name, always ``http``.'''
        return 'http'

    def to_dict(self):
        '''Return a dict describing this response.

        ``response_code`` and ``response_message`` repeat ``status_code``
        and ``reason``.
        '''
        return {
            'protocol': 'http',
            'status_code': self.status_code,
            'reason': self.reason,
            'response_code': self.status_code,
            'response_message': self.reason,
            'version': self.version,
            'fields': list(self.fields.get_all()),
            'body': self.call_to_dict_or_none(self.body),
            'request': self.request.to_dict() if self.request else None,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the status line and the fields.

        Returns:
            bytes: The encoded status line and the serialized fields, each
            followed by CRLF.
        '''
        assert self.version
        assert self.status_code is not None
        assert self.reason is not None

        status = '{0} {1} {2}'.format(
            self.version, self.status_code, self.reason
        ).encode(self.encoding)
        fields = self.fields.to_bytes(errors='replace')

        # The trailing empty item makes the join end with a final CRLF.
        return b'\r\n'.join([status, fields, b''])

    def parse(self, data):
        '''Parse header bytes into this response.

        When ``status_code`` is still unset, everything up to the first
        ``\\n`` is consumed as the status line. The remaining bytes are
        passed to the fields in non-strict mode.

        Args:
            data (bytes): The header bytes.

        Raises:
            ValueError: The status line is needed but ``data`` contains
                no ``\\n``.
            ProtocolError: The status line cannot be parsed.
        '''
        if self.status_code is None:
            line, data = data.split(b'\n', 1)
            self.version, self.status_code, self.reason = \
                self.parse_status_line(line)

        self.fields.parse(data, strict=False)

    @classmethod
    def parse_status_line(cls, data):
        '''Parse the status line bytes.

        Args:
            data (bytes): The status line.

        Returns:
            tuple: A tuple representing the version, code, and reason.

        Raises:
            ProtocolError: The line does not look like a status line.
        '''
        match = re.match(
            br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
            data
        )

        if not match:
            raise ProtocolError(
                'Error parsing status line {line}.'.format(line=ascii(data))
            )

        # The pattern has exactly three groups, so this unpacking cannot
        # fail once the match succeeded.
        version, code, reason = match.groups()

        return wpull.string.to_str(
            (version, int(code), reason),
            encoding='latin-1',
        )

    def __repr__(self):
        return '<Response({version}, {code}, {reason})>'.format(
            version=ascii(self.version), code=self.status_code,
            reason=ascii(self.reason)
        )

    def __str__(self):
        # The serialized bytes are decoded as UTF-8 with replacement of
        # invalid sequences, then made printable while keeping newlines.
        return wpull.string.printable_str(
            self.to_bytes().decode('utf-8', 'replace'), keep_newlines=True
        )

    def response_code(self):
        '''Return the status code.'''
        return self.status_code

    def response_message(self):
        '''Return the status reason string.'''
        return self.reason
예제 #20
0
class RawRequest(BaseRequest, SerializableMixin, DictableMixin):
    '''An HTTP request: its first line plus the header fields.

    Attributes:
        method (str): The HTTP method in the status line. For example, ``GET``,
            ``POST``.
        resource_path (str): The URL or "path" in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.0``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP header.
        body (:class:`.body.Body`, file-like, None): An optional payload.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self, method=None, resource_path=None, version='HTTP/1.1'):
        super().__init__()
        self.encoding = 'latin-1'
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.method = method
        self.resource_path = resource_path
        self.version = version

    def to_dict(self):
        '''Return a dict describing this request.'''
        header_pairs = list(self.fields.get_all())
        payload_info = self.call_to_dict_or_none(self.body)

        return {
            'protocol': 'http',
            'method': self.method,
            'version': self.version,
            'resource_path': self.resource_path,
            'fields': header_pairs,
            'body': payload_info,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the first line and the fields, each followed by CRLF.'''
        assert self.method
        assert self.resource_path
        assert self.version

        line_text = '{0} {1} {2}'.format(
            self.method, self.resource_path, self.version)
        line_bytes = line_text.encode(self.encoding)
        field_bytes = self.fields.to_bytes(errors='replace')

        # The trailing empty item makes the join end with a final CRLF.
        return b'\r\n'.join((line_bytes, field_bytes, b''))

    def parse(self, data):
        '''Parse header bytes into this request.

        While ``resource_path`` is unset, everything up to the first
        ``\\n`` is consumed as the first line. The remaining bytes go to
        the fields in non-strict mode.
        '''
        if not self.resource_path:
            head, data = data.split(b'\n', 1)
            line_parts = self.parse_status_line(head)
            self.method, self.resource_path, self.version = line_parts

        self.fields.parse(data, strict=False)

    def parse_status_line(self, data):
        '''Parse the status line bytes.

        Returns:
            tuple: A tuple representing the method, URI, and
            version.

        Raises:
            ProtocolError: The line does not match the expected form.
        '''
        found = re.match(
            br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)', data)

        if not found:
            raise ProtocolError('Error parsing status line.')

        # The pattern has exactly three groups: method, URI and version.
        return wpull.string.to_str(found.groups(), encoding=self.encoding)

    def __repr__(self):
        layout = '<Request({method}, {url}, {version})>'
        return layout.format(
            method=self.method,
            url=self.resource_path,
            version=self.version,
        )

    def copy(self):
        '''Return a deep copy of this request.'''
        return copy.deepcopy(self)

    def set_continue(self, offset):
        '''Turn the request into a range request starting at ``offset``.'''
        assert offset >= 0, offset
        byte_range = 'bytes={0}-'.format(offset)
        self.fields['Range'] = byte_range