예제 #1
0
    def _build_phantomjs_coprocessor(cls, session: AppSession,
                                     proxy_port: int):
        '''Create the PhantomJS coprocessor for a session.

        Args:
            session: Application session supplying the parsed arguments
                (``session.args``), the default user agent and the object
                factory.
            proxy_port: Port number placed in the ``--proxy`` argument
                handed to PhantomJS.

        Returns:
            The object produced by ``session.factory.new`` for
            ``'PhantomJSCoprocessor'``.
        '''
        args = session.args

        header_record = NameValueRecord()
        for raw_header in args.header:
            header_record.parse(raw_header)

        # PhantomJS only accepts a one-to-one header mapping. The defaults
        # below are added after the user's headers because
        # NameValueRecord.items() is expected to keep only the first value
        # added for each key.
        header_record.add('Accept-Language', '*')
        if not args.http_compression:
            header_record.add('Accept-Encoding', 'identity')
        custom_headers = dict(header_record.items())

        page_settings = {}
        if args.read_timeout:
            # Presumably seconds converted to milliseconds -- TODO confirm.
            page_settings['resourceTimeout'] = args.read_timeout * 1000
        page_settings['userAgent'] = (
            args.user_agent or session.default_user_agent)

        # Query the executable up front so a bad path shows up early.
        wpull.driver.phantomjs.get_version(args.phantomjs_exe)

        params = PhantomJSParams(
            wait_time=args.phantomjs_wait,
            num_scrolls=args.phantomjs_scroll,
            smart_scroll=args.phantomjs_smart_scroll,
            snapshot=args.phantomjs_snapshot,
            custom_headers=custom_headers,
            page_settings=page_settings,
            load_time=args.phantomjs_max_time,
        )

        proxy_address = '{}:{}'.format(args.proxy_server_address, proxy_port)
        driver_args = ['--proxy', proxy_address, '--ignore-ssl-errors=true']

        # Bind the driver options now; the coprocessor receives a callable
        # rather than a driver instance.
        driver_factory = functools.partial(
            session.factory.class_map['PhantomJSDriver'],
            exe_path=args.phantomjs_exe,
            extra_args=driver_args,
        )

        return session.factory.new(
            'PhantomJSCoprocessor',
            driver_factory,
            session.factory['ProcessingRule'],
            params,
            root_path=args.directory_prefix,
            warc_recorder=session.factory.get('WARCRecorder'),
        )
예제 #2
0
파일: download.py 프로젝트: Super-Rad/wpull
    def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int):
        '''Assemble and return the PhantomJS coprocessor.

        Collects the request headers and page settings from the session
        arguments, wraps them in ``PhantomJSParams``, prepares a driver
        factory that points PhantomJS at the proxy, and asks the session
        factory for a ``'PhantomJSCoprocessor'``.

        Args:
            session: Application session providing ``args``,
                ``default_user_agent`` and ``factory``.
            proxy_port: Port used in the ``--proxy`` command line argument.

        Returns:
            The coprocessor object created by ``session.factory.new``.
        '''
        options = session.args
        factory = session.factory

        headers = NameValueRecord()
        for header_line in options.header:
            headers.parse(header_line)

        # Only a one-to-one mapping can be handed to PhantomJS, so these
        # fallbacks go in last: NameValueRecord.items() is expected to use
        # just the first value added for each key.
        headers.add('Accept-Language', '*')
        if not options.http_compression:
            headers.add('Accept-Encoding', 'identity')
        header_map = dict(headers.items())

        settings = {}
        if options.read_timeout:
            # Presumably seconds scaled to milliseconds -- TODO confirm.
            settings['resourceTimeout'] = options.read_timeout * 1000
        settings['userAgent'] = (
            options.user_agent or session.default_user_agent)

        # Check the executable before constructing anything that needs it.
        wpull.driver.phantomjs.get_version(options.phantomjs_exe)

        phantomjs_params = PhantomJSParams(
            wait_time=options.phantomjs_wait,
            num_scrolls=options.phantomjs_scroll,
            smart_scroll=options.phantomjs_smart_scroll,
            snapshot=options.phantomjs_snapshot,
            custom_headers=header_map,
            page_settings=settings,
            load_time=options.phantomjs_max_time,
        )

        proxy_endpoint = '{}:{}'.format(
            options.proxy_server_address, proxy_port)

        make_driver = functools.partial(
            factory.class_map['PhantomJSDriver'],
            exe_path=options.phantomjs_exe,
            extra_args=['--proxy', proxy_endpoint, '--ignore-ssl-errors=true'],
        )

        return factory.new(
            'PhantomJSCoprocessor',
            make_driver,
            factory['ProcessingRule'],
            phantomjs_params,
            root_path=options.directory_prefix,
            warc_recorder=factory.get('WARCRecorder'),
        )
예제 #3
0
파일: builder.py 프로젝트: mback2k/wpull
    def _build_phantomjs_controller(self):
        '''Create the proxy server, PhantomJS client and controller.

        Returns:
            The ``'PhantomJSController'`` object created by the factory, or
            ``None`` when ``self._args.phantomjs`` is falsy.
        '''
        args = self._args

        if not args.phantomjs:
            return None

        # The proxy server listens on a socket bound to an unused port.
        # NOTE(review): the port helper comes from tornado.testing.
        proxy_server = self._factory.new(
            'HTTPProxyServer', self.factory['Client'])
        listen_socket, proxy_port = tornado.testing.bind_unused_port()
        proxy_server.add_socket(listen_socket)

        header_record = NameValueRecord()
        for raw_header in args.header:
            header_record.parse(raw_header)

        # PhantomJS only accepts a one-to-one header mapping. The defaults
        # below are added after the user's headers because
        # NameValueRecord.items() is expected to keep only the first value
        # added for each key.
        header_record.add('Accept-Language', '*')
        if not args.http_compression:
            header_record.add('Accept-Encoding', 'identity')
        header_map = dict(header_record.items())

        settings = {}
        if args.read_timeout:
            # Presumably seconds converted to milliseconds -- TODO confirm.
            settings['resourceTimeout'] = args.read_timeout * 1000
        settings['userAgent'] = args.user_agent or self.default_user_agent

        client = self._factory.new(
            'PhantomJSClient',
            'localhost:{0}'.format(proxy_port),
            page_settings=settings,
            default_headers=header_map,
            exe_path=args.phantomjs_exe
        )
        # Exercise the client's executable check before building on it.
        client.test_client_exe()

        return self._factory.new(
            'PhantomJSController',
            client,
            wait_time=args.phantomjs_wait,
            num_scrolls=args.phantomjs_scroll,
            warc_recorder=self.factory.get('WARCRecorder'),
            smart_scroll=args.phantomjs_smart_scroll,
            snapshot=args.phantomjs_snapshot,
        )
예제 #4
0
    def _build_phantomjs_controller(self):
        '''Set up the proxy server and PhantomJS client, then the controller.

        Returns:
            The controller created via ``self._factory.new`` for
            ``'PhantomJSController'``; ``None`` if ``self._args.phantomjs``
            is falsy.
        '''
        options = self._args

        if not options.phantomjs:
            return None

        # Attach the proxy server to a socket bound on an unused port.
        # NOTE(review): the port helper comes from tornado.testing.
        server = self._factory.new('HTTPProxyServer', self.factory['Client'])
        bound_socket, proxy_port = tornado.testing.bind_unused_port()
        server.add_socket(bound_socket)

        headers = NameValueRecord()
        for header_line in options.header:
            headers.parse(header_line)

        # Only a one-to-one mapping can be handed to PhantomJS, so these
        # fallbacks go in last: NameValueRecord.items() is expected to use
        # just the first value added for each key.
        headers.add('Accept-Language', '*')
        if not options.http_compression:
            headers.add('Accept-Encoding', 'identity')
        default_headers = dict(headers.items())

        page_settings = {}
        if options.read_timeout:
            # Presumably seconds scaled to milliseconds -- TODO confirm.
            page_settings['resourceTimeout'] = options.read_timeout * 1000
        page_settings['userAgent'] = (
            options.user_agent or self.default_user_agent)

        phantomjs_client = self._factory.new(
            'PhantomJSClient',
            'localhost:{0}'.format(proxy_port),
            page_settings=page_settings,
            default_headers=default_headers,
        )
        # Run the client's executable check before handing it on.
        phantomjs_client.test_client_exe()

        return self._factory.new(
            'PhantomJSController',
            phantomjs_client,
            wait_time=options.phantomjs_wait,
            num_scrolls=options.phantomjs_scroll,
            warc_recorder=self.factory.get('WARCRecorder'),
            smart_scroll=options.phantomjs_smart_scroll,
        )
예제 #5
0
파일: recorder.py 프로젝트: mback2k/wpull
    def _populate_warcinfo(self, extra_fields=None):
        '''Fill in the Warcinfo record's fields, body and checksum.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs that
                are appended to the built-in info fields.
        '''
        warcinfo = self._warcinfo_record
        warcinfo.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        fields = NameValueRecord()

        # Use the configured software string, falling back to the class
        # default when it is empty or unset.
        software = self._params.software_string
        if not software:
            software = self.DEFAULT_SOFTWARE_STRING

        fields['Software'] = software
        fields['format'] = 'WARC File Format 1.0'
        fields['conformsTo'] = (
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
        )

        for name, value in extra_fields or ():
            fields.add(name, value)

        # The record body is the serialized fields followed by a CRLF.
        payload = bytes(fields) + b'\r\n'
        warcinfo.block_file = io.BytesIO(payload)
        warcinfo.compute_checksum()
예제 #6
0
    def _populate_warcinfo(self, extra_fields=None):
        '''Write the metadata block of the Warcinfo record.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs added
                after the standard info fields.
        '''
        record = self._warcinfo_record
        record.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        metadata = NameValueRecord()

        # A falsy configured software string yields the class default.
        software_string = (
            self._params.software_string or self.DEFAULT_SOFTWARE_STRING)
        metadata['Software'] = software_string
        metadata['format'] = 'WARC File Format 1.0'
        metadata['conformsTo'] = (
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
        )

        if extra_fields:
            for field_name, field_value in extra_fields:
                metadata.add(field_name, field_value)

        # Serialized fields plus a trailing CRLF form the record block.
        block = io.BytesIO(bytes(metadata) + b'\r\n')
        record.block_file = block
        record.compute_checksum()
예제 #7
0
    def _populate_warcinfo(self, extra_fields=None):
        '''Fill in the Warcinfo record's fields, body and checksum.

        Args:
            extra_fields: Optional iterable of ``(name, value)`` pairs that
                are appended to the built-in info fields.
        '''
        warcinfo = self._warcinfo_record
        warcinfo.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

        fields = NameValueRecord()

        # The software string combines the wpull and Python versions.
        software = 'Wpull/{0} Python/{1}'.format(
            wpull.version.__version__, wpull.util.python_version())
        fields['Software'] = software
        fields['format'] = 'WARC File Format 1.0'
        fields['conformsTo'] = (
            'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
        )

        for name, value in extra_fields or ():
            fields.add(name, value)

        # The record body is the serialized fields followed by a CRLF.
        payload = bytes(fields) + b'\r\n'
        warcinfo.block_file = io.BytesIO(payload)
        warcinfo.compute_checksum()
예제 #8
0
    def test_with_normalize_overrides(self):
        '''Names listed in normalize_overrides keep their given spelling.'''
        record = NameValueRecord(normalize_overrides=['WARC-Type'])

        def check_only_warc_type(expected_value):
            # The record must hold exactly one 'WARC-Type' entry, reachable
            # under either spelling of the name.
            self.assertIn('WARC-Type', record)
            self.assertEqual(expected_value, record['WARC-Type'])
            self.assertEqual(
                [('WARC-Type', expected_value)], list(record.get_all()))
            self.assertEqual([expected_value], record.get_list('Warc-Type'))
            self.assertEqual(['WARC-Type'], list(record.keys()))

        record.add('WARC-Type', 'warcinfo')
        check_only_warc_type('warcinfo')

        # Assigning through a differently-cased name replaces the value.
        record['Warc-Type'] = 'resource'
        check_only_warc_type('resource')

        # A name without an override is stored in normalized form.
        record['WARC-Blah'] = 'blah'
        self.assertEqual(['WARC-Type', 'Warc-Blah'], list(record.keys()))
예제 #9
0
    def test_with_normalize_overrides(self):
        '''Check key casing when normalize_overrides is supplied.'''
        override_name = 'WARC-Type'
        alternate_name = 'Warc-Type'
        record = NameValueRecord(normalize_overrides=[override_name])

        def verify(value):
            # One entry only, stored under the override spelling and
            # readable through the alternate spelling as well.
            self.assertIn(override_name, record)
            self.assertEqual(value, record[override_name])
            self.assertEqual([(override_name, value)],
                             list(record.get_all()))
            self.assertEqual([value], record.get_list(alternate_name))
            self.assertEqual([override_name], list(record.keys()))

        record.add(override_name, 'warcinfo')
        verify('warcinfo')

        # Item assignment via the alternate spelling overwrites the entry.
        record[alternate_name] = 'resource'
        verify('resource')

        # Names outside the overrides come back in normalized casing.
        record['WARC-Blah'] = 'blah'
        self.assertEqual([override_name, 'Warc-Blah'], list(record.keys()))