Example No. 1
    def process_topic(self, item):
        """Dump topic to file."""
        def prepend_domain(endpoint):
            return item['domain'][:-1] + endpoint if endpoint is not None else None

        # fetch topic pdf, docx urls
        doc_urls = item['doc_urls']
        pdf_url = docx_url = None
        for d in doc_urls:
            _, ext = splitext(basename(urlsplit(d).path))
            if ext.lower() == '.pdf':
                if pdf_url is not None:
                    raise NotSupported('multiple pdf urls for source url: {}'.format(item['source_url']))
                pdf_url = prepend_domain(d.strip())
            elif ext.lower() == '.docx':
                docx_url = prepend_domain(d.strip())
            else:
                raise NotSupported('unsupported doc url file type: {} (extension: {})'.format(d, ext))

        data = {
            'id': item['id'],
            'source_url': item['source_url'],
            'chapter_num': item['chapter_num'],

            'pdf_url': pdf_url,
            'docx_url': docx_url,

            # the following fields can be empty.
            # some topics only link to pdf/docx with no other content.
            'title': (
                self.html_parser.unescape(item['title'].strip())
                if item['title'] is not None
                else None
            ),
            'office': (
                self.html_parser.unescape(item['office'].strip())
                if item['office'] is not None
                else None
            ),
            'body': (
                self.html_parser.unescape(item['body'].strip())
                if item['body'] is not None
                else None
            ),
        }

        self.exporters['topics'].export_item(data)
        return item
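For reference, a hypothetical item shape that process_topic() above expects; the field names are taken from the snippet itself, and the values below are illustrative only.

    item = {
        'id': 42,
        'source_url': 'https://example.gov/topics/42',
        'chapter_num': 3,
        'domain': 'https://example.gov/',   # prepend_domain() strips the trailing slash
        'doc_urls': ['/files/topic-42.pdf', '/files/topic-42.docx'],
        'title': 'Sample &amp; escaped title',
        'office': None,
        'body': None,
    }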
Example No. 2
    def preserve(self, ad: JSONObject) -> None:
        self.logger.debug(ad)
        ad['isDetailed'] = False

        if ad['type'] == 'item' or ad['type'] == 'xlItem':
            timestamp = ad['value']['time']
            id = ad['value']['id']
        elif ad['type'] == 'vip':
            timestamp = ad['value']['list'][0]['value']['time']
            id = ad['value']['list'][0]['value']['id']
        else:
            raise NotSupported()

        if self.last_stamp == timestamp:
            self.page += 1
        else:
            self.last_stamp = timestamp
            self.page = 1

        if self.recent_collection.collection.find_one({'value.id': id}):
            self.broken_ads += 1
            self.broken_ads_in_a_row += 1
        else:
            self.broken_ads_in_a_row = 0
            self.recent_collection.collection.insert_one(ad)

        if self.broken_ads_in_a_row > BROKEN_ADS_THRESHOLD:
            raise CloseSpider("Broken Ads threshold exceeded")
Example No. 3
 async def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     handler = self._get_handler(scheme)
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     return await handler.download_request(request, spider)
Example No. 4
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme  # find the download type of the request, e.g. http
     handler = self._get_handler(scheme)  # look up the handler that matches this type
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     return handler.download_request(request, spider)
Example No. 5
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     handler = self._get_handler(scheme)
     if not handler:
         raise NotSupported(
             f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}"
         )
     return handler.download_request(request, spider)
Example No. 6
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     handler = self._get_handler(scheme)
     # handler is e.g. an HTTP11DownloadHandler instance for http/https URLs
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     return handler.download_request(request, spider)
Example No. 7
 def resolve_item_type(cls, document: JSONObject) -> Type[Enum]:
     if document['type'] == 'item':
         return cls['ITEM']
     elif document['type'] == 'xlItem':
         return cls['XLITEM']
     elif document['type'] == 'vip':
         return cls['VIP']
     else:
         raise NotSupported()
Example No. 8
 def resolve_item_value(document: JSONObject) -> JSONObject:
     """Resolves item value"""
     assert document is not None
     if document['type'] == 'item' or document['type'] == 'xlItem':
         return document['value']
     elif document['type'] == 'vip':
         return document['value']['list'][0]['value']
     else:
         raise NotSupported()
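A quick sketch of the document shapes these resolvers assume, inferred from the branches above; the concrete ids and timestamps are made up.

    item_doc = {'type': 'item', 'value': {'id': 1, 'time': 1600000000}}
    vip_doc = {'type': 'vip',
               'value': {'list': [{'value': {'id': 2, 'time': 1600000001}}]}}

    assert resolve_item_value(item_doc) == {'id': 1, 'time': 1600000000}
    assert resolve_item_value(vip_doc) == {'id': 2, 'time': 1600000001}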
Example No. 9
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     try:
         handler = self._handlers[scheme]
     except KeyError:
         msg = self._notconfigured.get(scheme, 'no handler available for that scheme')
         raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
     return handler(request, spider)
Example No. 10
 def _add_middleware(self, mw):
     if hasattr(mw, 'open_spider'):
         self.methods['open_spider'].append(mw.open_spider)
     if hasattr(mw, 'close_spider'):
         if self.close_spider_order == 'numerical':
             self.methods['close_spider'].append(mw.close_spider)
         elif self.close_spider_order == 'default':
             self.methods['close_spider'].insert(0, mw.close_spider)
         else:
             raise NotSupported('CLOSESPIDER_CALLING_ORDER setting has to be either "default" or "numerical"')
Example No. 11
 def download_request(self, request, spider):
     ## get the scheme (protocol) of the request
     scheme = urlparse_cached(request).scheme
     ## get the download handler registered for this scheme
     handler = self._get_handler(scheme)
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     ## download the request with that handler and return the response
     return handler.download_request(request, spider)
Example No. 12
 def _connect(self, factory):
     host, port = factory.host, factory.port
     if factory.scheme == 'https':
         if ssl_supported:
             return reactor.connectSSL(host, port, factory,
                     self.ClientContextFactory())
         raise NotSupported(
             "HTTPS not supported: install pyopenssl library")
     else:
         return reactor.connectTCP(host, port, factory)
Example No. 13
    def _check_crawlera_settings(self, splash_settings):
        # When using crawlera with splash we use a script to configure it
        # properly, but that means that some options that can be used in
        # render.html can't be used anymore.

        if splash_settings.get('endpoint', 'render.html') != 'render.html':
            raise NotSupported("Splash + Crawlera integration is only "
                "implemented for the render.html endpoint")

        splash_args = splash_settings.get('args', {})
        not_implemented_options = { # option: (allowed values, ...)
            'js': (None, ''),
            'allowed_content_types': (None, ''),
            'forbidden_content_types': (None, ''),
        }
        for option, allowed in not_implemented_options.items():
            if option in splash_args and splash_args[option] not in allowed:
                raise NotSupported(
                    "Splash option '%s' is not compatible with Crawlera" % option
                )
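For illustration, a splash_settings dict that would pass the checks above; the shape is inferred from the snippet, and any keys beyond the ones it checks are illustrative.

    splash_settings = {
        'endpoint': 'render.html',  # only render.html is allowed together with Crawlera here
        'args': {
            'wait': 0.5,            # unrelated options pass through
            # 'js', 'allowed_content_types' and 'forbidden_content_types' must be
            # absent, None or '' or NotSupported is raised
        },
    }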
Example No. 14
 def download_request(self, request, spider):
     # 1. Get the scheme (http/https) of the request
     scheme = urlparse_cached(request).scheme
     # 2. Pick the download handler for that scheme (http, https, ftp, ...)
     handler = self._get_handler(scheme)
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     # 3. Start the download and return the result, e.g. scrapy/core/downloader/handlers/http11.py
     # a. On a download error, each middleware's process_exception is called in turn; note that
     #    process_exception and process_response run in reverse order (see scrapy/core/downloader/middleware.py).
     # b. On success, the result is returned step by step to _next_request_from_scheduler in scrapy/core/engine.py
     return handler.download_request(request, spider)
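As the comments above suggest, the handler is chosen per URL scheme. In Scrapy that mapping comes from the DOWNLOAD_HANDLERS setting; a minimal sketch (disabling a scheme this way is a documented Scrapy convention, the rest is illustrative):

    # settings.py (sketch): a scheme that is missing from the mapping, or set to
    # None, ends up in self._notconfigured, and requests for it raise NotSupported
    # in download_request() above.
    DOWNLOAD_HANDLERS = {
        's3': None,  # disable the built-in S3 handler; s3:// requests now raise NotSupported
    }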
Example No. 15
 def getProxyPool(self, drop_list):
     with pymysql.connect(**self.dbparm) as cursor:
         if len(drop_list) > 0:
             format_str = ','.join(['%s'] * len(drop_list))
             sql = 'update python.proxytable set valid = 1 where address in({})'.format(
                 format_str)
             cursor.execute(sql, tuple(drop_list))
         row_count = cursor.execute(
             'select address from python.proxytable'
             ' where valid = 0 order by rank,update_time desc limit 10')
         if row_count > 0:
             result = [item[0] for item in cursor.fetchall()]
             return result
         else:
             self.proxy_enabled = False
             raise NotSupported('proxy resources are exhausted')
Example No. 16
    def _handle_page_load(self, request, webpage, cookiejar, load_result):
        if cookiejar:
            yield sync_cookies(cookiejar, webpage)

        browser_response = request.meta.get('browser_response', False)

        try:
            ok, status, headers, exc = load_result

            if ok:
                if browser_response:
                    respcls = BrowserResponse
                else:
                    respcls = HtmlResponse

                url = yield webpage.callRemote('get_url')
                encoding, body = yield webpage.callRemote('get_body')
                response = respcls(status=status,
                                   url=url,
                                   headers=headers,
                                   body=body,
                                   encoding=encoding,
                                   request=request)

                if browser_response:
                    response._webpage = PBReferenceMethodsWrapper(webpage)
                    response._semaphore = self._semaphore
                    response._cookiejar = cookiejar

            else:
                if isinstance(exc, ScrapyNotSupported):
                    exc = NotSupported(*exc.args)
                raise exc

        except Exception as err:
            browser_response = False
            response = Failure(err)

        finally:
            if not browser_response:
                try:
                    yield webpage.callRemote('close')
                finally:
                    self._semaphore.release()

        return response
Example No. 17
    def parse_XML(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Example No. 18
    def _handle_page_load(self,
                          request,
                          webpage,
                          load_result=(True, 200, None, None)):
        """

        Handle a request for a web page, either a page load or a request to
        continue using an existing page object.

        """

        try:
            ok, status, headers, exc = load_result

            if ok:
                url = yield webpage.callRemote('get_url')

                browser_response = request.meta.get('browser_response', False)
                if browser_response:
                    respcls = BrowserResponse
                else:
                    respcls = HtmlResponse

                encoding, body = yield webpage.callRemote('get_body')
                response = respcls(status=status,
                                   url=url,
                                   headers=headers,
                                   body=body,
                                   encoding=encoding,
                                   request=request)

                if browser_response:
                    response.webpage = PbReferenceMethodsWrapper(webpage)

            else:
                if isinstance(exc, ScrapyNotSupported):
                    exc = NotSupported(*exc.args)
                raise exc

        except Exception as err:
            response = Failure(err)

        return response
Example No. 19
    def _parse(self, response, **kwargs):
        # Called once the spider receives a response from the parent class's start_requests;
        # depending on the configured iterator, parse the response, call parse_node and return the results.
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)  # call self.adapt_response before running the iterator
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Example No. 20
    def _parse(self, response, **kwargs):
        if not hasattr(self, "parse_node"):
            raise NotConfigured(
                "You must define parse_node method in order to scrape this XML feed"
            )

        response = self.adapt_response(response)
        if self.iterator == "iternodes":
            nodes = self._iternodes(response)
        elif self.iterator == "xml":
            selector = Selector(response, type="xml")
            self._register_namespaces(selector)
            nodes = selector.xpath(f"//{self.itertag}")
        elif self.iterator == "html":
            selector = Selector(response, type="html")
            self._register_namespaces(selector)
            nodes = selector.xpath(f"//{self.itertag}")
        else:
            raise NotSupported("Unsupported node iterator")

        return self.parse_nodes(response, nodes)
Example No. 21
    def start_requests(self):
        channel_number = self.settings.get('CHANNEL_NUMBER')
        channel_list = []
        base_url = 'https://www.pornhubpremium.com/channels/{0}/videos?o=ra'
        with open('channel.txt') as f:
            for channel in f:
                channel = channel.strip()
                if channel != '':
                    channel_list.append(channel)
        # check that CHANNEL_NUMBER does not exceed the number of channels in the list
        if isinstance(channel_number, int) and len(channel_list) < channel_number:
            raise NotSupported(
                'CHANNEL_NUMBER config is bigger than website channel list')

        if channel_number == 'ALL':
            for i in channel_list:
                yield scrapy.Request(base_url.format(i))
        else:
            for i in range(channel_number):
                yield scrapy.Request(base_url.format(channel_list[i]))
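A hedged sketch of the configuration this spider reads; the setting name comes from the code above, the values are illustrative. CHANNEL_NUMBER is expected to be either the string 'ALL' or an int no larger than the number of channels listed in channel.txt.

    # settings.py (sketch)
    CHANNEL_NUMBER = 'ALL'   # crawl every channel listed in channel.txt
    # CHANNEL_NUMBER = 25    # or only the first 25 channels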
Example No. 22
 def _get_web_services(self):
     for ext in self.crawler.extensions.middlewares:
         if isinstance(ext, WebService):
             return "http://{host.host:s}:{host.port:d}".format(
                 host=ext.port.getHost()).encode('utf-8')
     raise NotSupported('Web Not Supported Please Check!')
Example No. 23
 def xpath(self, *a, **kw):
     """Shortcut method implemented only by responses whose content
     is text (subclasses of TextResponse).
     """
     raise NotSupported("Response content isn't text")
Example No. 24
 def xpath(self, *a: object, **kw: object) -> object:
     """Shortcut method implemented only by responses whose content
     is text (subclasses of TextResponse).
     """
     raise NotSupported("Response content isn't text")
Example No. 25
 def __init__(self, *args, **kw):
     raise NotSupported('HTTP1.1 not supported')
Example No. 26
    def parse(self, response):
        goods_all = json.loads(self.json_data)

        skus = goods_all['skus']
        goods_colors = goods_all['goods_colors']
        goods = goods_all['goods']

        if 'title' not in goods.keys():
            raise NotSupported('No title in goods, Pls specify!!')
        title = ''.join(goods['title'].split()).lower()
        make_md5 = hashlib.md5()
        make_md5.update(quote(title, safe='').encode('utf-8'))
        show_product_id = make_md5.hexdigest()

        logging.warning('show_product_id: ' + show_product_id)
        logging.warning('title: ' + goods['title'])
        from_site = goods['from_site']

        skus_colors = []
        skus_sizes = []
        for sku in skus:
            if sku['color'] not in skus_colors:
                skus_colors.append(sku['color'])
            skus_sizes.append(sku['size'])
            if 'goods_current_price' not in dir() or goods_current_price > sku['current_price']:
                goods_current_price = sku['current_price']
            if 'goods_list_price' not in dir() or goods_list_price < sku['list_price']:
                goods_list_price = sku['list_price']
            sku['type'] = 'sku'
            sku['id'] = show_product_id + '-' + sku['color'] + '-' + sku['size']
            sku['is_outof_stock'] = False
            sku['from_site'] = from_site
            sku['show_product_id'] = show_product_id
        goods_colors_colors = []
        for goods_color in goods_colors:
            if 'name' not in goods_color.keys():
                raise NotSupported('ERROR! No name in (color)')
            goods_colors_colors.append(goods_color['name'])
            if 'cover' not in goods.keys():
                goods['cover'] = goods_color['images'][0]['image']
            if ('images' not in goods_color.keys()
                    or len(goods_color['images']) == 0
                    or (len(goods_color['images']) == 1
                        and len(goods_color['images'][0]) == 0)):
                goods_color['images'] = [{'image': goods['cover']}]
                goods_color['cover'] = goods['cover']
            else:
                goods_color['cover'] = goods_color['images'][0]['image']
            goods_color['from_site'] = from_site
            goods_color['show_product_id'] = show_product_id

        # if goods_colors_colors != list(set(goods_colors_colors)):
        #     raise NotSupported('ERROR! Duplicate name in goods_colors')

        if goods_colors_colors == [] or skus_colors == [] or sorted(
                skus_colors) != sorted(goods_colors_colors):
            raise NotSupported('skus_colors: ' + str(skus_colors) + ', ' +
                               'goods_colors_colors: ' +
                               str(goods_colors_colors) + ', \n' +
                               'goods_colors not equal sku_colors')

        for goods_color in goods_colors:
            colorItem = Color()
            colorItem = goods_color
            colorItem['type'] = 'color'
            yield colorItem

        item = BaseItem()
        item = goods
        item['type'] = 'base'
        item['product_type'] = 'json_import_' + goods['product_type_id']
        item['category'] = 'json_import_' + goods['category_id']
        item['product_type_id'] = goods['product_type_id']
        item['category_id'] = goods['category_id']
        item['title'] = goods['title']
        item['show_product_id'] = show_product_id
        item['from_site'] = from_site
        item['colors'] = skus_colors
        item['sizes'] = skus_sizes
        item['current_price'] = goods_current_price
        item['list_price'] = goods_list_price
        if 'groupbuy_num' in goods.keys():
            item['groupbuy_num'] = goods['groupbuy_num']
        item['skus'] = skus
        print('skus', skus)
        if 'desc' in goods.keys():
            item['desc'] = goods['desc']
        if 'weight' in goods.keys():
            item['weight'] = goods['weight']
        yield item
Example No. 27
 def xpath(self, *a, **kw):
     raise NotSupported("Response content isn't text")
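The two xpath() snippets above show the base Response behaviour: a response whose body is not text refuses selector access. A minimal usage sketch, assuming a plain scrapy.http.Response:

    from scrapy.exceptions import NotSupported
    from scrapy.http import Response

    response = Response(url='https://example.com/file.bin', body=b'\x00\x01')
    try:
        response.xpath('//title')
    except NotSupported as exc:
        print(exc)  # Response content isn't text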