示例#1
0
 def login(self):
     ''' Apply the stored user cookies to the session, at most once.

     When self.usercookies holds something other than None or False, the
     cookies are merged into self.session.cookies and the attribute is
     reset to False so the update is not repeated on the next call.
     Any exception is reported through print_green and swallowed.
     '''
     try:
         pending = self.usercookies
         if pending in (None, False):
             # Nothing stored, or the cookies were already applied.
             return
         print_green('登陆...')
         self.session.cookies.update(pending)
         self.usercookies = False
     except Exception as err:
         print_green('登陆出错啦', err)
示例#2
0
 def is_get_right_page(self):
     ''' Tell whether the last response is the page that was asked for.

         Override if needed. The requested link is assumed to point at the
         target page itself, so a response whose first history entry is a
         302 redirect counts as a failure.

         Returns True when the status check passes and no such redirect is
         recorded. Returns False otherwise, including when any exception is
         raised (the error is reported through print_green).
     '''
     try:
         resp = self.response
         resp.raise_for_status()
         redirects = resp.history
         redirected = len(redirects) > 0 and redirects[0].status_code == 302
         return not redirected
     except Exception as err:
         print_green('判断是否获得正确页面出错啦', err)
         return False
示例#3
0
 def save_proxy2mongo(self, data):
     ''' Store a proxy and its related information in mongo.

     data is expected to look like
         {
             'proxy':   {...},
             'cookies': {...},
             'headers': {...}
         }
     and must at least carry a non-None 'proxy' entry.

     Returns the inserted document id when self.collection_name is set.
     Returns None when no collection is configured or when anything fails;
     failures (including the missing 'proxy' entry) are reported through
     print_green rather than raised to the caller.
     '''
     print_green('储存代理')
     try:
         if data.get('proxy') is None:
             raise ValueError('数据格式错误, 没有 proxy 键')
         if self.collection_name is None:
             # No collection configured: nothing is stored.
             return None
         return self.col.insert_one(data).inserted_id
     except Exception as err:
         print_green('储存代理出错啦', err)
         return None
示例#4
0
def parse_details_html_by_selenium(max_retries=3):
    ''' Parse the item cards of the current page with selenium.

    Waits until every '.J_MouserOnverReq' element is present, collects the
    fields of each card into a dict, passes that dict to save_to_db and
    reports the outcome through print_green.

    max_retries : int, how many more attempts are made when waiting for the
        cards times out.

    Returns the number of cards found, or None when the wait kept timing out
    or any other error occurred (errors are reported through print_rd).
    '''
    # NOTE(review): assumes `wait` is a selenium WebDriverWait, whose until()
    # raises selenium's TimeoutException; that class is not a subclass of the
    # builtin TimeoutError, so both are caught below.
    from selenium.common.exceptions import TimeoutException

    try:
        items = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '.J_MouserOnverReq')))

        for index, item in enumerate(items):
            result = {}
            image_ele = item.find_element_by_class_name('J_ItemPic')
            image = image_ele.get_attribute('data-src')

            # Fall back to the plain src attribute when data-src is empty.
            result['image'] = image if image else image_ele.get_attribute(
                'src')
            result['title'] = item.find_element_by_class_name(
                'title').text.strip()
            result['url'] = item.find_element_by_class_name(
                'J_ClickStat').get_attribute('href')
            result['price'] = item.find_element_by_css_selector(
                '.price strong').text
            # Drops the last three characters of the text (presumably a
            # fixed unit suffix -- confirm against the page).
            result['deal_cnt'] = item.find_element_by_class_name(
                'deal-cnt').text[:-3]

            # Probing for the icon through has_element is blunt but direct;
            # counting the children of '.row-1' would only fit this layout.
            result['is_service_free'] = '包邮' if has_element(
                item, '.icon-service-free') else '不包邮'
            result['shop_name'] = item.find_element_by_class_name(
                'shopname').text.strip()
            result['location'] = item.find_element_by_class_name(
                'location').text
            print_green(
                ' ' * 20, index, "*" * 20, '正在用selenium解析',
                '*' * 20,
                '保存到数据库: %s' % ('成功' if save_to_db(result) else '失败', ))
        return len(items)

    except (TimeoutError, TimeoutException) as e:
        print_rd('等待超时', e)
        if max_retries > 0:
            # Hand the retry's result back to the caller instead of
            # discarding it.
            return parse_details_html_by_selenium(max_retries - 1)
    except Exception as e:
        print_rd('parse__by_selenium 出错啦', e)
示例#5
0
    def proxy(self):
        ''' Fetch one proxy record.

            When self.collection_name is set, one document is taken (and
            removed) from self.col first. If no collection is configured or
            it is empty, an address is requested from the proxy pool API
            instead.

            Returns either the stored document or a dict of the form
            {'proxy': {'http': ..., 'https': ...}}. When the pool answers
            with something that is not a single 'host:port' pair, the lookup
            is tried again. Returns None when an exception occurs (it is
            reported through print_green).
        '''
        try:
            test_x = ''
            if self.collection_name is not None:
                test_x = '数据库没有代理,'
                print_green('从数据库获取代理')
                result = self.col.find_one_and_delete({})
                if result is not None:
                    return result

            print_green('%s 从代理池获取代理' % test_x)
            resp = requests.get(PROXY_GETED_API)
            # Must be called: the bare attribute access never checked the
            # HTTP status.
            resp.raise_for_status()
            address = resp.text
            if len(address.split(':')) == 2:
                return {'proxy': {
                    'http': 'http://' + address,
                    'https': 'http://' + address
                }}
            # NOTE(review): the decorator line is not visible here, so
            # `proxy` may be a property or a plain method. Handle both, so a
            # bound method is never handed back in place of a proxy record.
            retried = self.proxy
            return retried() if callable(retried) else retried
        except Exception as err:
            print_green('获取代理出错啦', err)
示例#6
0
        def get_resp_wrapper(self, url, is_login='******', retry=5, is_sleep=True, validate=None, **kwargs):
            ''' Pre-process the result of a session request made through func.

            Covers: logging in before or after the request, detecting a
            redirected (302) response, optional captcha validation, and
            retrying after errors.

            is_login : 'before' -> call self.login() before the request /
                'behind' -> call self.login() after a successful request /
                any other value -> this wrapper performs no login.
                NOTE(review): the default value matches neither 'before' nor
                'behind', so by default no login happens here -- confirm
                that this is intended.
            validate : None -> use self.validate / False -> validation is
                disabled for this call / callable -> used instead of
                self.validate. It is called as validate(referer_url, self)
                and its result is indexed with 'is_continue' and 'result'.
            retry : int, number of further attempts allowed after an
                exception. Once it drops below zero a fresh MsessionReq is
                built and the request starts over with the default count.
            is_sleep : whether to sleep a random 1-7 seconds before a
                successful response is returned. Forwarded to every retry.
            kwargs : passed unchanged to func.

            Returns the response of the attempt that finally succeeded.
            '''
            if retry < 0:
                print_green('错误次数太多, 重新实例化并执行')
                return get_resp_wrapper(
                    MsessionReq(*self.params_tuple, **self.params_dict), url,
                    is_login=is_login, is_sleep=is_sleep, validate=validate,
                    **kwargs)
            try:
                if is_login == 'before':
                    self.login()

                self.response = func(self, url, **kwargs)

                if self.is_get_right_page() is False:
                    print_green('302码, 代理被封:',
                                self.session.proxies['http'])
                    referer_url = self.response.history[0].url
                    self.session.headers.update({'Referer': referer_url})

                    if validate is not None:
                        # The per-call argument takes precedence.
                        if validate is False:
                            self.__is_validate = False
                        else:
                            __validate = validate
                    else:
                        # Fall back to the validator stored on the object.
                        if self.validate is None:
                            self.__is_validate = False
                        else:
                            __validate = self.validate

                    if self.__is_validate is True:
                        # Try to solve the captcha.
                        results = __validate(referer_url, self)

                        if results['is_continue'] is False:
                            MsessionReq.__is_validate = False

                        if results['result'] is not None:

                            self.proxy_info.update(results['result'])
                            # Request the same link again.
                            return get_resp_wrapper(
                                self, url, is_login=is_login,
                                is_sleep=is_sleep, validate=validate,
                                **kwargs)

                    print_green('验证失败, 重新实例化并执行')
                    return get_resp_wrapper(
                        MsessionReq(*self.params_tuple, **self.params_dict),
                        url, is_login=is_login, is_sleep=is_sleep,
                        validate=validate, **kwargs)

                # The page was fetched successfully.
                if is_login == 'behind':
                    self.login()

                self.session.headers.update({'Referer': self.response.url})

                if is_sleep:
                    # Wait a random time only before a successful return, so
                    # failed attempts are not delayed.
                    time.sleep(uniform(1, 7))
                    print_green('succeed 正在使用代理:',
                                self.proxy_info['proxy']['http'])

                return self.response
            except Exception as err:
                print_green('get_resp_wrapper出错啦', err)
                # Keep the same instance and run the wrapper again with one
                # attempt fewer.
                return get_resp_wrapper(
                    self, url, is_login=is_login, retry=retry - 1,
                    is_sleep=is_sleep, validate=validate, **kwargs)
示例#7
0
 def __del__(self):
     ''' Close the proxy database connection when the object is collected.

     The connection is closed only when a collection name was configured.
     Attributes are looked up defensively because __del__ also runs for
     instances whose __init__ raised before assigning them.
     '''
     if getattr(self, 'collection_name', None) is None:
         return
     client = getattr(self, 'client', None)
     if client is not None:
         print_green('关闭代理数据库连接')
         client.close()