Example #1
 def find_many(self, collection_name, filter_dict=None, projection_dict=None, limit_size=0, skip_index=0):
     """
     查找多条表记录,默认返回空数组
     :param collection_name: str 集合名
     :param filter_dict: dict filter_dict: 过滤条件如{'campaignId':123}
     :param projection_dict: dict 返回的字段如{'campaign.status':1,'updated':1,'_id':0}
     :param limit_size: int 限定返回的数据条数
     :param skip_index: int 游标位移
     :return: list 查询到的记录组成的列表,每个元素是一个字典
     """
     result = []
     try:
         collection = self.database.get_collection(collection_name)
         # skip(0) and limit(0) are no-ops in pymongo, so one chained call covers every case
         cursor = collection.find(filter_dict, projection_dict).skip(skip_index).limit(limit_size)
         result = list(cursor)
     except Exception as e:
         Log.e('find data failed: %s' % e)
     finally:
         return result
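A minimal usage sketch for find_many (hedged: the wrapper class name MongoDAO and the instance db below are illustrative assumptions; this example only shows the method itself):

 db = MongoDAO(host='127.0.0.1', port=27017, db_name='spider')   # hypothetical wrapper instance
 rows = db.find_many('campaigns',
                     filter_dict={'campaign.status': 1},
                     projection_dict={'campaignId': 1, '_id': 0},
                     limit_size=10)
 for row in rows:
     print(row)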
Example #2
 def gets_html(url,
               params=None,
               headers=None,
               cookies=None,
               proxies=None,
               charset='UTF-8'):
     '''
     Send an HTTPS GET request.
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :param charset: str encoding used to decode the response
     :return: str response body text
     '''
     html = None
     try:
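         # NOTE: verify=False below disables TLS certificate verification for this request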
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies,
                          verify=False)
         r.encoding = charset
         html = r.text
     except Exception as e:
         Log.e("https get html failed -> " + str(e))
     return html
Example #3
 def find_one_and_replace(self, collection_name, filter_dict, replace_dict, upsert=False, auto_uptime=True):
     """
     查找并更新表记录,默认返回false,保证原子性
     :param collection_name: str 集合名
     :param filter_dict: dict 过滤条件,如{'campaignId':{'$in':[1,2,3]}}
     :param update_dict: dict 更新的字段,如{'$set':{status_key:0,'campaign.status':1},{'$unset':'campaign.name':'test_camp'}}
     :param insert: bool 如果需要更新的记录不存在是否插入
     :param multi: bool 是否更新所有符合条件的记录, False则只更新一条,True则更新所有
     :return: Document 更新成功后的文档
     """
     result = None
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
             replace_dict['uptime']=uptime
             replace_dict['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         document=collection.find_one_and_replace(filter_dict, replace_dict, upsert=upsert,return_document=ReturnDocument.AFTER)
         result = document
         if result is None:
             Log.i("[INFO] find and replace nothing!")
         else:
             Log.d("[INFO] find and replace success!")
     except Exception as e:
         Log.e('find and replace failed: %s' % e)
     finally:
         return result
Example #4
 def posts_html(url,
                data=None,
                headers=None,
                cookies=None,
                proxies=None,
                charset='UTF-8'):
     '''
     Send an HTTPS POST request.
     :param url: str request URL
     :param data: dict POST payload
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :param charset: str encoding used to decode the response
     :return: str response body text
     '''
     html = None
     try:
         r = requests.post(url,
                           data=data,
                           headers=headers,
                           cookies=cookies,
                           proxies=proxies,
                           verify=False)
         r.encoding = charset
         html = r.text
     except Exception as e:
         Log.e("https post html failed -> " + str(e))
     return html
Example #5
 def update(self, collection_name, filter_dict, update_dict, insert=False, multi=False, auto_uptime=True):
     """
     更新表记录,默认返回false
     :param collection_name: str 集合名
     :param filter_dict: dict 过滤条件,如{'campaignId':{'$in':[1,2,3]}}
     :param update_dict: dict 更新的字段,如{'$set':{'status_key:0','campaign.status':1},{'$unset':'campaign.name':'test_camp'}}
     :param insert: bool 如果需要更新的记录不存在是否插入
     :param multi: bool 是否更新所有符合条件的记录, False则只更新一条,True则更新所有
     :return: bool 是否更新成功
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
             if '$set' in update_dict:
                 update_dict['$set']['uptime']=uptime
                 update_dict['$set']['uptimestamp'] = uptimestamp
             else:
                 update_dict['$set']={'uptime':uptime,'uptimestamp':uptimestamp}
         collection = self.database.get_collection(collection_name)
         # pass upsert/multi by keyword so multi is not swallowed by the legacy
         # positional 'manipulate' argument of the old update() signature
         collection.update(filter_dict, update_dict, upsert=insert, multi=multi)
         result = True
         Log.d("update success!")
     except Exception as e:
         Log.e('update failed: %s' % e)
         traceback.print_exc()
     finally:
         return result
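A usage sketch for update (hedged: db is a hypothetical instance of the wrapper class that owns this method):

 ok = db.update('campaigns',
                filter_dict={'campaignId': {'$in': [1, 2, 3]}},
                update_dict={'$set': {'campaign.status': 1}},
                insert=False,   # do not upsert
                multi=True)     # update every matching document
 # with auto_uptime=True (the default), uptime/uptimestamp are stamped into $set automatically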
Example #6
 def get_file(file_name,
              url,
              params=None,
              headers=None,
              cookies=None,
              proxies=None):
     '''
     Send an HTTP GET request and save the response body to a file.
     :param file_name: str path of the file to write
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :return: bool True on success, False on failure
     '''
     success = True
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies,
                          stream=True)  # stream the body instead of loading it fully into memory
         with open(file_name, 'wb') as fd:
             for chunk in r.iter_content(512):
                 fd.write(chunk)
     except Exception as e:
         Log.e("http get file failed -> " + str(e))
         success = False
     return success
Example #7
    def __setconsumer(self):
        """
        返回消费父链接话题的消费者对象
        :return:
        """
        conf = kafka_setting
        context = ssl.create_default_context()
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)

        consumer = None
        try:
            consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                     group_id=conf['consumer_id'],
                                     sasl_mechanism="PLAIN",
                                     ssl_context=context,
                                     security_protocol='SASL_SSL',
                                     api_version=(0, 10),
                                     sasl_plain_username=conf['sasl_plain_username'],
                                     sasl_plain_password=conf['sasl_plain_password'])
        except KafkaError as e:
            Log.e('kafkaConsumer failed: %s' % e)

        return consumer
Example #8
 def insert(self, collection_name, insert_data, auto_uptime=True):
     """
     更新表记录,默认返回false
     :param collection_name: str 集合名
     :param insert_data: dict 插入的数据,如{'campaignId':{'$in':[1,2,3]}}
     :return: bool 是否更新成功
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime(
                 '%Y-%m-%d %H:%M:%S.%f')[:-3]
             if isinstance(insert_data, dict):
                 insert_data['uptime'] = uptime
                 insert_data['uptimestamp'] = uptimestamp
             elif isinstance(insert_data, list):
                 # dicts are mutated in place, no need to rebuild the list
                 for data in insert_data:
                     data['uptime'] = uptime
                     data['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         collection.insert(insert_data)
         result = True
         Log.d("insert success!")
     except Exception as e:
         Log.e('insert failed: %s' % e)
     finally:
         return result
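A usage sketch for insert (hedged: db is a hypothetical instance of the wrapper class):

 db.insert('campaigns', {'campaignId': 123, 'name': 'test_camp'})      # single document
 db.insert('campaigns', [{'campaignId': 124}, {'campaignId': 125}])    # several documents at once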
Example #9
 def replace(self,
             collection_name,
             filter_dict,
             replace_data,
             auto_uptime=True):
     """
     替换文档,默认返回false
     :param collection_name: str 集合名
     :param filter_dict: dict 查询条件,如{'campaignId':{'$in':[1,2,3]}}
     :param replace_data: dict 替换的数据,如{'campaignId':{'$in':[4,5,6]}}
     :return: bool 是否更新成功
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime(
                 '%Y-%m-%d %H:%M:%S.%f')[:-3]
             replace_data['uptime'] = uptime
             replace_data['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         collection.replace_one(filter_dict, replace_data)
         result = True
         Log.d("remove success!")
     except Exception as e:
         Log.e('remove failed: %s' % e)
     finally:
         return result
Example #10
 def __init__(self, host=None, port=None, db_name=None, mechanism=None, user=None, password=None):
     """
     初始化对象,链接数据库
     :param host: mongo数据库所在服务器地址
     :param port: mongo数据库端口
     :param db_name: 数据库的名称
     :param mechanism: 认证类型,None:无认证,MONGODB-CR:2.x认证,SCRAM-SHA-1:3.x认证
     :param user:用户名
     :param password:密码
     :return: 无返回值
     """
     if host is None:
         host = Setting.MONGO_HOST
     if port is None:
         port = Setting.MONGO_PORT
     if db_name is None:
         db_name = Setting.MONGO_DB
     if mechanism is None:
         mechanism = Setting.MONGO_MECHANISM
     if user is None:
         user = Setting.MONGO_USER
     if password is None:
         password = Setting.MONGO_PASSWORD
     try:
         Log.d('start connect mongo')
         self.client = None
         self.client = MongoClient(host, int(port))
         self.database = self.client.get_database(db_name)
         if mechanism is not None:
             self.database.authenticate(user,password,mechanism=mechanism)
         Log.d('mongo connect success')
     except Exception as e:
         self.close_conn()
         Log.e('init mongo bar failed: %s' % e)
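A construction sketch (hedged: the class name MongoDAO is hypothetical since only the __init__ body is shown; with no arguments every setting falls back to the Setting.MONGO_* defaults):

 mongo = MongoDAO()                                    # use Setting.MONGO_* defaults
 mongo = MongoDAO(host='127.0.0.1', port=27017,
                  db_name='spider', mechanism='SCRAM-SHA-1',
                  user='crawler', password='secret')   # explicit connection parameters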
Example #11
 def get_json(url,
              params=None,
              headers=None,
              cookies=None,
              proxies=None,
              charset='UTF-8'):
     '''
     Send an HTTP GET request and parse the response as JSON.
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :param charset: str encoding used to decode the response
     :return: the parsed JSON object (None on failure)
     '''
     html = None
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies)
         r.encoding = charset
         html = r.json()
     except Exception as e:
         Log.e("http get json failed -> " + str(e))
     return html
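A usage sketch for get_json (httpbin.org is used here purely as an illustrative endpoint):

 data = get_json('https://httpbin.org/get', params={'q': 'test'})
 if data is not None:
     print(data['args'])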
Example #12
 def insert(self, item, collection_name=None):
     if collection_name is not None:
         collection = self.db.get_collection(collection_name)
         try:
             return len(collection.insert(item))
         except Exception as e:
             Log.e("mongo insert failed -> " + str(e))
             return 0
     else:
         try:
             return len(self.collection.insert(item))
         except Exception as e:
             Log.e("mongo insert failed -> " + str(e))
             return 0
Example #13
    def __setconsumer(self):
        """
        返回消费父链接话题的消费者对象
        :return:
        """
        conf = localKafka_setting
        try:
            consumer = KafkaConsumer(
                bootstrap_servers=conf['bootstrap_servers'],
                group_id=conf['consumer_id'])
        except KafkaError as e:
            Log.e(e + 'kafkaConsumer failed')

        return consumer
Example #14
 def get_header(url, params=None, headers=None, cookies=None, proxies=None):
     '''
     Send an HTTP GET request and return only the response headers.
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :return: dict response headers
     '''
     html = None
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies)
         html = r.headers
     except Exception as e:
         Log.e("http get header failed -> " + str(e))
     return html
Example #15
def getIpProxyPoolFromeRemote():
    """
    Fetch a free IP proxy directly from the remote service.
    :return: a usable IP proxy, or None
    """
    if USE_PROXY is False:
        return None

    try:
        # Log.i('fetching proxy...')
        resp = requests.get(PROXY_REMOTE_URL, timeout=TIMEOUT)
        return resp.text
    except Exception:
        Log.e('Failed to fetch proxy info; make sure the proxy service is running')
        return None
Example #16
 def find_one(self, collection_name, filter_dict=None, projection_dict=None):
     """
     查找一条表记录,默认返回空字典
     :param collection_name: str 集合名
     :param filter_dict: dict 过滤条件如{'campaignId':123}
     :param projection_dict: dict 返回的字段如{'campaign.status':1,'updated':1,'_id':0}
     :return: dict 查找到的数据
     """
     result = {}
     try:
         collection = self.database.get_collection(collection_name)
         result = collection.find_one(filter_dict, projection_dict)
     except Exception as e:
         Log.e('find data failed: %s' % e)
     finally:
         return result
Example #17
 def count(self, collection_name, filter_dict=None):
     """
     查找表记录条数,默认返回0
     :param collection_name: str 集合名
     :param table_name: str 表名
     :param filter_dict: dict 过滤条件
     :return: int 表记录条数
     """
     tab_size = 0
     try:
         collection = self.database.get_collection(collection_name)
         tab_size = collection.find(filter_dict).count()
     except Exception as e:
         Log.e('get table size failed: %s' % e)
     finally:
         return tab_size
Example #18
 def delete(self, collection_name, filter_dict):
     """
     更新表记录,默认返回false
     :param collection_name: str 集合名
     :param filter_dict: dict 查询条件,如{'campaignId':{'$in':[1,2,3]}}
     :return: bool 是否更新成功
     """
     result = False
     try:
         collection = self.database.get_collection(collection_name)
         collection.remove(filter_dict)
         result = True
         Log.d("remove success!")
     except Exception as e:
         Log.e('remove failed: %s' % e)
     finally:
         return result
Example #19
 def producterUUID(self, strurl):
     """
     生产ggcp话题的uuid
     :param strurl:
     """
     try:
         conf = kafka_setting
         #TODO 抛出异常kafka.errors.KafkaTimeoutError: KafkaTimeoutError: Failed to update metadata after 60.0 secs.
         future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
         self.producer.flush()
         future.get()
     except KafkaError as e:
         self.producer.close()
         if self.producer is None:
             self.producer = self.__setproducer()
         Log.e(e+'send message failed')
         pass
Example #20
def update_proxy():
    """
    Fetch and validate a proxy IP address.
    :return:
    """
    if USE_PROXY:
        i = 0
        while True:
            try:
                get_proxy()
                notify_ip_address()
                return True
            except Exception:
                i += 1
                Log.e("Failed to get proxy, retrying (attempt %s)" % (i, ))
    else:
        Log.i('notify address')
        notify_ip_address()
Example #21
 def producerUrl(self, strurl):
     """
     生产父链接
     :param strurl:
     """
     try:
         conf = kafka_setting
         future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
         self.producer.flush()
         future.get()
     except KafkaError as e:
         #TODO 异常kafka.errors.KafkaTimeoutError: KafkaTimeoutError: Failed to update metadata after 60.0 secs处理
         #https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
         self.producer.close()
         if self.producer is None:
             self.producer = self.__setproducer()
         Log.e(e+'send message failed')
         pass
Example #22
 def consumerurl(self,pipeDictData):
     """
     消费父链接
     :param queueDictData:
     """
     conf = kafka_setting
     self.consumer.subscribe((conf['topic_name']))
     # TODO 这里阻塞是消费者连接超时,底层SDK主动调用断线重连API,监听数据回调(永久死循环,无JB优化了)
     for message in self.consumer:
         jsondata = str(message.value, "utf-8")
         # Log.i(jsondata)
         try:
             dictdata = json.loads(jsondata)
         except Exception as e:
             Log.e(e + jsondata)
             continue
         # self.setURL_inf(dictdata)
         #发送源数据,驱动下载器
         pipeDictData.send(dictdata)
Example #23
def get_proxy():
    """
    Fetch a proxy IP and update the controller-level PROXIES.
    :return: a usable proxy dict, or None
    """
    global PROXIES  # update the module-level PROXIES instead of a local variable
    if USE_PROXY is False:
        return None

    try:
        Log.i('fetching proxy...')
        resp = requests.get(PROXY_URL, timeout=TIMEOUT)
        ip_address = resp.text
        proxies = {'http': ip_address, 'https': ip_address}
        # Log.i(proxies)
        PROXIES = proxies
        return PROXIES
    except Exception:
        Log.e('Failed to fetch proxy info; make sure the proxy service is running')
        return None
Example #24
    def downLoadHtml(self, data):
        """
        爬取并提取子链接
        :param urlInfor:
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()

        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}

        html = None  # full page content
        ctifety = 0  # flag: child links should be parsed
        Flag = 1     # flag: crawl still in progress
        count = 0    # counter of failed/empty fetch attempts
        while (Flag):
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()

                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}

                req = requests.get(data['Urlname'],
                                   headers=self.heads,
                                   allow_redirects=False,
                                   proxies=proxiesmmm,
                                   timeout=3)
                # skip the site's anti-crawling verification page
                soup_validate = BeautifulSoup(req.text, 'lxml')
                title_tag = soup_validate.find(name='title')
                if title_tag is not None and title_tag.string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    data = self.__downlowdFile(data=data, req=req)
                    data['Download'] = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    data['Download'] = 0
                    ctifety = 1
                    Flag = 0  # done, return to the main flow
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # close spare connections; works around the requests "Max retries exceeded with url" error
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
                if html is None:
                    Flag = 1
                # if count > 4:
                #     return None
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle javascript:void(0) links by ignoring them: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle invalid header errors: Invalid return character or leading space in header: Accept-Language
                # TODO handle HTTPConnectionPool max retries / Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # work around the "Max retries exceeded with url" error
                s = requests.session()
                s.keep_alive = False
                if html is None:
                    Flag = 1
                pass

        if ctifety:
            data['content'] = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None

        data['soup'] = soup
        # Log.i(data['content'].decode('utf-8'))
        return data
Example #25
    def __getSoupAndDeepnumOrDown(self, ipProxy, headers = []):
        """
        爬虫并简单解析子链接
        :param proxiesmmm:
        :param headers:
        """

        html = None   # full page content
        ctifety = 0   # flag: child links should be parsed
        Flag = 1      # flag: crawl still in progress
        count = 0     # counter of failed/empty fetch attempts

        # initialize the request headers
        headers = HEADERS
        headersEngine = HeadersEngine()

        # main download loop for page content and files; heavy, slow I/O
        while (Flag):
            url = self.URL_inf.Urlname  # local shortcut to the target URL
            try:
                # switch to a fresh proxy IP after any failure
                if count > 0:
                    ipProxy = self.getIpPoolMethod()
                protocol = 'https' if 'https' in ipProxy else 'http'
                proxiesmmm = {protocol: ipProxy}
                # plain HTTP request via requests
                # req = requests.get(url, headers=headers, proxies=proxiesmmm, timeout=2)  # ,proxies=proxiesmmm,stream=True
                # req = requests.get(url, headers=headers, proxies=proxiesmmm)
                # avoid HTTP timeout issues (https://www.zhihu.com/question/52595659) and refuse the default 301/302 redirects
                req = requests.get(url, headers=headers, allow_redirects=False, proxies=proxiesmmm, timeout=3)

                if req.status_code != 200:
                    return None

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    self.__downlowdFile(url=url, req=req)
                    self.URL_inf.Download = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, return to the main flow
                else:
                    return None
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                count += 1  # count the failure so the proxy is rotated and retries stay bounded
                if count > 3:
                    return None
                pass
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count+=1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                headers = headersEngine.getHeaders()
                # close spare connections; works around the requests "Max retries exceeded with url" error
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle javascript:void(0) links by ignoring them: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle invalid header errors: Invalid return character or leading space in header: Accept-Language
                # TODO handle HTTPConnectionPool max retries / Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                headers = headersEngine.getHeaders()
                # work around the "Max retries exceeded with url" error
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass


        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None

        self.URL_inf.soup = soup
        # Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf  # crawling and first-pass parsing complete