def find_many(self, collection_name, filter_dict=None, projection_dict=None, limit_size=0, skip_index=0):
    """
    Find multiple records; returns an empty list on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': 123}
    :param projection_dict: dict  fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
    :param limit_size: int  maximum number of records to return (0 = no limit)
    :param skip_index: int  cursor offset
    :return: the matched documents, each one a dict
    """
    result = []
    try:
        collection = self.database.get_collection(collection_name)
        # skip(0) and limit(0) are no-ops, so the cursor can always be built the same way
        result = collection.find(filter_dict, projection_dict).skip(skip_index).limit(limit_size)
    except Exception as e:
        Log.e('find data failed: %s' % e)
    finally:
        return result
def gets_html(url, params=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTPS GET request (certificate verification disabled).
    :param url: str  request URL
    :param params: dict  query parameters
    :param headers: dict  custom request headers
    :param cookies: dict  site cookies
    :param proxies: dict  proxies
    :param charset: str  encoding used to decode the response
    :return: str  response body as text, or None on failure
    '''
    html = None
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies, verify=False)
        r.encoding = charset
        html = r.text
    except Exception as e:
        Log.e("https get html failed -> " + str(e))
    return html
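# A minimal usage sketch for the HTTP GET helper above. The URL, query parameters, header
# values and proxy address are illustrative assumptions, not values used elsewhere here.
def _example_gets_html():
    html = gets_html('https://example.com/search',
                     params={'kw': 'laptop'},
                     headers={'User-Agent': 'Mozilla/5.0'},
                     proxies={'https': 'https://127.0.0.1:8888'})
    if html is not None:
        print(html[:200])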
def find_one_and_replace(self, collection_name, filter_dict, replace_dict, upsert=False, auto_uptime=True):
    """
    Find one record and replace it atomically; returns None on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param replace_dict: dict  replacement document
    :param upsert: bool  insert the document if no record matches
    :param auto_uptime: bool  automatically stamp 'uptime'/'uptimestamp' on the replacement
    :return: Document  the document after replacement, or None
    """
    result = None
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            replace_dict['uptime'] = uptime
            replace_dict['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        result = collection.find_one_and_replace(filter_dict, replace_dict,
                                                 upsert=upsert, return_document=ReturnDocument.AFTER)
        if result is None:
            Log.i("[INFO] find and replace matched nothing!")
        else:
            Log.d("[INFO] find and replace success!")
    except Exception as e:
        Log.e('find and replace failed: %s' % e)
    finally:
        return result
def posts_html(url, data=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTPS POST request (certificate verification disabled).
    :param url: str  request URL
    :param data: dict  POST payload
    :param headers: dict  custom request headers
    :param cookies: dict  site cookies
    :param proxies: dict  proxies
    :param charset: str  encoding used to decode the response
    :return: str  response body as text, or None on failure
    '''
    html = None
    try:
        r = requests.post(url, data=data, headers=headers, cookies=cookies, proxies=proxies, verify=False)
        r.encoding = charset
        html = r.text
    except Exception as e:
        Log.e("https post html failed -> " + str(e))
    return html
def update(self, collection_name, filter_dict, update_dict, insert=False, multi=False, auto_uptime=True):
    """
    Update records; returns False on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param update_dict: dict  update operators, e.g. {'$set': {'status_key': 0, 'campaign.status': 1}, '$unset': {'campaign.name': 'test_camp'}}
    :param insert: bool  insert the document if no record matches (upsert)
    :param multi: bool  update every matching record if True, only the first if False
    :param auto_uptime: bool  automatically stamp 'uptime'/'uptimestamp' via $set
    :return: bool  whether the update succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            if '$set' in update_dict:
                update_dict['$set']['uptime'] = uptime
                update_dict['$set']['uptimestamp'] = uptimestamp
            else:
                update_dict['$set'] = {'uptime': uptime, 'uptimestamp': uptimestamp}
        collection = self.database.get_collection(collection_name)
        # pass upsert/multi as keyword arguments: positionally, the fourth argument of
        # Collection.update() is 'manipulate', not 'multi'
        collection.update(filter_dict, update_dict, upsert=insert, multi=multi)
        result = True
        Log.d("update success!")
    except Exception as e:
        Log.e('update failed: %s' % e)
        traceback.print_exc()
    finally:
        return result
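# Minimal usage sketch for update() above. The collection name, filter and field names are
# assumptions for illustration; `mongo` stands for an instance of this wrapper class.
def _example_update(mongo):
    ok = mongo.update('campaigns',
                      filter_dict={'campaignId': 123},
                      update_dict={'$set': {'campaign.status': 1}},
                      insert=False,
                      multi=True)
    if not ok:
        Log.e('example update failed')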
def get_file(file_name, url, params=None, headers=None, cookies=None, proxies=None):
    '''
    Download a file over HTTP GET and write it to file_name.
    :return: bool  whether the download succeeded
    '''
    success = True
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        with open(file_name, 'wb') as fd:
            for chunk in r.iter_content(512):
                fd.write(chunk)
    except Exception as e:
        Log.e("http get file failed -> " + str(e))
        success = False
    return success
def __setconsumer(self):
    """
    Build and return the consumer for the parent-link topic (SASL_SSL).
    :return: KafkaConsumer, or None if construction fails
    """
    conf = kafka_setting
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.verify_mode = ssl.CERT_REQUIRED
    # context.check_hostname = True
    context.load_verify_locations(CACERT_FILENAME)
    consumer = None
    try:
        consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                 group_id=conf['consumer_id'],
                                 sasl_mechanism="PLAIN",
                                 ssl_context=context,
                                 security_protocol='SASL_SSL',
                                 api_version=(0, 10),
                                 sasl_plain_username=conf['sasl_plain_username'],
                                 sasl_plain_password=conf['sasl_plain_password'])
    except KafkaError as e:
        Log.e('kafkaConsumer failed: %s' % e)
    return consumer
def insert(self, collection_name, insert_data, auto_uptime=True):
    """
    Insert one record or a list of records; returns False on failure.
    :param collection_name: str  collection name
    :param insert_data: dict or list  document(s) to insert, e.g. {'campaignId': 123, 'campaign': {'status': 1}}
    :param auto_uptime: bool  automatically stamp 'uptime'/'uptimestamp'
    :return: bool  whether the insert succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            if isinstance(insert_data, dict):
                insert_data['uptime'] = uptime
                insert_data['uptimestamp'] = uptimestamp
            elif isinstance(insert_data, list):
                for data in insert_data:
                    data['uptime'] = uptime
                    data['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        collection.insert(insert_data)
        result = True
        Log.d("insert success!")
    except Exception as e:
        Log.e('insert failed: %s' % e)
    finally:
        return result
def replace(self, collection_name, filter_dict, replace_data, auto_uptime=True):
    """
    Replace one document; returns False on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param replace_data: dict  replacement document, e.g. {'campaignId': {'$in': [4, 5, 6]}}
    :param auto_uptime: bool  automatically stamp 'uptime'/'uptimestamp' on the replacement
    :return: bool  whether the replace succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            replace_data['uptime'] = uptime
            replace_data['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        collection.replace_one(filter_dict, replace_data)
        result = True
        Log.d("replace success!")
    except Exception as e:
        Log.e('replace failed: %s' % e)
    finally:
        return result
def __init__(self, host=None, port=None, db_name=None, mechanism=None, user=None, password=None):
    """
    Initialize the object and connect to the database.
    :param host: MongoDB server address
    :param port: MongoDB port
    :param db_name: database name
    :param mechanism: authentication type: None = no auth, MONGODB-CR = 2.x auth, SCRAM-SHA-1 = 3.x auth
    :param user: username
    :param password: password
    :return: no return value
    """
    if host is None:
        host = Setting.MONGO_HOST
    if port is None:
        port = Setting.MONGO_PORT
    if db_name is None:
        db_name = Setting.MONGO_DB
    if mechanism is None:
        mechanism = Setting.MONGO_MECHANISM
    if user is None:
        user = Setting.MONGO_USER
    if password is None:
        password = Setting.MONGO_PASSWORD
    try:
        Log.d('start connect mongo')
        self.client = None
        self.client = MongoClient(host, int(port))
        self.database = self.client.get_database(db_name)
        if mechanism is not None:
            self.database.authenticate(user, password, mechanism=mechanism)
        Log.d('mongo connect success')
    except Exception as e:
        self.close_conn()
        Log.e('init mongo bar failed: %s' % e)
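# Minimal connect-and-query sketch for the wrapper above. `MongoBar` is an assumed name for
# the enclosing class (its real name is not shown in these fragments); the collection and
# field names are also illustrative assumptions.
def _example_mongo_query():
    mongo = MongoBar()  # falls back to the Setting.MONGO_* defaults
    doc = mongo.find_one('campaigns',
                         filter_dict={'campaignId': 123},
                         projection_dict={'campaign.status': 1, '_id': 0})
    Log.d('found: %s' % doc)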
def get_json(url, params=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTP GET request and parse the response as JSON.
    :param url: str  request URL
    :param params: dict  query parameters
    :param headers: dict  custom request headers
    :param cookies: dict  site cookies
    :param proxies: dict  proxies
    :param charset: str  encoding used to decode the response
    :return: the parsed JSON object, or None on failure
    '''
    result = None
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        r.encoding = charset
        result = r.json()
    except Exception as e:
        Log.e("http get json failed -> " + str(e))
    return result
def insert(self, item, collection_name=None):
    # Insert into the named collection if given, otherwise into the default collection;
    # returns the number of inserted ids, or 0 on failure.
    if collection_name is not None:
        collection = self.db.get_collection(collection_name)
        try:
            return len(collection.insert(item))
        except Exception as e:
            Log.e("mongo insert failed -> " + str(e))
            return 0
    else:
        try:
            return len(self.collection.insert(item))
        except Exception as e:
            Log.e("mongo insert failed -> " + str(e))
            return 0
def __setconsumer(self):
    """
    Build and return the consumer for the parent-link topic (local, plaintext).
    :return: KafkaConsumer, or None if construction fails
    """
    conf = localKafka_setting
    consumer = None
    try:
        consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                 group_id=conf['consumer_id'])
    except KafkaError as e:
        Log.e('kafkaConsumer failed: %s' % e)
    return consumer
def get_header(url, params=None, headers=None, cookies=None, proxies=None):
    '''
    Send an HTTP GET request and return the response headers.
    :param url: str  request URL
    :return: dict  response headers, or None on failure
    '''
    result = None
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        result = r.headers
    except Exception as e:
        Log.e("http get header failed -> " + str(e))
    return result
def getIpProxyPoolFromeRemote():
    """
    Fetch a free IP proxy directly from the remote proxy service.
    :return: a usable IP proxy, or None
    """
    if USE_PROXY is False:
        return None
    try:
        # Log.i('fetching proxy...')
        resp = requests.get(PROXY_REMOTE_URL, timeout=TIMEOUT)
        return resp.text
    except Exception as e:
        Log.e('failed to fetch proxy info, please check that the proxy service is running')
        return None
def find_one(self, collection_name, filter_dict=None, projection_dict=None):
    """
    Find a single record; returns an empty dict on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': 123}
    :param projection_dict: dict  fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
    :return: dict  the matched document (None if nothing matches)
    """
    result = {}
    try:
        collection = self.database.get_collection(collection_name)
        result = collection.find_one(filter_dict, projection_dict)
    except Exception as e:
        Log.e('find data failed: %s' % e)
    finally:
        return result
def count(self, collection_name, filter_dict=None):
    """
    Count matching records; returns 0 on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions
    :return: int  number of matching records
    """
    tab_size = 0
    try:
        collection = self.database.get_collection(collection_name)
        tab_size = collection.find(filter_dict).count()
    except Exception as e:
        Log.e('get table size failed: %s' % e)
    finally:
        return tab_size
def delete(self, collection_name, filter_dict):
    """
    Delete matching records; returns False on failure.
    :param collection_name: str  collection name
    :param filter_dict: dict  filter conditions, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :return: bool  whether the delete succeeded
    """
    result = False
    try:
        collection = self.database.get_collection(collection_name)
        collection.remove(filter_dict)
        result = True
        Log.d("remove success!")
    except Exception as e:
        Log.e('remove failed: %s' % e)
    finally:
        return result
def producterUUID(self, strurl):
    """
    Produce a UUID message to the ccgp topic.
    :param strurl:
    """
    try:
        conf = kafka_setting
        # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
        future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
        self.producer.flush()
        future.get()
    except KafkaError as e:
        # close the broken producer and rebuild it before the next send
        self.producer.close()
        self.producer = self.__setproducer()
        Log.e('send message failed: %s' % e)
def update_proxy():
    """
    Fetch and validate a proxy IP address.
    :return:
    """
    if USE_PROXY:
        i = 0
        while True:
            try:
                get_proxy()
                notify_ip_address()
                return True
            except Exception:
                i += 1
                Log.e("failed to fetch proxy, retrying (attempt %s)" % (i,))
    else:
        Log.i('notify address')
        notify_ip_address()
def producerUrl(self, strurl):
    """
    Produce a parent link.
    :param strurl:
    """
    try:
        conf = kafka_setting
        future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
        self.producer.flush()
        future.get()
    except KafkaError as e:
        # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
        # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
        # close the broken producer and rebuild it before the next send
        self.producer.close()
        self.producer = self.__setproducer()
        Log.e('send message failed: %s' % e)
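# Usage sketch for the producer methods above, assuming `spider` is an instance of the
# enclosing class with self.producer already built and kafka_setting pointing at a reachable
# cluster; the URL and UUID values are placeholders for illustration only.
def _example_produce(spider):
    spider.producerUrl('http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1')
    spider.producterUUID('00000000-0000-0000-0000-000000000000')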
def consumerurl(self, pipeDictData):
    """
    Consume parent links.
    :param pipeDictData: pipe endpoint that receives each decoded message dict
    """
    conf = kafka_setting
    self.consumer.subscribe([conf['topic_name']])
    # TODO blocking here means the consumer connection timed out; the underlying SDK reconnects
    # automatically, and this loop listens for message callbacks forever
    for message in self.consumer:
        jsondata = str(message.value, "utf-8")
        # Log.i(jsondata)
        try:
            dictdata = json.loads(jsondata)
        except Exception as e:
            Log.e('decode message failed: %s %s' % (e, jsondata))
            continue
        # self.setURL_inf(dictdata)
        # forward the source data to drive the downloader
        pipeDictData.send(dictdata)
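# Sketch of wiring consumerurl() to a downstream stage through a multiprocessing Pipe.
# `spider` is assumed to be an instance of the enclosing class; consumerurl() blocks forever,
# so it runs in a child process here while the parent end receives the decoded dicts.
def _example_consume_with_pipe(spider):
    from multiprocessing import Pipe, Process
    parent_conn, child_conn = Pipe()
    worker = Process(target=spider.consumerurl, args=(child_conn,))
    worker.start()
    while True:
        dictdata = parent_conn.recv()  # one decoded Kafka message per call
        Log.i('received url: %s' % dictdata.get('Urlname'))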
def get_proxy():
    """
    Fetch a proxy IP and update the controller-level PROXIES.
    :return: a usable proxies dict, or None
    """
    global PROXIES
    if USE_PROXY is False:
        return None
    try:
        Log.i('fetching proxy...')
        resp = requests.get(PROXY_URL, timeout=TIMEOUT)
        ip_address = resp.text
        proxies = {'http': ip_address, 'https': ip_address}
        # Log.i(proxies)
        PROXIES = proxies
        return PROXIES
    except Exception as e:
        Log.e('failed to fetch proxy info, please check that the proxy service is running')
        return None
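# Sketch of passing the proxies dict returned by get_proxy() straight to requests; the target
# URL is a placeholder assumption.
def _example_request_via_proxy():
    proxies = get_proxy()
    if proxies:
        resp = requests.get('http://example.com', proxies=proxies, timeout=TIMEOUT)
        Log.i('proxy request status: %s' % resp.status_code)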
def downLoadHtml(self, data):
    """
    Crawl the page and extract sub-links.
    :param data: dict describing the URL to crawl (see the example below)
    """
    if self.ipProxy is None:
        self.ipProxy = self.getIpPoolMethod()
    if self.heads is None:
        self.heads = self.headersEngine.getHeaders()
    # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
    #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
    #  'content': None,
    #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
    #  'SleepTime': 0.0, 'FilePath': None}
    html = None     # crawled page content
    ctifety = 0     # sub-link parse flag
    Flag = 1        # crawl-complete flag
    count = 0       # empty-page counter
    while Flag:
        try:
            # switch IP after repeated failures
            if count > 1:
                self.ipProxy = self.getIpPoolMethod()
            protocol = 'https' if 'https' in self.ipProxy else 'http'
            proxiesmmm = {protocol: self.ipProxy}
            req = requests.get(data['Urlname'], headers=self.heads, allow_redirects=False,
                               proxies=proxiesmmm, timeout=3)
            # detect the anti-crawler verification page and retry with a fresh proxy
            soup_validate = BeautifulSoup(req.text, 'lxml')
            if soup_validate.find(name='title').string == '安全验证':
                self.ipProxy = self.getIpPoolMethod()
                continue
            if req.status_code != 200:
                self.ipProxy = self.getIpPoolMethod()
                continue
            reqheaders = req.headers
            if "application" in reqheaders["Content-Type"]:
                data = self.__downlowdFile(data=data, req=req)
                data['Download'] = 1
            elif "text" in reqheaders["Content-Type"]:
                html = req.content
                data['Download'] = 0
                ctifety = 1
                Flag = 0  # done, rejoin the main flow
            else:
                continue
        except requests.exceptions.ConnectTimeout as e:
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            count += 1
            if html is None:
                Flag = 1
        except (ConnectionError, Timeout) as e:
            Flag = 1
            count += 1
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            # close extra connections; works around requests "Max retries exceeded with url" errors
            requests.adapters.DEFAULT_RETRIES = 5
            s = requests.session()
            s.keep_alive = False
            if html is None:
                Flag = 1
            # if count > 4:
            #     return None
        except Exception as e:
            Flag = 1
            count += 1
            # TODO handle javascript:void(0) links; this kind of exception can be ignored:
            #      https://www.zhihu.com/question/20626694?from=profile_question_card
            # TODO handle "Invalid return character or leading space in header: Accept-Language"
            # TODO handle "httpconnectionpool max retries Failed to establish a new connection"
            Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
            self.heads = self.headersEngine.getHeaders()
            # workaround for "Max retries exceeded with url" errors
            s = requests.session()
            s.keep_alive = False
            count += 1
            if html is None:
                Flag = 1
    if ctifety:
        data['content'] = html
        soup = BeautifulSoup(html, 'html.parser')  # quick first-pass parse with BeautifulSoup
    else:
        soup = None
    data['soup'] = soup
    # Log.i(data['content'].decode('utf-8'))
    return data
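# Sketch of driving downLoadHtml() with a minimal `data` dict; the keys mirror the example
# dict in the comment inside the method, and `crawler` is assumed to be an instance of the
# enclosing class with headersEngine/getIpPoolMethod available.
def _example_download(crawler):
    data = {'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1',
            'Download': False, 'content': None, 'soup': None}
    result = crawler.downLoadHtml(data)
    if result['soup'] is not None:
        links = [a.get('href') for a in result['soup'].find_all('a')]
        Log.i('extracted %d candidate links' % len(links))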
def __getSoupAndDeepnumOrDown(self, ipProxy, headers=[]):
    """
    Crawl the page and do a first-pass parse of sub-links.
    :param ipProxy:
    :param headers:
    """
    html = None     # crawled page content
    ctifety = 0     # sub-link parse flag
    Flag = 1        # crawl-complete flag
    count = 0       # empty-page counter
    # initialize request headers
    headers = HEADERS
    headersEngine = HeadersEngine()
    # main download loop for page content and files; heavy, time-consuming I/O
    while Flag:
        url = self.URL_inf.Urlname  # working copy
        try:
            # switch IP as soon as an error has occurred
            if count > 0:
                ipProxy = self.getIpPoolMethod()
            protocol = 'https' if 'https' in ipProxy else 'http'
            proxiesmmm = {protocol: ipProxy}
            # req = requests.get(url, headers=headers, proxies=proxiesmmm, timeout=2)  # ,proxies=proxiesmmm,stream=True
            # req = requests.get(url, headers=headers, proxies=proxiesmmm)
            # avoid HTTP timeouts (https://www.zhihu.com/question/52595659) and refuse default 301/302 redirects
            req = requests.get(url, headers=headers, allow_redirects=False, proxies=proxiesmmm, timeout=3)
            if req.status_code != 200:
                return None
            reqheaders = req.headers
            if "application" in reqheaders["Content-Type"]:
                self.__downlowdFile(url=url, req=req)
                self.URL_inf.Download = 1
            elif "text" in reqheaders["Content-Type"]:
                html = req.content
                self.URL_inf.Download = 0
                ctifety = 1
                Flag = 0  # done, rejoin the main flow
            else:
                return None
        except requests.exceptions.ConnectTimeout as e:
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            if count > 3:
                return None
        except (ConnectionError, Timeout) as e:
            Flag = 1
            count += 1
            Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
            headers = headersEngine.getHeaders()
            # close extra connections; works around requests "Max retries exceeded with url" errors
            s = requests.session()
            s.keep_alive = False
            if count > 3:
                return None
        except Exception as e:
            Flag = 1
            count += 1
            # TODO handle javascript:void(0) links; this kind of exception can be ignored:
            #      https://www.zhihu.com/question/20626694?from=profile_question_card
            # TODO handle "Invalid return character or leading space in header: Accept-Language"
            # TODO handle "httpconnectionpool max retries Failed to establish a new connection"
            Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
            headers = headersEngine.getHeaders()
            # workaround for "Max retries exceeded with url" errors
            s = requests.session()
            s.keep_alive = False
            if count > 3:
                return None
    if ctifety:
        self.URL_inf.content = html
        soup = BeautifulSoup(html, 'html.parser')  # quick first-pass parse with BeautifulSoup
    else:
        soup = None
    self.URL_inf.soup = soup
    # Log.i(self.URL_inf.content.decode('utf-8'))
    return self.URL_inf  # crawl and first-pass parse finished