def data_soup(url, header=None, cert=None):
    """
    Fetch a page and parse it with BeautifulSoup.

    The page is downloaded through data_request() and its raw content is
    handed to BeautifulSoup using the 'html5lib' parser.

    :param url: address of the page to fetch
    :param header: optional request headers, forwarded to data_request()
    :param cert: optional client certificate, forwarded to data_request()
    :return: a tuple (response, soup):
             (None, None) when no usable response was obtained,
             (response, None) when parsing raised an exception,
             (response, soup) on success
    """
    print('data_soup')
    response = data_request(url, header, cert)
    try:
        # The truthiness check stays inside the try, as in the original.
        if not response:
            return None, None
        parsed = BeautifulSoup(response.content, 'html5lib')
    except Exception as exc:
        message = fun.exception_format(fun.get_current_function_name(), exc)
        print(message)
        return response, None
    return response, parsed
def connect_bing():
    """
    Return the `bing` collection of the `fish_check` database.

    A MongoClient is created from the module-level `host` and `port`.

    :return: the `fish_check.bing` collection object, or None when creating
             the client raised an exception (the error is printed)
    """
    print('connect_bing')
    try:
        mongo = pymongo.MongoClient(host, port)
    except Exception as exc:
        message = fun.exception_format(fun.get_current_function_name(), exc)
        print(message)
        return None
    # Attribute access happens outside the try, exactly as before.
    return mongo.fish_check.bing
def insert_data(collection, database):
    """
    Insert one or many documents into a collection.

    :param collection: collection object exposing insert_one()/insert_many()
    :param database: the data to store; a list of documents is written with
                     insert_many(), a single dict with insert_one().
                     An empty list is a no-op. Any other type is ignored.
    :return: None. Exceptions raised by the insert are caught and printed.
    """
    print('insert_data')
    try:
        if isinstance(database, list):
            # insert_many() rejects an empty list, so "nothing to insert"
            # is treated as a successful no-op rather than an error.
            if database:
                collection.insert_many(database)
        elif isinstance(database, dict):
            collection.insert_one(database)
    except Exception as e:
        error_text = fun.exception_format(fun.get_current_function_name(), e)
        print(error_text)
def is_url_exist(collection, url):
    """
    Tell whether a URL is already stored in the collection.

    :param collection: collection object exposing find_one()
    :param url: the URL to look up (matched against the 'url' field)
    :return: True when a document with this URL exists, False when none
             does, None when the lookup raised an exception (the error is
             printed)
    """
    print('search_url')
    try:
        # find_one() performs the query here, inside the try. The previous
        # find() was lazy and the real query ran in Cursor.count() outside
        # the handler; Cursor.count() is also gone in newer PyMongo.
        found = collection.find_one({'url': url})
    except Exception as e:
        error_text = fun.exception_format(fun.get_current_function_name(), e)
        print(error_text)
        return None
    return found is not None
def data_request(url, header=None, cert=None):
    """
    Fetch a page without post-processing it.

    The request is retried on connect/read timeouts, connection errors and
    non-OK status codes; at most three such failures are tolerated. When an
    SSL or connection error occurs while certificate verification is on,
    verification is switched off and the request is tried again.

    :param url: address of the page to fetch
    :param header: optional request headers passed to requests.get()
    :param cert: optional client certificate passed to requests.get()
    :return: the requests response when the status code is OK, otherwise None
    """
    print('data_request')
    # Initial value for `verify`, as decided by fun.ssl_judge().
    flag = fun.ssl_judge(url)
    print(flag)
    count = 0
    while True:
        try:
            page = requests.get(url, headers=header, timeout=10, verify=flag, cert=cert)
        except requests.exceptions.ConnectTimeout:
            print('ConnectTimeout')
        except requests.exceptions.SSLError:
            print('SSLError')
            if flag:
                # First SSL failure: retry without verification, and do not
                # count it as a failed attempt.
                flag = False
                continue
            # Verification is already off, so count this failure; otherwise
            # a persistent SSL error would retry forever.
        except requests.exceptions.ConnectionError:
            print('ConnectionError')
            if flag:
                flag = False
                count += 1
                continue
        except requests.exceptions.ReadTimeout:
            print('ReadTimeout')
        except requests.exceptions.Timeout:
            # this is important
            print('Timeout')
            return None
        except requests.exceptions.TooManyRedirects:
            print('TooManyRedirects')
            return None
        except requests.exceptions.HTTPError:
            print('HTTPError')
            return None
        except requests.exceptions.RequestException as e:
            # Catch-all for the remaining requests errors. The original named
            # the `requests.exceptions` module here, which never matches.
            error_text = fun.exception_format(fun.get_current_function_name(), e)
            print(error_text)
            return None
        else:
            if page.status_code == requests.codes.ok:
                return page  # get page content
            error_text = "Page Code %s " % page.status_code
            print(error_text)
        # Shared retry accounting for every retryable failure above.
        if count > 1:
            return None
        count += 1