def getcompany_info(self, name, url):
    """Scrape a company's basic-info fields from ``url`` and store them.

    The page is fetched through ``proxy.proxy_request`` and parsed with
    BeautifulSoup. Every ``<li>`` inside the first ``<ul>`` of the element
    whose CSS class is ``basicMsg`` is expected to read ``label:value``;
    the values are collected in page order and handed to
    ``DataStore.insert_database(name, values)``.

    Args:
        name: Company name; passed to the data store and written to the
            error log on failure.
        url: Address of the company detail page.

    Returns:
        None. Failures are printed and logged instead of being raised.
    """
    logger = Logger(logname='error.log', logger="58com").getlog()
    ds = DataStore()
    try:
        html = proxy.proxy_request(url)
        soup = BeautifulSoup(html, 'html.parser')
        tag = soup.find(class_="basicMsg")
        ul = tag.find("ul")
        li_tags = ul.find_all(name='li')
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        whitespace = re.compile(r'\s')
        company_text = []
        for li in li_tags:
            txt = whitespace.sub('', li.get_text())
            # Split on the first colon only, so a value that itself contains
            # a colon (e.g. a URL) is kept whole. An item without any colon
            # still raises IndexError and is handled below.
            company_text.append(txt.split(':', 1)[1])
        # Business-registration lookup is currently disabled:
        # gongshang_info = tianyan.tianyan_search(name)
        # gongshang_info = ','.join(gongshang_info)
        ds.insert_database(name, company_text)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Record the company and URL that could not be fetched.
        logger.error("Get company info fail, company name: %s, url: %s", name, url)
    except Exception as e:
        print("exception:" + str(e))
        # Parse/storage failures (missing basicMsg block, item without a
        # colon, ...) end up here; log them too so they can be retried.
        logger.error("Get company info fail, company name: %s, url: %s", name, url)
        sleep(1)
class HtmlParser(object):
    """Fetch a page, decode its hidden-input payloads and persist them.

    The page is expected to carry its data in ``<input type="hidden">``
    elements whose ``value`` is a ``!!``-separated list of ``$``-separated
    records. Persistence is delegated to a ``DataStore`` instance.
    """

    def __init__(self):
        self.storage = DataStore()
        # NOTE: a bootstrap step that wrote a header row to data.csv (station
        # name, section attribute, and the columns measurement time / pH /
        # dissolved oxygen / ammonia nitrogen / permanganate index / total
        # organic carbon) used to be called here and is currently disabled.

    def get_info(self, start_url):
        """Download ``start_url`` and extract the three raw payload lists.

        The 2nd, 3rd and 4th hidden inputs (1-based, document order) are
        split on ``'!!'`` and returned as the data, station and attribute
        lists respectively.

        Args:
            start_url: Address of the page to fetch via ``proxy.proxy_request``.

        Returns:
            A ``(data_list, station_list, attr_list)`` tuple. Any list that
            could not be extracted (request/parse failure, too few hidden
            inputs, missing ``value``) is empty; errors are printed, not raised.
        """
        # Pre-bind the results so the final return is always valid, even when
        # the try-block fails early or the page has fewer than four inputs.
        data_list, station_list, attr_list = [], [], []
        try:
            page = proxy.proxy_request(start_url)
            soup = BeautifulSoup(page, 'lxml')
            hidden_inputs = soup.find_all(type="hidden")
            for position, hidden in enumerate(hidden_inputs, start=1):
                if position >= 5:
                    # Nothing of interest beyond the fourth hidden input.
                    break
                value = hidden.get('value')
                if position == 2:
                    data_list = value.split('!!')
                elif position == 3:
                    station_list = value.split('!!')
                elif position == 4:
                    attr_list = value.split('!!')
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)
        return data_list, station_list, attr_list

    # Split the raw payloads into dictionaries.
    def split_data(self, data_list, station_list, attr_list):
        """Index the raw ``$``-separated entries by their first field.

        Empty strings are removed from the three input lists **in place**
        before parsing.

        Args:
            data_list: Entries of the form ``id$field$field...``.
            station_list: Entries of the form ``id$name``.
            attr_list: Entries of the form ``id$name``.

        Returns:
            A ``(data_dict, station_dict, attr_dict)`` tuple: ``data_dict``
            maps id -> full list of fields (id included), the other two map
            id -> name. If an entry is malformed the error is printed and
            whatever was parsed up to that point is returned.
        """
        # Drop blank entries in one pass per list; slice assignment keeps the
        # caller-visible in-place mutation of the original implementation.
        for raw in (data_list, station_list, attr_list):
            raw[:] = [item for item in raw if item != '']

        data_dict = {}
        station_dict = {}
        attr_dict = {}
        try:
            for record in data_list:
                fields = record.split('$')
                data_dict[fields[0]] = fields
            for entry in station_list:
                fields = entry.split('$')
                station_dict[fields[0]] = fields[1]
            for entry in attr_list:
                fields = entry.split('$')
                attr_dict[fields[0]] = fields[1]
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)
        return data_dict, station_dict, attr_dict

    # Save to a local CSV file.
    def data_store(self, data_dict, station_dict, attr_dict):
        """Write one row per station key to ``data.csv`` via the data store.

        For every key of ``station_dict`` the matching attribute and data
        entries are looked up under the same key. The first failure (for
        example a key missing from ``attr_dict`` or ``data_dict``) is printed
        and stops the export.
        """
        try:
            for key in station_dict:
                self.storage.csv_store(
                    station_dict[key], attr_dict[key], data_dict[key], 'data.csv')
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

    # Save to the database.
    def insert_mysql(self, data_dict, station_dict, attr_dict):
        """Insert one row per station key through ``insert_database``.

        Lookup and error handling mirror ``data_store``: the first failure
        is printed and stops the loop.
        """
        try:
            for key in station_dict:
                self.storage.insert_database(
                    station_dict[key], attr_dict[key], data_dict[key])
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)