示例#1
0
    def getcompany_info(self, name, url):
        """Scrape a company's basic-info list from ``url`` and store it under ``name``.

        The page is fetched through ``proxy.proxy_request`` and parsed with
        BeautifulSoup. Every ``<li>`` inside the first ``<ul>`` of the element
        with class ``basicMsg`` is expected to read ``label:value``; all
        whitespace is stripped and the value part is collected. The collected
        values are handed to ``DataStore.insert_database(name, values)``.

        Failures are swallowed (best effort): they are printed, written to
        ``error.log`` together with the company name and URL, and ``None`` is
        returned either way.

        Args:
            name: Company name, used as the storage key and in log messages.
            url: Address of the company detail page.
        """
        logger = Logger(logname='error.log', logger="58com").getlog()
        ds = DataStore()
        try:
            html = proxy.proxy_request(url)
            soup = BeautifulSoup(html, 'html.parser')
            li_tags = soup.find(class_="basicMsg").find("ul").find_all(name='li')
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            whitespace = re.compile(r'\s')
            # Split on the first ':' only so values that contain a colon
            # themselves (e.g. "http://...") are kept whole. An item without
            # any ':' still raises IndexError and aborts the record.
            company_text = [
                whitespace.sub('', li.get_text()).split(':', 1)[1]
                for li in li_tags
            ]
            # Business-registration lookup (currently disabled):
            #gongshang_info = tianyan.tianyan_search(name)
            #gongshang_info = ','.join(gongshang_info)
            ds.insert_database(name, company_text)

        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            # Record the company and URL that could not be fetched.
            logger.error("Get company info fail, company name: %s, url: %s",
                         name, url)
        except Exception as e:
            print("exception:" + str(e))
            # Parse failures (missing tag, item without ':') end up here;
            # log them too so the error log covers every failed company.
            logger.error("Get company info fail, company name: %s, url: %s",
                         name, url)
            sleep(1)
示例#2
0
class HtmlParser(object):
    """Fetch a page, pull three '!!'-separated lists out of its hidden inputs,
    index them by their leading id, and pass the result to a ``DataStore``."""

    def __init__(self):
        # Backend used by data_store() and insert_mysql().
        self.storage = DataStore()
        #self._start()

    # Disabled: one-off call that passed a header row to csv_store.
    #def _start(self):
        #self.storage.csv_store('水质自动监测站','断面属性','测量时间$pH$溶解氧$氨氮$高锰酸盐指数$总有机碳','data.csv')

    def get_info(self, start_url):
        """Download ``start_url`` and extract the raw record lists.

        The 2nd, 3rd and 4th elements matching ``type="hidden"`` are read (in
        document order); each ``value`` attribute is split on ``'!!'``.

        Args:
            start_url: Address of the page to fetch via ``proxy.proxy_request``.

        Returns:
            Tuple ``(data_list, station_list, attr_list)`` of string lists.
            A list is empty when its hidden input is missing or when the
            request/parse failed (the error is printed, not raised).
        """
        # Pre-bind the results: the return statement sits outside the try, so
        # without defaults any failure would surface as UnboundLocalError.
        data_list, station_list, attr_list = [], [], []
        try:
            page = proxy.proxy_request(start_url)
            soup = BeautifulSoup(page, 'lxml')
            hidden_inputs = soup.find_all(type="hidden")
            # Only the first four hidden inputs matter; the first is skipped.
            for position, field in enumerate(hidden_inputs[:4], start=1):
                value = field.get('value')
                if position == 2:
                    data_list = value.split('!!')
                elif position == 3:
                    station_list = value.split('!!')
                elif position == 4:
                    attr_list = value.split('!!')

        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

        return data_list, station_list, attr_list

    # Split the raw records
    def split_data(self, data_list, station_list, attr_list):
        """Index the raw '$'-separated records by their first field.

        Empty strings are ignored. The input lists are not modified.

        Args:
            data_list: Records whose first field is the id; the full list of
                fields (id included) becomes the value.
            station_list: Records of the form ``id$name``.
            attr_list: Records of the form ``id$name``.

        Returns:
            Tuple ``(data_dict, station_dict, attr_dict)``. If a record is
            malformed (e.g. no ``'$'`` in a station/attr record) the error is
            printed, processing stops, and the dicts built so far are returned.
        """
        station_dict = {}
        attr_dict = {}
        data_dict = {}
        # Drop empty entries without mutating the caller's lists.
        data_records = [item for item in data_list if item != '']
        station_records = [item for item in station_list if item != '']
        attr_records = [item for item in attr_list if item != '']
        try:
            for record in data_records:
                fields = record.split('$')
                data_dict[fields[0]] = fields

            for record in station_records:
                fields = record.split('$')
                station_dict[fields[0]] = fields[1]

            for record in attr_records:
                fields = record.split('$')
                attr_dict[fields[0]] = fields[1]

        except Exception as e:
            print("exception:" + str(e))
            sleep(1)
        return data_dict, station_dict, attr_dict

    # Save to a local CSV file
    def data_store(self, data_dict, station_dict, attr_dict):
        """Call ``self.storage.csv_store`` once per station id, targeting 'data.csv'.

        For each key of ``station_dict`` the station name, the matching
        ``attr_dict`` entry and the matching ``data_dict`` entry are passed
        on. The first error (e.g. an id missing from ``attr_dict`` or
        ``data_dict``) is printed and ends the loop.
        """
        try:
            for key, station_name in station_dict.items():
                self.storage.csv_store(station_name, attr_dict[key], data_dict[key], 'data.csv')

        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

    # Save to the database
    def insert_mysql(self, data_dict, station_dict, attr_dict):
        """Call ``self.storage.insert_database`` once per station id.

        Arguments are looked up exactly as in ``data_store``; the first error
        is printed and ends the loop.
        """
        try:
            for key, station_name in station_dict.items():
                self.storage.insert_database(station_name, attr_dict[key], data_dict[key])

        except Exception as e:
            print("exception:" + str(e))
            sleep(1)