def get_xmjl_detail(self, detail_page, conp, datalist, driver, tablename):
    """Visit every detail link in ``datalist`` and append the scraped rows to ``tablename``.

    Args:
        detail_page: page object supplying ``get_datas_xpaht()`` and the
            ``ggname_texts`` / ``zbtime_texts`` XPath expressions.
        conp: connection parameters forwarded unchanged to ``db_write``.
        datalist: iterable of ``(shi_text, entname, name, href)`` tuples.
        driver: browser driver used to load each ``href``.
        tablename: target table; rows are written with ``if_exists='append'``.

    Raises:
        IndexError: if one of the per-row XPath lookups matches nothing.
    """
    column_names = ["href", "shi_text", "entname", "name", "ggname", "zbtime"]
    rows = []
    for shi_text, entname, name, href in datalist:
        driver.get(href)
        tree = etree.HTML(driver.page_source)
        # The first node matched by the data XPath is skipped
        # (presumably a header row -- TODO confirm).
        for node in tree.xpath(detail_page.get_datas_xpaht())[1:]:
            ggname = node.xpath(detail_page.ggname_texts)[0].strip()
            zbtime = node.xpath(detail_page.zbtime_texts)[0].strip()
            rows.append([href, shi_text, entname, name, ggname, zbtime])
    print(rows)
    frame = pd.DataFrame(data=rows, columns=column_names)
    db_write(frame,
             tablename,
             dbtype='postgresql',
             datadict='postgresql-text',
             conp=conp,
             if_exists='append')
def write(self, **krg):
    """Build a DataFrame with ``self.getdf`` and store it through ``db_write``.

    Required keyword arguments (``KeyError`` if missing):
        url, f1, f2: forwarded to ``self.getdf``.
        tb: target table name.
        col: column names assigned to the resulting frame.

    Optional keyword arguments:
        total, num: forwarded to ``self.getdf`` (default ``None``).
        dbtype: defaults to ``"postgresql"``.
        conp: connection parameters; defaults to a local postgres instance.
    """
    url, f1, f2 = krg["url"], krg["f1"], krg["f2"]
    tb, col = krg["tb"], krg["col"]
    total = krg.get("total")
    num = krg.get("num")
    dbtype = krg.get("dbtype", "postgresql")
    # NOTE(review): the fallback embeds credentials in source.
    conp = krg.get(
        "conp", ["postgres", "since2015", "127.0.0.1", "postgres", "public"])

    df = self.getdf(url, f1, f2, total, num)
    # NOTE(review): a frame with exactly one row is treated as "no data";
    # confirm that this threshold is intended.
    if len(df) <= 1:
        df = pd.DataFrame(columns=col)
        print("暂无数据")
    else:
        print(url)
        df.columns = col
    db_write(df, tb, dbtype=dbtype, conp=conp)
def work(conp):
    """Scrape the Chuzhou listing page with ``f1`` and store the result.

    Opens the search page, calls ``f1(driver, 5)`` and writes its return
    value to table ``anhui_chuzhou_ggzy_gg``.

    Args:
        conp: connection parameters forwarded unchanged to ``db_write``.
    """
    driver = init_driver()
    try:
        driver.get(
            "https://ggzy.chuzhou.gov.cn/Front_jyzx/ShowInfo/ShowSearchInfo.aspx?CategoryNum=002008001001&Eptr3=&datefrom=&dateto=&xiaqu=&zbfs=&Paging=1"
        )
        result = f1(driver, 5)
    finally:
        # Always release the browser, even when loading or scraping fails.
        # NOTE(review): assumes init_driver() returns a Selenium WebDriver
        # that f1 does not already quit -- confirm.
        driver.quit()
    db_write(result, 'anhui_chuzhou_ggzy_gg', dbtype="postgresql", conp=conp)
def work(conp):
    """Scrape the e-qyzc listing page with ``f1`` and store the result.

    Opens the listing page, calls ``f1(driver, 5)`` and writes its return
    value to table ``guizhou_guizhousheng_qita_gg``.

    Args:
        conp: connection parameters forwarded unchanged to ``db_write``.
    """
    driver = init_driver()
    try:
        driver.get("http://www.e-qyzc.com/gg/ggList")
        result = f1(driver, 5)
    finally:
        # Always release the browser, even when loading or scraping fails.
        # NOTE(review): assumes init_driver() returns a Selenium WebDriver
        # that f1 does not already quit -- confirm.
        driver.quit()
    db_write(result,
             'guizhou_guizhousheng_qita_gg',
             dbtype="postgresql",
             conp=conp)
def work(conp):
    """Scrape the Shenzhen procurement topic page with ``f1`` and store the result.

    Opens the topic page, calls ``f1(driver, 5)`` and writes its return
    value to table ``guangdong_shenzhen_zfcg_gg``.

    Args:
        conp: connection parameters forwarded unchanged to ``db_write``.
    """
    driver = init_driver()
    try:
        driver.get(
            "http://www.szzfcg.cn/portal/topicView.do?method=view&id=1660&agencyType=1"
        )
        result = f1(driver, 5)
    finally:
        # Always release the browser, even when loading or scraping fails.
        # NOTE(review): assumes init_driver() returns a Selenium WebDriver
        # that f1 does not already quit -- confirm.
        driver.quit()
    db_write(result,
             'guangdong_shenzhen_zfcg_gg',
             dbtype="postgresql",
             conp=conp)
def work(conp):
    """Scrape the Liaoyang purchase list page with ``f1`` and store the result.

    Opens the list page, calls ``f1(driver, 5)`` and writes its return
    value to table ``liaoning_liaoyang_ggzy_gg``.

    Args:
        conp: connection parameters forwarded unchanged to ``db_write``.
    """
    driver = init_driver()
    try:
        driver.get(
            "http://www.liaoyang.gov.cn/OpenData/opendata/ggzy/list/PurchaseList1.html"
        )
        result = f1(driver, 5)
    finally:
        # Always release the browser, even when loading or scraping fails.
        # NOTE(review): assumes init_driver() returns a Selenium WebDriver
        # that f1 does not already quit -- confirm.
        driver.quit()
    db_write(result,
             'liaoning_liaoyang_ggzy_gg',
             dbtype="postgresql",
             conp=conp)
def write(self, **krg):
    """Configure the browser settings, build a DataFrame and store it.

    Required keyword arguments (``KeyError`` if missing):
        url, f1, f2: forwarded to ``self.getdf``.
        tb: target table name.
        col: column names assigned to the resulting frame.

    Optional keyword arguments:
        total, num: forwarded to ``self.getdf`` (default ``None``).
        dbtype: defaults to ``"postgresql"``.
        conp: connection parameters; defaults to a local postgres instance.
        headless: stored on ``self.headless`` (default ``True``).
        pageloadstrategy: stored on ``self.pageloadstrategy``
            (default ``'normal'``).
        pageloadtimeout: stored on ``self.pageloadtimeout`` (default ``40``).
    """
    url, f1, f2 = krg["url"], krg["f1"], krg["f2"]
    tb, col = krg["tb"], krg["col"]
    total = krg.get("total")
    num = krg.get("num")
    dbtype = krg.get("dbtype", "postgresql")
    # NOTE(review): the fallback embeds credentials in source.
    conp = krg.get(
        "conp", ["postgres", "since2015", "127.0.0.1", "postgres", "public"])

    # These attributes are set before getdf runs.
    self.headless = krg.get("headless", True)
    self.pageloadstrategy = krg.get("pageloadstrategy", 'normal')
    self.pageloadtimeout = krg.get("pageloadtimeout", 40)

    print("%s 开始" % tb)
    df = self.getdf(url, f1, f2, total, num)
    # NOTE(review): a frame with exactly one row is treated as "no data";
    # confirm that this threshold is intended.
    if len(df) <= 1:
        df = pd.DataFrame(columns=col)
        print("暂无数据")
    else:
        print(url)
        df.columns = col
    db_write(df, tb, dbtype=dbtype, conp=conp)
def write_gg(path, conp, tbname, jytype=None):
    """Load the first ``*csv`` file found in ``path`` into table ``tbname``.

    The file is read in chunks of 1000 rows using the SOH (0x01) separator
    and STX (0x02) quote character. Every chunk gets the fixed column names
    below plus a ``jytype`` column, and all columns are written as TEXT.
    The first chunk is written with ``db_write``'s default ``if_exists``;
    later chunks are appended.

    Args:
        path: directory that contains the csv file.
        conp: connection parameters forwarded unchanged to ``db_write``.
        tbname: target table name.
        jytype: ``'gcjs'`` and ``'zfcg'`` are replaced by their Chinese
            labels; any other value is stored as given.

    Raises:
        FileNotFoundError: if ``path`` contains no file ending in ``csv``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if jytype == 'gcjs':
        jytype = "工程建设"
    elif jytype == 'zfcg':
        jytype = "政府采购"

    # First matching entry in os.listdir order, as before.
    csv_name = next(
        (entry for entry in os.listdir(path) if entry.endswith('csv')), None)
    if csv_name is None:
        raise FileNotFoundError("no csv file found in %r" % (path,))

    # NOTE(review): 'pb_time' / 'pb_time_src' appear twice in this list, so
    # two pairs of columns share a name -- confirm the intended names.
    column_names = [
        'bd_guid', 'bd_bh', 'bd_name', 'zbr', 'zbdl', 'xmjl', 'xmjl_dj',
        'xmjl_zsbh', 'bm_endtime', 'bm_endtime_src', 'tb_endtime',
        'tb_endtime_src', 'bzj_time', 'bzj_time_src', 'kb_time',
        'kb_time_src', 'pb_time', 'pb_time_src', 'db_time', 'db_time_src',
        'pb_time', 'pb_time_src', 'zhongbiao_hxr', 'zhongbiao_hxr_src',
        'kzj', 'kzj_src', 'zhongbiaojia', 'zhongbiaojia_src', 'bd_dizhi',
        'diqu', 'ggtype', 'gg_name', 'gg_fabutime', 'gg_file',
        'gg_fujian_file', 'gg_href'
    ]

    dfs = pd.read_csv(os.path.join(path, csv_name),
                      sep='\001',
                      quotechar='\002',
                      chunksize=1000)
    for count, df in enumerate(dfs, start=1):
        df.columns = column_names
        df['jytype'] = jytype
        datadict = {name: TEXT() for name in df.columns}
        # Only chunks after the first one force append mode.
        extra = {} if count == 1 else {'if_exists': 'append'}
        db_write(df,
                 tbname,
                 dbtype='postgresql',
                 conp=conp,
                 datadict=datadict,
                 **extra)
        print("写入第%d " % count)
def get_jst_qyzz(self,data_page,conp,total_ye,gotoyema,shengfen,yewu,tablename,data,shi=None,):
    """Scrape enterprise qualifications from one result page and append them to ``tablename``.

    Applies the business-type / province / optional city filters on
    ``data_page``, runs the search, jumps to page ``gotoyema`` when the
    current page differs, then parses ``self.driver.page_source`` and
    collects ``[qyname, qyzzname]`` pairs into ``data``. ``data`` is the
    caller's list and is extended in place before being written.

    ``total_ye`` is accepted but not used in this method.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from statement order -- confirm against the real file.
    """
    # Select the enterprise-qualification business type
    data_page.select_yewu(yewu)
    # Select the province
    data_page.select_shengfen(shengfen)
    # Select the city
    if shi != None:
        data_page.select_shi(shi)
    # Click the search button
    data_page.click_search_button()
    sleep(3)
    # Get the current page number
    cur_yema = data_page.get_cur_yema()
    # Turn to the requested page
    if int(cur_yema) != int(gotoyema):
        data_page.goto_yema(gotoyema)
        sleep(3)
    page = self.driver.page_source
    body = etree.HTML(page)
    # print(body)
    # All enterprises on the current page
    qys = body.xpath(data_page.get_qys_xpath())
    # print(qys)
    for qy in qys:
        qyname=qy.xpath(data_page.get_entname_xpath())[0].strip()
        print(qyname)
        # Click the "more" button
        # NOTE(review): `qy` belongs to the tree parsed before this click,
        # so the click cannot change what the XPath lookups below return.
        data_page.click_gengduo_button()
        sleep(2)
        # All qualifications of this one enterprise
        oneqy_all_qyzzs = qy.xpath(data_page.get_oneqy_allqyzz())
        print(','.join(str(s) for s in oneqy_all_qyzzs if s not in [None]))
        for qyzz in oneqy_all_qyzzs:
            qyzzname = qyzz.xpath(data_page.get_qyzzname_xpath())[0].strip()
            print(qyname)
            print(qyzzname)
            tmp=[qyname,qyzzname]
            data.append(tmp)
        print(data)
    print(data)
    df = pd.DataFrame(data=data, columns=["qyname", "qyzzname"])
    db_write(df, tablename, dbtype='postgresql', datadict='postgresql-text', conp=conp, if_exists='append')
def work(conp):
    """Run ``get_data`` for every parameter row and append the results to the database.

    Each row from ``get_parames(conp)`` supplies ``qymc`` (index 0) and
    ``zzcode`` (index 1). The collected ``[qymc, zzcode, result]`` rows are
    appended to table ``jianshetong_biaoshitong_result``.

    Args:
        conp: connection parameters passed to ``get_parames``, ``get_data``
            and ``db_write``.
    """
    rows = []
    for parame in get_parames(conp):
        # Short pause before every lookup.
        time.sleep(0.1)
        qymc, zzcode = parame[0], parame[1]
        row = [qymc, zzcode, get_data(qymc, zzcode, conp)]
        print(row)
        rows.append(row)
    frame = pd.DataFrame(data=rows, columns=["qymc", "bst_zzcode", "result"])
    db_write(frame,
             "jianshetong_biaoshitong_result",
             dbtype='postgresql',
             conp=conp,
             if_exists='append')
def read_excel_2_db(conp, filename, table_name, if_exists="replace"):
    """
    Load sheet ``Sheet1`` of an Excel file into a PostgreSQL table.

    The ``person_key`` column is read as ``str`` and every column is cast
    to ``object`` dtype before the frame is handed to ``db_write``.

    :param conp: connection parameters forwarded unchanged to ``db_write``
    :param filename: path of the Excel file
    :param table_name: name of the target table
    :param if_exists: "replace" to replace the table, "append" to add rows
    :return: None
    """
    frame = read_excel(filename,
                       sheet_name='Sheet1',
                       converters={'person_key': str})
    for col_name in frame.columns:
        as_object = frame[col_name].astype(object)
        frame[col_name] = as_object
    db_write(frame,
             table_name,
             dbtype="postgresql",
             datadict='postgresql-text',
             conp=conp,
             if_exists=if_exists)
    print("导入成功")
def pg2pg(sql,
          tb,
          conp1,
          conp2,
          chunksize=100,
          f=None,
          if_exists='replace',
          datadict='postgresql-text'):
    """Copy the result of ``sql`` from one PostgreSQL database into table ``tb`` of another.

    The query is streamed from the source in chunks of ``chunksize`` rows.
    The first chunk written uses ``if_exists``; every later chunk is
    appended. A chunk that fails is reported with a traceback and skipped,
    so one bad chunk does not abort the transfer.

    Args:
        sql: query executed on the source database.
        tb: target table name.
        conp1: source connection parameters; items 0-3 are used as
            user, password, host and database in the connection URL.
        conp2: target connection parameters forwarded to ``db_write``.
        chunksize: rows per chunk.
        f: optional callable applied to each chunk before writing; it must
            return the frame to write.
        if_exists: write mode for the first chunk.
        datadict: forwarded unchanged to ``db_write``.
    """
    con = create_engine("postgresql://%s:%s@%s/%s" %
                        (conp1[0], conp1[1], conp1[2], conp1[3]),
                        encoding='utf-8',
                        execution_options=dict(stream_results=True))
    try:
        dfs = pd.read_sql(sql, con, chunksize=chunksize)
        # `count` only advances after a successful write, so a failed first
        # chunk leaves the next chunk to be written with `if_exists`.
        count = 1
        for df in dfs:
            try:
                total = count * chunksize
                print('第%d行写入中' % total)
                if f is not None:
                    df = f(df)
                mode = if_exists if count == 1 else 'append'
                db_write(df,
                         tb,
                         dbtype="postgresql",
                         conp=conp2,
                         if_exists=mode,
                         datadict=datadict)
                count += 1
            except Exception:
                # Best-effort per chunk, but Ctrl-C / SystemExit must still
                # be able to stop the loop (a bare `except:` swallowed them).
                traceback.print_exc()
    finally:
        # Release the pooled source connections once streaming is over.
        con.dispose()
def write_html(path, conp, tbname):
    """Load every ``*html`` file in ``path`` into table ``tbname`` as ``(guid, page)`` rows.

    ``guid`` is the file name without its last five characters and ``page``
    is the file content read as UTF-8. The first HTML file is written on
    its own with ``if_exists='replace'``; after that the buffered rows are
    appended whenever the running file counter hits a multiple of 1000,
    and whatever remains is appended after the loop.

    NOTE(review): the original indentation was lost; the counter is assumed
    to advance once per HTML file -- confirm against the real file.

    Args:
        path: directory to scan.
        conp: connection parameters forwarded unchanged to ``db_write``.
        tbname: target table name.
    """

    def flush(rows, mode):
        # Write the buffered rows with both columns typed as TEXT.
        frame = pd.DataFrame(data=rows, columns=['guid', 'page'])
        text_types = {"guid": TEXT(), 'page': TEXT()}
        db_write(frame,
                 tbname,
                 dbtype='postgresql',
                 conp=conp,
                 if_exists=mode,
                 datadict=text_types)

    pending = []
    seen = 1
    for fname in os.listdir(path):
        if not fname.endswith('html'):
            continue
        with open(os.path.join(path, fname), 'r', encoding='utf8') as fh:
            html_text = fh.read()
        pending.append([fname[:-5], html_text])
        if seen == 1:
            flush(pending, 'replace')
            pending = []
        elif seen % 1000 == 0:
            flush(pending, 'append')
            pending = []
            print("写入1000")
        seen += 1
    # The remainder is written even when it is empty.
    flush(pending, 'append')
def get_jst_ryzz( self, jst_ryzz_page, conp, data, total_ye, shengfen, yewu, tablename, shi=None, ):
    """Scrape personnel qualifications for every category and page, then append them to ``tablename``.

    Applies the province / optional city filters, reads the category list
    from the current page source, and for each category walks result pages
    ``1 .. total_ye - 1`` collecting ``[entname, name, ryzz]`` rows into
    ``data``. ``data`` is the caller's list and is extended in place before
    being written.

    ``yewu`` is accepted but not used; ``select_yewu()`` is called without
    arguments.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from statement order -- confirm against the real file.
    """
    # Select the personnel-qualification business type
    jst_ryzz_page.select_yewu()
    # Select the province
    jst_ryzz_page.select_shengfen(shengfen)
    # Select the city
    if shi != None:
        jst_ryzz_page.select_shi(shi)
    page = self.driver.page_source
    # print(page)
    body = etree.HTML(page)
    # Category selection starts at 2, matching the [2:] slice below.
    ryzzlb_count = 2
    # All personnel-qualification categories (the first two matches are skipped)
    ryzzs = body.xpath(jst_ryzz_page.get_ryzzs_xpath())[2:]
    # print("ryzzs "+ryzzs)
    for ryzzlb in ryzzs:
        # print("ryzzlb " + ryzzlb)
        # Select the qualification category by its running index
        jst_ryzz_page.select_ryzzlb(ryzzlb_count)
        sleep(5)
        ryzzlb_count += 1
        # Click the search button
        jst_ryzz_page.click_search_button()
        sleep(3)
        # NOTE(review): range(1, total_ye) stops at total_ye - 1; confirm
        # whether the last page is meant to be skipped.
        for gotoyema in range(1, total_ye):
            # Get the current page number
            cur_yema = jst_ryzz_page.get_cur_yema()
            print(cur_yema)
            print(gotoyema)
            # Turn to the requested page
            if int(cur_yema) != int(gotoyema):
                jst_ryzz_page.goto_yema2(gotoyema)
                sleep(3)
            page = self.driver.page_source
            # print(page)
            body = etree.HTML(page)
            content_list = body.xpath(jst_ryzz_page.get_datas_xpath())
            for content in content_list:
                entname = content.xpath(
                    jst_ryzz_page.get_entname_xpath())[0].strip()
                # print("entname " + entname)
                name = content.xpath(
                    jst_ryzz_page.get_name_xpath())[0].strip()
                # print("name " + name)
                ryzz = content.xpath(
                    jst_ryzz_page.get_ryzz_xpath())[0].strip()
                # print("ryzz " + ryzz)
                tmp = [entname, name, ryzz]
                data.append(tmp)
    print(data)
    df = pd.DataFrame(data=data, columns=["entname", "name", "ryzz"])
    db_write(df, tablename, dbtype='postgresql', datadict='postgresql-text', conp=conp, if_exists='append')
def csv2pg(path, conp, **krg):
    """Load a csv file into a PostgreSQL table chunk by chunk.

    Args:
        path: csv file to read.
        conp: connection parameters forwarded unchanged to ``db_write``.
        **krg: overrides for the defaults below; any key other than
            ``tb``, ``f``, ``if_exists`` and ``datadict`` is forwarded to
            ``pandas.read_csv``.

            chunksize: rows per chunk (default 1000).
            tb: target table (default: file name with ``.csv`` removed).
            f: optional callable applied to each chunk before writing; it
                must return the frame to write.
            if_exists: write mode for the first chunk (default "replace");
                later chunks are always appended.
            sep: field separator (default SOH, 0x01).
            datadict: forwarded to ``db_write`` (default "postgresql-text").
    """
    para = {
        "chunksize": 1000,
        "tb": os.path.split(path)[1].replace('.csv', ''),
        "f": None,
        "if_exists": "replace",
        "sep": "\001",
        "datadict": "postgresql-text",
    }
    para.update(krg)
    chunksize = para['chunksize']
    f = para['f']
    if_exists = para['if_exists']
    datadict = para["datadict"]
    tb = para['tb']

    # Keys consumed here must not reach read_csv; the rest is passed through.
    own_keys = ('datadict', 'f', 'if_exists', 'tb')
    read_csv_args = {k: v for k, v in para.items() if k not in own_keys}
    dfs = pd.read_csv(path, **read_csv_args)

    for count, df in enumerate(dfs, start=1):
        total = count * chunksize
        print('第%d行写入中' % total)
        if f is not None:
            df = f(df)
        mode = if_exists if count == 1 else 'append'
        # Appended chunks now get the same datadict as the first chunk
        # (previously it was dropped after the first write).
        db_write(df,
                 tb,
                 dbtype="postgresql",
                 conp=conp,
                 if_exists=mode,
                 datadict=datadict)


# sql="select * from hefei.gg limit 100"
# conp=["postgres",'since2015','192.168.4.175','anhui','hefei']
# path="d:\\test.csv"
# def f1(df):
#     df['name']='xx'
#     return df
# pg2csv(sql,conp,path)

# def f1(df):
#     df['name']='xx'
#     return df
# conp1=["postgres",'since2015','192.168.4.175','anhui','hefei']
# conp2=["postgres",'since2015','192.168.4.175','mine','hunan']
# pg2pg("select * from hefei.gg limit 1000",'test',conp1,conp2,f=f1)

# path="D:/webroot/bstdata/base_20190421.csv"
# conp=["gpadmin","since2015",'192.168.4.179',"base_db","v2"]
# sql="select distinct on(html_key) * from v2.t_gg where ggstart_time>='2019-04-21' and ggstart_time<'2020-05-20' and html_key>7923769 "
# dfs=pg2csv(sql,conp,path,10,sep='\001')