def src_cdc2(conp):
    """Merge every <schema>.t_gg*cdc staging table into <schema>.t_gg_src.

    conp: [user, passwd, host, db, schema]. Creates t_gg_src on first use
    (SELECT ... INTO), then appends only rows whose gg_file is not present yet.
    """
    user, passwd, host, db, schema = conp
    # Staging tables to merge, in name order.
    tbs = db_query(
        """select tablename from pg_tables where schemaname='%s' and tablename ~'t_gg.*cdc' order by tablename """ % schema,
        dbtype="postgresql", conp=conp)['tablename'].values.tolist()
    # All tables in the schema — used to decide create vs. insert.
    tbs1 = db_query(
        """select tablename from pg_tables where schemaname='%s' order by tablename """ % schema,
        dbtype="postgresql", conp=conp)['tablename'].values.tolist()
    for tbname in tbs:
        print("更新,-%s表插入到t_gg_src" % tbname)
        print(tbs1)
        if 't_gg_src' not in tbs1:
            sql = """ select distinct on(gg_file) * into %s.t_gg_src from %s.%s """ % (schema, schema, tbname)
            # BUG FIX: tbs1 was computed once before the loop, so after the
            # first iteration created t_gg_src every later iteration re-ran
            # SELECT INTO and failed with "relation already exists". Record
            # the creation so subsequent tables take the insert branch.
            tbs1.append('t_gg_src')
        else:
            sql = """insert into %s.t_gg_src select * from %s.%s as a where not exists( select 1 from %s.t_gg_src as b where a.gg_file=b.gg_file ) """ % (schema, schema, tbname, schema)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=conp)
def get_conp(name, database=None):
    """Look up a connection-parameter row for *name* in the bundled sqlite cfg_db.

    When *database* is given, the match is narrowed to that database as well.
    Returns the first matching row as a plain list.
    """
    cfg_path = join(dirname(__file__), "cfg_db")
    if database is None:
        sql = "select * from cfg where schema='%s' " % name
    else:
        sql = "select * from cfg where schema='%s' and database='%s' " % (name, database)
    df = db_query(sql, dbtype='sqlite', conp=cfg_path)
    return df.values.tolist()[0]
def work(conp, **krg):
    """Drive one crawl cycle for the schema conp[4].

    krg must contain 'info' — either None or a JSON string optionally carrying
    'zfcg_bdate' / 'gcjs_bdate' start-date overrides. Creates the schema if it
    is missing, then for every entry in the module-level `data` list runs
    src_update_dates() followed by t_gg_update(). On the very first run (no
    t_gg table yet) the per-entry configured start date is used; afterwards an
    incremental window of the last two days is used.
    """
    info = krg['info']
    para = {"zfcg_bdate": None, "gcjs_bdate": None}
    if info is not None:
        info = json.loads(info)
        para.update(info)
    zfcg_bdate = para['zfcg_bdate']
    gcjs_bdate = para['gcjs_bdate']
    quyu = conp[4]
    schemas = db_query(""" SELECT nspname FROM pg_namespace """,
                       dbtype="postgresql", conp=conp)['nspname'].values.tolist()
    if quyu not in schemas:
        db_command("""create schema if not exists %s """ % quyu,
                   dbtype="postgresql", conp=conp)
    tables = db_query(
        """SELECT tablename FROM pg_tables where schemaname='%s' """ % quyu,
        dbtype="postgresql", conp=conp)['tablename'].values.tolist()
    # tag None -> first/full run; 'cdc' -> incremental run.
    if 't_gg' not in tables:
        tag = None
    else:
        tag = 'cdc'
    for st in data:
        if tag is None:
            jytype = st[1]
            if jytype == 'zfcg':
                bdate = st[4] if zfcg_bdate is None else zfcg_bdate
            else:
                bdate = st[4] if gcjs_bdate is None else gcjs_bdate
            print(bdate)
        else:
            # Incremental: look back two days from now.
            bdate = datetime.strftime(datetime.now() + timedelta(days=-2), '%Y-%m-%d')
        src_update_dates(st[0], st[1], conp, bdate)
        # BUG FIX: a stray bare expression statement `est_func` stood here; it
        # either did nothing or raised NameError, so it has been removed.
        print("t_gg_update")
        t_gg_update(conp, st[2], st[3])
def pg2csv(sql, conp, path, **krg):
    """Stream the result of *sql* from postgres into the CSV file *path*.

    Keyword options: 'chunksize' (rows per chunk, default 1000) and 'f'
    (optional per-chunk DataFrame transform). Any other keyword is forwarded
    to DataFrame.to_csv. If the streamed result is empty, the query is re-run
    non-streamed so an (empty) CSV with a header is still written.
    """
    opts = {"chunksize": 1000, "f": None}
    opts.update(krg)
    chunksize = opts["chunksize"]
    f = opts["f"]
    # to_csv kwargs: everything except the transform callback.
    csv_kwargs = copy.deepcopy(opts)
    for key in ["f"]:
        csv_kwargs.pop(key)
    con = create_engine("postgresql://%s:%s@%s/%s" % (conp[0], conp[1], conp[2], conp[3]),
                        encoding='utf-8',
                        execution_options=dict(stream_results=True))
    frames = pd.read_sql(sql, con, chunksize=chunksize)
    count = 1
    for frame in frames:
        total = count * chunksize
        print('第%d行写入中' % total)
        if f is not None:
            frame = f(frame)
        if count == 1:
            # First chunk: write header, create/overwrite the file.
            frame.to_csv(path, index=False, **csv_kwargs)
        else:
            # Subsequent chunks: append without a header.
            csv_kwargs['header'] = False
            frame.to_csv(path, mode='a+', index=False, **csv_kwargs)
        count += 1
    if count == 1:
        print("df为空")
        df = db_query(sql, dbtype="postgresql", conp=conp)
        df.to_csv(path, index=False, **csv_kwargs)
def est_html_work(conp, f, **args):
    """First-pass download of detail pages for every crawlable href in <schema>.gg.

    Recognized args: 'size' (SQL LIMIT), 'html_total' (cap after fetch),
    'num' (worker count, default 20); all remaining args are merged into the
    page-writer settings and may override them.
    """
    if "size" in args.keys():
        size = args["size"]
    else:
        size = None
    m = page()
    if size is not None:
        sql = "select distinct href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d" % (
            conp[4], size)
    else:
        sql = "select distinct href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' " % (
            conp[4])
    hrefs = db_query(sql, dbtype="postgresql", conp=conp)["href"].values
    if "html_total" in args.keys():
        hrefs = hrefs[:args["html_total"]]
    print(hrefs[:3])
    setting = {"num": 20, "arr": hrefs, "f": f, "conp": conp,
               "tb": "gg_html", "headless": True}
    if "num" in args.keys():
        setting["num"] = args["num"]
    # Caller-supplied args win over defaults.
    setting = {**setting, **args}
    m.write(**setting)
def gg_meta(conp, data, diqu, i=-1, headless=True):
    """Run the metadata crawl for schema conp[4] by delegating to work()."""
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (
        conp[4])
    frame = db_query(sql, dbtype="postgresql", conp=conp)
    # NOTE(review): this table listing is never used afterwards — kept only to
    # mirror the original behavior; confirm whether it can be dropped.
    arr = frame["table_name"].values
    work(conp, data, diqu, i, headless)
def est_html_cdc(conp, f, **args):
    """Incremental page download: fetch only hrefs in gg that are missing from gg_html.

    Recognized args: 'html_total' (cap), 'num' (workers, default 5, bumped to
    20 for backlogs over 2000); remaining args merge into the writer settings.
    """
    m = page()
    sql = "select distinct href from %s.gg where href not in(select href from %s.gg_html ) and (not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')" % (
        conp[4], conp[4])
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values
    # BUG FIX: `arr == []` compared a numpy array elementwise and was never
    # truthy, so the empty-result early exit never fired; test the length.
    if len(arr) == 0:
        print("无href更新")
        return None
    if "html_total" in args.keys():
        html_total = args["html_total"]
        arr = arr[:html_total]
    setting = {
        "num": 5,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": True
    }
    if "num" in args.keys():
        setting["num"] = args["num"]
    setting = {**setting, **args}
    # Large backlogs get more workers.
    if len(arr) > 2000 and setting['num'] < 20:
        setting["num"] = 20
    m.write(**setting)
def get_parames(conp, total=100):
    """Return up to *total* (qymc, biaoshitong) pairs still lacking a result row.

    conp: [user, passwd, host, dbname, schema]. Uses EXCEPT to subtract pairs
    already present in jianshetong_biaoshitong_result.
    """
    user, passwd, host, dbname, schema = conp
    sql1 = '''select qymc,biaoshitong from "%s"."jianshetong_exist_zz_tran" EXCEPT select qymc,bst_zzcode from "%s"."jianshetong_biaoshitong_result" limit %s''' % (
        schema, schema, total)
    return db_query(sql1, dbtype="postgresql", conp=conp).values.tolist()
def gg_existed(conp):
    """Return True iff a table named 'gg' exists in schema conp[4]."""
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (conp[4])
    names = db_query(sql, dbtype="postgresql", conp=conp)["table_name"].values
    return True if "gg" in names else False
def get_data(qymc, zzcode, conp):
    """Return 1 when *zzcode* is registered for enterprise *qymc*, else 0."""
    sql = """SELECT zzcode FROM "public"."biaoshitong_exist_zz" where entname = '%s';""" % qymc
    codes = db_query(sql, dbtype="postgresql", conp=conp)['zzcode'].tolist()
    return 1 if zzcode in codes else 0
def gg_html(conp, f, headless=True):
    """Dispatch page download: incremental when gg_html already exists, else full."""
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (conp[4])
    names = db_query(sql, dbtype="postgresql", conp=conp)["table_name"].values
    if "gg_html" in names:
        html_cdc(conp, f, headless=headless)
    else:
        html_work(conp, f, headless=headless)
def est_meta(conp, data, **arg):
    """Dispatch metadata crawl: est_cdc when table 'gg' exists, est_work otherwise."""
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (conp[4])
    tables = db_query(sql, dbtype="postgresql", conp=conp)["table_name"].values
    if "gg" in tables:
        est_cdc(conp, data, **arg)
    else:
        est_work(conp, data, **arg)
def html_cdc(conp, f, headless=True):
    """Incremental page fetch for hrefs in <schema>.gg absent from <schema>.gg_html."""
    m = page()
    # NOTE(review): the hreftype filter sits INSIDE the NOT IN subquery here,
    # unlike est_html_cdc where it filters the outer gg rows — confirm intent.
    sql = "select href from %s.gg where href not in(select href from %s.gg_html where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')" % (conp[4], conp[4])
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values
    # BUG FIX: `arr == []` on a numpy array is never truthy (elementwise
    # comparison), so the early exit never fired; test the length instead.
    if len(arr) == 0:
        print("无href更新")
        return None
    setting = {"num": 5, "arr": arr, "f": f, "conp": conp, "tb": "gg_html", "headless": headless}
    m.write(**setting)
def html_work(conp, f, size=None, headless=True):
    """Initial page fetch for every crawlable href in <schema>.gg, optionally capped at *size*."""
    m = page()
    if size is not None:
        sql = "select href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d" % (conp[4], size)
    else:
        sql = "select href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' " % (conp[4])
    hrefs = db_query(sql, dbtype="postgresql", conp=conp)["href"].values
    print(hrefs[:3])
    m.write(num=20, arr=hrefs, f=f, conp=conp, tb="gg_html", headless=headless)
def t_gg_update(conp, quyu, diqu):
    """Build or incrementally extend <schema>.t_gg from <schema>.t_gg_src.

    conp: [user, passwd, host, db, schema]. `quyu` and `diqu` are stamped as
    literal columns into every inserted row. First run: SELECT INTO creates
    t_gg from the whole staging table; later runs append only staging rows
    whose gg_file is not yet present as a guid in t_gg.
    """
    user, passwd, host, db, schema = conp
    # List all tables in the schema to decide between create and insert.
    tbs1 = db_query(
        """select tablename from pg_tables where schemaname='%s' order by tablename """ % schema,
        dbtype="postgresql", conp=conp)['tablename'].values.tolist()
    print("更新,表插入到t_gg")
    print(tbs1)
    if 't_gg' not in tbs1:
        # First run: materialize t_gg via SELECT ... INTO.
        sql = """ SELECT gg_file as guid,gg_name ,gg_href as href ,"public".exttime(gg_fabutime)::timestamp(0) as fabu_time ,public.ggtype_tran(ggtype) as ggtype ,'%s' as quyu ,'%s' as diqu,jytype , now()::timestamp(0) as create_time ,public.merge_info(xmjl, bm_endtime , tb_endtime , bzj_time , kb_time , pb_time , db_time ,zhongbiao_hxr,zhongbiaojia,kzj,bd_guid,bd_bh ,bd_name ,zbr ,zbdl ,xmjl_dj ,xmjl_zsbh ) as info ,page into %s.t_gg from %s.t_gg_src """ % (quyu, diqu, schema, schema)
    else:
        # Incremental: de-duplicate on guid and skip rows already in t_gg.
        sql = """ insert into %s.t_gg SELECT distinct on(guid) gg_file as guid,gg_name ,gg_href as href ,"public".exttime(gg_fabutime)::timestamp(0) as fabu_time ,public.ggtype_tran(ggtype) as ggtype ,'%s' as quyu ,'%s' as diqu,jytype , now()::timestamp(0) as create_time ,public.merge_info(xmjl, bm_endtime , tb_endtime , bzj_time , kb_time , pb_time , db_time ,zhongbiao_hxr,zhongbiaojia,kzj,bd_guid,bd_bh ,bd_name ,zbr ,zbdl ,xmjl_dj ,xmjl_zsbh ) as info ,page from %s.t_gg_src as b where not exists(select 1 from %s.t_gg as a where a.guid=b.gg_file) """ % (schema, quyu, diqu, schema, schema)
    print(sql)
    db_command(sql, dbtype="postgresql", conp=conp)
# Example usage (kept from original):
# conp=['postgres','since2015','192.168.4.200','zlsys','guangdongsheng_shenzhenshi']
# t_gg_update(conp,'zlsys_guangdongsheng_shenzhenshi','广东省深圳市')
def gg(conp, diqu, i=-1):
    """Populate the merged gg table from every *_gg source table (or just the i-th).

    i == -1 processes all tables; any other i processes only tables[i:i+1].
    """
    create_gg(conp)
    sql = """ select table_name from information_schema.tables where table_schema='%s' and table_name ~'_gg$' order by table_name """ % conp[4]
    tables = db_query(sql, conp=conp, dbtype="postgresql")['table_name'].tolist()
    selected = tables if i == -1 else tables[i:i + 1]
    for name in selected:
        insert_tb(name, diqu=diqu, conp=conp)
def est_gg(conp, **arg):
    """Merge every *_gg table into gg, tagging rows with arg['diqu'] (default '未知')."""
    diqu = arg["diqu"] if "diqu" in arg.keys() else "未知"
    create_gg(conp)
    sql = """ select table_name from information_schema.tables where table_schema='%s' and table_name ~'_gg$' order by table_name """ % conp[4]
    for name in db_query(sql, conp=conp, dbtype="postgresql")['table_name'].tolist():
        insert_tb(name, diqu=diqu, conp=conp)
def bujiu(tb):
    """Backfill: fetch pages for hrefs in wuhan.<tb> that are missing from wuhan.<tb>_html.

    Connection parameters are hard-coded to the wuhan scrapy4 database.
    """
    m = page()
    conp = ["postgres", "since2015", "192.168.3.171", "scrapy4", "wuhan"]
    sql = "select href from wuhan.%s where href not in(select href from wuhan.%s_html)" % (
        tb, tb)
    hrefs = db_query(sql, dbtype="postgresql", conp=conp)["href"].values
    # NOTE(review): `f` is resolved from module scope — confirm it is defined.
    m.write(num=20, arr=hrefs, f=f, conp=conp, tb="%s_html" % tb)
def template(tb, conp, size=None):
    """Fetch pages for all hrefs in <schema>.<tb>, optionally limited to *size* rows."""
    m = page()
    if size is not None:
        sql = "select href from %s.%s limit %d" % (conp[4], tb, size)
    else:
        sql = "select href from %s.%s " % (conp[4], tb)
    hrefs = db_query(sql, dbtype="postgresql", conp=conp)["href"].values
    # NOTE(review): `f` is resolved from module scope — confirm it is defined.
    m.write(num=20, arr=hrefs, f=f, conp=conp, tb="%s_html" % tb)
def html_template(tb, size=None):
    """Fetch pages for all hrefs in wuhan.<tb> (hard-coded wuhan connection)."""
    m = page()
    if size is not None:
        sql = "select href from wuhan.%s limit %d" % (tb, size)
    else:
        sql = "select href from wuhan.%s " % tb
    conp = ["postgres", "since2015", "192.168.3.171", "scrapy4", "wuhan"]
    hrefs = db_query(sql, dbtype="postgresql", conp=conp)["href"].values
    # NOTE(review): `f` is resolved from module scope — confirm it is defined.
    m.write(num=20, arr=hrefs, f=f, conp=conp, tb="%s_html" % tb)
def src_cdc1(shi, jytype, conp, date):
    """Incrementally download new rows, then join each gg_cdc table with its
    html_cdc partner into a t_gg_src_<j>_cdc staging table and drop the sources.

    conp: [user, passwd, host, db, schema]; `date` is the incremental start date
    passed to update().
    """
    user, passwd, host, db, schema = conp
    # Pulls the new rows into <schema>.*_cdc tables.
    update(shi, jytype, date, conp)
    print("增量下载到数据库")
    tbs = db_query(
        """select tablename from pg_tables where schemaname='%s' and tablename ~'gg_cdc|html_cdc' order by tablename """ % schema,
        dbtype="postgresql", conp=conp)['tablename'].values.tolist()
    print(tbs)
    # Relies on the sorted listing interleaving as (gg_cdc, html_cdc) pairs —
    # TODO confirm the naming convention keeps each pair adjacent.
    n = int(len(tbs) / 2)
    for i in range(n):
        j = i + 1
        tbname = 't_gg_src_%d_cdc' % j
        gg_tbname, html_tbname = tbs[2 * i], tbs[2 * i + 1]
        # Join announcement metadata with page html on gg_file = guid.
        sql = """drop table if exists %s.%s ; select a.*,b.page into %s.%s from %s.%s as a , %s.%s as b where a.gg_file=b.guid """ % (
            schema, tbname, schema, tbname, schema, gg_tbname, schema, html_tbname)
        db_command(sql, dbtype="postgresql", conp=conp)
        # The per-pair cdc tables are throwaway once merged.
        sql = "drop table if exists %s.%s;drop table if exists %s.%s;" % (
            schema, gg_tbname, schema, html_tbname)
        db_command(sql, dbtype="postgresql", conp=conp)
def copy(self, tb_src, tb_dst, coltext=None, **krg):
    """Copy table *tb_src* (on conp_src) into *tb_dst* (on conp_dst) through a
    Greenplum PXF JDBC external table.

    tb_src / tb_dst: 'schema.name' strings. Options in krg:
      distri        - distribution key for the destination table (None = default)
      pxf_partition - extra PXF partition clause appended to the LOCATION URL
      external_only - stop after creating the external table
      exclude_col   - comma-separated columns to omit from the destination
      rename        - {col: replacement-expression} used in the SELECT list
      drop          - drop (vs. truncate) the destination before loading
    """
    bg = time.time()
    # e.g. "&PARTITION_BY=year:int&RANGE=2011:2013&INTERVAL=1"
    para = {"distri": None, "pxf_partition": None, 'external_only': False, "exclude_col": None, "rename": {}, "drop": False}
    para.update(krg)
    user, passwd, host, db, schema = self.conp_src
    src_schema, src_name = tb_src.split('.')
    dst_schema, dst_name = tb_dst.split('.')
    # Drop a possibly pre-existing external table from an earlier run.
    sql = "select tablename from pg_tables where schemaname='%s' and tablename='%s_tmp' " % (dst_schema, dst_name)
    df = db_query(sql, dbtype="postgresql", conp=self.conp_dst)
    if not df.empty:
        sql = "drop external table %s.%s_tmp" % (dst_schema, dst_name)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    coltext = self.get_coltext(tb_src, coltext)
    # Parse "name type, name type, ..." — the lookahead keeps commas inside
    # parenthesized type args (e.g. numeric(10,2)) from splitting a column.
    col_type = [[re.split('\s+', w, 1)[0].strip(), re.split('\s+', w, 1)[1].strip()] for w in re.split(",(?!\s*[0-9\)])", coltext)]
    data1 = []  # external-table column list (json/array types demoted to text)
    data2 = []  # destination-table column list
    data3 = []
    arr = []  # column names for the INSERT target list
    brr = []  # expressions for the INSERT's SELECT list
    exclude_col = []
    rename = para['rename']
    if para['exclude_col'] is not None:
        exclude_col = [w.strip() for w in para['exclude_col'].split(',')]
    for w in col_type:
        # PXF cannot ship json/tsvector/array values natively: expose them as
        # text on the external table and cast back when inserting.
        if w[1] in ['json', 'tsvector', 'jsonb']:
            data1.append([w[0], 'text'])
        elif w[1].startswith('_'):
            data1.append([w[0], 'text'])
        else:
            data1.append(w)
        if w[0] not in exclude_col:
            data2.append(w)
            arr.append(w[0])
            if w[0] in rename.keys():
                brr.append(rename[w[0]])
            else:
                if w[1] in ['json', 'jsonb', 'tsvector']:
                    brr.append('%s::%s %s' % (w[0], w[1], w[0]))
                elif w[1].startswith('_'):
                    brr.append("%s::%s %s" % (w[0], w[1], w[0]))
                else:
                    brr.append(w[0])
    coltext_external = ','.join([' '.join(w1) for w1 in data1])
    coltext_dsttb = ','.join([' '.join(w1) for w1 in data2])
    # Create the external table on the destination cluster.
    partition_by = "" if para['pxf_partition'] is None else para['pxf_partition']
    sql = """ create external table %s.%s_tmp(%s) LOCATION ('pxf://%s.%s?PROFILE=JDBC&JDBC_DRIVER=org.postgresql.Driver&DB_URL=jdbc:postgresql://%s/%s&USER=%s&PASS=%s%s') FORMAT 
'CUSTOM' (FORMATTER='pxfwritable_import'); """ % (dst_schema, dst_name, coltext_external, src_schema, src_name, host, db, user, passwd, partition_by)
    print(sql)
    db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    # Optionally stop after the external table is in place.
    if para['external_only']:
        print("只生成外部表")
        return
    # Prepare the destination: drop, or truncate (NOTE(review): truncate fails
    # if the destination does not exist yet — confirm callers pre-create it).
    if para['drop']:
        sql = "drop table if exists %s.%s" % (dst_schema, dst_name)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    else:
        sql = "truncate %s.%s" % (dst_schema, dst_name)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    colnames = ','.join(arr)
    colnames1 = ','.join(brr)
    if para['distri'] is None:
        sql = """create table if not exists %s (%s ) """ % (tb_dst, coltext_dsttb)
    else:
        sql = """create table if not exists %s (%s ) distributed by(%s) """ % (tb_dst, coltext_dsttb, para['distri'])
    print(sql)
    db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    # Load: pull through the external table, casting text back to rich types.
    sql = """insert into %s(%s) select %s from %s_tmp """ % (tb_dst, colnames, colnames1, tb_dst)
    print(sql)
    db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    # Clean up the external table.
    sql = "select tablename from pg_tables where schemaname='%s' and tablename='%s_tmp' " % (dst_schema, dst_name)
    df = db_query(sql, dbtype="postgresql", conp=self.conp_dst)
    if not df.empty:
        sql = "drop external table %s.%s_tmp" % (dst_schema, dst_name)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=self.conp_dst)
    ed = time.time()
    cost = int(ed - bg)
    print("total--cost %d s" % cost)
def query(sql):
    """Run *sql* against the bundled sqlite cfg_db and return the result frame."""
    cfg_path = join(dirname(__file__), "cfg_db")
    return db_query(sql, dbtype='sqlite', conp=cfg_path)
def get_conp1(name):
    """Return the first cfg row for database *name* whose schema is 'public'."""
    cfg_path = join(dirname(__file__), "cfg_db")
    rows = db_query("select * from cfg where database='%s' and schema='public' " % name,
                    dbtype='sqlite', conp=cfg_path).values.tolist()
    return rows[0]
def get_coltext(self, tbname, coltext=None):
    """Build a 'col type, col type, ...' DDL fragment for *tbname* on conp_src.

    tbname: 'schema.name'. When *coltext* is supplied it is a comma-separated
    list of "name type" overrides; matching columns use the override verbatim.
    Types are normalized for the destination: char -> varchar(n), numeric keeps
    precision/scale when known, double/float -> float, integer -> int,
    timestamps collapse to timestamp(0); anything else passes through.
    """
    schema, name = tbname.split(".")
    sql = """select col.ordinal_position, col.column_name, col.udt_name as data_type, col.character_maximum_length, col.numeric_precision, col.numeric_scale, col.is_nullable, col.column_default from information_schema.columns as col where table_schema = '%s' and table_name = '%s' order by ordinal_position; """ % (schema, name)
    df = db_query(sql, dbtype="postgresql", conp=self.conp_src)
    data = []
    # Overrides: the lookahead in the split keeps commas inside parenthesized
    # type arguments (e.g. numeric(10,2)) from splitting an entry.
    if coltext is not None:
        coldict = {re.split('\s+', w, 1)[0]: re.split('\s+', w, 1)[1] for w in re.split(",(?!\s*[0-9\)])", coltext)}
    else:
        coldict = {}
    for i in range(len(df)):
        col_name, data_type, char_len, numeric_precision, numeric_scale = df.at[i, 'column_name'], df.at[i, 'data_type'], df.at[i, 'character_maximum_length'], df.at[i, 'numeric_precision'], df.at[i, 'numeric_scale']
        if col_name in coldict.keys():
            data.append("%s %s" % (col_name, coldict[col_name]))
            continue
        # udt_name of an array type starts with '_' — pass it through verbatim.
        if data_type.startswith('_'):
            data.append("%s %s" % (col_name, data_type))
            continue
        # char/varchar: keep the declared length when known (NaN means "none").
        if 'char' in data_type:
            s = "%s varchar" % (col_name) if char_len is None or str(char_len) == 'nan' else "%s varchar(%s)" % (col_name, str(int(char_len)))
            data.append(s)
            continue
        # numeric: keep (precision, scale) when the precision is known.
        if 'numeric' in data_type:
            s = "%s numeric" % (col_name) if numeric_precision is None or str(numeric_precision) == 'nan' else "%s numeric(%s,%s)" % (col_name, str(int(numeric_precision)), str(int(numeric_scale)))
            data.append(s)
            continue
        if 'double' in data_type or 'float' in data_type:
            s = "%s float" % (col_name)
            data.append(s)
            continue
        if data_type == 'integer':
            s = "%s int" % (col_name)
            data.append(s)
            continue
        if 'json' == data_type:
            s = "%s json" % (col_name)
            data.append(s)
            continue
        if 'jsonb' == data_type:
            s = "%s jsonb" % (col_name)
            data.append(s)
            continue
        # Any timestamp flavor is truncated to second precision.
        if 'timestamp' in data_type:
            s = "%s timestamp(0)" % (col_name)
            data.append(s)
            continue
        data.append("%s %s" % (col_name, data_type))
    coltext = ','.join(data)
    return coltext