Пример #1
0
def src_cdc2(conp):
    user, passwd, host, db, schema = conp
    tbs = db_query(
        """select tablename from pg_tables where schemaname='%s' and tablename ~'t_gg.*cdc' order by tablename """
        % schema,
        dbtype="postgresql",
        conp=conp)['tablename'].values.tolist()
    tbs1 = db_query(
        """select tablename from pg_tables where schemaname='%s'  order by tablename """
        % schema,
        dbtype="postgresql",
        conp=conp)['tablename'].values.tolist()
    for tbname in tbs:
        print("更新,-%s表插入到t_gg_src" % tbname)
        print(tbs1)
        if 't_gg_src' not in tbs1:
            sql = """
                select distinct on(gg_file) * into %s.t_gg_src from %s.%s 
                """ % (schema, schema, tbname)
        else:

            sql = """insert into %s.t_gg_src 
                select * from %s.%s as a where not exists( select 1 from %s.t_gg_src as b  where a.gg_file=b.gg_file )
                """ % (schema, schema, tbname, schema)
        print(sql)
        db_command(sql, dbtype="postgresql", conp=conp)
Пример #2
0
def get_conp(name,database=None):
    path1=join(dirname(__file__),"cfg_db")
    if database is None:
        df=db_query("select * from cfg where schema='%s' "%name,dbtype='sqlite',conp=path1)
    else:
        df=db_query("select * from cfg where schema='%s' and database='%s' "%(name,database),dbtype='sqlite',conp=path1)
    conp=df.values.tolist()[0]
    return conp
Пример #3
0
def work(conp, **krg):
    info = krg['info']
    para = {"zfcg_bdate": None, "gcjs_bdate": None}
    if info is not None:
        info = json.loads(info)
        para.update(info)

    zfcg_bdate = para['zfcg_bdate']
    gcjs_bdate = para['gcjs_bdate']

    quyu = conp[4]
    schemas = db_query(""" SELECT nspname FROM  pg_namespace """,
                       dbtype="postgresql",
                       conp=conp)['nspname'].values.tolist()
    if quyu not in schemas:
        db_command("""create schema if not exists %s """ % quyu,
                   dbtype="postgresql",
                   conp=conp)

    tables = db_query(
        """SELECT tablename FROM  pg_tables where schemaname='%s' """ % quyu,
        dbtype="postgresql",
        conp=conp)['tablename'].values.tolist()

    if 't_gg' not in tables: tag = None
    else: tag = 'cdc'

    for st in data:
        if tag is None:
            jytype = st[1]
            if jytype == 'zfcg':
                bdate = st[4] if zfcg_bdate is None else zfcg_bdate
            else:
                bdate = st[4] if gcjs_bdate is None else gcjs_bdate
            print(bdate)
        else:
            bdate = datetime.strftime(datetime.now() + timedelta(days=-2),
                                      '%Y-%m-%d')

        src_update_dates(st[0], st[1], conp, bdate)
        est_func
        print("t_gg_update")
        t_gg_update(conp, st[2], st[3])


# conp=['postgres','since2015','192.168.4.201','zlsys','zlsys_yunnan_dalizhou']

# work(conp,info=None)
Пример #4
0
def pg2csv(sql, conp, path, **krg):

    para = {"chunksize": 1000, "f": None}
    para.update(krg)
    chunksize = para["chunksize"]
    f = para["f"]
    krg1 = copy.deepcopy(para)
    for w in ["f"]:
        krg1.pop(w)
    con = create_engine("postgresql://%s:%s@%s/%s" %
                        (conp[0], conp[1], conp[2], conp[3]),
                        encoding='utf-8',
                        execution_options=dict(stream_results=True))
    dfs = pd.read_sql(sql, con, chunksize=chunksize)

    count = 1
    for df in dfs:

        total = count * chunksize
        print('第%d行写入中' % total)
        if f is not None:
            df = f(df)
        if count == 1:
            df.to_csv(path, index=False, **krg1)
        else:
            krg1['header'] = False
            df.to_csv(path, mode='a+', index=False, **krg1)
        count += 1
    if count == 1:
        print("df为空")
        df = db_query(sql, dbtype="postgresql", conp=conp)
        df.to_csv(path, index=False, **krg1)
Пример #5
0
def est_html_work(conp, f, **args):
    if "size" in args.keys():
        size = args["size"]
    else:
        size = None

    m = page()
    if size is not None:
        sql = "select distinct href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d" % (
            conp[4], size)
    else:
        sql = "select distinct href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' " % (
            conp[4])

    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values
    if "html_total" in args.keys():
        html_total = args["html_total"]
        arr = arr[:html_total]
    print(arr[:3])
    setting = {
        "num": 20,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": True
    }

    if "num" in args.keys():
        setting["num"] = args["num"]
    setting = {**setting, **args}
    m.write(**setting)
Пример #6
0
def gg_meta(conp, data, diqu, i=-1, headless=True):
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (
        conp[4])
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["table_name"].values

    work(conp, data, diqu, i, headless)
Пример #7
0
def est_html_cdc(conp, f, **args):
    m = page()
    sql = "select distinct href from %s.gg where href not in(select href from %s.gg_html ) and (not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')" % (
        conp[4], conp[4])

    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values
    if arr == []:
        print("无href更新")
        return None
    if "html_total" in args.keys():
        html_total = args["html_total"]
        arr = arr[:html_total]

    setting = {
        "num": 5,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": True
    }
    if "num" in args.keys():
        setting["num"] = args["num"]
    setting = {**setting, **args}
    if len(arr) > 2000 and setting['num'] < 20: setting["num"] = 20

    m.write(**setting)
Пример #8
0
def get_parames(conp, total=100):
    user, passwd, host, dbname, schema = conp
    sql1 = '''select qymc,biaoshitong from "%s"."jianshetong_exist_zz_tran"
EXCEPT select qymc,bst_zzcode from "%s"."jianshetong_biaoshitong_result" limit %s''' % (
        schema, schema, total)
    parames = db_query(sql1, dbtype="postgresql", conp=conp).values.tolist()

    return parames
Пример #9
0
def gg_existed(conp):
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (conp[4])
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["table_name"].values
    if "gg" in arr:
        return True
    else:
        return False
Пример #10
0
def get_data(qymc, zzcode, conp):
    sql = """SELECT zzcode FROM "public"."biaoshitong_exist_zz" where  entname = '%s';""" % qymc
    zzcode_list = db_query(sql, dbtype="postgresql",
                           conp=conp)['zzcode'].tolist()

    if zzcode in zzcode_list:
        return 1
    return 0
Пример #11
0
def gg_html(conp,f,headless=True):
    sql="""select table_name from information_schema.tables where table_schema='%s'"""%(conp[4])
    df=db_query(sql,dbtype="postgresql",conp=conp)
    arr=df["table_name"].values 

    if "gg_html" in arr:
        html_cdc(conp,f,headless=headless)
    else:
        html_work(conp,f,headless=headless)
Пример #12
0
def est_meta(conp, data, **arg):
    sql = """select table_name from information_schema.tables where table_schema='%s'""" % (conp[4])
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["table_name"].values

    if "gg" in arr:
        est_cdc(conp, data, **arg)
    else:
        est_work(conp, data, **arg)
Пример #13
0
def html_cdc(conp,f,headless=True):
    m=page()
    sql="select href from %s.gg where href not in(select href from %s.gg_html where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')"%(conp[4],conp[4]) 
    
    df=db_query(sql,dbtype="postgresql",conp=conp)
    arr=df["href"].values
    if arr==[]:
        print("无href更新")
        return None

    setting={"num":5,"arr":arr,"f":f,"conp":conp,"tb":"gg_html","headless":headless}
    m.write(**setting)
Пример #14
0
def html_work(conp,f,size=None,headless=True):
    m=page()
    if size is not None:
        sql="select href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d"%(conp[4],size) 
    else :
        sql="select href from %s.gg where not coalesce(info,'{}')::jsonb?'hreftype' or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' "%(conp[4])

    df=db_query(sql,dbtype="postgresql",conp=conp)
    arr=df["href"].values
    print(arr[:3])
    setting={"num":20,"arr":arr,"f":f,"conp":conp,"tb":"gg_html","headless":headless}
    m.write(**setting)
Пример #15
0
def t_gg_update(conp, quyu, diqu):

    user, passwd, host, db, schema = conp

    tbs1 = db_query(
        """select tablename from pg_tables where schemaname='%s'  order by tablename """
        % schema,
        dbtype="postgresql",
        conp=conp)['tablename'].values.tolist()

    print("更新,表插入到t_gg")
    print(tbs1)
    if 't_gg' not in tbs1:
        sql = """
SELECT  gg_file as guid,gg_name ,gg_href as href ,"public".exttime(gg_fabutime)::timestamp(0) as fabu_time ,public.ggtype_tran(ggtype) as ggtype

,'%s' as quyu ,'%s' as diqu,jytype

, now()::timestamp(0) as create_time

,public.merge_info(xmjl, bm_endtime ,  tb_endtime 

,  bzj_time , kb_time ,  pb_time
,  db_time ,zhongbiao_hxr,zhongbiaojia,kzj,bd_guid,bd_bh ,bd_name ,zbr  ,zbdl ,xmjl_dj  ,xmjl_zsbh 
) as info ,page into %s.t_gg
 from %s.t_gg_src 
    """ % (quyu, diqu, schema, schema)
    else:

        sql = """

insert into %s.t_gg
SELECT distinct on(guid) gg_file as guid,gg_name ,gg_href as href ,"public".exttime(gg_fabutime)::timestamp(0) as fabu_time ,public.ggtype_tran(ggtype) as ggtype

,'%s' as quyu ,'%s' as diqu,jytype

, now()::timestamp(0) as create_time

,public.merge_info(xmjl, bm_endtime ,  tb_endtime 

,  bzj_time , kb_time ,  pb_time
,  db_time ,zhongbiao_hxr,zhongbiaojia,kzj,bd_guid,bd_bh ,bd_name ,zbr  ,zbdl ,xmjl_dj  ,xmjl_zsbh 
) as info ,page 

from %s.t_gg_src  as b where not exists(select 1 from %s.t_gg as a where a.guid=b.gg_file)
            """ % (schema, quyu, diqu, schema, schema)
    print(sql)
    db_command(sql, dbtype="postgresql", conp=conp)


# conp=['postgres','since2015','192.168.4.200','zlsys','guangdongsheng_shenzhenshi']
# t_gg_update(conp,'zlsys_guangdongsheng_shenzhenshi','广东省深圳市')
Пример #16
0
def gg(conp, diqu, i=-1):
    create_gg(conp)
    sql = """
    select table_name from information_schema.tables where table_schema='%s' and table_name ~'_gg$' order by table_name
    """ % conp[4]

    df = db_query(sql, conp=conp, dbtype="postgresql")
    data = df['table_name'].tolist()
    if i == -1:
        data = data
    else:
        data = data[i:i + 1]
    for tbname in data:
        insert_tb(tbname, diqu=diqu, conp=conp)
Пример #17
0
def est_gg(conp, **arg):
    if "diqu" in arg.keys():
        diqu = arg["diqu"]
    else:
        diqu = "未知"
    create_gg(conp)
    sql = """
    select table_name from information_schema.tables where table_schema='%s' and table_name ~'_gg$' order by table_name
    """ % conp[4]

    df = db_query(sql, conp=conp, dbtype="postgresql")
    data = df['table_name'].tolist()

    for tbname in data:
        insert_tb(tbname, diqu=diqu, conp=conp)
Пример #18
0
def bujiu(tb):
    m = page()
    sql = "select href from wuhan.%s where href not in(select href from wuhan.%s_html)" % (
        tb, tb)
    conp = ["postgres", "since2015", "192.168.3.171", "scrapy4", "wuhan"]
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values

    setting = {
        "num": 20,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "%s_html" % tb
    }
    m.write(**setting)
Пример #19
0
def template(tb, conp, size=None):
    m = page()
    if size is not None:
        sql = "select href from %s.%s limit %d" % (conp[4], tb, size)
    else:
        sql = "select href from %s.%s " % (conp[4], tb)

    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values

    setting = {
        "num": 20,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "%s_html" % tb
    }
    m.write(**setting)
Пример #20
0
def html_template(tb, size=None):
    m = page()
    if size is not None:
        sql = "select href from wuhan.%s limit %d" % (tb, size)
    else:
        sql = "select href from wuhan.%s " % tb

    conp = ["postgres", "since2015", "192.168.3.171", "scrapy4", "wuhan"]
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values

    setting = {
        "num": 20,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "%s_html" % tb
    }
    m.write(**setting)
Пример #21
0
def src_cdc1(shi, jytype, conp, date):
    user, passwd, host, db, schema = conp
    update(shi, jytype, date, conp)

    print("增量下载到数据库")
    tbs = db_query(
        """select tablename from pg_tables where schemaname='%s' and tablename ~'gg_cdc|html_cdc' order by tablename """
        % schema,
        dbtype="postgresql",
        conp=conp)['tablename'].values.tolist()
    print(tbs)
    n = int(len(tbs) / 2)
    for i in range(n):
        j = i + 1
        tbname = 't_gg_src_%d_cdc' % j
        gg_tbname, html_tbname = tbs[2 * i], tbs[2 * i + 1]
        sql = """drop table if exists %s.%s ;
        select a.*,b.page into %s.%s from %s.%s as a , %s.%s as b  where  a.gg_file=b.guid """ % (
            schema, tbname, schema, tbname, schema, gg_tbname, schema,
            html_tbname)
        db_command(sql, dbtype="postgresql", conp=conp)
        sql = "drop table if exists %s.%s;drop table if exists %s.%s;" % (
            schema, gg_tbname, schema, html_tbname)
        db_command(sql, dbtype="postgresql", conp=conp)
Пример #22
0
    def copy(self,tb_src,tb_dst,coltext=None,**krg):
        #para={"distri":None,"pxf_partition":None,'external_only':False,"exclude_col":None,"rename":{},"drop":False}
        bg=time.time()
        #"&PARTITION_BY=year:int&RANGE=2011:2013&INTERVAL=1"
        para={"distri":None,"pxf_partition":None,'external_only':False,"exclude_col":None,"rename":{},"drop":False}
        para.update(krg)

        user,passwd,host,db,schema=self.conp_src
        src_schema,src_name=tb_src.split('.')
        dst_schema,dst_name=tb_dst.split('.')

        #删除可能存在的外部表
        sql="select tablename from pg_tables where schemaname='%s' and tablename='%s_tmp' "%(dst_schema,dst_name)
        df=db_query(sql,dbtype="postgresql",conp=self.conp_dst)
        if not df.empty:
            sql="drop external table %s.%s_tmp"%(dst_schema,dst_name)
            print(sql)
            db_command(sql,dbtype="postgresql",conp=self.conp_dst)


        coltext=self.get_coltext(tb_src,coltext)
        col_type=[[re.split('\s+',w,1)[0].strip(),re.split('\s+',w,1)[1].strip() ] for w in re.split(",(?!\s*[0-9\)])",coltext)]
        data1=[]#外部表
        data2=[]#目的地表
        data3=[]

        arr=[] #insert 处的内容
        brr=[] #insert 处的select 内容
        exclude_col=[]
        rename=para['rename']
        if para['exclude_col'] is not None:
            exclude_col=[ w.strip() for w in para['exclude_col'].split(',') ]
        for w in col_type:
            if w[1] in ['json','tsvector','jsonb']:
                data1.append([w[0],'text'])
            elif w[1].startswith('_'):
                data1.append([w[0],'text'])
            else:
                data1.append(w)
            if w[0] not in exclude_col:
                data2.append(w)
                arr.append(w[0])
                if w[0] in rename.keys():
                    brr.append(rename[w[0]])
                else:
                    if w[1] in ['json','jsonb','tsvector']:
                        brr.append('%s::%s %s'%(w[0],w[1],w[0]))
                    elif w[1].startswith('_'):
                        brr.append("%s::%s %s"%(w[0],w[1],w[0]))
                    else:
                        brr.append(w[0])

        coltext_external=','.join([ ' '.join(w1) for w1 in data1])
        coltext_dsttb=','.join([ ' '.join(w1) for w1 in data2])
        #dst里创建外部表
        partition_by="" if para['pxf_partition'] is None else para['pxf_partition']
        sql="""
        create  external table %s.%s_tmp(%s) 
        LOCATION ('pxf://%s.%s?PROFILE=JDBC&JDBC_DRIVER=org.postgresql.Driver&DB_URL=jdbc:postgresql://%s/%s&USER=%s&PASS=%s%s')
        FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');
        """%(dst_schema,dst_name,coltext_external,src_schema,src_name,host,db,user,passwd,partition_by)
        print(sql)
        db_command(sql,dbtype="postgresql",conp=self.conp_dst)


        ##只生成外部表
        if para['external_only']:
            print("只生成外部表")
            return 
        #select into 生成表
        if para['drop']:
            sql="drop table if exists  %s.%s"%(dst_schema,dst_name)
            print(sql)
            db_command(sql,dbtype="postgresql",conp=self.conp_dst)
        else:
            sql="truncate  %s.%s"%(dst_schema,dst_name)
            print(sql)
            db_command(sql,dbtype="postgresql",conp=self.conp_dst)


        colnames=','.join(arr)
        colnames1=','.join(brr)
        if para['distri'] is None:
            sql="""create table if not exists %s (%s ) """%(tb_dst,coltext_dsttb)
        else:
            sql="""create table if not exists %s (%s ) distributed by(%s) """%(tb_dst,coltext_dsttb,para['distri'])
        print(sql)
        db_command(sql,dbtype="postgresql",conp=self.conp_dst)
        
        sql="""insert into  %s(%s) select %s from %s_tmp """%(tb_dst,colnames,colnames1,tb_dst)
        print(sql)
        db_command(sql,dbtype="postgresql",conp=self.conp_dst)


        #删除外部表
        sql="select tablename from pg_tables where schemaname='%s' and tablename='%s_tmp' "%(dst_schema,dst_name)
        df=db_query(sql,dbtype="postgresql",conp=self.conp_dst)
        if not df.empty:
            sql="drop external table %s.%s_tmp"%(dst_schema,dst_name)
            print(sql)
            db_command(sql,dbtype="postgresql",conp=self.conp_dst)

        ed=time.time()
        cost=int(ed-bg)
        print("total--cost %d s"%cost)
Пример #23
0
def query(sql):
    path1=join(dirname(__file__),"cfg_db")
    df=db_query(sql,dbtype='sqlite',conp=path1)
    return df 
Пример #24
0
def get_conp1(name):
    path1=join(dirname(__file__),"cfg_db")

    df=db_query("select * from cfg where database='%s' and schema='public' "%name,dbtype='sqlite',conp=path1)
    conp=df.values.tolist()[0]
    return conp
Пример #25
0
    def get_coltext(self,tbname,coltext=None):
        schema,name=tbname.split(".")
        sql="""select
            col.ordinal_position,
            col.column_name,
            col.udt_name as data_type,
            col.character_maximum_length,
            col.numeric_precision,
            col.numeric_scale,
            col.is_nullable,
            col.column_default
            from
            information_schema.columns as col 
            where
            table_schema = '%s'
            and table_name = '%s'
            order by
            ordinal_position;
            """%(schema,name)
        df=db_query(sql,dbtype="postgresql",conp=self.conp_src)
        data=[]
        if coltext is not None:  
            coldict={ re.split('\s+',w,1)[0]:re.split('\s+',w,1)[1]  for w in re.split(",(?!\s*[0-9\)])",coltext)}
        else:
            coldict={}

        for i in range(len(df)):
            col_name,data_type,char_len,numeric_precision,numeric_scale=df.at[i,'column_name'],df.at[i,'data_type'],df.at[i,'character_maximum_length'],df.at[i,'numeric_precision'],df.at[i,'numeric_scale']
            if col_name in coldict.keys():
                data.append("%s %s"%(col_name,coldict[col_name]))
                continue 
            if data_type.startswith('_'):
                data.append("%s %s"%(col_name,data_type))
                continue 
            if 'char' in data_type:
                s="%s varchar"%(col_name) if char_len is None or str(char_len)=='nan' else "%s varchar(%s)"%(col_name,str(int(char_len)))
                data.append(s)
                continue 
            if 'numeric' in data_type:
                s="%s numeric"%(col_name) if numeric_precision is None or  str(numeric_precision)=='nan' else "%s numeric(%s,%s)"%(col_name,str(int(numeric_precision)),str(int(numeric_scale)) )
                data.append(s)
                continue 
            if 'double' in data_type or  'float' in data_type:
                s="%s float"%(col_name) 
                data.append(s)
                continue
            if data_type=='integer':
                s="%s int"%(col_name) 
                data.append(s)
                continue
            if 'json'==data_type:
                s="%s json"%(col_name) 
                data.append(s)
                continue
            if 'jsonb'==data_type:
                s="%s jsonb"%(col_name) 
                data.append(s)
                continue
            if 'timestamp' in data_type:
                s="%s timestamp(0)"%(col_name) 
                data.append(s)
                continue
            
            data.append("%s %s"%(col_name,data_type))

        coltext=','.join(data)

        return coltext