예제 #1
0
    def get_xmjl_detail(self, detail_page, conp, datalist, driver, tablename):
        """Visit every detail link in ``datalist`` and append the scraped rows to ``tablename``.

        :param detail_page: page object supplying the XPath expressions
            (``get_datas_xpaht()``, ``ggname_texts``, ``zbtime_texts``).
        :param conp: connection parameters, handed to ``db_write`` unchanged.
        :param datalist: iterable of ``(shi_text, entname, name, href)`` tuples.
        :param driver: browser driver used to load each ``href``.
        :param tablename: target table; rows are appended to it.
        """
        rows = []
        for shi_text, entname, name, href in datalist:
            driver.get(href)
            tree = etree.HTML(driver.page_source)
            # The first matched node is dropped (presumably a header row --
            # TODO confirm against the page layout).
            for node in tree.xpath(detail_page.get_datas_xpaht())[1:]:
                ggname = node.xpath(detail_page.ggname_texts)[0].strip()
                zbtime = node.xpath(detail_page.zbtime_texts)[0].strip()
                rows.append([href, shi_text, entname, name, ggname, zbtime])

        print(rows)
        column_names = [
            "href", "shi_text", "entname", "name", "ggname", "zbtime"
        ]
        df = pd.DataFrame(data=rows, columns=column_names)
        db_write(df,
                 tablename,
                 dbtype='postgresql',
                 datadict='postgresql-text',
                 conp=conp,
                 if_exists='append')
예제 #2
0
    def write(self, **krg):
        """Collect a frame through ``self.getdf`` and store it with ``db_write``.

        Required keys in ``krg``: ``url``, ``f1``, ``f2``, ``tb`` (target table)
        and ``col`` (column names assigned to the frame).
        Optional keys: ``total`` and ``num`` (default ``None``), ``dbtype``
        (default ``"postgresql"``) and ``conp`` (defaults to the local
        connection settings below).
        """
        url, f1, f2 = krg["url"], krg["f1"], krg["f2"]
        tb, col = krg["tb"], krg["col"]
        total = krg.get("total")
        num = krg.get("num")
        dbtype = krg.get("dbtype", "postgresql")
        conp = krg.get(
            "conp",
            ["postgres", "since2015", "127.0.0.1", "postgres", "public"])

        df = self.getdf(url, f1, f2, total, num)
        # A frame with zero or one row is replaced by an empty frame.
        # NOTE(review): a single-row result is discarded too -- confirm intent.
        if len(df) <= 1:
            df = pd.DataFrame(columns=col)
            print("暂无数据")
        else:
            print(url)
            df.columns = col
        db_write(df, tb, dbtype=dbtype, conp=conp)
예제 #3
0
def work(conp):
    """Open the ggzy.chuzhou.gov.cn search listing and store the result of ``f1``.

    :param conp: connection parameters, handed to ``db_write`` unchanged.
    """
    listing_url = (
        "https://ggzy.chuzhou.gov.cn/Front_jyzx/ShowInfo/ShowSearchInfo.aspx?CategoryNum=002008001001&Eptr3=&datefrom=&dateto=&xiaqu=&zbfs=&Paging=1"
    )
    browser = init_driver()
    browser.get(listing_url)
    # The meaning of the second argument (5) is defined by f1, outside this view.
    frame = f1(browser, 5)
    db_write(frame, 'anhui_chuzhou_ggzy_gg', dbtype="postgresql", conp=conp)
예제 #4
0
def work(conp):
    """Open the e-qyzc.com announcement list and store the result of ``f1``.

    :param conp: connection parameters, handed to ``db_write`` unchanged.
    """
    target_table = 'guizhou_guizhousheng_qita_gg'
    browser = init_driver()
    browser.get("http://www.e-qyzc.com/gg/ggList")
    # The meaning of the second argument (5) is defined by f1, outside this view.
    frame = f1(browser, 5)
    db_write(frame, target_table, dbtype="postgresql", conp=conp)
예제 #5
0
def work(conp):
    """Open the szzfcg.cn topic view and store the result of ``f1``.

    :param conp: connection parameters, handed to ``db_write`` unchanged.
    """
    topic_url = (
        "http://www.szzfcg.cn/portal/topicView.do?method=view&id=1660&agencyType=1"
    )
    target_table = 'guangdong_shenzhen_zfcg_gg'
    browser = init_driver()
    browser.get(topic_url)
    # The meaning of the second argument (5) is defined by f1, outside this view.
    frame = f1(browser, 5)
    db_write(frame, target_table, dbtype="postgresql", conp=conp)
예제 #6
0
def work(conp):
    """Open the liaoyang.gov.cn purchase list and store the result of ``f1``.

    :param conp: connection parameters, handed to ``db_write`` unchanged.
    """
    purchase_list_url = (
        "http://www.liaoyang.gov.cn/OpenData/opendata/ggzy/list/PurchaseList1.html"
    )
    target_table = 'liaoning_liaoyang_ggzy_gg'
    browser = init_driver()
    browser.get(purchase_list_url)
    # The meaning of the second argument (5) is defined by f1, outside this view.
    frame = f1(browser, 5)
    db_write(frame, target_table, dbtype="postgresql", conp=conp)
예제 #7
0
    def write(self, **krg):
        """Collect a frame through ``self.getdf`` and store it with ``db_write``.

        Required keys in ``krg``: ``url``, ``f1``, ``f2``, ``tb`` (target table)
        and ``col`` (column names assigned to the frame).
        Optional keys: ``total`` and ``num`` (default ``None``), ``dbtype``
        (default ``"postgresql"``), ``conp`` (defaults to the local connection
        settings below), and three settings stored on the instance before the
        fetch: ``headless`` (default ``True``), ``pageloadstrategy`` (default
        ``'normal'``) and ``pageloadtimeout`` (default ``40``).
        """
        url, f1, f2 = krg["url"], krg["f1"], krg["f2"]
        tb, col = krg["tb"], krg["col"]
        total = krg.get("total")
        num = krg.get("num")
        dbtype = krg.get("dbtype", "postgresql")
        conp = krg.get(
            "conp",
            ["postgres", "since2015", "127.0.0.1", "postgres", "public"])

        # Settings are stored on the instance before getdf is called.
        self.headless = krg.get("headless", True)
        self.pageloadstrategy = krg.get("pageloadstrategy", 'normal')
        self.pageloadtimeout = krg.get("pageloadtimeout", 40)

        print("%s 开始" % tb)
        df = self.getdf(url, f1, f2, total, num)
        # A frame with zero or one row is replaced by an empty frame.
        # NOTE(review): a single-row result is discarded too -- confirm intent.
        if len(df) <= 1:
            df = pd.DataFrame(columns=col)
            print("暂无数据")
        else:
            print(url)
            df.columns = col
        db_write(df, tb, dbtype=dbtype, conp=conp)
예제 #8
0
def write_gg(path, conp, tbname, jytype=None):
    """Load the first ``*csv`` file found in ``path`` into ``tbname`` in chunks.

    The file is read with ``\\001`` as separator and ``\\002`` as quote
    character, 1000 rows at a time. Every chunk gets the fixed column names
    below plus a ``jytype`` column, and all columns are written as ``TEXT``.

    :param path: directory that contains the CSV export.
    :param conp: connection parameters, handed to ``db_write`` unchanged.
    :param tbname: target table name.
    :param jytype: transaction type; the codes ``'gcjs'`` and ``'zfcg'`` are
        expanded to their Chinese labels, any other value is stored as given.
    :raises FileNotFoundError: if ``path`` contains no entry ending in ``csv``.
    """
    if jytype == 'gcjs':
        jytype = "工程建设"
    if jytype == 'zfcg':
        jytype = "政府采购"

    # Only the first matching entry is used, in os.listdir order.
    csv_name = next(
        (entry for entry in os.listdir(path) if entry.endswith('csv')), None)
    if csv_name is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise FileNotFoundError("no csv file found in %s" % path)

    # NOTE(review): 'pb_time' / 'pb_time_src' appear twice in this list; the
    # second pair is probably meant to be a different field -- confirm against
    # the export format. Kept as-is because the intended names are unknown.
    column_names = [
        'bd_guid', 'bd_bh', 'bd_name', 'zbr', 'zbdl', 'xmjl', 'xmjl_dj',
        'xmjl_zsbh', 'bm_endtime', 'bm_endtime_src', 'tb_endtime',
        'tb_endtime_src', 'bzj_time', 'bzj_time_src', 'kb_time',
        'kb_time_src', 'pb_time', 'pb_time_src', 'db_time', 'db_time_src',
        'pb_time', 'pb_time_src', 'zhongbiao_hxr', 'zhongbiao_hxr_src',
        'kzj', 'kzj_src', 'zhongbiaojia', 'zhongbiaojia_src', 'bd_dizhi',
        'diqu', 'ggtype', 'gg_name', 'gg_fabutime', 'gg_file',
        'gg_fujian_file', 'gg_href'
    ]

    dfs = pd.read_csv(os.path.join(path, csv_name),
                      sep='\001',
                      quotechar='\002',
                      chunksize=1000)

    for count, df in enumerate(dfs, start=1):
        df.columns = list(column_names)
        df['jytype'] = jytype
        datadict = {name: TEXT() for name in df.columns}

        if count == 1:
            # First chunk: if_exists is left to db_write's default.
            db_write(df,
                     tbname,
                     dbtype='postgresql',
                     conp=conp,
                     datadict=datadict)
        else:
            db_write(df,
                     tbname,
                     dbtype='postgresql',
                     conp=conp,
                     if_exists='append',
                     datadict=datadict)
            print("写入第%d " % count)
0
    def get_jst_qyzz(self,data_page,conp,total_ye,gotoyema,shengfen,yewu,tablename,data,shi=None,):
        """Scrape company qualifications from one result page and append them to ``tablename``.

        Applies the business/province/(optional) city filters on ``data_page``,
        runs the search, moves to page ``gotoyema`` when the current page
        differs, then records one ``[qyname, qyzzname]`` row per qualification
        of every company listed.

        :param data_page: page object providing the filter actions and XPaths.
        :param conp: connection parameters, handed to ``db_write`` unchanged.
        :param total_ye: not used by this method.
        :param gotoyema: page number to scrape.
        :param shengfen: province passed to ``select_shengfen``.
        :param yewu: business type passed to ``select_yewu``.
        :param tablename: target table; rows are appended to it.
        :param data: list that receives the rows; it is mutated in place and
            its whole content is written, including rows already in it.
        :param shi: optional city passed to ``select_shi``.
        """
        # Select the company-qualification business type.
        data_page.select_yewu(yewu)
        # Select the province.
        data_page.select_shengfen(shengfen)
        # Select the city, when one was given.
        if shi is not None:
            data_page.select_shi(shi)

        # Click the search button and give the page time to load.
        data_page.click_search_button()
        sleep(3)

        # Turn to the requested page if we are not already on it.
        cur_yema = data_page.get_cur_yema()
        if int(cur_yema) != int(gotoyema):
            data_page.goto_yema(gotoyema)
            sleep(3)

        page = self.driver.page_source
        body = etree.HTML(page)
        # All companies on the current page.
        qys = body.xpath(data_page.get_qys_xpath())
        for qy in qys:
            qyname = qy.xpath(data_page.get_entname_xpath())[0].strip()
            print(qyname)
            # Click the "more" button.
            # NOTE(review): `qy` belongs to the tree parsed before this click,
            # so anything the click reveals in the browser is not visible to
            # the XPath queries below -- confirm this is intended.
            data_page.click_gengduo_button()
            sleep(2)

            # All qualifications of this one company.
            oneqy_all_qyzzs = qy.xpath(data_page.get_oneqy_allqyzz())
            print(','.join(str(s) for s in oneqy_all_qyzzs if s not in [None]))
            for qyzz in oneqy_all_qyzzs:
                qyzzname = qyzz.xpath(data_page.get_qyzzname_xpath())[0].strip()
                print(qyname)
                print(qyzzname)
                tmp = [qyname, qyzzname]
                data.append(tmp)
                print(data)
        print(data)
        df = pd.DataFrame(data=data, columns=["qyname", "qyzzname"])
        db_write(df, tablename, dbtype='postgresql', datadict='postgresql-text', conp=conp, if_exists='append')
예제 #10
0
def work(conp):
    """Look up a result for every parameter pair and append all of them to the result table.

    Each item returned by ``get_parames(conp)`` supplies a company name at
    index 0 and a qualification code at index 1. Lookups are spaced by a
    0.1 second pause.

    :param conp: connection parameters, used for the lookups and the write.
    """
    records = []
    for param in get_parames(conp):
        time.sleep(0.1)
        qymc, zzcode = param[0], param[1]
        row = [qymc, zzcode, get_data(qymc, zzcode, conp)]
        print(row)
        records.append(row)

    frame = pd.DataFrame(data=records,
                         columns=["qymc", "bst_zzcode", "result"])
    db_write(frame,
             "jianshetong_biaoshitong_result",
             dbtype='postgresql',
             conp=conp,
             if_exists='append')
예제 #11
0
def read_excel_2_db(conp, filename, table_name, if_exists="replace"):
    """
    Import sheet ``Sheet1`` of an Excel workbook into a PostgreSQL table.

    :param conp: connection parameters, handed to ``db_write`` unchanged.
    :param filename: path of the workbook.
    :param table_name: name of the target table.
    :param if_exists: ``replace`` to overwrite the table, ``append`` to add to it.
    :return: None
    """
    frame = read_excel(filename,
                       sheet_name='Sheet1',
                       converters={'person_key': str})

    # Every column is cast to object dtype before the write.
    for name in frame.columns:
        frame[name] = frame[name].astype(object)

    write_options = dict(dbtype="postgresql",
                         datadict='postgresql-text',
                         conp=conp,
                         if_exists=if_exists)
    db_write(frame, table_name, **write_options)
    print("导入成功")
예제 #12
0
def pg2pg(sql,
          tb,
          conp1,
          conp2,
          chunksize=100,
          f=None,
          if_exists='replace',
          datadict='postgresql-text'):
    """Copy the result of ``sql`` from one PostgreSQL database into table ``tb`` of another.

    Rows are streamed from the source in chunks. The first chunk written
    successfully uses ``if_exists``; every later chunk is appended. A chunk
    that fails is reported with a traceback and skipped, so the copy is
    best-effort.

    :param sql: query run against the source database.
    :param tb: target table name.
    :param conp1: source connection parameters; indexes 0-3 are used as
        user, password, host and database.
    :param conp2: target connection parameters, handed to ``db_write``.
    :param chunksize: number of rows per chunk.
    :param f: optional callable applied to every chunk before it is written.
    :param if_exists: mode for the first chunk.
    :param datadict: type mapping handed to ``db_write``.
    """
    conp = conp1
    con = create_engine("postgresql://%s:%s@%s/%s" %
                        (conp[0], conp[1], conp[2], conp[3]),
                        encoding='utf-8',
                        execution_options=dict(stream_results=True))
    dfs = pd.read_sql(sql, con, chunksize=chunksize)
    count = 1
    for df in dfs:
        try:
            total = count * chunksize
            print('第%d行写入中' % total)
            if f is not None:
                df = f(df)
            # `count` only advances after a successful write, so if the first
            # chunk fails the next one still uses the caller's `if_exists`.
            mode = if_exists if count == 1 else 'append'
            db_write(df,
                     tb,
                     dbtype="postgresql",
                     conp=conp2,
                     if_exists=mode,
                     datadict=datadict)
            count += 1
        except Exception:
            # Keep going on per-chunk failures, but no longer swallow
            # KeyboardInterrupt / SystemExit as the bare `except:` did.
            traceback.print_exc()
예제 #13
0
def write_html(path, conp, tbname):
    """Store every ``*html`` file found in ``path`` as a ``(guid, page)`` row in ``tbname``.

    ``guid`` is the file name without its last five characters and ``page``
    is the file content read as UTF-8. The table is replaced when the first
    directory entry is processed; later writes append: one after every 1000th
    directory entry, and one for whatever is left at the end.

    :param path: directory to scan.
    :param conp: connection parameters, handed to ``db_write`` unchanged.
    :param tbname: target table name.
    """

    def flush(rows, mode):
        # Write the pending rows as a two-column, all-TEXT frame.
        frame = pd.DataFrame(data=rows, columns=['guid', 'page'])
        db_write(frame,
                 tbname,
                 dbtype='postgresql',
                 conp=conp,
                 if_exists=mode,
                 datadict={"guid": TEXT(), 'page': TEXT()})

    pending = []
    # The position counts every directory entry, not only the html files.
    for position, entry in enumerate(os.listdir(path), start=1):
        if entry.endswith('html'):
            with open(os.path.join(path, entry), 'r', encoding='utf8') as fh:
                pending.append([entry[:-5], fh.read()])

        if position == 1:
            flush(pending, 'replace')
            pending = []
        elif position % 1000 == 0:
            flush(pending, 'append')
            pending = []
            print("写入1000")

    flush(pending, 'append')
예제 #14
0
    def get_jst_ryzz(
        self,
        jst_ryzz_page,
        conp,
        data,
        total_ye,
        shengfen,
        yewu,
        tablename,
        shi=None,
    ):
        """Scrape personnel qualifications category by category and append them to ``tablename``.

        For every qualification category (the first two matches of the
        category XPath are skipped) the category is selected, the search is
        run, and pages ``1 .. total_ye - 1`` are scraped. Each page's
        ``[entname, name, ryzz]`` rows are appended to the table as soon as the
        page has been parsed.

        :param jst_ryzz_page: page object providing the actions and XPaths.
        :param conp: connection parameters, handed to ``db_write`` unchanged.
        :param data: list that accumulates every scraped row (mutated in place).
        :param total_ye: exclusive upper bound of the page numbers visited.
        :param shengfen: province passed to ``select_shengfen``.
        :param yewu: not used by this method.
        :param tablename: target table; rows are appended to it.
        :param shi: optional city passed to ``select_shi``.
        """
        # Select the personnel-qualification business type.
        jst_ryzz_page.select_yewu()
        # Select the province.
        jst_ryzz_page.select_shengfen(shengfen)
        # Select the city, when one was given.
        if shi is not None:
            jst_ryzz_page.select_shi(shi)

        body = etree.HTML(self.driver.page_source)
        # All qualification categories; the category index passed to the page
        # object starts at 2, matching the two skipped leading entries.
        ryzzs = body.xpath(jst_ryzz_page.get_ryzzs_xpath())[2:]
        for ryzzlb_count, _ in enumerate(ryzzs, start=2):
            # Select the category, then run the search.
            jst_ryzz_page.select_ryzzlb(ryzzlb_count)
            sleep(5)
            jst_ryzz_page.click_search_button()
            sleep(3)

            # NOTE(review): range(1, total_ye) never visits page `total_ye`
            # itself -- confirm whether the last page should be included.
            for gotoyema in range(1, total_ye):
                cur_yema = jst_ryzz_page.get_cur_yema()
                print(cur_yema)
                print(gotoyema)
                # Turn the page only when we are not already on it.
                if int(cur_yema) != int(gotoyema):
                    jst_ryzz_page.goto_yema2(gotoyema)
                    sleep(3)

                body = etree.HTML(self.driver.page_source)
                page_rows = []
                for content in body.xpath(jst_ryzz_page.get_datas_xpath()):
                    entname = content.xpath(
                        jst_ryzz_page.get_entname_xpath())[0].strip()
                    name = content.xpath(
                        jst_ryzz_page.get_name_xpath())[0].strip()
                    ryzz = content.xpath(
                        jst_ryzz_page.get_ryzz_xpath())[0].strip()
                    page_rows.append([entname, name, ryzz])

                # The caller-supplied list still accumulates everything.
                data.extend(page_rows)
                print(data)
                # Write only this page's rows: appending the whole accumulated
                # list on every page inserted earlier rows again and again.
                df = pd.DataFrame(data=page_rows,
                                  columns=["entname", "name", "ryzz"])
                db_write(df,
                         tablename,
                         dbtype='postgresql',
                         datadict='postgresql-text',
                         conp=conp,
                         if_exists='append')
예제 #15
0
def csv2pg(path, conp, **krg):
    """Load a CSV file into a PostgreSQL table chunk by chunk.

    Recognised keyword options (anything else is forwarded to
    ``pd.read_csv``):

    * ``chunksize`` -- rows per chunk, default 1000 (also given to ``read_csv``)
    * ``sep`` -- field separator, default ``"\\001"`` (also given to ``read_csv``)
    * ``tb`` -- target table, default: the file name with ``.csv`` removed
    * ``f`` -- optional callable applied to every chunk before writing
    * ``if_exists`` -- mode for the first chunk, default ``"replace"``;
      later chunks are always appended
    * ``datadict`` -- type mapping handed to ``db_write``, default
      ``"postgresql-text"``

    :param path: path of the CSV file.
    :param conp: connection parameters, handed to ``db_write`` unchanged.
    """
    para = {
        "chunksize": 1000,
        "tb": os.path.split(path)[1].replace('.csv', ''),
        "f": None,
        "if_exists": "replace",
        "sep": "\001",
        "datadict": "postgresql-text"
    }
    para.update(krg)

    chunksize = para['chunksize']
    f = para['f']
    if_exists = para['if_exists']
    datadict = para["datadict"]
    tb = para['tb']

    # Everything that is not one of our own options goes to read_csv.
    read_options = copy.deepcopy(para)
    for own_option in ['datadict', 'f', 'if_exists', 'tb']:
        read_options.pop(own_option)
    dfs = pd.read_csv(path, **read_options)

    for count, df in enumerate(dfs, start=1):
        total = count * chunksize
        print('第%d行写入中' % total)
        if f is not None:
            df = f(df)
        # The same datadict is used for every chunk; the append branch used to
        # omit it, so later chunks were written with db_write's default
        # mapping instead of the one the table was created with.
        db_write(df,
                 tb,
                 dbtype="postgresql",
                 conp=conp,
                 if_exists=if_exists if count == 1 else 'append',
                 datadict=datadict)


# sql="select * from hefei.gg limit 100"
# conp=["postgres",'since2015','192.168.4.175','anhui','hefei']
# path="d:\\test.csv"

# def f1(df):
#     df['name']='xx'
#     return df
# pg2csv(sql,conp,path)
# def f1(df):
#     df['name']='xx'
#     return df

# conp1=["postgres",'since2015','192.168.4.175','anhui','hefei']

# conp2=["postgres",'since2015','192.168.4.175','mine','hunan']

# pg2pg("select * from hefei.gg limit 1000",'test',conp1,conp2,f=f1)
# path="D:/webroot/bstdata/base_20190421.csv"
# conp=["gpadmin","since2015",'192.168.4.179',"base_db","v2"]
# sql="select distinct on(html_key) * from v2.t_gg where ggstart_time>='2019-04-21' and ggstart_time<'2020-05-20' and html_key>7923769 "
# dfs=pg2csv(sql,conp,path,10,sep='\001')