Пример #1
0
def getMainHTML(target,mf_tpe,start_page=358):
    """Crawl the paginated listing at ``target`` and store unseen records.

    The first request is only used to read the page count from the
    ``div.paging`` element.  Every page from ``start_page`` up to that count
    is then fetched, and for each table row except the first one the row's
    link is looked up with ``db_kit.findMFOnByUrl``:

    * not found -> the detail page is fetched with ``getDetailHtml`` and a new
      ``db_kit.Mf`` record is inserted, followed by a 0.5 s pause;
    * found but with an empty ``reg_name`` -> the link text is stored as the
      name and the record is updated;
    * otherwise -> the row is reported as already present.

    When all pages are done a notification is sent through
    ``send_email.sendEmail``.

    Args:
        target: URL of the listing page.
        mf_tpe: Value stored in ``Mf.type`` and passed to the URL lookup.
        start_page: First page number to crawl.  Defaults to 358, the value
            that used to be hard-coded in the loop.
    """

    def _clean(value):
        # Remove CRLF pairs and tabs, then trim surrounding whitespace.
        return str(value).replace("\r\n", "").replace("\t", "").strip()

    res = requests.get(target, params={"currentPage": 1, "type": "2"},
                       headers=random.sample(headers, 1)[0])
    soup = BeautifulSoup(res.content, 'lxml')
    page = soup.find('div', class_='paging').find_all('span')
    # Raw string: '\D' inside a plain literal is an invalid escape sequence.
    recordCount = re.sub(r'\D', "", page[1].get_text())
    # The first digit is dropped before the remainder is read as the page count.
    # NOTE(review): presumably the span contains the current page number (a
    # single digit on page 1) followed by the total -- confirm against the site.
    recordCount = recordCount[1:]

    for i in range(start_page, int(recordCount) + 1):
        print("爬取第 %d 页的数据"%(i) )
        res = requests.get(target, params={"currentPage": i, "type": "2"},
                           headers=random.sample(headers, 1)[0])
        if res.status_code != 200 or res.text.find("400 Bad Request") != -1:
            logging.error("爬行目标  %s 出现解析错误"%(target))
            continue

        table = BeautifulSoup(res.content, 'lxml').find('table')
        if table is None:
            # A page without a table used to abort the whole crawl with an
            # AttributeError; treat it like any other unparsable page.
            logging.error("爬行目标  %s 出现解析错误"%(target))
            continue

        # The first row is skipped, exactly as before.
        for tr in table.find_all('tr')[1:]:
            a = tr.find("a")
            if a is None or not a.get("href"):
                # Without a link there is no URL to key the record on.
                continue
            url = a["href"]
            txt = a.get_text()

            ret = db_kit.findMFOnByUrl(url, mf_tpe)
            if ret is None:
                info = getDetailHtml(url)
                if not info or len(info) < 12:
                    # Fields 0..11 are read below; a short result would raise
                    # IndexError and stop the crawl.
                    logging.error(url + '返回的内容不正确。没有解析出正确内容')
                    continue
                mf = db_kit.Mf()
                mf.type = mf_tpe
                mf.catalog = _clean(info[0])
                mf.reg_time = info[1]
                mf.reg_org = info[2]
                mf.reg_num = info[3]
                mf.legal = info[4]
                mf.mng_unit = _clean(info[5])
                mf.expiry_date = _clean(info[6])
                mf.scope = info[7]
                mf.ads = info[8]
                mf.zip_code = info[9]
                mf.tel = info[10]
                mf.phone = info[11]
                mf.url = url
                mf.reg_name = _clean(txt)
                db_kit.insert(mf)
                # Pause after each detail fetch + insert.
                time.sleep(0.5)
            elif ret and not ret.reg_name:
                print("执行了更新操作")
                ret.reg_name = txt
                db_kit.update(ret)
            else:
                print("数据已经存在, %s "%(ret.reg_name))

    send_email.sendEmail("程序完成,快去查看")
Пример #2
0
def WAndRRbcc():
    """Import rows 1..698 of the second sheet of the workbook into ``INOCC``.

    Each row can contribute up to three records, all tagged with the
    insurance key ``'iorbcc'``:

    * column 0, when non-empty: the first two characters are the code and
      the rest is the name;
    * column 1, when non-empty: the first four characters are the code and
      the rest is the name; the parent is the current column-0 code;
    * column 2, when it starts with ``0``, ``1`` or ``2``: the first six
      characters are the code and the rest is the name (prefixed with the
      current header, if any); column 3 supplies ``type``.

    Any other column-2 text becomes the header for following rows, except
    text starting with ``注:``, which skips the rest of the row.
    """
    book = xlrd.open_workbook('/Users/yuhaihui8913/Documents/wh/人保财产.xls')
    sheet = book.sheet_by_index(1)
    insurance = 'iorbcc'

    top_code = ''
    mid_code = ''
    prefix = ''
    prev_top = ''

    def raw(row, column):
        # Cell value as text, read at the point of use.
        return str(sheet.cell(row, column).value)

    def squeeze(text):
        return text.replace(' ', '')

    for row in range(1, 699):
        first = raw(row, 0).strip()
        if first:
            top_code = first[:2]
            # A different top-level code invalidates the running header.
            if top_code != prev_top:
                prefix = ''
            rec = INOCC()
            rec.code = squeeze(top_code)
            rec.name = squeeze(first[2:])
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        second = raw(row, 1).strip()
        if second:
            mid_code = second[:4]
            rec = INOCC()
            rec.pCode = top_code
            rec.code = squeeze(mid_code)
            rec.name = squeeze(second[4:])
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        third_raw = raw(row, 2)
        third = third_raw.strip()
        # The digit test is applied to the unstripped text.
        if third and third_raw.startswith(('0', '1', '2')):
            label = third[6:]
            if prefix != '':
                label = prefix + '-' + label
            rec = INOCC()
            rec.name = squeeze(label)
            rec.code = squeeze(third[:6])
            rec.insurance = insurance
            rec.pCode = mid_code
            grade = sheet.cell(row, 3)
            if grade.ctype == 2:
                # ctype 2 is xlrd's numeric cell type.
                rec.type = int(grade.value) if str(grade.value).strip() != '' else ''
            else:
                rec.type = str(grade.value).strip()
            db_kit.insert(rec)
            printObj(rec)
        elif third.startswith('注:'):
            # Skips the bookkeeping below as well.
            continue
        else:
            prefix = third

        prev_top = top_code
        print(str(row))
Пример #3
0
def WAndRHt():
    """Import rows 2..997 of the first sheet of the workbook into ``INOCC``.

    Every record is tagged with the insurance key ``'ioht'``.  Per row:

    * column 0 non-empty: code from column 0, name from column 1 with its
      first two characters removed;
    * column 2 non-empty: code from column 2, name from column 3 with its
      first four characters removed; the parent is the current column-0 code;
    * column 4 non-empty: code from column 4, name from column 5 (prefixed
      with the current header, if any), ``type`` from column 6.

    When column 4 is empty, column 5 becomes the new header unless it starts
    with ``注:``, in which case the rest of the row is skipped.
    """
    book = xlrd.open_workbook('/Users/yuhaihui8913/Documents/wh/华泰职业类别表.xls')
    sheet = book.sheet_by_index(0)
    insurance = 'ioht'

    top_code = ''
    mid_code = ''
    prefix = ''
    prev_top = ''

    def raw(row, column):
        # Cell value as text, read at the point of use.
        return str(sheet.cell(row, column).value)

    for row in range(2, 998):
        first = raw(row, 0).strip()
        if first:
            top_code = first
            # A different top-level code invalidates the running header.
            if top_code != prev_top:
                prefix = ''
            rec = INOCC()
            rec.code = top_code
            rec.name = raw(row, 1).strip()[2:]
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        second = raw(row, 2).strip()
        if second:
            mid_code = second
            rec = INOCC()
            rec.pCode = top_code
            rec.code = mid_code
            rec.name = raw(row, 3).strip()[4:]
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        if raw(row, 4).strip():
            label = raw(row, 5)
            if prefix != '':
                label = prefix + '-' + label
            rec = INOCC()
            rec.name = label.strip()
            rec.code = raw(row, 4).strip()
            rec.insurance = insurance
            rec.pCode = mid_code
            grade = sheet.cell(row, 6)
            if grade.ctype == 2:
                # ctype 2 is xlrd's numeric cell type.
                rec.type = int(grade.value) if str(grade.value).strip() != '' else ''
            else:
                rec.type = str(grade.value).strip()
            db_kit.insert(rec)
            printObj(rec)
        elif raw(row, 5).strip().startswith('注:'):
            # Skips the bookkeeping below as well.
            continue
        else:
            # The header is stored unstripped.
            prefix = raw(row, 5)

        prev_top = top_code
Пример #4
0
def RAndWPAYL(file,bRow,eRow,insurance):
    """Import rows ``bRow`` .. ``eRow - 1`` of the workbook's second sheet.

    Args:
        file: Path handed to ``xlrd.open_workbook``.
        bRow: First row index to read.
        eRow: Row index at which reading stops (exclusive).
        insurance: Value stored in the ``insurance`` attribute of every record.

    Per row:

    * column 0 non-empty: first two characters are the code, the rest the name;
    * column 1 non-empty: first four characters are the code, the rest the
      name; the parent is the current column-0 code;
    * column 2 non-empty: code from column 2, name from column 3 (prefixed
      with the current header, if any), ``type`` from column 4.

    When column 2 is empty, column 3 becomes the new header unless it starts
    with ``注:``, in which case the rest of the row is skipped.
    """
    sheet = xlrd.open_workbook(file).sheet_by_index(1)

    top_code = ''
    mid_code = ''
    prefix = ''
    prev_top = ''

    def raw(row, column):
        # Cell value as text, read at the point of use.
        return str(sheet.cell(row, column).value)

    for row in range(bRow, eRow):
        first = raw(row, 0).strip()
        if first:
            top_code = first[:2]
            # A different top-level code invalidates the running header.
            if top_code != prev_top:
                prefix = ''
            rec = INOCC()
            rec.code = top_code
            rec.name = first[2:]
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        second = raw(row, 1).strip()
        if second:
            mid_code = second[:4]
            rec = INOCC()
            rec.pCode = top_code
            rec.code = mid_code
            rec.name = second[4:]
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        third = raw(row, 2).strip()
        if third:
            label = raw(row, 3)
            if prefix != '':
                label = prefix + '-' + label
            rec = INOCC()
            rec.name = label.strip()
            rec.code = third
            rec.insurance = insurance
            rec.pCode = mid_code
            grade = sheet.cell(row, 4)
            if grade.ctype == 2:
                # ctype 2 is xlrd's numeric cell type.
                rec.type = int(grade.value) if str(grade.value).strip() != '' else ''
            else:
                # Non-numeric grades are stored without stripping.
                rec.type = str(grade.value)
            db_kit.insert(rec)
            printObj(rec)
        elif raw(row, 3).strip().startswith('注:'):
            # Skips the bookkeeping below as well.
            continue
        else:
            # The header is stored unstripped.
            prefix = raw(row, 3)

        prev_top = top_code
Пример #5
0
def RAndWXls(file,col,bRow,eRow,insurance,hasCode=True):
    """Import a three-level table from the workbook's second sheet.

    One root record is built from row ``bRow``; then every row in
    ``bRow`` .. ``eRow - 1`` may yield a group record (when the cell at
    ``col - 1`` is not ``''``) and a leaf record.

    Args:
        file: Path handed to ``xlrd.open_workbook``.
        col: Column index that the other columns are addressed relative to.
        bRow: First row index to read.
        eRow: Row index at which reading stops (exclusive).
        insurance: Value stored in the ``insurance`` attribute of every record.
        hasCode: When true, codes and names are read from separate cells
            (root: ``col - 5`` / ``col - 4``; group: ``col - 3`` / ``col - 1``;
            leaf: ``col - 1`` / ``col``).  When false, each cell holds text
            that is split on single spaces, the first token being the code
            and the last the name (root: ``col - 2``; group: ``col - 1``;
            leaf: ``col``).

    In the ``hasCode=False`` layout a leaf cell starting with ``注`` is
    skipped, and a leaf cell with a single token is remembered as a prefix
    for the names of following leaves until the next group row.
    """
    sheet = xlrd.open_workbook(file).sheet_by_index(1)

    def value_at(row, column):
        return sheet.cell(row, column).value

    def dump(rec):
        # One "attribute:value" line per instance attribute.
        print('\n'.join('%s:%s' % pair for pair in rec.__dict__.items()))

    def grade_at(row):
        # ctype 2 is xlrd's numeric cell type.
        cell = sheet.cell(row, col + 1)
        return int(cell.value) if cell.ctype == 2 else str(cell.value)

    # --- root record ------------------------------------------------------
    root = INOCC()
    root.insurance = insurance
    if hasCode:
        root.name = value_at(bRow, col - 4)
        root.code = value_at(bRow, col - 5)
    else:
        tokens = str(value_at(bRow, col - 2)).split(" ")
        root.name = tokens[-1]
        root.code = tokens[0]
    root_code = root.code
    dump(root)
    db_kit.insert(root)

    # --- groups and leaves ------------------------------------------------
    group_code = ''
    pending = ''
    for row in range(bRow, eRow):
        group_cell = value_at(row, col - 1)
        if group_cell != '':
            # A new group clears the remembered name prefix.
            pending = ''
            group_text = str(group_cell)
            group = INOCC()
            group.insurance = insurance
            if hasCode:
                group.name = group_text
                group.pCode = root_code
                group.code = value_at(row, col - 3)
            else:
                tokens = group_text.split(" ")
                group.name = tokens[-1]
                group.pCode = root_code
                group.code = tokens[0]
            group_code = group.code
            db_kit.insert(group)
            dump(group)

        leaf = INOCC()
        leaf.pCode = group_code
        if hasCode:
            leaf.code = value_at(row, col - 1)
            leaf.name = value_at(row, col)
            leaf.type = grade_at(row)
        else:
            text = str(value_at(row, col))
            if text.startswith("注"):
                continue
            tokens = text.split(" ")
            if len(tokens) == 1:
                pending = tokens[0]
                continue
            leaf.code = tokens[0]
            leaf.name = tokens[-1] if pending == '' else pending + '-' + tokens[-1]
            leaf.type = grade_at(row)
        leaf.insurance = insurance

        dump(leaf)
        db_kit.insert(leaf)
Пример #6
0
def WAndRRbjk():
    """Import rows 1..324 of the first sheet of the workbook into ``INOCC``.

    Every record is tagged with the insurance key ``'iorbjk'``.  Per row:

    * column 0 non-empty: code is ``int`` of column 0, name from column 1;
    * column 2 non-empty: code is ``int`` of column 2, name from column 3;
      the parent is the current column-0 code;
    * column 4 non-empty: one or more leaf records whose parent is the
      current column-2 code and whose ``type`` comes from column 5.  Text
      starting with ``ns`` (after removing spaces) yields a single leaf with
      code suffix ``00``; any other text is split on ``、`` and each piece
      yields a leaf with a two-digit running suffix starting at ``00``.

    When column 4 is empty, column 2 becomes the new header unless it starts
    with ``注:``, in which case the rest of the row is skipped.
    """
    book = xlrd.open_workbook('/Users/yuhaihui8913/Documents/wh/人保健康职业类别表.xlsx')
    sheet = book.sheet_by_index(0)
    insurance = 'iorbjk'

    top_code = ''
    mid_code = ''
    prefix = ''
    prev_top = ''

    def raw(row, column):
        # Cell value as text, read at the point of use.
        return str(sheet.cell(row, column).value)

    def squeeze(text):
        return text.replace(' ', '')

    def grade_at(row):
        cell = sheet.cell(row, 5)
        if cell.ctype == 2:
            # ctype 2 is xlrd's numeric cell type.
            return int(cell.value) if str(cell.value).strip() != '' else ''
        return str(cell.value).strip()

    for row in range(1, 325):
        if raw(row, 0).strip():
            top_code = int(sheet.cell(row, 0).value)
            top_name = raw(row, 1).strip()
            # A different top-level code invalidates the running header.
            if top_code != prev_top:
                prefix = ''
            rec = INOCC()
            rec.code = top_code
            rec.name = squeeze(top_name)
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        if raw(row, 2).strip():
            mid_code = int(sheet.cell(row, 2).value)
            mid_name = raw(row, 3).strip()
            rec = INOCC()
            rec.pCode = top_code
            rec.code = mid_code
            rec.name = squeeze(mid_name)
            rec.insurance = insurance
            db_kit.insert(rec)
            printObj(rec)

        detail = raw(row, 4)
        if detail.strip():
            compact = squeeze(detail)
            if compact.startswith('ns'):
                entries = [(detail.strip(), '00')]
            else:
                entries = [(piece, str(seq).zfill(2))
                           for seq, piece in enumerate(compact.split('、'))]
            # Leaves are inserted without being printed.
            for label, suffix in entries:
                if prefix != '':
                    label = prefix + '-' + label
                rec = INOCC()
                rec.name = squeeze(label)
                rec.code = squeeze(str(mid_code) + suffix)
                rec.insurance = insurance
                rec.pCode = mid_code
                rec.type = grade_at(row)
                db_kit.insert(rec)
        elif raw(row, 2).strip().startswith('注:'):
            # Skips the bookkeeping below as well.
            continue
        else:
            # NOTE(review): the header is taken from column 2, which is also
            # the column the second-level code is read from -- confirm that
            # this is the intended source.
            prefix = raw(row, 2).strip()

        prev_top = top_code
        print(str(row))
Пример #7
0
def RAndWTPYRS_html():
    """Import the table from the saved HTML page into ``INOCC``.

    The rows of the ``table.MsoNormalTable`` element are handled according to
    how many ``<td>`` cells they contain; every record is tagged with the
    insurance key ``'iotpyrs'``:

    * 5 cells: first-level record (cell 0), second-level record (cell 1) and
      a leaf (code/name from cell 2, ``type`` from cell 4);
    * 4 cells: second-level record (cell 0) and a leaf (cell 1, ``type``
      from cell 3);
    * 3 cells: a leaf (cell 0, ``type`` from cell 2);
    * 2 cells: optional second-level record (cell 0, when it has exactly two
      ``<p>`` elements) and a new header from cell 1 when it is non-empty;
    * 1 cell: a new header, unless the text is empty or starts with ``注:``.

    First- and second-level records are only inserted when
    ``db_kit.existCheck`` returns ``'no'``; leaves are always inserted.
    """
    insurance = 'iotpyrs'

    # BeautifulSoup reads the whole file in its constructor, so the handle can
    # be released as soon as parsing is done (it used to be left open).
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed -- consider pinning one.
    with open('/Users/yuhaihui8913/Documents/wh/太平洋人寿final.html') as html_file:
        soup = BeautifulSoup(html_file)
    print(soup.prettify())
    trs = soup.find("table", class_="MsoNormalTable").find_all("tr")

    fCode = ''
    sCode = ''
    header = ''
    lastF = ''

    def _category(code, name, parent=None):
        # Insert a first-level (no parent) or second-level record if unseen.
        rec = INOCC()
        if parent is not None:
            rec.pCode = parent
        rec.code = code
        rec.name = name
        rec.insurance = insurance
        if db_kit.existCheck(rec.code, insurance) == 'no':
            db_kit.insert(rec)
            printObj(rec)

    def _leaf(cell):
        # Build a leaf from a cell whose first <p> holds code and name nodes.
        parts = cell.p.contents
        label = str(parts[1].get_text()).strip()
        rec = INOCC()
        rec.name = (header + '-' + label if header != '' else label).strip()
        rec.code = parts[0].get_text().strip()
        rec.insurance = insurance
        rec.pCode = sCode
        return rec

    for tr in trs:
        tds = tr.find_all("td")
        width = len(tds)

        if width == 5:
            ps = tds[0].find_all('p')
            first = str(ps[0].get_text()).strip()
            if first != '':
                fCode = first
                # NOTE(review): the name is set to the same text as the code
                # here; the second paragraph may have been intended -- confirm.
                fName = first
            elif len(ps) == 1:
                # Empty single paragraph: keep the previous code.
                fName = fCode
            else:
                fCode = ps[0].get_text()
                fName = ps[1].get_text()
            if fCode != lastF:
                header = ''
            _category(fCode, fName)

            ps = tds[1].find_all('p')
            sCode = str(ps[0].get_text()).strip()
            sName = sCode if len(ps) == 1 else str(ps[1].get_text()).strip()
            _category(sCode, sName, parent=fCode)

            rec = _leaf(tds[2])
            rec.type = tds[4].p.span.get_text().strip()
            db_kit.insert(rec)
            printObj(rec)

        elif width == 4:
            ps = tds[0].find_all('p')
            sCode = str(ps[0].get_text()).strip()
            sName = str(ps[1].get_text()).strip()
            _category(sCode, sName, parent=fCode)

            rec = _leaf(tds[1])
            rec.type = tds[3].p.span.get_text().strip()
            db_kit.insert(rec)
            printObj(rec)

        elif width == 3:
            rec = _leaf(tds[0])
            try:
                rec.type = tds[2].get_text().strip()
            except AttributeError:
                # str() is required: concatenating a Tag with a str raised a
                # TypeError inside this handler.
                print(str(tds[2])+'===========================================================================')
            db_kit.insert(rec)
            printObj(rec)

        elif width == 2:
            ps = tds[0].find_all('p')
            if len(ps) == 2:
                sCode = str(ps[0].get_text()).strip()
                sName = str(ps[1].get_text()).strip()
                _category(sCode, sName, parent=fCode)
            caption = tds[1].p.span.get_text().strip()
            if caption != '':
                header = caption

        elif width == 1:
            caption = tds[0].p.span.get_text().strip()
            if caption != '' and not caption.startswith('注:'):
                header = caption

    # NOTE(review): lastF is only assigned here, after the loop, so inside the
    # loop it is always '' and the header is reset on every 5-cell row with a
    # non-empty code.  Confirm whether this belongs inside the loop.
    lastF = fCode
Пример #8
0
def getMainHTML():
    """Crawl the organisation list of section 8245 and store unseen insurers.

    The landing page at the module-level ``target`` is fetched once.  Its
    first result page is parsed directly; every further page is requested by
    posting a multipart form that names the ``lbnToPage`` control as the
    event target together with the wanted page number and the view-state
    fields taken from the most recent response.

    For each company link the URL is extracted from the ``onclick``
    attribute.  URLs unknown to ``db_kit.findOnByUrl`` are fetched with
    ``getCompanyHtml``; a nine-field result is stored as a
    ``db_kit.Insurer``, anything else is logged as an error.  A random pause
    of 1-2 seconds follows each detail fetch.
    """
    session = requests.session()
    res = session.get(target,
                      params=None,
                      headers=random.sample(headers, 1)[0])
    cookie = res.cookies
    soup = BeautifulSoup(res.content, 'lxml')
    div_a = soup.find('ess_contentpane').find_all('a', recursive=False)

    for a in div_a:
        # Restrict the crawl to one specific section.
        section = a.get('name')
        if section != '8245':
            continue

        element_prefix = 'ess_ctr' + section + '_OrganizationList_'
        field_prefix = "ess$ctr" + section + "$OrganizationList$"

        pageTotal = soup.find(id=element_prefix + 'lblPageNum').get_text()
        pageNum = soup.find(id=element_prefix + 'lblAtPageNum').get_text()
        pageTotal = int(pageTotal if pageTotal else '1')
        pageNum = int(pageNum if pageNum else '1')
        bxgslx = soup.find(id=element_prefix + 'lblClassName')
        # Resolve the label text once; a missing label used to raise only
        # when the first record was about to be inserted.
        catalog = bxgslx.get_text() if bxgslx is not None else ''
        print('当前页 %d ,一共 %d 页' % (pageNum, pageTotal))

        view_state = ''
        view_state_generator = ''
        # Iteration 0 uses the page already downloaded; iteration i posts for
        # page i + 1.  Stopping at pageTotal - 1 avoids asking for a page past
        # the last one (the loop previously ran one step too far).  max()
        # keeps the landing page processed even if the total reads 0.
        for i in range(max(pageTotal, 1)):
            logging.info('编号为 %s 的项目 执行了第 %d 次 ' % (section, i))
            print('编号为 %s 的项目 执行了第 %d 次 ' % (section, i))

            if i == 0:
                urls = soup.find(id=element_prefix + 'rptCompany').find_all('a')
                view_state = soup.find(id='__VIEWSTATE')['value']
                view_state_generator = soup.find(
                    id='__VIEWSTATEGENERATOR')['value']
            else:
                # (None, value) tuples make requests send plain multipart
                # form fields instead of file uploads.
                p = {
                    "__EVENTTARGET": (
                        None,
                        field_prefix + "lbnToPage",
                        None,
                    ),
                    field_prefix + "lblAtPageNum": (None, str(i + 1)),
                    "__EVENTARGUMENT": (None, ""),
                    "__VIEWSTATEGENERATOR": (None, view_state_generator),
                    "__VIEWSTATE": (None, view_state),
                }

                user_agent = random.sample(headers, 1)[0]['User-Agent']

                # The request is sent without a proxy; the per-page lookup of
                # a proxy address was dropped because its result was never
                # passed to the request.
                _res = session.post(
                    target,
                    files=p,
                    cookies=cookie,
                    headers={
                        "Accept":
                        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                        "Referer":
                        "http://www.circ.gov.cn/tabid/2576/Default.aspx",
                        "Host": "www.circ.gov.cn",
                        "Accept-Language": "zh-CN,zh;q = 0.9",
                        "Origin": "http://www.circ.gov.cn",
                        "User-Agent": user_agent
                    })
                _soup = BeautifulSoup(_res.content, 'lxml')
                urls = _soup.find(id=element_prefix + 'rptCompany')
                if urls is None:
                    logging.error('编号为 %s 的项目 执行了第 %d 次 ' % (section, i) +
                                  ",主页没有解析到正确的链接数据")
                    continue
                urls = urls.find_all('a')

                # Carry the view state of THIS response into the next post.
                # It used to be re-read from the landing page, which made the
                # refresh a no-op.  Keep the old value if a field is missing.
                fresh_state = _soup.find(id='__VIEWSTATE')
                if fresh_state is not None:
                    view_state = fresh_state['value']
                fresh_generator = _soup.find(id='__VIEWSTATEGENERATOR')
                if fresh_generator is not None:
                    view_state_generator = fresh_generator['value']

            print(urls)
            for url in urls:
                # The detail URL is the first single-quoted string in onclick.
                matches = re.findall(r"'(.+?)'", str(url.get('onclick', '')))
                if not matches:
                    logging.error('编号为 %s 的项目 执行了第 %d 次 ' % (section, i) +
                                  ",链接中没有解析到详情地址")
                    continue
                detail_url = matches[0]

                if db_kit.findOnByUrl(detail_url) is not None:
                    continue

                companyInfo = getCompanyHtml(detail_url)
                if len(companyInfo) == 9:
                    insurer = db_kit.Insurer()
                    insurer.orgName = companyInfo[0]
                    insurer.orgType = companyInfo[1]
                    insurer.cat = companyInfo[2]
                    insurer.orgAddress = companyInfo[3]
                    insurer.tel = companyInfo[4]
                    insurer.leader = companyInfo[5]
                    insurer.capital = companyInfo[6]
                    insurer.registerAddress = companyInfo[7]
                    insurer.state = companyInfo[8]
                    insurer.url = detail_url
                    insurer.catalog = catalog
                    db_kit.insert(insurer)
                else:
                    logging.error(detail_url + '返回的内容不正确。没有解析出正确内容')
                # Pause only after a detail page was actually fetched.
                time.sleep(random.randint(1, 2))