Exemplo n.º 1
0
def _row_layout_plan_count(cell):
    """Return the quota text held in *cell*, or "0" when that text is blank.

    The text is read from the cell's ``<font>`` child when one exists and
    from the cell itself otherwise.

    Raises:
        AttributeError: if the chosen node has no single string
            (``.string`` is None).
    """
    holder = cell.font if cell.font is not None else cell
    text = holder.string.strip()
    return text if text else "0"


def _row_layout_real_counts(cell):
    """Return ``(real, lone)`` count texts taken from the links in *cell*.

    The first link supplies ``real``; a second link, when present, supplies
    ``lone`` (otherwise ``lone`` is "0").  A blank first link yields
    ``("0", "0")``.

    Raises:
        AttributeError: if *cell* has no link, or a link has no single
            string.
    """
    real = cell.a.string.strip()
    if not real:
        # The original compared the stripped text with " ", which can never
        # match; testing for emptiness restores the intended "0" fallback.
        return "0", "0"
    links = cell.find_all('a')
    if len(links) == 1:
        return real, "0"
    return real, links[1].string.strip()


def _row_layout_bzlx(href):
    """Return the text following ``BZLX=`` up to the end of *href*.

    Raises:
        AttributeError: if *href* contains no ``BZLX=`` followed by at least
            one character.
    """
    matched = re.search(r'BZLX=.+?$', href).group(0)
    return matched[len('BZLX='):]


def down_department_details(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                            time):
    """Fetch one department's detail page, parse it and save the result.

    The page is requested from ``get_department_url(base, dwbh)``.  When the
    request fails, or the expected detail table cannot be located,
    ``get_department_err`` is called with the incoming arguments and the
    function returns.  Otherwise the detail fields are read from the table,
    ``down_person_list`` is called once for every row of the staffing
    sub-table, and the collected record is passed to ``save_department``.

    Args:
        base: Passed to ``get_department_url`` and forwarded to
            ``down_person_list`` / ``get_department_err``.
        szcs, dwzd, dwlb, dwlx, sjdw: Opaque here; forwarded unchanged to the
            helper functions (presumably descriptive attributes of the
            department -- confirm against the callers).
        dwbh: Department number; used to build the URL and in log messages.
        dwmc: Department name supplied by the caller.  It is replaced by the
            name found on the page once the page has been parsed.
        time: Forwarded unchanged to ``get_department`` /
            ``get_department_err``.

    Returns:
        None.

    Raises:
        AttributeError, IndexError: if the located table does not have the
            cell structure this parser expects (these are not caught once
            the table itself has been found).
    """
    url = get_department_url(base, dwbh)
    try:
        response = requests.get(url, timeout=1000, headers=headers)
    except Exception:
        # Best effort: any request failure is recorded and the department is
        # skipped.  Unlike a bare ``except`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return
    response.encoding = 'utf-8'

    try:
        soup = BeautifulSoup(response.text, "html.parser").find(
            'div', style="width: 757; height: 582; background-color: #EFF8FF;"
        ).table.find_all('tr')[2].td.table
    except (AttributeError, IndexError):
        # A missing tag (AttributeError) or fewer than three rows
        # (IndexError) both mean the page does not have the expected layout.
        soup = None
    if soup is None:
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return

    # ``find_all('tr')`` walks the whole subtree, so compute it once.
    rows = soup.find_all('tr')

    # Department name: either directly inside the span or nested in <b><font>.
    name_span = rows[0].find_all('td')[1].span
    if name_span.string is not None:
        dwmc = name_span.string.strip()
    elif name_span.b.font.string is not None:
        dwmc = name_span.b.font.string.strip()
    else:
        dwmc = ''
    if dwmc == '':
        department_text('编号:' + dwbh + '-->不存在!')
        return

    # Second row, second cell; the literal "无" is stored as empty.
    other_name = rows[1].find_all('td')[1].string
    qtmc = other_name.strip() if other_name is not None else ''
    if qtmc == "无":
        qtmc = ''

    third_row_cells = rows[2].find_all('td')
    leader_text = third_row_cells[1].span.string
    ldzs = leader_text.strip() if leader_text is not None else ''

    # Fourth cell of the third row: prefer the <b><font> text when that
    # nesting exists, otherwise fall back to the span's own text.
    level_span = third_row_cells[3].span
    if level_span.string is None:
        jb = ''
    else:
        try:
            nested_level = level_span.b.font.string
        except AttributeError:
            jb = level_span.string.strip()
        else:
            jb = nested_level.strip() if nested_level is not None else ''

    # A lone apostrophe in either labelled field is stored as empty.
    internal_text = soup.find_all(id="lblNeiSheJG")[0].string
    nsjg = internal_text.strip() if internal_text is not None else ''
    if nsjg == "\'":
        nsjg = ''

    # Only the single-string case is handled here.  Original author's note:
    # most "main duty" texts seem to be lazily loaded and cannot be scraped
    # this way; a real browser would be needed to obtain them.
    duty_text = soup.find_all(id="lblMainDuty")[0].string
    zyzz = duty_text.strip() if duty_text is not None else ''
    if zyzz == "\'":
        zyzz = ''

    staff_table = rows[4].td.div.table
    if staff_table is None:
        department_text(dwzd + ':' + dwbh + '-' + dwmc + '-' +
                        '--->无编制人员!')
        return

    # Row label keyword -> key under which that row's counts are stored.
    label_keys = (("行政编制数", 'xz'), ("事业编制数", 'sy'), ("工勤编制数", 'gq'))
    # Each entry is (plan, real, lone); rows that never appear stay at "0".
    counts = {key: ("0", "0", "0") for _, key in label_keys}

    for row in staff_table.find_all('tr'):
        cells = row.find_all('td')
        label = cells[0].string.strip()
        for keyword, key in label_keys:
            if keyword in label:
                plan = _row_layout_plan_count(cells[1])
                real, lone = _row_layout_real_counts(cells[3])
                counts[key] = (plan, real, lone)
                break
        # The person list is downloaded for every row, including rows whose
        # label matched none of the keywords above.
        bzlx = _row_layout_bzlx(cells[3].a['href'])
        down_person_list(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                         bzlx)

    xz_plan_num, xz_real_num, xz_lone_num = counts['xz']
    sy_plan_num, sy_real_num, sy_lone_num = counts['sy']
    gq_plan_num, gq_real_num, gq_lone_num = counts['gq']
    save_department(
        get_department(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, qtmc,
                       ldzs, jb, nsjg, zyzz, xz_plan_num, xz_real_num,
                       xz_lone_num, sy_plan_num, sy_real_num,
                       sy_lone_num, gq_plan_num, gq_real_num,
                       gq_lone_num, url, time))
Exemplo n.º 2
0
def _id_layout_staff_counts(soup, plan_id, real_id):
    """Read one staffing category from the elements with the given ids.

    Args:
        soup: Parsed detail table to search.
        plan_id: ``id`` of the element holding the planned count.
        real_id: ``id`` of the element holding the real count and the link
            to the person list.

    Returns:
        ``(plan, real, lone, bzlx)``.  ``plan`` is "0" when no element has
        ``plan_id``.  When no element has ``real_id``, ``real`` and ``lone``
        are "0" and ``bzlx`` is None.  Otherwise ``bzlx`` is the text that
        follows ``BZLX=`` in the element's link.

    Raises:
        AttributeError, TypeError, KeyError: if the ``real_id`` element has
            no link, the link has no ``href``, or the ``href`` contains no
            ``BZLX=`` value.
    """
    plan_nodes = soup.find_all(id=plan_id)
    plan = plan_nodes[0].get_text() if plan_nodes else "0"

    real_nodes = soup.find_all(id=real_id)
    if not real_nodes:
        return plan, "0", "0", None

    real_node = real_nodes[0]
    text = real_node.get_text()
    if '(' in text:
        # Text looks like "<real>(<3 chars><lone><2 chars>"; the slice drops
        # the fixed-width wrapper around the second number (presumably a
        # label and a closing unit/parenthesis -- confirm against a page).
        pieces = text.split("(")
        real, lone = pieces[0], pieces[1][3:-2]
    else:
        real, lone = text, "0"

    matched = re.search(r'BZLX=.+?$', real_node.a['href']).group(0)
    return plan, real, lone, matched[len('BZLX='):]


def down_department_details(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                            time):
    """Fetch one department's detail page, parse it and save the result.

    The page is requested from ``get_department_url(base, dwbh)``.  When the
    request fails, or the expected detail table cannot be located,
    ``get_department_err`` is called with the incoming arguments and the
    function returns.  Otherwise the detail fields are read from the table,
    ``down_person_list`` is called for each staffing category whose "real"
    element is present (ids ``RealXZ``, ``RealSY``, ``RealGQ``, in that
    order), and the collected record is passed to ``save_department``.

    Args:
        base: Passed to ``get_department_url`` and forwarded to
            ``down_person_list`` / ``get_department_err``.
        szcs, dwzd, dwlb, dwlx, sjdw: Opaque here; forwarded unchanged to the
            helper functions (presumably descriptive attributes of the
            department -- confirm against the callers).
        dwbh: Department number; used to build the URL and in log messages.
        dwmc: Department name supplied by the caller.  It is replaced by the
            name found on the page once the page has been parsed.
        time: Forwarded unchanged to ``get_department`` /
            ``get_department_err``.

    Returns:
        None.

    Raises:
        AttributeError, IndexError: if the located table does not have the
            cell structure this parser expects (these are not caught once
            the table itself has been found).
    """
    url = get_department_url(base, dwbh)
    try:
        response = requests.get(url, timeout=1000, headers=headers)
    except Exception:
        # Best effort: any request failure is recorded and the department is
        # skipped.  Unlike a bare ``except`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return
    response.encoding = 'utf-8'

    try:
        soup = BeautifulSoup(
            response.text, "html.parser").div.table.find_all('tr')[2].td.table
    except (AttributeError, IndexError):
        # A missing tag (AttributeError) or fewer than three rows
        # (IndexError) both mean the page does not have the expected layout.
        soup = None
    if soup is None:
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return

    # ``find_all('tr')`` walks the whole subtree, so compute it once.
    rows = soup.find_all('tr')

    # Department name: either directly inside the span or nested in <b><font>.
    name_span = rows[0].find_all('td')[1].span
    if name_span.string is not None:
        dwmc = name_span.string.strip()
    elif name_span.b.font.string is not None:
        dwmc = name_span.b.font.string.strip()
    else:
        dwmc = ''
    if dwmc == '':
        department_text('编号:' + dwbh + '-->不存在!')
        return

    # Second row, second cell; the literal "无" is stored as empty.
    other_name = rows[1].find_all('td')[1].string
    qtmc = other_name.strip() if other_name is not None else ''
    if qtmc == "无":
        qtmc = ''

    third_row_cells = rows[2].find_all('td')
    leader_text = third_row_cells[1].string
    ldzs = leader_text.strip() if leader_text is not None else ''

    # Fourth cell of the third row: prefer the <b><font> text when that
    # nesting exists, otherwise fall back to the span's own text.
    level_span = third_row_cells[3].span
    if level_span.string is None:
        jb = ''
    else:
        try:
            nested_level = level_span.b.font.string
        except AttributeError:
            jb = level_span.string.strip()
        else:
            jb = nested_level.strip() if nested_level is not None else ''

    # A lone apostrophe in either labelled field is stored as empty.
    internal_text = soup.find_all(id="lblNeiSheJG")[0].string
    nsjg = internal_text.strip() if internal_text is not None else ''
    if nsjg == "\'":
        nsjg = ''

    # Only the single-string case is handled here.  Original author's note:
    # most "main duty" texts seem to be lazily loaded and cannot be scraped
    # this way; a real browser would be needed to obtain them.
    duty_text = soup.find_all(id="lblMainDuty")[0].string
    zyzz = duty_text.strip() if duty_text is not None else ''
    if zyzz == "\'":
        zyzz = ''

    # Each category is parsed from its own elements.  The original reused
    # the XZ text when splitting the SY and GQ values, which produced wrong
    # numbers (or a NameError when the XZ element was absent).
    category_ids = (("LabelXZ", "RealXZ"),
                    ("LabelSY", "RealSY"),
                    ("LabelGQ", "RealGQ"))
    counts = []
    for plan_id, real_id in category_ids:
        plan, real, lone, bzlx = _id_layout_staff_counts(
            soup, plan_id, real_id)
        if bzlx is not None:
            down_person_list(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                             bzlx)
        counts.append((plan, real, lone))

    ((xz_plan_num, xz_real_num, xz_lone_num),
     (sy_plan_num, sy_real_num, sy_lone_num),
     (gq_plan_num, gq_real_num, gq_lone_num)) = counts

    save_department(
        get_department(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, qtmc,
                       ldzs, jb, nsjg, zyzz, xz_plan_num, xz_real_num,
                       xz_lone_num, sy_plan_num, sy_real_num, sy_lone_num,
                       gq_plan_num, gq_real_num, gq_lone_num, url, time))