예제 #1
0
def loadCommonExcel(hospitalALL, path, code, name, addr, kind):
    """Append one ``Hospital`` per row of the first sheet of a workbook.

    Every row of the first sheet is read, including row 0.

    Args:
        hospitalALL: Collection exposing ``append``; receives the records.
        path: Workbook location handed to ``xlrd.open_workbook``.
        code: Column index of the hospital code (stored as read).
        name: Column index of the hospital name (stored stripped).
        addr: Accepted for signature compatibility; not used here.
        kind: Accepted for signature compatibility; not used here.
    """
    workbook = xlrd.open_workbook(path)
    sheet_names = workbook.sheet_names()
    print(sheet_names)
    sheet = workbook.sheet_by_name(sheet_names[0])

    for row_idx in range(sheet.nrows):
        record = Hospital()
        record.code = sheet.cell(row_idx, code).value
        record.name = sheet.cell(row_idx, name).value.strip()
        hospitalALL.append(record)
예제 #2
0
def loadB(hospitalB, path=None):
    """Append one ``Hospital`` per row of the first sheet of the "B" workbook.

    Column 0 is stored as ``code`` and column 1 as ``name``; ``.strip()`` is
    called on both, so both cells must hold text.

    Args:
        hospitalB: Collection exposing ``append``; receives the records.
        path: Workbook location. When omitted, the original hard-coded
            Windows path is used, so existing callers are unaffected.
    """
    if path is None:
        # Every backslash is escaped. The original literal contained a lone
        # "\D", which is an invalid escape sequence (a warning today, slated
        # to become an error); the resulting string value is unchanged.
        path = 'E:\\work-doc\\项目\\DMS\\DOC\\试用阶段遇到问题\\客户抓取对比\\ab-b.xls'

    data = xlrd.open_workbook(path)
    print(data.sheet_names())
    table = data.sheet_by_name(data.sheet_names()[0])
    for i in range(table.nrows):
        hospital = Hospital()
        hospital.code = table.cell(i, 0).value.strip()
        hospital.name = table.cell(i, 1).value.strip()
        hospitalB.append(hospital)
예제 #3
0
def doWork(html):
    """Parse a search-result page into a ``Hospital`` and print it.

    ``code`` is the page title with the ``_百度搜索`` suffix removed.
    ``name`` and ``address`` are taken from the first ``h3 a em`` element and
    the first ``.op-map-singlepoint-info-right`` element; when either is
    missing the record is printed with whatever fields were filled in.

    Args:
        html: Markup of the result page.

    Raises:
        IndexError: If the page has no ``<title>`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    hospital = Hospital()
    hospital.code = soup.select('title')[0].text.replace('_百度搜索', '')
    try:
        hospital.name = soup.select('h3 a em')[0].text
        hospital.address = soup.select(
            '.op-map-singlepoint-info-right')[0].contents[0]
    except IndexError:
        # One of the expected elements is absent; keep the partial record.
        pass
    # The original referenced ``hospital.print`` without calling it, so
    # nothing was ever printed on either path.
    hospital.print()
예제 #4
0
def loadCommonExcel(hospitalALL, path, code, name, addr, kind):
    """Append one ``Hospital`` per usable row of the first sheet of a workbook.

    A column index of ``-1`` means "this column is not present" for ``code``,
    ``addr`` and ``kind``. When an address column is given, rows whose
    stripped address is shorter than 7 characters are skipped.

    Args:
        hospitalALL: Collection exposing ``append``; receives the records.
        path: Workbook location handed to ``xlrd.open_workbook``.
        code: Column index of the hospital code, or ``-1``.
        name: Column index of the hospital name (stored stripped).
        addr: Column index of the address, or ``-1``.
        kind: Column index of the hospital kind, or ``-1``.
    """
    data = xlrd.open_workbook(path)
    print(data.sheet_names())
    table = data.sheet_by_name(data.sheet_names()[0])
    for i in range(table.nrows):
        address = None
        if addr != -1:
            address = table.cell(i, addr).value.strip()
            # Only filter on address length when there is an address column.
            # The original read cell(i, addr) before the -1 check, so a
            # missing column was filtered on column -1 instead.
            if len(address) < 7:
                continue
        hospital = Hospital()
        if code != -1:
            hospital.code = table.cell(i, code).value
        hospital.name = table.cell(i, name).value.strip()
        if addr != -1:
            hospital.address = address
        hospital.type = '医院'
        if kind != -1:
            hospital.kind = table.cell(i, kind).value.strip()
        hospitalALL.append(hospital)
예제 #5
0
def baidu(key, hospitalNew, hospitalBLL):
    """Search Baidu for ``key`` and file the result as a ``Hospital``.

    On success the record (name from the first ``<h3>``, address from the
    first ``.op-map-singlepoint-info-right`` element) is appended to
    ``hospitalNew``. If anything goes wrong while extracting or splitting the
    result, a blank record carrying only ``name = key`` is appended to
    ``hospitalBLL`` instead.

    Args:
        key: Search keyword (the hospital name to look up).
        hospitalNew: Collection exposing ``append``; receives resolved records.
        hospitalBLL: Collection exposing ``append``; receives failed lookups.

    Note:
        The HTTP request itself is made outside the ``try`` block, so network
        errors from ``requests.get`` propagate to the caller.
    """
    # Baidu search URL with the keyword spliced into the ``wd`` parameter.
    url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=" + key + "&rsv_pq=c975914300115949&rsv_t=e7f3%2FJ8sovjmaqT%2B6p6ID4KVYbFRyG9dPQjqKtszA7eNO7jE0ynUBwuzYek&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=3&rsv_sug1=3&rsv_sug7=101&rsv_sug2=0&inputT=2503&rsv_sug4=4616&rsv_sug=2&&usm=3&rsv_idx=2&rsv_page=1"
    # Browser-like request headers.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0"}
    params = {
        'kw': key
    }

    # GET the page while presenting as a browser.
    # NOTE(review): no timeout is set, so a stalled connection blocks forever.
    response = requests.get(url, headers=headers, params=params)
    response.encoding = "utf-8"  # decode the body as UTF-8
    html = response.text  # decoded page source
    content = response.content  # raw bytes of the page source
    print(html)
    print(content)
    # A status code of 200 means the request succeeded.
    print("response.status_code:", response.status_code)
    print("headers:", response.headers)
    soup = BeautifulSoup(html, "lxml")
    # Result titles live in <h3> tags, so dump those for inspection.
    print(soup.findAll("h3"))

    hospital = Hospital()
    hospital.onlyCode = key
    try:
        searchName = soup.findAll("h3")[0].text.replace("_百度地图", "").strip()
        searchAddress = soup.select(".op-map-singlepoint-info-right")[0].text
        print(searchName, searchAddress)

        hospital.name = searchName
        # ``code`` keeps the full, unsplit address text.
        hospital.code = searchAddress
        if len(searchAddress) > 6:
            # Split the address with cpca; the first row's columns 0, 1, 2
            # and 3 are stored as privince, city, oldName and address.
            df = cpca.transform([searchAddress], cut=False)
            hospital.oldName = df.iat[0, 2]
            hospital.privince = df.iat[0, 0]
            hospital.city = df.iat[0, 1]
            hospital.address = df.iat[0, 3]
        else:
            # Too short to be worth splitting; keep it verbatim.
            hospital.oldName = ''
            hospital.privince = ''
            hospital.city = ''
            hospital.address = searchAddress

        if '检验' in hospital.name:
            hospital.type = '检验中心'
        elif '妇幼' in hospital.name:
            hospital.type = '妇幼'
        elif '儿童' in hospital.name:
            hospital.type = '儿童'
        else:
            hospital.type = '医院'
        hospital.level = ""
        hospitalNew.append(hospital)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C and
        # SystemExit still stop a long crawl instead of being recorded as a
        # failed lookup.
        hospital.code = ""
        hospital.name = key
        hospital.oldName = ""
        hospital.privince = ""
        hospital.city = ""
        hospital.address = ""
        hospital.type = ""
        hospital.kind = ""
        hospital.level = ""
        hospital.onlyCode = ""
        hospitalBLL.append(hospital)
        print("eerrrorrr")
예제 #6
0
        "/html[1]/body[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/div[3]/div[1]/div[1]/section[1]/div[2]/div[1]/table[1]/tbody[1]/tr[3]/td[2]"
    ).text
    driver.close()


# Counter of successfully processed entries; passed to
# ExcelUtils.updateExcel as its third argument and bumped after each write.
ind = 0
for pageNum in range(3000):
    print("进行第{}页".format(pageNum))
    # Result entries are addressed by 1-based position in the XPath below.
    for i in range(1, 11):
        hospitalAll = []
        try:
            # Always start from the first window handle.
            pages = driver.window_handles
            driver.switch_to_window(pages[0])
            hospital = Hospital()
            time.sleep(5)
            # Click the link of the i-th entry in the result list.
            driver.find_element_by_xpath(
                "/html[1]/body[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]/div[4]/div[1]/div["
                + str(i) + "]/div[1]/div[3]/div[1]/div[1]/a[1]"
            ).click()
            oneSN(driver, hospital)
            hospitalAll.append(hospital)
            ExcelUtils.updateExcel('d:\\a.xls', hospitalAll, ind)
            ind += 1
        except Exception as exc:
            print(exc)
            # If a second window was left open, close it before moving on.
            pages = driver.window_handles
            if len(pages) > 1:
                driver.switch_to_window(pages[1])
                driver.close()
예제 #7
0
# Dimensions of the worksheet held in ``table``.
row, col = table.nrows, table.ncols

allHospital, hospitalA, hospitalB = [], [], []

# Build one Hospital per sheet row, taking the name from column 0.
# (An earlier variant read the code from column 0 and the name from column 1.)
for row_idx in range(row):
    hos = Hospital()
    hos.name = table.cell(row_idx, 0).value
    allHospital.append(hos)

# Print every record and hand it to Builder.buildModel along with the two
# result lists; the header cells '购货单位' and '医院' are skipped.
for record in allHospital:
    if record.name in ('购货单位', '医院'):
        continue
    record.print()
    Builder.buildModel(record, hospitalA, hospitalB)

# Crawl the corresponding information and assemble the data, in two parts.
print('aaaa')
for entry in hospitalA:
    entry.print()
print('bbbb')
for A in hospitalB:
예제 #8
0
def loadCommonExcel(hospitalALL, path):
    """Index the rows of a workbook's first sheet by hospital name.

    Row 0 is skipped. Each remaining row becomes a ``Hospital`` stored under
    ``hospitalALL[hospital.name]``, so a later row with the same name
    replaces an earlier one.

    Columns read: 0 oldName, 2 name, 3 area (district), 4 privince, 5 city,
    6 address, 7 kind, 9 level. ``oldName`` and ``name`` are stripped.

    Args:
        hospitalALL: Mapping supporting item assignment; receives the records.
        path: Workbook location handed to ``xlrd.open_workbook``.
    """
    workbook = xlrd.open_workbook(path)
    sheet_names = workbook.sheet_names()
    print(sheet_names)
    sheet = workbook.sheet_by_name(sheet_names[0])

    # First matching keyword in the name decides the type.
    type_by_keyword = (
        ('检验', '检验中心'),
        ('妇幼', '妇幼'),
        ('儿童', '儿童'),
    )

    for row_idx in range(1, sheet.nrows):
        record = Hospital()
        record.oldName = sheet.cell(row_idx, 0).value.strip()
        record.name = sheet.cell(row_idx, 2).value.strip()
        record.privince = sheet.cell(row_idx, 4).value
        record.city = sheet.cell(row_idx, 5).value
        # District.
        record.area = sheet.cell(row_idx, 3).value
        record.address = sheet.cell(row_idx, 6).value
        record.kind = sheet.cell(row_idx, 7).value
        record.level = sheet.cell(row_idx, 9).value

        for keyword, label in type_by_keyword:
            if keyword in record.name:
                record.type = label
                break
        else:
            record.type = '综合'

        hospitalALL[record.name] = record
예제 #9
0
def buildModel(province, allHospital):
    """Scrape the yixue.com hospital list of a province into ``allHospital``.

    For the i-th ``#mw-content-text ul li b a`` link a ``Hospital`` is created
    with the link text as name, and address and level taken from the i-th
    ``#mw-content-text ul li ul`` detail list. Each completed record is
    appended and printed.

    If an expected element is missing (``IndexError``), the partially filled
    record is printed and processing stops; records appended so far are kept.

    Args:
        province: Province name spliced into the page URL.
        allHospital: Collection exposing ``append``; receives the records.
    """
    url = "http://www.yixue.com/" + province + "医院列表"
    print(url)

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get is ``params``, which would put the User-Agent into the
    # query string instead of sending it as a header.
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, "html.parser")

    # Run each selector once instead of re-querying the document per row.
    name_links = soup.select('#mw-content-text ul li b a')
    detail_lists = soup.select('#mw-content-text ul li ul')
    try:
        for i, link in enumerate(name_links):
            hospital = Hospital()
            hospital.name = link.contents[0]
            details = detail_lists[i]
            hospital.address = details.contents[0].contents[1].replace(
                ':', '').replace('\n', '')
            hospital.level = details.contents[2].contents[1].replace(
                ':', '').replace('\n', '')
            hospital.type = '医院'
            hospital.kind = ''
            allHospital.append(hospital)

            hospital.print()
    except IndexError:
        # ``hospital`` is always bound here: it is created before any
        # indexing happens in the loop body.
        hospital.print()