Пример #1
0
def gteData(url, classAttr, exceName):
    """Collect the cell text of every ``border="1"`` table on a GBK page.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with the ``lxml`` parser.  Each matching table is
    turned into a list of rows, each row being the list of its ``<td>``
    texts, and stored under the name taken from *classAttr* at the table's
    position.  The result is forwarded to ``createExcel.create``.

    The decoded page and every cell text are echoed to stdout while the
    page is processed.

    Example of a page this was written against:
    http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm

    Args:
        url: Address of the page to fetch.
        classAttr: Names for the matched tables, in document order.  They
            are passed on as keyword-argument names, so they must be strings.
        exceName: Forwarded unchanged as the first argument of
            ``createExcel.create`` (presumably the output file name --
            confirm against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has entries.
    """
    html_str = requests.get(url).content.decode("gbk", "replace")
    print(html_str)
    soup = BeautifulSoup(html_str, "lxml")

    sheets = {}
    tables = soup.find_all(name="table", attrs={"border": "1"})
    for index, table in enumerate(tables):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                text = cell.text
                print(text)
                cells.append(text)
                print("*" * 100)
            rows.append(cells)
        # Tables are paired with names purely by position.
        sheets[classAttr[index]] = rows
    createExcel.create(exceName, **sheets)
Пример #2
0
def gteData(url, classAttr, exceName):
    """Collect the cell text of every table carrying one exact inline style.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with the ``lxml`` parser.  Tables are selected by an
    exact match on their ``style`` attribute.  Each matching table is turned
    into a list of rows, each row being the list of its ``<td>`` texts, and
    stored under the name taken from *classAttr* at the table's position.
    The result is forwarded to ``createExcel.create``.

    The decoded page and every cell text are echoed to stdout while the
    page is processed.

    Args:
        url: Address of the page to fetch.
        classAttr: Names for the matched tables, in document order.  They
            are passed on as keyword-argument names, so they must be strings.
        exceName: Forwarded unchanged as the first argument of
            ``createExcel.create`` (presumably the output file name --
            confirm against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has entries.
    """
    html_str = requests.get(url).content.decode("gbk", "replace")
    print(html_str)
    soup = BeautifulSoup(html_str, "lxml")

    # The attribute value must match this string exactly, so it is kept
    # verbatim and only split here for readability.
    table_style = (
        "PADDING-BOTTOM: 0pt; PADDING-LEFT: 5.4pt; PADDING-RIGHT: 5.4pt; "
        "BORDER-COLLAPSE: collapse; PADDING-TOP: 0pt; "
        "mso-table-layout-alt: fixed"
    )

    sheets = {}
    tables = soup.find_all(name="table", attrs={"style": table_style})
    for index, table in enumerate(tables):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                text = cell.text
                print(text)
                cells.append(text)
                print("*" * 100)
            rows.append(cells)
        # Tables are paired with names purely by position.
        sheets[classAttr[index]] = rows
    createExcel.create(exceName, **sheets)
Пример #3
0
def gteData(url, classAttr, exceName):
    """Split a prize table into first-prize and second-prize row lists.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with ``html.parser``.  For each CSS class in
    *classAttr* the first ``<table>`` with that class is walked row by row.
    Rows seen before a cell whose text starts with "二等奖" (second prize)
    are stored under "一等奖" (first prize); the remaining rows are stored
    under "二等奖".  The result is forwarded to ``createExcel.create``.

    Both keys are fixed, so when *classAttr* names several tables each later
    table overwrites the entries written for the earlier ones.

    Args:
        url: Address of the page to fetch.
        classAttr: Iterable of ``class`` attribute values identifying the
            tables to read.
        exceName: Forwarded unchanged as the first argument of
            ``createExcel.create`` (presumably the output file name --
            confirm against ``createExcel``).

    Raises:
        ValueError: If the page has no table with one of the requested
            classes.
    """
    html_str = requests.get(url).content.decode("gbk", "replace")
    soup = BeautifulSoup(html_str, "html.parser")

    sheets = {}
    for attr in classAttr:
        table = soup.find(name="table", attrs={"class": attr})
        if table is None:
            # find() yields None when nothing matches; fail with a message
            # that names the missing table instead of an AttributeError.
            raise ValueError(
                "no <table> with class %r found at %s" % (attr, url)
            )

        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                if cell.text.startswith("二等奖"):
                    # Heading of the second section: everything gathered so
                    # far is the first-prize list; start over for the rest.
                    sheets["一等奖"] = rows
                    rows = []
                    cells = []
                    continue
                paragraphs = cell.find_all(name="p")
                if len(paragraphs) > 1:
                    # A cell holding several <p> elements contributes one
                    # value per paragraph.
                    cells.extend(paragraph.text for paragraph in paragraphs)
                else:
                    cells.append(cell.text)
            rows.append(cells)
        sheets["二等奖"] = rows
    createExcel.create(exceName, **sheets)
Пример #4
0
def gteData(url, classAttr, exceName):
    """Collect the cell text of one table per requested CSS class.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    dropped) and parsed with ``html.parser``.  For each class in *classAttr*
    the first ``<table>`` with that class is turned into a list of rows, each
    row being the list of its ``<td>`` texts, and stored under the class
    name.  The result is forwarded to ``createExcel.create``.

    Args:
        url: Address of the page to fetch.
        classAttr: Iterable of ``class`` attribute values identifying the
            tables to read.  They are also passed on as keyword-argument
            names, so they must be strings.
        exceName: Forwarded unchanged as the first argument of
            ``createExcel.create`` (presumably the output file name --
            confirm against ``createExcel``).

    Raises:
        ValueError: If the page has no table with one of the requested
            classes.
    """
    html_str = requests.get(url).content.decode("gbk", "ignore")
    soup = BeautifulSoup(html_str, "html.parser")

    sheets = {}
    for attr in classAttr:
        table = soup.find(name="table", attrs={"class": attr})
        if table is None:
            # find() yields None when nothing matches; fail with a message
            # that names the missing table instead of an AttributeError.
            raise ValueError(
                "no <table> with class %r found at %s" % (attr, url)
            )
        sheets[attr] = [
            [cell.text for cell in row.find_all(name="td")]
            for row in table.find_all(name="tr")
        ]
    createExcel.create(exceName, **sheets)
Пример #5
0
def gteData(url, classAttr, exceName):
    """Collect table cells, splitting cells that contain a ``<br>`` in two.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with ``html.parser``.  Every ``<table>`` with
    ``width="95%"`` and ``border="1"`` is turned into a list of rows and
    stored under the name taken from *classAttr* at the table's position.
    The result is forwarded to ``createExcel.create``.

    A ``<td>`` with exactly three direct children, at least one of which is
    a ``<br>`` element, contributes two values to its row: the first and the
    last non-``<br>`` child (an empty string stands in for a missing one).
    Every other cell contributes its full text as a single value.

    Args:
        url: Address of the page to fetch.
        classAttr: Names for the matched tables, in document order.  They
            are passed on as keyword-argument names, so they must be strings.
        exceName: Forwarded unchanged as the first argument of
            ``createExcel.create`` (presumably the output file name --
            confirm against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has entries.
    """
    html_str = requests.get(url).content.decode("gbk", "replace")
    soup = BeautifulSoup(html_str, "html.parser")

    def is_line_break(node):
        # Text nodes are str subclasses and have no tag name; only an actual
        # <br> element counts.  The tag name is compared directly because
        # calling .find() on a child is a substring search for text nodes
        # and a descendant search for tags, neither of which identifies the
        # child itself as a <br>.
        return not isinstance(node, str) and node.name == "br"

    def as_text(node):
        # Normalise both text nodes and nested tags to a plain string.
        return str(node) if isinstance(node, str) else node.get_text()

    sheets = {}
    tables = soup.find_all(name="table", attrs={"width": "95%", "border": "1"})
    for index, table in enumerate(tables):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                children = cell.contents
                if len(children) == 3 and any(map(is_line_break, children)):
                    pieces = [
                        as_text(child)
                        for child in children
                        if not is_line_break(child)
                    ]
                    # Always emit two values so split cells keep the same
                    # column count.
                    cells.append(pieces[0] if pieces else "")
                    cells.append(pieces[-1] if len(pieces) > 1 else "")
                else:
                    cells.append(cell.text)
            rows.append(cells)
        # Tables are paired with names purely by position.
        sheets[classAttr[index]] = rows
    createExcel.create(exceName, **sheets)