def gteData(url, classAttr, exceName):
    """Scrape every ``border="1"`` table of a GBK page and export the cells.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with the ``lxml`` parser. Every matching table is
    turned into a list of rows, each row being the list of texts of the
    ``<td>`` elements found below that ``<tr>``. The table is stored under
    the name found at the same position in *classAttr*, and the resulting
    mapping is handed to ``createExcel.create(exceName, **tables)``.

    The raw HTML, every cell text and a separator line per row are printed
    to stdout as progress/debug output.

    Args:
        url: Address of the page to scrape, e.g.
            "http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm".
        classAttr: Names for the tables, in document order. They are
            forwarded as keyword names, so they must be strings.
        exceName: Passed unchanged as the first argument of
            ``createExcel.create`` (presumably the workbook name -- confirm
            against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has names.
    """
    # Bound the wait so an unresponsive server cannot hang the scrape forever.
    response = requests.get(url, timeout=30)
    html_str = response.content.decode("gbk", "replace")
    print(html_str)
    soup = BeautifulSoup(html_str, "lxml")

    tables = {}
    matches = soup.find_all(name="table", attrs={"border": "1"})
    for index, table in enumerate(matches):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                print(cell.text)
                cells.append(cell.text)
            # Visual separator between rows in the debug output.
            print("*" * 100)
            rows.append(cells)
        tables[classAttr[index]] = rows

    createExcel.create(exceName, **tables)
def gteData(url, classAttr, exceName):
    """Scrape the Word-styled tables of a GBK page and export the cells.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with the ``lxml`` parser. Tables are selected by an
    exact match on their inline ``style`` attribute. Every matching table is
    turned into a list of rows, each row being the list of texts of the
    ``<td>`` elements found below that ``<tr>``. The table is stored under
    the name found at the same position in *classAttr*, and the resulting
    mapping is handed to ``createExcel.create(exceName, **tables)``.

    The raw HTML, every cell text and a separator line per row are printed
    to stdout as progress/debug output.

    Args:
        url: Address of the page to scrape, e.g.
            "http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm".
        classAttr: Names for the tables, in document order. They are
            forwarded as keyword names, so they must be strings.
        exceName: Passed unchanged as the first argument of
            ``createExcel.create`` (presumably the workbook name -- confirm
            against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has names.
    """
    # The inline style is compared as a whole string, so it has to stay
    # character-for-character identical to what the page emits.
    table_style = (
        "PADDING-BOTTOM: 0pt; PADDING-LEFT: 5.4pt; PADDING-RIGHT: 5.4pt; "
        "BORDER-COLLAPSE: collapse; PADDING-TOP: 0pt; "
        "mso-table-layout-alt: fixed"
    )

    # Bound the wait so an unresponsive server cannot hang the scrape forever.
    response = requests.get(url, timeout=30)
    html_str = response.content.decode("gbk", "replace")
    print(html_str)
    soup = BeautifulSoup(html_str, "lxml")

    tables = {}
    matches = soup.find_all(name="table", attrs={"style": table_style})
    for index, table in enumerate(matches):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                print(cell.text)
                cells.append(cell.text)
            # Visual separator between rows in the debug output.
            print("*" * 100)
            rows.append(cells)
        tables[classAttr[index]] = rows

    createExcel.create(exceName, **tables)
def gteData(url, classAttr, exceName):
    """Scrape a prize table and split it into first- and second-prize rows.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with ``html.parser``. For each CSS class in
    *classAttr* the first ``<table>`` with that class is read row by row.
    A cell whose text starts with "二等奖" (second prize) acts as a divider:
    the rows collected so far are stored under "一等奖" (first prize) and
    collection restarts. Whatever has been collected when the table ends is
    stored under "二等奖". The mapping is then handed to
    ``createExcel.create(exceName, **prizes)``.

    A cell containing more than one ``<p>`` contributes one value per
    paragraph; any other cell contributes its whole text as one value.

    Both keys are fixed, so when *classAttr* lists several classes the
    result of a later table overwrites that of an earlier one.

    Args:
        url: Address of the page to scrape, e.g.
            "http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm".
        classAttr: CSS class names of the tables to read.
        exceName: Passed unchanged as the first argument of
            ``createExcel.create`` (presumably the workbook name -- confirm
            against ``createExcel``).

    Raises:
        AttributeError: If the page has no table with one of the requested
            classes (``find`` returns ``None`` in that case).
    """
    # Bound the wait so an unresponsive server cannot hang the scrape forever.
    response = requests.get(url, timeout=30)
    html_str = response.content.decode("gbk", "replace")
    soup = BeautifulSoup(html_str, "html.parser")

    prizes = {}
    for css_class in classAttr:
        table = soup.find(name="table", attrs={"class": css_class})
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                if cell.text.startswith("二等奖"):
                    # Divider reached: everything so far is the first prize.
                    # The cells of the divider row seen up to here are
                    # dropped; the row itself still opens the new section.
                    prizes["一等奖"] = rows
                    rows = []
                    cells = []
                    continue
                paragraphs = cell.find_all(name="p")
                if len(paragraphs) > 1:
                    cells.extend(paragraph.text for paragraph in paragraphs)
                else:
                    cells.append(cell.text)
            rows.append(cells)
        prizes["二等奖"] = rows

    createExcel.create(exceName, **prizes)
def gteData(url, classAttr, exceName):
    """Scrape one table per CSS class from a GBK page and export the cells.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    dropped) and parsed with ``html.parser``. For each CSS class in
    *classAttr* the first ``<table>`` with that class is turned into a list
    of rows, each row being the list of texts of the ``<td>`` elements found
    below that ``<tr>``, and stored under the class name. The mapping is
    then handed to ``createExcel.create(exceName, **tables)``.

    Args:
        url: Address of the page to scrape, e.g.
            "http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm".
        classAttr: CSS class names of the tables to read. They are also
            forwarded as keyword names, so they must be strings.
        exceName: Passed unchanged as the first argument of
            ``createExcel.create`` (presumably the workbook name -- confirm
            against ``createExcel``).

    Raises:
        AttributeError: If the page has no table with one of the requested
            classes (``find`` returns ``None`` in that case).
    """
    # Bound the wait so an unresponsive server cannot hang the scrape forever.
    response = requests.get(url, timeout=30)
    html_str = response.content.decode("gbk", "ignore")
    soup = BeautifulSoup(html_str, "html.parser")

    tables = {}
    for css_class in classAttr:
        table = soup.find(name="table", attrs={"class": css_class})
        tables[css_class] = [
            [cell.text for cell in row.find_all(name="td")]
            for row in table.find_all(name="tr")
        ]

    createExcel.create(exceName, **tables)
def gteData(url, classAttr, exceName):
    """Scrape the 95%-wide bordered tables of a GBK page and export the cells.

    The page at *url* is downloaded, decoded as GBK (undecodable bytes are
    replaced) and parsed with ``html.parser``. Every ``<table>`` with
    ``width="95%"`` and ``border="1"`` is turned into a list of rows and
    stored under the name found at the same position in *classAttr*. The
    mapping is then handed to ``createExcel.create(exceName, **tables)``.

    A cell normally contributes its whole text as one value. A cell with
    exactly three child nodes of which at least one is a ``<br>`` (the
    "text<br/>text" layout) contributes two values instead: its first text
    node and its last other text node, with "" standing in for a missing
    one.

    Args:
        url: Address of the page to scrape, e.g.
            "http://www.most.gov.cn/cxfw/kjjlcx/kjjl2000/200802/t20080214_59081.htm".
        classAttr: Names for the tables, in document order. They are
            forwarded as keyword names, so they must be strings.
        exceName: Passed unchanged as the first argument of
            ``createExcel.create`` (presumably the workbook name -- confirm
            against ``createExcel``).

    Raises:
        IndexError: If the page contains more matching tables than
            *classAttr* has names.
    """
    # Bound the wait so an unresponsive server cannot hang the scrape forever.
    response = requests.get(url, timeout=30)
    html_str = response.content.decode("gbk", "replace")
    soup = BeautifulSoup(html_str, "html.parser")

    tables = {}
    matches = soup.find_all(
        name="table", attrs={"width": "95%", "border": "1"}
    )
    for index, table in enumerate(matches):
        rows = []
        for row in table.find_all(name="tr"):
            cells = []
            for cell in row.find_all(name="td"):
                children = cell.contents
                if len(children) != 3:
                    cells.append(cell.text)
                    continue

                has_br = False
                first = ""
                last = ""
                for node in children:
                    if isinstance(node, str):
                        # Text nodes are str subclasses; store plain copies
                        # so the result does not hold on to the parse tree.
                        if first == "":
                            first = str(node)
                        else:
                            last = str(node)
                    elif node.name == "br":
                        # Recognise the line break by its tag name. Searching
                        # each node for the literal text "<br/>" also fired
                        # for every other tag and split cells that contain no
                        # line break at all.
                        has_br = True

                if has_br:
                    cells.append(first)
                    cells.append(last)
                else:
                    cells.append(cell.text)
            rows.append(cells)
        tables[classAttr[index]] = rows

    createExcel.create(exceName, **tables)