def getNewlyBug(self) -> dict:
    result = {}
    productInfos = self.getProductInfo()
    param = {"charts[]": "openedBugsPerDay"}
    for id, name in productInfos.items():
        date_list = []
        value_list = []
        self.opener.open(self.bugBrowseTemp % id)
        response = self.opener.open(
            self.bugReportTemp % id,
            urllib.parse.urlencode(param).encode('utf-8'))
        soup = BeautifulSoup(response.read(), "html.parser",
                             from_encoding="utf8")
        date_data = soup.find_all(name="td", attrs={'class': 'chart-label'})
        for item in date_data:
            if len(item.contents) == 1:
                date = str(item.contents[0])
                date_list.append(date)
        value_data = soup.find_all(name="td", attrs={'class': 'chart-value'})
        for item in value_data:
            if len(item.contents) == 1:
                value_list.append(int(item.contents[0]))
        # date_list = date_list[-60:]
        # value_list = value_list[-60:]
        bugs_dict = dict(zip(date_list, value_list))
        result[name] = bugs_dict
        soup.clear()
    return result
def get_webflow(response):
    '''Get the lt and execution parameters required by the course-selection page.'''
    soup = BeautifulSoup(response.text, 'html.parser')
    lt = soup.find('input', {'name': 'lt'})['value']
    execution = soup.find('input', {'name': 'execution'})['value']
    soup.clear()
    return (lt, execution)
def get_pois(page_url):
    try:
        poi_request = requests.get(page_url)
        poi_soup = BeautifulSoup(poi_request.text)
        tb_div = poi_soup.find_all(
            'table',
            class_='table table-bordered table-striped table-hover data-table')
        # table header
        thead_ele = tb_div[0].find_all('th')
        # table rows
        rows = tb_div[0].find_all('tr')
        data_path = r'D:\Code\gis-poi\data\poi.txt'
        data = open(data_path, 'a', encoding='utf-8')
        for row in rows:
            cells = row.find_all('td')
            if len(cells) < 13:  # skip the header row, which has no <td> cells
                continue
            # Special characters need to be escaped for the INSERT statement;
            # this is incomplete and only escapes single quotes.
            sql = "insert into " \
                  "tb_poi(f_name,f_pname,f_cname,f_dname,f_dcode,f_tel,f_area,f_address,f_b,f_s,f_x,f_y) " \
                  "values('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}',{10},{11});".format(
                      cells[1].text.replace("'", "\\'"),
                      cells[2].text.replace("'", "\\'"),
                      cells[3].text.replace("'", "\\'"),
                      cells[4].text.replace("'", "\\'"),
                      cells[5].text.replace("'", "\\'"),
                      cells[6].text.replace("'", "\\'"),
                      cells[7].text.replace("'", "\\'"),
                      cells[8].text.replace("'", "\\'"),
                      cells[9].text.replace("'", "\\'"),
                      cells[10].text.replace("'", "\\'"),
                      cells[11].text.replace("'", "\\'"),
                      cells[12].text.replace("'", "\\'"))
            data.write(sql + "\n")
        data.close()
        poi_soup.clear()
        poi_request.close()
    except Exception as e:
        print(e)
class BS4Parser:
    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = [item for item in v
                     if item in ['html5lib', 'html.parser', 'html', 'lxml', 'xml']][0]
            kwargs_new[k] = v

        tag, attr = [x in kwargs_new and kwargs_new.pop(x) or y
                     for (x, y) in [('tag', 'table'), ('attr', '')]]
        if attr:
            args = (re.sub(
                r'(?is).*(<%(tag)s[^>]+%(attr)s[^>]*>.*</%(tag)s>).*'
                % {'tag': tag, 'attr': attr},
                r'<html><head></head><body>\1</body></html>',
                args[0]).strip(),) + args[1:]

        self.soup = BeautifulSoup(*args, **kwargs_new)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
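For reference, a minimal usage sketch of the tag/attr pre-filtering variant above; the html value and the attr pattern are illustrative assumptions, not part of the original snippet. Passing tag and attr trims the raw markup down to just the matching element before BeautifulSoup parses it, which keeps the resulting tree small:

# Hypothetical call: parse only the <table class="data"> part of the page.
html = '<html><body><p>noise</p><table class="data"><tr><td>x</td></tr></table></body></html>'
with BS4Parser(html, features=['html.parser'], tag='table', attr='class="data"') as soup:
    rows = soup.find('table', attrs={'class': 'data'}).find_all('tr')
# On exit, __exit__ has already called soup.clear(True), releasing the parse tree.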
def post(request):
    ##########################################################
    ##### New users: create the DIR first, then crawl    #####
    ##### everything through that DIR, because we want   #####
    ##### one interface to replace this awful, constantly#####
    ##### changing e-learning platform.                  #####
    ##########################################################
    if request.method == "POST":
        # the crawler starts here
        cID = request.POST['stuID']
        cPassword = request.POST['pwd']
        # ecampus login url
        url = "http://ecampus.nqu.edu.tw/eCampus3P/Learn/LoginPage2/product_login.aspx"
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }  # not sure how the headers were captured, but they should all be about the same
        resp = requests.post(
            url,
            headers=headers,
            data={
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': ' /wEPDwUKMjAzODk5NzA3Mg8WAh4EX2N0bAUMYnRuTG9naW5IZWxwFgICAw9kFiYCAQ8WAh4KYmFja2dyb3VuZAUWaW1hZ2VzL3poLVRXL2xvZ2luLmdpZhYMAgEPFgIeBXN0eWxlBRpwb3NpdGlvbjpyZWxhdGl2ZTtsZWZ0OjBweBYCAgEPDxYCHghJbWFnZVVybAUTaW1hZ2VzL3poLVRXL2lkLmdpZmRkAgMPFgIfAgUacG9zaXRpb246cmVsYXRpdmU7bGVmdDowcHhkAgUPFgIfAgUacG9zaXRpb246cmVsYXRpdmU7bGVmdDowcHgWAmYPZBYCAgEPDxYCHwMFGWltYWdlcy96aC1UVy9wYXNzd29yZC5naWZkZAIHDxYCHwIFGnBvc2l0aW9uOnJlbGF0aXZlO2xlZnQ6MHB4ZAIJD2QWCAIBDw8WBh4IQ3NzQ2xhc3MFC21lbnVfdGV4dDAyHgRUZXh0BQ5b5b+Y6KiY5a+G56K8XR4EXyFTQgICZGQCAw8PFgYfBAUQbWVudV90ZXh0MDJfb190dx8FBQ5b55m75YWl6Kqq5piOXR8GAgJkZAIFDw8WBh8EBQttZW51X3RleHQwMh8FBQ5b6Kiq5a6i5Y+D6KeAXR8GAgJkZAIHDw8WCB8EBQttZW51X3RleHQwMh8FBQ5b5Y+D6KeA6Kqy56iLXR8GAgIeB1Zpc2libGVoZGQCCw8PFgIfAwUcaW1hZ2VzL3poLVRXL2xvZ2luIEVudGVyLmpwZxYEHgtvbm1vdXNlb3ZlcgU4amF2YXNjcmlwdDp0aGlzLnNyYz0naW1hZ2VzL3poLVRXL2xvZ2luIEVudGVyX292ZHcuanBnJzseCm9ubW91c2VvdXQFM2phdmFzY3JpcHQ6dGhpcy5zcmM9J2ltYWdlcy96aC1UVy9sb2dpbiBFbnRlci5qcGcnO2QCAw8PFgIfAwUTaW1hZ2VzL3poLVRXL0dCLmdpZmRkAgQPDxYCHwMFE2ltYWdlcy96aC1UVy9Fbi5naWZkZAIGDw8WAh8DBRZpbWFnZXMvemgtVFcvdGl0ZWwuanBnZGQCCA8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+ebuOmXnOmAo+e1kF0fBgICZGQCCg8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+W5s+WPsOS7i+e0uV0fBgICZGQCDA8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+W4uOimi+WVj+mhjF0fBgICZGQCDg8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+mAo+e1oeaIkeWAkV0fBgICZGQCEA8PFggfBAULbWVudV90ZXh0MDIfBQUOW+eUs+iri+W4s+iZn10fBgICHwdoZGQCFA8PFgIfAwUdaW1hZ2VzL3poLVRXL21haW4gcGljdHVyZS5qcGdkZAIWDxYCHwEFH2ltYWdlcy96aC1UVy9sb2dpbiB0ZXh0IHBhbi5qcGdkAhgPDxYCHwMFFWltYWdlcy96aC1UVy9uZXdzLmpwZ2RkAhwPDxYCHwMFGmltYWdlcy96aC1UVy9mcmFtZV90b3AuZ2lmZGQCHg8WAh8BBR9pbWFnZXMvemgtVFcvbG9naW4gdGV4dCBwYW4uanBnZAIgDxYEHgZoZWlnaHQFBTI0MHB4HgNzcmMFFy4uL2xvZ2luX0hlbHBJbmRleC5hc3B4ZAIiDxYCHwEFGGltYWdlcy96aC1UVy9mcmFtZV9SLmdpZmQCJA8PFgIfAwUaaW1hZ2VzL3poLVRXL2ZyYW1lX2Rvdy5naWZkZAIoDxYEHwUFHGVDYW1wdXMgSUlJIHYxLjYuMDkxOTguMDEwNDAfB2dkAi4PDxYCHwMFH2ltYWdlcy96aC1UVy9sb2dvIG9mIDNwcm9iZS5naWZkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAwUIYnRuTG9naW4FCmJ0bkNoaW5lc2UFCmJ0bkVuZ2xpc2hqzHG9hdaHqyty7OyKa8boh3mpUA==',
                '__VIEWSTATEGENERATOR': '8B4B7C2A',
                'txtLoginId': cID,  # studentID
                'txtLoginPwd': cPassword,  # password
                'btnLogin.x': '44',  # the exact value probably doesn't matter
                'btnLogin.y': '25',  # the exact value probably doesn't matter
            })
        soup = BeautifulSoup(resp.text, "lxml")
        course_Name = []
        course_URL = []
        for i in soup.find_all(id=re.compile("CourseName")):
            course_Name.append(i.string)
        for i in soup.find_all(
                "input", {'url': re.compile('stu_course_default.aspx?')}):
            course_URL.append(i["url"][33:69])
        cName = soup.find(id="ctl00_lbAccount").string
        user_Dict = dict(zip(course_Name, course_URL))
        if len(user_Dict) == 0:
            # send back to index with a login error
            return render(request, "error_login.html", {"eooro_login": True})
        soup.decompose = True
        soup.clear()
        try:
            unit = userData.objects.get(cID=cID, cPassword=cPassword)
            print("Someone logged in")
        except:
            cCurrAccID = find_CurrAccID(
                "http://ecampus.nqu.edu.tw/eCampus3P/Learn/stu_course_default.aspx?CourseId="
                + course_URL[0] + "&Role=Student", cID, cPassword)
            unit = userData.objects.create(cID=cID,
                                           cPassword=cPassword,
                                           cCurrAccID=cCurrAccID[10:],
                                           cName=cName)
            unit.save()
            print("Someone created an account")
        request.session["user_id"] = cID  # set this site's session
        return render(request, "course/course.html", locals())  # renders the course.html template
def parseHtml(self, html):
    items = list()
    bs = BeautifulSoup(html, 'html.parser')
    for rootelement in bs.find('ul', attrs={'id': 's-results-list-atf'}):
        for ref in rootelement.find('a'):
            items.append(item(ref['alt'], ref.parent['href']))
    bs.clear(decompose=True)
    return items
def convertHTML2PDF(html_files):
    # Call sorting function
    ordered_html_files = filterAndSortHTMLfiles(html_files)
    for file in ordered_html_files:
        print(file)
        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()
        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find("div", {"class": "fh-wrapper"})
        # "header", "class": "border-vertical"})
        div_fhwrapper.find('div', {"class": "header"}).decompose()
        div_fhwrapper.find('div', {
            "class": "blank-class-outer footer visible-md visible-lg"
        }).decompose()
        div_fhwrapper.find('div', {
            "class": "footer-small visible-xs visible-sm"
        }).decompose()
        # adds # before link
        changeLinksInDIV(div_fhwrapper)
        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})
        # sets id name to div; it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)
        # decreases padding
        div_jumbotron = soup.find("div", {"class": "jumbotron"})
        div_jumbotron.attrs['style'] = "padding-top: 5px !important; padding-bottom: 10px !important; margin-bottom: 0px !important;"
        # deletes empty bullets in list
        for x in div_fhwrapper.findAll('li'):
            # print(x)
            if len(x.get_text(strip=True)) == 0:
                x.extract()
        get_image_file_as_base64_data(div_fhwrapper)
        content = str(div_fhwrapper)
        outputfile.write(content)
        soup.clear()
        # print(content)
        # Close read file in this iteration
        file_text.close()
    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()
    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    div_footer = meshfree_soup.find("div",
                                    {"class": "footer-small visible-xs visible-sm"})
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()
    outputfile.write(content_footer)
def RaspUFG(pag, nArq):
    arq = open('UFG-ValorMensal' + str(nArq) + '.txt', 'w')
    # Loop over all 374 pages listing UFG employees
    for j in range(pag, min(374, pag + 100)):
        # Build the "soup" from the current page of the UFG transparency portal
        html = urllib2.urlopen(
            'http://www.portaldatransparencia.gov.br/servidores/OrgaoLotacao-ListaServidores.asp?CodOS=15000&DescOS=MINISTERIO%20DA%20EDUCACAO&CodOrg=26235&DescOrg=UNIVERSIDADE%20FEDERAL%20DE%20GOIAS&Pagina='
            + str(j))
        bsObj = BeautifulSoup(html, "html.parser")
        # Grab the table holding the employee data
        tabela = bsObj.find(
            "table", {"summary": "Lista de servidores lotados por órgão"})
        teto = 20000.00
        # Extract the data from the table elements
        if tabela != None:
            tds = tabela.findAll("a")
            for link in tds:
                link2 = link.get('href')
                nome = link.contents[0]
                idServidor = link2[44:51]
                if idServidor != "":
                    html = urllib2.urlopen(
                        'http://www.portaldatransparencia.gov.br/servidores/Servidor-DetalhaRemuneracao.asp?Op=3&IdServidor='
                        + str(idServidor)
                        + '&CodOS=15000&CodOrgao=26235&bInformacaoFinanceira=True')
                    bsObj = BeautifulSoup(html, "html.parser")
                    # Grab the row holding the employee's salary data
                    salario = bsObj.find(
                        "tr", {"class": "remuneracaolinhatotalliquida"})
                    if salario != None:
                        salarioValor = salario.find(
                            "td", {"class": "colunaValor"}).get_text()
                    else:
                        salarioValor = ""
                    textoVM = []
                    if salarioValor != "":
                        # If the salary exceeds the cap, record the employee
                        if (float(salarioValor.strip().replace('.', '').replace(',', '.')) >= teto):
                            print nome
                            print salarioValor
                            textoVM.append('Nome:' + nome + '\n')
                            textoVM.append('Salario:' + salarioValor + '\n')
                            arq.writelines(textoVM)
        # Clear the "soup"
        bsObj.clear()
def create_raw_descs(link_inside):
    def _remove_all_attrs(text):
        # removing tag attributes
        for tag in text.find_all(True):
            tag.attrs = {}
        return text

    # FORMING THE DESCRIPTIONS
    page = requests.get(link_inside)  # getting the object from url
    soup = BeautifulSoup(page.content, 'html.parser')  # loading it into the soup
    desc_divs = []  # a list for all descs' divs

    # the code below is a sample, do NOT paste it in your project as is
    """
    main_heading = soup.find("h1")
    main_heading.name = "div"  # change the name for uniformity
    if main_heading:
        desc_divs.append(main_heading)
    else:
        pass
    main_desc = soup.find("div", class_="product-main-text")  # main desc
    if main_desc:
        desc_divs.append(main_desc)
    else:
        pass
    features_table_desc = soup.find("div", class_="title")  # features (table heading)
    if features_table_desc:
        desc_divs.append(features_table_desc)
    else:
        pass
    features_table = soup.find("table", class_="table-striped")  # features table
    if features_table:
        desc_divs.append(features_table)
    else:
        pass
    catalog_detail_block = soup.find("div", class_="catalog_detail_info")  # text after features table
    if catalog_detail_block:
        desc_divs.append(catalog_detail_block)
    else:
        pass
    """

    soup.clear()  # clearing the old soup
    for desc_div in desc_divs:  # loading the new soup with objects from the list
        soup.append(desc_div)
    soup_without_attrs = _remove_all_attrs(soup)  # removing all unnecessary attrs
    return soup_without_attrs
class BS4Parser:
    def __init__(self, *args, **kwargs):
        self.soup = BeautifulSoup(*args, **kwargs)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
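For context, a minimal usage sketch of the plain BS4Parser context manager above; the html value and the parser choice are assumptions for illustration, not part of the original snippet:

# Hypothetical usage: the parse tree is cleared automatically on exit.
html = '<html><body><table class="data"><tr><td>row</td></tr></table></body></html>'
with BS4Parser(html, 'html.parser') as soup:
    table = soup.find('table', attrs={'class': 'data'})
    rows = table.find_all('tr') if table else []
# After the with-block, __exit__ has called soup.clear(True) to free memory.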
class BS4Parser(object):
    def __init__(self, *args, **kwargs):
        self.soup = BeautifulSoup(*args, **kwargs)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        _ = exc_ty, exc_val, tb  # Throw away unused values
        self.soup.clear(True)
        self.soup = None
def data_search(r_text, s_text):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(r_text, "html.parser")
    # read the data
    dirty_desc = soup.find_all("div", class_="pxc-prod-detail-txt")
    dirty_short_desc = soup.h1
    # check for an existing short description; if none, return '0',
    # because on the phoenix site that means there is no data!
    if dirty_short_desc is None:
        return '0'
    dirty_tech_data = soup.find_all("table", class_="pxc-tbl")
    # len(soup.find_all("table", class_="pxc-tbl")) can count the number of tables, and split them
    # test = soup.find_all("table", class_="pxc-tbl")[0]
    soup.clear()
    # extract the text
    desc = str(dirty_desc)
    soup = BeautifulSoup(desc, "html.parser")
    desc = soup.get_text()
    desc = desc.split("\n")
    # extract the text
    short_desc = str(dirty_short_desc)
    soup = BeautifulSoup(short_desc, "html.parser")
    short_desc = soup.get_text()
    # extract the text
    tech_data = str(dirty_tech_data)
    soup = BeautifulSoup(tech_data, "html.parser")
    tech_data = soup.get_text()
    tech_data = tech_data.split("\n")
    tech_data1 = []
    # reassemble the text data
    for a in tech_data:
        if a != '' and a != '[' and a != ']' and a != ', ':
            tech_data1.append(a)
    # load the data into the parser
    soup = BeautifulSoup(s_text, "html.parser")
    # read the data
    dirty_comm_data = soup.find("table", class_="pxc-tbl")
    soup.clear()
    # extract the text
    comm_data = str(dirty_comm_data)
    soup = BeautifulSoup(comm_data, "html.parser")
    comm_data = soup.get_text()
    comm_data = comm_data.split("\n")
    comm_data1 = []
    # reassemble the text data
    for a in comm_data:
        if a != '':
            comm_data1.append(a)
    return short_desc, desc[1], tech_data1, comm_data1
def convertHTML2PDF(html_files):
    # Call sorting function
    ordered_html_files = filterAndSortHTMLfiles(html_files)
    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()
    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    div_footer = meshfree_soup.find(
        "div", {"class": "blank-class-outer footer visible-md visible-lg"})
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()
    for file in ordered_html_files:
        print(file)
        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()
        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find("div", {"class": "fh-wrapper"})
        # "header", "class": "border-vertical"})
        div_fhwrapper.find('div', {"class": "header"}).decompose()
        div_fhwrapper.find('div', {
            "class": "blank-class-outer footer visible-md visible-lg"
        }).decompose()
        # adds # before link
        changeLinksInDIV(div_fhwrapper)
        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})
        # sets id name to div; it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)
        # style = "display:block; clear:both; page-break-after:always;"
        content = str(div_fhwrapper)
        outputfile.write(content)
        soup.clear()
        # print(content)
        # Close read file in this iteration
        file_text.close()
    outputfile.write(content_footer)
def find_byid(id):
    print('start to spider to find tweets')
    print('id is ' + id)
    URL = 'https://twitter.com/%s' % id
    print(URL)
    URL_MOBILE = 'https://mobile.twitter.com'
    f = request.urlopen(URL)
    # html = f.read().encode('utf-8')
    soup = BeautifulSoup(f, 'lxml')
    find = soup.find_all(
        class_='tweet-timestamp js-permalink js-nav js-tooltip', limit=5)
    print(find)
    re_url = []
    re_text = []
    for tag in find:
        dic = tag.attrs
        url = URL_MOBILE + dic['href']
        re_url.append(url)
        # print(dic['href'][1:] + ' this is ' + dic['title'])
    # print(soup)
    find = soup.find_all(
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text',
        limit=5)
    # print(find)
    for tag in find:
        # print(type(tag))
        # print(tag)
        try:
            str = tag.string
            if str == None:
                str = ''
                try:
                    for intag in tag:
                        # intag = tag[0]
                        if not intag.string == None:
                            str += intag.string
                    print(str)
                    re_text.append(str)
                except:
                    print('error to read a string in tag')
            else:
                re_text.append(str)
        except:
            print('error to read the string')
    print(re_text)
    print(re_url)
    f.close()
    soup.clear()
    re = {'pic': re_url, 'text': re_text}
    return re
def get_webflow(self):
    """
    Fetch the webflow serial number. Any CSDN page that contains a link to the
    login page carries this data. Because it changes dynamically, fetch it
    first and keep it for later use.
    :return:
    """
    url = 'https://passport.csdn.net/account/login?ref=toolbar'
    response = self.session.get(url=url, headers=self.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    lt = soup.find('input', {'name': 'lt'})['value']
    execution = soup.find('input', {'name': 'execution'})['value']
    # release objects that are no longer needed
    soup.clear()
    return (lt, execution)
def fill_cupple_tr(tr):
    for td in tr.find_all('td'):
        if is_title_td(td):
            k = td.next_sibling.next_sibling
            if k and (not is_title_td(k)):
                if get_title_td(td).find('意见') != -1:  # '意见' means "comments/opinion"
                    new_tag = BeautifulSoup().new_tag(
                        'textarea', id=get_title_pinyin_td(td))
                    new_tag['rows'] = 20
                    new_tag.clear()
                else:
                    new_tag = BeautifulSoup().new_tag(
                        'input', id=get_title_pinyin_td(td))
                    new_tag['value'] = new_tag['id']
                if k.input:
                    k.input.replace_with(new_tag)
                else:
                    k.append(new_tag)
def _trySuggestions(platform: str, region: int,
                    soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """
    Goes through the list of games and tries to find one that matches the platform

    :param platform: The platform we're looking for
    :param region: Which region. 0: NTSC (JP), 1: NTSC (NA), 2: PAL
    :param soup: BeautifulSoup object
    :return: BeautifulSoup object for new page if found, else a NoneType BeautifulSoup object
    """
    logger.info("Couldn't find game at url. Trying alternatives...")
    titleUrlRegex = re.compile(r'href=\".*?\"')
    titles = soup.find_all("td", {"class": "title"})
    consoles = soup.find_all("td", {"class": "console"})
    url = ""
    for title, console in zip(titles, consoles):
        if console.text.lower().replace(" ", "-") == _platforms[platform][region]:
            url = titleUrlRegex.findall(title.decode()).pop()[5:].strip('"')
            break
    if len(url) > 0:
        logger.info(f"New url found: {url}")
        res = requests.get(url)
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        return soup
    logger.info("Couldn't find title in alternate urls.")
    return soup.clear()
def get_pages(url_path, city_url):
    try:
        city_request = requests.get(city_url)
        city_soup = BeautifulSoup(city_request.text)
        # get the pagination list element
        page_ele = city_soup.find_all('ul', class_='pagination pagination-sm mar-t5')
        page_hrefs = page_ele[0].find_all('a')
        for page_href in page_hrefs:
            if str.strip(page_href.get('href')) == '':
                continue
            page_url = url_path + page_href.get('href')
            get_pois(page_url)
        city_soup.clear()
        city_request.close()
    except Exception as e:
        print(e)
def get_catogrys(url_path):
    try:
        url_request = requests.get(url_path)
        url_soup = BeautifulSoup(url_request.text)
        # get the category elements: <div class="catgory"></div>
        catgory_eles = url_soup.find_all('div', class_='catgory')
        if catgory_eles is None or len(catgory_eles) == 0:
            raise Exception('Could not read the category list')
        for catgory_ele in catgory_eles:
            # get the href attribute of the <a> tag inside the category element
            catgory_url = url_path + catgory_ele.find_all('a')[0].get('href')
            get_citys(url_path, catgory_url)
        # clean up the objects when done
        url_soup.clear()
        url_request.close()
    except Exception as e:
        print(e)
def do_glosary_desc_page(url, download_dir):
    content_type = site_urls[url]['content-type']
    page_file_name = url_to_file_name(url, content_type)
    input_file_path = download_dir + page_file_name
    with open(input_file_path, 'r') as f:
        html = f.read()
    page = BeautifulSoup(html, "html5lib")
    glossary_detail = page.find("div", class_="main-glossary-detail-container")
    #page = BeautifulSoup(html, "html.parser")
    head_lines_text = '''
<html>
<head>
<link href="/assets/style.css" rel="stylesheet">
<link href="/assets/style-override.css" rel="stylesheet">
<style>
#currentGlossaryText {font-weight: 700!important; font-size: 18px!important; display: block!important; }
.hidden { display: block!important; }
</style>
</head>
<body>
'''
    head_lines = BeautifulSoup(head_lines_text, 'html.parser')
    bottom_lines_text = '''
</body>
</html>
'''
    bottom_lines = BeautifulSoup(bottom_lines_text, 'html.parser')
    page.clear()
    page.append(head_lines)
    page.append(glossary_detail)
    page.append(bottom_lines)
    page = fix_links(page, url)
    return page, page_file_name
def test_search():  # pylint: disable=too-many-locals
    """
    Test searching
    """
    url = "http://kickass.to/"
    search_url = (
        "http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc"
    )
    html = getURL(search_url, session=requests.Session())
    if not html:
        return

    soup = BeautifulSoup(html, "html5lib")

    torrent_table = soup.find("table", attrs={"class": "data"})
    torrent_rows = torrent_table.find_all("tr") if torrent_table else []

    # cleanup memory
    soup.clear(True)

    # Continue only if one Release is found
    if len(torrent_rows) < 2:
        print "The data returned does not contain any torrents"
        return

    for row in torrent_rows[1:]:
        try:
            link = urlparse.urljoin(url, (row.find("div", {"class": "torrentname"}).find_all("a")[1])["href"])
            _id = row.get("id")[-7:]
            title = (row.find("div", {"class": "torrentname"}).find_all("a")[1]).text or (
                row.find("div", {"class": "torrentname"}).find_all("a")[2]
            ).text
            url = row.find("a", "imagnet")["href"]
            verified = True if row.find("a", "iverify") else False
            trusted = True if row.find("img", {"alt": "verified"}) else False
            seeders = int(row.find_all("td")[-2].text)
            leechers = int(row.find_all("td")[-1].text)
            _ = link, _id, verified, trusted, seeders, leechers
        except (AttributeError, TypeError):
            continue

        print title
def test_search():  # pylint: disable=too-many-locals
    """
    Test searching
    """
    url = 'http://kickass.to/'
    search_url = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'
    html = getURL(search_url, session=make_session(), returns='text')
    if not html:
        return

    soup = BeautifulSoup(html, 'html5lib')

    torrent_table = soup.find('table', attrs={'class': 'data'})
    torrent_rows = torrent_table.find_all('tr') if torrent_table else []

    # cleanup memory
    soup.clear(True)

    # Continue only if one Release is found
    if len(torrent_rows) < 2:
        print "The data returned does not contain any torrents"
        return

    for row in torrent_rows[1:]:
        try:
            link = urlparse.urljoin(url, (row.find('div', {
                'class': 'torrentname'
            }).find_all('a')[1])['href'])
            _id = row.get('id')[-7:]
            title = (row.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                or (row.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
            url = row.find('a', 'imagnet')['href']
            verified = True if row.find('a', 'iverify') else False
            trusted = True if row.find('img', {'alt': 'verified'}) else False
            seeders = int(row.find_all('td')[-2].text)
            leechers = int(row.find_all('td')[-1].text)
            _ = link, _id, verified, trusted, seeders, leechers
        except (AttributeError, TypeError):
            continue

        print title
def rasparUSP(pag, nArq):
    # Open, in write mode, the file for super-salaries based on the monthly value
    arq = open('USP-ValorMensal' + str(nArq) + '.txt', 'w')
    # Loop over all 895 pages listing USP employees
    for j in range(pag, min(896, pag + 100)):
        # Build the "soup" from the current page of the USP transparency portal
        html = urlopen(
            'https://uspdigital.usp.br/portaltransparencia/portaltransparenciaListar?paginar=s&dtainictc=01%2F12%2F2016&nompes=&nomundorg=&nomdepset=&tipcon=&tipcla=&nomabvfnc=&Submit=Solicitar+pesquisa&reload=buscar&imagem=S&print=true&chars=21ni&pag='
            + str(j))
        bsObj = BeautifulSoup(html, "html.parser")
        # Grab the table holding the employee data
        tabela = bsObj.find("table", {"class": "table_list"})
        # Extract the data cells from the table
        tds = tabela.findAll("td")
        # Salary cap for the state of SP
        teto = 21631.05
        # List used to shape the data of interest
        textoVM = []
        # Loop over all rows, skipping the header
        for i in range(14, len(tds), 14):
            # If the monthly value exceeds the cap, record the employee's data
            if (float(tds[i + 12].getText().strip().replace('.', '').replace(
                    ',', '.')) >= teto):
                textoVM.append('Nome:' + tds[i].getText() + '\n')
                textoVM.append('Instituto:' + tds[i + 2].getText() + '\n')
                textoVM.append('Função:' + tds[i + 8].getText() + '\n')
                textoVM.append('Salário:' + tds[i + 12].getText() + '\n')
                textoVM.append('\n\n')
        # Write the formatted super-salary records to the file
        arq.writelines(textoVM)
        # Clear the "soup"
        bsObj.clear()
        # Show which page was scraped, for the user's benefit
        print(j)
    # Close the files
    arq.close()
def __class_finder(self, session_code):
    self.session.post(OpenClassSearcher.URL, data=self.clg_trm_dict)
    self.cls_details_dict["class_session"] = session_code
    soup = BeautifulSoup(
        self.session.post(OpenClassSearcher.URL,
                          data=self.cls_details_dict).content, 'html.parser')
    results = soup.find_all("td", {"class": "cunylite_LEVEL3GRIDROW"})
    i = 0
    for elem in results:
        val = elem.text.strip()
        if match("^\\d+$", val) and self.class_num_5_digit == int(val):
            self.found = True
            if elem.find_next("img")["title"] == "Open":
                self.status = True
            self.session.close()
            break
        i = i + 1
    soup.clear(decompose=True)
    return self
def get_citys(url_path, catogry):
    try:
        catgory_request = requests.get(catogry)
        catgory_soup = BeautifulSoup(catgory_request.text)
        # read the city tag elements
        city_eles = catgory_soup.find_all('div', class_='col-xs-10')
        for city_ele in city_eles:
            city_eles = city_ele.find_all('a')
            for city_ele in city_eles:
                # the href in the city tag is only the tail of the full URL,
                # so it has to be joined with the base path
                city_url = url_path + city_ele.get('href')
                if city_url.find('北京') == -1:  # only crawl Beijing ('北京')
                    continue
                print(city_url)
                get_pages(url_path, city_url)
        # clean up the objects when done
        catgory_soup.clear()
        catgory_request.close()
    except Exception as e:
        print(e)
def test_search(self):
    self.url = 'http://kickass.to/'
    searchURL = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'
    html = getURL(searchURL, session=requests.Session())
    if not html:
        return

    soup = BeautifulSoup(html, features=["html5lib", "permissive"])

    torrent_table = soup.find('table', attrs={'class': 'data'})
    torrent_rows = torrent_table.find_all('tr') if torrent_table else []

    # cleanup memory
    soup.clear(True)

    # Continue only if one Release is found
    if len(torrent_rows) < 2:
        print(u"The data returned does not contain any torrents")
        return

    for tr in torrent_rows[1:]:
        try:
            link = urlparse.urljoin(self.url, (tr.find('div', {
                'class': 'torrentname'
            }).find_all('a')[1])['href'])
            id = tr.get('id')[-7:]
            title = (tr.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                or (tr.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
            url = tr.find('a', 'imagnet')['href']
            verified = True if tr.find('a', 'iverify') else False
            trusted = True if tr.find('img', {'alt': 'verified'}) else False
            seeders = int(tr.find_all('td')[-2].text)
            leechers = int(tr.find_all('td')[-1].text)
        except (AttributeError, TypeError):
            continue

        print title
def test_search():  # pylint: disable=too-many-locals
    """
    Test searching
    """
    url = 'http://kickass.to/'
    search_url = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'
    html = getURL(search_url, session=requests.Session(), returns='text')
    if not html:
        return

    soup = BeautifulSoup(html, 'html5lib')

    torrent_table = soup.find('table', attrs={'class': 'data'})
    torrent_rows = torrent_table.find_all('tr') if torrent_table else []

    # cleanup memory
    soup.clear(True)

    # Continue only if one Release is found
    if len(torrent_rows) < 2:
        print "The data returned does not contain any torrents"
        return

    for row in torrent_rows[1:]:
        try:
            link = urlparse.urljoin(url, (row.find('div', {'class': 'torrentname'}).find_all('a')[1])['href'])
            _id = row.get('id')[-7:]
            title = (row.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                or (row.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
            url = row.find('a', 'imagnet')['href']
            verified = True if row.find('a', 'iverify') else False
            trusted = True if row.find('img', {'alt': 'verified'}) else False
            seeders = int(row.find_all('td')[-2].text)
            leechers = int(row.find_all('td')[-1].text)
            _ = link, _id, verified, trusted, seeders, leechers
        except (AttributeError, TypeError):
            continue

        print title
class BS4Parser:
    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = [item for item in v
                     if item in ['html5lib', 'html.parser', 'html', 'lxml', 'xml']][0]
            kwargs_new[k] = v
        self.soup = BeautifulSoup(*args, **kwargs_new)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
def extract(html, start, end):
    """Extract a snippet out of an HTML document.

    Locations are computed over UTF-8 bytes and don't count HTML tags.

    Extraction is aware of tags, so:

    >>> '<p><u>Hello</u> there <i>World</i></p>'[17:27]
    'here <i>Wo'
    >>> extract('<p><u>Hello</u> there <i>World</i></p>', 7, 14)
    '<p>here <i>Wo</i></p>'
    """
    soup = BeautifulSoup(html, 'html5lib')

    # Trim the right side first, because that doesn't mess our start position
    if end is not None:
        e = find_pos(soup, end, False)
        e[0].replace_with(NavigableString(split_utf8(e[0].string, e[1])[0]))
        delete_right(soup, e[2])

    # Trim the left side
    if start is not None:
        s = find_pos(soup, start, True)
        s[0].replace_with(NavigableString(split_utf8(s[0].string, s[1])[1]))
        delete_left(soup, s[2])

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
def convertHTML2PDF(html_files):
    # Get sorted list of html files from Web Documentation folder
    #ordered_html_files = filterAndSortHTMLfiles(html_files)
    ordered_html_files = html_files

    # Add Header
    createHeader()
    #header_text = createHeader()
    #outputfile.write(header_text)

    # Add Table of Contents
    outline_text = createOutline(html_files)
    outputfile.write(outline_text)

    outputfile.write("""<div class="my-fh-wrapper"> """)

    ChapterNum = ''
    ChapterNum1 = 1
    ChapterNum2 = 0
    ChapterNum3 = 0
    ChapterNum4 = 0

    # After the header, append the body text of each file to the prepared large html file
    for file in ordered_html_files:
        print(file)
        ####
        level = 1
        if file == 'MESHFREE.html':
            level = 1
            prev_splitted_file = file.split(".")
        else:
            #print(prev_splitted_file)
            splitted_file = file.split(".")
            #print(splitted_file)
            num_level = min(len(prev_splitted_file), len(splitted_file))
            for lev in range(num_level):
                if splitted_file[lev] != prev_splitted_file[lev]:
                    break
                else:
                    level = level + 1
            """
            if level == 1:
                ChapterNum1 = ChapterNum1 + 1
                ChapterNum2 = 0
                ChapterNum3 = 0
                ChapterNum4 = 0
                ChapterNum = str(ChapterNum1)
            """
            if level == 2:
                ChapterNum2 = ChapterNum2 + 1
                ChapterNum3 = 0
                ChapterNum4 = 0
                ChapterNum = str(ChapterNum2) + '. '
            if level == 3:
                ChapterNum3 = ChapterNum3 + 1
                ChapterNum4 = 0
                ChapterNum = str(ChapterNum2) + '.' + str(ChapterNum3) + '. '
            if level == 4:
                ChapterNum4 = ChapterNum4 + 1
                ChapterNum = str(ChapterNum2) + '.' + str(ChapterNum3) + '.' + str(ChapterNum4) + '. '
            prev_splitted_file = splitted_file
            if level > 4:
                level = -1
        #if file.split(".")[0] == 'Index':
        #    level = -1
        ####
        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()
        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find("div", {"class": "fh-wrapper"})
        # "header", "class": "border-vertical"})
        div_fhwrapper.find('div', {"class": "header"}).decompose()
        # Delete footer
        #if div_fhwrapper.find('div', {"class": "blank-class-outer-top footer"}):
        #    div_fhwrapper.find('div', {"class": "blank-class-outer-top footer"}).decompose()
        # Change link in DOWNLOAD COMPREHENSIVE EXAMPLE to download files in svn
        if div_fhwrapper.find('div', {"class": "blank-class-outer-top footer"}):
            div_footer_download = div_fhwrapper.find(
                'div', {"class": "blank-class-outer-top footer"})
            changeLinksInDIV(div_footer_download)
        if div_fhwrapper.find(
                'div', {"class": "blank-class-outer footer visible-md visible-lg"}):
            div_fhwrapper.find(
                'div', {"class": "blank-class-outer footer visible-md visible-lg"}).decompose()
        if div_fhwrapper.find('div', {"class": "footer-small visible-xs visible-sm"}):
            div_fhwrapper.find('div', {"class": "footer-small visible-xs visible-sm"}).decompose()
        # Change style of div "fh-wrapper"
        div_fhwrapper.attrs['style'] = "margin-left: 2.5rem; margin-right: 2.5rem; display: block;"
        # adds # before link, to navigate inside pdf
        changeLinksInDIV(div_fhwrapper)
        #rename_h1_h2_h3_to_p(div_fhwrapper)
        # Delete tables with header text: 'This item referenced in:'
        if div_fhwrapper.find('div', {"class": "blank-class-outer-top"}):
            #div_blank_class_outer_top = div_fhwrapper.find('div', {"class": "blank-class-outer-top"})
            #deleteThisItemReferencedIN(div_blank_class_outer_top)
            for div_blank_class_outer_top in div_fhwrapper.findAll(
                    'div', {"class": "blank-class-outer-top"}):
                # be careful not to change .pdf files links
                deleteThisItemReferencedIN(div_blank_class_outer_top)
        if not div_fhwrapper.find('div', {"class": "blank-class-outer-top"}):
            div_description = soup.find("div", {"class": "description"})
            if str(div_description.text).strip() == "":
                continue
        # Deletes empty <li> tags in this file
        for x in div_fhwrapper.findAll('li'):
            #print(str(x.get_text(strip=True)))
            if len(x.get_text(strip=True)) == 0:
                x.extract()
            for p in x.findAll('p'):
                p.replaceWithChildren()
        # Find div breadcrumbs
        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})
        # sets id name to div; it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)
        # Change styles
        div_bordervertical = div_fhwrapper.find("div", {"class": "border-vertical"})
        if div_bordervertical:
            div_bordervertical.attrs['style'] = 'padding-left: 10px; padding-right: 10px;'
        div_jumbotron = soup.find("div", {"class": "jumbotron"})
        div_jumbotron.attrs['style'] = "padding-top: 5px !important; padding-bottom: 10px !important; margin-bottom: 0px !important;"
        rename_h1_h2_h3_to_p(div_jumbotron, level, ChapterNum)
        # Renames div 'jumbotron' to 'my-jumbotron';
        # that is needed not to inherit many default styles
        for div in soup.find_all('div', class_='jumbotron'):
            pos = div.attrs['class'].index('jumbotron')
            div.attrs['class'][pos] = 'my-jumbotron'
        div_description = soup.find("div", {"class": "description"})
        rename_h1_h2_h3_to_p(div_description)
        for div in div_description.find_all('table', {"id": "customTable"}):
            spanRowsofTable(div)
        """ Deletes many break lines before text """
        strip_text(div_description, file)
        resize_img_responsive(div_description)
        # Replaces images with base64 data
        get_image_file_as_base64_data(div_fhwrapper)
        # Renames div 'fh-wrapper' to 'my-fh-wrapper';
        # that is needed not to inherit many default styles
        for div in soup.find_all('div', class_='fh-wrapper'):
            pos = div.attrs['class'].index('fh-wrapper')
            div.attrs['class'][pos] = 'my-fh-wrapper'
        #for div in soup.find_all('div', class_='code'):
        #    pos = div.attrs['class'].index('code')
        #    div.attrs['class'][pos] = 'my-code'
        # Trims break lines and spaces of code at the start and end
        for div in soup.findAll("div", {"class": "note"}):
            divcode = div.find("div", {"class": "code"})
            if divcode:
                divcode.string = divcode.get_text().strip()
        for div in soup.find_all('div', class_='note'):
            pos = div.attrs['class'].index('note')
            div.attrs['class'][pos] = 'my-note'
        if file == 'MESHFREE.InstallationGuide.Execute.CommandLine.html':
            for div in soup.find_all("div", {"class": "my-note"}):
                div.attrs['style'] = 'white-space: pre-wrap;'
        # Write the content of div 'my-fh-wrapper' to the prepared html file
        div_myfhwrapper = soup.find("div", {"class": "my-fh-wrapper"})
        #content = str(div_myfhwrapper)
        for text in div_myfhwrapper.find_all(recursive=False):
            #print(j)
            #find_all(recursive=False)
            outputfile.write(str(text))
        #outputfile.write(content)
        soup.clear()
        # print(content)
        # Close read file in this iteration
        file_text.close()

    outputfile.write("""</div>""")

    """
    # Add footer to file
    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()
    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    # Find footer
    div_footer = meshfree_soup.find("div", {"class": "footer-small visible-xs visible-sm"})
    # Change links if necessary
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()
    # write to file
    outputfile.write(content_footer)
    """

    content_footer = """
    <div class="footer-small visible-xs visible-sm">
      <div class="blank-class-outer-bottom">
        <div class="blank-class-inner">
          <div class="row ">
            <div class="col-md-12">
              <a href="#MESHFREE.Releases.html" target="_blank">Releases</a>
            </div>
          </div>
        </div>
      </div>
      <div class="blank-class-outer-bottom">
        <div class="blank-class-inner">
          <div class="row ">
            <div class="col-md-12">
              <a href="https://svn.itwm.fraunhofer.de/svn/MESHFREEdocu/Executables/" target="_blank">Executables</a>
            </div>
          </div>
        </div>
      </div>
      <div class="blank-class-outer-left-right">
        <div class="blank-class-inner">
          <div class="row ">
            <div class="col-md-12">
              <a href="http://itwm.fraunhofer.de" target="_blank"> © 2020 Fraunhofer Institute for Industrial Mathematics ITWM</a>
            </div>
          </div>
        </div>
      </div>
    </div>
    """
    outputfile.write(content_footer)
    data = table[row].findChildren(name="td")

    # Getting the name and id of the player card from the futbin <a> element
    player_a_element = data[0].findChild(
        name="a", attrs={"class": "player_name_players_table"})
    player_url = player_a_element['href']
    player_id = player_url.split("/")[3]
    player_name = player_a_element.get_text()
    player_data.append(player_id)
    player_data.append(player_name)

    # Getting overall rating of player
    rating = data[1].findChild(name="span").get_text()
    player_data.append(rating)

    # Getting the stats of the player
    # pace: 8, shooting: 9, passing: 10, dribbling: 11, defending: 12, physicality: 13
    pace = 8
    physicality = 13
    for stat_num in range(pace, physicality + 1):  # inclusive of physicality (index 13)
        stat = data[stat_num].findChild(name="span").get_text()
        player_data.append(stat)

    player_string = ",".join(player_data)
    file.write(player_string + "\n")

soup.clear()
page.close()
file.close()
def highlight(html, highlights, show_tags=False):
    """Highlight part of an HTML document.

    :param highlights: Iterable of (start, end, tags) triples, which are
        computed over UTF-8 bytes and don't count HTML tags
    :param show_tags: Whether to show the tag names within brackets after
        each highlight
    """
    # Build a list of starting points and ending points
    starts = []
    ends = []
    for hl in highlights:
        starts.append((hl[0], 'start', []))
        if len(hl) == 2:
            ends.append((hl[1], 'end', []))
        else:
            ends.append((hl[1], 'end', hl[2]))

    # This relies on the fact that 'end' < 'start'
    events = sorted(ends + starts)
    events = iter(events)

    soup = BeautifulSoup(html, 'html5lib')

    pos = 0
    node = soup
    highlighting = 0

    try:
        event_pos, event_type, tags = next(events)
    except StopIteration:
        event_pos = event_type = tags = None

    while node is not None:
        if getattr(node, 'contents', None):
            # Move down
            node = node.contents[0]
            continue

        if isinstance(node, NavigableString):
            # Move through text
            nb = len(node.string.encode('utf-8'))
            while event_pos is not None:
                if event_pos == pos and event_type == 'start':
                    # Start highlighting at beginning of text node
                    highlighting += 1
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif pos + nb > event_pos:
                    # Next event falls inside of this text node
                    if event_type == 'start' and highlighting:
                        # Keep highlighting (can't highlight *more*)
                        highlighting += 1
                    elif (
                        event_type == 'end'
                        and not show_tags
                        and highlighting > 1
                    ):
                        # Keep highlighting (no need to put labels)
                        highlighting -= 1
                    else:  # 'end' and (show_tags or highlighting becomes 0)
                        # Split it
                        char_idx = byte_to_str_index(
                            node.string,
                            event_pos - pos,
                        )
                        left = node.string[:char_idx]
                        right = node.string[char_idx:]

                        # Left part
                        newnode = NavigableString(left)
                        if highlighting:
                            # Optionally highlight left part
                            span = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            span.append(newnode)
                            newnode = span
                        node.replace_with(newnode)
                        node = newnode

                        if event_type == 'start':
                            highlighting += 1
                        else:
                            highlighting -= 1
                            if show_tags:
                                # Add tag labels
                                comment = soup.new_tag(
                                    'span',
                                    attrs={'class': 'taglist'},
                                )
                                comment.string = ' [%s]' % ', '.join(tags)
                                node.insert_after(comment)
                                node = comment

                        # Right part
                        newnode = NavigableString(right)
                        node.insert_after(newnode)
                        node = newnode
                        nb -= event_pos - pos
                        pos = event_pos
                        # Next loop will highlight right part if needed
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif highlighting:  # and pos + nb <= event_pos
                    # Highlight whole text node
                    newnode = soup.new_tag(
                        'span',
                        attrs={'class': 'highlight'},
                    )
                    node.replace_with(newnode)
                    newnode.append(node)
                    node = newnode
                    if pos + nb == event_pos and event_type == 'end':
                        if show_tags:
                            comment = soup.new_tag(
                                'span',
                                attrs={'class': 'taglist'},
                            )
                            comment.string = ' [%s]' % ', '.join(tags)
                            newnode.insert_after(comment)
                            node = comment
                        highlighting -= 1
                        try:
                            event_pos, event_type, tags = next(events)
                        except StopIteration:
                            event_pos = None
                    break
                else:  # not highlighting and pos + nb <= event_pos
                    # Skip whole text node
                    break
            pos += nb

        # Move up until there's a sibling
        while not node.next_sibling and node.parent:
            node = node.parent
        if not node.parent:
            break

        # Move to next node
        node = node.next_sibling

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
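A small usage sketch for highlight() above; the sample HTML, byte offsets, and tag name are made-up values. Offsets are counted over the tag-free UTF-8 text, so 6-11 covers "World" in "Hello World":

# Hypothetical call: wrap bytes 6-11 of the tag-free text in a highlight span.
snippet = highlight('<p>Hello World</p>', [(6, 11, ['entity'])], show_tags=True)
# "World" comes back inside <span class="highlight">, followed by a
# <span class="taglist"> [entity]</span> label because show_tags is True.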
if "Киләһе бит" in nav_tag.string: next_page_link = HOST_URL + nav_tag.get('href') # getting info from found pages for link in to_parse: webpage = urllib.request.urlopen(link) soup = BeautifulSoup(webpage) link.encode('utf-8') name = soup.find('title').string for tag_refl in soup.find_all('ol', class_="references"): amount = 0 for tag_link in tag_refl.contents: if tag_link.name == 'li': amount += 1 results.append((name, link, amount)) soup.clear() """ looking through tag_refl.contents """ # end of reading Wiki # sorting what we have results.sort(key=lambda x: -x[2]) results = results[:limit + 1] # Jinja code interpretating results result_page = Template(u'''\ <html> <head><title>Results of searching</title></head> <body>