import requests
from bs4 import BeautifulSoup


def get_paper_keywords(link):
    # Search for keywords on the page via its <meta name="keywords"> tag.
    paper_page = requests.get(link)
    paper_soup = BeautifulSoup(paper_page.content, 'html.parser')
    meta_keywords = paper_soup.head('meta', attrs={'name': 'keywords'})
    if len(meta_keywords) != 0:
        return meta_keywords[0]['content']
    else:
        print(link)  # log pages that carry no keywords meta tag
        return None
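# A minimal usage sketch for get_paper_keywords. The URL below is a
# placeholder, not taken from the original code; any page whose <head>
# carries a <meta name="keywords"> tag will return a non-None result.
if __name__ == '__main__':
    keywords = get_paper_keywords('https://example.com/paper')  # hypothetical URL
    if keywords is not None:
        # The content attribute is conventionally a comma-separated list.
        print([kw.strip() for kw in keywords.split(',')])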
import os

import requests
from bs4 import BeautifulSoup
from markupsafe import Markup  # in the original app Markup may come via flask

# SNIPPET_HEADER, SNIPPET_BODY, highlight_css and removeTags() are
# defined elsewhere in this module.


def highlight(url):
    r = requests.get(url)
    html_text = r.text
    soup = BeautifulSoup(html_text, "lxml")

    headSnippetSoup = BeautifulSoup(SNIPPET_HEADER, "lxml")
    bodySnippetSoup = BeautifulSoup(SNIPPET_BODY, "lxml")
    head_snippet = removeTags(headSnippetSoup)
    body_snippet = removeTags(bodySnippetSoup)

    # Inject the highlight CSS and the header snippet into <head>.
    head = soup.head
    head.insert(1, soup.new_tag('style', type='text/css'))
    head.style.append(highlight_css)
    head.insert(0, head_snippet)
    soup.head = head

    # soup.body = add_text(soup, " I didn't find any helpful answers here")
    body = soup.body
    body.insert(0, body_snippet)
    soup.body = body

    newsoup = Markup(soup)
    html = soup.prettify("utf-8")
    templates = "/SnippetIQ/templates/output_template.html"
    pwd = os.getcwd()
    filename = pwd + templates
    with open(filename, "wb") as file:
        file.write(html)
    return newsoup
def _add_instant_tags(self, request, response):
    # Middleware hook: parse the outgoing response and inject tag
    # elements at their configured locations in <head> and <body>.
    if hasattr(response, "content") and getattr(settings, "WTM_INJECT_TAGS", True):
        strategy = TagStrategy(request)
        content = response.content.decode(response.charset)

        doc = BeautifulSoup(content, "html.parser")
        head = getattr(doc, "head", [])
        body = getattr(doc, "body", [])

        for tag in strategy.result:
            obj = tag.get("object")
            element = tag.get("element")

            if head and obj.tag_location == Tag.TOP_HEAD:
                head.insert(1, element)
            elif head and obj.tag_location == Tag.BOTTOM_HEAD:
                head.append(element)
            elif body and obj.tag_location == Tag.TOP_BODY:
                body.insert(1, element)
            elif body and obj.tag_location == Tag.BOTTOM_BODY:
                body.append(element)

        doc.head = head
        doc.body = body

        response.content = doc.encode(formatter=None)
        return response
    return response
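# A self-contained sketch of the insertion pattern the middleware above
# relies on: insert(1, ...) lands just after the first child of a tag,
# while append(...) lands at the end. Everything here is illustrative,
# not part of the middleware itself.
from bs4 import BeautifulSoup

demo = BeautifulSoup(
    "<html><head><title>t</title></head><body><p>hi</p></body></html>",
    "html.parser")
top_script = demo.new_tag("script")
top_script.string = "console.log('top of head');"
demo.head.insert(1, top_script)    # just after <title>
bottom_script = demo.new_tag("script")
bottom_script.string = "console.log('bottom of body');"
demo.body.append(bottom_script)    # last child of <body>
print(demo.prettify())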
from bs4 import BeautifulSoup

# mapfile is the path to the generated map HTML, defined elsewhere.


def insert_stats(stats):
    print("Adding " + str(len(stats)) + " statistics to webpage")
    soup = BeautifulSoup(open(mapfile, "r").read(), "html.parser")
    body = soup.body
    head = soup.head

    # Inject CSS for the floating stats panel.
    head.append(soup.new_tag('style', type='text/css'))
    head.style.append(
        '#stats {background-color:#FF6766;\n\tposition:absolute;\n\ttop:5%;\n\tleft:5%;\n\tpadding:10px;\n\tz-index:999;\n\topacity:0.8;\n\tborder-radius:25px;}'
    )
    soup.head = head
    soup.head.title.string = "Map of Attempted Logins"

    # Build the stats panel: a heading plus one <h3> per statistic.
    stats_div = soup.new_tag('div', id='stats')
    stats_h = soup.new_tag('h1', id='stats_h')
    stats_h['font-weight'] = "bold"
    stats_h.string = "Statistics"
    stats_ul = soup.new_tag('ul')
    for stat in stats:
        new_h = soup.new_tag('h3')
        new_h.string = stat
        stats_ul.insert(len(stats_ul.contents), new_h)
    stats_div.insert(0, stats_h)
    stats_div.insert(1, stats_ul)
    body.insert(0, stats_div)

    newtxt = soup.prettify()
    with open(mapfile, "w") as f:
        f.write(newtxt)
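# A hedged usage sketch: write a bare-bones map page, point mapfile at
# it, then inject a few example statistics. The filename and the stat
# strings are assumptions for illustration only.
mapfile = "map.html"  # hypothetical path
with open(mapfile, "w") as f:
    f.write("<html><head><title>x</title></head><body></body></html>")
insert_stats(["Total attempts: 1234", "Unique IPs: 56"])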
import geopy.distance
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

# log and map_extension are module-level helpers defined elsewhere.


def get_content(lon=None, lat=None):
    log.info(f"Pharmacy map creator started [{lon}][{lat}]")
    locator = Nominatim(user_agent='bot')
    radius = 1000

    # Read the full dataset: every OSM node within `radius` meters.
    raw = requests.get(
        "http://overpass-api.de/api/interpreter?data=<query type='node'><around lat='"
        + str(lat) + "' lon='" + str(lon) + "' radius='" + str(radius)
        + "'/></query><print/>").text
    data = BeautifulSoup(raw, features='xml')
    pharmacies = data.find_all('tag', {'k': 'amenity', 'v': 'pharmacy'})

    # Reverse-geocode each pharmacy node to a human-readable address.
    address = pd.Series([
        locator.reverse((pharmacy.parent['lat'], pharmacy.parent['lon']),
                        timeout=10000).address for pharmacy in pharmacies
    ], dtype=str)
    latitude = pd.Series([pharmacy.parent['lat'] for pharmacy in pharmacies],
                         dtype=float)
    longitude = pd.Series([pharmacy.parent['lon'] for pharmacy in pharmacies],
                          dtype=float)
    data = pd.DataFrame({
        'address': address,
        'latitude': latitude,
        'longitude': longitude
    })
    data['distance'] = data.apply(lambda row: geopy.distance.distance(
        (row['latitude'], row['longitude']), (lat, lon)).km, axis=1)

    # Getting area borders for the rendered map.
    min_lat = data['latitude'].min()
    max_lat = data['latitude'].max()
    min_lon = data['longitude'].min()
    max_lon = data['longitude'].max()

    data = data.sort_values(by=['distance'])
    message = f"Nearest pharmacies within {radius} meters \r\n \r\n"
    for index, point in data.head(min(10, len(data))).iterrows():
        message += f"⚕ [{round(point['distance'], 2)} km] {point['address']} \r\n"

    return map_extension.save_plot(log, data, min_lat, max_lat, min_lon,
                                   max_lon, pt_color='#00EB62FF',
                                   pt_size=1000, user_lat=lat,
                                   user_lon=lon), message
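# A self-contained sketch of just the Overpass step used above: fetch
# all nodes around a point and pick out pharmacies. The coordinates are
# placeholders chosen only for illustration.
import requests
from bs4 import BeautifulSoup

lat, lon, radius = 53.9, 27.56, 1000  # hypothetical user location
raw = requests.get(
    "http://overpass-api.de/api/interpreter?data=<query type='node'>"
    f"<around lat='{lat}' lon='{lon}' radius='{radius}'/></query><print/>").text
nodes = BeautifulSoup(raw, features='xml')
for tag in nodes.find_all('tag', {'k': 'amenity', 'v': 'pharmacy'}):
    # Each matching <tag> sits inside its parent <node>, which carries
    # the coordinates as attributes.
    print(tag.parent['lat'], tag.parent['lon'])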
def _add_instant_tags(self):
    # Variant of the middleware hook above, with request/response state
    # held on self rather than passed in.
    if hasattr(self.response, "content") and getattr(
            settings, "WTM_INJECT_TAGS", True):
        doc = BeautifulSoup(self.response.content, "html.parser")
        head = getattr(doc, "head", [])
        body = getattr(doc, "body", [])

        for tag in self.strategy.result:
            obj = tag.get("object")
            element = tag.get("element")

            if head and obj.tag_location == Tag.TOP_HEAD:
                head.insert(1, element)
            elif head and obj.tag_location == Tag.BOTTOM_HEAD:
                head.append(element)
            elif body and obj.tag_location == Tag.TOP_BODY:
                body.insert(1, element)
            elif body and obj.tag_location == Tag.BOTTOM_BODY:
                body.append(element)

        doc.head = head
        doc.body = body

        self.response.content = doc.decode()
import re
import unicodedata

import urllib3
from bs4 import BeautifulSoup, Comment

# clean_html() and convert_accents() are helpers defined elsewhere.


def fetch_html(url):
    urls = []
    words = ''
    title = ''
    desc = ''
    keywords = ''
    body = ''
    status = ''
    server = ''
    content_type = ''
    last_modified = ''
    err = 0

    # Normalize the URL: strip a trailing slash, then force an http:// prefix.
    url = re.compile(r"/$").sub('', url)
    url = re.compile(r"^http://").sub('', url)
    url = "http://" + url

    http = urllib3.PoolManager()
    response = http.request('GET', url)
    status = response.status
    server = response.headers['Server']
    content_type = response.headers['Content-Type']
    last_modified = response.headers['Date']
    print(response.status)  # 200: ('OK', 'Request fulfilled, document follows')

    if response.status != 200:
        status = response.status
        return (urls, body, title, desc, keywords, status, server,
                content_type, last_modified, err)
    print(response.headers)
    # print(response.data)

    soup = BeautifulSoup(response.data, "lxml")
    try:
        title = clean_html(soup.html.head.title.string)
        title = convert_accents(title)
    except:
        title = ''

    # Scrape <meta name="keywords"> and <meta name="description"> by
    # pattern-matching the serialized tags.
    try:
        for meta in soup.head('meta'):
            ctxt = str(meta)
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*key").findall(
                ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    keywords = temp[1]
                    keywords = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(
                        ' ', keywords)
                    keywords = clean_html(keywords)
                    keywords = convert_accents(keywords)
                    keywords = keywords.strip()
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*descrip").findall(
                ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    desc = temp[1]
                    desc = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(' ', desc)
                    desc = convert_accents(desc)
                    desc = desc.strip()
    except:
        err = 1

    # Drop comments and scripts before extracting text.
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for script in soup.find_all('script'):
        script.extract()

    # Collect absolute outbound links.
    for link in soup.find_all('a', href=True):
        if len(link['href']) > 9:
            pat = re.compile(r'^http').findall(link['href'])
            if pat:
                href = re.compile(r"/$").sub('', link['href'])
                temp = re.compile(r"\.").split(href.lower())
                size = len(temp) - 1
                urls.append(href)

    body = soup.body(text=True)
    body = ' '.join(body)
    body = convert_accents(body)
    body = clean_html(body)
    try:
        body = unicodedata.normalize('NFKD', body).encode('ascii', 'ignore')
    except:
        err = 2
    try:
        title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
    except:
        err = 3

    return (urls, body, title, desc, keywords, status, server, content_type,
            last_modified, err)
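# The regex-over-serialized-tags approach above predates BeautifulSoup's
# attribute matching. A minimal sketch of the idiomatic equivalent,
# offered as an illustration rather than a drop-in replacement:
from bs4 import BeautifulSoup

def meta_content(soup, name):
    # find() with an attrs dict matches <meta name="..."> directly.
    tag = soup.find('meta', attrs={'name': name})
    return tag.get('content', '').strip() if tag else ''

doc = BeautifulSoup(
    '<head><meta name="keywords" content="a, b"/>'
    '<meta name="description" content="demo page"/></head>', 'lxml')
print(meta_content(doc, 'keywords'))     # a, b
print(meta_content(doc, 'description'))  # demo page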
# Excerpt from an earlier (Python 2) version of fetch_html; the
# surrounding function body is not shown, and has_key() is kept as in
# the original, which targeted Python 2.
        error_code = 2
        return (urls, body, title, desc, keywords, error_code, error_reason,
                content_type, last_modified, err)
    err = 0
    if info.has_key("content-type"):
        content_type = str(info["content-type"])
    if info.has_key("last-modified"):
        last_modified = str(info["last-modified"])
    soup = BeautifulSoup(data)
    try:
        title = cleanHTML(soup.html.head.title.string)
        title = convertAccents(title)
    except:
        title = ''
    try:
        for meta in soup.head('meta'):
            ctxt = str(meta)
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*key").findall(ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    keywords = temp[1]
                    keywords = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(' ', keywords)
                    keywords = cleanHTML(keywords)
                    keywords = convertAccents(keywords)
                    keywords = keywords.strip()
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*descrip").findall(ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    desc = temp[1]
# Import part
import urllib
import urllib.request
import json

from bs4 import BeautifulSoup

# Page reading and converting
quote_page = 'https://yandex.by/'  # setting the page to read
page = urllib.request.urlopen(quote_page)  # opening the quoted page
data_raw = page.read()  # reading the page (this consumes the response)
soup_page = BeautifulSoup(data_raw, 'html.parser')  # create format readable by BeautifulSoup

# Start working with BeautifulSoup
head = soup_page.head
#tag_found = soup_page.find('h1', attrs={'class': 'name'})
#tag_found = soup_page.find_all('a')
#name = tag_found.strip()  # strip() is used to remove starting and trailing whitespace
print(head)
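# A self-contained sketch of the find()/find_all() calls hinted at in
# the comments above, run against an inline document so it works
# without network access; the HTML and the class name are made up.
from bs4 import BeautifulSoup

demo_html = '<h1 class="name"> Demo Title </h1><a href="/a">A</a><a href="/b">B</a>'
demo_soup = BeautifulSoup(demo_html, 'html.parser')
tag_found = demo_soup.find('h1', attrs={'class': 'name'})
name = tag_found.text.strip()  # strip() removes leading/trailing whitespace
print(name)                    # Demo Title
print(demo_soup.find_all('a')) # both anchor tags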
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 16 15:57:17 2019

@author: leona
"""
from bs4 import BeautifulSoup
import requests
import json

# Collect hrefs from movies2.html.
soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies2.html'),
                     "html.parser")
soup.head()
lst_a = soup.select('a')
urls = []
for i in lst_a:
    urls.append(i.get('href'))
urls[0]

# Append hrefs from movies1.html.
soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies1.html'),
                     "html.parser")
soup.head()
lst_a = soup.select('a')
lst_a
for i in lst_a:
    urls.append(i.get('href'))
urls[10000]

# Parse movies3.html the same way.
soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies3.html'),
                     "html.parser")
soup.head()
lst_a = soup.select('a')
lst_a
from bs4 import BeautifulSoup

with open("./crawl/beautiful/story.html", "r") as f:
    response = f.read()

soup = BeautifulSoup(response, "html.parser")
print(soup.head())
print("**" * 10)
print(soup.title())
print("**" * 10)
print(soup.body())
print(soup.title.string)
print("**" * 10)
print(soup.title.parent)
print("**" * 10)
print("**" * 10)
print(soup.h1)

p1 = soup.p
print("p class name >> {}".format(p1['class']))

p2 = p1.find_next_sibling("p")
print("next p >> {}".format(p2))
print("p text >> {}".format(p2.string))
print("p gettext >> {}".format(p2.get_text()))
print("p class name >> {}".format(p2["class"]))

b = soup.b
__author__ = 'Martin'

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')  # explicit parser avoids a bs4 warning

if __name__ == '__main__':
    print(soup.head())
    for key in soup.find_all('a'):
        print(key.get('class'), key.get("href"))
    print(soup.find_all('a'))
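# A short follow-on sketch using the same document: select() offers the
# CSS-selector route to the same links, and item lookup pulls their ids.
# Purely illustrative.
for link in soup.select('a.sister'):
    print(link['id'], link.string)  # link1 Elsie, link2 Lacie, link3 Tillie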