def process_folder(filename):
    print >> sys.stderr, 'Processing %s' % filename
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f.read())
    list_name = soup.title(text=True)[0]
    task_elems = soup.find_all("li", class_="task")
    print >> sys.stderr, 'Found list {0} with {1} tasks'.format(list_name, len(task_elems))
    tasks = []
    for el in task_elems:
        task = {
            'title': list(el.find('a', class_='body').children)[0].strip(),
            'completed': 'completed' in el['class'],
            'created-by': detail(el, 'Created by'),
            'assigned-to': detail(el, 'Assigned to'),
            'created-on': detail(el, 'Created on', dateparser.parse),
            'completed-on': detail(el, 'Completed on', dateparser.parse, True),
            'followers': detail(el, 'Followers'),
            'activities': []
        }
        tasks.append(task)
        #completed_at = el.find(class_='completed-at').text.strip()
        #if completed_at:
        #    task['completed_at'] = dateparser.parse(completed_at)
        for activity_el in el.findAll('li', class_='activity'):
            task['activities'].append({
                'summary': activity_el.find(class_='summary').text.strip(),
                'detail': text(activity_el.find(class_='activity-detail')),
                'date': activity_el.find(class_='date').text.strip()
            })
    return (list_name, tasks)
def s_loader(url, check):
    # suspended(url)
    start_time = time.time()
    urldata = urllib2.urlopen(url, timeout=25).read()
    load_time = time.time() - start_time
    #print "url: " + url + " load %f " % (load_time)
    if check == 1 and load_time > 5:
        time.sleep(5)
        #print url + " step 1 > 4: load_time : %f" % (load_time)
        load_time2 = s_loader(url, 0)
    else:
        load_time2 = load_time
    if check == 1 and load_time2 > 5:
        time.sleep(5)
        #print url + " step 2 > 4: load_time2 : %f" % (load_time2)
        load_time3 = s_loader(url, 0)
    else:
        load_time3 = load_time2
    soup = BeautifulSoup(urldata, "html5lib")
    t = soup.title.string
    broi = len(soup.find_all('img'))
    # treat a page with no images or an "Account Suspended" title as suspended
    if broi == 0 or t == "Account Suspended":
        # if soup.title() != "503 Service Temporarily Unavailable":
        print t
        # suspended(urldata)
        return 999.9
    else:
        return (load_time + load_time2 + load_time3) / 3
def process_one_list_file(filename):
    stderr(u'Processing %s', filename)
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f.read())
    list_name = soup.title(text=True)[0]
    task_elems = soup.find_all("li", class_="task")
    stderr(u'Found list {0} with {1} tasks'.format(list_name, len(task_elems)))
    tasks = []
    for el in task_elems:
        task = {
            'title': list(el.find('a', class_='body').children)[0].strip(),
            'completed': 'completed' in el.find('a', class_='body')['class'],
            'assigned-to': detail(el, 'Assigned to'),
            # 'created-on': detail(el, 'Created on', dateparser.parse),
            # 'completed-on': detail(el, 'Completed on', dateparser.parse, True),
            'subscribers': detail(el, 'Subscribers'),
            'activities': [],
            #'due-on': None,
            #'subtasks': []
        }
        tasks.append(task)
        if el.find('span', class_="due-on"):
            task['due-on'] = dateparser.parse(el.find('span', class_="due-on").text.strip(u'— '))
        for subtask in el.findAll('li', class_='subtask'):
            task.setdefault('subtasks', []).append(subtask.text.strip())
        for activity_el in el.findAll('li', class_='activity'):
            activity = {
                'summary': activity_el.find(class_='summary').text.strip(),
                'date': dateparser.parse(activity_el.find(class_='date').text.strip())
            }
            detail_el = activity_el.find(class_='activity-detail')
            if detail_el:
                detail_html = "".join([str(x) for x in detail_el.contents])
                detail_html = detail_html.decode('utf-8')
                activity['detail'] = detail_html
                activity['detail_plain'] = html2text.html2text(detail_html)
            if detail_el and 'comment' in detail_el['class']:
                activity['is_comment'] = True
            task['activities'].append(activity)
        task['created-at'] = \
            activity_log(task['activities'], 'created this task')
    return (list_name, tasks)
def find(self) -> str:
    bs = BeautifulSoup(urlopen(self.url).read(), 'lxml')
    full_content = bs.title(string=True)[0]
    spl: List[str] = full_content.split(' ')
    rank: List[str] = []
    for x in spl:
        if x in self.ranks:
            print(self.ranks.get(x))
            rank.append(str(self.ranks.get(x)))
        if x.isnumeric():
            rank.append(x)
            break
    return ' '.join(rank)
""" import requests from bs4 import BeautifulSoup #attempting to extract the HTML using the request module import urllib.request, re response = urllib.request.urlopen('https://danpeluso.wordpress.com/') html = response.read() #trying again with high school website for number of times town is said # google says this is 15 response2 = urllib.request.urlopen('https://www.yourhtmlsource.com/') html2 = response2.read() #code from beutiful soup soup = BeautifulSoup(html, 'html.parser') print(soup.title()) for link in soup.find_all('a'): print(link.get('href')) #print(soup.get_text()) soup2 = BeautifulSoup(html2, 'html.parser') numsims = soup2.get_text().lower().count('html') #function for finding number of instances of one word (why not lol) # string -> number def dans(txt): for i in range(len(txt)): if (txt[i] == ('P' or 'p')) and (txt[i + 1] == ('r')) and (txt[i + 2] == 'o'): return (1 + dans(txt[i + 3:]))
# basically, content is a property of requests; the response object is used to
# access features such as content, headers, etc.

# BeautifulSoup can parse almost all the elements of an HTML doc,
# breaking it down into different tags and pieces which can be filtered out for various use cases.

# Parse the HTML
soup = BeautifulSoup(r.content, "html5lib")
# Beautiful Soup is a Python package for parsing HTML (hypertext markup language)
# and XML (extensible markup language) documents (including those with malformed
# markup, i.e. non-closed tags, so named after "tag soup"). It creates a parse
# tree for parsed pages that can be used to extract data from HTML, which is
# useful for web scraping.

print(soup.prettify())
# prettify() in BeautifulSoup lets us view how the tags are nested in the document.

title = soup.title
# commonly used types of objects:
# print(type(title))         # 1 = Tag
# print(type(title.string))  # 2 = NavigableString
# print(type(soup))          # 3 = BeautifulSoup
# 4 = Comment
# markup = "<p><!-- this is a comment --></p>"
# soup2 = BeautifulSoup(markup)
# print(soup2.p.string)
#print(title)

# get all the paragraphs from the website
#paras = soup.find_all('p')
#print(paras)

# get all the anchor tags from the website
#anchor = soup.find_all('a')
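# A minimal self-contained sketch of the four object types listed above
# (BeautifulSoup, Tag, NavigableString, Comment); the markup string is invented
# for illustration.
from bs4 import BeautifulSoup

demo = BeautifulSoup("<p><!--hidden note-->hello</p>", "html.parser")
print(type(demo))    # <class 'bs4.BeautifulSoup'>
print(type(demo.p))  # <class 'bs4.element.Tag'>
comment, text = demo.p.contents
print(type(comment))  # <class 'bs4.element.Comment'>, a NavigableString subclass
print(type(text))     # <class 'bs4.element.NavigableString'>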
def execute_crawler(self, keywords, url):
    # ToDo: when several reporters are mentioned, an email may be matched to the wrong reporter's name. Needs fixing.
    self.log.debug("search keywords - {0}".format(keywords))
    self.log.debug("target url - {0}".format(url))
    for idx, (code, keyword, business_code, business) in tqdm(enumerate(keywords), total=len(keywords)):
        page_num = 1
        search_url = url.format(keyword)
        self.log.debug("search URL - {0}".format(search_url))
        self.driver.get(search_url)
        tmp_df = pd.DataFrame(
            columns=['title', 'link', 'press', 'date', 'reporter', 'email', 'article',
                     'search_keyword', 'company', 'company_code', 'business_code', 'business']
        )
        while page_num <= 10:
            html = self.driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            for urls in soup.select("a.info"):
                if urls["href"].startswith("https://news.naver.com"):
                    self.log.debug("Get URL - {0}".format(urls['href']))
                    self.driver.get(urls["href"])
                    news_html = self.driver.page_source
                    news_html_soup = BeautifulSoup(news_html, 'html.parser')
                    tmp_title = news_html_soup.title(string=True)
                    tmp_date = news_html_soup.select('.t11')
                    tmp_article = news_html_soup.select('#articleBodyContents')
                    tmp_press = news_html_soup.select('#footer address')
                    title = tmp_title[0].replace(" : 네이버 뉴스", "")
                    if len(tmp_date) == 0:
                        tmp_date = news_html_soup.select('.article_info')[0].find('em')
                        p_date = tmp_date.get_text().split(" ")[0]
                    else:
                        p_date = tmp_date[0].get_text().split(" ")[0]
                    p_date = datetime.datetime.strptime(p_date, "%Y.%m.%d.")
                    if len(tmp_article) == 0:
                        tmp_article = news_html_soup.select('#articeBody')
                    article = tmp_article[0].get_text().replace('\n', "").replace('\t', "")
                    if not tmp_press[0].a:
                        tmp_press = news_html_soup.select(".article_footer")
                        press = tmp_press[0].a.get_text().replace("\n", "").replace("\t", "").split(" ")[0]
                    else:
                        press = tmp_press[0].a.get_text()
                    email = ""
                    publisher = ""
                    publisher_match = re.search(self.pattern_publisher, article)
                    email_match = re.search(self.pattern_email, article)
                    if publisher_match and email_match:
                        tmp_publisher = publisher_match.group()
                        tmp_publisher = tmp_publisher.strip().split(" ")
                        if len(tmp_publisher) == 1:
                            publisher = tmp_publisher[0].replace("기자", "")
                        elif len(tmp_publisher) == 2:
                            publisher = tmp_publisher[0]
                        elif len(tmp_publisher) == 3:
                            publisher = tmp_publisher[1]
                        tmp_email = email_match.group()
                        email = tmp_email.strip()
                    tokens = self.mecab.pos(str(article))
                    nouns_tokens = [word for word, tag in tokens if tag == 'NNG' or tag == 'NNP']
                    tokens_str = ' '.join(nouns_tokens)
                    tmp_df = tmp_df.append({
                        'title': title,
                        'link': urls["href"],
                        'press': press,
                        'date': p_date,
                        'reporter': publisher,
                        'email': email,
                        'article': article,
                        'search_keyword': keyword,
                        'company': keyword,
                        'company_code': code,
                        'business_code': business_code,
                        'business': business,
                        'tokens_split': nouns_tokens,
                        'tokens': tokens_str
                    }, ignore_index=True)
                    self.log.debug("title-{0}, date-{1}, press-{2}, company-{3}".format(title, p_date, press, keyword))
                    time.sleep(random.randrange(3, 10))
                    self.driver.back()
            page_num += 1
            if len(self.driver.find_elements_by_class_name('next')) > 0:
                element = self.driver.find_element_by_class_name("next")
                element.click()
            else:
                break
        self.log.debug("Save News data cnt - {0}".format(len(tmp_df)))
        if len(tmp_df) > 0:
            self.data_handler.save_db(tmp_df)
    self.driver.quit()
...
'''
import bs4
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(html_doc, 'lxml')
# When a tag's find_all method is called, BeautifulSoup searches all of the
# tag's descendants; to search only the tag's direct children, pass the
# recursive=False argument
print(soup.html.find_all("title"))
print(soup.html.find_all("title", recursive=False))
# Only find_all() and find() support the recursive argument

# Calling a tag is like calling find_all()
# (the shorthand form of find_all())
print(soup.find_all("a") == soup("a"))
print(soup.title.find_all(string=True) == soup.title(string=True))

# Calling find_all with limit=1 is no better than calling find() directly
print(soup.find_all("title", limit=1))
print(soup.find('title'))

print("\n\n\n", soup.head.title)
# how it works:
# soup.find("head").find("title")
print(soup.title)
print(soup.head.title == soup.title)
# recursive
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]
soup.html.find_all("title", recursive=False)
# []

# Beautiful Soup offers a lot of tree-searching methods (covered below), and they
# mostly take the same arguments as find_all(): name, attrs, text, limit, and the
# keyword arguments. But the recursive argument is different: find_all() and find()
# are the only methods that support it. Passing recursive=False into a method like
# find_parents() wouldn't be very useful.

# Calling a tag is like calling find_all()
soup.find_all("a")
soup("a")
soup.title.find_all(text=True)
soup.title(text=True)

# These two lines of code are nearly equivalent:
soup.find_all("title", limit=1)
# [<title>The Dormouse's story</title>]
soup.find("title")
# <title>The Dormouse's story</title>

soup.head.title
# <title>The Dormouse's story</title>
soup.find("head").find("title")
# <title>The Dormouse's story</title>
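# A minimal runnable sketch of the recursive argument described above; the
# one-line document is invented for illustration.
from bs4 import BeautifulSoup

doc = BeautifulSoup("<html><head><title>Demo</title></head></html>", "html.parser")
print(doc.html.find_all("title"))                   # [<title>Demo</title>], searches all descendants
print(doc.html.find_all("title", recursive=False))  # [], <title> is a grandchild of <html>
print(doc.head.find_all("title", recursive=False))  # [<title>Demo</title>], a direct child of <head>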
print soup.find_all('a', limit=2)
# returns the first two <a> tags found

# The recursive argument
# When a tag's find_all() method is called, Beautiful Soup searches all of the
# tag's descendants; to search only the tag's direct children, pass recursive=False.
print soup.find_all('title')
# [<title>The Dormouse's story</title>]
print soup.find_all('title', recursive=False)
# []

# Calling a tag is like calling find_all()
# find_all() is just about the most commonly used search method in Beautiful Soup,
# so it has a shorthand: a BeautifulSoup object or a Tag object can be called like
# a function, and the result is the same as calling that object's find_all() method.
# These two lines are equivalent:
print soup.find_all('title')
print soup('title')
# So are these two:
print soup.title.find_all(text=True)
print soup.title(text=True)

# find()
# find(name, attrs, recursive, text, **kwargs)
# find_all() returns every tag in the document that matches, but sometimes we only
# want one result. If the document has only one <body> tag, searching for it with
# find_all() is overkill; calling find_all with limit=1 is no better than calling
# find() directly.
# These two lines are equivalent:
print soup.find_all('title', limit=1)
print soup.find('title')
# The only difference is that find_all() returns a list containing the single result,
# while find() returns the result directly.
# When find_all() finds nothing it returns an empty list; when find() finds nothing
# it returns None.
print soup.find_all('tite', limit=1)
# []
print soup.find('tite')
# None

# soup.head.title is shorthand for tag-name navigation; it works by calling the
# current tag's find() method repeatedly:
print soup.head.title
print soup.find('head').find('title')
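# A small self-contained sketch of the difference just described: find_all()
# returns [] on a miss while find() returns None, so a missed find() needs a
# guard before attribute access. The markup is invented.
from bs4 import BeautifulSoup

page = BeautifulSoup("<html><head><title>Hi</title></head></html>", "html.parser")
print(page.find_all("h1"))  # [], safe to iterate; the loop body simply never runs
missing = page.find("h1")
print(missing)              # None
if missing is not None:     # guard before touching .text or attributes
    print(missing.text)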
from urllib.request import Request, urlopen

url = "https://www.wikipedia.org"
request = Request(url)
response = urlopen(request)
html = response.read()  # read the response and store it
response.close()        # close the connection

# Shortcut for the above steps: use the requests package
import requests
url = "https://www.wikipedia.org"
r = requests.get(url)  # send the request
text = r.text          # the HTML of the response as text

from bs4 import BeautifulSoup
import requests

url = "https://www.wikipedia.org"
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc)
print(soup.prettify())  # prints well-indented HTML
print(soup.title)       # prints the title of the HTML file
print(soup.get_text())  # extracts the text from the HTML file
for link in soup.find_all('a'):
    print(link.get('href'))  # print all the hyperlinks in the html file

import json
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)  # load the JSON into a Python object

r = requests.get(url)
json_data = r.json()
for key, value in json_data.items():
    print(key + ' ', value)

# An API is a bunch of code that allows two software programs to interact with each other
# The string argument searches the document for strings
print(soup.find_all(string='Elsie'))
print(soup.find_all(string=['Elsie', 'Lacie']))
soup.find_all(string=re.compile("Dormouse"))

print("-----------------the limit argument-----------------")
# find_all() returns every match; if the document tree is large, the search can be
# slow. If you don't need every result, the limit argument caps how many are
# returned. It works like the SQL LIMIT keyword, stopping the search as soon as
# limit results have been found.
soup.find_all('a', limit=2)

print('___________________calling a tag like find_all()_______________________')
soup.find_all('a')
# is equivalent to
soup("a")
soup.title.find_all(string=True)
soup.title(string=True)

print("______________CSS selectors________________")
print(soup.select('title'))
print(soup.select("p:nth-of-type(3)"))
# descend through tags level by level
soup.select("body a")
# find the direct child tags of a tag
soup.select('head > title')
soup.select("p > #link1")
# find sibling tags
soup.select("#link1 ~ .sister")
# search by CSS class name
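# A compact runnable sketch of the select() patterns above, using an abbreviated
# version of the "three sisters" snippet from the Beautiful Soup documentation.
from bs4 import BeautifulSoup

sisters = BeautifulSoup(
    '<p class="story"><a href="#" class="sister" id="link1">Elsie</a>'
    '<a href="#" class="sister" id="link2">Lacie</a></p>', 'html.parser')
print(sisters.select("p > #link1"))        # direct child of <p> with id "link1"
print(sisters.select("#link1 ~ .sister"))  # siblings of #link1 with class "sister"
print(sisters.select("p .sister"))         # all descendants of <p> with class "sister"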
##############################################
############# BeautifulSoup
import requests
from bs4 import BeautifulSoup

url = 'https://www.python.org/~guido/'
# package the request, send the request and catch the response
r = requests.get(url)
# extract the response as html
html_doc = r.text
# create a BeautifulSoup object from the html
soup = BeautifulSoup(html_doc)
pretty_soup = soup.prettify()  # prettify the BeautifulSoup object
print(pretty_soup)
guido_title = soup.title  # get the title of the page
print(guido_title)
print(soup.title)
print(soup.get_text())  # get the text of the page
a_tags = soup.find_all('a')  # find all hyperlinks (a tags)
for link in a_tags:
    print(link.get('href'))

############# Scrapy
from scrapy import Selector
import requests

url
html = requests.get(url).content
sel = Selector(text=html)
#Note: x=0 & y=5 is [0,1,2,3,4]; the next numbers should be x=5 & y=10 to check [5,6,7,8,9].
#Note: content[y] does not work if the value of len(content) is used; the last value is len(content)-1.
#Beautiful Soup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
data = {}
count = 0
exceptions = {}
for i in range(x, y):
    try:
        body = []
        bodystring = ''
        aa = a + content[i]
        data[str(i) + '-(0)Reference'] = aa
        html = requests.get(aa)
        soup = BeautifulSoup(html.text, 'html.parser')
        #print soup.title
        title = soup.title.encode('utf-8')
        title = title[7:-8]  # strip the surrounding <title> and </title> tags
        data[str(i) + '-(1)Title'] = title  # Title
        for author in soup.find('footer'):  # Author
            author = author.find('a')
            if author == -1:
                pass
            else:
                #print author
                author1 = author.encode('utf-8')
                author2 = author1[:-4]
                author3 = author2.find('>')
                author4 = author2[author3 + 1:]
                data[str(i) + '-(2)Author'] = author4
        for date in soup.find('time'):  # DateTime
            #print date
try:
    days_left = (difference / subs_per_second) / 86400  # 86400 seconds in a day
    if days_left > 0:
        days_left_array.append(days_left)
except:
    days_left = 999999999999

avg_value = 0
for value in days_left_array:
    avg_value += value
try:
    avg_value = avg_value / len(days_left_array)
except:
    avg_value = 999999999999

print("Subs Per Second:", subs_per_second)
print(first_channel_name.title(), "Subs:", first_channel_subs)
print(second_channel_name.title(), "Subs:", second_channel_subs)
print(first_channel_name.title(), "Gains:", first_channel_gains)
print(second_channel_name.title(), "Gains:", second_channel_gains)
print("Difference:", difference)
print("Change:", change)
print("Time Since Last Update (seconds):", round(time_delta, 2))
print("Days Left:", days_left)
print("Days Left (Avg):", avg_value)
print("-----------------------------")
first = False
time1 = time.time()
#time.sleep(1)
except ValueError:
    if flag == False:  # only alert you the first time
        print(
def handle_text(filename, img_keyword, sound_keyword, video_keyword):
    """
    :param filename: file name of a chapter, such as 'Chapter1.txt', without the directory path.
    :result: a html file
    """
    # open file and read paragraphs
    with open(os.path.join('./text/', filename), 'r+') as f:
        paras = [p.strip() for p in f.readlines() if len(p) > 4]
    # read html template
    with open(r'base.txt', 'r+') as f:
        template_text = f.read()
    temp = BeautifulSoup(template_text, "lxml")
    # replace cover img
    # cover = temp.find('img', {'id': 'cover'})
    # cover['src'] = './pics/cover.jpg'
    # handle title
    title = temp.find('h3')
    title.string = paras[0]
    temp.title.string = paras[0]
    # handle paras
    text_box = temp.find('div', {'id': 'text'})
    js_box = temp.find('script', {'id': 'main'})
    count = [0, 0]
    img_pat = re.compile(r'\((\W+?)\)\[' + img_keyword + r'(\S+?)\]')
    sound_pat = re.compile(r'\((\W+?)\)\[' + sound_keyword + r'(\S+?)\]')
    video_pat = re.compile(r'\((\W+?)\)\[' + video_keyword + r'(\S+?)\]')
    for i in range(1, len(paras)):
        new_p = temp.new_tag('p')
        new_br = temp.new_tag('br')
        # handle img in text
        if img_pat.findall(paras[i]):
            imgs = img_pat.findall(paras[i])  # a list of tuples (text, img_id)
            for img in imgs:
                img_result = insert_img(img[1], temp, count)
                new_img_div, count = img_result[0], img_result[1]
                text_box.append(new_img_div)
            new_p.string = re.sub(img_pat, r'\1', paras[i])  # delete () and []
            # text_box.append(new_p)
            # text_box.append(new_br)
        if sound_pat.findall(paras[i]):
            sounds = sound_pat.findall(paras[i])
            new_p.string = re.sub(sound_pat, r'\1', paras[i])
            for sound in sounds:
                new_play_logo = insert_sound(sound[0], sound[1], paras[i], temp)
                new_p.append(new_play_logo)
            # text_box.append(new_p)
            # text_box.append(new_br)
        if video_pat.findall(paras[i]):
            videos = video_pat.findall(paras[i])
            for video in videos:
                new_video_link = temp.new_string("<a target='_blank' href='" + insert_video(video[1], paras[i], temp) + ".html'>" + video[0] + "</a>")
                new_p.string = re.sub(video_pat, new_video_link, new_p.string)
            new_p = BeautifulSoup(html_parser.unescape(str(new_p)), 'lxml')
        if not (img_pat.findall(paras[i]) or sound_pat.findall(paras[i]) or video_pat.findall(paras[i])):
            new_p.string = paras[i]
        text_box.append(new_p)
        text_box.append(new_br)
    with open('audio.txt', 'r+') as f:
        text = f.read()
    audio_tag = BeautifulSoup(text, 'lxml').div
    text_box.append(audio_tag)
    # add js about sound to html script
    # with open('static/js/audio.js', 'r+') as f:
    #     audio_js = f.read()
    # js_box.append(audio_js)
    with open(filename[:-4] + '.html', 'w+') as f:
        f.write(temp.prettify("utf-8"))
    print '==========finish ' + filename + '=========='
# beautiful soup testing
def has30classes(classes):
    return classes is not None and len(classes) == 30

for elem in soup.find_all(class_=has30classes):
    print(elem.get_text())

# an element whose id contains only letters from a to r
print(soup.find(id=re.compile("^[a-r]*$")))

# calling the soup object and calling find_all are equivalent
print(soup("a") == soup.find_all("a"))
print(soup.title(string=True))

# find parent, find sibling
print(soup.find_parent("div"))
print(soup.title.find_next_sibling("link"))
print(soup.a.find_parent("div"))

# previous sibling, next sibling
print(soup.title.find_previous_sibling())
print(soup.title.find_next_sibling())

# find next, find previous
print()
print(soup.title.find_previous())
print(soup.title.find_next())
driver.get(search_url)
if company in except_company:
    continue
while page_num <= 50:
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    for urls in soup.select("._sp_each_url"):
        if urls["href"].startswith("https://news.naver.com"):
            driver.get(urls["href"])
            news_html = driver.page_source
            news_html_soup = BeautifulSoup(news_html, 'html.parser')
            tmp_title = news_html_soup.title(string=True)
            tmp_date = news_html_soup.select('.t11')
            tmp_article = news_html_soup.select('#articleBodyContents')
            tmp_press = news_html_soup.select('#footer address')
            title = tmp_title[0].replace(" : 네이버 뉴스", "")
            if len(tmp_date) == 0:
                tmp_date = news_html_soup.select('.article_info')[0].find('em')
                p_date = tmp_date.get_text().split(" ")[0]
            else:
                p_date = tmp_date[0].get_text().split(" ")[0]
            p_date = datetime.datetime.strptime(p_date, "%Y.%m.%d.")
            if len(tmp_article) == 0:
                tmp_article = news_html_soup.select('#articeBody')
            article = tmp_article[0].get_text().replace('\n', "").replace('\t', "")
            if not tmp_press[0].a:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# fetch the page HTML from the URL
url = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area').text

# Parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(url, "lxml")

# To look at the HTML underlying the web page
#print(soup.prettify())

# to get the title of the page
soup.title

# use the 'find_all' function to bring back all instances of the 'table'
# tag in the HTML and store them in the 'all_tables' variable
all_tables = soup.find_all("table")
all_tables

# use the 'find' function to get the specific table with the
# 'wikitable sortable' class
my_table = soup.find('table', {'class': 'wikitable sortable'})
my_table

links = my_table.find_all('a')
links
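# pandas can also parse the extracted table directly; a minimal sketch using
# pd.read_html (it relies on lxml, which is already in use above). The first
# element of the returned list is the DataFrame for this table.
df = pd.read_html(str(my_table))[0]
print(df.head())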
def scrape_urls(urls):
    stop_words_ru = get_stop_words('russian')
    stop_words_en = get_stop_words('english')
    stop_words = stop_words_en + stop_words_ru
    stemmer_ru = SnowballStemmer('russian')
    stemmer_en = SnowballStemmer('english')
    df = pd.DataFrame(urls)
    contents = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }
    for row in df.itertuples():
        if not re.compile("^https?://").match(row[1]):
            print('URL must begin with http:// or https://')
            continue
        try:
            response = requests.get(row[1], headers=headers)
        except Exception as e:
            print(row[1], e)
            continue  # skip this URL; otherwise a stale response would be reused below
        if response.status_code == 200:
            try:
                soup = BeautifulSoup(response.content, 'lxml')
                # drop markup that contributes no visible text
                [s.decompose() for s in soup('noscript')]
                [s.decompose() for s in soup('script')]
                [s.decompose() for s in soup('style')]
                title = soup.title(text=True)[0]
                body = soup.body(text=True)
                html_body = ' '.join(body)
            except Exception as e:
                print(row[1], e)
                continue
            try:
                tokenized_title = word_tokenize(title)
                result_title = [
                    stemmer_ru.stem(stemmer_en.stem(i))
                    for i in tokenized_title
                    if i.lower() not in stop_words and i.isalpha() and len(i) > 3
                ]
                text_from_html = html_body.replace('\n', ' ')
                tokenized_html = word_tokenize(text_from_html)
                result_words = [
                    stemmer_ru.stem(stemmer_en.stem(i))
                    for i in tokenized_html
                    if i.lower() not in stop_words and i.isalpha() and len(i) > 3
                ]
                title = ' '.join(result_title).lower()
                content = ' '.join(result_words).lower()
                if len(row) == 3:
                    contents.append(
                        [row[1], ' '.join([title, content]), row[2]])
                else:
                    contents.append(' '.join([title, content]))
            except Exception as e:
                print(row[1], e)
        else:
            print(f'Unable to reach {row[1]}, {response.status_code}')
    return contents