async def check_update(self, data) -> Page:
    html = await self.session(id=data)
    parser = HTMLParser(html)
    thread = await self.thread_selector(parser)
    user = [
        await self.pretty_text(p.html)
        for p in parser.css("div.mfd-post-top-0 > a")
    ]
    link = [
        await self.pretty_text(p.html)
        for p in parser.css("div.mfd-post-top-1")
    ]
    posts = [
        await self.pretty_text(p.html)
        for p in parser.css("div.mfd-post-body-right")
    ]
    ids = [int(p.attributes['data-id']) for p in parser.css(self.id_selector)]
    if len(thread) > 0:
        # Pad the shorter lists with the thread title so every post gets a
        # complete (thread, user, link) triple.
        tuple_title = tuple(zip_longest(thread, user, link, fillvalue=thread[0]))
        titles = [f"{t[0]}\n{t[1]}\n{t[2]}" for t in tuple_title]
        return Page([
            SinglePost(title=post_title, md=post_md, id=post_id)
            for post_title, post_md, post_id in zip(titles, posts, ids)
        ])
    return Page()
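# A minimal, self-contained sketch of the zip_longest padding used above
# (the sample values are illustrative, not from the original scraper):
from itertools import zip_longest

thread = ["Thread title"]
users = ["alice", "bob"]
links = ["/p/1"]  # one link missing
print(tuple(zip_longest(thread, users, links, fillvalue=thread[0])))
# -> (('Thread title', 'alice', '/p/1'), ('Thread title', 'bob', 'Thread title'))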
def html_to_text(self, html, *args):
    tree = HTMLParser(html)
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    # Collapse every run of whitespace into a single space.
    text = ' '.join(text.split())
    return text
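# A short usage sketch of the strip-scripts-then-extract pattern above
# (the sample HTML is illustrative):
from selectolax.parser import HTMLParser

sample = "<html><body><p>Hello</p><script>var x = 1;</script><p>world</p></body></html>"
tree = HTMLParser(sample)
for tag in tree.css('script'):
    tag.decompose()
print(tree.body.text(separator=' ').split())  # -> ['Hello', 'world']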
def get_text(url):
    global page_text
    # Note: despite the parameter name, this expects an HTML string;
    # HTMLParser parses markup, it does not fetch URLs.
    tree = HTMLParser(url)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    page_text += tree.body.text()
def get_text_from_html(html):
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
def get_text_content(html) -> Optional[str]:
    tree = HTMLParser(html)
    if tree.body:
        for tag in tree.css("script"):
            tag.decompose()
        for tag in tree.css("style"):
            tag.decompose()
        text = tree.body.text(separator="\n", strip=True)
        return text
def parse_text(html):
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    return tree
def htmltotxt(url):
    # html = urlopen(url).read()
    html = requests.get(url).text
    tree = HTMLParser(html)
    for tag in tree.css('script') + tree.css('style'):
        tag.decompose()
    text = re.sub(" +", " ", tree.body.text().replace("\n", ""))
    title = tree.css_first('title').text().strip()
    hrefs = set(
        prepareUrl(url, match[1]) for match in HREF_REGEX.findall(html)
        if not isUrlNa(match[1]))
    return title, text, hrefs
def worker():
    for i in range(500):
        html = "<span></span><div><p class='p3'>text</p><p class='p3'>sd</p></div><p></p>"
        selector = "p.p3"
        tree = HTMLParser(html)
        assert tree.css_first(selector).text() == 'text'
        for tag in tree.css('p'):
            tag.decompose()
        for tag in tree.css('span'):
            tag.decompose()
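# The loop above reads like a stress test for repeated parse/decompose calls.
# A hedged sketch of driving it from several threads (the thread count of 4
# is an assumption, not from the original test):
import threading

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()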
def get_text_selectolax(html):
    """Parse the HTML from an email and return the crucial parts as text."""
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
def get_text_selectolax(html):
    html_parser = HTMLParser(html)
    if html_parser.body is None:
        return None
    for tag in html_parser.css('script'):
        tag.decompose()
    for tag in html_parser.css('style'):
        tag.decompose()
    parsed_text = html_parser.body.text(separator='\n')
    return parsed_text
def get_text_selectolax(html):
    html = html.strip()
    if len(html) == 0:
        return None
    tree = HTMLParser(html)
    if tree.body is None:  # guard against markup that yields no <body>
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
def parse_selectolax(html):
    tree = HTMLParser(html)
    paragraphs = []
    heads = []
    links = []
    for node in tree.css('p'):
        paragraphs.append(node.text())
    for t in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for node in tree.css(t):
            heads.append(node.text())
    for node in tree.css('a'):
        if 'href' in node.attributes and 'title' in node.attributes:
            # url | title of the news item, etc. that the url points to
            links.append(
                str(node.attributes['href']) + "\\|" + str(node.attributes['title']))
    return "<p>".join(paragraphs), "<h>".join(heads), "<t>".join(links)
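# Quick usage sketch with illustrative HTML (not from the original source):
sample = (
    "<h1>Headline</h1><p>First paragraph.</p>"
    "<a href='/story' title='Story'>link</a><p>Second.</p>"
)
paragraphs, heads, links = parse_selectolax(sample)
print(paragraphs)  # -> First paragraph.<p>Second.
print(heads)       # -> Headline
print(links)       # -> /story\|Story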
def core_course_get_courses(self, username, password):
    # TODO: this could be made even more efficient :/
    page = HTMLParser(self.session.get("http://moodle.fbtuit.uz/my/").text)
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []  # return an empty list if the password is wrong
        page = HTMLParser(self.session.get("http://moodle.fbtuit.uz/my/").text)
    # Each list element has the form "coursename id", e.g. "dasturlash 136".
    course_list = []
    # Parsing is done on the following fragment:
    # <div class="media-body">
    #     <h4 class="h5"><a href="http://moodle.fbtuit.uz/course/view.php?id=173" class="">Kurs nomi</a></h4>
    # </div>
    for node in page.css("div"):
        if 'class' in node.attributes:
            if node.attributes['class'] == 'media-body' and node.text() != '':
                href = node.css_first('a').attributes['href']
                entry = node.text().strip() + " " + href[href.find("=") + 1:]
                if entry in course_list:
                    break
                course_list.append(entry)
    return course_list
def core_course_get_grades(self, username, password):
    # Later: take a course_id parameter and show the grades inside each
    # course in detail.
    """Report the grades by course id."""
    # TODO: make this a bit prettier :)
    page = HTMLParser(
        self.session.get(
            "http://moodle.fbtuit.uz/grade/report/overview/index.php").text)
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return ""
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/grade/report/overview/index.php").text)
    grades = "Kurs nomi|Baho\n"
    # td is the table cell tag; the grades are read out of the table.
    counter = 0
    for node in page.css('td'):
        if node.text() == "":
            return grades
        grades += node.text()
        counter += 1
        if counter % 2:
            grades += "|"
        else:
            grades += "\n"
    return grades  # also return if no empty cell terminated the table
def parse_topics_list(self, page_content: str):
    results = []
    root = HTMLParser(page_content)
    topic_parent_elements = root.css('.bg1,.bg2')
    for element in topic_parent_elements:
        topic_href_element = element.css_first('.topictitle')
        topic_title = topic_href_element.text()
        rel_url = topic_href_element.attributes.get('href')
        author = element.css_first('.author > a').text()
        author_profile_rel_url = element.css_first('.author > a').attributes.get('href')
        answers = element.css_first('.posts').child.html
        views = element.css_first('.views').child.html
        last_post_dt_string = element.css_first(
            '.lastpost span time').attributes.get('datetime')
        data_container = TopicMetaInfo(
            domain=self.domain,
            topic_id=0,
            url=rel_url,
            title=topic_title,
            author=author,
            author_profile_link=author_profile_rel_url,
            posts_count=answers,
            views_count=views,
            last_post_timestamp=last_post_dt_string)
        data_container.process()
        results.append(data_container)
    return results
def listHome():
    html = requests.get(HOST).text
    body = HTMLParser(html)
    selector = "div.sdc-site-tile--has-link"
    for node in body.css(selector):
        for a in node.css('a.sdc-site-tile__headline-link'):
            attributes = a.attributes
            label = a.text(strip=True)
            url = build_url({'action': 'playVoD', 'path': attributes['href']})
        for img in node.css('img.sdc-site-tile__image'):
            attributes = img.attributes
            icon = attributes['src']
            print('label>' + label + '>url>' + url + '>icon>' + icon)
            # addVideo(label, url, icon)
    '''
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup('div', 'sdc-site-tile--has-link'):
        videoitem = item.find('span', {'class': 'sdc-site-tile__badge'})
        if videoitem is not None and videoitem.find('path') is not None:
            headline = item.find('h3', {'class': 'sdc-site-tile__headline'})
            label = headline.span.string
            url = build_url({'action': 'playVoD', 'path': headline.a.get('href')})
            icon = item.img.get('src')
            addVideo(label, url, icon)
    '''
    xbmcplugin.endOfDirectory(addon_handle, cacheToDisc=True)
def get_post_data(thread_url):
    print(f'getting post data from {thread_url}')
    local_path = parse_url(thread_url)['filepath']
    html = open(local_path, "br").read()
    p = HTMLParser(html)
    posts = p.css("li.b-post")
    for post in posts:
        try:
            post_data = {}
            post_data['content'] = post.css_first(
                "div.b-post__content").text().replace('\t', '').replace('\n', '')
            post_data['date'] = format_post_date(
                post.css_first("div.b-post__timestamp").text())
            post_data['author'] = post.css_first(
                "div.author").text().replace('\t', '').replace('\n', '')
            post_data['author_url'] = post.css_first(
                "div.author").css_first("a").attrs['href']
            post_data['author_title'] = post.css_first(
                "div.usertitle").text().replace('\t', '').replace('\n', '')
            logging.info(f"Post data values: {post_data.values()}")
            with open('data/posts.csv', 'a') as f:
                writer = csv.writer(f, delimiter=',')
                writer.writerow(post_data.values())
            # get_html(post_data['author_url'], author=True)
        except Exception:
            logging.exception("Error getting post data")
def get_text_from_html(html):
    """Uses selectolax to parse the HTML."""
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
def get_subforum_data():
    """Parse data for each subforum list present on the main page."""
    html = open("scrape/_main.txt", "br").read()
    p = HTMLParser(html)
    subforum_lists = p.css("tr.subforum-list")
    for subforum_list in subforum_lists:
        subforum_elements = subforum_list.css("div.subforum-info")
        for subforum_element in subforum_elements:
            try:
                subforum_data = {}
                subforum_title_element = subforum_element.css_first('a.subforum-title')
                subforum_data['title'] = subforum_title_element.text()
                subforum_data['url'] = subforum_title_element.attrs['href']
                subforum_title_list.append(parse_url(subforum_data['url'])['forum'])
                counts_text = subforum_element.css_first('span.counts').text()
                counts = counts_text.replace('(', '').replace(')', '').replace(
                    ',', '').split('/')
                subforum_data['topics'] = counts[0]
                subforum_data['posts'] = counts[1]
                logging.info(f"Subforum data is: {subforum_data}")
                with open('data/subforums.csv', 'a') as f:
                    writer = csv.writer(f, delimiter=',')
                    writer.writerow(subforum_data.values())
            except Exception:
                logging.exception("Error getting subforum data")
def extract_text_from_html(html_file):
    with open(html_file, "r", encoding="utf-8") as rf:
        html = rf.read().strip()
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css("script"):
        tag.decompose()
    for tag in tree.css("style"):
        tag.decompose()
    text = tree.body.text(separator="\n")
    return text
def get_imdb_page(show):
    global requests
    logging.info("Scraping information for show: " + show)
    # We only want to query imdb once per show.
    url = ('https://www.imdb.com/search/title?title=' + show +
           '&title_type=tv_series,tv_miniseries&sort=popularity')
    # Make the request and parse the response
    response = get(url, headers=headers)
    if response.status_code != 200:
        logging.warning('Received status code, ' + str(response.status_code))
        raise Exception("Received a non-200 status code!")
    parser = HTMLParser(response.text)
    # Update the request counter and progress display, then wait
    requests += 1
    elapsed_time = time() - start_time
    os.system('clear')
    print('Request: {}; Frequency: {} requests/s'.format(
        requests, requests / elapsed_time))
    # We only care about the divs that have the movie name;
    # imdb_page holds the link to the tv show's imdb page.
    if len(parser.css(".lister-item-header a")) <= 0:
        logging.warning('Did not find any results for: ' + show)
        raise Exception("Did not find a valid imdb page")
    imdb_page = "https://www.imdb.com" + parser.css_first(
        ".lister-item-header a").attributes['href']
    return imdb_page
def Text2(url, tokenize):
    global tree
    global html
    global parsed
    # Url is a module-level cache of the last URL fetched; re-parse only
    # when the requested URL changes.
    if Url != url:
        parsed = False
    if parsed == False:
        html = requests.get(url).content
        tree = HTMLParser(html)
        parsed = True
    """
    soup = BeautifulSoup(html, 'html.parser')
    #soup = soup.find_all(string=lambda text:isinstance(text,Comment))
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    """
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    text = text.replace("\n", "")
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)  # drop (...) and [...] spans
    text = re.sub(r"\s+", " ", text)
    if tokenize == True:
        tokens = nltk.sent_tokenize(text)
        return tokens
    else:
        return text
def remove_html_tags(cell):
    tree = HTMLParser(cell)
    for tag in tree.css('script, style'):
        tag.decompose()
    text_content = tree.text(deep=True)
    return text_content
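# A small usage sketch for remove_html_tags (the cell value is illustrative):
cell = "<div><b>42</b><style>div { color: red; }</style></div>"
print(remove_html_tags(cell))  # -> 42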
def get_text_selectolax(html):
    if isinstance(html, str):
        tree = HTMLParser(html)
        if tree.body is None:
            return None
        for tag in tree.css('script'):
            tag.decompose()
        for tag in tree.css('style'):
            tag.decompose()
        text = (tree.body.text(separator=' ')
                .replace('\n', '')
                .replace('\t', ' ')
                .replace('\xa0', ' '))
        return text
    else:
        return np.nan
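# The np.nan fallback suggests this is meant to be applied to a pandas
# column; a hedged sketch of that usage (the DataFrame and column names
# are assumptions):
import numpy as np
import pandas as pd

df = pd.DataFrame({'html': ['<p>hi</p>', None]})
df['text'] = df['html'].apply(get_text_selectolax)
print(df['text'].tolist())  # -> ['hi', nan]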
def _parse_quote_page(url):
    data = requests.get(url)
    dom = HTMLParser(data.text)
    for tag in dom.css('a.authorOrTitle'):
        if 'href' in tag.attributes:
            return (tag.text(), tag.attributes['href'])
    return None
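# Offline sketch of the same selection logic without the network call
# (the Goodreads-style markup is an assumption):
from selectolax.parser import HTMLParser

sample = '<a class="authorOrTitle" href="/author/1">Jane Doe</a>'
tag = HTMLParser(sample).css_first('a.authorOrTitle')
print(tag.text(), tag.attributes['href'])  # -> Jane Doe /author/1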
def post_about_dangerous_content(build_directory: Path, **config):
    OK_URL_PREFIXES = [
        "https://github.com/mdn/",
    ]
    comments = []
    for doc in get_built_docs(build_directory):
        rendered_html = "\n".join(
            x["value"]["content"]
            for x in doc["body"]
            if x["type"] == "prose" and x["value"]["content"])
        tree = HTMLParser(rendered_html)
        external_urls = defaultdict(int)
        for node in tree.css("a[href]"):
            href = node.attributes.get("href")
            href = href.split("#")[0]
            # We're only interested in external URLs at the moment
            if href.startswith("//") or "://" in href:
                if any(href.lower().startswith(x.lower()) for x in OK_URL_PREFIXES):
                    # exceptions are skipped
                    continue
                external_urls[href] += 1
        if external_urls:
            external_urls_list = []
            for url in sorted(external_urls):
                count = external_urls[url]
                external_urls_list.append(
                    f" - {'🚨 ' if url.startswith('http://') else ''}"
                    f"<{url}> ({count} time{'' if count == 1 else 's'})")
            comments.append((doc, "\n".join(external_urls_list)))
        else:
            comments.append((doc, "No external URLs"))
    heading = "## External URLs\n\n"
    if comments:
        per_doc_comments = []
        for doc, comment in comments:
            lines = []
            if config["prefix"]:
                url = mdn_url_to_dev_url(config["prefix"], doc["mdn_url"])
                lines.append(f"URL: [`{doc['mdn_url']}`]({url})")
            else:
                lines.append(f"URL: `{doc['mdn_url']}`")
            lines.append(f"Title: `{doc['title']}`")
            lines.append(f"[on GitHub]({doc['source']['github_url']})")
            lines.append("")
            lines.append(comment)
            per_doc_comments.append("\n".join(lines))
        return heading + "\n---\n".join(per_doc_comments)
    else:
        return heading + "*no external links in the built pages* 👱🏽"
def get_all_website_links_selectolax(self, url):
    urls = set()
    domain_name = urlparse(url).netloc
    domain_list = [
        'news.cgtn.com', 'newsus.cgtn.com', 'newsaf.cgtn.com', 'newseu.cgtn.com'
    ]
    domain_list.append(domain_name)
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=url, headers=headers)
        r = urllib.request.urlopen(req)
        sll = HTMLParser(r.read())
    except Exception:
        print("request failed")
        return urls
    for a_tag in sll.css("a"):
        if "href" not in a_tag.attributes:
            continue
        href = a_tag.attrs["href"]
        if href == "" or href is None:
            continue
        href = urljoin(url, href)
        # urlparse splits off the query (e.g. /story?id=12345 -> /story),
        # so rebuild the URL with the query re-attached (TODO: robust?).
        parsed_href = urlparse(href)
        href = (parsed_href.scheme + "://" + parsed_href.netloc +
                parsed_href.path + "?" + parsed_href.query)
        if not self.is_valid(href):  # not a valid URL
            continue
        if href in self.internal_urls:  # already in the set
            continue
        if not any(domain in href for domain in domain_list):  # external link
            # if href not in external_urls:
            #     external_urls.add(href)
            continue
        a_tag_text = a_tag.text(deep=True, separator='', strip=False)
        # Skip unless the <a> tag's text OR its href contains a keyword.
        if not (any(word in a_tag_text for word in self.keywords)
                or any(word in href for word in self.keywords)):
            continue
        urls.add(href)
        self.internal_urls.add(href)
    return urls
def core_calendar_get_days(self, username, password, _time=''):
    page = HTMLParser(
        self.session.get(
            f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz{f'&time={_time}' if len(_time) else ''}"
        ).text)
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Saytga kirish":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(
            self.session.get(
                f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz{f'&time={_time}' if _time != '' else ''}"
            ).text)
    data = {}
    for tag in page.css("h2"):
        # Grab the month the calendar is currently showing.
        if tag.attributes.get("class", {}) == "current":
            data['current'] = tag.text()
            break
    for tag in page.css("a"):
        # Shorthand for: 'class' in tag.attributes and tag.attributes['class'] == ...
        if "arrow_link" in tag.attributes.get("class", {}):
            for i in tag.css("span"):
                if i.attributes['class'] == 'arrow_text':
                    name = i.text()
                    data[tag.attributes['class'].split()[-1]] = {
                        'name': name,
                        '_time': tag.attributes['href'][tag.attributes['href'].rfind("=") + 1:]
                    }
    data['days'] = {}
    for table in page.css("table"):
        if 'calendar' not in table.attributes.get('class', {}):
            continue
        for td in table.css("td"):
            day_link = td.css_first("a")
            if day_link and day_link.attributes.get("class", {}) == 'day':
                data['days'][day_link.text()] = day_link.attributes['href'][
                    day_link.attributes['href'].rfind("=") + 1:]
    return data
def clear_text(text: str, rm_strong=True) -> str:
    selector = "strong"
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("\n", " ")
    tree = HTMLParser(text)
    if rm_strong:
        for node in tree.css(selector):
            node.decompose()
    return tree.text().strip()
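# Usage sketch (the sample string is illustrative): NFKD folds the
# non-breaking space into a plain space, and the <strong> node is removed.
print(clear_text("<p>plain <strong>bold</strong>\u00a0text</p>"))  # -> plain  text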
def core_calendar_get_tasks(self, username, password, _time):
    page = HTMLParser(
        self.session.get(
            f"http://moodle.fbtuit.uz/calendar/view.php?view=day&lang=uz&time={_time}"
        ).text)
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Saytga kirish":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(
            self.session.get(
                f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz&time={_time}"
            ).text)
    data = {'tasks': []}
    for h2 in page.css("h2"):
        if h2.attributes.get("class", {}) == "current":
            data['current'] = h2.text()
    counter = 1
    for div in page.css("div"):
        if div.attributes.get("data-type", '') == 'event':
            # "Oxirgi muddat" ("Deadline") is user-facing Uzbek output, kept as-is.
            data['tasks'].append({
                'name': "\n├" + str(counter) + ". " + div.css_first("h3").text() +
                        "\n| Oxirgi muddat: " + div.css_first("span").text()
            })
            for i in div.css("a"):
                if i.text() == "Go to activity":
                    link = i.attributes['href']
                    break
            if "quiz" in link:
                data['tasks'][-1]['name'] = (
                    "\n├ "
                    f"<a href='{div.css_first('a').attributes['href']}'>{div.css_first('h3').text()}</a>"
                    f"\n| Oxirgi muddat: {div.css_first('span').text()}")
                data['tasks'][-1]['callback_data'] = None
            else:
                data['tasks'][-1]['callback_data'] = link[link.rfind("=") + 1:]
            counter += 1
    return data  # print('\n'.join(tasks))