def parse_article_entities(doc):
    html = HTML(html=doc)  # why the extra `html=`? HTML() takes the markup as a keyword argument
    post_entries = html.find('div.r-ent')
    return post_entries
def parse_page(text):
    html_page = HTML(html=text)
    title_css = ('#content > div > div.article > ol > li > div > '
                 'div.info > div.hd > a > span:nth-child(1)')
    titles = html_page.find(title_css)
    for t in titles:
        print(t.text)
import pandas as pd
from datetime import datetime
from requests_html import HTML

with open("cache/wwf.html", "r") as f:
    r = HTML(html=f.read())

# pd.datetime is deprecated (removed in modern pandas); use the standard library
dateparse = lambda x: datetime.strptime(x, "%d %b %Y").date()
timeparse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M")

agenda = []
columns = r.find(".col")
for col in columns:
    day = col.find("h2", first=True).text.split(" ", maxsplit=1)[1]
    day = dateparse(f"{day} 2018")
    events = col.find("div.event")
    for event in events:
        (title, organiser) = event.find("h3", first=True).text.splitlines()
        (start, end) = event.find("h4", first=True).text.split("-")
        start = timeparse(f"{day} {start}")
        end = timeparse(f"{day} {end}")
        description = event.find("p")[1].text
        speakers = [i.text for i in event.find("li")]
        print(day)
        print(start)
        print(end)
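# A possible continuation (an assumption, not in the source): append each
# parsed event to `agenda` inside the inner loop instead of only printing,
# then build a DataFrame for inspection or export.
#
#         agenda.append({"day": day, "start": start, "end": end,
#                        "title": title, "organiser": organiser,
#                        "description": description, "speakers": speakers})
#
# df = pd.DataFrame(agenda)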
def requestDoc(doc):
    html = HTML(html=doc)
    print(html.links)
import re


def parse_item(text):
    '''
    Args:
        text : str - html text

    Returns:
        tuple: (dict, list)
            dict - meta data for this item
            list - tags for this item
    '''
    html = HTML(html=text)
    title_css = 'body > div.container > h3'
    title = html.find(title_css)[0].text
    cover_img_css = 'body > div.container > div.row.movie > div.col-md-9.screencap > a'
    cover_img_url = html.find(cover_img_css)[0].attrs['href']
    tags_css = 'body > div.container > div.row.movie > div.col-md-3.info'
    tags = html.find(tags_css)[0].find('p')
    release_date = tags[1].text
    length = tags[2].text
    sample_img_css = 'body > div.container > #sample-waterfall > a.sample-box'
    samples = html.find(sample_img_css)

    # meta data
    meta = {}
    meta['fanhao'], meta['title'] = title.split(maxsplit=1)
    meta['cover_img_url'] = cover_img_url
    meta['release_date'] = release_date.split()[1]
    meta['length'] = re.search(r'\d+', length).group()

    tag_list = {}
    tag_list.setdefault('star', [])
    tag_list.setdefault('genre', [])
    for tag in tags[3:]:
        links = tag.find('a')
        spans = tag.find('span.header')
        if spans and len(links) == 1:
            tag_type = spans[0].text
            tag_value = links[0].text
            if tag_type != '' and tag_value != '':
                tag_list.setdefault(tag_type, []).append(tag_value)
        else:
            for link in links:
                tag_link = link.attrs['href']
                tag_value = link.text
                tag_type = ''  # reset to avoid reusing a stale type (NameError on first pass otherwise)
                if 'genre' in tag_link:
                    tag_type = 'genre'
                if 'star' in tag_link:
                    tag_type = 'star'
                if tag_type != '' and tag_value != '':
                    tag_list.setdefault(tag_type, []).append(tag_value)

    face_list = []
    cover = create_face('cover', cover_img_url)
    if cover is not None:
        face_list.extend(cover)
    for sample in samples:
        link = sample.attrs['href']
        face_type = 'sample'
        sample_face = create_face(face_type, link)
        if sample_face is not None:
            face_list.extend(sample_face)

    meta['tags'] = tag_list
    return meta, face_list
import typing as t

from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

categories: t.List[str] = [
    "https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/pc/",
    "https://www.amazon.com/Best-Sellers-Sports-Outdoors/zgbs/sporting-goods/",
    "https://www.amazon.com/best-sellers-camera-photo/zgbs/photo/",
]
first_url: str = categories[0]
driver.get(first_url)

# Selenium 4 replaced find_element_by_css_selector with find_element(By.CSS_SELECTOR, ...)
body_el = driver.find_element(By.CSS_SELECTOR, "body")
body_html_str: str = body_el.get_attribute("innerHTML")

# Convert to an HTML instance. The .links attr shows all links in the html.
html_obj = HTML(html=body_html_str)

# Keep only the relative links (those beginning with '/'). Just trims the list a little.
new_links: t.List[str] = [x for x in html_obj.links if x.startswith("/")]
# print(new_links)
# ['/product-reviews/B085M812NM/ref=zg_bs_p...', '/gcx/Gifts-for-Everyone/gfhz/

# Get rid of 'product-reviews/' URLs:
new_links = [x for x in new_links if "product-reviews/" not in x]

# Now with a leaner list of links, let's make our product page links list
product_page_links: t.List[str] = [f"https://amazon.com{x}" for x in new_links]
first_product_link: str = product_page_links[0]
# print(first_product_link)
# https://amazon.com/product-reviews/B07TMJ8S5Z/ref=zg_bs_pc_cr_1/130-9341...
def getchapter(chapter):  # name assumed; the original snippet starts mid-function
    """To change chapter numbers into the desired format for saving"""
    chapter = str(chapter)
    if int(chapter) < 10:
        chapter = '00' + chapter
    elif int(chapter) < 100:
        chapter = '0' + chapter
    return chapter


def getpage(page):
    """To change page numbers into the desired format for saving"""
    page = str(page)
    if int(page) < 10:
        page = '0' + page
    return page


homepage = requests.get('https://www.mangapanda.com/one-piece')
titles = HTML(html=homepage.text)
titles = titles.find('td')
titles = titles[22:-4:2]
site = 'https://www.mangapanda.com'
for chapter in range(fromc, toc + 1):
    link = '/one-piece/' + str(chapter)
    mangalink = requests.get(site + link)
    html = HTML(html=mangalink.text)
    article = html.find('div#selectpage')
import re

import pandas as pd


def scraper(url):  # header reconstructed from the call below
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver.page_source


def extract_id_slug(url_path):
    regex = r"^[^\s]+(?P<id>\d+)-(?P<slug>[\w-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']


content = scraper(url)
html_r = HTML(html=content)
print(html_r)

fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]

datas = []
for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    data = {
        "id": id_,
        "slug": slug_,
        "path": path,
        "scraped": 0,  # True / False -> 1 / 0
    }
    datas.append(data)
def gen_tweets(pages):
    request = session.get(url + '&max_position', headers=headers)

    while pages > 0:
        try:
            json_response = request.json()
            html = HTML(
                html=json_response["items_html"],
                url="bunk",
                default_encoding="utf-8",
            )
        except KeyError:
            raise ValueError(
                f'Oops! Either "{query}" does not exist or is private.'
            )
        except ParserError:
            break

        comma = ","
        dot = "."
        tweets = []
        for tweet, profile in zip(
            html.find(".stream-item"),
            html.find(".js-profile-popup-actionable"),
        ):
            # 10~11 html elements have the `.stream-item` class and their
            # `data-item-type` is `tweet`, but their content doesn't look
            # like a tweet's content
            try:
                text = tweet.find(".tweet-text")[0].full_text
            except IndexError:  # issue #50
                continue

            tweet_id = tweet.attrs["data-item-id"]
            tweet_url = profile.attrs["data-permalink-path"]
            username = profile.attrs["data-screen-name"]
            user_id = profile.attrs["data-user-id"]
            is_pinned = bool(tweet.find("div.pinned"))

            time = datetime.fromtimestamp(
                int(tweet.find("._timestamp")[0].attrs["data-time-ms"]) / 1000.0
            )

            interactions = [x.text for x in tweet.find(".ProfileTweet-actionCount")]
            replies = int(
                interactions[0].split(" ")[0].replace(comma, "").replace(dot, "")
                or interactions[3]
            )
            retweets = int(
                interactions[1].split(" ")[0].replace(comma, "").replace(dot, "")
                or interactions[4]
                or interactions[5]
            )
            likes = int(
                interactions[2].split(" ")[0].replace(comma, "").replace(dot, "")
                or interactions[6]
                or interactions[7]
            )

            hashtags = [
                hashtag_node.full_text
                for hashtag_node in tweet.find(".twitter-hashtag")
            ]

            urls = []
            try:
                urls = [
                    url_node.attrs["data-expanded-url"]
                    for url_node in (
                        tweet.find("a.twitter-timeline-link:not(.u-hidden)")
                        + tweet.find("[class='js-tweet-text-container'] a[data-expanded-url]")
                    )
                ]
            except KeyError:
                pass  # some link nodes carry no data-expanded-url attribute
            urls = list(set(urls))  # delete duplicated elements

            photos = [
                photo_node.attrs["data-image-url"]
                for photo_node in tweet.find(".AdaptiveMedia-photoContainer")
            ]

            is_retweet = bool(
                tweet.find(".js-stream-tweet")[0].attrs.get("data-retweet-id", None)
            )

            videos = []
            video_nodes = tweet.find(".PlayableMedia-player")
            for node in video_nodes:
                styles = node.attrs["style"].split()
                for style in styles:
                    if style.startswith("background"):
                        tmp = style.split("/")[-1]
                        video_id = (
                            tmp[: tmp.index(".jpg")]
                            if ".jpg" in tmp
                            else tmp[: tmp.index(".png")]
                            if ".png" in tmp
                            else None
                        )
                        videos.append({"id": video_id})

            tweets.append(
                {
                    "tweetId": tweet_id,
                    "tweetUrl": tweet_url,
                    "username": username,
                    "userId": user_id,
                    "isRetweet": is_retweet,
                    "isPinned": is_pinned,
                    "time": time,
                    "text": text,
                    "replies": replies,
                    "retweets": retweets,
                    "likes": likes,
                    "entries": {
                        "hashtags": hashtags,
                        "urls": urls,
                        "photos": photos,
                        "videos": videos,
                    },
                }
            )

        last_tweet = html.find(".stream-item")[-1].attrs["data-item-id"]

        for tweet in tweets:
            tweet["text"] = re.sub(r"(\S)http", r"\g<1> http", tweet["text"], 1)
            tweet["text"] = re.sub(
                r"(\S)pic\.twitter", r"\g<1> pic.twitter", tweet["text"], 1
            )
            yield tweet

        request = session.get(
            url,
            params={"max_position": json_response["min_position"]},
            headers=headers,
        )
        pages -= 1
from requests_html import HTML

with open('sample.html', 'r') as sf:
    source = sf.read()
    html = HTML(html=source)

# # Print the whole html content from the html file
# print(html.html)

# # Print the text for the html file
# print(html.text)

# Find the list of articles
articles = html.find('div.article')

for article in articles:
    heading = article.find('h2', first=True).text
    text = article.find('p', first=True).text
    print(heading)
    print(text)
    print()
from requests_html import HTML
import codecs

fp = codecs.open(
    "About this Documentation _ Node.js v8.9.4 Documentation.html",
    "r", "utf-8")
html = HTML(html=fp.read())

# c2 = html.find('#column2', first=True)
# print(c2, dir(c2))

h1s = html.xpath("./body/div/div/div/h1/span/a")
for h1 in h1s:
    print(h1.attrs["id"])
print(len(h1s))

h2s = html.xpath("./body/div/div/ul/li/a")
for i in range(len(h1s)):
    print(h2s[i].attrs["href"])
    print("#" + h1s[i].attrs["id"])
def pullpage(url, forcerefresh=False):
    result = dblink.fetch_resource(url, forcerefresh)
    return HTML(html=result.decode("utf-8"))
from requests_html import HTML

with open('sample.html', encoding="utf8") as soubor:
    obsah = soubor.read()

html = HTML(html=obsah)
for odstavec in html.find('a'):
    print(odstavec.attrs['href'])
def parse_next_link(doc):
    html = HTML(html=doc)
    controls = html.find('.action-bar a.btn.wide')
    link = controls[1].attrs['href']
    return domain + link
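# A minimal usage sketch (an assumption, not in the source): parse_next_link
# pairs with parse_article_entities above to walk PTT-style index pages.
# `requests`, the `domain` value, and the over18 cookie are assumptions here.
import requests

domain = 'https://www.ptt.cc'

def crawl(start_url, pages=3):
    url = start_url
    for _ in range(pages):
        doc = requests.get(url, cookies={'over18': '1'}).text
        for entry in parse_article_entities(doc):
            print(entry.text)
        url = parse_next_link(doc)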
def gen_tweets(pages):
    r = session.get(url, headers=headers)

    while pages > 0:
        try:
            html = HTML(html=r.json()['items_html'],
                        url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{user}" does not exist or is private.')

        comma = ","
        tweets = []
        for tweet in html.find('.stream-item'):
            text = tweet.find('.tweet-text')[0].full_text
            tweetId = tweet.find(
                '.js-permalink')[0].attrs['data-conversation-id']
            time = datetime.fromtimestamp(
                int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)
            interactions = [
                x.text for x in tweet.find('.ProfileTweet-actionCount')
            ]
            replies = int(interactions[0].split(" ")[0].replace(comma, ""))
            retweets = int(interactions[1].split(" ")[0].replace(comma, ""))
            likes = int(interactions[2].split(" ")[0].replace(comma, ""))
            hashtags = [
                hashtag_node.full_text
                for hashtag_node in tweet.find('.twitter-hashtag')
            ]
            urls = [
                url_node.attrs['data-expanded-url']
                for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')
            ]
            photos = [
                photo_node.attrs['data-image-url']
                for photo_node in tweet.find('.AdaptiveMedia-photoContainer')
            ]
            tweets.append({
                'tweetId': tweetId,
                'time': time,
                'text': text,
                'replies': replies,
                'retweets': retweets,
                'likes': likes,
                'entries': {
                    'hashtags': hashtags,
                    'urls': urls,
                    'photos': photos
                }
            })

        last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

        for tweet in tweets:
            if tweet:
                tweet['text'] = re.sub('http', ' http', tweet['text'], 1)
                yield tweet

        r = session.get(url, params={'max_position': last_tweet},
                        headers=headers)
        pages -= 1
def test_get_csrf_token_no_token(self):
    """Should return None when token not found in html"""
    html = HTML(html="<html></html>")
    self.assertIsNone(get_csrf_token(html, "a"))
from requests_html import HTML
from requests import get

document = get(
    'https://spotifycharts.com/regional/global/weekly/latest').content
html = HTML(html=document)

table = html.find('.chart-table tbody', first=True)
songs = table.find('tr')
for song in songs[:10]:
    print(
        song.find('.chart-table-position', first=True).text,
        song.find('.chart-table-track', first=True).text,
    )
def test_get_csrf_token_no_value(self):
    """Should return None when html element has no value"""
    html = HTML(html="<input id='a' />")
    self.assertIsNone(get_csrf_token(html, "a"))
import requests
from requests_html import HTML
from tqdm import tqdm
import re

POST_URL = input("Enter the url of the instagram post: \n")

# matching the input url with instagram's default post url pattern
url_pattern = re.compile(r'https?://(www\.)?instagram\.com/p/\w+')
match = url_pattern.match(POST_URL.strip())

if match:
    chunk_size = 1024
    response = requests.get(POST_URL.strip())
    r_html = HTML(html=response.text)
    meta_tag = r_html.find('meta')
    no_of_meta_elements = len(meta_tag)
    if no_of_meta_elements > 25:
        download_url = meta_tag[24].attrs['content']
    else:
        download_url = meta_tag[10].attrs['content']
    # if the download url is fetched
    download_url_pattern = re.compile(r'https?://instagram\.\w+')
    # returns a match object when the url matches, otherwise None
    is_download_url = download_url_pattern.match(download_url)
    if is_download_url:
        r = requests.get(download_url, stream=True)
        total_size = int(r.headers['Content-Length'])
def test_get_csrf_token(self):
    """Should return token from html element's value"""
    html = HTML(html="<input id='a' value='b' />")
    self.assertEqual(get_csrf_token(html, "a"), 'b')
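# get_csrf_token itself isn't shown in these tests. A minimal sketch that is
# consistent with all three cases (the lookup-by-id behavior is an assumption):
def get_csrf_token(html, element_id):
    """Return the `value` attribute of the element with the given id, or None."""
    element = html.find(f"#{element_id}", first=True)
    if element is None:
        return None
    return element.attrs.get("value")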
def gen_tweets(pages):
    r = session.get(url, headers=headers)

    while pages > 0:
        status = 'ok'
        try:
            html = HTML(html=r.json()['items_html'],
                        url='bunk', default_encoding='utf-8')
        except KeyError:  # let other errors raise
            status = 'page not found'
            break  # nothing to parse on this page

        comma = ","
        dot = "."
        tweets = []
        for tweet in html.find('.stream-item'):
            try:
                text = tweet.find('.tweet-text')[0].full_text
            except IndexError:  # not a real tweet item
                continue

            tweetId = tweet.find(
                '.js-permalink')[0].attrs['data-conversation-id']
            timestamp = datetime.fromtimestamp(
                int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)

            interactions = [x.text for x in tweet.find(
                '.ProfileTweet-actionCount')]
            replies = int(interactions[0].split(" ")[0]
                          .replace(comma, "").replace(dot, ""))
            retweets = int(interactions[1].split(" ")[0]
                           .replace(comma, "").replace(dot, ""))
            likes = int(interactions[2].split(" ")[0]
                        .replace(comma, "").replace(dot, ""))

            hashtags = [hashtag_node.full_text
                        for hashtag_node in tweet.find('.twitter-hashtag')]
            urls = [url_node.attrs['data-expanded-url'] for url_node in
                    tweet.find('a.twitter-timeline-link:not(.u-hidden)')]
            photos = [photo_node.attrs['data-image-url'] for photo_node in
                      tweet.find('.AdaptiveMedia-photoContainer')]

            videos = []
            video_nodes = tweet.find(".PlayableMedia-player")
            for node in video_nodes:
                styles = node.attrs['style'].split()
                for style in styles:
                    if style.startswith('background'):
                        tmp = style.split('/')[-1]
                        video_id = tmp[:tmp.index('.jpg')]
                        videos.append({'id': video_id})

            tweets.append({'tweetId': tweetId,
                           'time': timestamp,
                           'text': text,
                           'replies': replies,
                           'retweets': retweets,
                           'likes': likes,
                           'entries': {
                               'hashtags': hashtags,
                               'urls': urls,
                               'photos': photos,
                               'videos': videos
                           }})

        last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

        for tweet in tweets:
            if tweet:
                tweet['text'] = re.sub('http', ' http', tweet['text'], 1)
                yield {'tweet': tweet, 'status': status}

        r = session.get(
            url, params={'max_position': last_tweet}, headers=headers)
        pages -= 1
        print('progress:', (amountPages - pages) / amountPages * 100, '%')
leetcode_url = "https://leetcode.com/api/problems/all/"
session = HTMLSession()

blogs = []
while True:
    print("current_page: %s" % blog_pageIndex)
    response = session.get(blog_url % blog_pageIndex)
    blog_pageIndex = blog_pageIndex + 1
    items = response.html.find('div.postTitle')
    if len(items) == 0:
        break
    for item in items:
        blog = Blog()
        a = HTML(html=item.html).find('a', first=True)
        if a:
            blog.title = a.text
            blog.href = a.attrs['href']
            blogs.append(blog)

leetcode_headers = {
    # 'Accept': 'application/json, text/javascript, */*; q=0.01',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Content-Type': 'application/json',
    'Cookie': '',
    # 'Referer': 'https://leetcode.com/problemset/all/?status=Solved',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/72.0.3626.119 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest',
}
def gen_tweets(pages):
    r = session.get(url, headers=headers)

    while pages > 0:
        try:
            html = HTML(html=r.json()['items_html'],
                        url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{query}" does not exist or is private.')
        except ParserError:
            break

        comma = ","
        dot = "."
        tweets = []
        for tweet in html.find('.stream-item'):
            # 10~11 html elements have the `.stream-item` class and their
            # `data-item-type` is `tweet`, but their content doesn't look
            # like a tweet's content
            try:
                text = tweet.find('.tweet-text')[0].full_text
            except IndexError:  # issue #50
                continue

            tweet_id = tweet.attrs['data-item-id']
            time = datetime.fromtimestamp(
                int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)

            interactions = [
                x.text for x in tweet.find('.ProfileTweet-actionCount')
            ]
            replies = int(
                interactions[0].split(' ')[0].replace(comma, '').replace(dot, '')
                or interactions[3]
            )
            retweets = int(
                interactions[1].split(' ')[0].replace(comma, '').replace(dot, '')
                or interactions[4]
                or interactions[5]
            )
            likes = int(
                interactions[2].split(' ')[0].replace(comma, '').replace(dot, '')
                or interactions[6]
                or interactions[7]
            )

            hashtags = [
                hashtag_node.full_text
                for hashtag_node in tweet.find('.twitter-hashtag')
            ]
            urls = [
                url_node.attrs['data-expanded-url']
                for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')
            ]
            photos = [
                photo_node.attrs['data-image-url']
                for photo_node in tweet.find('.AdaptiveMedia-photoContainer')
            ]

            is_retweet = bool(
                tweet.find('.js-stream-tweet')[0].attrs.get('data-retweet-id', None)
            )

            videos = []
            video_nodes = tweet.find(".PlayableMedia-player")
            for node in video_nodes:
                styles = node.attrs['style'].split()
                for style in styles:
                    if style.startswith('background'):
                        tmp = style.split('/')[-1]
                        video_id = tmp[:tmp.index('.jpg')]
                        videos.append({'id': video_id})

            tweets.append({
                'tweetId': tweet_id,
                'isRetweet': is_retweet,
                'time': time,
                'text': text,
                'replies': replies,
                'retweets': retweets,
                'likes': likes,
                'entries': {
                    'hashtags': hashtags,
                    'urls': urls,
                    'photos': photos,
                    'videos': videos
                }
            })

        last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

        for tweet in tweets:
            if tweet:
                # keep the character before 'http'/'pic.twitter' and insert a space
                tweet['text'] = re.sub(r'(\S)http', r'\1 http',
                                       tweet['text'], 1)
                tweet['text'] = re.sub(r'(\S)pic\.twitter', r'\1 pic.twitter',
                                       tweet['text'], 1)
                yield tweet

        r = session.get(url, params={'max_position': last_tweet},
                        headers=headers)
        pages -= 1
# Python Tutorial: Web Scraping with Requests-HTML
from requests_html import HTML, HTMLSession
import csv

# Open HTML file and pass HTML contents into HTML class. Parse HTML directly.
with open("simple.html", "r") as htmlfile:
    source = htmlfile.read()
    htmlcode = HTML(html=source)

# print(htmlcode.html)
'''
<!doctype html>
<html class="no-js" lang="">
    <head>
        <title>Test - A Sample Website</title>
        <meta charset="utf-8">
        <link rel="stylesheet" href="css/normalize.css">
        <link rel="stylesheet" href="css/main.css">
    </head>
    <body>
        <h1 id='site_title'>Test Website</h1>
        <hr></hr>
        <div class="article">
            <h2><a href="article_1.html">Article 1 Headline</a></h2>
            <p>This is a summary of article 1</p>
        </div>
        ...
'''

print(htmlcode.text)
'''
Test - A Sample Website
Test Website
'''
def transfer(org_info):
    es_action_list = []
    for each_punishAnnouncement in db.announcement.find(
            {
                'status': 'checked',
                'es_status': {'$nin': ['inserted']},
                # '_id': ObjectId("5c7e0b01c663849a6fd9752f"),
                'announcementOrg': {'$regex': org_info}
            },
            no_cursor_timeout=True):
        try:
            logger.info(str(each_punishAnnouncement['_id']))
            res = es.get(
                index=str(config['Aliyun_ES']['dev_data_index_name']).strip(),
                doc_type=str(config['Aliyun_ES']['dev_data_doc_type']).strip(),
                id=str(each_punishAnnouncement['_id']))
            if res['found']:
                logger.info('exists')
                db.announcement.update_one(
                    {'_id': ObjectId(each_punishAnnouncement['_id'])},
                    {'$set': {'es_status': 'inserted'}})
                logger.info('Update existed announcement es_status success')
                continue
        except exceptions.NotFoundError:
            logger.info(str(each_punishAnnouncement['_id']))

        punishment_type = each_punishAnnouncement['type']
        if each_punishAnnouncement['oss_file_id'] != '':
            oss_file = db.parsed_data.find_one(
                {'_id': each_punishAnnouncement['oss_file_id']})
            html_content = oss_file['oss_file_content']
            oss_file_type = oss_file['oss_file_type']
            oss_file_name = oss_file['oss_file_name']
            origin_url = oss_file['origin_url']
            real_org = each_punishAnnouncement['announcementOrg']
            org_cate, announcement_region = get_region_and_org(
                real_org, origin_url)
        else:
            oss_file = {}
            html_content = ''
            oss_file_type = ''
            oss_file_name = ''
            origin_url = ''
            real_org = each_punishAnnouncement['announcementOrg']
            org_cate, announcement_region = get_region_and_org(
                real_org, origin_url)
        if org_cate == '' and announcement_region == '':
            continue

        content = ''
        if oss_file_type == 'html' or oss_file_type == 'shtml':
            html = HTML(html=html_content)
            # pick the content container; the original deeply nested if/else
            # pyramid is flattened into an equivalent elif chain
            if 'content_id_name' in each_punishAnnouncement.keys():
                content = html.find(
                    '#' + each_punishAnnouncement['content_id_name'])[0].html
            elif 'content_class_name' in each_punishAnnouncement.keys():
                content = html.find(
                    '.' + each_punishAnnouncement['content_class_name'])[0].html
            elif 'content_id_name' in oss_file.keys():
                content = html.find('#' + oss_file['content_id_name'])[0].html
            elif 'content_class_name' in oss_file.keys():
                if each_punishAnnouncement['announcementOrg'] == '山东律师协会' and \
                        'http://www.sdlawyer.org.cn/003/002/201214631225.htm' in oss_file['origin_url']:
                    content = str(html)
                else:
                    content = html.find(
                        '.' + oss_file['content_class_name'])[0].html
            elif len(html.find('.in_main')) > 0:
                content = html.find('.content')[0].html
            elif len(html.find('.main')) > 0:
                content = (html.find('.headInfo')[0].html +
                           '<p align="center" class="title">' +
                           each_punishAnnouncement['announcementTitle'] +
                           '</p>' +
                           html.find('#ContentRegion')[0].html)
            elif len(html.find('.er_main')) > 0:
                content = html.find('.er_main')[0].html
                logger.info('er_main')
            elif len(html.find('#zwgk_pre')) > 0:
                content = html.find('#zwgk_pre')[0].html
                logger.info('zwgk_pre')
            elif len(html.find('.f12c')) > 0:
                content = html.find('.f12c')[0].html.replace(
                    'margin-left:-25.1500pt;', '').replace(
                    '/chinese/home/img/mz2.jpg', '')
                logger.info('f12c')
            elif len(html.find('.xl_cen')) > 0:
                content = html.find('.xl_cen')[0].html
                logger.info('xl_cen')
            elif len(html.find('.iRight')) > 0:
                content = html.find('.iRight')[0].html
                logger.info('iRight')
            elif len(html.find('.TRS_Editor')) > 0:
                content = html.find('.TRS_Editor')[0].html
                logger.info('TRS_Editor')
            elif len(html.find('#tab_content')) > 0:
                content = ('<table width="100%" cellspacing="1" cellpadding="3" '
                           'border="0" align="center" class="normal" '
                           'id="tab_content"><tbody>' +
                           html.find('#tab_content')[0].find('tr')[0].html +
                           html.find('#tab_content')[0].find('tr')[3].html +
                           '</table>')
                content = content.replace('#08318d', 'red')
                logger.info('tab_content')
            elif len(html.find('.hei14jj')) > 0:
                content = html.find('.hei14jj')[0].find('table')[0].html
                logger.info('hei14jj')
            elif len(html.find('.article-infor')) > 0:
                content = html.find('.article-infor')[0].html
                logger.info('article-infor')
            elif len(html.find('.Section1')) > 0:
                content = html.find('.Section1')[0].html
                logger.info('Section1')
            else:
                logger.error('content not exists')
                continue
        else:
            content = ''

        if content != '':
            soup = bs(content, 'lxml')
            for div in soup.find_all("a"):
                div.decompose()
            content = str(soup.html)

        publish_date_list = re.split(
            '[年月日]',
            each_punishAnnouncement['announcementDate'].replace('\xa0', ''))
        publish_date_text = publish_date_list[0] + (
            '0' + publish_date_list[1] if len(publish_date_list[1]) == 1
            else publish_date_list[1]) + (
            '0' + publish_date_list[2] if len(publish_date_list[2]) == 1
            else publish_date_list[2])
        punish_datetime = datetime.date(int(publish_date_list[0]),
                                        int(publish_date_list[1]),
                                        int(publish_date_list[2]))

        punishment_decision = each_punishAnnouncement['punishmentDecision'].strip()
        law_list = re.findall('(《.*?》((.*?))?)', punishment_decision)
        laws_final_map = get_law()
        for each_law in law_list:
            if each_law[0] in laws_final_map.keys():
                for each_date in laws_final_map[each_law[0]]:
                    if punish_datetime > each_date['date']:
                        punishment_decision = punishment_decision.replace(
                            each_law[0],
                            '<a target="_blank" href="' + '/app/lar/' +
                            str(each_date['url']) + '">' + each_law[0] + '</a>')

        # strip redundant prefixes from the start of each field
        facts = each_punishAnnouncement['facts']
        litigant = each_punishAnnouncement['litigant'].replace(
            ',', ',').replace('(', '(').replace(')', ')').replace(';', ';')
        defense = each_punishAnnouncement['defenseOpinion']
        defense_response = each_punishAnnouncement['defenseResponse']
        for each_redundance in redundance_list:
            facts = re.sub('^' + each_redundance + '[,,。::]?', '', facts)
            litigant = re.sub('^' + each_redundance + '[,,。::]?', '', litigant)
            defense = re.sub('^' + each_redundance + '[,,。::]?', '', defense)
            defense_response = re.sub('^' + each_redundance + '[,,。::]?', '',
                                      defense_response)
            punishment_decision = re.sub('^' + each_redundance + '[,,。::]?', '',
                                         punishment_decision)

        doc = {
            'title': each_punishAnnouncement['announcementTitle'],
            'document_code': each_punishAnnouncement['announcementCode'],
            'publish_date': each_punishAnnouncement['announcementDate'].replace(
                '年0', '年').replace('月0', '月'),
            'publish_date_text': int(publish_date_text),
            'litigant_origin_text': litigant,
            'litigant': '<p>' + '</p><p>'.join(
                litigant.strip().split('\n')) + '</p>',
            'fact_origin_text': facts.strip(),
            'fact': '<p>' + '</p><p>'.join(facts.strip().split('\n')) + '</p>',
            'defense': '<p>' + '</p><p>'.join(
                defense.strip().split('\n')) + '</p>',
            'defense_response': '<p>' + '</p><p>'.join(
                defense_response.strip().split('\n')) + '</p>',
            'punishment_basis': '<p>' + '</p><p>'.join(
                each_punishAnnouncement['punishmentBasement'].strip().split('\n')) + '</p>',
            'punishment_decision': '<p>' + '</p><p>'.join(
                punishment_decision.strip().split('\n')) + '</p>',
            'punishment_org_cate': org_cate,
            'punishment_organization': each_punishAnnouncement['announcementOrg'],
            'punishment_region': announcement_region,
            'punishment_type': punishment_type,
            'content_text': '\n'.join([
                each_punishAnnouncement['announcementCode'], litigant, facts,
                defense, defense_response,
                each_punishAnnouncement['punishmentBasement'],
                punishment_decision
            ]),
            'html_content': content,
            'oss_file_type': oss_file_type,
            'oss_file_id': str(each_punishAnnouncement['oss_file_id']),
            'oss_file_name': oss_file_name
        }
        es_action_list.append({
            '_index': str(config['Aliyun_ES']['dev_data_index_name']).strip(),
            '_type': str(config['Aliyun_ES']['dev_data_doc_type']).strip(),
            '_id': str(each_punishAnnouncement['_id']),
            '_source': doc
        })
        logger.info('one document add to action list\n')

        if len(es_action_list) == 50:
            bulk(es, es_action_list, raise_on_error=False)
            logger.info('Inserted into ES 50 documents!!')
            for each_es_action in es_action_list:
                db.announcement.update_one(
                    {'_id': ObjectId(each_es_action['_id'])},
                    {'$set': {'es_status': 'inserted'}})
                logger.info('Update mongodb es_status success')
            es_action_list = []

    if len(es_action_list) > 0:
        bulk(es, es_action_list, raise_on_error=False)
        logger.info('Inserted into ES %d documents!!' % len(es_action_list))
        for each_es_action in es_action_list:
            db.announcement.update_one(
                {'_id': ObjectId(each_es_action['_id'])},
                {'$set': {'es_status': 'inserted'}})
            logger.info('Update mongodb es_status success')
import requests
from requests_html import HTML

flag = True
n_url = input('enter the url of site :')


def get_p(url):
    return requests.get(url)


def w_chap(ch):
    with open('chapter.text', "a", encoding="utf-8") as chap:
        chap.write(ch)


while flag:
    x = get_p(n_url)
    h = HTML(html=x.text)
    try:
        match = h.find('#next_chap')
        atr = match[0].attrs
        n_url = 'https://readnovelfull.com' + atr['href']
        print('there is a new chapter, parsing...')
    except Exception:
        print("no new chapter")
        flag = False
    chap_content = h.find('#chr-content', first=True).text
    w_chap('\n\n' + n_url[40:52] + '\n')
    w_chap(chap_content)
from requests_html import HTML

doc = """<a href='https://www.qiushibaike.com/'>"""
html = HTML(html=doc)
print(html.links)  # the set of urls found in the document
print(html.html)
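# Related sketch (not in the original): when the HTML object is given a url,
# .absolute_links resolves relative hrefs against it.
html2 = HTML(html="<a href='/hot/'>hot</a>", url='https://www.qiushibaike.com/')
print(html2.absolute_links)  # {'https://www.qiushibaike.com/hot/'}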
def htmlparser(path: pathlib.Path, doctype: str = 'DOCTYPE html'):
    '''HTML Parser.'''
    DEPRECATED_TAGS = (
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'acronym', 'big', 'u', 'isindex', 'basefont', 'dir', 'applet',
        'style',
    )
    REQUIRED_TAGS = {
        'html': (
            ('head', '=', 1),
            ('body', '=', 1),
        ),
        'head': (
            ('title', '=', 1),
        ),
    }
    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img',
        'keygen', 'link', 'meta', 'output', 'param', 'track', 'wbr'
    }
    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio', 'bdi', 'bdo',
        'blockquote', 'body', 'button', 'canvas', 'caption', 'cite', 'code',
        'col', 'colgroup', 'data', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'div', 'dl', 'dt', 'em', 'fieldset', 'figure', 'figcaption',
        'footer', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'head', 'header', 'hgroup', 'html', 'ins', 'kbd', 'label', 'legend',
        'li', 'main', 'map', 'menu', 'menuitem', 'meter', 'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup', 'p', 'picture', 'pre',
        'progress', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 'samp', 'script',
        'section', 'select', 'source', 'span', 'strong', 'sub', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead',
        'time', 'title', 'tfoot', 'tr', 'ul', 'var', 'video'
    }
    DEPRECATED_ATTRS = (
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'value', 'valign', 'accept', 'vspace', 'noframes'
    )
    GLOBAL_ATTRS = (
        'lang', 'id', 'class',
    )
    REQUIRED_ATTRS = {
        'html': ('lang',),
    }
    NOEMPTY_TAGS = (
        'title',
    )

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []

        def handle_starttag(self, tag, attrs):
            # tag name must be in lowercase; the standard "html.parser"
            # module converts tag names to lowercase already, so check the
            # raw source text instead
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))
            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))
            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a
                # attribute name must be in lowercase
                if not name.islower():
                    pass  # self.attr_name_not_lowercase.append((attr_name, self.lineno))
                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno - 1]
            return rawline[pos + 1:pos + 1 + len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return [Report('E00001', path, 0, '')]

    reports = []

    # validate DOCTYPE, using the standard HTML parser since
    # requests-html ignores the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.append(Report('E01002', path, lineno, obj))
            return reports
        rules = {
            'not_paired_tags': 'E01005',
            'duplicated_attrs': 'E01010',
            'tag_not_lowercase': 'E01011',
        }
        for a, e in rules.items():
            if hasattr(std_parser, a):
                for t in getattr(std_parser, a):
                    reports.append(Report(e, path, t[1], t[0]))
    except AttributeError:
        reports.append(Report('E01001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline
        if tag in DEPRECATED_TAGS:
            reports.append(Report('E01004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.append(Report('E01003', path, lineno, tag))

        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
                if eval(f'len(element.find(r[0])) !{r[1]} r[2]'):
                    reports.append(Report('E01008', path, lineno, r[0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules:
                if r not in (a.lower() for a in element.attrs):
                    reports.append(Report('E01009', path, lineno, r))

        # parse attributes
        for a in element.attrs:
            a_lower = a
            if not a.islower():
                reports.append(Report('E01012', path, lineno, a))
                a_lower = a.lower()
            if a_lower in DEPRECATED_ATTRS:
                reports.append(Report('E01007', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS:
                reports.append(Report('E01006', path, lineno, a))

    for t in NOEMPTY_TAGS:
        for e in parser.find(t):
            if not e.text:
                # use the element's own source line, not the stale `lineno`
                # left over from the loop above
                reports.append(
                    Report('E01013', path, e.element.sourceline, e.element.tag))

    return reports
def url_to_html(URL, save=False):  # header reconstructed; name taken from the call below
    r = requests.get(URL)
    if r.status_code == 200:
        html_text = r.text
        if save:
            with open(filename, "w") as f:
                f.write(html_text)
        print(f"Request successful: {r.status_code}")
        return html_text
    return None


# Save raw HTML text
html_text = url_to_html(URL, save=False)

# Convert raw HTML to a requests_html HTML object
r_html = HTML(html=html_text)

# Find the specific table element within the HTML
table_class: str = ".imdb-scroll-table"
# table_class = "#table"  # Same result
r_table = r_html.find(table_class)
# print(r_table)
# [<Element 'div' id='table' class=('a-section', 'imdb-scroll-table', 'mojo-gutter')>]

# Extract just the text from the table (similar to r.text)
if len(r_table) == 1:
    # print(r_table[0].text)  # Has data but unstructured
    parsed_table = r_table[0]
    rows: t.List = parsed_table.find("tr")  # [<Element 'tr'>, <Element 'tr'>, ...]
    # Convert list of Elements to list of lists
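    # A possible continuation (an assumption, not in the source): each <tr>'s
    # .text is newline-separated cell text, so rows split cleanly into lists;
    # the first row is assumed to be the header.
    table_data = [row.text.split("\n") for row in rows]
    # import pandas as pd
    # df = pd.DataFrame(table_data[1:], columns=table_data[0])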
import requests
import platform

if platform.system() == 'Windows':
    from requests_html import HTML

    html = HTML(html=requests.get(
        "https://github.com/trending/python?since=daily").text)
    for proj in html.find("article"):
        title = proj.find("h1 a", first=True)
        desc = proj.find("p", first=True)
        print(f"~~~{title.text}~~~", " {")
        print(f"  https://github.com{title.attrs['href']}")
        try:
            print(f"  {desc.text}")
        except AttributeError:
            pass
        print("}", end="\n\n")

if platform.system() == 'Linux':
    from bs4 import BeautifulSoup

    html = requests.get("https://github.com/trending/python?since=daily").text
    soup = BeautifulSoup(html, 'html.parser')
    for proj in soup.select("article"):
        title = proj.select_one("h1 a")
        desc = proj.select_one("p")
        print(f"{[x.strip() for x in title.text.split('/')]}")
        print(f"https://github.com{title.attrs['href']}")