Example #1
def parse_article_entities(doc):
    html = HTML(html=doc)  # why the extra html= keyword?
    post_entries = html.find('div.r-ent')
    return post_entries
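
The comment in Example #1 asks why the html= keyword is needed: the requests_html HTML constructor declares its parameters as keyword-only (url, html, default_encoding, ...), so the markup must always be passed as html=. A minimal sketch:

from requests_html import HTML

doc = "<div class='r-ent'><a href='/post/1'>first post</a></div>"
html = HTML(html=doc)          # positional HTML(doc) raises TypeError
print(html.find('div.r-ent'))  # list of matching Element objects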
Example #2
def parse_page(text):
    html_page = HTML(html=text)
    title_css = '#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)'
    titles = html_page.find(title_css)
    for t in titles:
        print(t.text)
Example #3
import pandas as pd
import yaml
from datetime import datetime
from requests_html import HTML

agenda = []

with open("cache/wwf.html", "r") as f:
    r = HTML(html=f.read())

# pd.datetime was removed from pandas; use the datetime module directly
dateparse = lambda x: datetime.strptime(x, "%d %b %Y").date()
timeparse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M")

agenda = []

columns = r.find(".col")

for col in columns:
    day = col.find("h2", first=True).text.split(" ", maxsplit=1)[1]
    day = dateparse(f"{day} 2018")
    events = col.find("div.event")
    for event in events:
        (title, organiser) = event.find("h3", first=True).text.splitlines()
        (start, end) = event.find("h4", first=True).text.split("-")
        start = timeparse(f"{day} {start}")
        end = timeparse(f"{day} {end}")
        description = event.find("p")[1].text
        speakers = [i.text for i in event.find("li")]
        print(day)
        print(start)
        print(end)
Example #4
def requestDoc(doc):
    html = HTML(html=doc)
    print(html.links)
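
Example #4 prints html.links, a set of every href found in the document, relative or absolute. If a base URL is supplied to the constructor, absolute_links resolves the relative ones; a small sketch (the URLs are illustrative):

from requests_html import HTML

doc = "<a href='/about'>About</a> <a href='https://example.com/x'>X</a>"
html = HTML(html=doc, url="https://example.com/")
print(html.links)           # {'/about', 'https://example.com/x'}
print(html.absolute_links)  # {'https://example.com/about', 'https://example.com/x'}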
Example #5
def parse_item(text):
    '''
    Args:
        text : str - html text

    Returns:
        tuple: (dict, list)
        dict - meta data for this item (tags stored under meta['tags'])
        list - face entries (cover / sample images) for this item
    '''
    html = HTML(html=text)
    title_css = 'body > div.container > h3'
    title = html.find(title_css)[0].text
    cover_img_css = 'body > div.container > div.row.movie > div.col-md-9.screencap > a'
    cover_img_url = html.find(cover_img_css)[0].attrs['href']
    tags_css = 'body > div.container > div.row.movie > div.col-md-3.info'
    tags = html.find(tags_css)[0].find('p')
    release_date = tags[1].text
    length = tags[2].text

    sample_img_css = 'body > div.container > #sample-waterfall > a.sample-box'
    samples = html.find(sample_img_css)

    # meta data
    meta = {}
    meta['fanhao'], meta['title'] = title.split(maxsplit=1)
    meta['cover_img_url'] = cover_img_url
    meta['release_date'] = release_date.split()[1]
    meta['length'] = re.search(r'\d+', length).group()

    tag_list = {}
    tag_list.setdefault('star', [])
    tag_list.setdefault('genre', [])
    for tag in tags[3:]:
        links = tag.find('a')
        spans = tag.find('span.header')
        if spans and len(links) == 1:
            tag_type = (spans[0].text)
            tag_value = links[0].text
            if tag_type != '' and tag_value != '':
                tag_list.setdefault(tag_type, []).append(tag_value)
        else:
            for link in links:
                tag_link = link.attrs['href']
                tag_value = link.text
                tag_type = ''  # reset each iteration so unrelated links are skipped
                if 'genre' in tag_link:
                    tag_type = 'genre'
                if 'star' in tag_link:
                    tag_type = 'star'
                if tag_type != '' and tag_value != '':
                    tag_list.setdefault(tag_type, []).append(tag_value)

    face_list = []
    cover = create_face('cover', cover_img_url)
    if cover is not None:
        face_list.extend(cover)

    for sample in samples:
        link = sample.attrs['href']
        face_type = 'sample'
        sample_face = create_face(face_type, link)
        if sample_face is not None:
            face_list.extend(sample_face)

    meta['tags'] = tag_list
    return meta, face_list
Example #6
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

categories: t.List[str] = [
    "https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/pc/",
    "https://www.amazon.com/Best-Sellers-Sports-Outdoors/zgbs/sporting-goods/",
    "https://www.amazon.com/best-sellers-camera-photo/zgbs/photo/",
]

first_url: str = categories[0]
driver.get(first_url)
body_el = driver.find_element_by_css_selector("body")
body_html_str: str = body_el.get_attribute("innerHTML")

# Convert to HTML instance. The .links attr shows all links in html
html_obj = HTML(html=body_html_str)

# Keep only relative links that start with '/'. Just trims the list a little.
new_links: t.List[str] = [x for x in html_obj.links if x.startswith("/")]
# print(new_links)
# ['/product-reviews/B085M812NM/ref=zg_bs_p...', '/gcx/Gifts-for-Everyone/gfhz/
# Get rid of 'product-reviews/' URLs:
new_links: t.List[str] = [x for x in new_links if "product-reviews/" not in x]

# Now with a leaner list of links, let's make our product page links list
product_page_links: t.List[str] = [f"https://amazon.com{x}" for x in new_links]
first_product_link: str = product_page_links[0]

# print(first_product_link)
# https://amazon.com/product-reviews/B07TMJ8S5Z/ref=zg_bs_pc_cr_1/130-9341...
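
Note that Example #6 relies on driver.find_element_by_css_selector, which recent Selenium releases (4.3+) removed in favour of find_element(By.CSS_SELECTOR, ...). A sketch of the same page grab on current Selenium, with the imports the snippet omits (the Amazon URL is taken from the example; the rest of the pipeline is unchanged):

import typing as t

from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

driver.get("https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/pc/")
body_html_str: str = driver.find_element(By.CSS_SELECTOR, "body").get_attribute("innerHTML")
driver.quit()

html_obj = HTML(html=body_html_str)
relative_links: t.List[str] = [x for x in html_obj.links if x.startswith("/")]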
Example #7
def getchapter(chapter):  # def line missing in the original snippet; name assumed
    """To change chapter numbers into desired format for saving"""
    chapter = str(chapter)
    if int(chapter) < 10:
        chapter = '00' + chapter
    elif int(chapter) < 100:
        chapter = '0' + chapter
    return chapter

def getpage(page):
    """To change pages number into desired format for saving"""
    page = str(page)
    if int(page) < 10:
        page = '0' + page
    return page

homepage = requests.get('https://www.mangapanda.com/one-piece')
titles = HTML(html = homepage.text)
titles = titles.find('td')
titles = titles[22:-4:2]

site = 'https://www.mangapanda.com'

for chapter in range(fromc,toc+1):


    link = '/one-piece/'+str(chapter)


    mangalink = requests.get(site+link)

    html = HTML(html = mangalink.text)
    article = html.find('div#selectpage')
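
The two zero-padding helpers in Example #7 can also be written with str.zfill, which pads a string with leading zeros to a fixed width (the name getchapter for the first helper is assumed, since its def line is cut off in the snippet):

def getchapter(chapter):
    """Zero-pad a chapter number to three digits, e.g. 7 -> '007'."""
    return str(chapter).zfill(3)


def getpage(page):
    """Zero-pad a page number to two digits, e.g. 7 -> '07'."""
    return str(page).zfill(2)


assert getchapter(7) == '007' and getchapter(42) == '042' and getchapter(123) == '123'
assert getpage(3) == '03' and getpage(12) == '12'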
Example #8
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver.page_source


def extract_id_slug(url_path):
    regex = r"^[^\s]+(?P<id>\d+)-(?P<slug>[\w-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']


content = scraper(url)

html_r = HTML(html=content)
print(html_r)

fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
datas = []
import pandas as pd

for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    data = {
        "id": id_,
        "slug": slug_,
        "path": path,
        "scraped:": 0  # True / False -> 1 / 0
    }
Example #9
    def gen_tweets(pages):
        request = session.get(url + '&max_position', headers=headers)

        while pages > 0:
            try:
                json_response = request.json()
                html = HTML(
                    html=json_response["items_html"], url="bunk", default_encoding="utf-8"
                )
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{query}" does not exist or is private.'
                )
            except ParserError:
                break

            comma = ","
            dot = "."
            tweets = []
            for tweet, profile in zip(
                html.find(".stream-item"), html.find(".js-profile-popup-actionable")
            ):
                # 10~11 html elements have `.stream-item` class and also their `data-item-type` is `tweet`
                # but their content doesn't look like a tweet's content
                try:
                    text = tweet.find(".tweet-text")[0].full_text
                except IndexError:  # issue #50
                    continue


                tweet_id = tweet.attrs["data-item-id"]

                tweet_url = profile.attrs["data-permalink-path"]

                username = profile.attrs["data-screen-name"]

                user_id = profile.attrs["data-user-id"]

                is_pinned = bool(tweet.find("div.pinned"))

                time = datetime.fromtimestamp(
                    int(tweet.find("._timestamp")[0].attrs["data-time-ms"]) / 1000.0
                )

                interactions = [x.text for x in tweet.find(".ProfileTweet-actionCount")]

                replies = int(
                    interactions[0].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[3]
                )

                retweets = int(
                    interactions[1].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[4]
                    or interactions[5]
                )

                likes = int(
                    interactions[2].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[6]
                    or interactions[7]
                )

                hashtags = [
                    hashtag_node.full_text
                    for hashtag_node in tweet.find(".twitter-hashtag")
                ]
                urls = []
                try:
                    urls = [
                        url_node.attrs["data-expanded-url"]
                        for url_node in (
                            tweet.find("a.twitter-timeline-link:not(.u-hidden)") +
                            tweet.find("[class='js-tweet-text-container'] a[data-expanded-url]")
                        )
                    ]
                except KeyError:
                    pass  # some link nodes carry no data-expanded-url attribute

                urls = list(set(urls))  # drop duplicated URLs

                photos = [
                    photo_node.attrs["data-image-url"]
                    for photo_node in tweet.find(".AdaptiveMedia-photoContainer")
                ]

                is_retweet = (
                    True
                    if tweet.find(".js-stream-tweet")[0].attrs.get(
                        "data-retweet-id", None
                    )
                    else False
                )

                videos = []
                video_nodes = tweet.find(".PlayableMedia-player")
                for node in video_nodes:
                    styles = node.attrs["style"].split()
                    for style in styles:
                        if style.startswith("background"):
                            tmp = style.split("/")[-1]
                            video_id = (
                                tmp[: tmp.index(".jpg")]
                                if ".jpg" in tmp
                                else tmp[: tmp.index(".png")]
                                if ".png" in tmp
                                else None
                            )
                            videos.append({"id": video_id})

                tweets.append(
                    {
                        "tweetId": tweet_id,
                        "tweetUrl": tweet_url,
                        "username": username,
                        "userId": user_id,
                        "isRetweet": is_retweet,
                        "isPinned": is_pinned,
                        "time": time,
                        "text": text,
                        "replies": replies,
                        "retweets": retweets,
                        "likes": likes,
                        "entries": {
                            "hashtags": hashtags,
                            "urls": urls,
                            "photos": photos,
                            "videos": videos,
                        },
                    }
                )

            last_tweet = html.find(".stream-item")[-1].attrs["data-item-id"]

            for tweet in tweets:
                tweet["text"] = re.sub(r"(\S)http", "\g<1> http", tweet["text"], 1)
                tweet["text"] = re.sub(
                    r"(\S)pic\.twitter", "\g<1> pic.twitter", tweet["text"], 1
                )
                yield tweet

            request = session.get(url, params={"max_position": json_response['min_position']}, headers=headers)
            pages += -1
Example #10
from requests_html import HTML

with open('sample.html', 'r') as sf:
    source = sf.read()
    html = HTML(html=source)

# # Print the whole html content from the html file
# print(html.html)

# # Print the text for the html file
# print(html.text)

# Find list of articles
articles = html.find('div.article')

for article in articles:
    heading = article.find('h2', first=True).text
    text = article.find('p', first=True).text
    print(heading)
    print(text)
    print()
Example #11
from requests_html import HTML
import codecs
fp = codecs.open(
    "About this Documentation _ Node.js v8.9.4 Documentation.html", "r",
    "utf-8")
html = HTML(html=fp.read())
# c2=html.find('#column2', first=True)
# print(c2,dir(c2))
h1s = html.xpath("./body/div/div/div/h1/span/a")
for h1 in h1s:
    print(h1.attrs["id"])
print(len(h1s))
h2s = html.xpath("./body/div/div/ul/li/a")
for i in range(len(h1s)):
    print(h2s[i].attrs["href"])
    print("#" + h1s[i].attrs["id"])
    #print(h2s[i].attrs["href"])
    pass
Example #12
def pullpage(url, forcerefresh=False):
    result = dblink.fetch_resource(url, forcerefresh)
    return HTML(html=result.decode("utf-8"))
Example #13
from requests_html import HTML

soubor = open('sample.html', encoding="utf8")
obsah = soubor.read()
soubor.close()
html = HTML(html=obsah)

for odstavec in html.find('a'):
    print(odstavec.attrs['href'])
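
Example #13 assumes every <a> element carries an href; an anchor without one would raise KeyError on attrs['href']. A slightly defensive variant of the same loop, using attrs.get:

for odstavec in html.find('a'):
    href = odstavec.attrs.get('href')  # None when the <a> has no href attribute
    if href:
        print(href)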
Example #14
 def parse_next_link(doc):
     html = HTML(html=doc)
     controls = html.find('.action-bar a.btn.wide')
     link = controls[1].attrs['href']
     return domain + link
Example #15
    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'],
                            url='bunk',
                            default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')

            comma = ","
            tweets = []
            for tweet in html.find('.stream-item'):
                text = tweet.find('.tweet-text')[0].full_text
                tweetId = tweet.find(
                    '.js-permalink')[0].attrs['data-conversation-id']
                time = datetime.fromtimestamp(
                    int(tweet.find('._timestamp')[0].attrs['data-time-ms']) /
                    1000.0)
                interactions = [
                    x.text for x in tweet.find('.ProfileTweet-actionCount')
                ]
                replies = int(interactions[0].split(" ")[0].replace(comma, ""))
                retweets = int(interactions[1].split(" ")[0].replace(
                    comma, ""))
                likes = int(interactions[2].split(" ")[0].replace(comma, ""))
                hashtags = [
                    hashtag_node.full_text
                    for hashtag_node in tweet.find('.twitter-hashtag')
                ]
                urls = [
                    url_node.attrs['data-expanded-url'] for url_node in
                    tweet.find('a.twitter-timeline-link:not(.u-hidden)')
                ]
                photos = [
                    photo_node.attrs['data-image-url'] for photo_node in
                    tweet.find('.AdaptiveMedia-photoContainer')
                ]
                tweets.append({
                    'tweetId': tweetId,
                    'time': time,
                    'text': text,
                    'replies': replies,
                    'retweets': retweets,
                    'likes': likes,
                    'entries': {
                        'hashtags': hashtags,
                        'urls': urls,
                        'photos': photos
                    }
                })

            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

            for tweet in tweets:
                if tweet:
                    tweet['text'] = re.sub('http', ' http', tweet['text'], 1)
                    yield tweet

            r = session.get(url,
                            params={'max_position': last_tweet},
                            headers=headers)
            pages += -1
Example #16
 def test_get_csrf_token_no_token(self):
     """Should return None when token not found in html"""
     html = HTML(html="<html></html>")
     self.assertIsNone(get_csrf_token(html, "a"))
Example #17
from requests_html import HTML
from requests import get

document = get(
    'https://spotifycharts.com/regional/global/weekly/latest').content
html = HTML(html=document)

table = html.find('.chart-table tbody', first=True)
songs = table.find('tr')

for song in songs[:10]:
    print(
        song.find('.chart-table-position', first=True).text,
        song.find('.chart-table-track', first=True).text,
    )
Example #18
 def test_get_csrf_token_no_value(self):
     """Should return None when html element has no value"""
     html = HTML(html="<input id='a' />")
     self.assertIsNone(get_csrf_token(html, "a"))
Example #19
import requests
from requests_html import HTML
from tqdm import tqdm
import re

POST_URL = input("Enter the url of the instagram post: \n")

#   matching the input url with the instagram's default post url pattern
url_pattern = re.compile(r'https?://(www\.)?instagram.com/p/\w+')
match = url_pattern.match(POST_URL.strip())

if match:
    chunk_size = 1024
    response = requests.get(POST_URL.strip())
    r_html = HTML(html=response.text)
    meta_tag = r_html.find('meta')
    no_of_meta_elements = len(meta_tag)

    if no_of_meta_elements > 25:
        download_url = meta_tag[24].attrs['content']
    else:
        download_url = meta_tag[10].attrs['content']

    #   if the download url is fetched
    download_url_pattern = re.compile(r'https?://instagram\.\w+')
    is_download_url = download_url_pattern.match(
        download_url)  # returns a re.Match object or None

    if is_download_url:
        r = requests.get(download_url, stream=True)
        total_size = int(r.headers['Content-Length'])
Example #20
 def test_get_csrf_token(self):
     """Should return token from html element's value"""
     html = HTML(html="<input id='a' value='b' />")
     self.assertEqual(get_csrf_token(html, "a"), 'b')
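
The get_csrf_token helper exercised by the tests in Examples #16, #18 and #20 is not shown on this page. A minimal sketch that would satisfy all three assertions (the name and signature come from the tests; the body is assumed):

def get_csrf_token(html, element_id):
    """Return the value attribute of the element with the given id, or None."""
    element = html.find(f"#{element_id}", first=True)
    if element is None:
        return None
    return element.attrs.get("value")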
Example #21
    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            status = 'ok'
            try:
                html = HTML(html=r.json()['items_html'], url='bunk', default_encoding='utf-8')
            except:
                # let other errors raise
                status = 'page not found'
            
            comma = ","
            dot = "."
            tweets = []
            for tweet in html.find('.stream-item'):
                try:
                    text = tweet.find('.tweet-text')[0].full_text
                except IndexError:
                    # some .stream-item elements are not actual tweets
                    continue
                tweetId = tweet.find(
                    '.js-permalink')[0].attrs['data-conversation-id']
                timestamp = datetime.fromtimestamp(
                    int(tweet.find('._timestamp')[0].attrs['data-time-ms'])/1000.0)
                interactions = [x.text for x in tweet.find(
                    '.ProfileTweet-actionCount')]
                replies = int(interactions[0].split(" ")[0].replace(comma, "").replace(dot,""))
                retweets = int(interactions[1].split(" ")[
                               0].replace(comma, "").replace(dot,""))
                likes = int(interactions[2].split(" ")[0].replace(comma, "").replace(dot,""))
                hashtags = [hashtag_node.full_text for hashtag_node in tweet.find('.twitter-hashtag')]
                urls = [url_node.attrs['data-expanded-url'] for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')]
                photos = [photo_node.attrs['data-image-url'] for photo_node in tweet.find('.AdaptiveMedia-photoContainer')]
                
                videos = []
                video_nodes = tweet.find(".PlayableMedia-player")
                for node in video_nodes:
                    styles = node.attrs['style'].split()
                    for style in styles:
                        if style.startswith('background'):
                            tmp = style.split('/')[-1]
                            video_id = tmp[:tmp.index('.jpg')]
                            videos.append({'id': video_id})
                tweets.append({'tweetId': tweetId, 'time': timestamp, 'text': text,
                               'replies': replies, 'retweets': retweets, 'likes': likes, 
                               'entries': {
                                    'hashtags': hashtags, 'urls': urls,
                                    'photos': photos, 'videos': videos
                                }
                               })

            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

            for tweet in tweets:
                if tweet:
                    tweet['text'] = re.sub('http', ' http', tweet['text'], 1)
                    yield {'tweet': tweet, 'status': status }

            r = session.get(
                url, params = {'max_position': last_tweet}, headers = headers)
            pages += -1
            print('progress:', (amountPages-pages)/amountPages * 100, '%')
Example #22
    leetcode_url = "https://leetcode.com/api/problems/all/"

    session = HTMLSession()
    blogs = []

    while (True):
        print("current_page: %s" % blog_pageIndex)
        response = session.get(blog_url % blog_pageIndex)
        blog_pageIndex = blog_pageIndex + 1
        items = response.html.find('div.postTitle')
        if (len(items) == 0):
            break

        for item in items:
            blog = Blog()
            a = HTML(html=item.html).find('a', first=True)
            if (a):
                blog.title = a.text
                blog.href = a.attrs['href']

            blogs.append(blog)

    leetcode_headers = {
        # 'Accept': 'application/json, text/javascript, */*; q=0.01',
        # 'Accept-Encoding': 'gzip, deflate, br',
        # 'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'Content-Type': 'application/json',
        'Cookie': '',
        # 'Referer': 'https://leetcode.com/problemset/all/?status=Solved',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        # 'X-Requested-With': 'XMLHttpRequest',
Example #23
    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'],
                            url='bunk', default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{query}" does not exist or is private.')
            except ParserError:
                break

            comma = ","
            dot = "."
            tweets = []
            for tweet in html.find('.stream-item'):
                # 10~11 html elements have `.stream-item` class and also their `data-item-type` is `tweet`
                # but their content doesn't look like a tweet's content
                try:
                    text = tweet.find('.tweet-text')[0].full_text
                except IndexError:  # issue #50
                    continue

                tweet_id = tweet.attrs['data-item-id']

                time = datetime.fromtimestamp(int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)

                interactions = [
                    x.text
                    for x in tweet.find('.ProfileTweet-actionCount')
                ]

                replies = int(
                    interactions[0].split(' ')[0].replace(comma, '').replace(dot, '')
                    or interactions[3]
                )

                retweets = int(
                    interactions[1].split(' ')[0].replace(comma, '').replace(dot, '')
                    or interactions[4]
                    or interactions[5]
                )

                likes = int(
                    interactions[2].split(' ')[0].replace(comma, '').replace(dot, '')
                    or interactions[6]
                    or interactions[7]
                )

                hashtags = [
                    hashtag_node.full_text
                    for hashtag_node in tweet.find('.twitter-hashtag')
                ]
                urls = [
                    url_node.attrs['data-expanded-url']
                    for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')
                ]
                photos = [
                    photo_node.attrs['data-image-url']
                    for photo_node in tweet.find('.AdaptiveMedia-photoContainer')
                ]

                is_retweet = True if tweet.find('.js-stream-tweet')[0].attrs.get('data-retweet-id', None) \
                    else False

                videos = []
                video_nodes = tweet.find(".PlayableMedia-player")
                for node in video_nodes:
                    styles = node.attrs['style'].split()
                    for style in styles:
                        if style.startswith('background'):
                            tmp = style.split('/')[-1]
                            video_id = tmp[:tmp.index('.jpg')]
                            videos.append({'id': video_id})

                tweets.append({
                    'tweetId': tweet_id,
                    'isRetweet': is_retweet,
                    'time': time,
                    'text': text,
                    'replies': replies,
                    'retweets': retweets,
                    'likes': likes,
                    'entries': {
                        'hashtags': hashtags, 'urls': urls,
                        'photos': photos, 'videos': videos
                    }
                })

            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

            for tweet in tweets:
                if tweet:
                    tweet['text'] = re.sub(r'\Shttp', ' http', tweet['text'], 1)
                    tweet['text'] = re.sub(r'\Spic\.twitter', ' pic.twitter', tweet['text'], 1)
                    yield tweet

            r = session.get(url, params={'max_position': last_tweet}, headers=headers)
            pages += -1
Example #24
#Python Tutorial Web Scraping with Requests-HTML
from requests_html import HTML, HTMLSession
import csv

#Open HTML file and pass HTML contents into HTML class.  Parse HTML directly.
with open("simple.html", "r") as htmlfile:
    source = htmlfile.read()
    htmlcode = HTML(html=source)
# print(htmlcode.html)
'''
<!doctype html>
<html class="no-js" lang="">
    <head>
        <title>Test - A Sample Website</title>
        <meta charset="utf-8">
        <link rel="stylesheet" href="css/normalize.css">
        <link rel="stylesheet" href="css/main.css">
    </head>
    <body>
        <h1 id='site_title'>Test Website</h1>
        <hr></hr>
        <div class="article">
            <h2><a href="article_1.html">Article 1 Headline</a></h2>
            <p>This is a summary of article 1</p>
        </div>
...
'''
print(htmlcode.text)
'''
Test - A Sample Website
Test Website
...
'''
Example #25
def transfer(org_info):
    es_action_list = []
    for each_punishAnnouncement in db.announcement.find(
        {
            'status': 'checked',
            'es_status': {
                '$nin': ['inserted']
            },
            # '_id': ObjectId("5c7e0b01c663849a6fd9752f"),
            'announcementOrg': {
                '$regex': org_info
            }
        },
            no_cursor_timeout=True):
        try:
            logger.info(str(each_punishAnnouncement['_id']))
            res = es.get(
                index=str(config['Aliyun_ES']['dev_data_index_name']).strip(),
                doc_type=str(config['Aliyun_ES']['dev_data_doc_type']).strip(),
                id=str(each_punishAnnouncement['_id']))
            if res['found']:
                logger.info('exists')
                db.announcement.update_one(
                    {'_id': ObjectId(each_punishAnnouncement['_id'])},
                    {'$set': {
                        'es_status': 'inserted'
                    }})
                logger.info('Update existed announcement es_status success')
                continue
        except exceptions.NotFoundError:
            logger.info(str(each_punishAnnouncement['_id']))
        punishment_type = each_punishAnnouncement['type']
        if each_punishAnnouncement['oss_file_id'] != '':
            oss_file = db.parsed_data.find_one(
                {'_id': each_punishAnnouncement['oss_file_id']})
            html_content = oss_file['oss_file_content']
            oss_file_type = oss_file['oss_file_type']
            oss_file_name = oss_file['oss_file_name']
            origin_url = oss_file['origin_url']
            real_org = each_punishAnnouncement['announcementOrg']
            org_cate, announcement_region = get_region_and_org(
                real_org, origin_url)
        else:
            oss_file = {}
            html_content = ''
            oss_file_type = ''
            oss_file_name = ''
            origin_url = ''
            real_org = each_punishAnnouncement['announcementOrg']
            org_cate, announcement_region = get_region_and_org(
                real_org, origin_url)

        if org_cate == '' and announcement_region == '':
            continue

        content = ''
        if oss_file_type == 'html' or oss_file_type == 'shtml':
            html = HTML(html=html_content)
            if 'content_id_name' in each_punishAnnouncement.keys():
                content = html.find(
                    '#' + each_punishAnnouncement['content_id_name'])[0].html
            elif 'content_class_name' in each_punishAnnouncement.keys():
                content = html.find(
                    '.' +
                    each_punishAnnouncement['content_class_name'])[0].html
            elif 'content_id_name' in oss_file.keys():
                content = html.find('#' + oss_file['content_id_name'])[0].html
            elif 'content_class_name' in oss_file.keys():
                if each_punishAnnouncement['announcementOrg'] == '山东律师协会' and \
                        'http://www.sdlawyer.org.cn/003/002/201214631225.htm' in oss_file['origin_url']:
                    content = str(html)
                else:
                    content = html.find('.' +
                                        oss_file['content_class_name'])[0].html
            else:
                if len(html.find('.in_main')) > 0:
                    content = html.find('.content')[0].html
                elif len(html.find('.main')) > 0:
                    content = html.find('.headInfo')[0].html + \
                              '<p align="center" class="title">' + \
                              each_punishAnnouncement['announcementTitle'] + \
                              '</p>' + \
                              html.find('#ContentRegion')[0].html
                elif len(html.find('.er_main')) > 0:
                    content = html.find('.er_main')[0].html
                    logger.info('er_main')
                elif len(html.find('#zwgk_pre')) > 0:
                    content = html.find('#zwgk_pre')[0].html
                    logger.info('zwgk_pre')
                elif len(html.find('.f12c')) > 0:
                    content = html.find('.f12c')[0].html.replace(
                        'margin-left:-25.1500pt;', '').replace(
                        '/chinese/home/img/mz2.jpg', '')
                    logger.info('f12c')
                elif len(html.find('.xl_cen')) > 0:
                    content = html.find('.xl_cen')[0].html
                    logger.info('xl_cen')
                elif len(html.find('.iRight')) > 0:
                    content = html.find('.iRight')[0].html
                    logger.info('iRight')
                elif len(html.find('.TRS_Editor')) > 0:
                    content = html.find('.TRS_Editor')[0].html
                    logger.info('TRS_Editor')
                elif len(html.find('#tab_content')) > 0:
                    content = ('<table width="100%" cellspacing="1" cellpadding="3" '
                               'border="0" align="center" class="normal" '
                               'id="tab_content"><tbody>' +
                               html.find('#tab_content')[0].find('tr')[0].html +
                               html.find('#tab_content')[0].find('tr')[3].html +
                               '</table>')
                    content = content.replace('#08318d', 'red')
                    logger.info('tab_content')
                elif len(html.find('.hei14jj')) > 0:
                    content = html.find('.hei14jj')[0].find('table')[0].html
                    logger.info('hei14jj')
                elif len(html.find('.article-infor')) > 0:
                    content = html.find('.article-infor')[0].html
                    logger.info('article-infor')
                elif len(html.find('.Section1')) > 0:
                    content = html.find('.Section1')[0].html
                    logger.info('Section1')
                else:
                    logger.error('content not exists')
                    continue
        else:
            content = ''
        if content != '':
            soup = bs(content, 'lxml')
            for div in soup.find_all("a"):
                div.decompose()
            content = str(soup.html)

        publish_date_list = re.split(
            '[年月日]',
            each_punishAnnouncement['announcementDate'].replace('\xa0', ''))
        publish_date_text = publish_date_list[0] + (
            '0' + publish_date_list[1] if len(publish_date_list[1]) == 1 else
            publish_date_list[1]) + ('0' + publish_date_list[2] if len(
                publish_date_list[2]) == 1 else publish_date_list[2])
        punish_datetime = datetime.date(int(publish_date_list[0]),
                                        int(publish_date_list[1]),
                                        int(publish_date_list[2]))

        punishment_decision = each_punishAnnouncement[
            'punishmentDecision'].strip()
        law_list = re.findall('(《.*?》((.*?))?)', punishment_decision)
        laws_final_map = get_law()
        for each_law in law_list:
            if each_law[0] in laws_final_map.keys():
                for each_date in laws_final_map[each_law[0]]:
                    if punish_datetime > each_date['date']:
                        punishment_decision = \
                            punishment_decision.replace(
                                each_law[0],
                                '<a target="_blank" href="' + '/app/lar/' + str(each_date['url'])
                                + '">' + each_law[0] + '</a>'
                            )

        # strip redundant prefixes from the beginning of each text field
        facts = each_punishAnnouncement['facts']
        litigant = each_punishAnnouncement['litigant'].replace(
            ',', ',').replace('(', '(').replace(')', ')').replace(';', ';')
        defense = each_punishAnnouncement['defenseOpinion']
        defense_response = each_punishAnnouncement['defenseResponse']

        for each_redundance in redundance_list:
            facts = re.sub('^' + each_redundance + '[,,。::]?', '', facts)

            litigant = re.sub('^' + each_redundance + '[,,。::]?', '', litigant)

            defense = re.sub('^' + each_redundance + '[,,。::]?', '', defense)

            defense_response = re.sub('^' + each_redundance + '[,,。::]?', '',
                                      defense_response)

            punishment_decision = re.sub('^' + each_redundance + '[,,。::]?',
                                         '', punishment_decision)

        doc = {
            'title':
            each_punishAnnouncement['announcementTitle'],
            'document_code':
            each_punishAnnouncement['announcementCode'],
            'publish_date':
            each_punishAnnouncement['announcementDate'].replace('年0',
                                                                '年').replace(
                                                                    '月0', '月'),
            'publish_date_text':
            int(publish_date_text),
            'litigant_origin_text':
            litigant,
            'litigant':
            '<p>' + '</p><p>'.join(litigant.strip().split('\n')) + '</p>',
            'fact_origin_text':
            facts.strip(),
            'fact':
            '<p>' + '</p><p>'.join(facts.strip().split('\n')) + '</p>',
            'defense':
            '<p>' + '</p><p>'.join(defense.strip().split('\n')) + '</p>',
            'defense_response':
            '<p>' + '</p><p>'.join(defense_response.strip().split('\n')) +
            '</p>',
            'punishment_basis':
            '<p>' +
            '</p><p>'.join(each_punishAnnouncement['punishmentBasement'].strip(
            ).split('\n')) + '</p>',
            'punishment_decision':
            '<p>' + '</p><p>'.join(punishment_decision.strip().split('\n')) +
            '</p>',
            'punishment_org_cate':
            org_cate,
            'punishment_organization':
            each_punishAnnouncement['announcementOrg'],
            'punishment_region':
            announcement_region,
            'punishment_type':
            punishment_type,
            'content_text':
            '\n'.join([
                each_punishAnnouncement['announcementCode'], litigant, facts,
                defense, defense_response,
                each_punishAnnouncement['punishmentBasement'],
                punishment_decision
            ]),
            'html_content':
            content,
            'oss_file_type':
            oss_file_type,
            'oss_file_id':
            str(each_punishAnnouncement['oss_file_id']),
            'oss_file_name':
            oss_file_name
        }

        es_action_list.append({
            '_index':
            str(config['Aliyun_ES']['dev_data_index_name']).strip(),
            '_type':
            str(config['Aliyun_ES']['dev_data_doc_type']).strip(),
            '_id':
            str(each_punishAnnouncement['_id']),
            '_source':
            doc
        })
        logger.info('one document add to action list\n')
        if len(es_action_list) == 50:
            bulk(es, es_action_list, raise_on_error=False)
            logger.info('Inserted into ES 50 documents!!')
            for each_es_action in es_action_list:
                db.announcement.update_one(
                    {'_id': ObjectId(each_es_action['_id'])},
                    {'$set': {
                        'es_status': 'inserted'
                    }})
            logger.info('Update mongodb es_status success')
            es_action_list = []

    if len(es_action_list) > 0:
        bulk(es, es_action_list, raise_on_error=False)
        logger.info('Inserted into ES %d documents!!' % len(es_action_list))
        for each_es_action in es_action_list:
            db.announcement.update_one(
                {'_id': ObjectId(each_es_action['_id'])},
                {'$set': {
                    'es_status': 'inserted'
                }})
        logger.info('Update mongodb es_status success')
Example #26
import requests
from requests_html import HTML

flag = True
n_url = input('enter the url of site :')


def get_p(url):
    return requests.get(url)


def w_chap(ch):
    with open('chapter.text', "a", encoding="utf-8") as chap:
        chap.write(ch)


while flag:
    x = get_p(n_url)
    h = HTML(html=x.text)
    try:
        match = h.find('#next_chap')
        atr = match[0].attrs
        n_url = 'https://readnovelfull.com' + atr['href']
        print('there is a new chapter, parsing...')

    except Exception as e:
        print("no new chapter")
        flag = False
    chap_content = h.find('#chr-content', first=True).text
    w_chap('\n\n' + n_url[40:52] + '\n')
    w_chap(chap_content)
Example #27
from requests_html import HTML
doc = """<a href='https://www.qiushibaike.com/'>"""

html = HTML(html=doc)
html.links
print(html.html)
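
Besides CSS selectors (find) and XPath, requests_html also offers a parse-style search helper that runs over the raw markup; a small sketch on the same one-line document:

from requests_html import HTML

doc = """<a href='https://www.qiushibaike.com/'>"""
html = HTML(html=doc)

print(html.links)                   # {'https://www.qiushibaike.com/'}
print(html.search("href='{}'")[0])  # https://www.qiushibaike.com/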
Example #28
def htmlparser(path: pathlib.Path, doctype: str ='DOCTYPE html'):
    '''HTML Parser.'''

    DEPRECATED_TAGS = (
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'acronym', 'big', 'u', 'isindex', 'basefont', 'dir', 'applet',
        'style',
    )

    REQUIRED_TAGS = {
        'html': (
            ('head', '=', 1),
            ('body', '=', 1),
        ),
        'head': (
            ('title', '=', 1),
        ),
    }

    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img', 'keygen',
        'link', 'meta', 'output', 'param', 'track', 'wbr'
    }

    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio',
        'bdi', 'bdo', 'blockquote', 'body', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
        'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div',
            'dl', 'dt',
        'em',
        'fieldset', 'figure', 'figcaption', 'footer', 'form', 'frameset',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'html',
        'ins',
        'kbd',
        'label', 'legend', 'li',
        'main', 'map', 'menu', 'menuitem', 'meter',
        'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup',
        'p', 'picture', 'pre', 'progress',
        'q',
        'rb', 'rp', 'rt', 'rtc', 'ruby',
        'samp', 'script', 'section', 'select', 'source', 'span', 'strong',
            'sub', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead', 'time',
            'title', 'tfoot', 'tr',
        'ul',
        'var', 'video'
    }

    DEPRECATED_ATTRS = (
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'value', 'valign', 'accept', 'vspace', 'noframes'
    )

    GLOBAL_ATTRS = (
        'lang', 'id', 'class',
    )

    REQUIRED_ATTRS = {
        'html': ('lang',),
    }

    NOEMPTY_TAGS = (
        'title',
    )

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []

        def handle_starttag(self, tag, attrs):

            # tag name must be in lowercase
            # the Python standard module "html.parser" already converts tag
            # names from uppercase to lowercase.
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a

                # attribute name must be lowercase
                if not name.islower():
                    pass#self.attr_name_not_lowercase.append((attr_name, self.lineno))

                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno-1]
            return rawline[pos+1:pos+1+len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return [Report('E00001', path, 0, '')]
    reports = []

    # validate DOCTYPE, using standard HTML parser since
    # requests-html ignore handling the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.append(Report('E01002', path, lineno, obj))
            return reports

        rules = {
            'not_paired_tags': 'E01005',
            'duplicated_attrs': 'E01010',
            'tag_not_lowercase': 'E01011',
        }
        for a, e in rules.items():
            if hasattr(std_parser, a):
                for t in getattr(std_parser, a):
                    reports.append(Report(e, path, t[1], t[0]))

    except AttributeError:
        reports.append(Report('E01001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline
        if tag in DEPRECATED_TAGS:
            reports.append(Report('E01004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.append(Report('E01003', path, lineno, tag))
        else:
            pass
        
        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
                if eval(f'len(element.find(r[0])) !{r[1]} r[2]'):
                    reports.append(Report('E01008', path, lineno, r[0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules:
                if r not in (a.lower() for a in element.attrs):
                    reports.append(Report('E01009', path, lineno, r))

        # parse attributes
        for a in element.attrs:
            a_lower = a
            if not a.islower():
                reports.append(Report('E01012', path, lineno, a))
                a_lower = a.lower()
            if a_lower in DEPRECATED_ATTRS:
                reports.append(Report('E01007', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS:
                reports.append(Report('E01006', path, lineno, a))

    for t in NOEMPTY_TAGS:
        for e in parser.find(t):
            if not e.text:
                reports.append(Report('E01013', path, lineno, e.element.tag))

    return reports
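
The validator in Example #28 leans on the fact that every requests_html Element wraps an lxml node (element.element), which is where the .tag and .sourceline values used in the reports come from. A minimal illustration (sourceline may be None with some parser backends):

from requests_html import HTML

doc = "<html>\n<body>\n<p class='x'>hi</p>\n</body>\n</html>"
parser = HTML(html=doc)

for el in parser.find('p'):
    lxml_el = el.element       # underlying lxml element
    print(lxml_el.tag)         # 'p'
    print(lxml_el.sourceline)  # line number in the parsed source, if recorded
    print(el.attrs)            # {'class': ('x',)}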
Example #29
    r = requests.get(URL)
    if r.status_code == 200:
        html_text = r.text
        if save:
            with open(filename, "w") as f:
                f.write(html_text)
        print(f"Request successful: {r.status_code}")
        return html_text
    return


# Save raw HTML text
html_text = url_to_html(URL, save=False)

# Convert raw HTML to requests_html HTML object
r_html = HTML(html=html_text)

# Find the specific table element within the HTML
table_class: str = ".imdb-scroll-table"
# table_class = "#table"  # Same result
r_table = r_html.find(table_class)
# print(r_table)
# [<Element 'div' id='table' class=('a-section', 'imdb-scroll-table', 'mojo-gutter')>]

# Extract just the text from the table (similar to r.text)
if len(r_table) == 1:
    # print(r_table[0].text)  # Has data but unstructured
    parsed_table = r_table[0]
    rows: t.List = parsed_table.find(
        "tr")  # list of [<Element 'tr'>, <Element 'tr'>, ...]
    # Convert list of Elements to list of Lists
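
Example #29 stops right after collecting the tr elements; the final comment points at turning them into plain lists. A hedged continuation (assuming the first row holds th headers and the remaining rows hold td cells), picking up from the rows list above:

header: t.List[str] = [th.text for th in rows[0].find("th")]
data: t.List[t.List[str]] = [[td.text for td in row.find("td")] for row in rows[1:]]
print(header)
print(data[:2])  # first two data rows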
Example #30
import requests
import platform

if platform.system() == 'Windows':
    from requests_html import HTML

    html = HTML(html=requests.get(
        "https://github.com/trending/python?since=daily").text)

    for proj in html.find("article"):
        title = proj.find("h1 a", first=True)
        desc = proj.find("p", first=True)
        print(f"~~~{title.text}~~~", " {")
        print(f"	https://github.com{title.attrs['href']}")
        try:
            print(f"	{desc.text}")
        except AttributeError:
            pass
        print("}", end="\n\n")

if platform.system() == 'Linux':
    from bs4 import BeautifulSoup

    html = requests.get("https://github.com/trending/python?since=daily").text
    soup = BeautifulSoup(html, 'html.parser')

    for proj in soup.select("article"):
        title = proj.select_one("h1 a")
        desc = proj.select_one("p")
        print(f"{[x.strip() for x in title.text.split('/')]}")
        print(f"https://github.com{title.attrs['href']}")