# Example 1
def get_html(wd: Firefox, article: Dict[str, Union[int,
                                                   str]]) -> Dict[str, str]:
    """Load an article page, inline its images, clean the DOM, and store
    the resulting HTML in ``article['html']``.

    Args:
        wd: An active Selenium Firefox webdriver.
        article: Mapping with at least ``'link'`` and ``'title'`` keys;
            mutated in place to gain an ``'html'`` key.

    Returns:
        The same ``article`` mapping, now containing the cleaned HTML.
    """
    wd.get(article['link'])
    print(article['title'])
    sleep(3)

    # Scroll to the bottom to trigger lazy-loading of images.
    scrollBottom(wd)

    # For each image, replace with base64 (script does the DOM rewrite).
    with open('convert_images.js', 'r', encoding='utf-8') as script_file:
        script_src = script_file.read()

    wd.execute_script(script_src)

    # Give the in-page conversion time to finish before reading the DOM.
    sleep(5)

    # Run the clean-up script, then capture the full page source.
    # (The previous version also extracted the <article> element's
    # outerHTML here, but that value was never used — removed as dead code.)
    with open('cleanify.js', encoding='utf-8') as script_file:
        script_src = script_file.read()
        wd.execute_script(script_src)
        sleep(3)

    html: str = wd.page_source
    # Widen Medium's fixed-width column so the rendered PDF uses the page.
    html = html.replace('max-width:680px', 'max-width:90%')

    article['html'] = html
    return article
# Example 2
def get_links(wd: Firefox,
              p_type: str,
              tag: str = None,
              limit: int = 10) -> List[Dict[str, Union[int, str]]]:
    """Collect article metadata from a TowardsDataScience index page.

    Args:
        wd: An active Selenium Firefox webdriver.
        p_type: Page type; ``'trending'`` selects the trending index,
            anything else uses the latest (or tagged) index.
        tag: Optional tag key looked up in ``tag_map`` to build a
            tagged-index URL.
        limit: Maximum number of links to return.

    Returns:
        Up to ``limit`` dicts with keys ``title``, ``date``, ``claps``,
        ``comments`` and ``link``, filtered by ``CLAP_THRESHOLD``.
    """
    links_url = 'https://towardsdatascience.com/latest'

    if tag:
        links_url = "https://towardsdatascience.com/tagged/%s" % tag_map[tag]

    print("Parsing Index Page: %s" % links_url)

    # NOTE: 'trending' overrides the tag URL — order matters here.
    if p_type == 'trending':
        links_url = 'https://towardsdatascience.com/trending'
    wd.get(links_url)
    sleep(3)
    links = []
    num_articles = 0
    articles_parsed = 0

    # Keep scrolling until at least `limit` article cards are in the DOM.
    # Guard against an infinite loop when the page stops yielding new
    # articles (previously this would spin forever on short indexes).
    prev_count = -1
    while num_articles < limit:
        articles = wd.find_elements_by_css_selector('.postArticle')
        num_articles = len(articles)
        if num_articles == prev_count:
            break
        prev_count = num_articles
        wd.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(3)

    while True:
        for article in articles:
            title = article.find_element_by_css_selector(
                '.graf--title').text.strip()
            date = article.find_element_by_css_selector('time').get_attribute(
                'datetime')
            try:
                claps = int(
                    article.find_element_by_css_selector(
                        '.js-multirecommendCountButton').text)
            except (NoSuchElementException, ValueError):
                # Missing clap button or non-numeric text (e.g. "1.2K").
                claps = 0
            try:
                comments = article.find_element_by_css_selector(
                    '.buttonSet.u-floatRight > a[href]').text
                comments = int(comments.split(' ')[0])
            except NoSuchElementException:
                comments = 0

            try:
                link = article.find_element_by_css_selector(
                    '.postArticle-content > a').get_attribute('href')
            except NoSuchElementException:
                # Fallback selector for the alternative card layout.
                link = article.find_element_by_css_selector(
                    '.postArticle > div:nth-child(2) > a').get_attribute(
                        'href')

            # Skip low-engagement articles.
            if claps < CLAP_THRESHOLD:
                continue

            links.append({
                'title': title,
                'date': date,
                'claps': claps,
                'comments': comments,
                'link': link
            })

        if len(links) >= limit:
            break
        else:
            print("Getting More articles to match threshold...")
            articles_parsed += len(articles)
            wd.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(5)
            articles = wd.find_elements_by_css_selector(
                '.postArticle')[articles_parsed:]
            # No new articles appeared: return what we have rather than
            # looping forever.
            if not articles:
                break

    return links[:limit]
# Example 3
def scrollBottom(wd: Firefox):
    """Scroll the window to the bottom of the page in 500px steps.

    Pauses half a second between steps so lazily loaded content
    (e.g. images) has time to appear.
    """
    page_height = int(wd.execute_script("return document.body.scrollHeight;"))
    offset = 0
    while offset < page_height:
        wd.execute_script("window.scrollTo(0,%i)" % offset)
        sleep(0.5)
        offset += 500
# Example 4
    os.mkdir('tmp')

    fo = Options()
    fo.headless = True
    wd = Firefox(executable_path=gd_path, firefox_binary=ff_path, options=fo)
    wd.header_overrides = {
        'Referer': 'https://twitter.com/freedom',
    }
    links = get_links(wd, p_type, args.tag)

    # TODO Filter out links already parsed and sent

    for link in links:
        get_html(wd, link)
        wd.delete_all_cookies()
        wd.execute_script('window.localStorage.clear()')
        wd.execute_script('window.sessionStorage.clear()')

    # TODO Create TOC

    wd.close()

    html_files = []
    for i, link in enumerate(links):

        with open('tmp/%i.html' % i, 'w') as tmp_html:
            tmp_html.write(link['html'])

        html_files.append('tmp/%i.html' % i)

    cmd_args = ['html2pdf/main.js']