Exemplo n.º 1
0
def test_bbc_story():
    """Check that get_story yields non-empty fields for a BBC article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://www.bbc.com/news/world-us-canada-41973952'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['bbc']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 2
0
def test_nola_story():
    """Check that get_story yields non-empty fields for a nola.com article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://www.nola.com/northshore/index.ssf/2018/02/three_st_tammany_students_accu.html#incart_2box_nola_river_orleans_news'
    scraped = get_story(make_browser(), article_url,
                        sites['nola']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 3
0
def test_usa_story():
    """Check that get_story yields non-empty fields for a USA Today article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.usatoday.com/story/money/nation-now/2018/02/26/trump-just-claimed-u-s-makes-better-solar-panels-than-china-thats-not-quite-right/375307002/'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['usa']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 4
0
def test_guardian_story():
    """Check that get_story yields non-empty fields for a Guardian article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.theguardian.com/us-news/2017/nov/09/one-year-later-trump-takes-a-grand-tour-of-asia-as-clinton-visits-wisconsin-finally'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['guardian']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 5
0
def test_nyTimes_story():
    """Check that get_story yields non-empty fields for a New York Times article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.nytimes.com/2017/11/09/opinion/nuisance-ordinances-eviction-violence.html?action=click&pgtype=Homepage&clickSource=story-heading&module=opinion-c-col-left-region&region=opinion-c-col-left-region&WT.nav=opinion-c-col-left-region'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['nyTimes']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 6
0
def test_npr_story():
    """Check that get_story yields non-empty fields for an NPR article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.npr.org/2018/02/27/585133064/lawmakers-agree-on-paid-family-leave-but-not-the-details'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['npr']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 7
0
def test_nbc_story():
    """Check that get_story yields non-empty fields for an NBC News article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.nbcnews.com/news/us-news/parkland-school-shooting-stoneman-douglas-students-prepare-confront-memories-they-n851656'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['nbc']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 8
0
def test_la_times_story():
    """Check that get_story yields non-empty fields for an LA Times article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://www.latimes.com/politics/la-na-pol-jared-kushner-20180227-story.html'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['la_times']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 9
0
def test_cbs_story():
    """Check that get_story yields non-empty fields for a CBS News article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.cbsnews.com/news/brad-parscale-trump-2020-campagin-manager-announced-today-2018-02-27/'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['cbs']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 10
0
def test_metro_story():
    """Check that get_story yields non-empty fields for a metro.co.uk article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://metro.co.uk/2018/02/27/kevin-spacey-foundation-shut-uk-actor-faces-sexual-assault-allegations-7347287/'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['metro']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 11
0
def test_verge_story():
    """Check that get_story yields non-empty fields for a Verge article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'https://www.theverge.com/2018/2/27/17060092/blackberry-world-app-store-paid-apps-discontinuation-removal-april-1'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['verge']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 12
0
def test_fox_story():
    """Check that get_story yields non-empty fields for a Fox News article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://www.foxnews.com/politics/2018/02/27/supreme-court-rules-that-detained-immigrants-dont-get-automatic-bond-hearings.html'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['fox']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 13
0
def test_eOnline_story():
    """Check that get_story yields non-empty fields for an E! Online article URL.

    The 'title', 'desc', 'story' and 'image' entries of the result must
    each have a length greater than zero.
    """
    article_url = 'http://www.eonline.com/news/893550/did-kylie-jenner-have-a-private-baby-shower-all-the-details-on-her-pink-filled-celebration'
    driver = make_browser()
    scraped = get_story(driver, article_url, sites['eOnline']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(scraped[field]) > 0
Exemplo n.º 14
0
def main():
    """Gather story links for every configured job, scrape them, and publish.

    Workflow:
      1. Connect a browser (``config.connect_browser``) and a Redis client.
      2. For each ``(name, job)`` in ``config.WORK``, call ``get_links`` with
         the job's ``url`` and ``link_regex`` and collect ``(name, link)``
         pairs. A failing job is logged and skipped after reconnecting the
         browser.
      3. De-duplicate the pairs, then shuffle them so the visiting order is
         randomized.
      4. For each pair not already present in Redis: sleep a random 1-8
         seconds, scrape it with ``get_story``, skip results whose
         ``'story'`` entry is empty, and publish the rest to
         ``config.QUEUE_NAME``.

    Errors from scraping or publishing are printed and never propagate; a
    failed publish is retried once on a fresh channel.
    """
    browser = config.connect_browser()
    db = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)

    # We do not want to look like a bot: first collect every candidate link,
    # then visit them in shuffled order rather than site by site.
    links = []
    for name, job in config.WORK.items():
        new_links = []

        try:
            print(job['url'])
            new_links = get_links(browser, job['url'], job['link_regex'])
        except Exception as e:
            # Browser sessions get a little funky; in this case refresh the
            # connection and carry on with the next job.
            browser = config.connect_browser()
            print(e)

        links += [(name, link) for link in new_links]

    # De-duplicate BEFORE shuffling: iterating a set yields its own arbitrary
    # order, so shuffling first and then building a set discards the shuffle.
    links = list(set(links))
    random.shuffle(links)

    channel = config.setup_mq(config.QUEUE_NAME)
    for link in links:
        # Avoid doing unnecessary duplicate work.
        # NOTE(review): `link` is a (name, url) tuple here; confirm that the
        # code which records processed links in Redis uses the same key form.
        if db.exists(link):
            continue

        # Just in case: pause a random 1-8 seconds between page loads.
        time.sleep(random.randint(1, 8))

        try:
            print(link)
            story = get_story(browser, link[1],
                              config.WORK[link[0]]['story_xpath'])
        except Exception as e:
            print(e)
            browser = config.connect_browser()
            continue

        # Quick filtering to avoid invalid stories
        if not story['story']:
            continue

        # Often connections are lost; reconnect in those cases and retry once
        # so the story that was already scraped is not silently dropped.
        try:
            publish_story(channel, config.QUEUE_NAME, story)
        except Exception as e:
            print(e)
            channel = config.setup_mq(
                config.QUEUE_NAME)  # Refresh the connection
            try:
                publish_story(channel, config.QUEUE_NAME, story)
            except Exception as retry_error:
                # Best effort: report and move on to the next link.
                print(retry_error)