Example #1
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query(
        "select f.* from feeds f join feeds_stories_map fsm using ( feeds_id ) where stories_id = %(a)s",
        {
            'a': story['stories_id']
        }).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query("select * from downloads where stories_id = %(a)s", {
        'a': story['stories_id']
    }).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
        fallback_date='2011-11-11',
    )

    assert story['publish_date'] == '2011-11-11 00:00:00'

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')

    assert story is not None
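
The `get_story_date_tag()` helper asserted against above is not shown in these examples. A minimal sketch of what that lookup could look like, assuming the date-guess method is stored as a tag on the story under the `GUESS_METHOD_TAG_SET` tag set (the helper name and query shape are illustrative, not the project's actual implementation):

from typing import Optional, Tuple

def get_story_date_tag_sketch(db, story: dict) -> Tuple[Optional[dict], Optional[dict]]:
    """Illustrative only: fetch the date-guess tag and its tag set for a story."""
    tag = db.query("""
        SELECT t.*
        FROM tags AS t
            INNER JOIN tag_sets AS ts ON t.tag_sets_id = ts.tag_sets_id
            INNER JOIN stories_tags_map AS stm ON stm.tags_id = t.tags_id
        WHERE stm.stories_id = %(stories_id)s
          AND ts.name = %(tag_set)s
    """, {'stories_id': story['stories_id'], 'tag_set': GUESS_METHOD_TAG_SET}).hash()

    if tag is None:
        return None, None

    return tag, db.require_by_id('tag_sets', tag['tag_sets_id'])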
Example #2
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }

        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
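
`get_tweet_urls()` above extracts the outbound links from the tweet payload. A hedged sketch, assuming the Twitter API v1.1 shape where links live under `entities.urls[].expanded_url` (the real helper may also walk retweeted or quoted tweets):

from typing import List

def get_tweet_urls_sketch(tweet: dict) -> List[str]:
    """Illustrative only: collect expanded URLs from a v1.1 tweet payload."""
    urls = []
    for entity in tweet.get('entities', {}).get('urls', []):
        expanded_url = entity.get('expanded_url')
        if expanded_url:
            urls.append(expanded_url)
    return urls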
Example #3
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
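
`_get_undateable_tag()` is not shown either; a plausible sketch, assuming it find-or-creates an "undateable" tag inside a date-related tag set (both names below are assumptions, not the project's confirmed labels):

def _get_undateable_tag_sketch(db: DatabaseHandler) -> dict:
    """Illustrative only: find or create the tag marking stories as undateable."""
    # Assumed tag set / tag names -- the real labels may differ.
    tag_set = db.find_or_create('tag_sets', {'name': 'date_invalid'})
    return db.find_or_create('tags', {
        'tag': 'undateable',
        'tag_sets_id': tag_set['tag_sets_id'],
    })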
Example #4
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
Example #5
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        log.info(
            f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching"
        )
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, e.g. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info("URL is in pending state, returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("Seeded content found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        log.info("Content matched")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("Nothing matched, generating story...")

        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:

            log.info(
                "Duplicate story found, checking for story match on unique constraint error..."
            )

            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
Example #6
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, e.g. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" %
                  (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException(
                    "Unable to find matching story after unique constraint error."
                )
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
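
Per the docstring, `_try_fetch_topic_url()` leaves the `try:` and the row update to its caller. A hedged sketch of what that wrapper might look like, assuming the caller persists whatever state the helper set and records unexpected errors on the row (the error-state string and helper name are assumptions):

def fetch_topic_url_sketch(db: DatabaseHandler,
                           topic_fetch_urls_id: int,
                           domain_timeout: Optional[int] = None) -> None:
    """Illustrative wrapper: run the fetch, then persist the mutated row."""
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)
    try:
        _try_fetch_topic_url(db, topic_fetch_url, domain_timeout=domain_timeout)
    except Exception as ex:
        # Assumed error state; the project may use a dedicated constant.
        topic_fetch_url['state'] = 'python error'
        topic_fetch_url['message'] = str(ex)

    db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, topic_fetch_url)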
Example #7
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query(
        """
        SELECT f.*
        FROM feeds_stories_map AS fsm
            INNER JOIN feeds AS f ON
                fsm.feeds_id = f.feeds_id
        WHERE fsm.stories_id = %(stories_id)s
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
    )

    assert story['publish_date'] is None

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')

    assert story is not None
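
Both test variants rely on `get_story_match()` deduplicating by URL: calling `generate_story()` a second time with the same URL returns the already-created story. A rough sketch of that lookup, assuming it compares the candidate URLs against `stories.url` and `stories.guid` (real implementations typically normalize URLs first; that step is omitted here):

from typing import Optional

def get_story_match_sketch(db, url: str, redirect_url: Optional[str] = None) -> Optional[dict]:
    """Illustrative only: match an existing story by exact url/guid candidates."""
    candidate_urls = [url] if redirect_url in (None, url) else [url, redirect_url]

    return db.query("""
        SELECT *
        FROM stories
        WHERE url = ANY(%(urls)s)
           OR guid = ANY(%(urls)s)
        LIMIT 1
    """, {'urls': candidate_urls}).hash()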