예제 #1
0
def test_get_tweet_urls() -> None:
    """Check get_tweet_urls() on a plain tweet and on a tweet that carries a retweeted status."""
    # Plain tweet: only the tweet's own entity urls are present.
    plain_entity_urls = [{'expanded_url': 'foo'}, {'expanded_url': 'bar'}]
    plain_tweet = {'entities': {'urls': plain_entity_urls}}

    got = get_tweet_urls(plain_tweet)
    assert sorted(got) == ['bar', 'foo']

    # Retweet: urls from both the tweet itself and its 'retweeted_status' are expected back.
    own_entities = {
        'urls': [{'expanded_url': 'url foo'}, {'expanded_url': 'url bar'}],
    }
    retweeted_entities = {
        'urls': [{'expanded_url': 'rt url foo'}, {'expanded_url': 'rt url bar'}],
    }
    retweet = {
        'entities': own_entities,
        'retweeted_status': {'entities': retweeted_entities},
    }

    got = get_tweet_urls(retweet)
    wanted = ['url bar', 'url foo', 'rt url foo', 'rt url bar']
    # Order is not part of the contract being checked, so compare sorted lists.
    assert sorted(got) == sorted(wanted)
예제 #2
0
 def get_post_urls(self, post: dict) -> list:
     """Return the urls found in the given post.

     When the post payload carries a tweet (either under post['data']['data']['tweet'] or directly under
     post['data']['tweet']), the urls are extracted from that tweet with get_tweet_urls(); otherwise the
     parent class implementation is used.
     """
     payload = post['data']

     # nested layout: the tweet sits one 'data' level deeper
     if 'data' in payload and 'tweet' in payload['data']:
         return get_tweet_urls(payload['data']['tweet'])

     # flat layout: the tweet sits directly in the payload
     if 'tweet' in payload:
         return get_tweet_urls(payload['tweet'])

     return super().get_post_urls(post)
예제 #3
0
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Create a story from a tweet, add it to the topic, and record the tweet's outgoing links.

    Args:
        db: database handle passed through to the story / topic helpers and used to create topic_links rows.
        topic: topic dict; 'topics_id' is read here and the whole dict is passed to add_to_topic_stories().
        tweet: tweet dict (as returned by the twitter api); 'user'.'screen_name', 'text', 'created_at'
            and 'id' are read.
        topic_fetch_urls: fetch url dicts that led to this tweet; each is passed to _log_tweet_added() and
            the result to try_update_topic_link_ref_stories_id().

    Returns:
        The story returned by generate_story().
    """
    author = tweet['user']['screen_name']
    text = tweet['text']
    headline = f"{author}: {text}"
    published = tweet['created_at']
    status_url = f"https://twitter.com/{author}/status/{tweet['id']}"

    story = generate_story(db=db, url=status_url, content=text, title=headline, publish_date=published)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # mark every fetch url as resolved to the new story, then update any link that referenced it
    for fetch_url in topic_fetch_urls:
        logged_fetch_url = _log_tweet_added(db, fetch_url, story)
        try_update_topic_link_ref_stories_id(db, logged_fetch_url)

    # one topic_links row per url in the tweet, except those skipped as self-linked domain urls
    for linked_url in get_tweet_urls(tweet):
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], linked_url):
            log.debug("skipping self linked domain url...")
        else:
            link_row = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': linked_url,
            }
            db.create('topic_links', link_row)
            increment_domain_links(db, link_row)

    return story
예제 #4
0
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls.

    Args:
        db: database handle used to list the topic's posts and to load each post row.
        topic: topic dict; only 'topics_id' is read.

    For every matching topic_posts row, the stored 'data' json is decoded, the urls are extracted from
    data['data']['tweet'] with get_tweet_urls(), and the result is handed to _insert_post_urls().
    """
    # topics_id is a column of all three joined tables (each ON clause below references it), so the filter
    # has to be table-qualified; an unqualified reference is ambiguous and is rejected by engines such as
    # PostgreSQL.
    topic_posts_ids = db.query(
        """
        SELECT
            topic_posts.topic_posts_id
        FROM topic_posts
            INNER JOIN topic_post_days ON
                topic_posts.topics_id = topic_post_days.topics_id AND
                topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id
            INNER JOIN topic_seed_queries ON
                topic_post_days.topics_id = topic_seed_queries.topics_id AND
                topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id
        WHERE
            topic_posts.topics_id = %(topics_id)s
        """, {
            'topics_id': topic['topics_id'],
        }
    ).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        # progress message once every 1000 posts
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
예제 #5
0
def _get_post_urls(post: dict) -> list:
    """Given a post, return a list of urls included in the post."""
    # let the underlying module pass the urls in a field rather than parsing them out
    try:
        return post['urls']
    except:
        pass

    # for ch tweets, find the tweets in the tweet payload so that we get the expanded urls rather than ti.co's
    if 'data' in post['data'] and 'tweet' in post['data']['data']:
        return get_tweet_urls(post['data']['data']['tweet'])
    elif 'tweet' in post['data']:
        return get_tweet_urls(post['data']['tweet'])

    links = []
    for url in re.findall(r'https?://[^\s\")]+', post['content']):
        url = re.sub(r'\W+$', '', url)
        links.append(url)

    return links
예제 #6
0
def _get_post_urls(post: dict) -> list:
    """Given a post, return a list of urls included in the post."""
    if 'urls' in post:
        return post['urls']

    if 'tweet' in post:
        return get_tweet_urls(post['tweet'])

    links = []
    for url in re.findall(r'https?://[^\s\")]+', post['content']):
        url = re.sub(r'\W+$', '', url)
        links.append(url)

    return links
예제 #7
0
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Re-extract and reinsert the tweet urls of every post belonging to the given topic.

    Args:
        db: database handle used to list the topic's posts and to load each post row.
        topic: topic dict; only 'topics_id' is read.
    """
    posts_query = """
        select tt.topic_posts_id
            from topic_posts tt
                join topic_post_days ttd using ( topic_post_days_id )
            where
                topics_id = %(a)s
        """
    query_params = {'a': topic['topics_id']}
    post_ids = db.query(posts_query, query_params).flat()

    for position, post_id in enumerate(post_ids):
        # progress message once every 1000 posts
        if position % 1000 == 0:
            progress = (position, len(post_ids))
            log.info('regenerate tweet urls: %d/%d' % progress)

        post_row = db.require_by_id('topic_posts', post_id)
        payload = decode_json(post_row['data'])
        tweet_urls = get_tweet_urls(payload['data']['tweet'])
        _insert_post_urls(db, post_row, tweet_urls)