def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }

        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
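
For reference, a minimal sketch of the tweet dict this function consumes, based only on the keys read above; a real Twitter API payload carries many more fields, and the values here are illustrative assumptions.

# Hypothetical minimal tweet fixture (illustrative values only).
tweet_sketch = {
    'id': 1234567890,
    'text': 'example tweet text https://example.com/some-story',
    'created_at': '2016-10-15 08:00:00',
    'user': {'screen_name': 'example_user'},
    # get_tweet_urls() presumably inspects the standard 'entities' block;
    # this empty list is an assumption, not something shown above.
    'entities': {'urls': []},
}
# story = _add_tweet_story(db, topic, tweet_sketch, topic_fetch_urls=[])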
Example #2
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(table='stories',
                      insert_hash={
                          'media_id': int(feed['media_id']),
                          'url': "http://story.test/%s" % label,
                          'guid': "guid://story.test/%s" % label,
                          'title': "story %s" % label,
                          'description': "description %s" % label,
                          'publish_date': '2016-10-15 08:00:00',
                          'collect_date': '2016-10-15 10:00:00',
                          'full_text_rss': True,
                      })

    db.create(table='feeds_stories_map',
              insert_hash={
                  'feeds_id': int(feed['feeds_id']),
                  'stories_id': int(story['stories_id']),
              })

    return story
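
A hedged usage sketch of the fixture helpers gathered in this listing; how the DatabaseHandler is obtained (here a connect_to_db() helper) is an assumption about the surrounding test harness.

db = connect_to_db()  # assumption: the test harness provides a DatabaseHandler
medium = create_test_medium(db, 'usage example')  # defined later in this listing
feed = create_test_feed(db, 'usage example', medium)
story = create_test_story(db, 'usage example', feed)
assert story['guid'] == 'guid://story.test/usage example'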
Example #3
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(
        table='stories',
        insert_hash={
            'media_id': int(feed['media_id']),
            'url': "http://story.test/%s" % label,
            'guid': "guid://story.test/%s" % label,
            'title': "story %s" % label,
            'description': "description %s" % label,
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
    )

    db.create(
        table='feeds_stories_map',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'stories_id': int(story['stories_id']),
        }
    )

    return story
Example #4
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }

        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
Example #5
def create_test_timespan(db: DatabaseHandler,
                         topic: dict = None,
                         snapshot: dict = None) -> dict:
    """Create simple timespans for testing.

    Must pass either topic or snapshot or both. If a snapshot is not passed, create one.
    """
    assert topic is not None or snapshot is not None

    if not snapshot:
        snapshot = create_test_snapshot(db, topic)

    return db.create(table='timespans',
                     insert_hash={
                         'topics_id': snapshot['topics_id'],
                         'snapshots_id': snapshot['snapshots_id'],
                         'start_date': snapshot['start_date'],
                         'end_date': snapshot['end_date'],
                         'period': 'overall',
                         'story_count': 0,
                         'story_link_count': 0,
                         'medium_count': 0,
                         'medium_link_count': 0,
                         'post_count': 0
                     })
Example #6
def create_test_snapshot(db: DatabaseHandler, topic: dict) -> dict:
    """Create simple snapshot for testing."""
    return db.create(table='snapshots',
                     insert_hash={
                         'topics_id': topic['topics_id'],
                         'snapshot_date': topic['end_date'],
                         'start_date': topic['start_date'],
                         'end_date': topic['end_date']
                     })
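
A hedged sketch tying the two helpers above together; create_test_topic() is defined later in this listing, and the db handle is assumed to come from the test setup.

topic = create_test_topic(db, 'timespan example')
# Passing only the topic makes create_test_timespan() build the snapshot
# itself via create_test_snapshot().
timespan = create_test_timespan(db, topic=topic)
assert timespan['period'] == 'overall'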
Example #7
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
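
A hedged usage sketch for add_content_to_test_story(); the HTML snippet is an arbitrary stand-in, and the fixture helpers and db handle come from the assumed test setup shown earlier.

medium = create_test_medium(db, 'content example')
feed = create_test_feed(db, 'content example', medium)
story = create_test_story(db, 'content example', feed)
# Optional: supply explicit content instead of the generated _get_test_content().
story['content'] = '<html><body><p>custom test content</p></body></html>'
story = add_content_to_test_story(db, story=story, feed=feed)
assert story['download_text']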
Example #9
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict,
                     topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db,
                                                 url=url,
                                                 content=content,
                                                 title=title,
                                                 publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db,
                                               story=story,
                                               topic=topic,
                                               link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(
            db, topic_fetch_url)

    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(
                db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }

        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
Example #10
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""

    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    return db.create(table='feeds',
                     insert_hash={
                         'name': label,
                         'url': "http://feed.test/%s" % label,
                         'media_id': int(medium['media_id']),
                     })
Example #11
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(table='media',
                     insert_hash={
                         'name': label,
                         'url': "http://media.test/%s" % (label, ),
                         'is_monitored': True,
                         'public_notes': "%s public notes" % (label, ),
                         'editor_notes': "%s editor notes" % (label, ),
                     })
Example #12
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='media',
        insert_hash={
            'name': label,
            'url': "http://media.test/%s" % (label,),
            'is_monitored': True,
            'public_notes': "%s public notes" % (label,),
            'editor_notes': "%s editor notes" % (label,),
        })
Example #13
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""

    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    return db.create(
        table='feeds',
        insert_hash={
            'name': label,
            'url': "http://feed.test/%s" % label,
            'media_id': int(medium['media_id']),
        }
    )
Example #14
def create_test_topic_stories(
        db: DatabaseHandler,
        topic: dict,
        num_media: int = 10,
        num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium."""
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)
        
        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            db.update_by_id('stories', story['stories_id'], {'publish_date': topic['start_date']})
            db.create('topic_stories', {'topics_id': topic['topics_id'], 'stories_id': story['stories_id']})

    db.query(
        """
        insert into topic_links ( topics_id, stories_id, url, ref_stories_id )
            select %(a)s, a.stories_id, b.url, b.stories_id
                from stories a
                    join stories b on ( a.media_id <> b.media_id )
                limit %(b)s
        """,
        {'a': topic['topics_id'], 'b': num_media * num_stories_per_medium})
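
A hedged usage sketch, assuming create_test_topic() from later in this listing and a db handle from the test setup.

topic = create_test_topic(db, 'topic stories example')
# Creates 2 test media with 3 stories each, plus cross-media topic_links.
create_test_topic_stories(db, topic, num_media=2, num_stories_per_medium=3)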
Example #15
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(table='topics',
                     insert_hash={
                         'name': label,
                         'description': label,
                         'pattern': label,
                         'solr_seed_query': label,
                         'solr_seed_query_run': True,
                         'start_date': '2016-01-01',
                         'end_date': '2016-03-01',
                         'job_queue': 'mc',
                         'max_stories': 100000,
                     })
Example #16
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='topics',
        insert_hash={
            'name': label,
            'description': label,
            'pattern': label,
            'solr_seed_query': label,
            'solr_seed_query_run': True,
            'start_date': '2016-01-01',
            'end_date': '2016-03-01',
            'job_queue': 'mc',
            'max_stories': 100000,
        }
    )
Example #17
def create_download_for_story(db: DatabaseHandler, feed: dict,
                              story: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': feed['feeds_id'],
                         'url': story['url'],
                         'host': host,
                         'type': 'content',
                         'sequence': 1,
                         'state': 'success',
                         'priority': 1,
                         'extracted': False,
                         'path': 'postgresql:foo',
                         'stories_id': story['stories_id'],
                     })
Example #18
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'success',
            'priority': 1,
            'extracted': False,
            'path': 'postgresql:foo',
            'stories_id': story['stories_id'],
        }
    )
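
A hedged sketch reusing the medium/feed/story fixtures assumed from earlier in this listing.

download = create_download_for_story(db, feed=feed, story=story)
assert download['state'] == 'success'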
Example #19
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': int(feed['feeds_id']),
                         'url': feed['url'],
                         'host': host,
                         'type': 'feed',
                         'sequence': 1,
                         'state': 'pending',
                         'priority': priority,
                         'download_time': 'NOW()',
                         'extracted': False,
                     })
Example #20
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'url': feed['url'],
            'host': host,
            'type': 'feed',
            'sequence': 1,
            'state': 'pending',
            'priority': priority,
            'download_time': 'NOW()',
            'extracted': False,
        })
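
A hedged sketch for the feed-level variant; the feed fixture is assumed from earlier in this listing.

# A feed without 'last_attempted_download_time' gets priority 10, otherwise 0.
download = create_download_for_feed(db, feed)
assert download['type'] == 'feed'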
Example #21
def add_content_to_test_story(db: DatabaseHandler, story: dict,
                              feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    download = db.create(table='downloads',
                         insert_hash={
                             'feeds_id': feed['feeds_id'],
                             'url': story['url'],
                             'host': host,
                             'type': 'content',
                             'sequence': 1,
                             'state': 'fetching',
                             'priority': 1,
                             'extracted': True,
                             'stories_id': story['stories_id'],
                         })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    sentences = lang.split_text_to_sentences(extracted_content)
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences',
                  insert_hash={
                      'sentence': sentence,
                      'language': language_code_for_text(sentence) or 'en',
                      'sentence_number': sentence_number,
                      'stories_id': story['stories_id'],
                      'media_id': story['media_id'],
                      'publish_date': story['publish_date'],
                  })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
            'downloads_id': download['downloads_id']
        }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example #22
def create_test_topic_posts(
        db: DatabaseHandler,
        topic: dict,
        num_posts_per_day: int = 10) -> int:
    """Fill topic with topic_posts. Return the number of posts created."""
    date = dateutil.parser.parse(topic['start_date'])
    end_date = dateutil.parser.parse(topic['end_date'])

    tsq = {
        'topics_id': topic['topics_id'],
        'source': 'csv',
        'platform': 'generic_post',
        'query': 'foo'
    }
    tsq = db.create('topic_seed_queries', tsq)

    stories = db.query(
        "select * from snap.live_stories where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    num_posts = 0
    while date < end_date:
        tpd = {
            'topic_seed_queries_id': tsq['topic_seed_queries_id'],
            'day': date.strftime('%Y-%m-%d'),
            'num_posts_stored': num_posts_per_day,
            'num_posts_fetched': num_posts_per_day,
        }
        tpd = db.create('topic_post_days', tpd)

        for i in range(num_posts_per_day):
            author_num = i
            channel_num = i % num_posts_per_day
            topic_post = {
                'topic_post_days_id': tpd['topic_post_days_id'],
                'post_id': i,
                'content': f'content {i}',
                'author': f'author {author_num}',
                'channel': f'channel {channel_num}',
                'publish_date': date.strftime('%Y-%m-%d'),
                'data': '{}'
            }
            topic_post = db.create('topic_posts', topic_post)

            post_story = stories[num_posts % len(stories)]
            tpu = {
                'topic_posts_id': topic_post['topic_posts_id'],
                'url': post_story['url']
            }
            tpu = db.create('topic_post_urls', tpu)

            tsu = {
                'topics_id': topic['topics_id'],
                'url': post_story['url'],
                'stories_id': post_story['stories_id'],
                'topic_seed_queries_id': tsq['topic_seed_queries_id'],
                'topic_post_urls_id': tpu['topic_post_urls_id']
            }
            tsu = db.create('topic_seed_urls', tsu)

            num_posts += 1

        date = date + datetime.timedelta(days=1)

    return num_posts
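
A hedged usage sketch; it assumes that rows created by create_test_topic_stories() become visible in snap.live_stories (which this helper reads), a denormalization not shown in this listing.

topic = create_test_topic(db, 'posts example')
create_test_topic_stories(db, topic)
# 2016-01-01 through 2016-03-01 at 5 posts per day.
num_posts = create_test_topic_posts(db, topic, num_posts_per_day=5)
assert num_posts > 0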
Example #23
def create_test_topic_stories(db: DatabaseHandler,
                              topic: dict,
                              num_media: int = 10,
                              num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium."""
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)

        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            db.update_by_id('stories', story['stories_id'],
                            {'publish_date': topic['start_date']})
            db.create('topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id']
            })

    test_topic_links = db.query(
        """
        WITH src_stories AS (
            SELECT *
            FROM stories
        ),
        test_topic_links AS (
            SELECT
                src_stories.stories_id,
                ref_stories.url,
                ref_stories.stories_id AS ref_stories_id
            FROM src_stories
                INNER JOIN stories AS ref_stories
                    ON src_stories.media_id != ref_stories.media_id
            LIMIT %(topic_link_count)s
        )
        SELECT
            stories_id,
            url,
            ref_stories_id
        FROM test_topic_links
    """, {
            'topic_link_count': num_media * num_stories_per_medium,
        }).hashes()

    # Inserting individually because otherwise it complains with:
    # ERROR:  cannot handle complex subqueries when the router executor is disabled
    for test_topic_link in test_topic_links:
        db.query(
            """
            INSERT INTO topic_links (
                topics_id,
                stories_id,
                url,
                ref_stories_id
            ) VALUES (
                %(topics_id)s,
                %(stories_id)s,
                %(url)s,
                %(ref_stories_id)s
            )
        """, {
                'topics_id': topic['topics_id'],
                'stories_id': test_topic_link['stories_id'],
                'url': test_topic_link['url'],
                'ref_stories_id': test_topic_link['ref_stories_id'],
            })