예제 #1
0
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Attach content to a test story, store it as a download and run extraction on it.

    The content is story['content'] when that key is present, otherwise whatever _get_test_content() returns. A
    'content' download row is created for the story, the content is handed to store_content(), and then
    extract_and_process_story() is called. A truthy story['full_text_rss'] is reset to False, both on the dict and
    in the 'stories' table.

    :param db: Database handler.
    :param story: Story dict; 'stories_id' and 'url' are read.
    :param feed: Feed dict; 'feeds_id' and 'url' are read.
    :return: The story dict with 'download', 'content' and 'download_text' set.
    :raises McAddContentToTestStoryException: If the download_texts lookup for the new download comes back empty.
    """

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content = story['content'] if 'content' in story else _get_test_content()

    if story.get('full_text_rss'):
        # Reset the flag in the dict and in the database row
        story['full_text_rss'] = False
        db.update_by_id(table='stories', object_id=story['stories_id'], update_hash={'full_text_rss': False})

    feed_host = get_url_host(feed['url'])

    download_fields = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': feed_host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': False,
        'stories_id': story['stories_id'],
    }
    new_download = db.create(table='downloads', insert_hash=download_fields)

    # store_content() returns the download to use from here on
    download = store_content(db=db, download=new_download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    download_text_row = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    # Set the key before validating so the story dict is updated even when the check below fails
    story['download_text'] = download_text_row

    if not download_text_row:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
예제 #2
0
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Give a test story a stored download plus extracted text, and return the updated story.

    Content comes from story['content'] if that key exists; otherwise it is generated with _get_test_content().
    The function creates a 'content' row in 'downloads', passes the content to store_content(), calls
    extract_and_process_story(), and finally reads the matching download_texts row back into the story.

    :param db: Database handler.
    :param story: Story dict; 'stories_id' and 'url' are read, 'full_text_rss' is reset to False if truthy.
    :param feed: Feed dict; 'feeds_id' and 'url' are read.
    :return: The story dict with 'download', 'content' and 'download_text' keys filled in.
    :raises McAddContentToTestStoryException: If the download_texts query returns nothing for the download.
    """

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content = story['content'] if 'content' in story else _get_test_content()

    wants_full_text_rss = story.get('full_text_rss')
    if wants_full_text_rss:
        # Clear the flag both locally and in the 'stories' table
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    url_host = get_url_host(feed['url'])

    stored_download = store_content(
        db=db,
        download=db.create(
            table='downloads',
            insert_hash={
                'feeds_id': feed['feeds_id'],
                'url': story['url'],
                'host': url_host,
                'type': 'content',
                'sequence': 1,
                'state': 'fetching',
                'priority': 1,
                'extracted': False,
                'stories_id': story['stories_id'],
            },
        ),
        content=content,
    )

    story['download'] = stored_download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    lookup_params = {'downloads_id': stored_download['downloads_id']}
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, lookup_params).hash()

    if story['download_text']:
        return story

    raise McAddContentToTestStoryException("Unable to find download_text")
예제 #3
0
def create_test_topic_stories(db: DatabaseHandler,
                              topic: dict,
                              num_media: int = 10,
                              num_stories_per_medium: int = 10) -> None:
    """Populate a topic with test stories and cross-media topic links.

    For each of num_media media, one feed and num_stories_per_medium stories are created; each story gets the
    topic's start_date as its publish_date and is added to topic_stories. Afterwards a single INSERT ... SELECT adds
    up to num_media * num_stories_per_medium rows to topic_links, pairing stories whose media_id differ.

    :param db: Database handler.
    :param topic: Topic dict; 'topics_id' and 'start_date' are read.
    :param num_media: Number of media (and feeds) to create.
    :param num_stories_per_medium: Number of stories to create per medium.
    """
    for mi in range(num_media):
        test_medium = create_test_medium(db, f'medium {mi}')
        test_feed = create_test_feed(db, f'feed {mi}', test_medium)

        for si in range(num_stories_per_medium):
            test_story = create_test_story(db, f'story {mi} {si}', test_feed)

            # Move the story to the topic's start date
            db.update_by_id(
                table='stories',
                object_id=test_story['stories_id'],
                update_hash={'publish_date': topic['start_date']},
            )

            topic_story = {'topics_id': topic['topics_id'], 'stories_id': test_story['stories_id']}
            db.create(table='topic_stories', insert_hash=topic_story)

    # 'a' is the topic ID, 'b' caps the number of generated links
    link_params = {'a': topic['topics_id'], 'b': num_media * num_stories_per_medium}
    link_sql = """
        insert into topic_links ( topics_id, stories_id, url, ref_stories_id )
            select %(a)s, a.stories_id, b.url, b.stories_id
                from stories a
                    join stories b on ( a.media_id <> b.media_id )
                limit %(b)s
        """
    db.query(link_sql, link_params)
예제 #4
0
def add_content_to_test_story(db: DatabaseHandler, story: dict,
                              feed: dict) -> dict:
    """Attach content to a test story and write its download, download text and sentences directly.

    Content is story['content'] when that key is present, otherwise the result of _get_test_content(). The function
    creates a 'content' download marked as extracted, passes the content to store_content(), inserts the
    HTML-stripped text into download_texts, splits that text into sentences stored in story_sentences, and calls
    mark_as_processed() for the story.

    :param db: Database handler.
    :param story: Story dict; 'stories_id', 'url', 'media_id' and 'publish_date' are read.
    :param feed: Feed dict; 'feeds_id' and 'url' are read.
    :return: The story dict with 'download', 'content' (HTML-stripped text) and 'download_text' set.
    :raises McAddContentToTestStoryException: If the download_texts row cannot be read back.
    """

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
        # Fall back to English when the language could not be determined
        content_language_code = language_code_for_text(content) or 'en'
    else:
        # Generated test content (Latin, per the original note) is labelled as English
        content = _get_test_content()
        content_language_code = 'en'

    if story.get('full_text_rss'):
        story['full_text_rss'] = False
        story_changes = {
            'full_text_rss': False,
            'language': content_language_code,
        }
        db.update_by_id(table='stories', object_id=story['stories_id'], update_hash=story_changes)

    feed_host = get_url_host(feed['url'])

    download_fields = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': feed_host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        # The download text is inserted below, so the download is created as already extracted
        'extracted': True,
        'stories_id': story['stories_id'],
    }
    download = store_content(
        db=db,
        download=db.create(table='downloads', insert_hash=download_fields),
        content=content,
    )

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    download_text_params = {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    }
    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, download_text_params)

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Sentence numbers start at 1
    for sentence_number, sentence in enumerate(lang.split_text_to_sentences(extracted_content), start=1):
        sentence_row = {
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        }
        db.insert(table='story_sentences', insert_hash=sentence_row)

    mark_as_processed(db=db, stories_id=story['stories_id'])

    download_text_row = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    # Set the key before validating so the story dict is updated even when the check below fails
    story['download_text'] = download_text_row

    if not download_text_row:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
예제 #5
0
def create_test_topic_stories(db: DatabaseHandler,
                              topic: dict,
                              num_media: int = 10,
                              num_stories_per_medium: int = 10) -> None:
    """Fill a topic with test stories and cross-media topic links.

    For each of num_media media, one feed and num_stories_per_medium stories are created; each story gets the
    topic's start_date as its publish_date and is added to topic_stories. Afterwards up to
    num_media * num_stories_per_medium rows are inserted into topic_links, each pairing two rows of the stories
    table whose media_id differ.

    :param db: Database handler.
    :param topic: Topic dict; 'topics_id' and 'start_date' are read.
    :param num_media: Number of media (and feeds) to create.
    :param num_stories_per_medium: Number of stories to create per medium.
    """
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)

        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            db.update_by_id('stories', story['stories_id'],
                            {'publish_date': topic['start_date']})
            db.create('topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id']
            })

    # The link count is bound as a query parameter so that it follows num_media and num_stories_per_medium
    # instead of being a fixed number.
    test_topic_links = db.query(
        """
        WITH src_stories AS (
            SELECT *
            FROM stories
        ),
        test_topic_links AS (
            SELECT
                src_stories.stories_id,
                ref_stories.url,
                ref_stories.stories_id AS ref_stories_id
            FROM src_stories
                INNER JOIN stories AS ref_stories
                    ON src_stories.media_id != ref_stories.media_id
            LIMIT %(topic_link_count)s
        )
        SELECT
            stories_id,
            url,
            ref_stories_id
        FROM test_topic_links
    """, {
            'topic_link_count': num_media * num_stories_per_medium,
        }).hashes()

    # Inserting individually because otherwise it complains with:
    # ERROR:  cannot handle complex subqueries when the router executor is disabled
    for test_topic_link in test_topic_links:
        db.query(
            """
            INSERT INTO topic_links (
                topics_id,
                stories_id,
                url,
                ref_stories_id
            ) VALUES (
                %(topics_id)s,
                %(stories_id)s,
                %(url)s,
                %(ref_stories_id)s
            )
        """, {
                'topics_id': topic['topics_id'],
                'stories_id': test_topic_link['stories_id'],
                'url': test_topic_link['url'],
                'ref_stories_id': test_topic_link['ref_stories_id'],
            })