Example #1
    def store_transcript(cls, db: DatabaseHandler,
                         transcript: Transcript) -> int:
        story = db.find_by_id(table='stories', object_id=transcript.stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': transcript.stories_id,
            }).hash()

        download = create_download_for_new_story(db=db, story=story, feed=feed)

        text = cls._download_text_from_transcript(transcript=transcript)

        # Store as a raw download and let the "extract-and-vector" app extract the stored text later
        store_content(db=db, download=download, content=text)

        return download['downloads_id']
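
The comment above defers extraction to the "extract-and-vector" app. A minimal sketch of how a caller might queue that step after store_transcript() returns, mirroring the JobBroker call shown in Example #2 below; whether this class queues the job itself is an assumption:

# Queue the story so the extractor app processes the stored raw text;
# this mirrors the queueing pattern used in _import_ap_story() (Example #2).
JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
    stories_id=transcript.stories_id,
)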
Example #2
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given a ap story return by get_new_stories(), add it to the database."""
    ap_medium = db.query(
        """
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, {
            'medium_name': AP_MEDIUM_NAME,
        }).hash()
    ap_feed = {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com'
    }
    ap_feed = db.find_or_create('feeds', ap_feed)

    story = {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id']
    }
    story = add_story(db, story, ap_feed['feeds_id'])

    if not story:
        return

    story_download = create_download_for_new_story(db, story, ap_feed)

    download_text = {
        'downloads_id': story_download['downloads_id'],
        'download_text': ap_story['text'],
        'download_text_length': len(ap_story['text'])
    }

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
        """, download_text)

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
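
Derived from the keys the function reads, a sketch of the ap_story dict that get_new_stories() is expected to return; every value below is illustrative:

db = connect_to_db()

# Hypothetical AP story dict; _import_ap_story() reads exactly these keys.
ap_story = {
    'guid': 'https://apnews.com/article/example-guid',
    'url': 'https://apnews.com/article/example-guid',
    'publish_date': '2020-01-01 00:00:00',
    'title': 'Example AP headline',
    'description': 'Example AP description.',
    'text': 'Full story text, inserted directly as a download_texts row.',
}
_import_ap_story(db, ap_story)

Example #3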
def test_create_download_for_new_story():
    """Test create_download_for_new_story()."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    returned_download = create_download_for_new_story(db, story, feed)

    assert returned_download is not None

    got_download = db.query("select * from downloads where stories_id = %(a)s", {'a': story['stories_id']}).hash()

    assert got_download is not None

    assert got_download['downloads_id'] == returned_download['downloads_id']
    assert got_download['feeds_id'] == feed['feeds_id']
    assert got_download['url'] == story['url']
    assert got_download['state'] == 'success'
    assert got_download['type'] == 'content'
    assert not got_download['extracted']
Example #4
File: stories.py Project: rleir/mediacloud
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = {
                f: old_download[f]
                for f in ('state', 'error_message', 'download_time')
            }
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        # Copy the extracted text rows from the old download to the new one
        # (the original filtered on the new downloads_id, which has no
        # download_texts yet, so nothing would ever be copied).
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(b)s
            """, {
                'a': download['downloads_id'],
                'b': old_download['downloads_id'],
            })

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
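
A hedged usage sketch of the call above; the topic, old_story, and new_medium rows would normally come from earlier queries, and the object IDs here are assumptions:

db = connect_to_db()

# Fetch the rows involved (IDs are illustrative).
topic = db.find_by_id(table='topics', object_id=1)
old_story = db.find_by_id(table='stories', object_id=100)
new_medium = db.find_by_id(table='media', object_id=2)

# Clones the story row plus its tags, first download, extracted text,
# and story sentences under the new medium.
new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)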
Example #5
File: stories.py Project: rleir/mediacloud
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; guessed from the content if None
    publish_date - story publish date; guessed from the url and content if None
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
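
A hedged usage sketch; the URL and content are illustrative, and fallback_date is passed because, per the docstring, it is used when the date guesser fails:

db = connect_to_db()

html = '<html><head><title>Example story</title></head><body><p>Text.</p></body></html>'
story = generate_story(
    db=db,
    url='http://example.com/story',
    content=html,
    fallback_date='2020-01-01',
)
# If a story with the same media_id and url already exists, the existing
# row is returned instead of inserting a duplicate.

Example #6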
def test_create():
    db = connect_to_db()

    test_medium = create_test_medium(db, 'downloads test')
    test_feed = create_test_feed(db, 'downloads test', test_medium)
    test_story = create_test_story(db=db, feed=test_feed, label='test_story')
    test_download = create_download_for_new_story(db=db,
                                                  story=test_story,
                                                  feed=test_feed)

    test_download['path'] = 'postgresql:foo'
    test_download['state'] = 'success'
    db.update_by_id('downloads', test_download['downloads_id'], test_download)

    assert len(db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """,
        {'downloads_id': test_download['downloads_id']},
    ).hashes()) == 0

    assert len(db.query(
        """
        SELECT *
        FROM downloads
        WHERE downloads_id = %(downloads_id)s
          AND extracted = 't'
        """,
        {'downloads_id': test_download['downloads_id']},
    ).hashes()) == 0

    extract = {
        'extracted_text': 'Hello!',
    }

    created_download_text = create(db=db,
                                   download=test_download,
                                   extract=extract)
    assert created_download_text
    assert created_download_text['downloads_id'] == test_download['downloads_id']

    found_download_texts = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
            'downloads_id': test_download['downloads_id']
        }).hashes()
    assert len(found_download_texts) == 1

    download_text = found_download_texts[0]
    assert download_text
    assert download_text['downloads_id'] == test_download['downloads_id']
    assert download_text['download_text'] == extract['extracted_text']
    assert download_text['download_text_length'] == len(extract['extracted_text'])
Example #7
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; guessed from the content if None
    publish_date - story publish date; guessed from the url and content if None
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story
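
Compared with Example #5, this revision adds step-by-step debug logging, stores content through store_and_verify_content() instead of store_content(), passes the db handle to _extract_story(), and leaves publish_date as None when the date guesser fails rather than substituting fallback_date or the current time.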
Example #8
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = {f: old_download[f] for f in ('state', 'error_message', 'download_time')}
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # Copy the extracted text rows from the old download to the new one
        # (the original bound the new downloads_id in the WHERE clause, which
        # has no download_texts yet, so nothing would ever be copied).
        db.query(
            """
                INSERT INTO download_texts (
                    downloads_id,
                    download_text,
                    download_text_length
                )
                    SELECT
                        %(new_downloads_id)s,
                        dt.download_text,
                        dt.download_text_length
                    FROM download_texts AS dt
                    WHERE dt.downloads_id = %(old_downloads_id)s
            """,
            {
                'new_downloads_id': download['downloads_id'],
                'old_downloads_id': old_download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
            INSERT INTO story_sentences (
                stories_id,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            )
                SELECT
                    %(new_stories_id)s,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date,
                    language
                FROM story_sentences
                WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
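
Compared with Example #4, this variant copies the old story's tags row by row with INSERT ... ON CONFLICT DO NOTHING instead of a single INSERT ... SELECT, and binds the new stories_id as a query parameter in the story_sentences copy rather than interpolating it into the SQL string.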