Example #1
    def setUp(self) -> None:
        """Create a fresh testing database for each unit test.

        This relies on an empty template existing, which should have been created in setUpClass() above.
        """
        super().setUp()

        # Connect to the template database to execute the create command for the test database
        log.warning("recreate test db from template: %s" %
                    self.template_db_name)

        db = connect_to_db(label=self.TEST_DB_LABEL, is_template=True)

        self.__kill_connections_to_database(db=db, database_name=self.db_name)

        db.query("DROP DATABASE IF EXISTS {}".format(self.db_name))
        db.query("CREATE DATABASE {} TEMPLATE {}".format(
            self.db_name, self.template_db_name))

        db.disconnect()

        db = connect_to_db(label=self.TEST_DB_LABEL)

        force_using_test_database()

        self.__db = db
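
The __kill_connections_to_database() helper called above is not shown in these examples. A minimal standalone sketch of what such a helper might do, assuming the same DatabaseHandler.query() interface and standard PostgreSQL system views (the function name and body are hypothetical):

def kill_connections_to_database(db, database_name: str) -> None:
    """Hypothetical helper: terminate other backends connected to the given
    database so that the DROP DATABASE call above can proceed."""
    db.query(
        """
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname = %(database_name)s
          AND pid != pg_backend_pid()
        """,
        {'database_name': database_name},
    )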
Example #2
def test_connect_to_db():
    # Default database
    db = connect_to_db(do_not_check_schema_version=True)
    database_name = db.query('SELECT current_database()').hash()
    assert database_name['current_database'] == 'mediacloud'

    # Test database
    db = connect_to_db(label='test', do_not_check_schema_version=True)
    database_name = db.query('SELECT current_database()').hash()
    assert database_name['current_database'] == 'mediacloud_test'

    # Invalid label
    with pytest.raises(McConnectToDBException):
        connect_to_db('NONEXISTENT_LABEL')
Example #3
def test_normalized_urls_out_of_date():
    """Test _normalized_urls_out_of_date()."""
    db = connect_to_db()

    assert not _normalized_urls_out_of_date(db)

    [create_test_medium(db, str(i)) for i in range(5)]

    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)

    db.query(
        "update media set normalized_url = null where media_id in ( select media_id from media limit 1 )"
    )

    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)
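
The test above pins down the behaviour of _normalized_urls_out_of_date() without showing it. A minimal sketch consistent with those assertions (hypothetical body; the real helper may use a different query):

def _normalized_urls_out_of_date(db) -> bool:
    """Return True if any media row still lacks a normalized_url (hypothetical sketch)."""
    null_count = db.query(
        "SELECT COUNT(*) FROM media WHERE normalized_url IS NULL").flat()[0]
    return null_count > 0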
Example #4
    def run_job(cls, topic_fetch_urls_ids: list):
        """Call fetch_topic_url and requeue the job if the request has been domain throttled.

        Arguments:
        topic_fetch_urls_ids - ids of topic_fetch_urls

        Returns:
        None

        """
        if topic_fetch_urls_ids is None:
            raise McFetchTwitterUrlsJobException("'topic_fetch_urls_ids' is None.")

        log.info("Start fetch twitter urls for %d topic_fetch_urls" % len(topic_fetch_urls_ids))

        db = connect_to_db()

        try:
            mediawords.tm.fetch_twitter_urls.fetch_twitter_urls(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
        except Exception as ex:
            log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_ids, str(ex)))
            db.query(
                """
                update topic_fetch_urls set state = %(a)s, message = %(b)s, fetch_date = now()
                    where topic_fetch_urls_id = any(%(c)s)
                """,
                {
                    'a': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids
                })

        db.disconnect()

        log.info("Finished fetching twitter url")
Example #5
def run_job(test_job_states_id: int,
            x: int,
            y: int,
            state_updater: Optional[StateUpdater] = None):
    if isinstance(test_job_states_id, bytes):
        test_job_states_id = decode_object_from_bytes_if_needed(
            test_job_states_id)
    if isinstance(x, bytes):
        x = decode_object_from_bytes_if_needed(x)
    if isinstance(y, bytes):
        y = decode_object_from_bytes_if_needed(y)

    x = int(x)
    y = int(y)

    assert state_updater, "State updater is set."
    assert isinstance(
        state_updater,
        StateUpdater), "State updater is of the StateUpdater class."

    log.info(
        f"Running job in 'custom' Python worker (test job state ID: {test_job_states_id})..."
    )

    db = connect_to_db()

    state_updater.update_job_state(db=db, state='foo', message='bar')

    # Sleep indefinitely to keep the job in "custom" state
    while True:
        time.sleep(10)
Example #6
    def _try_fetch_tweets_chunk_parallel(topic_: dict, tfus_: list) -> None:
        db_ = connect_to_db()

        with requests_mock.Mocker() as m:
            m.get("https://api.twitter.com/1.1/statuses/lookup.json",
                  text=mock_statuses_lookup)
            _try_fetch_tweets_chunk(db_, topic_, tfus_)
Example #7
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McCLIFFUpdateStoryTagsJobException("'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McCLIFFUpdateStoryTagsJobException(
                "Story with ID %d was not found." % stories_id)

        cliff = CLIFFAnnotator()
        try:
            cliff.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McCLIFFUpdateStoryTagsJobException(
                "Unable to process story ID %s with CLIFF: %s" % (
                    stories_id,
                    str(ex),
                ))

        log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
        NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)
Example #8
def test_ch_remote_integration() -> None:
    """Test ch remote integration."""
    db = connect_to_db()
    validate_remote_integration(db=db,
                                source='crimson_hexagon',
                                query=str(TEST_MONITOR_ID),
                                day='2016-01-01')
Example #9
def run_cliff_update_story_tags(stories_id: int) -> None:
    """Create / update story tags using CLIFF annotation."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFUpdateStoryTagsJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFUpdateStoryTagsJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagger()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFUpdateStoryTagsJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    # log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    # JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation').add_to_queue(stories_id=stories_id)
    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)
Example #10
    def test_send_password_reset_token(self):
        db = connect_to_db()

        email = '*****@*****.**'
        password = '******'
        password_reset_link = 'http://password-reset.com/'

        add_user(
            db=db,
            new_user=NewUser(
                email=email,
                full_name='Test user login',
                has_consented=True,
                notes='Test test test',
                role_ids=[1],
                active=True,
                password=password,
                password_repeat=password,
                activation_url='',  # user is active, no need for activation URL
            ),
        )

        # Existing user
        send_password_reset_token(db=db,
                                  email=email,
                                  password_reset_link=password_reset_link)

        # Non-existent user (the call shouldn't fail because we don't want to reveal which users are in the system,
        # so we pretend that we've sent the email)
        send_password_reset_token(db=db,
                                  email='*****@*****.**',
                                  password_reset_link=password_reset_link)
Example #11
def test_get_consistent_color():

    db = connect_to_db()

    color_c_baz = get_consistent_color(db=db, item_set='c', item_id='baz')
    color_b_baz = get_consistent_color(db=db, item_set='b', item_id='baz')
    color_b_bar = get_consistent_color(db=db, item_set='b', item_id='bar')
    color_a_baz = get_consistent_color(db=db, item_set='a', item_id='baz')
    color_a_bar = get_consistent_color(db=db, item_set='a', item_id='bar')
    color_a_foo = get_consistent_color(db=db, item_set='a', item_id='foo')

    num_db_colors = db.query("SELECT COUNT(*) FROM color_sets").flat()
    assert num_db_colors[0] == 9

    assert color_a_foo != color_a_bar
    assert color_a_foo != color_a_baz
    assert color_a_bar != color_a_baz
    assert color_b_bar != color_b_baz

    color_a_foo_2 = get_consistent_color(db=db, item_set='a', item_id='foo')
    color_a_bar_2 = get_consistent_color(db=db, item_set='a', item_id='bar')
    color_a_baz_2 = get_consistent_color(db=db, item_set='a', item_id='baz')
    color_b_bar_2 = get_consistent_color(db=db, item_set='b', item_id='bar')
    color_b_baz_2 = get_consistent_color(db=db, item_set='b', item_id='baz')
    color_c_baz_2 = get_consistent_color(db=db, item_set='c', item_id='baz')

    assert color_a_foo_2 == color_a_foo
    assert color_a_bar_2 == color_a_bar
    assert color_a_baz_2 == color_a_baz
    assert color_b_bar_2 == color_b_bar
    assert color_b_baz_2 == color_b_baz
    assert color_c_baz_2 == color_c_baz
Example #12
def test_import_archive_file():
    db = connect_to_db()

    db.create('media', {'url': 'ap.com', 'name': AP_MEDIUM_NAME})

    xml_file = '/opt/mediacloud/tests/data/ap_test_fixtures/test_ap_fixture_archive.xml'

    import_archive_file(db, xml_file)

    stories = db.query("""
        SELECT *
        FROM stories
    """).hashes()

    assert len(stories) == 1

    story = stories[0]

    assert story['title'] == 'Report: Far-right violence in Germany declined in 2017'
    assert story['url'] == 'https://apnews.com/61a17439ecd940498124a2939a78c678'
    assert story['guid'] == 'de9a436b796b41d5821509773f740fa0'
    assert story['publish_date'] == '2018-07-06 13:34:25'
    assert story['description'][0:10] == 'German media are reporting a drop'[0:10]

    download_text = db.query("""
        SELECT *
        FROM download_texts
    """).hash()

    assert download_text['download_text'][0:10] == 'BERLIN (AP'
Example #13
def test_get_dup_story_groups():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
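
The helper _get_dup_story_groups() is exercised but not shown. A sketch consistent with the expectations above, grouping a topic's stories by case-insensitive title (hypothetical; the real function may apply additional duplicate criteria):

def _get_dup_story_groups(db, topic: dict) -> list:
    """Group a topic's stories by lowercased title and return the groups that
    contain more than one story (hypothetical sketch)."""
    stories = db.query(
        """
        SELECT s.*
        FROM stories s
            JOIN topic_stories ts USING (stories_id)
        WHERE ts.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    groups = {}
    for story in stories:
        groups.setdefault(story['title'].lower(), []).append(story)

    return [group for group in groups.values() if len(group) > 1]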
Example #14
    def run_job(cls, stories_id: int, topics_id: int) -> None:
        """Run the extract_story_links job, using mediawords.tm.extract_story_links for the logic."""
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        if stories_id is None:
            raise McExtractStoryLinksJobException("'stories_id' is None.")

        if isinstance(topics_id, bytes):
            topics_id = decode_object_from_bytes_if_needed(topics_id)
        if topics_id is None:
            raise McExtractStoryLinksJobException("'topics_id' is None.")

        stories_id = int(stories_id)
        topics_id = int(topics_id)

        log.info(
            "Start fetching extracting links for stories_id %d topics_id %d" %
            (stories_id, topics_id))

        try:
            db = connect_to_db()
            story = db.require_by_id(table='stories', object_id=stories_id)
            topic = db.require_by_id(table='topics', object_id=topics_id)
            mediawords.tm.extract_story_links.extract_links_for_topic_story(
                db, story, topic)
        except Exception as ex:
            raise McExtractStoryLinksJobException(
                "Unable to process story $stories_id: %s" % str(ex))

        log.info(
            "Finished fetching extracting links for stories_id %d topics_id %d"
            % (stories_id, topics_id))
Example #15
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
Example #16
    def run_job(cls, stories_id: int, topics_id: int) -> None:
        """Run the extract_story_links job, using mediawords.tm.extract_story_links for the logic."""
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        if stories_id is None:
            raise McExtractStoryLinksJobException("'stories_id' is None.")

        if isinstance(topics_id, bytes):
            topics_id = decode_object_from_bytes_if_needed(topics_id)
        if topics_id is None:
            raise McExtractStoryLinksJobException("'topics_id' is None.")

        stories_id = int(stories_id)
        topics_id = int(topics_id)

        log.info("Start fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id))

        try:
            db = connect_to_db()
            story = db.require_by_id(table='stories', object_id=stories_id)
            topic = db.require_by_id(table='topics', object_id=topics_id)
            mediawords.tm.extract_story_links.extract_links_for_topic_story(db, story, topic)

        except Exception as ex:
            log.error("Error while processing story {}: {}".format(stories_id, ex))
            raise McExtractStoryLinksJobException(
                "Unable to process story {}: {}".format(stories_id, traceback.format_exc())
            )

        log.info("Finished fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id))
Example #17
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McCLIFFUpdateStoryTagsJobException("'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McCLIFFUpdateStoryTagsJobException("Story with ID %d was not found." % stories_id)

        cliff = CLIFFAnnotator()
        try:
            cliff.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McCLIFFUpdateStoryTagsJobException(
                "Unable to process story ID %s with CLIFF: %s" % (stories_id, str(ex),)
            )

        log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
        NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)
Example #18
def run_extract_and_vector(stories_id: int,
                           use_cache: bool = False,
                           use_existing: bool = False) -> None:
    """Extract, vector and process a story."""

    global _consecutive_requeues

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    if not stories_id:
        raise McExtractAndVectorException("'stories_id' is not set.")

    db = connect_to_db()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McExtractAndVectorException(
            "Story with ID {} was not found.".format(stories_id))

    if medium_is_locked(db=db, media_id=story['media_id']):
        log.warning(
            "Requeueing job for story {} in locked medium {}...".format(
                stories_id, story['media_id']))
        _consecutive_requeues += 1

        # Prevent spamming these requeue events if the locked media source is the only one in the queue
        if _consecutive_requeues > _SLEEP_AFTER_REQUEUES:
            log.warning(
                "Story extraction job has been requeued more than {} times, waiting before requeueing..."
                .format(_consecutive_requeues))
            time.sleep(1)

        JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id)

        return

    _consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache,
                                              use_existing=use_existing)
        extract_and_process_story(db=db,
                                  story=story,
                                  extractor_args=extractor_args)

    except Exception as ex:
        raise McExtractAndVectorException(
            "Extractor died while extracting story {}: {}".format(
                stories_id, ex))

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))
Example #19
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot."""

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

    sentence_iterator = SnapshotSentenceIterator(db=db,
                                                 snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)
    train_word2vec_model(sentence_iterator=sentence_iterator,
                         model_store=model_store)

    log.info("Finished generating word2vec model for snapshot %d." %
             snapshots_id)
Example #20
def test_provide_download_ids() -> None:
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, 'foo', medium=medium)

    hosts = ('foo.bar', 'bar.bat', 'bat.baz')
    downloads_per_host = 3

    for host in hosts:
        for i in range(downloads_per_host):
            download = {
                'feeds_id': feed['feeds_id'],
                'state': 'pending',
                'priority': 1,
                'sequence': 1,
                'type': 'content',
                'url': 'http://' + host + '/' + str(i),
                'host': host}

            db.create('downloads', download)

    download_ids = provide_download_ids(db)

    # +1 for the test feed
    assert len(download_ids) == len(hosts) + 1
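
provide_download_ids() itself is not shown. One plausible reading of the "+1 for the test feed" comment is that the function hands out one pending download per host; a hypothetical sketch under that assumption (it also assumes the fixture feed's own download row is 'pending', which the example does not show):

def provide_download_ids(db) -> list:
    """Return one pending downloads_id per download host (hypothetical sketch)."""
    return db.query("""
        SELECT DISTINCT ON (host) downloads_id
        FROM downloads
        WHERE state = 'pending'
        ORDER BY host, priority, downloads_id
    """).flat()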
Example #21
def test_fetch_topic_posts() -> None:
    """Run fetch_topic_post tests."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')

    topic['pattern'] = '.*'
    topic['platform'] = 'generic_post'
    topic['mode'] = 'url_sharing'
    topic['start_date'] = datetime.datetime.strptime(MOCK_START_DATE,
                                                     '%Y-%m-%d')
    topic['end_date'] = topic['start_date'] + datetime.timedelta(
        days=MOCK_DAYS - 1)

    db.update_by_id('topics', topic['topics_id'], topic)

    mock_posts = _get_mock_posts()

    mock_posts_csv = CSVStaticPostFetcher()._get_csv_string_from_dicts(
        mock_posts)

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'generic_post',
        'source': 'csv',
        'ignore_pattern': 'ignore',
        'query': mock_posts_csv
    }
    tsq = db.create('topic_seed_queries', tsq)

    db.update_by_id('topics', topic['topics_id'], {'platform': 'generic_post'})

    fetch_topic_posts(db, tsq)

    topic_post_days = db.query("SELECT * FROM topic_post_days").hashes()
    assert len(topic_post_days) == MOCK_DAYS

    start_date = topic['start_date']
    test_days = [
        start_date + datetime.timedelta(days=x) for x in range(0, MOCK_DAYS)
    ]
    for d in test_days:
        topic_post_day = db.query(
            """
            SELECT *
            FROM topic_post_days
            WHERE
                topics_id = %(topics_id)s AND
                topic_seed_queries_id = %(topic_seed_queries_id)s AND
                day = %(day)s
            """, {
                'topics_id': tsq['topics_id'],
                'topic_seed_queries_id': tsq['topic_seed_queries_id'],
                'day': d,
            }).hash()
        assert topic_post_day is not None

    _validate_topic_posts(db, topic, mock_posts)

    _validate_topic_post_urls(db, mock_posts)
Example #22
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McNYTLabelsFetchAnnotationJobException("'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Fetching annotation for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsFetchAnnotationJobException(
                "Unable to process story $stories_id with NYTLabels: %s" % str(ex)
            )

        log.info("Adding story ID %d to the update story tags queue..." % stories_id)
        NYTLabelsUpdateStoryTagsJob.add_to_queue(stories_id=stories_id)

        log.info("Finished fetching annotation for story ID %d" % stories_id)
Example #23
def test_add_story_description_unset():
    """Test adding a story without a description being set."""

    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    story = {
        'url': 'http://test',
        'guid': 'http://test',
        'media_id': medium['media_id'],
        'title': "test",

        # stories.description can be NULL so it's a valid value:
        'description': None,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    assert len(db.select(table='stories', what_to_select='*').hashes()) == 1
    assert len(
        db.select(table='feeds_stories_map', what_to_select='*').hashes()) == 1
Example #24
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "Unable to process story ID %d with NYTLabels: %s" % (
                    stories_id,
                    str(ex),
                ))

        log.info("Marking story ID %d as processed..." % stories_id)
        mark_as_processed(db=db, stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)
Example #25
    def test_login_with_api_key_inactive_user(self):
        """Inactive user logging in with API key."""

        db = connect_to_db()

        email = '*****@*****.**'
        password = '******'
        full_name = 'Test user login'
        ip_address = '1.2.3.4'

        add_user(
            db=db,
            new_user=NewUser(
                email=email,
                full_name=full_name,
                notes='Test test test',
                role_ids=[1],
                active=False,
                password=password,
                password_repeat=password,
                activation_url='https://activate.com/activate',
            ),
        )

        user = user_info(db=db, email=email)
        assert user
        global_api_key = user.global_api_key()

        with pytest.raises(McAuthLoginException) as ex:
            login_with_api_key(db=db,
                               api_key=global_api_key,
                               ip_address=ip_address)

        # Make sure the error message explicitly states that login failed due to user not being active
        assert 'not active' in str(ex)
Example #26
def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    db.query("update downloads set state = 'error' where stories_id = %(a)s", {'a': old_story['stories_id']})

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    new_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']}).hash()
    assert new_download is not None
    assert new_download['state'] == 'error'
Example #27
def test_get_story_with_most_sentences():
    """Test _get_story_with_most_sentences()."""
    db = connect_to_db()

    medium = create_test_medium(db, "foo")
    feed = create_test_feed(db=db, label="foo", medium=medium)

    num_filled_stories = 5
    stories = []
    for i in range(num_filled_stories):
        story = create_test_story(db=db, label="foo" + str(i), feed=feed)
        stories.append(story)
        for n in range(1, i + 1):
            db.create(
                'story_sentences', {
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'sentence': 'foo',
                    'sentence_number': n,
                    'publish_date': story['publish_date']
                })

    empty_stories = []
    for i in range(2):
        story = create_test_story(db=db, label="foo empty" + str(i), feed=feed)
        empty_stories.append(story)
        stories.append(story)

    assert _get_story_with_most_sentences(
        db, stories) == stories[num_filled_stories - 1]

    assert _get_story_with_most_sentences(
        db, [empty_stories[0]]) == empty_stories[0]
    assert _get_story_with_most_sentences(db,
                                          empty_stories) == empty_stories[0]
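
A sketch of _get_story_with_most_sentences() that matches the assertions above: count story_sentences rows per story and keep the earliest story with the highest count (hypothetical; the real helper may do this in a single SQL query):

def _get_story_with_most_sentences(db, stories: list) -> dict:
    """Return the story with the most story_sentences rows, preferring the
    earliest story on ties (hypothetical sketch)."""
    best_story = stories[0]
    best_count = -1
    for story in stories:
        sentence_count = db.query(
            "SELECT COUNT(*) FROM story_sentences WHERE stories_id = %(a)s",
            {'a': story['stories_id']}).flat()[0]
        if sentence_count > best_count:
            best_story = story
            best_count = sentence_count
    return best_story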
Example #28
def test_merge_dup_media_story():
    """Test merge_dup_media_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    db.update_by_id('media', medium['media_id'],
                    {'dup_media_id': new_medium['media_id']})

    cloned_story = merge_dup_media_story(db, topic, old_story)

    for field in 'url guid publish_date title'.split():
        assert cloned_story[field] == old_story[field]

    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {
            'a': cloned_story['stories_id'],
            'b': topic['topics_id']
        }).hash()
    assert topic_story is not None

    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']
Example #29
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McNYTLabelsFetchAnnotationJobException(
                "'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Fetching annotation for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsFetchAnnotationJobException(
                "Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.annotate_and_store_for_story(db=db,
                                                   stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsFetchAnnotationJobException(
                "Unable to process story $stories_id with NYTLabels: %s" %
                str(ex))

        log.info("Adding story ID %d to the update story tags queue..." %
                 stories_id)
        NYTLabelsUpdateStoryTagsJob.add_to_queue(stories_id=stories_id)

        log.info("Finished fetching annotation for story ID %d" % stories_id)
Example #30
def run_cliff_fetch_annotation(stories_id: int) -> None:
    """Fetch story's CLIFF annotation."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story $stories_id with CLIFF: %s" % str(ex))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)
Example #31
    def test_login_with_email_password_inactive_user(self):
        """Inactive user logging in with username and password."""

        db = connect_to_db()

        email = '*****@*****.**'
        password = '******'
        full_name = 'Test user login'

        # Inactive user
        add_user(
            db=db,
            new_user=NewUser(
                email=email,
                full_name=full_name,
                has_consented=True,
                notes='Test test test',
                role_ids=[1],
                active=False,
                password=password,
                password_repeat=password,
                activation_url='https://activate.com/activate',
            ),
        )

        with pytest.raises(McAuthLoginException) as ex:
            login_with_email_password(db=db, email=email, password=password)

        # Make sure the error message explicitly states that login failed due to user not being active
        assert 'not active' in str(ex)
Example #32
    def run_job(cls, stories_id: int) -> None:
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        if stories_id is None:
            raise McNYTLabelsUpdateStoryTagsJobException("'stories_id' is None.")

        stories_id = int(stories_id)

        db = connect_to_db()

        log.info("Updating tags for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McNYTLabelsUpdateStoryTagsJobException("Story with ID %d was not found." % stories_id)

        nytlabels = NYTLabelsAnnotator()
        try:
            nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McNYTLabelsUpdateStoryTagsJobException(
                "Unable to process story ID %d with NYTLabels: %s" % (stories_id, str(ex),)
            )

        log.info("Marking story ID %d as processed..." % stories_id)
        mark_as_processed(db=db, stories_id=stories_id)

        log.info("Finished updating tags for story ID %d" % stories_id)
Example #33
    def setUpClass(cls) -> None:
        # All tests should be able to use the same database
        cls._DB = connect_to_db()
        cls._TEST_MEDIUM = create_test_medium(db=cls._DB, label='test')
        cls._TEST_FEED = create_test_feed(db=cls._DB,
                                          label='test',
                                          medium=cls._TEST_MEDIUM)
Example #34
    def setUp(self):

        super().setUp()

        self.__db = connect_to_db()

        log.debug("Preparing test table 'kardashians'...")
        self.__db.query("DROP TABLE IF EXISTS kardashians CASCADE")
        self.__db.query("""
            CREATE TABLE kardashians (
                id SERIAL PRIMARY KEY NOT NULL,
                name VARCHAR UNIQUE NOT NULL,   -- UNIQUE to test find_or_create()
                surname TEXT NOT NULL,
                dob DATE NOT NULL,
                married_to_kanye BOOL NOT NULL DEFAULT 'f'
            )
        """)
        self.__db.query("""
            INSERT INTO kardashians (name, surname, dob, married_to_kanye) VALUES
            ('Kris', 'Jenner', '1955-11-05'::DATE, 'f'),          -- id=1
            ('Caitlyn', 'Jenner', '1949-10-28'::DATE, 'f'),       -- id=2
            ('Kourtney', 'Kardashian', '1979-04-18'::DATE, 'f'),  -- id=3
            ('Kim', 'Kardashian', '1980-10-21'::DATE, 't'),       -- id=4
            ('Khloé', 'Kardashian', '1984-06-27'::DATE, 'f'),     -- id=5; also, UTF-8
            ('Rob', 'Kardashian', '1987-03-17'::DATE, 'f'),       -- id=6
            ('Kendall', 'Jenner', '1995-11-03'::DATE, 'f'),       -- id=7
            ('Kylie', 'Jenner', '1997-08-10'::DATE, 'f')          -- id=8
        """)
Example #35
def run_cliff_tags_from_annotation(stories_id: int) -> None:
    """Fetch story's CLIFF annotation and use it to generate/store tags."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFTagsFromAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag'
              ).add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)
Example #36
    def run_job(cls, media_id: int) -> None:
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        db = connect_to_db()

        fetch_sitemap_pages_for_media_id(db=db, media_id=media_id)
Example #37
    def test_medium_is_locked(self):
        media_id = self.test_medium['media_id']

        db_locked_session = connect_to_db(label=self.TEST_DB_LABEL)

        assert medium_is_locked(db=self.db(), media_id=media_id) is False

        db_locked_session.query("SELECT pg_advisory_lock(%(media_id)s)", {'media_id': media_id})
        assert medium_is_locked(db=self.db(), media_id=media_id) is True

        db_locked_session.query("SELECT pg_advisory_unlock(%(media_id)s)", {'media_id': media_id})
        assert medium_is_locked(db=self.db(), media_id=media_id) is False

        db_locked_session.disconnect()
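
medium_is_locked() itself is not shown in these examples. A minimal sketch consistent with the advisory-lock behaviour tested above (hypothetical implementation):

def medium_is_locked(db, media_id: int) -> bool:
    """Return True if another session holds the advisory lock for this medium
    (hypothetical sketch)."""
    got_lock = db.query(
        "SELECT pg_try_advisory_lock(%(media_id)s)",
        {'media_id': media_id}).flat()[0]
    if got_lock:
        # We were able to take the lock ourselves, so nobody else holds it; release it again.
        db.query("SELECT pg_advisory_unlock(%(media_id)s)", {'media_id': media_id})
        return False
    return True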
Example #38
    def run_job(cls, stories_id: int, use_cache: bool = False) -> None:

        # MC_REWRITE_TO_PYTHON: remove after Python rewrite
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        if not stories_id:
            raise McExtractAndVectorException("'stories_id' is not set.")

        db = connect_to_db()

        story = db.find_by_id(table='stories', object_id=stories_id)
        if not story:
            raise McExtractAndVectorException("Story with ID {} was not found.".format(stories_id))

        if medium_is_locked(db=db, media_id=story['media_id']):
            log.warning("Requeueing job for story {} in locked medium {}...".format(stories_id, story['media_id']))
            ExtractAndVectorJob._consecutive_requeues += 1

            # Prevent spamming these requeue events if the locked media source is the only one in the queue
            if ExtractAndVectorJob._consecutive_requeues > ExtractAndVectorJob._SLEEP_AFTER_REQUEUES:
                log.warning(
                    "Story extraction job has been requeued more than {} times, waiting before requeueing...".format(
                        ExtractAndVectorJob._consecutive_requeues
                    )
                )
                time.sleep(1)

            ExtractAndVectorJob.add_to_queue(stories_id=stories_id)

            return

        ExtractAndVectorJob._consecutive_requeues = 0

        log.info("Extracting story {}...".format(stories_id))

        db.begin()

        try:
            extractor_args = PyExtractorArguments(use_cache=use_cache)
            extract_and_process_story(db=db, story=story, extractor_args=extractor_args)

        except Exception as ex:
            raise McExtractAndVectorException("Extractor died while extracting story {}: {}".format(stories_id, ex))

        db.commit()

        log.info("Done extracting story {}.".format(stories_id))
Example #39
    def setUp(self) -> None:
        """Create a fresh testing database for each unit test.

        This relies on an empty template existing, which should have been created in setUpClass() above.
        """
        super().setUp()

        # Connect to the template database to execute the create command for the test database
        log.warning("recreate test db from template: %s" % self.template_db_name)

        db = connect_to_db(label=self.TEST_DB_LABEL, is_template=True)

        self.__kill_connections_to_database(db=db, database_name=self.db_name)

        db.query("DROP DATABASE IF EXISTS {}".format(self.db_name))
        db.query("CREATE DATABASE {} TEMPLATE {}".format(self.db_name, self.template_db_name))

        db.disconnect()

        db = connect_to_db(label=self.TEST_DB_LABEL)

        force_using_test_database()

        self.__db = db
Example #40
def purge_object_caches():
    """Call PostgreSQL function which purges PostgreSQL object caches."""

    # Wait for an hour between attempts to purge object caches
    delay_between_attempts = 60 * 60

    log.info("Starting to purge object caches...")
    while True:
        log.info("Purging object caches...")

        db = connect_to_db()
        db.query('SELECT cache.purge_object_caches()')
        db.disconnect()

        log.info("Purged object caches, sleeping for %d seconds." % delay_between_attempts)
        time.sleep(delay_between_attempts)
Example #41
def create_missing_partitions():
    """Call PostgreSQL function which creates missing table partitions (if any)."""

    # Wait for an hour between attempts to create new partitions
    delay_between_attempts = 60 * 60

    log.info("Starting to create missing partitions...")
    while True:
        log.info("Creating missing partitions...")

        db = connect_to_db()
        db.query('SELECT create_missing_partitions()')
        db.disconnect()

        log.info("Created missing partitions, sleeping for %d seconds." % delay_between_attempts)
        time.sleep(delay_between_attempts)
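
Both of these loops run forever, so each is presumably launched as its own worker process; a minimal entry point mirroring the __main__ pattern used in the sitemap example further below (illustrative only):

if __name__ == '__main__':
    create_missing_partitions()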
Example #42
def recreate_db(label: typing.Optional[str] = None, is_template: bool = False) -> None:
    """(Re)create database schema.

    This function drops all objects in all schemas and reruns the schema/mediawords.sql to recreate the schema
    (and erase all data!) for the given database.

    This function will refuse to run if there are more than 10 million stories in the database, under the assumption
    that the database might be a production database in that case.

    """
    def reset_all_schemas(db_: DatabaseHandler) -> None:
        """Recreate all schemas."""
        schemas = db_.query("""
            SELECT schema_name
            FROM information_schema.schemata
            WHERE schema_name NOT LIKE %(schema_pattern)s
              AND schema_name != 'information_schema'
            ORDER BY schema_name
        """, {'schema_pattern': 'pg_%'}).flat()

        # When dropping schemas, PostgreSQL spits out a lot of notices which break "no warnings" unit test
        db_.query('SET client_min_messages=WARNING')

        for schema in schemas:
            db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

        db_.query('SET client_min_messages=NOTICE')

    # ---

    label = decode_str_from_bytes_if_needed(label)

    db = connect_to_db(label=label, do_not_check_schema_version=True, is_template=is_template)

    log.info("Resetting all schemas...")
    reset_all_schemas(db_=db)

    db.set_show_error_statement(True)

    mediawords_sql_path = mc_sql_schema_path()
    log.info("Importing from %s..." % mediawords_sql_path)
    with open(mediawords_sql_path, 'r') as mediawords_sql_f:
        mediawords_sql = mediawords_sql_f.read()
        db.query(mediawords_sql)

    log.info("Done.")
Example #43
def test_print_exported_tables_to_backup_crawler():
    # Basic sanity test to make sure something gets printed out to STDOUT
    # FIXME try importing the dump into a test database
    db = connect_to_db()

    orig_stdout = sys.stdout
    sys.stdout = captured_stdout = StringIO()

    export_dump_exception = None
    try:
        print_exported_tables_to_backup_crawler(db=db)
    except Exception as ex:
        export_dump_exception = str(ex)

    sys.stdout = orig_stdout

    assert export_dump_exception is None

    sql_dump = captured_stdout.getvalue()
    assert 'COPY media' in sql_dump
Example #44
    def run_job(cls, snapshots_id: int) -> None:

        # MC_REWRITE_TO_PYTHON: remove after Python rewrite
        if isinstance(snapshots_id, bytes):
            snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

        if snapshots_id is None:
            raise McWord2vecGenerateSnapshotModelException("'snapshots_id' is None.")

        snapshots_id = int(snapshots_id)

        db = connect_to_db()

        log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

        sentence_iterator = SnapshotSentenceIterator(db=db, snapshots_id=snapshots_id)
        model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)
        train_word2vec_model(sentence_iterator=sentence_iterator,
                             model_store=model_store)

        log.info("Finished generating word2vec model for snapshot %d." % snapshots_id)
Example #45
    def setUpClass(cls) -> None:
        """Create a fresh template data from mediawords.sql.

        The template database will be used to execute the
        'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
        for each individual unit test.  Recreating from a template is much faster than creating a database from
        scratch from our large schema.
        """
        super().setUpClass()

        config = py_get_config()

        db_config = list(filter(lambda x: x['label'] == cls.TEST_DB_LABEL, config['database']))
        if len(db_config) < 1:
            raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % cls.TEST_DB_LABEL)

        cls.db_name = (db_config[0])['db']

        cls.template_db_name = config['mediawords'].get('test_template_db_name', None)
        if cls.template_db_name is not None:
            log.warning("use existing test db template: %s" % cls.template_db_name)
            return

        log.info("create test db template")

        cls.template_db_name = cls.db_name + '_template'

        # we insert this db name directly into sql, so be paranoid about what is in it
        if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
            raise McTestDatabaseTestCaseException("Illegal table name: " + cls.db_name)

        # mediacloud_test should already exist, so we have to connect to it to create the template database
        db = connect_to_db(label=cls.TEST_DB_LABEL, do_not_check_schema_version=True)

        cls.__kill_connections_to_database(db=db, database_name=cls.template_db_name)

        db.query("DROP DATABASE IF EXISTS {}".format(cls.template_db_name))
        db.query("CREATE DATABASE {}".format(cls.template_db_name))
        db.disconnect()
        recreate_db(label=cls.TEST_DB_LABEL, is_template=True)
Example #46
    def run_job(
            cls,
            topic_fetch_urls_id: int,
            dummy_requeue: bool = False,
            domain_timeout: typing.Optional[int] = None) -> None:
        """Call fetch_topic_url and requeue the job of the request has been domain throttled.

        Arguments:
        topic_fetch_urls_id - id of topic_fetch_urls row
        dummy_requeue - if True, set state to FETCH_STATE_REQUEUED as normal but do not actually requeue
        domain_timeout - pass down to ThrottledUserAgent to set the timeout for each domain

        Returns:
        None

        """
        if isinstance(topic_fetch_urls_id, bytes):
            topic_fetch_urls_id = decode_object_from_bytes_if_needed(topic_fetch_urls_id)
        if topic_fetch_urls_id is None:
            raise McFetchLinkJobException("'topic_fetch_urls_id' is None.")

        log.info("Start fetch for topic_fetch_url %d" % topic_fetch_urls_id)

        db = connect_to_db()

        try:
            mediawords.tm.fetch_link.fetch_topic_url(
                db=db,
                topic_fetch_urls_id=topic_fetch_urls_id,
                domain_timeout=domain_timeout)
            cls._consecutive_requeues = 0

        except McThrottledDomainException:
            # if a domain has been throttled, just add it back to the end of the queue
            log.info("Fetch for topic_fetch_url %d domain throttled.  Requeueing ..." % topic_fetch_urls_id)

            db.update_by_id(
                'topic_fetch_urls',
                topic_fetch_urls_id,
                {'state': mediawords.tm.fetch_link.FETCH_STATE_REQUEUED, 'fetch_date': datetime.datetime.now()})
            if not dummy_requeue:
                FetchLinkJob.add_to_queue(topic_fetch_urls_id)

            cls._consecutive_requeues += 1
            if cls._consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive retries ..." % cls._consecutive_requeues)
                time.sleep(1)

        except Exception as ex:
            # all non-throttled errors should get caught by the try: above, but catch again here just in case
            log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_id, str(ex)))
            cls._consecutive_requeues = 0
            update = {
                'state': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                'fetch_date': datetime.datetime.now(),
                'message': traceback.format_exc(),
            }
            db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, update)

        db.disconnect()

        log.info("Finished fetch for topic_fetch_url %d" % topic_fetch_urls_id)
Example #47
def add_all_media_to_sitemap_queue(db: DatabaseHandler):
    """Add all media IDs to XML sitemap fetching queue."""
    log.info("Fetching all media IDs...")
    media_ids = db.query("""
        SELECT media_id
        FROM media
        ORDER BY media_id
    """).flat()
    for media_id in media_ids:
        log.info("Adding media ID %d" % media_id)
        FetchMediaPages.add_to_queue(media_id=media_id)


def add_us_media_to_sitemap_queue():
    us_media_ids = [
        104828, 1089, 1092, 1095, 1098, 1101, 1104, 1110, 1145, 1149, 1150, 14, 15, 1747, 1750, 1751, 1752, 1755, 18268,
        18710, 18775, 18839, 18840, 19334, 19643, 1, 22088, 25349, 25499, 27502, 2, 40944, 4415, 4419, 4442, 4, 6218,
        623382, 64866, 65, 6, 751082, 7, 8,
    ]
    us_media_ids = sorted(us_media_ids)
    for media_id in us_media_ids:
        log.info("Adding media ID %d" % media_id)
        FetchMediaPages.add_to_queue(media_id=media_id)


if __name__ == "__main__":
    db_ = connect_to_db()
    # add_all_media_to_sitemap_queue(db=db_)
    add_us_media_to_sitemap_queue()
Example #48
#!/usr/bin/env python3
#
# Export "media", "feeds", ... table data needed to run a backup crawler
#
# Usage:
#
# 1) On production machine (database that is being exported), run:
#
#     # Export table data to "mediacloud-dump.sql"
#     ./tools/export_import/export_tables_to_backup_crawler.py > mediacloud-dump.sql
#
# 2) On target machine (e.g. a backup crawler), run:
#
#     # Create database
#     createdb mediacloud
#
#     # Import empty schema
#     psql -f script/mediawords.sql mediacloud
#
#     # Import tables from "mediacloud-dump.sql"
#     psql -v ON_ERROR_STOP=1 -f mediacloud-dump.sql mediacloud
#

from mediawords.db import connect_to_db
from mediawords.db.export.export_tables import print_exported_tables_to_backup_crawler

if __name__ == '__main__':
    db = connect_to_db()
    print_exported_tables_to_backup_crawler(db=db)