Example #1
def skip_self_linked_domain(db: DatabaseHandler,
                            topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None:
        return False

    topic_link = db.query(
        """
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            topic_links_id = %(topic_links_id)s
    """, {
            'topics_id': topic_fetch_url['topics_id'],
            'topic_links_id': topic_fetch_url['topic_links_id'],
        }).hash()
    if not topic_link:
        raise Exception(
            f"Topic link ID {topic_fetch_url['topic_links_id']} was not found."
        )

    story = db.require_by_id('stories', topic_link['stories_id'])

    url = topic_link.get('redirect_url', topic_link['url'])

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'],
                                       story['url'], url)
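A minimal usage sketch, assuming a connected DatabaseHandler and an existing topic_fetch_urls row (the row ID 123 is hypothetical):

    db = connect_to_db()
    topic_fetch_url = db.require_by_id('topic_fetch_urls', 123)  # hypothetical row ID
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Skipping self-linked domain URL")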
Example #2
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }

        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
Example #3
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(table='stories',
                      insert_hash={
                          'media_id': int(feed['media_id']),
                          'url': "http://story.test/%s" % label,
                          'guid': "guid://story.test/%s" % label,
                          'title': "story %s" % label,
                          'description': "description %s" % label,
                          'publish_date': '2016-10-15 08:00:00',
                          'collect_date': '2016-10-15 10:00:00',
                          'full_text_rss': True,
                      })

    db.create(table='feeds_stories_map',
              insert_hash={
                  'feeds_id': int(feed['feeds_id']),
                  'stories_id': int(story['stories_id']),
              })

    return story
Example #4
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }

        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
Example #5
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
Example #6
def _call_function_on_url_chunks(db: DatabaseHandler,
                                 topic: Dict[str, Any],
                                 urls: List[Dict[str, Any]],
                                 chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning(f"error fetching twitter data: {ex}")

            # mark only the urls in the failed chunk, per the docstring
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]

            db.query("""
                UPDATE topic_fetch_urls SET
                    state = %(state)s,
                    message = %(message)s
                WHERE
                    topics_id = %(topics_id)s AND
                    topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
            """, {
                'state': FETCH_STATE_PYTHON_ERROR,
                'message': str(ex),
                'topics_id': topic['topics_id'],
                'topic_fetch_urls_ids': topic_fetch_urls_ids,
            })

        i += URLS_CHUNK_SIZE
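For context, fetch_twitter_urls() elsewhere in this collection drives this helper by splitting the fetch urls into user and status chunks:

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])
    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)
    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)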
Example #7
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(
        table='stories',
        insert_hash={
            'media_id': int(feed['media_id']),
            'url': "http://story.test/%s" % label,
            'guid': "guid://story.test/%s" % label,
            'title': "story %s" % label,
            'description': "description %s" % label,
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
    )

    db.create(
        table='feeds_stories_map',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'stories_id': int(story['stories_id']),
        }
    )

    return story
Example #8
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links and all_links counts in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Always increment all_links.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    self_link = 1 if story_domain in (url_domain, redirect_url_domain) else 0

    db.query(
        """
        insert into topic_domains (topics_id, domain, self_links, all_links)
            values(%(topics_id)s, %(domain)s, %(self_link)s, 1)
            on conflict (topics_id, md5(domain))
                do update set
                    self_links = topic_domains.self_links + %(self_link)s,
                    all_links = topic_domains.all_links + 1
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain, 'self_link': self_link})
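A sketch of the call site, mirroring how _add_tweet_story() above creates a link and then bumps the domain counters (the URL is hypothetical):

    topic_link = {
        'topics_id': topic['topics_id'],
        'stories_id': story['stories_id'],
        'url': 'https://example.com/some/page',  # hypothetical linked URL
    }
    db.create('topic_links', topic_link)
    increment_domain_links(db, topic_link)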
Example #9
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict,
                    topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'],
                               user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = mediawords.util.sql.sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = mediawords.tm.stories.generate_story(db=db,
                                                 url=url,
                                                 content=content,
                                                 title=title,
                                                 publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db,
                                               story=story,
                                               topic=topic,
                                               link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(
            db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {
            'a': story['stories_id'],
            'b': undateable_tag['tags_id']
        })

    return story
Example #10
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """, {
            'a': tag_name,
            'b': tag_set_name
        }).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {
            'tag': tag_name,
            'tag_sets_id': tag_set['tag_sets_id']
        })

    return invalid_tag
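This pairs with _add_user_story() above, which applies the tag to mark the generated user story as undateable:

    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})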
Example #11
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None:
    """Print table dump to STDOUT."""

    column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns()
    primary_key_column = db.primary_key_column(table=table)

    print("""
--
-- Table '%(table)s'
--

    """ % {'table': table})

    # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values:
    #
    # http://stackoverflow.com/a/11379550/200603
    #
    # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values.
    print(
        "COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');"
        % {
            'table': table,
            'column_names': ', '.join(column_names),
        })

    csv_writer = csv.writer(sys.stdout,
                            delimiter="\t",
                            escapechar="\\",
                            quoting=csv.QUOTE_NONE)

    res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" %
                   {
                       'table': table,
                       'primary_key_column': primary_key_column,
                   })

    postgresql_null_value = '\\N'
    postgresql_end_of_data = r'\.'
    while True:
        row = res.array()
        if row is None:
            break
        else:
            csv_writer.writerow(
                [postgresql_null_value if val is None else val for val in row])

    print(postgresql_end_of_data)

    print("""

-- Update sequence head
SELECT setval(
    pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'),
    (SELECT max(%(primary_key_column)s)+1 FROM %(table)s)
);

    """ % {
        'table': table,
        'primary_key_column': primary_key_column,
    })
Example #12
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List,
                                 chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning("error fetching twitter data: {}".format(ex))

            # mark only the urls in the failed chunk, per the docstring
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]
            db.query(
                "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)",
                {
                    'a': FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids
                })

        i += URLS_CHUNK_SIZE
Example #13
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {
            'a': FETCH_STATE_TWEET_MISSING,
            'b': topic_fetch_url['topic_fetch_urls_id']
        })
Example #14
def _log_content_match_failed(db: DatabaseHandler,
                              topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {
            'a': FETCH_STATE_CONTENT_MATCH_FAILED,
            'b': topic_fetch_url['topic_fetch_urls_id']
        })
Example #15
def __kill_connections_to_database(db: DatabaseHandler, database_name: str) -> None:
    """Kill all active connections to the database."""
    # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call
    # tearDown(), so the connection to the test database persists
    db.query("""
        SELECT pg_terminate_backend(pg_stat_activity.pid)
        FROM pg_catalog.pg_stat_activity
        WHERE datname = %(template_db_name)s
          AND pid != pg_backend_pid()
    """, {'template_db_name': database_name})
Example #16
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
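A typical fixture chain built from the helpers in this collection, assuming a test database handle:

    medium = create_test_medium(db, 'test medium')
    feed = create_test_feed(db, 'test feed', medium)
    story = create_test_story(db, 'test story', feed)
    story = add_content_to_test_story(db, story, feed)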
Example #17
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example #18
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None:
    """Print table dump to STDOUT."""

    column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns()
    primary_key_column = db.primary_key_column(table)

    print("""
--
-- Table '%(table)s'
--

    """ % {'table': table})

    # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values:
    #
    # http://stackoverflow.com/a/11379550/200603
    #
    # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values.
    print("COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');" % {
        'table': table,
        'column_names': ', '.join(column_names),
    })

    csv_writer = csv.writer(sys.stdout, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE)

    res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" % {
        'table': table,
        'primary_key_column': primary_key_column,
    })

    postgresql_null_value = '\\N'
    postgresql_end_of_data = r'\.'
    while True:
        row = res.array()
        if row is None:
            break
        else:
            csv_writer.writerow([postgresql_null_value if val is None else val for val in row])

    print(postgresql_end_of_data)

    print("""

-- Update sequence head
SELECT setval(
    pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'),
    (SELECT max(%(primary_key_column)s)+1 FROM %(table)s)
);

    """ % {
        'table': table,
        'primary_key_column': primary_key_column,
    })
Example #19
def __kill_connections_to_database(db: DatabaseHandler,
                                   database_name: str) -> None:
    """Kill all active connections to the database."""
    # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call
    # tearDown(), so the connection to the test database persists
    db.query(
        """
        SELECT pg_terminate_backend(pg_stat_activity.pid)
        FROM pg_catalog.pg_stat_activity
        WHERE datname = %(template_db_name)s
          AND pid != pg_backend_pid()
    """, {'template_db_name': database_name})
Example #20
def fetch_twitter_urls_update_state(db: DatabaseHandler, topic_fetch_urls_ids: List[int]):
    """Try fetch_twitter_urls(), update state."""
    try:
        fetch_twitter_urls(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
    except Exception as ex:
        log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_ids, str(ex)))
        db.query("""
            update topic_fetch_urls
            set state = %(a)s,
                message = %(b)s,
                fetch_date = now()
            where topic_fetch_urls_id = any(%(c)s)
        """, {'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids})
Example #21
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""

    db.query("""
        UPDATE topic_fetch_urls SET
            state = %(state)s,
            fetch_date = NOW()
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
        'state': FETCH_STATE_CONTENT_MATCH_FAILED,
        'topics_id': topic_fetch_url['topics_id'],
        'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'],
    })
Example #22
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Example #23
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""

    db.query("""
        UPDATE topic_fetch_urls SET
            state = %(state)s,
            fetch_date = NOW()
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
        'state': FETCH_STATE_TWEET_MISSING,
        'topics_id': topic_fetch_url['topics_id'],
        'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'],
    })
Example #24
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Example #25
def create_test_timespan(db: DatabaseHandler,
                         topic: dict = None,
                         snapshot: dict = None) -> dict:
    """Create simple timespans for testing.

    Must pass either topic or snapshot or both. If a snapshot is not passed, create one.
    """
    assert topic is not None or snapshot is not None

    if not snapshot:
        snapshot = create_test_snapshot(db, topic)

    return db.create(table='timespans',
                     insert_hash={
                         'topics_id': snapshot['topics_id'],
                         'snapshots_id': snapshot['snapshots_id'],
                         'start_date': snapshot['start_date'],
                         'end_date': snapshot['end_date'],
                         'period': 'overall',
                         'story_count': 0,
                         'story_link_count': 0,
                         'medium_count': 0,
                         'medium_link_count': 0,
                         'post_count': 0
                     })
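A sketch of the intended call pattern, assuming a test database handle (per the docstring, pass a topic, a snapshot, or both):

    topic = create_test_topic(db, 'timespan test')
    timespan = create_test_timespan(db, topic=topic)  # snapshot is created internally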
Example #26
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int,
                                source_url: str, ref_url: str) -> bool:
    """Return true if the url should be skipped because it is a self linked domain within the topic.

    Return true if the domain of the ref_url is the same as the domain of the source_url and one of the following
    is true:
    * the topic_domains.self_links value for the domain is greater than or equal to MAX_SELF_LINKS, or
    * ref_url matches SKIP_SELF_LINK_RE.
    """
    source_domain = get_url_distinctive_domain(source_url)
    ref_domain = get_url_distinctive_domain(ref_url)

    if source_domain != ref_domain:
        return False

    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {
            'a': topics_id,
            'b': ref_domain
        }).hash()

    if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS:
        return True

    return False
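The caller side appears in _add_tweet_story() above, where each extracted tweet URL is checked before a topic_link is created:

    for url in get_tweet_urls(tweet):
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue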
Example #27
def print_long_running_job_states(db: DatabaseHandler):
    """Print a count of job_states rows that have been queued or running for longer than a day."""
    counts = db.query("""
        select
            class,
            count(*) as count,
            min(last_updated::date) as min_date,
            max(last_updated::date) as max_date
        from (
            select
                *,
                rank() over (
                    partition by
                        class,
                        (args->>'media_id')::int
                    order by last_updated desc
                ) as media_rank,
                args->>'media_id' as media_id
            from job_states
        ) as q
        where
            media_rank = 1 and
            state in ('queued', 'running') and
            last_updated < now() - interval '1 day'
        group by class
        order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")

        for count in counts:
            print("%s: %d (%s - %s)" % (count['class'], count['count'], count['min_date'], count['max_date']))
Example #28
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int, source_url: str, ref_url: str) -> bool:
    """Return true if the url should be skipped because it is a self linked domain within the topic.

    Return true if the domain of the ref_url is the same as the domain of the source_url and one of the following
    is true:
    * the topic_domains.self_links value for the domain is greater than or equal to MAX_SELF_LINKS, or
    * ref_url matches SKIP_SELF_LINK_RE.
    """
    source_domain = mediawords.util.url.get_url_distinctive_domain(source_url)
    ref_domain = mediawords.util.url.get_url_distinctive_domain(ref_url)

    if source_domain != ref_domain:
        return False

    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topics_id, 'b': ref_domain}).hash()

    if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS:
        return True

    return False
Example #29
def connect_to_db() -> DatabaseHandler:
    """Connect to PostgreSQL."""

    db_config = CommonConfig.database()
    retries_config = db_config.retries()

    assert retries_config.max_attempts() > 0, "max_tries can't be negative."

    db = None

    for attempt in range(1, retries_config.max_attempts() + 1):

        try:

            log.debug("Connecting to PostgreSQL...")

            db = DatabaseHandler(
                host=db_config.hostname(),
                port=db_config.port(),
                username=db_config.username(),
                password=db_config.password(),
                database=db_config.database_name(),
            )
            if not db:
                raise ValueError("Returned value is None.")

            # Return the database handler upon successful connection
            break

        except Exception as ex:

            error_message = "Unable to connect to %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': db_config.username(),
                'host': db_config.hostname(),
                'port': db_config.port(),
                'database': db_config.database_name(),
                'exception': str(ex),
            }

            log.error(error_message)

            if attempt < retries_config.max_attempts():
                log.info(
                    f"Will retry (attempt {attempt}) in {retries_config.sleep_between_attempts()} seconds..."
                )
                time.sleep(retries_config.sleep_between_attempts())

            else:
                log.info("Out of retries, giving up and exiting...")

                # Don't throw any exceptions because they might be caught by
                # the try-catch block, and so the caller will just assume that
                # there was something wrong with the input data and proceed
                # with processing next item in the job queue (e.g. the next
                # story). Instead, just quit and wait for someone to restart
                # the whole app that requires database access.
                fatal_error(error_message)

    return db
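A smoke-test sketch of the returned handle, using the query()/hash() interface seen throughout these examples:

    db = connect_to_db()
    row = db.query("SELECT 1 AS one").hash()
    assert row['one'] == 1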
Example #30
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values(%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain))
                do nothing
            returning *
        """,
        {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }
    ).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            update topic_domains set
                    self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """,
            {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            }
        )
Example #31
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict:
    """Update topic_fetch_url stat to tweet added."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, stories_id=%(b)s, fetch_date=now() where topic_fetch_urls_id=%(c)s
            returning *
        """,
        {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example #32
def _log_python_error(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> dict:
    """Update topic_fetch_url state to python error."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, fetch_date=now(), message = %(b)s where topic_fetch_urls_id=%(c)s
            returning *
        """,
        {'a': FETCH_STATE_PYTHON_ERROR, 'b': message, 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example #33
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])

    story = db.require_by_id('stories', topic_link['stories_id'])

    url = topic_link.get('redirect_url', topic_link['url'])

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
Example #34
def create_test_snapshot(db: DatabaseHandler, topic: dict) -> dict:
    """Create simple snapshot for testing."""
    return db.create(table='snapshots',
                     insert_hash={
                         'topics_id': topic['topics_id'],
                         'snapshot_date': topic['end_date'],
                         'start_date': topic['start_date'],
                         'end_date': topic['end_date']
                     })
Example #35
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return true if the domain of the linked url is the same as the domain of the linking story and one of the following
    is true:
    * the topic_domains.self_links value for the domain is greater than MAX_SELF_LINKS, or
    * the linked url matches SKIP_SELF_LINK_RE.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])

    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])

    if redirect_url is None:
        redirect_url = topic_link['url']

    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    link_domain = redirect_url_domain if redirect_url_domain else url_domain

    if story_domain not in (url_domain, redirect_url_domain):
        return False

    for url in (topic_link['url'], redirect_url):
        if re.search(SKIP_SELF_LINK_RE, url, flags=re.I):
            return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topic_fetch_url['topics_id'], 'b': link_domain}).hash()

    if topic_domain and topic_domain['self_links'] > MAX_SELF_LINKS:
        return True

    return False
Example #36
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict:
    """Update topic_fetch_url stat to tweet added."""
    return db.query("""
        UPDATE topic_fetch_urls
        SET state = %(a)s,
            stories_id = %(b)s,
            fetch_date = NOW()
        WHERE topic_fetch_urls_id = %(c)s
        RETURNING *
    """, {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example #37
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List, chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning("error fetching twitter data: {}".format(ex))

            # mark only the urls in the failed chunk, per the docstring
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]
            db.query(
                "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)",
                {'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids})

        i += URLS_CHUNK_SIZE
Example #38
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
            INSERT INTO topic_domains (topics_id, domain, self_links)
            VALUES (%(topics_id)s, %(domain)s, 1)
            ON CONFLICT (topics_id, md5(domain)) DO NOTHING
            RETURNING *
        """, {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            UPDATE topic_domains SET
                self_links = topic_domains.self_links + 1
            WHERE
                topics_id = %(topics_id)s AND
                domain = %(domain)s
            """, {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
Example #39
def _add_tweet_story(db: DatabaseHandler, topic: dict, tweet: dict,
                     topic_fetch_urls: list) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = "%s: %s" % (screen_name, content)
    tweet_date = tweet['created_at']
    url = 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])

    story = mediawords.tm.stories.generate_story(db=db,
                                                 url=url,
                                                 content=content,
                                                 title=title,
                                                 publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db,
                                               story=story,
                                               topic=topic,
                                               link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(
            db, topic_fetch_url)

    urls = mediawords.util.twitter.get_tweet_urls(tweet)
    for url in urls:
        if mediawords.tm.domains.skip_self_linked_domain_url(
                db, topic['topics_id'], story['url'], url):
            log.info("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        }

        db.create('topic_links', topic_link)
        mediawords.tm.domains.increment_domain_links(db, topic_link)

    return story
Example #40
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = mediawords.util.sql.sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
Example #41
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        {'a': tag_name, 'b': tag_set_name}).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set['tag_sets_id']})

    return invalid_tag
Example #42
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""

    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    return db.create(table='feeds',
                     insert_hash={
                         'name': label,
                         'url': "http://feed.test/%s" % label,
                         'media_id': int(medium['media_id']),
                     })
Example #43
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='media',
        insert_hash={
            'name': label,
            'url': "http://media.test/%s" % (label,),
            'is_monitored': True,
            'public_notes': "%s public notes" % (label,),
            'editor_notes': "%s editor notes" % (label,),
        })
Example #44
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""

    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    return db.create(
        table='feeds',
        insert_hash={
            'name': label,
            'url': "http://feed.test/%s" % label,
            'media_id': int(medium['media_id']),
        }
    )
Example #45
def reset_all_schemas(db_: DatabaseHandler) -> None:
    """Recreate all schemas."""
    schemas = db_.query("""
        SELECT schema_name
        FROM information_schema.schemata
        WHERE schema_name NOT LIKE %(schema_pattern)s
          AND schema_name != 'information_schema'
        ORDER BY schema_name
    """, {'schema_pattern': 'pg_%'}).flat()

    # When dropping schemas, PostgreSQL spits out a lot of notices which break "no warnings" unit test
    db_.query('SET client_min_messages=WARNING')

    for schema in schemas:
        db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

    db_.query('SET client_min_messages=NOTICE')
Example #46
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(
        table='topics',
        insert_hash={
            'name': label,
            'description': label,
            'pattern': label,
            'solr_seed_query': label,
            'solr_seed_query_run': True,
            'start_date': '2016-01-01',
            'end_date': '2016-03-01',
            'job_queue': 'mc',
            'max_stories': 100000,
        }
    )
Example #47
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    """Create a test 'content' download for the given story in the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'success',
            'priority': 1,
            'extracted': False,
            'path': 'postgresql:foo',
            'stories_id': story['stories_id'],
        }
    )
Example #48
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create a pending test 'feed' download for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(
        table='downloads',
        insert_hash={
            'feeds_id': int(feed['feeds_id']),
            'url': feed['url'],
            'host': host,
            'type': 'feed',
            'sequence': 1,
            'state': 'pending',
            'priority': priority,
            'download_time': 'NOW()',
            'extracted': False,
        })
Example #49
def __validate_table_foreign_keys(db: DatabaseHandler, table: str) -> None:
    """Validate all table's foreign keys; raise McValidateTableForeignKeysException if any of the keys are invalid.

    If table's constraints aren't right, SQL would be pretty much invalid."""

    foreign_keys = db.query("""
        SELECT
            tc.constraint_name,
            tc.table_schema,
            tc.table_name,
            kcu.column_name,
            ccu.table_schema AS foreign_table_schema,
            ccu.table_name AS foreign_table_name,
            ccu.column_name AS foreign_column_name

        FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
        WHERE constraint_type = 'FOREIGN KEY'
          AND tc.table_schema = 'public'
          AND tc.table_name = %(table)s
    """, {'table': table}).hashes()

    foreign_key_errors = []

    for foreign_key in foreign_keys:
        constraint_name = foreign_key['constraint_name']

        log.info("Validating foreign key '%s' for table '%s'..." % (constraint_name, table))

        sql = """
            SELECT DISTINCT a.%(column_name)s
            FROM %(table_schema)s.%(table_name)s AS a
                LEFT JOIN %(foreign_table_schema)s.%(foreign_table_name)s AS b
                    ON a.%(column_name)s = b.%(foreign_column_name)s
            WHERE a.%(column_name)s IS NOT NULL
              AND b.%(foreign_column_name)s IS NULL
            ORDER BY a.%(column_name)s
        """ % {
            'column_name': foreign_key['column_name'],
            'table_schema': foreign_key['table_schema'],
            'table_name': table,
            'foreign_table_schema': foreign_key['foreign_table_schema'],
            'foreign_table_name': foreign_key['foreign_table_name'],
            'foreign_column_name': foreign_key['foreign_column_name'],
        }

        unreferenced_rows = db.query(sql).flat()

        if len(unreferenced_rows) > 0:
            error = """
                Table '%(table)s' has unreferenced rows for constraint '%(constraint_name)s':
                %(unreferenced_rows)s; SQL: %(sql)s
            """ % {
                'table': table,
                'constraint_name': constraint_name,
                'unreferenced_rows': ', '.join(str(row) for row in unreferenced_rows),
                'sql': sql,
            }
            foreign_key_errors.append(error)
            log.warning(error)
        else:
            log.info("Foreign key '%s' for table '%s' looks fine." % (constraint_name, table))

    if len(foreign_key_errors) > 0:
        raise McValidateTableForeignKeysException(
            "One or more foreign key checks failed for table '%(table)s': %(foreign_key_errors)s" % {
                'table': table,
                'foreign_key_errors': "\n".join(foreign_key_errors)
            }
        )
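print_exported_tables_to_backup_crawler() below aggregates these exceptions per table; a minimal standalone use might look like:

    try:
        __validate_table_foreign_keys(db=db, table='media')
    except McValidateTableForeignKeysException as ex:
        log.warning("Validation failed for 'media': %s" % str(ex))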
Example #50
def print_exported_tables_to_backup_crawler(db: DatabaseHandler) -> None:
    """Export tables by printing their SQL dump to STDOUT."""

    # Tables to export
    tables = ['tag_sets', 'media', 'feeds', 'tags', 'media_tags_map', 'feeds_tags_map']

    db.begin()

    log.info("Validating foreign keys...")
    foreign_key_errors = []

    for table in tables:
        log.info("Validating foreign keys for table '%s'..." % table)

        # Aggregate errors into array to be able to print a one huge complaint
        try:
            __validate_table_foreign_keys(db=db, table=table)
        except McValidateTableForeignKeysException as ex:
            error = str(ex)
            log.warning("Validating foreign key for table '%s' failed: %s" % (table, error))
            foreign_key_errors.append(error)

    if len(foreign_key_errors):
        raise McPrintExportedTablesToBackupCrawlerException(
            "One or more foreign key checks failed, won't continue as resulting SQL would be invalid:\n\n%s" %
            str(foreign_key_errors)
        )

    log.info("Done validating foreign keys.")

    print("""
--
-- This is a dataset needed for running a backup crawler.
--
-- Import this dump into the backup crawler's PostgreSQL instance.
--

BEGIN;

--
-- Die if schema has not been initialized
--
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM information_schema.tables
        WHERE table_schema = CURRENT_SCHEMA()
          AND table_name = 'media'
    ) THEN
        RAISE EXCEPTION 'Table "media" does not exist, please initialize schema.';
    END IF;
END$$;

--
-- Die if something's already in the database
--
DO $$
BEGIN
    IF EXISTS (SELECT 1 FROM media) THEN
        RAISE EXCEPTION 'Table "media" already contains data, please purge the database.';
    END IF;
END$$;

--
-- Temporarily disable constraints to speed up import
--
SET CONSTRAINTS ALL DEFERRED;

--
-- Truncate "tag_sets" table (might already have something)
--
TRUNCATE tag_sets CASCADE;

    """)

    log.info("Exporting tables...")
    for table in tables:
        log.info("Exporting table '%s'..." % table)
        __print_table_csv_to_stdout(db=db, table=table)
    log.info("Done exporting tables.")

    db.commit()

    print("""
--
-- Reenable constraints
--
SET CONSTRAINTS ALL IMMEDIATE;

COMMIT;
    """)
Example #51
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {'a': FETCH_STATE_CONTENT_MATCH_FAILED, 'b': topic_fetch_url['topic_fetch_urls_id']})
Example #52
def connect_to_db(
        label: typing.Optional[str] = None,
        do_not_check_schema_version: bool = False,
        is_template: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL.

    Arguments:
    label - db config section label for mediawords.yml
    do_not_check_schema_version - if false, throw an error if the versions in mediawords.yml and the db do not match
    is_template - if true, connect to a db called <db_name>_template instead of <db_name>

    """
    label = decode_str_from_bytes_if_needed(label)

    # If this is a Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    if is_template:
        database = database + "_template"

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex)
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    return ret
Example #53
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {'a': FETCH_STATE_TWEET_MISSING, 'b': topic_fetch_url['topic_fetch_urls_id']})