Example #1
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
Example #2
def get_links_from_story_text(db: DatabaseHandler, story: dict) -> typing.List[str]:
    """Get all urls that appear in the text or description of the story using a simple regex."""
    download_ids = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
        """, {'stories_id': story['stories_id']}
    ).flat()

    download_texts = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = ANY(%(download_ids)s)
        ORDER BY download_texts_id
        """, {'download_ids': download_ids}
    ).hashes()

    story_text = ' '.join([dt['download_text'] for dt in download_texts])

    story_text = story_text + ' ' + str(story['title']) if story['title'] is not None else story_text
    story_text = story_text + ' ' + str(story['description']) if story['description'] is not None else story_text

    links = []
    for url in re.findall(r'https?://[^\s\")]+', story_text):
        url = re.sub(r'\W+$', '', url)
        links.append(url)

    return links
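
The URL regex and the trailing-punctuation cleanup above are easy to sanity-check on their own; a standalone run of the same two patterns on a made-up sample string:

import re

sample = 'See https://example.com/a, (also http://example.org/b).'

links = []
for url in re.findall(r'https?://[^\s\")]+', sample):
    # strip trailing punctuation, as get_links_from_story_text() does
    url = re.sub(r'\W+$', '', url)
    links.append(url)

print(links)  # ['https://example.com/a', 'http://example.org/b']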
Example #3
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward.  so we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #4
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict:
    """Create a download_text hash and insert it into the database. Delete any existing download_text row for the
    download."""

    # FIXME don't pass freeform "extract" dict, we need just the "extracted_text"

    download = decode_object_from_bytes_if_needed(download)
    extract = decode_object_from_bytes_if_needed(extract)

    db.query("""
        DELETE FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    download_text = db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        RETURNING *
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extract['extracted_text'],
    }).hash()

    db.query("""
        UPDATE downloads
        SET extracted = 't'
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    return download_text
Example #5
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay the download of the content by this many hours. This is useful for sources that are likely to
        # significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
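
The delayed download_time is plain epoch arithmetic; a minimal stdlib-only illustration of the same computation (the 3-hour delay and the formatting are stand-ins for the media.content_delay value and get_sql_date_from_epoch()):

import datetime

content_delay = 3  # hours; in the function above this comes from the media row
now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
download_at_timestamp = now + (content_delay * 60 * 60)

download_at = datetime.datetime.fromtimestamp(download_at_timestamp, datetime.timezone.utc)
print(download_at.strftime('%Y-%m-%d %H:%M:%S'))  # roughly "now + 3 hours", formatted like an SQL timestamp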
Example #6
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]:
    """Lookup a media source by normalized url and then name.

    Uses mediawords.util.url.normalize_url_lossy to normalize urls.  Returns the parent media for duplicate media
    sources and returns no media that are marked foreign_rss_links.

    This function queries the media.normalized_url field to find the matching urls.  Because the normalization
    function is in python, we have to keep that normalized_url field current from within python.  This function
    is responsible for keeping the table up to date by filling the field for any media for which it is null.
    Arguments:
    db - db handle
    url - url to lookup
    name - name to lookup

    Returns:
    a media source dict or None

    """
    _update_media_normalized_urls(db)

    nu = _normalize_url(url)

    lookup_query = \
        """
        select m.*
            from media m
            where
                m.normalized_url = %(a)s and
                foreign_rss_links = 'f'
            order by dup_media_id asc nulls last, media_id asc
        """

    medium = db.query(lookup_query, {'a': nu}).hash()

    if medium is None:
        medium = db.query(
            "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false",
            {'a': name}).hash()

    if medium is None:
        return None

    if medium['dup_media_id'] is not None:

        media_cycle_lookup = dict()  # type: dict
        while medium['dup_media_id'] is not None:
            if medium['media_id'] in media_cycle_lookup:
                raise McTopicMediaException('Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys()))
            media_cycle_lookup[medium['media_id']] = True

            medium = db.query("select * from media where media_id = %(a)s", {'a': medium['dup_media_id']}).hash()

    if medium['foreign_rss_links']:
        raise McTopicMediaException('Parent duplicate media source %d has foreign_rss_links' % medium['media_id'])

    return medium
Example #7
def _insert_tweet_urls(db: DatabaseHandler, topic_tweet: dict, urls: typing.List) -> None:
    """Insert list of urls into topic_tweet_urls."""
    for url in urls:
        db.query(
            """
            insert into topic_tweet_urls( topic_tweets_id, url )
                values( %(a)s, %(b)s )
                on conflict do nothing
            """,
            {'a': topic_tweet['topic_tweets_id'], 'b': url})
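
The ON CONFLICT DO NOTHING clause is what makes repeated inserts of the same (topic_tweets_id, url) pair safe. A rough stand-in for that idempotent-insert behavior using SQLite's INSERT OR IGNORE (not the project's Postgres handler, just the same idea):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE topic_tweet_urls (topic_tweets_id INTEGER, url TEXT, UNIQUE (topic_tweets_id, url))')

for url in ['https://example.com/a', 'https://example.com/a', 'https://example.com/b']:
    # duplicate (topic_tweets_id, url) pairs are silently skipped
    conn.execute('INSERT OR IGNORE INTO topic_tweet_urls (topic_tweets_id, url) VALUES (?, ?)', (1, url))

print(conn.execute('SELECT COUNT(*) FROM topic_tweet_urls').fetchone()[0])  # 2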
Example #8
def change_password(db: DatabaseHandler,
                    email: str,
                    new_password: str,
                    new_password_repeat: str,
                    do_not_inform_via_email: bool = False) -> None:
    """Change user's password."""

    email = decode_object_from_bytes_if_needed(email)
    new_password = decode_object_from_bytes_if_needed(new_password)
    new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat)

    if isinstance(do_not_inform_via_email, bytes):
        do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email)

    do_not_inform_via_email = bool(int(do_not_inform_via_email))

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email)

    password_validation_message = validate_new_password(email=email,
                                                        password=new_password,
                                                        password_repeat=new_password_repeat)
    if password_validation_message:
        raise McAuthChangePasswordException("Unable to change password: %s" % password_validation_message)

    # Hash + validate the password
    try:
        password_new_hash = generate_secure_hash(password=new_password)
    except Exception as ex:
        raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex))

    if not password_new_hash:
        raise McAuthChangePasswordException("Generated password hash is empty.")

    # Set the password hash
    db.query("""
        UPDATE auth_users
        SET password_hash = %(password_hash)s,
            active = TRUE
        WHERE email = %(email)s
    """, {
        'email': email,
        'password_hash': password_new_hash,
    })

    if not do_not_inform_via_email:

        message = AuthPasswordChangedMessage(to=email, full_name=user.full_name())
        if not send_email(message):
            raise McAuthChangePasswordException(
                'The password has been changed, but I was unable to send an email notifying you about the change.'
            )
Example #9
    def __remove_object_from_cache(self, db: DatabaseHandler, object_id: int) -> None:
        """Attempt to remove object from cache.

        Raise if removal fails because after removal we'd expect the object to be gone for good."""

        object_id = self._prepare_object_id(object_id)

        sql = "DELETE FROM %s " % self.__cache_table  # interpolated by Python
        sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

        db.query(sql, {'object_id': object_id})
Example #10
def __get_topic_url_variants(db: DatabaseHandler, urls: List[str]) -> List[str]:
    """Get any alternative urls for the given url from topic_merged_stories or topic_links."""

    urls = decode_object_from_bytes_if_needed(urls)

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    stories_ids_sql = "SELECT stories_id "
    stories_ids_sql += "FROM stories "
    stories_ids_sql += "WHERE url = ANY(?)"
    stories_ids = db.query(stories_ids_sql, urls).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if stories_ids is None:
        stories_ids = []
    elif isinstance(stories_ids, int):
        stories_ids = [stories_ids]

    stories_ids = [int(x) for x in stories_ids]

    all_stories_ids = __get_merged_stories_ids(db=db, stories_ids=stories_ids)
    if len(all_stories_ids) == 0:
        return urls

    all_urls = db.query("""
        SELECT DISTINCT url
        FROM (
            SELECT redirect_url AS url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM stories
            WHERE stories_id = ANY(?)
        ) AS q
        WHERE q IS NOT NULL
    """, all_stories_ids, all_stories_ids, all_stories_ids).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if all_urls is None:
        all_urls = []
    elif isinstance(all_urls, str):
        all_urls = [all_urls]

    return all_urls
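
The None / scalar / list normalization that follows each query shows up in several of these MC_REWRITE_TO_PYTHON blocks; the same idea as a small stdlib-only helper (a sketch, not part of the project API):

from typing import Any, List


def normalize_wantarray_result(value: Any) -> List:
    """Coerce None to [], a scalar to a one-element list, and pass lists through unchanged."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


print(normalize_wantarray_result(None))    # []
print(normalize_wantarray_result(42))      # [42]
print(normalize_wantarray_result([1, 2]))  # [1, 2]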
Example #11
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]:
    """Generate password reset token used for both activating newly registered users and resetting passwords.

    Returns non-hashed password reset token or None if user was not found.
    """

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    user_exists = db.query("""
        SELECT auth_users_id,
               email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if user_exists is None or len(user_exists) == 0:
        # User was not found, so set the email address to an empty string, but don't return just yet; continue with
        # the rather slow process of generating an activation token (to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
        'email': email,
        'password_reset_token_hash': password_reset_token_hash,
    })

    return password_reset_token
Example #12
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    color = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", {
        'item_set': item_set,
        'item_id': item_id,
    }).flat()
    if color is not None and len(color):
        if isinstance(color, list):
            color = color[0]
        return color

    set_colors = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
        'item_set': item_set,
    }).flat()
    if set_colors is not None:
        if not isinstance(set_colors, list):
            set_colors = [set_colors]

    existing_colors = set()

    if set_colors is not None:
        for color in set_colors:
            existing_colors.add(color)

    # Use the hard coded palette of 25 colors if possible
    new_color = None
    for color in __MC_COLORS:
        if color not in existing_colors:
            new_color = color
            break

    # Otherwise, just generate a random color
    if new_color is None:
        colors = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(colors)

    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
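
The fallback logic above boils down to "first palette color not already used, otherwise a random color". A self-contained sketch of that selection step, with a made-up palette standing in for __MC_COLORS and a plain random hex value standing in for analogous_color():

import random

palette = ['1f77b4', 'ff7f0e', '2ca02c', 'd62728']  # stand-in for __MC_COLORS
existing_colors = {'1f77b4', '2ca02c'}

new_color = next((color for color in palette if color not in existing_colors), None)

if new_color is None:
    # palette exhausted: fall back to a random hex color
    new_color = '%06x' % random.randint(0, 0xFFFFFF)

print(new_color)  # 'ff7f0e'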
Example #13
def _add_topic_tweet_single_day(
        db: DatabaseHandler,
        topic: dict,
        day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """
    Add a row to topic_tweet_days if it does not already exist.  Fetch data for the new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return:
    the created topic_tweet_days dict with the fetched 'ch_posts' data attached
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_tweet_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {'a': topic['topics_id'], 'b': str(day)}).hash()

    if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']:
        raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_tweet_day is not None:
        db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    tweet_count = ch_posts['totalPostsAvailable']

    num_ch_tweets = len(ch_posts['posts'])

    topic_tweet_day = db.create(
        'topic_tweet_days',
        {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': tweet_count,
            'num_ch_tweets': num_ch_tweets,
            'tweets_fetched': False
        })

    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
Example #14
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query("""
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Regenerate non-IP limited API key
    db.query("""
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
Example #15
def _get_ap_dup_sentence_lengths(db: DatabaseHandler, story_text: str) -> List[int]:
    story_text = decode_object_from_bytes_if_needed(story_text)

    ap_media_id = _get_ap_media_id(db=db)

    if ap_media_id is None:
        return []

    sentences = _get_sentences_from_content(story_text=story_text)

    md5s = []
    for sentence in sentences:
        md5_hash = hashlib.md5(sentence.encode('utf-8')).hexdigest()
        md5s.append(md5_hash)

    sentence_lengths = db.query("""
        SELECT length(sentence) AS len
        FROM story_sentences
        WHERE media_id = %(ap_media_id)s

          -- FIXME this probably never worked because the index is half_md5(), not md5()
          AND md5(sentence) = ANY(%(md5s)s)
    """, {
        'ap_media_id': ap_media_id,
        'md5s': md5s,
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perlism
    if sentence_lengths is None:
        sentence_lengths = []

    return sentence_lengths
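
The per-sentence fingerprinting above is plain hashlib; the md5 step in isolation, on made-up sentences:

import hashlib

sentences = ['The quick brown fox.', 'Jumps over the lazy dog.']

md5s = [hashlib.md5(sentence.encode('utf-8')).hexdigest() for sentence in sentences]

for sentence, md5_hash in zip(sentences, md5s):
    print(md5_hash, sentence)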
Example #16
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""

    feed = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash()

    if feed is not None:
        return feed

    return db.find_or_create('feeds', {
        'media_id': medium['media_id'],
        'url': medium['url'] + '#spiderfeed',
        'name': SPIDER_FEED_NAME,
        'active': False,
    })
Example #17
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict:
    """Get either the referenced medium or the deduped version of the medium by recursively following dup_media_id."""
    medium = db.require_by_id('media', media_id)
    if medium['dup_media_id'] is None:
        return medium
    else:
        return _get_deduped_medium(db, medium['dup_media_id'])
Example #18
def _get_story_with_most_sentences(db: DatabaseHandler, stories: list) -> dict:
    """Given a list of stories, return the story with the most sentences."""
    assert len(stories) > 0

    if len(stories) == 1:
        return stories[0]

    story = db.query(
        """
        select s.*
            from stories s
            where stories_id in (
                select stories_id
                    from story_sentences
                    where stories_id = any (%(a)s)
                    group by stories_id
                    order by count(*) desc
                    limit 1
            )
        """,
        {'a': [s['stories_id'] for s in stories]}).hash()

    if story is not None:
        return story
    else:
        return stories[0]
Example #19
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool:
    """Validate password reset token (used for both user activation and password reset)."""
    email = decode_object_from_bytes_if_needed(email)
    password_reset_token = decode_object_from_bytes_if_needed(password_reset_token)

    if not (email and password_reset_token):
        log.error("Email and / or password reset token is empty.")
        return False

    # Fetch readonly information about the user
    password_reset_token_hash = db.query("""
        SELECT auth_users_id,
               email,
               password_reset_token_hash
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if password_reset_token_hash is None or 'auth_users_id' not in password_reset_token_hash:
        log.error("Unable to find user %s in the database." % email)
        return False

    password_reset_token_hash = password_reset_token_hash['password_reset_token_hash']

    return password_hash_is_valid(password_hash=password_reset_token_hash, password=password_reset_token)
Example #20
def _get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[FetchLinkResponse]:
    """Return content for this url and topic in topic_seed_urls.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy response object

    """
    r = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {'a': topic_fetch_url['topics_id'], 'b': topic_fetch_url['url']}).flat()

    if len(r) == 0:
        return None

    return FetchLinkResponse(
        url=topic_fetch_url['url'],
        is_success=True,
        code=HTTPStatus.OK.value,
        message=HTTPStatus.OK.phrase,
        content=r[0],
        last_requested_url=topic_fetch_url['url'],
    )
Example #21
def lookup_tag(db: DatabaseHandler, tag_name: str) -> Optional[Dict[str, Any]]:
    """Lookup the tag given the tag_set:tag format."""
    tag_name = decode_object_from_bytes_if_needed(tag_name)

    if not tag_name:
        log.warning("Tag name is empty.")
        return None

    if not re.match(pattern='^([^:]*):([^:]*)$', string=tag_name):
        log.warning("Unable to parse tag name '{}'.".format(tag_name))
        return None

    tag_set_name, tag = tag_name.split(':')

    found_tag = db.query("""
        SELECT t.*
        FROM tags AS t,
             tag_sets AS ts
        WHERE t.tag_sets_id = ts.tag_sets_id
          AND t.tag = %(tag)s
          AND ts.name = %(tag_set_name)s
    """, {'tag': tag, 'tag_set_name': tag_set_name}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if found_tag is None:
        found_tag = {}

    return found_tag
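
The "tag_set:tag" parsing step on its own, using only the standard library (the tag value is made up):

import re

tag_name = 'collection:example_tag'

if re.match(pattern='^([^:]*):([^:]*)$', string=tag_name):
    tag_set_name, tag = tag_name.split(':')
    print(tag_set_name, tag)  # collection example_tag
else:
    print("Unable to parse tag name '{}'.".format(tag_name))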
Example #22
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls get_links_from_story
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an error,
    topic_stories.link_mined is still set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))
        links = get_links_from_story(db, story)

        for link in links:
            if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
Example #23
def guess_medium(db: DatabaseHandler, story_url: str) -> dict:
    """Guess the media source for a story with the given url.

    The guess is based on a normalized version of the host part of the url.  The guess takes into account the
    duplicate media relationships included in the postgres database through the media.dup_media_id fields.  If
    no appropriate media source exists, this function will create a new one and return it.

    """
    (medium_url, medium_name) = generate_medium_url_and_name_from_url(story_url)

    medium = lookup_medium(db, medium_url, medium_name)

    if medium is not None:
        return medium

    normalized_medium_url = _normalize_url(medium_url)
    normalized_story_url = _normalize_url(story_url)
    all_urls = [normalized_medium_url, medium_url, normalized_story_url, story_url]

    # avoid conflicts with existing media names and urls that are missed
    # by the above query because of duplicate feeds or foreign_rss_links
    medium_name = get_unique_medium_name(db, [medium_name] + all_urls)
    medium_url = get_unique_medium_url(db, all_urls)

    # a race condition with another thread can cause this to fail sometimes, but after the medium in the
    # other process has been created, all should be fine
    for i in range(_GUESS_MEDIUM_RETRIES):
        medium_data = {'name': medium_name, 'url': medium_url, 'normalized_url': normalized_medium_url}
        medium = db.find_or_create('media', medium_data)

        if medium is not None:
            break
        else:
            time.sleep(1)

    if medium is None:
        raise McTopicMediaUniqueException(
            "Unable to find or create medium for %s / %s" % (medium_name, medium_url))

    log.info("add medium: %s / %s / %d" % (medium_name, medium_url, medium['media_id']))

    spidered_tag = get_spidered_tag(db)

    db.find_or_create('media_tags_map', {'media_id': medium['media_id'], 'tags_id': spidered_tag['tags_id']})

    return medium
Example #24
def get_unique_medium_name(db: DatabaseHandler, names: list) -> str:
    """Return the first name in the names list that does not yet exist for a media source, or None."""
    for name in names:
        name = name[0:MAX_NAME_LENGTH]
        name_exists = db.query("select 1 from media where lower(name) = lower(%(a)s)", {'a': name}).hash()
        if name_exists is None:
            return name

    raise McTopicMediaUniqueException("Unable to find unique name among names: " + str(names))
Example #25
def get_spidered_tag(db: DatabaseHandler) -> dict:
    """Return the spidered:spidered tag dict."""
    spidered_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        {'a': SPIDERED_TAG_TAG, 'b': SPIDERED_TAG_SET}).hash()

    if spidered_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET})
        spidered_tag = db.find_or_create('tags', {'tag': SPIDERED_TAG_TAG, 'tag_sets_id': tag_set['tag_sets_id']})

    return spidered_tag
Example #26
def get_media_id(db: DatabaseHandler, download: dict) -> int:
    """Convenience method to get the media_id for the download."""
    download = decode_object_from_bytes_if_needed(download)

    return db.query("""
        SELECT media_id
        FROM feeds
        WHERE feeds_id = %(feeds_id)s
    """, {'feeds_id': download['feeds_id']}).hash()['media_id']
Example #27
def mark_as_processed(db: DatabaseHandler, stories_id: int) -> bool:
    """Mark the story as processed by inserting an entry into 'processed_stories'. Return True on success."""

    # FIXME upsert instead of inserting a potential duplicate

    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    log.debug("Marking story ID %d as processed..." % stories_id)

    try:
        db.insert(table='processed_stories', insert_hash={'stories_id': stories_id})
    except Exception as ex:
        log.warning("Unable to insert story ID %d into 'processed_stories': %s" % (stories_id, str(ex),))
        return False
    else:
        return True
Example #28
def is_new(db: DatabaseHandler, story: dict) -> bool:
    """Return true if this story should be considered new for the given media source.

    This is used to determine whether to add a new story for a feed item URL.

    A story is new if no story with the same URL or GUID exists in the same media source, and if no story with the
    same title exists in the same media source on the same calendar day.
    """

    story = decode_object_from_bytes_if_needed(story)

    if story['title'] == '(no title)':
        return False

    db_story = db.query("""
        SELECT *
        FROM stories
        WHERE guid = %(guid)s
          AND media_id = %(media_id)s
    """, {
        'guid': story['guid'],
        'media_id': story['media_id'],
    }).hash()
    if db_story:
        return False

    db_story = db.query("""
        SELECT 1
        FROM stories
        WHERE md5(title) = md5(%(title)s)
          AND media_id = %(media_id)s

          -- We do the goofy " + interval '1 second'" to force postgres to use the stories_title_hash index
          AND date_trunc('day', publish_date)  + interval '1 second'
            = date_trunc('day', %(publish_date)s::date) + interval '1 second'
    """, {
        'title': story['title'],
        'media_id': story['media_id'],
        'publish_date': story['publish_date'],
    }).hash()
    if db_story:
        return False

    return True
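
The date_trunc('day', ...) comparison in the second query amounts to "same calendar day, regardless of time". The equivalent check in plain Python, with made-up publish dates:

import datetime

publish_date_a = datetime.datetime(2023, 5, 1, 8, 30)
publish_date_b = datetime.datetime(2023, 5, 1, 23, 55)

# truncating to the day corresponds to comparing the .date() parts
same_calendar_day = publish_date_a.date() == publish_date_b.date()
print(same_calendar_day)  # True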
Example #29
def __get_merged_stories_ids(db: DatabaseHandler, stories_ids: List[int], n: int = 0) -> List[int]:
    """Gor a given set of stories, get all the stories that are source or target merged stories in
    topic_merged_stories_map. Repeat recursively up to 10 times, or until no new stories are found."""

    stories_ids = decode_object_from_bytes_if_needed(stories_ids)

    # "The crazy load was from a query to our topic_merged_stories_ids to get
    # url variants.  It looks like we have some case of many, many merged story
    # pairs that are causing that query to make postgres sit on a cpu for a
    # super long time.  There's no good reason to query for ridiculous numbers
    # of merged stories, so I just arbitrarily capped the number of merged story
    # pairs to 20 to prevent this query from running away in the future."
    max_stories = 20

    # MC_REWRITE_TO_PYTHON: cast strings to ints
    # noinspection PyTypeChecker
    stories_ids = [int(x) for x in stories_ids]

    if len(stories_ids) == 0:
        return []

    if len(stories_ids) >= max_stories:
        return stories_ids[0:max_stories - 1]

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    stories_ids_list = ', '.join(str(x) for x in stories_ids)

    merged_stories_ids = db.query("""
        SELECT DISTINCT
            target_stories_id,
            source_stories_id
        FROM topic_merged_stories_map
        WHERE target_stories_id IN (%(stories_ids_list)s)
          OR source_stories_id IN (%(stories_ids_list)s)
        LIMIT %(max_stories)s
    """ % {
        'stories_ids_list': stories_ids_list,
        'max_stories': int(max_stories),
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if merged_stories_ids is None:
        merged_stories_ids = []
    elif isinstance(merged_stories_ids, int):
        merged_stories_ids = [merged_stories_ids]

    merged_stories_ids = [int(x) for x in merged_stories_ids]

    all_stories_ids = list(set(stories_ids + merged_stories_ids))

    if n > 10 or len(stories_ids) == len(all_stories_ids) or len(stories_ids) >= max_stories:
        return all_stories_ids

    else:
        return __get_merged_stories_ids(db=db, stories_ids=all_stories_ids, n=n + 1)
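
The control flow above (expand the ID set through merge pairs, stop when it stops growing, cap both depth and size) can be exercised without a database; a sketch over an in-memory list of merge pairs with sample data:

from typing import List, Tuple


def expand_ids(ids: List[int], pairs: List[Tuple[int, int]], n: int = 0, max_ids: int = 20) -> List[int]:
    """Repeatedly add both sides of any pair touching the current ID set, mirroring the capped recursion above."""
    if not ids:
        return []
    if len(ids) >= max_ids:
        return ids[:max_ids - 1]

    id_set = set(ids)
    merged = [x for a, b in pairs if a in id_set or b in id_set for x in (a, b)]
    all_ids = list(set(ids) | set(merged))

    if n > 10 or len(all_ids) == len(ids) or len(ids) >= max_ids:
        return all_ids

    return expand_ids(all_ids, pairs, n=n + 1, max_ids=max_ids)


print(sorted(expand_ids([1], [(1, 2), (2, 3), (4, 5)])))  # [1, 2, 3]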
Example #30
def delete_user(db: DatabaseHandler, email: str) -> None:
    """Delete user."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    # Delete the user (PostgreSQL's relation will take care of 'auth_users_roles_map')
    db.query("""
        DELETE FROM auth_users
        WHERE email = %(email)s
    """, {'email': email})
Example #31
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(b)s
            """, {'a': download['downloads_id'], 'b': old_download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
Example #33
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; if None, the title is parsed from the content
    publish_date - story publish date; if None, the date is guessed from the url and content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
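
The null-character scrubbing before add_story() is a one-liner per field; the same step in isolation, with stdlib re standing in for re2 and a made-up story dict:

import re

story = {'url': 'https://example.com/a\x00', 'guid': 'guid-1', 'title': 'Example\x00 title'}

for field in ('url', 'guid', 'title'):
    # postgres refuses to insert text values containing the null character
    story[field] = re.sub('\x00', '', story[field])

print(story['title'])  # 'Example title'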
Example #34
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> None:
    """Update the topic_fetch_url.message field in the database."""
    if _USE_TFU_DEBUG_MESSAGES:
        db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], {'message': message})
Example #35
def _add_stale_feeds(db: DatabaseHandler) -> None:
    """Add pending downloads for all stale feeds."""
    vars(_add_stale_feeds).setdefault('last_check', 0)
    if _add_stale_feeds.last_check > time.time() - STALE_FEED_CHECK_INTERVAL:
        return

    _add_stale_feeds.last_check = time.time()

    # If the table doesn't exist, PostgreSQL sends a NOTICE which breaks the "no warnings" unit test
    db.query('SET client_min_messages=WARNING')
    db.query('DROP TABLE IF EXISTS feeds_to_queue')
    db.query('SET client_min_messages=NOTICE')

    db.query(
        """
        CREATE TEMPORARY TABLE feeds_to_queue AS
        SELECT feeds_id,
               url
        FROM feeds
        WHERE active = 't'
          AND url ~ 'https?://'
          AND (
            -- Never attempted
            last_attempted_download_time IS NULL

            -- Feed was downloaded more than stale_feed_interval seconds ago
            OR (last_attempted_download_time < (NOW() - (%(a)s || ' seconds')::interval))

            -- (Probably) if a new story comes in every "n" seconds, refetch feed every "n" + 5 minutes
            OR (
                (NOW() > last_attempted_download_time +
                        (last_attempted_download_time - last_new_story_time) + interval '5 minutes')

                -- "web_page" feeds are to be downloaded only once a week,
                -- independently from when the last new story comes in from the
                -- feed (because every "web_page" feed download provides a
                -- single story)
                AND type != 'web_page'
            )
          )
        """,
        {'a': STALE_FEED_INTERVAL})

    db.query(
        """
        UPDATE feeds
        SET last_attempted_download_time = NOW()
        WHERE feeds_id IN (SELECT feeds_id FROM feeds_to_queue)
        """)

    downloads = db.query(
        """
        WITH inserted_downloads as (
            INSERT INTO downloads (feeds_id, url, host, type, sequence, state, priority, download_time, extracted)
            SELECT feeds_id,
                   url,
                   LOWER(SUBSTRING(url from '.*://([^/]*)' )),
                   'feed',
                   1,
                   'pending',
                   0,
                   NOW(),
                   false
            FROM feeds_to_queue
            RETURNING *
        )

        select d.*, f.media_id as _media_id
            from inserted_downloads d
                join feeds f using (feeds_id)
        """).hashes()

    db.query("drop table feeds_to_queue")

    log.info("added stale feeds: %d" % len(downloads))
Example #36
def get_and_store_story_stats(db: DatabaseHandler, story: dict) -> FacebookURLStats:
    """
    Get Facebook statistics for story URL, store them in a database.

    Return statistics object on success, throw exception on failure (updates the stats in the database in any case).

    :param db: Database handler.
    :param story: Story dictionary.
    :return Statistics object.
    """
    story = decode_object_from_bytes_if_needed(story)

    story_url = story['url']

    stats = None
    thrown_exception = None

    story_stats = db.query(
        "select * from story_statistics where stories_id = %(a)s",
        {'a': story['stories_id']},
    ).hash()

    # Skip stories that already failed with a Facebook API error; the row may be missing or the column may be NULL
    if story_stats and len(story_stats.get('facebook_api_error') or '') > 0:
        message = 'ignore story %d with error: %s' % (story['stories_id'], story_stats['facebook_api_error'])
        raise McFacebookSoftFailureException(message)

    try:
        stats = _get_url_stats(url=story_url)
    except Exception as ex:
        log.error(f"Statistics can't be fetched for URL '{story_url}': {ex}")
        thrown_exception = ex

    db.query("""
        INSERT INTO story_statistics (
            stories_id,
            facebook_share_count,
            facebook_comment_count,
            facebook_reaction_count,
            facebook_api_collect_date,
            facebook_api_error
        ) VALUES (
            %(stories_id)s,
            %(share_count)s,
            %(comment_count)s,
            %(reaction_count)s,
            NOW(),
            %(facebook_error)s
        ) ON CONFLICT (stories_id) DO UPDATE SET
            facebook_share_count = %(share_count)s,
            facebook_comment_count = %(comment_count)s,
            facebook_reaction_count = %(reaction_count)s,
            facebook_api_collect_date = NOW(),
            facebook_api_error = %(facebook_error)s
    """, {
        'stories_id': story['stories_id'],
        'share_count': stats.share_count if stats else None,
        'comment_count': stats.comment_count if stats else None,
        'reaction_count': stats.reaction_count if stats else None,
        'facebook_error': str(thrown_exception) if thrown_exception else None,
    })

    if thrown_exception:
        raise thrown_exception
    else:
        return stats
Example #37
def login_with_api_key(db: DatabaseHandler, api_key: str, ip_address: str) -> CurrentUser:
    """Fetch user object for the API key. Only active users are fetched."""

    api_key = decode_object_from_bytes_if_needed(api_key)
    ip_address = decode_object_from_bytes_if_needed(ip_address)

    if not api_key:
        raise McAuthLoginException("API key is undefined.")

    if not ip_address:
        # Even if provided API key is the global one, we want the IP address
        raise McAuthLoginException("IP address is undefined.")

    api_key_user = db.query("""
        SELECT auth_users.email
        FROM auth_users
            INNER JOIN auth_user_api_keys
                ON auth_users.auth_users_id = auth_user_api_keys.auth_users_id
        WHERE
            (
                auth_user_api_keys.api_key = %(api_key)s AND
                (
                    auth_user_api_keys.ip_address IS NULL
                    OR
                    auth_user_api_keys.ip_address = %(ip_address)s
                )
            )

        GROUP BY auth_users.auth_users_id,
                 auth_users.email
        ORDER BY auth_users.auth_users_id
        LIMIT 1
    """, {
        'api_key': api_key,
        'ip_address': ip_address,
    }).hash()

    if api_key_user is None or len(api_key_user) == 0:
        raise McAuthLoginException("Unable to find user for API key '%s' and IP address '%s'" % (api_key, ip_address,))

    email = api_key_user['email']

    # Check if user has tried to log in unsuccessfully before and now is trying again too fast
    if __user_is_trying_to_login_too_soon(db=db, email=email):
        raise McAuthLoginException(
            "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email
        )

    user = user_info(db=db, email=email)

    # Reset password reset token (if any)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
          AND password_reset_token_hash IS NOT NULL
    """, {'email': email})

    if not user.active():
        raise McAuthLoginException("User '%s' for API key '%s' is not active." % (email, api_key,))

    return user
Example #38
    def store_download(self, db: DatabaseHandler, download: dict,
                       content: str) -> List[int]:
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        downloads_id = download['downloads_id']

        log.info(f"Processing feed download {downloads_id}...")

        try:
            added_story_ids = self.add_stories_from_feed(db=db,
                                                         download=download,
                                                         content=content)
            story_ids_to_extract = self.return_stories_to_be_extracted_from_feed(db=db,
                                                                                 download=download,
                                                                                 content=content)

        except Exception as ex:

            error_message = f"Error processing feed for download {downloads_id}: {ex}"
            log.error(error_message)

            db.query(
                """
                UPDATE downloads
                SET state = 'feed_error',
                    error_message = %(error_message)s
                WHERE downloads_id = %(downloads_id)s
            """, {
                    'error_message': error_message,
                    'downloads_id': downloads_id,
                })

            # On non-soft errors (explicitly hard errors or unknown errors), pass the exception up
            if not isinstance(ex, McCrawlerFetcherSoftError):
                raise ex

            story_ids_to_extract = []

        else:

            if len(added_story_ids):
                last_new_story_time_sql = 'last_new_story_time = last_attempted_download_time, '
            else:
                last_new_story_time_sql = ''

            db.query(
                f"""
                UPDATE feeds
                SET {last_new_story_time_sql}
                    last_successful_download_time = GREATEST(last_successful_download_time, %(download_time)s)
                WHERE feeds_id = %(feeds_id)s
            """, {
                    'download_time': download['download_time'],
                    'feeds_id': download['feeds_id'],
                })

            # If no new stories, just store "(redundant feed)" to save storage space
            if len(added_story_ids) == 0:
                content = '(redundant feed)'

        # Reread the possibly updated download
        download = db.find_by_id(table='downloads', object_id=downloads_id)

        # Store the feed in any case
        store_content(db=db, download=download, content=content)

        log.info(f"Done processing feed download {downloads_id}")

        return story_ids_to_extract
Example #39
def find_dup_story(db: DatabaseHandler, story: dict) -> Optional[dict]:
    """Return existing duplicate story within the same media source.

    Search for a story that is a duplicate of the given story.  A story is a duplicate if it shares the same media
    source and:

    * has the same normalized title and a publish_date within the same calendar week, or
    * has a normalized guid or url that matches the given story's normalized guid or url

    If a dup story is found, insert the url and guid into the story_urls table.

    Return the found story or None if no story is found.
    """
    story = decode_object_from_bytes_if_needed(story)

    if story['title'] == '(no title)':
        return None

    urls = _get_story_url_variants(story)

    db_story = db.query(
        """
        SELECT s.*
        FROM stories s
        WHERE
            (s.guid = any( %(urls)s ) or s.url = any( %(urls)s)) and
            media_id = %(media_id)s
    """, {
            'urls': urls,
            'media_id': story['media_id'],
        }).hash()
    if db_story:
        return db_story

    # make sure that postgres uses the story_urls_url index
    db.query(
        "create temporary table _u as select stories_id from story_urls where url = any( %(a)s )",
        {'a': urls})
    db_story = db.query(
        "select * from stories s join _u u using ( stories_id ) where media_id = %(a)s order by stories_id limit 1",
        {
            'a': story['media_id']
        }).hash()
    db.query("drop table _u")
    if db_story:
        return db_story

    db_story = db.query(
        """
        SELECT *
        FROM stories
        WHERE
            (md5(title) = md5(%(title)s) OR
                normalized_title_hash = md5( get_normalized_title( %(title)s, %(media_id)s ) )::uuid)
            AND media_id = %(media_id)s

          -- We do the goofy " + interval '1 second'" to force postgres to use the stories_title_hash index
          AND date_trunc('day', publish_date)  + interval '1 second'
            = date_trunc('day', %(publish_date)s::date) + interval '1 second'
    """, {
            'title': story['title'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        }).hash()

    if db_story:
        [
            insert_story_urls(db, db_story, u)
            for u in (story['url'], story['guid'])
        ]

        return db_story

    return None
Example #40
def fetch_topic_url(db: DatabaseHandler,
                    topic_fetch_urls_id: int,
                    domain_timeout: Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db,
                             topic_fetch_url=topic_fetch_url,
                             domain_timeout=domain_timeout)

        if topic_fetch_url.get('stories_id') is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db,
                                        story,
                                        topic,
                                        redirect_url=redirect_url,
                                        assume_match=assume_match):
                    add_to_topic_stories(db, story, topic)

            # add redirect_url as a lookup url for the story, if it is different from the story url
            if redirect_url != story['url']:
                mediawords.dbi.stories.stories.insert_story_urls(
                    db, story, redirect_url)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(
            topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' %
                    (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'],
                    topic_fetch_url)
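A hedged sketch of a caller for fetch_topic_url above; the row id is a placeholder, db is assumed to be a connected DatabaseHandler, and McThrottledDomainException is the exception re-raised by the function.

# Sketch: process one queued topic_fetch_urls row, deferring on domain throttling.
queued_topic_fetch_urls_id = 123  # placeholder id taken from a job queue

try:
    fetch_topic_url(db=db, topic_fetch_urls_id=queued_topic_fetch_urls_id, domain_timeout=None)
except McThrottledDomainException:
    # the domain is being hit too hard right now; leave the row for a later retry
    log.info("domain throttled for topic_fetch_urls_id %d, will retry later" % queued_topic_fetch_urls_id)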
Example #41
def update_user(db: DatabaseHandler, user_updates: ModifyUser) -> None:
    """Update an existing user."""

    if not user_updates:
        raise McAuthProfileException("Existing user is undefined.")

    # Check if user exists
    try:
        user = user_info(db=db, email=user_updates.email())
    except Exception:
        raise McAuthProfileException('User with email address "%s" does not exist.' % user_updates.email())

    db.begin()

    if user_updates.full_name() is not None:
        db.query("""
            UPDATE auth_users
            SET full_name = %(full_name)s
            WHERE email = %(email)s
        """, {
            'full_name': user_updates.full_name(),
            'email': user_updates.email(),
        })

    if user_updates.notes() is not None:
        db.query("""
            UPDATE auth_users
            SET notes = %(notes)s
            WHERE email = %(email)s
        """, {
            'notes': user_updates.notes(),
            'email': user_updates.email(),
        })

    if user_updates.active() is not None:
        db.query("""
            UPDATE auth_users
            SET active = %(active)s
            WHERE email = %(email)s
        """, {
            'active': bool(int(user_updates.active())),
            'email': user_updates.email(),
        })

    if user_updates.has_consented() is not None:
        db.query("""
            UPDATE auth_users
            SET has_consented = %(has_consented)s
            WHERE email = %(email)s
        """, {
            'has_consented': bool(int(user_updates.has_consented())),
            'email': user_updates.email(),
        })

    if user_updates.password() is not None:
        try:
            change_password(
                db=db,
                email=user_updates.email(),
                new_password=user_updates.password(),
                new_password_repeat=user_updates.password_repeat(),
                do_not_inform_via_email=True,
            )
        except Exception as ex:
            db.rollback()
            raise McAuthProfileException("Unable to change password: %s" % str(ex))

    resource_limits = user_updates.resource_limits()
    if resource_limits:

        if resource_limits.weekly_requests() is not None:
            db.query("""
                UPDATE auth_user_limits
                SET weekly_requests_limit = %(weekly_requests_limit)s
                WHERE auth_users_id = %(auth_users_id)s
            """, {
                'weekly_requests_limit': resource_limits.weekly_requests(),
                'auth_users_id': user.user_id(),
            })

        if resource_limits.weekly_requested_items() is not None:
            db.query("""
                UPDATE auth_user_limits
                SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
                WHERE auth_users_id = %(auth_users_id)s
            """, {
                'weekly_requested_items_limit': resource_limits.weekly_requested_items(),
                'auth_users_id': user.user_id(),
            })

        if resource_limits.max_topic_stories() is not None:
            db.query("""
                UPDATE auth_user_limits
                SET max_topic_stories = %(max_topic_stories)s
                WHERE auth_users_id = %(auth_users_id)s
            """, {
                'max_topic_stories': resource_limits.max_topic_stories(),
                'auth_users_id': user.user_id(),
            })

    if user_updates.role_ids() is not None:
        db.query("""
            DELETE FROM auth_users_roles_map
            WHERE auth_users_id = %(auth_users_id)s
        """, {'auth_users_id': user.user_id()})

        for auth_roles_id in user_updates.role_ids():
            db.insert(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })

    db.commit()
Example #42
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" %
                  (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException(
                    "Unable to find matching story after unique constraint error."
                )
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
Example #43
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:_MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)
    title = mediawords.util.html.html_title(content, url, _MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re.sub('\x00', '', story[field])

    date_guess = guess_date(url, content)
    story['publish_date'] = date_guess.date if date_guess.found else fallback_date
    if story['publish_date'] is None:
        story['publish_date'] = datetime.datetime.now().isoformat()

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException as e:
        raise McTMStoriesDuplicateException("Attempt to insert duplicate story url %s" % url)
    except Exception as e:
        raise McTMStoriesException("Error adding story: %s" % traceback.format_exc())

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']})

    download = create_download_for_new_story(db, story, feed)

    download = mediawords.dbi.downloads.store_content(db, download, content)

    return story
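A short usage sketch for generate_story (illustrative only): db is assumed to be a connected DatabaseHandler, and the URL and HTML content are placeholders.

import datetime

# Sketch: create a story from already-fetched content, falling back to a fixed date
# if the date guesser finds nothing.
html = "<html><head><title>Example story</title></head><body>...</body></html>"

story = generate_story(
    db=db,
    url='https://example.com/2020/01/01/example-story',
    content=html,
    fallback_date=datetime.datetime(2020, 1, 1),
)
log.info("created stories_id %d" % story['stories_id'])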
Example #44
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        log.info(f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching")
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info("URL is in pending state, returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("Seeded content found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        log.info("Content matched")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("Nothing matched, generating story...")

        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:

            log.info("Duplicate story found, checking for story match on unique constraint error...")

            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
Example #45
def copy_stories_to_topic(db: DatabaseHandler, source_topics_id: int,
                          target_topics_id: int) -> None:
    """Copy stories from source_topics_id into seed_urls for target_topics_id."""
    message = "copy_stories_to_topic: %s -> %s [%s]" % (
        source_topics_id, target_topics_id, datetime.datetime.now())

    log.info("querying novel urls from source topic...")

    db.query("set work_mem = '8GB'")

    db.query(
        """
        create temporary table _stories as
            select distinct stories_id from topic_seed_urls where topics_id = %(a)s and stories_id is not null;
        create temporary table _urls as
            select distinct url from topic_seed_urls where topics_id = %(a)s;
        """, {'a': target_topics_id})

    # noinspection SqlResolve
    db.query(
        """
        create temporary table _tsu as
            select %(target)s topics_id, url, stories_id, %(message)s source
                from snap.live_stories s
                where
                    s.topics_id = %(source)s and
                    s.stories_id not in ( select stories_id from _stories ) and
                    s.url not in ( select url from _urls )
        """, {
            'target': target_topics_id,
            'source': source_topics_id,
            'message': message
        })

    # noinspection SqlResolve
    (num_inserted, ) = db.query("select count(*) from _tsu").flat()

    log.info("inserting %d urls ..." % num_inserted)

    # noinspection SqlInsertValues,SqlResolve
    db.query(
        "insert into topic_seed_urls ( topics_id, url, stories_id, source ) select * from _tsu"
    )

    # noinspection SqlResolve
    db.query("drop table _stories; drop table _urls; drop table _tsu;")
Example #46
def add_story(db: DatabaseHandler, story: dict,
              feeds_id: int) -> Optional[dict]:
    """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it.

    Returns the found or created story. Sets is_new = True on the story if it was created by this call.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    db_story = find_dup_story(db, story)
    if db_story:
        log.debug("found existing dup story: %s [%s]" %
                  (story['title'], story['url']))
        db.commit()
        return db_story

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    story['is_new'] = True

    for u in (story['url'], story['guid']):
        insert_story_urls(db, story, u)

    # on conflict does not work with partitioned feeds_stories_map
    db.query(
        """
        insert into feeds_stories_map_p ( feeds_id, stories_id )
            select %(a)s, %(b)s where not exists (
                select 1 from feeds_stories_map where feeds_id = %(a)s and stories_id = %(b)s )
        """, {
            'a': feeds_id,
            'b': story['stories_id']
        })

    db.commit()

    log.debug("added story: %s" % story['url'])

    return story
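A hedged usage sketch for the add_story variant above: db is assumed to be a connected DatabaseHandler, the feeds_id and story fields are placeholders, and the three outcomes mirror the function's return contract.

# Sketch: add a feed story unless it is a duplicate; None means a GUID conflict.
feed_story = {
    'media_id': 1,
    'url': 'https://example.com/new-story',
    'guid': 'https://example.com/new-story',
    'title': 'New story title',
    'description': 'Short description from the feed.',
    'publish_date': '2020-01-01 12:00:00',
}

added = add_story(db, feed_story, feeds_id=42)
if added is None:
    log.info("story skipped because of a GUID conflict")
elif added.get('is_new'):
    log.info("created new stories_id %d" % added['stories_id'])
else:
    log.info("matched existing stories_id %d" % added['stories_id'])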
Example #47
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Parse the feed content; create a story dict for each parsed story; check for a new URL since the last feed
        download; if there is a new URL, check whether each story is new, and if so add it to the database and add a
        pending download for it. Return new stories that were found in the feed.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        media_id = get_media_id(db=db, download=download)
        download_time = download['download_time']

        try:
            stories = self._get_stories_from_syndicated_feed(
                content=content,
                media_id=media_id,
                download_time=download_time,
            )
        except Exception as ex:
            raise McCrawlerFetcherSoftError(
                f"Error processing feed for {download['url']}: {ex}")

        if stories_checksum_matches_feed(db=db,
                                         feeds_id=download['feeds_id'],
                                         stories=stories):
            return []

        new_story_ids = []
        for story in stories:

            # FIXME: none of the helpers accept keys they don't know about
            story_without_enclosures = story.copy()
            story_without_enclosures.pop('enclosures')

            if self._add_content_download_for_new_stories():
                added_story = add_story_and_content_download(
                    db=db,
                    story=story_without_enclosures,
                    parent_download=download,
                )
            else:
                added_story = add_story(
                    db=db,
                    story=story_without_enclosures,
                    feeds_id=download['feeds_id'],
                )

            # We might have received None due to a GUID conflict
            if added_story:

                stories_id = added_story['stories_id']
                story_is_new = added_story.get('is_new', False)

                if story_is_new:

                    # Add all of the enclosures
                    for enclosure in story['enclosures']:
                        # ...provided that the URL is set
                        if enclosure['url']:

                            db.query(
                                """
                                INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                                VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)
                                
                                -- Some stories have multiple enclosures pointing to the same URL
                                ON CONFLICT (stories_id, url) DO NOTHING
                            """, {
                                    'stories_id': stories_id,
                                    'url': enclosure['url'],
                                    'mime_type': enclosure['mime_type'],
                                    'length': enclosure['length'],
                                })

                    # Append to the list of newly added stories
                    new_story_ids.append(stories_id)

        log.info(
            f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}"
        )

        return new_story_ids
Example #48
def get_story_match(
        db: DatabaseHandler,
        url: str,
        redirect_url: typing.Optional[str] = None) -> typing.Optional[dict]:
    """Search for any story within the database that matches the given url.

    Searches for any story whose guid or url matches either the url or redirect_url or the
    mediawords.util.url.normalize_url_lossy() version of either.

    If multiple stories are found, use get_preferred_story() to decide which story to return.

    Only match the first _MAX_URL_LENGTH characters of the url / redirect_url.

    Arguments:
    db - db handle
    url - story url
    redirect_url - optional url to which the story url redirects

    Returns:
    the matched story or None

    """
    u = url[0:_MAX_URL_LENGTH]

    ru = ''
    if not ignore_redirect(db, url, redirect_url):
        ru = redirect_url[0:_MAX_URL_LENGTH] if redirect_url is not None else u

    nu = mediawords.util.url.normalize_url_lossy(u)
    nru = mediawords.util.url.normalize_url_lossy(ru)

    urls = list({u, ru, nu, nru})

    # for some reason some rare urls trigger a seq scan on the below query
    db.query("set enable_seqscan=off")

    # look for matching stories, ignore those in foreign_rss_links media, only get last
    # 100 to avoid hanging job trying to handle potentially thousands of matches
    stories = db.query(
        """
with matching_stories as (
    select distinct(s.*) from stories s
            join media m on s.media_id = m.media_id
        where
            ( ( s.url = any( %(a)s ) ) or
                ( s.guid = any ( %(a)s ) ) ) and
            m.foreign_rss_links = false

    union

    select distinct(s.*) from stories s
            join media m on s.media_id = m.media_id
            join topic_seed_urls csu on s.stories_id = csu.stories_id
        where
            csu.url = any ( %(a)s ) and
            m.foreign_rss_links = false
)

select distinct(ms.*)
    from matching_stories ms
    order by collect_date desc
    limit 100
        """, {
            'a': urls
        }).hashes()

    db.query("set enable_seqscan=on")

    if len(stories) == 0:
        return None

    story = get_preferred_story(db, stories)

    return story
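A hedged usage sketch for get_story_match; the URLs are placeholders and db is assumed to be a connected DatabaseHandler.

# Sketch: look for an existing story by its original URL and its post-redirect URL.
match = get_story_match(
    db=db,
    url='http://example.com/short-link',
    redirect_url='https://www.example.com/2020/01/01/full-article',
)
if match is None:
    log.info("no existing story matched")
else:
    log.info("matched stories_id %d" % match['stories_id'])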
Example #49
def add_story(db: DatabaseHandler,
              story: dict,
              feeds_id: int,
              skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(
            skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story
Example #50
def lookup_medium(db: DatabaseHandler, url: str,
                  name: str) -> typing.Optional[dict]:
    """Lookup a media source by normalized url and then name.

    Uses normalize_url_lossy() to normalize urls.  Returns the parent media for duplicate media
    sources and returns no media that are marked foreign_rss_links.

    This function queries the media.normalized_url field to find the matching urls.  Because the normalization
    function is in python, we have to keep that denormalized field current from within python.  This function
    is responsible for keeping the table up to date by filling the field for any media for which it is null.

    Arguments:
    db - db handle
    url - url to lookup
    name - name to lookup

    Returns:
    a media source dict or None

    """
    _update_media_normalized_urls(db)

    normalized_url = _normalize_url(url)

    medium = db.query(
        """
        SELECT *
        FROM media
        where
            normalized_url = %(a)s AND
            foreign_rss_links = 'f'
        ORDER BY
            dup_media_id NULLS LAST,
            media_id
    """, {
            'a': normalized_url
        }).hash()

    if medium is None:
        medium = db.query(
            """
            SELECT *
            FROM media
            WHERE
                lower(name) = lower(%(a)s) AND
                foreign_rss_links = false
            """, {
                'a': name
            }).hash()

    if medium is None:
        return None

    if medium['dup_media_id'] is not None:

        media_cycle_lookup = dict()  # type: dict
        while medium['dup_media_id'] is not None:
            if medium['media_id'] in media_cycle_lookup:
                raise McTopicMediaException(
                    'Cycle found in duplicate media path: ' +
                    str(media_cycle_lookup.keys()))
            media_cycle_lookup[medium['media_id']] = True

            medium = db.query("select * from media where media_id = %(a)s", {
                'a': medium['dup_media_id']
            }).hash()

    if medium['foreign_rss_links']:
        raise McTopicMediaException(
            'Parent duplicate media source %d has foreign_rss_links' %
            medium['media_id'])

    return medium
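A hedged usage sketch for lookup_medium; the URL and name are placeholders and db is assumed to be a connected DatabaseHandler.

# Sketch: resolve a media source by normalized URL first, then by case-insensitive name.
medium = lookup_medium(db, url='https://www.example.com/', name='Example News')
if medium is None:
    log.info("no matching media source found")
else:
    log.info("resolved to media_id %d" % medium['media_id'])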
Example #51
def login_with_email_password(db: DatabaseHandler, email: str, password: str, ip_address: str = None) -> CurrentUser:
    """Log in with username and password; raise on unsuccessful login."""

    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)

    if not (email and password):
        raise McAuthLoginException("Email and password must be defined.")

    # Try-except block because we don't want to reveal the specific reason why the login has failed
    try:

        user = user_info(db=db, email=email)

        # Check if user has tried to log in unsuccessfully before and now is trying
        # again too fast
        if __user_is_trying_to_login_too_soon(db=db, email=email):
            raise McAuthLoginException(
                "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email
            )

        if not password_hash_is_valid(password_hash=user.password_hash(), password=password):
            raise McAuthLoginException("Password for user '%s' is invalid." % email)

    except Exception as ex:
        log.info(
            "Login failed for %(email)s, will delay any successive login attempt for %(delay)d seconds: %(exc)s" % {
                'email': email,
                'delay': __POST_UNSUCCESSFUL_LOGIN_DELAY,
                'exc': str(ex),
            }
        )

        # Set the unsuccessful login timestamp
        # (TIMESTAMP 'now' returns "current transaction's start time", so using LOCALTIMESTAMP instead)
        db.query("""
            UPDATE auth_users
            SET last_unsuccessful_login_attempt = LOCALTIMESTAMP
            WHERE email = %(email)s
        """, {'email': email})

        # It might make sense to time.sleep() here for the duration of $POST_UNSUCCESSFUL_LOGIN_DELAY seconds to prevent
        # legitimate users from trying to log in too fast. However, when being actually brute-forced through multiple
        # HTTP connections, this approach might end up creating a lot of processes that would time.sleep() and take up
        # memory.
        #
        # So, let's return the error page ASAP and hope that a legitimate user won't be able to reenter his / her
        # password before the $POST_UNSUCCESSFUL_LOGIN_DELAY amount of seconds pass.

        # Don't give out a specific reason for the user to not be able to find
        # out which user emails are registered
        raise McAuthLoginException("User '%s' was not found or password is incorrect." % email)

    if not user.active():
        raise McAuthLoginException("User with email '%s' is not active." % email)

    # Reset password reset token (if any)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
          AND password_reset_token_hash IS NOT NULL
    """, {'email': email})

    if ip_address:
        if not user.api_key_for_ip_address(ip_address):
            db.create(
                table='auth_user_api_keys',
                insert_hash={
                    'auth_users_id': user.user_id(),
                    'ip_address': ip_address,
                })

            # Fetch user again
            user = user_info(db=db, email=email)

            if not user.api_key_for_ip_address(ip_address):
                raise McAuthLoginException("Unable to create per-IP API key for IP %s" % ip_address)

    return user
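A hedged usage sketch for login_with_email_password; the credentials and IP address are placeholders, db is assumed to be a connected DatabaseHandler, and McAuthLoginException is the exception raised by the function above.

# Sketch: authenticate a user and report a generic failure message.
try:
    user = login_with_email_password(
        db=db,
        email='user@example.com',
        password='correct horse battery staple',
        ip_address='203.0.113.7',
    )
    log.info("logged in auth_users_id %d" % user.user_id())
except McAuthLoginException as ex:
    log.warning("login failed: %s" % str(ex))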
Example #52
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""

    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query(
        """
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {
            'email': new_user.email()
        }).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." %
                                      new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as _:
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(table='auth_users',
              insert_hash={
                  'email': new_user.email(),
                  'password_hash': password_hash,
                  'full_name': new_user.full_name(),
                  'notes': new_user.notes(),
                  'active': bool(int(new_user.active())),
              })

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException(
            "I've attempted to create the user but it doesn't exist: %s" %
            str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map',
                      insert_hash={
                          'auth_users_id': user.user_id(),
                          'auth_roles_id': auth_roles_id,
                      })
    except Exception as ex:
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'auth_users_id': user.user_id(),
                'weekly_requests_limit': new_user.weekly_requests_limit(),
            })

    if new_user.weekly_requested_items_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'auth_users_id':
                user.user_id(),
                'weekly_requested_items_limit':
                new_user.weekly_requested_items_limit(),
            })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter',
                  insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
Example #53
def get_preferred_story(db: DatabaseHandler, stories: list) -> dict:
    """Given a set of possible story matches, find the story that is likely the best to include in the topic.

    The best story is the one that belongs to the media source that sorts first according to the following
    criteria, in descending order of importance:

    * pointed to by some dup_media_id
    * without a dup_media_id
    * url domain matches that of the story
    * lower media_id

    Within a media source, the preferred story is the one with the most sentences.

    Arguments:
    db - db handle
    stories - list of stories from which to choose

    Returns:
    a single preferred story

    """
    assert len(stories) > 0

    if len(stories) == 1:
        return stories[0]

    log.debug("get_preferred_story: %d stories" % len(stories))

    media = db.query(
        """
        select *,
                exists ( select 1 from media d where d.dup_media_id = m.media_id ) as is_dup_target
            from media m
            where media_id = any(%(a)s)
        """, {
            'a': [s['media_id'] for s in stories]
        }).hashes()

    story_urls = [s['url'] for s in stories]

    for medium in media:
        # is_dup_target defined in query above
        medium['is_dup_target'] = 0 if medium['is_dup_target'] else 1
        medium['is_not_dup_source'] = 1 if medium['dup_media_id'] else 0
        medium['matches_domain'] = 0 if _url_domain_matches_medium(
            medium, story_urls) else 1
        medium['stories'] = list(
            filter(lambda s: s['media_id'] == medium['media_id'], stories))

    sorted_media = sorted(media,
                          key=operator.itemgetter('is_dup_target',
                                                  'is_not_dup_source',
                                                  'matches_domain',
                                                  'media_id'))

    preferred_story = _get_story_with_most_sentences(
        db, sorted_media[0]['stories'])

    return preferred_story
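The four-part sort key above prefers media with the lowest tuple values; a small self-contained illustration of that ordering with made-up media rows (not real data) follows.

import operator

# Sketch: the same sort key applied to three made-up media rows.
media = [
    {'media_id': 7, 'is_dup_target': 1, 'is_not_dup_source': 0, 'matches_domain': 1},
    {'media_id': 3, 'is_dup_target': 0, 'is_not_dup_source': 1, 'matches_domain': 0},
    {'media_id': 5, 'is_dup_target': 0, 'is_not_dup_source': 0, 'matches_domain': 0},
]

ranked = sorted(media, key=operator.itemgetter(
    'is_dup_target', 'is_not_dup_source', 'matches_domain', 'media_id'))

# media_id 5 wins: it is a dup target (0), has no dup_media_id (0) and matches the story domain (0)
print([m['media_id'] for m in ranked])  # [5, 3, 7]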
Example #54
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep media_normalized_urls table up to date.

    This function compares the media and versions in media_normalized_urls against the version returned
    by mediawords.util.url.normalize_url_lossy_version() and updates or inserts rows for any media that do not
    have up to date versions.

    """
    if not _normalized_urls_out_of_date(db):
        return

    # put a lock on this because the process of generating all media urls will take around 30 seconds, and we don't
    # want all workers to do the work
    db.begin()
    db.query("lock media_normalized_urls in access exclusive mode")

    if not _normalized_urls_out_of_date(db):
        db.commit()
        return

    log.warning("updating media_normalized_urls ...")

    version = mediawords.util.url.normalize_url_lossy_version()

    media = db.query(
        """
        select m.*
            from media m
                left join media_normalized_urls u on
                    ( m.media_id = u.media_id and u.normalize_url_lossy_version = %(a)s)
            where
                u.normalized_url is null or
                u.db_row_last_updated < m.db_row_last_updated
        """, {
            'a': version
        }).hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.query(
            "delete from media_normalized_urls where media_id = %(a)s and normalize_url_lossy_version = %(b)s",
            {
                'a': medium['media_id'],
                'b': version
            })
        db.create(
            'media_normalized_urls', {
                'media_id': medium['media_id'],
                'normalized_url': normalized_url,
                'normalize_url_lossy_version': version
            })

    db.commit()
Example #55
def lookup_medium(db: DatabaseHandler, url: str,
                  name: str) -> typing.Optional[dict]:
    """Lookup a media source by normalized url and then name.

    Uses mediawords.util.url.normalize_url_lossy to normalize urls.  Returns the parent media for duplicate media
    sources and returns no media that are marked foreign_rss_links.

    This function queries the media_normalized_urls table to find the matching urls.  Because the normalization
    function is in python, we have to keep that denormalized table current from within python.  This function
    is responsible for keeping the table up to date by comparing the normalize_url_lossy_version values
    in the table with the current return value of mediawords.util.url.normalize_url_lossy_version().

    Arguments:
    db - db handle
    url - url to lookup
    name - name to lookup

    Returns:
    a media source dict or None

    """
    _update_media_normalized_urls(db)

    nu = _normalize_url(url)
    version = mediawords.util.url.normalize_url_lossy_version()

    lookup_query = \
        """
        select m.*
            from media m
                join media_normalized_urls u using ( media_id )
            where
                u.normalized_url = %(a)s and
                u.normalize_url_lossy_version = %(b)s and
                foreign_rss_links = 'f'
            order by dup_media_id asc nulls last, media_id asc
        """

    medium = db.query(lookup_query, {'a': nu, 'b': version}).hash()

    if medium is None:
        medium = db.query(
            "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false",
            {
                'a': name
            }).hash()

    if medium is None:
        return None

    if medium['dup_media_id'] is not None:

        media_cycle_lookup = dict()  # type: dict
        while medium['dup_media_id'] is not None:
            if medium['media_id'] in media_cycle_lookup:
                raise McTopicMediaException(
                    'Cycle found in duplicate media path: ' +
                    str(media_cycle_lookup.keys()))
            media_cycle_lookup[medium['media_id']] = True

            medium = db.query("select * from media where media_id = %(a)s", {
                'a': medium['dup_media_id']
            }).hash()

    if medium['foreign_rss_links']:
        raise McTopicMediaException(
            'Parent duplicate media source %d has foreign_rss_links' %
            medium['media_id'])

    return medium
Example #56
def _insert_story_sentences(
    db: DatabaseHandler,
    story: dict,
    sentences: List[str],
    no_dedup_sentences: bool = False,
) -> List[str]:
    """Insert the story sentences into story_sentences, optionally skipping duplicate sentences by setting is_dup = 't'
    to the found duplicates that are already in the table.

    Returns list of sentences that were inserted into the table.
    """

    story = decode_object_from_bytes_if_needed(story)
    sentences = decode_object_from_bytes_if_needed(sentences)
    if isinstance(no_dedup_sentences, bytes):
        no_dedup_sentences = decode_object_from_bytes_if_needed(
            no_dedup_sentences)
    no_dedup_sentences = bool(int(no_dedup_sentences))

    stories_id = story['stories_id']
    media_id = story['media_id']

    # Story's publish date is the same for all the sentences, so we might as well pass it as a constant
    escaped_story_publish_date = db.quote_date(story['publish_date'])

    if len(sentences) == 0:
        log.warning(f"Story sentences are empty for story {stories_id}")
        return []

    if no_dedup_sentences:
        log.debug(
            f"Won't de-duplicate sentences for story {stories_id} because 'no_dedup_sentences' is set"
        )

        dedup_sentences_statement = """

            -- Nothing to deduplicate, return empty list
            SELECT NULL
            WHERE 1 = 0

        """

    else:

        # Limit to unique sentences within a story
        sentences = _get_unique_sentences_in_story(sentences)

        # Set is_dup = 't' to sentences already in the table, return those to be later skipped on INSERT of new
        # sentences
        dedup_sentences_statement = f"""

            -- noinspection SqlResolve
            UPDATE story_sentences
            SET is_dup = 't'
            FROM new_sentences
            WHERE public.half_md5(story_sentences.sentence) = public.half_md5(new_sentences.sentence)
              AND public.week_start_date(story_sentences.publish_date::date) = public.week_start_date({escaped_story_publish_date})
              AND story_sentences.media_id = new_sentences.media_id
            RETURNING story_sentences.sentence

        """

    # Convert to list of dicts (values escaped for insertion into database)
    sentence_dicts = _get_db_escaped_story_sentence_dicts(db=db,
                                                          story=story,
                                                          sentences=sentences)

    # Ordered list of columns
    story_sentences_columns = sorted(sentence_dicts[0].keys())
    str_story_sentences_columns = ', '.join(story_sentences_columns)

    # List of sentences (in predefined column order)
    new_sentences_sql = []
    for sentence_dict in sentence_dicts:
        new_sentence_sql = []
        for column in story_sentences_columns:
            new_sentence_sql.append(sentence_dict[column])
        new_sentences_sql.append(f"({', '.join(new_sentence_sql)})")
    str_new_sentences_sql = "\n{}".format(",\n".join(new_sentences_sql))

    # sometimes the big story_sentences query below deadlocks or gets stuck in an idle state, holding this lock, so we
    # set a short idle timeout for postgres just while we do this query. the timeout should not kick in while the
    # big story_sentences query is actively processing, so we can set it pretty short. we usually set this timeout
    # to 0 globally, but just to be safe store and reset the pre-existing value.
    idle_timeout = db.query(
        "SHOW idle_in_transaction_session_timeout").flat()[0]
    db.query("SET idle_in_transaction_session_timeout = 5000")

    db.query('SET citus.max_adaptive_executor_pool_size TO 64')

    sql = f"""
        -- noinspection SqlType,SqlResolve
        WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
            -- New sentences to potentially insert
            {str_new_sentences_sql}
        )

        -- Either list of duplicate sentences already found in the table or return an empty list if deduplication is
        -- disabled
        --
        -- The query assumes that there are no existing sentences for this story in the "story_sentences" table, so
        -- if you are reextracting a story, DELETE its sentences from "story_sentences" before running this query.
        {dedup_sentences_statement}

    """
    log.debug(f"Running 'UPDATE story_sentences SET is_dup' query:\n{sql}")
    duplicate_sentences = db.query(sql).flat()

    duplicate_sentences = [
        db.quote_varchar(sentence) for sentence in duplicate_sentences
    ]

    sql = f"""
        -- noinspection SqlType,SqlResolve
        WITH new_sentences ({str_story_sentences_columns}) AS (VALUES
            {str_new_sentences_sql}
        ),
        duplicate_sentences AS (
            SELECT unnest(ARRAY[{', '.join(duplicate_sentences)}]::TEXT[]) AS sentence
        )
        INSERT INTO story_sentences (language, media_id, publish_date, sentence, sentence_number, stories_id)
        SELECT language, media_id, publish_date, sentence, sentence_number, stories_id
        FROM new_sentences
        WHERE sentence NOT IN (
            -- Skip the ones for which we've just set is_dup = 't'
            SELECT sentence
            FROM duplicate_sentences
        )
        RETURNING story_sentences.sentence
    """
    log.debug(f"Running 'INSERT INTO story_sentences' query:\n{sql}")
    inserted_sentences = db.query(sql).flat()

    db.query("SET idle_in_transaction_session_timeout = %(a)s",
             {'a': idle_timeout})

    return inserted_sentences
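A hedged usage sketch for _insert_story_sentences; the stories_id and sentences are placeholders and db is assumed to be a connected DatabaseHandler. In the real pipeline the caller would delete any existing sentences for the story first, as the query comment above notes.

# Sketch: insert sentences for a freshly extracted story, letting the helper mark duplicates.
story = db.require_by_id('stories', 12345)  # placeholder stories_id
sentences = [
    'First sentence of the extracted story.',
    'Second sentence of the extracted story.',
]

inserted = _insert_story_sentences(db=db, story=story, sentences=sentences)
log.info("inserted %d of %d sentences" % (len(inserted), len(sentences)))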
Example #57
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        return

    if re.search(mediawords.tm.extract_story_links.IGNORE_LINK_PATTERN,
                 topic_fetch_url['url'],
                 flags=re.I) is not None:
        topic_fetch_url['state'] = FETCH_STATE_IGNORE
        topic_fetch_url['code'] = 403
        return

    log.warning("PASSED IGNORE")

    failed_url = get_failed_url(db, topic_fetch_url['topics_id'],
                                topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(
            db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # get content from either the seed or by fetching it
    response = get_seeded_content(db, topic_fetch_url)
    if response is None:
        response = fetch_url(db,
                             topic_fetch_url['url'],
                             domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" %
                  (response.code(), topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.decoded_content()

    fetched_url = topic_fetch_url['url']
    response_url = response.request().url() if response.request() else None

    if fetched_url != response_url:
        story_match = mediawords.tm.stories.get_story_match(
            db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code()

    if not response.is_success():
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message()
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content,
            topic=topic,
            assume_match=topic_fetch_url['assume_match']):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db,
                                                         content=content,
                                                         url=url)
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED
            topic_fetch_url['stories_id'] = story['stories_id']
        except mediawords.tm.stories.McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = mediawords.tm.stories.get_story_match(
                db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException(
                    "Unable to find matching story after unique constraint error."
                )
            topic_fetch_url['stories_id'] = story_match['stories_id']
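
# --- Illustrative sketch (not part of the original function) ---
# The ignore check above boils down to a case-insensitive regex match against the URL. A self-contained
# version is sketched below; the pattern here is a made-up stand-in, the real one lives in
# mediawords.tm.extract_story_links.IGNORE_LINK_PATTERN.
import re

HYPOTHETICAL_IGNORE_LINK_PATTERN = r'(twitter\.com/intent|/share\?|\.pdf$)'


def should_ignore_url(url: str) -> bool:
    """Return True if the URL matches the ignore pattern, mirroring the check in _try_fetch_topic_url()."""
    return re.search(HYPOTHETICAL_IGNORE_LINK_PATTERN, url, flags=re.I) is not None


assert should_ignore_url('https://twitter.com/intent/tweet?text=hello')
assert not should_ignore_url('https://example.com/news/some-story')
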
Example #58
def update_story_sentences_and_language(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating new
    ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).
    """

    story = decode_object_from_bytes_if_needed(story)

    use_transaction = not db.in_transaction()

    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
        if not story_text:
            story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text,
                                               story_lang=story_lang)

    if (not story.get('language', None)) or story.get('language', None) != story_lang:
        db.query("""
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
        """, {'stories_id': stories_id, 'story_lang': story_lang})
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException(
            "Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))
        if use_transaction:
            # Close the transaction we opened above; otherwise the DELETE issued by
            # _delete_story_sentences() would be left pending on this connection.
            db.commit()
        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()
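
# --- Illustrative sketch (not part of the original function) ---
# The use_transaction flag above opens and commits a transaction only when the caller has not already
# started one. The same pattern can be wrapped in a context manager; this is a hedged sketch that assumes
# the handler also exposes a rollback() method, which is not shown in the snippets here.
from contextlib import contextmanager


@contextmanager
def transaction_unless_active(db):
    """Begin/commit a transaction only if one is not already open; roll back on error if we opened it."""
    started_here = not db.in_transaction()
    if started_here:
        db.begin()
    try:
        yield
        if started_here:
            db.commit()
    except Exception:
        if started_here:
            db.rollback()  # assumed to exist on DatabaseHandler
        raise
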
Example #59
def _fetch_tweets_for_day(db: DatabaseHandler,
                          twitter_class: typing.Type[AbstractTwitter],
                          topic: dict,
                          topic_tweet_day: dict,
                          max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if max_tweets is not None:
        ch_posts = ch_posts[:max_tweets]

    log.info("adding %d tweets for topic %s, day %s" %
             (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p),
                           ch_posts))

    log.info("%d tweets remaining after match" % (len(ch_posts)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for ch_post in ch_posts:
        _store_tweet_and_urls(db, topic_tweet_day, ch_post)

    topic_tweet_day['num_ch_tweets'] = len(ch_posts)

    db.query("""
        UPDATE topic_tweet_days
        SET tweets_fetched = true,
            num_ch_tweets = %(a)s
        WHERE topic_tweet_days_id = %(b)s
    """, {
        'a': topic_tweet_day['num_ch_tweets'],
        'b': topic_tweet_day['topic_tweet_days_id']
    })

    db.commit()

    log.debug("done inserting into topic_tweets")
Example #60
def user_info(db: DatabaseHandler, email: str) -> CurrentUser:
    """Fetch user information (email, full name, notes, API keys, password hash).

    Raises on error if user is not found.
    """

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthInfoException("User email is not defined.")

    # Fetch read-only information about the user
    user = db.query(
        """
        SELECT auth_users.auth_users_id,
               auth_users.email,
               auth_users.full_name,
               auth_users.notes,
               EXTRACT(EPOCH FROM NOW())::BIGINT AS created_timestamp,
               auth_users.active,
               auth_users.password_hash,
               auth_user_api_keys.api_key,
               auth_user_api_keys.ip_address,
               weekly_requests_sum,
               weekly_requested_items_sum,
               auth_user_limits.weekly_requests_limit,
               auth_user_limits.weekly_requested_items_limit,
               auth_roles.auth_roles_id,
               auth_roles.role

        FROM auth_users
            INNER JOIN auth_user_api_keys
                ON auth_users.auth_users_id = auth_user_api_keys.auth_users_id
            INNER JOIN auth_user_limits
                ON auth_users.auth_users_id = auth_user_limits.auth_users_id
            LEFT JOIN auth_users_roles_map
                ON auth_users.auth_users_id = auth_users_roles_map.auth_users_id
            LEFT JOIN auth_roles
                ON auth_users_roles_map.auth_roles_id = auth_roles.auth_roles_id,
            auth_user_limits_weekly_usage( %(email)s )

        WHERE auth_users.email = %(email)s
    """, {
            'email': email
        }).hashes()
    if user is None or len(user) == 0:
        raise McAuthInfoException("User with email '%s' was not found." %
                                  email)

    unique_api_keys = dict()
    unique_roles = dict()

    for row in user:
        # Should have at least one API key
        unique_api_keys[row['api_key']] = row['ip_address']

        # Might have some roles
        if row['auth_roles_id'] is not None:
            unique_roles[row['auth_roles_id']] = row['role']

    api_keys = []
    for api_key in sorted(unique_api_keys.keys()):
        api_keys.append(
            APIKey(api_key=api_key, ip_address=unique_api_keys[api_key]))

    roles = []
    for role_id in sorted(unique_roles.keys()):
        roles.append(Role(role_id=role_id, role_name=unique_roles[role_id]))

    first_row = user[0]

    return CurrentUser(
        user_id=first_row['auth_users_id'],
        email=email,
        full_name=first_row['full_name'],
        notes=first_row['notes'],
        created_timestamp=first_row['created_timestamp'],
        active=bool(int(first_row['active'])),
        password_hash=first_row['password_hash'],
        roles=roles,
        api_keys=api_keys,
        weekly_requests_limit=first_row['weekly_requests_limit'],
        weekly_requested_items_limit=first_row['weekly_requested_items_limit'],
        weekly_requests_sum=first_row['weekly_requests_sum'],
        weekly_requested_items_sum=first_row['weekly_requested_items_sum'],
    )
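
# --- Illustrative sketch (not part of the original function) ---
# The joins above return one row per (API key, role) combination, so user_info() collapses them with two
# dicts before building the CurrentUser object. The collapsing step on its own, using plain dicts in place
# of the query rows, looks roughly like this (a hedged standalone example, not project code):
def collapse_user_rows(rows: list) -> tuple:
    """Deduplicate joined rows into sorted (api_key, ip_address) and (role_id, role_name) lists."""
    unique_api_keys = {}
    unique_roles = {}

    for row in rows:
        unique_api_keys[row['api_key']] = row['ip_address']
        if row['auth_roles_id'] is not None:
            unique_roles[row['auth_roles_id']] = row['role']

    api_keys = [(key, unique_api_keys[key]) for key in sorted(unique_api_keys)]
    roles = [(role_id, unique_roles[role_id]) for role_id in sorted(unique_roles)]
    return api_keys, roles


rows = [
    {'api_key': 'abc123', 'ip_address': None, 'auth_roles_id': 1, 'role': 'admin'},
    {'api_key': 'abc123', 'ip_address': None, 'auth_roles_id': 2, 'role': 'search'},
]
assert collapse_user_rows(rows) == ([('abc123', None)], [(1, 'admin'), (2, 'search')])
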