Example #1
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward.  so we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #2
0
    def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None:
        """
        Update the state and message fields in the given table for the row whose '<table>_id' field matches that field
        in the job args.
        """
        job_state = decode_object_from_bytes_if_needed(job_state)

        try:
            args = decode_json(job_state.get('args', ''))
        except Exception as ex:
            log.error(f"Unable to decode args from job state {job_state}: {ex}")
            return

        extra_table = self.__state_config.extra_table()
        if extra_table:

            id_field = extra_table.table_name() + '_id'
            id_value = args.get(id_field, None)
            if not id_value:
                # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance
                # SnapshotTopic needs to create the snapshot.
                log.warning(f"Unable to get ID value for field '{id_field}' from job state {job_state}")
                return None

            update = {
                extra_table.state_column(): job_state.get('state', None),
                extra_table.message_column(): job_state.get('message', None),
            }

            db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=update)

        else:
            log.debug("Extra table for storing state is not configured.")
Example #3
0
    def update_job_state_args(self, db: DatabaseHandler,
                              args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states',
                                     object_id=self.__job_states_id)

        try:

            # job_states.args got changed from JSON to JSONB while sharding the
            # database, and there's no way to disable decoding JSONB (as
            # opposed to JSON) in psycopg2, so "args" might be a JSON string or
            # a pre-decoded dictionary
            maybe_json_db_args = job_state.get('args', '')
            if isinstance(maybe_json_db_args, dict):
                db_args = maybe_json_db_args
            else:
                db_args = decode_json(maybe_json_db_args)

        except Exception as ex:
            log.error(
                f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states',
                        object_id=self.__job_states_id,
                        update_hash={
                            'args': args_json,
                        })
Example #4
0
def validate_remote_integration(db: DatabaseHandler, source: str, query: str,
                                day: str) -> None:
    """Run sanity test on remote APIs."""

    topic = create_test_topic(db, "test_remote_integration")

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    }
    db.create('topic_seed_queries', tsq)

    topic['platform'] = 'twitter'
    topic['pattern'] = '.*'
    topic['start_date'] = day
    topic['end_date'] = day
    topic['mode'] = 'url_sharing'
    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    got_tts = db.query("select * from topic_posts").hashes()

    # for old ch monitors, lots of the posts may be deleted
    assert len(got_tts) > 20

    assert len(got_tts[0]['content']) > MIN_TEST_POST_LENGTH
    assert len(got_tts[0]['author']) > MIN_TEST_AUTHOR_LENGTH
Example #5
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise ex

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #6
0
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict,
                        message: str) -> None:
    """Update the topic_fetch_url.message field in the database."""
    if _USE_TFU_DEBUG_MESSAGES:
        db.update_by_id('topic_fetch_urls',
                        topic_fetch_url['topic_fetch_urls_id'],
                        {'message': message})
Example #7
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward.  so we want to keep the feed_error state even if we redownload
    # the content
    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db,
                                                      download['downloads_id'],
                                                      content)
        error = ''
    except Exception as e:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" %
            (download['downloads_id'], e))

    if new_state == 'success':
        error = ''

    db.update_by_id('downloads', download['downloads_id'], {
        'state': new_state,
        'path': path,
        'error_message': error
    })

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #8
0
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    _add_to_topic_stories(db, story, topic)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
Example #9
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

        ua = UserAgent()
        url_with_credentials = self._api_request_url_with_signature_from_config(api_url=download['url'])
        request = Request(method='GET', url=url_with_credentials)
        response = ua.request(request)

        return response
Example #10
0
    def __update_table_state(self, db: DatabaseHandler,
                             job_state: Dict[str, Any]) -> None:
        """
        Update the state and message fields in the given table for the row whose '<table>_id' field matches that field
        in the job args.
        """
        job_state = decode_object_from_bytes_if_needed(job_state)

        try:

            # job_states.args got changed from JSON to JSONB while sharding the
            # database, and there's no way to disable decoding JSONB (as
            # opposed to JSON) in psycopg2, so "args" might be a JSON string or
            # a pre-decoded dictionary
            maybe_json_args = job_state.get('args', '')
            if isinstance(maybe_json_args, dict):
                args = maybe_json_args
            else:
                args = decode_json(maybe_json_args)

        except Exception as ex:
            log.error(
                f"Unable to decode args from job state {job_state}: {ex}")
            return

        extra_table = self.__state_config.extra_table()
        if extra_table:

            id_field = extra_table.table_name() + '_id'
            id_value = args.get(id_field, None)
            if not id_value:
                # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance
                # SnapshotTopic needs to create the snapshot.
                log.warning(
                    f"Unable to get ID value for field '{id_field}' from job state {job_state}"
                )
                return None

            update = {
                extra_table.state_column(): job_state.get('state', None),
                extra_table.message_column(): job_state.get('message', None),
            }

            db.update_by_id(table=extra_table.table_name(),
                            object_id=id_value,
                            update_hash=update)

        else:
            log.debug("Extra table for storing state is not configured.")
Example #11
0
def store_content(
    db: DatabaseHandler,
    download: dict,
    content: str,
    amazon_s3_downloads_config: AmazonS3DownloadsConfig = None,
    download_storage_config: DownloadStorageConfig = None,
) -> dict:
    """Store the content for the download."""
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward.  so we want to keep the feed_error state even if we redownload
    # the content

    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    if not amazon_s3_downloads_config:
        amazon_s3_downloads_config = _default_amazon_s3_downloads_config()
    if not download_storage_config:
        download_storage_config = _default_download_storage_config()

    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        store = _get_store_for_writing(
            amazon_s3_downloads_config=amazon_s3_downloads_config,
            download_storage_config=download_storage_config,
        )
        path = store.store_content(db, download['downloads_id'], content)
    except Exception as ex:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" %
            (download['downloads_id'], ex))

    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={
            'state': new_state,
            'path': path,
            'error_message': download['error_message']
        },
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #12
0
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(
            db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query(
        "select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'],
                        {'normalized_url': normalized_url})

    db.commit()
Example #13
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #14
0
def try_update_topic_link_ref_stories_id(db: DatabaseHandler,
                                         topic_fetch_url: dict) -> None:
    """Update the given topic link to point to the given ref_stories_id.

    Use the topic_fetch_url['topic_links_id'] as the id of the topic link to update and the
    topic_fetch_url['stories_id'] as the ref_stories_id.

    There is a unique constraint on topic_links(topics_id, stories_id, ref_stories_id).  This function just does the
    update to topic_links and catches and ignores any errors from that constraint.  Trying and failing on the
    constraint is faster and more reliable than checking before trying (and still maybe failing on the constraint).
    """
    try:
        db.update_by_id('topic_links', topic_fetch_url['topic_links_id'],
                        {'ref_stories_id': topic_fetch_url['stories_id']})
    except mediawords.db.exceptions.handler.McUpdateByIDException as e:
        # the query will throw a unique constraint error if stories_id, ref_stories_id already exists.  it's quicker
        # to just catch and ignore the error than to try to avoid it
        if 'unique constraint "topic_links_scr"' not in str(e):
            raise e
Example #15
0
def _store_map(db: DatabaseHandler,
               topics_id: int,
               timespans_id: int,
               content: bytes,
               graph_format: str,
               color_by: str) -> None:
    """Create a timespans_map row."""
    db.begin()

    options = {'color_by': color_by}
    options_json = encode_json(options)

    db.query(
        """
            DELETE FROM timespan_maps
            WHERE timespans_id = %(a)s
              AND format = %(b)s
              AND options = %(c)s
        """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json}
    )

    timespan_map = {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    }
    timespan_map = db.create('timespan_maps', timespan_map)

    db.commit()

    content_types = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }
    content_type = content_types[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'], content, content_type)

    url = get_content_url(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'])

    db.update_by_id('timespan_maps', timespan_map['timespan_maps_id'], {'url': url})
Example #16
0
    def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

        try:
            db_args = decode_json(job_state.get('args', '{}'))
        except Exception as ex:
            log.error(f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'args': args_json,
        })
Example #17
0
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query("select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url})

    db.commit()
Example #18
0
def fetch_topic_url_update_state(db: DatabaseHandler,
                                 topic_fetch_urls_id: int,
                                 domain_timeout: Optional[int] = None) -> bool:
    """Tries fetch_topic_url() and updates state.

    Returns True if job completed and does not have to be requeued.

    Returns False if job was throttled and has to be requeued.

    Raises exception on other errors (after updating state).
    """
    try:
        fetch_topic_url(db=db,
                        topic_fetch_urls_id=topic_fetch_urls_id,
                        domain_timeout=domain_timeout)
        return True

    except McThrottledDomainException:
        # if a domain has been throttled, just add it back to the end of the queue
        log.info(
            "Fetch for topic_fetch_url %d domain throttled. Requeueing ..." %
            topic_fetch_urls_id)

        db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, {
            'state': FETCH_STATE_REQUEUED,
            'fetch_date': datetime.datetime.now()
        })
        return False

    except Exception as ex:
        # all non-throttled errors should get caught by the try above, but catch again here just in case
        log.error("Error while fetching URL with ID {}: {}".format(
            topic_fetch_urls_id, str(ex)))
        update = {
            'state': FETCH_STATE_PYTHON_ERROR,
            'fetch_date': datetime.datetime.now(),
            'message': traceback.format_exc(),
        }
        db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, update)
        raise ex
Example #19
0
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download."""
    new_state = 'success'
    if download['state'] == 'feed_error':
        new_state = download['state']

    path = ''
    error = ''
    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as e:
        raise McDBIDownloadsException("error while trying to store download %d: %s" % (download['downloads_id'], e))

    if new_state == 'success':
        error = ''

    db.update_by_id('downloads', download['downloads_id'], {'state': new_state, 'path': path, 'error_message': error})

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
Example #20
0
def try_update_topic_link_ref_stories_id(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update the given topic link to point to the given ref_stories_id.

    Use the topic_fetch_url['topic_links_id'] as the id of the topic link to update and the
    topic_fetch_url['stories_id'] as the ref_stories_id.

    There is a unique constraint on topic_links(topics_id, stories_id, ref_stories_id).  This function just does the
    update to topic_links and catches and ignores any errors from that constraint.  Trying and failing on the
    constraint is faster and more reliable than checking before trying (and still maybe failing on the constraint).
    """
    if topic_fetch_url.get('topic_links_id', None) is None:
        return

    try:
        db.update_by_id(
            'topic_links',
            topic_fetch_url['topic_links_id'],
            {'ref_stories_id': topic_fetch_url['stories_id']})
    except McUpdateByIDException as e:
        # the query will throw a unique constraint error if stories_id, ref_stories_id already exists.  it's quicker
        # to just catch and ignore the error than to try to avoid it
        if 'unique constraint "topic_links_scr"' not in str(e):
            raise e
Example #21
0
def _log_download_error(db: DatabaseHandler, download: Optional[dict],
                        error_message: str) -> None:
    if not download:
        log.warning(
            f"Error while getting download from queue: {error_message}")
        return

    log.warning(
        f"Error while fetching download {download['downloads_id']}: {error_message}"
    )
    if download['state'] not in {'fetching', 'queued'}:
        downloads_id = download['downloads_id']

        download['state'] = 'error'
        download['error_message'] = error_message
        try:
            db.update_by_id(table='downloads',
                            object_id=downloads_id,
                            update_hash=download)
        except Exception as ex:
            # If we can't log the error in the database, that's really bad so a hard exception
            raise McCrawlerFetcherHardError((
                f"Unable to log download error for download {downloads_id} in the database; "
                f"download error: {error_message}; database error: {ex}"))
Example #22
0
    def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
        """
        Update the message field for the current "job_states" row.

        This is a public method that is intended to be used by code run anywhere above the stack from run() to publish
        messages updating the progress of a long running job.
        """
        message = decode_object_from_bytes_if_needed(message)

        # Verify that it exists I guess?
        db.require_by_id(table='job_states', object_id=self.__job_states_id)

        job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'message': message,
            'last_updated': sql_now(),
        })

        self.__update_table_state(db=db, job_state=job_state)
Example #23
0
    def update_job_state(self, db: DatabaseHandler, state: str, message: Optional[str] = ''):
        """
        Update the state and message fields of the "job_states" table for the currently active "job_states_id".

        "jobs_states_id" is set and unset in method run() below, so this must be called from code running from within
        the run() implementation of the subclass.
        """
        state = decode_object_from_bytes_if_needed(state)
        message = decode_object_from_bytes_if_needed(message)

        log.debug(f"{self.__queue_name} state: {state}")

        job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'state': state,
            'last_updated': sql_now(),
            'message': message,
        })

        self.__update_table_state(db=db, job_state=job_state)
Example #24
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
                INSERT INTO download_texts (
                    downloads_id,
                    download_text,
                    download_text_length
                )
                    SELECT
                        %(downloads_id)s,
                        dt.download_text,
                        dt.download_text_length
                    FROM download_texts AS dt
                    WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
            INSERT INTO story_sentences (
                stories_id,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            )
                SELECT
                    %(new_stories_id)s,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date,
                    language
                FROM story_sentences
                WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
Example #25
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': mediawords.util.sql.sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = mediawords.dbi.downloads.fetch_content(db, old_download)
            download = mediawords.dbi.downloads.store_content(db, download, content)
        except (mediawords.dbi.downloads.McDBIDownloadsException,
                mediawords.key_value_store.amazon_s3.McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    db.query(
        """
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select %(a)s, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    return story
Example #26
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """, {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
Example #27
0
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    mediawords.tm.stories.add_to_topic_stories(db, story, topic)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
Example #28
0
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> None:
    """Update the topic_fetch_url.message field in the database."""
    if _USE_TFU_DEBUG_MESSAGES:
        db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], {'message': message})