def validate_topic_tweet_urls(db: DatabaseHandler, topic: dict) -> None:
    """Validate that topic_tweet_urls match what's in the tweet JSON data as saved in topic_tweets."""
    topic_tweets = db.query(
        """
        select *
            from topic_tweets tt
                join topic_tweet_days ttd using (topic_tweet_days_id)
            where
                ttd.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    expected_num_urls = 0
    for topic_tweet in topic_tweets:
        data = dict(decode_json(topic_tweet['data']))
        expected_num_urls += len(data['tweet']['entities']['urls'])

    # first sanity check to make sure we got some urls
    num_urls = db.query("select count(*) from topic_tweet_urls").flat()[0]
    assert num_urls == expected_num_urls

    total_json_urls = 0
    for topic_tweet in topic_tweets:

        ch_post = dict(decode_json(topic_tweet['data']))
        expected_urls = [x['expanded_url'] for x in ch_post['tweet']['entities']['urls']]
        total_json_urls += len(expected_urls)

        for expected_url in expected_urls:
            got_url = db.query("select * from topic_tweet_urls where url = %(a)s", {'a': expected_url}).hash()
            assert got_url is not None

    assert total_json_urls == num_urls
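For reference, the validation above reads URLs from the standard Twitter entity layout; a minimal, hypothetical fixture in that shape would look like this:

# Hypothetical topic_tweets.data payload in the shape the validator expects:
# URLs are read from data['tweet']['entities']['urls'][*]['expanded_url'].
sample_topic_tweet_data = {
    'tweet': {
        'entities': {
            'urls': [
                {'url': 'https://t.co/abc123', 'expanded_url': 'https://example.com/story'},
            ],
        },
    },
}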
Example #2
def test_encode_decode_json():
    test_obj = [
        'foo', {
            'bar': 'baz'
        }, ['xyz', 'zyx'], 'moo', 'ąčęėįšųūž', 42, 3.14, True, False, None
    ]
    expected_json = '["foo",{"bar":"baz"},["xyz","zyx"],"moo","ąčęėįšųūž",42,3.14,true,false,null]'

    encoded_json = encode_json(json_obj=test_obj, pretty=False)
    assert encoded_json == expected_json

    decoded_json = decode_json(json_string=encoded_json)
    assert decoded_json == test_obj

    # Encoding errors
    with pytest.raises(McEncodeJSONException):
        # noinspection PyTypeChecker
        encode_json(None)

    with pytest.raises(McEncodeJSONException):
        # noinspection PyTypeChecker
        encode_json("strings can't be encoded")

    with pytest.raises(McDecodeJSONException):
        # noinspection PyTypeChecker
        decode_json(None)

    with pytest.raises(McDecodeJSONException):
        # noinspection PyTypeChecker
        decode_json('not JSON')
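The test above constrains how encode_json() and decode_json() must behave; below is a minimal sketch of wrappers consistent with it, assuming they are thin layers over the standard json module (the real implementations may differ):

import json

class McEncodeJSONException(Exception):
    pass

class McDecodeJSONException(Exception):
    pass

def encode_json(json_obj, pretty=False):
    """Encode a dict or list to JSON; reject other top-level types."""
    if not isinstance(json_obj, (dict, list)):
        raise McEncodeJSONException("Top-level object must be a dict or a list.")
    separators = None if pretty else (',', ':')
    indent = 4 if pretty else None
    try:
        return json.dumps(json_obj, ensure_ascii=False, indent=indent, separators=separators)
    except Exception as ex:
        raise McEncodeJSONException(f"Unable to encode JSON: {ex}")

def decode_json(json_string):
    """Decode a JSON string; raise a typed exception on any failure."""
    if not isinstance(json_string, str):
        raise McDecodeJSONException("JSON string is not a string.")
    try:
        return json.loads(json_string)
    except Exception as ex:
        raise McDecodeJSONException(f"Unable to decode JSON: {ex}")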
Example #3
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls."""
    topic_posts_ids = db.query(
        """
        SELECT
            topic_posts.topic_posts_id
        FROM topic_posts
            INNER JOIN topic_post_days ON
                topic_posts.topics_id = topic_post_days.topics_id AND
                topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id
            INNER JOIN topic_seed_queries ON
                topic_post_days.topics_id = topic_seed_queries.topics_id AND
                topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id
        WHERE
            topics_id = %(topics_id)s
        """, {
            'topics_id': topic['topics_id'],
        }
    ).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
Example #4
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """
        Return a mock CH response to the posts endpoint.

        Generate the mock response by sending back data from a consistent but semirandom selection of
        ch-posts-2016-01-0[12345].json.
        """
        assert MOCK_TWEETS_PER_DAY <= MAX_MOCK_TWEETS_PER_DAY

        test_path = mediawords.util.paths.mc_root_path() + '/mediacloud/test-data/ch/'
        filename = test_path + "ch-posts-" + day.strftime('%Y-%m-%d') + '.json'
        with open(filename, 'r', encoding='utf-8') as fh:
            json = fh.read()

        data = dict(decode_json(json))

        assert 'posts' in data
        assert len(data['posts']) >= MOCK_TWEETS_PER_DAY

        data['posts'] = data['posts'][0:MOCK_TWEETS_PER_DAY]

        # replace the status ID in each tweet URL with a sequential index so that
        # we can infer the date of each tweet in tweet_urler_lookup below
        for i, ch_post in enumerate(data['posts']):
            ch_post['url'] = re.sub(r'status/(\d+)/', '/status/' + str(i),
                                    ch_post['url'])

        return data
Example #5
def fetch_100_users(screen_names: list) -> list:
    """Fetch data for up to 100 users."""
    if len(screen_names) > 100:
        raise McFetchTweetsException('tried to fetch more than 100 users')

    # tweepy returns a 404 if none of the screen names exist, and that 404 is indistinguishable from a 404
    # indicating that tweepy can't connect to the twitter api.  in the latter case, we want to let tweepy use its
    # retry mechanism, but not the former.  so we add a dummy account that we know exists to every request
    # to make sure any 404 we get back is an actual 404.
    dummy_account = 'cyberhalroberts'
    dummy_account_appended = False

    if dummy_account not in screen_names:
        screen_names.append(dummy_account)
        dummy_account_appended = True

    users_json = _get_tweepy_api().lookup_users(screen_names=screen_names,
                                                include_entities=False)

    users = list(decode_json(users_json))

    # if we added the dummy account, remove it from the results
    if dummy_account_appended:
        users = list(filter(lambda u: u['screen_name'] != dummy_account,
                            users))

    # return simple list so that this can be mocked. relies on RawParser() in _get_tweepy_api()
    return users
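Since each call is capped at 100 users, a caller has to batch larger lists itself; a hypothetical wrapper might look like this:

def fetch_all_users(screen_names: list) -> list:
    """Hypothetical helper: fetch users in batches of 100."""
    users = []
    for i in range(0, len(screen_names), 100):
        # Pass a copy, because fetch_100_users() may append its dummy account
        # to the list it receives.
        users.extend(fetch_100_users(list(screen_names[i:i + 100])))
    return users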
Example #6
    def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None:
        """
        Update the state and message fields in the given table for the row whose '<table>_id' field matches that field
        in the job args.
        """
        job_state = decode_object_from_bytes_if_needed(job_state)

        try:
            args = decode_json(job_state.get('args', ''))
        except Exception as ex:
            log.error(f"Unable to decode args from job state {job_state}: {ex}")
            return

        extra_table = self.__state_config.extra_table()
        if extra_table:

            id_field = extra_table.table_name() + '_id'
            id_value = args.get(id_field, None)
            if not id_value:
                # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance
                # SnapshotTopic needs to create the snapshot.
                log.warning(f"Unable to get ID value for field '{id_field}' from job state {job_state}")
                return None

            update = {
                extra_table.state_column(): job_state.get('state', None),
                extra_table.message_column(): job_state.get('message', None),
            }

            db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=update)

        else:
            log.debug("Extra table for storing state is not configured.")
Example #7
    def update_job_state_args(self, db: DatabaseHandler,
                              args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states',
                                     object_id=self.__job_states_id)

        try:

            # job_states.args got changed from JSON to JSONB while sharding the
            # database, and there's no way to disable decoding JSONB (as
            # opposed to JSON) in psycopg2, so "args" might be a JSON string or
            # a pre-decoded dictionary
            maybe_json_db_args = job_state.get('args', '')
            if isinstance(maybe_json_db_args, dict):
                db_args = maybe_json_db_args
            else:
                db_args = decode_json(maybe_json_db_args)

        except Exception as ex:
            log.error(
                f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states',
                        object_id=self.__job_states_id,
                        update_hash={
                            'args': args_json,
                        })
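The JSON-vs-JSONB fallback above recurs in several of these examples; it could be captured in a small helper like this sketch, which assumes decode_json() rejects non-string input:

from typing import Any, Union

def decode_json_or_passthrough(maybe_json: Union[str, dict, list]) -> Any:
    """Hypothetical helper: psycopg2 pre-decodes JSONB columns but returns JSON
    columns as strings, so accept either form."""
    if isinstance(maybe_json, (dict, list)):
        return maybe_json
    return decode_json(maybe_json)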
Example #9
    async def fetch_store_transcript(self, stories_id: int) -> None:

        log.info(f"Fetching and storing transcript for story {stories_id}...")

        with tempfile.TemporaryDirectory(
                prefix='fetch_store_transcript') as temp_dir:
            transcript_json_path = os.path.join(temp_dir, 'transcript.json')

            gcs = GCSStore(bucket_config=self.config.transcripts())
            gcs.download_object(object_id=str(stories_id),
                                local_file_path=transcript_json_path)

            with open(transcript_json_path, 'r') as f:
                transcript_json = f.read()

        transcript = Transcript.from_dict(decode_json(transcript_json))

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': stories_id,
            }).hash()

        # Like create_download_for_new_story(), this creates a new download, except that it first
        # checks whether such a download already exists
        download = db.find_or_create(
            table='downloads',
            insert_hash={
                'feeds_id': feed['feeds_id'],
                'stories_id': story['stories_id'],
                'url': story['url'],
                'host': get_url_host(story['url']),
                'type': 'content',
                'sequence': 1,
                'state': 'success',
                'path': 'content:pending',
                'priority': 1,
                'extracted': 'f'
            },
        )

        text = transcript.download_text_from_transcript()

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

        log.info(
            f"Done fetching and storing transcript for story {stories_id}")
Example #10
def fetch_test_data(basename: str, subdirectory: str = '') -> dict:
    """Fetch the given data from disk."""

    basename = decode_object_from_bytes_if_needed(basename)
    subdirectory = decode_object_from_bytes_if_needed(subdirectory)

    file_path = _get_data_file(basename=basename, subdirectory=subdirectory)
    with open(file_path, mode='r', encoding='utf-8') as f:
        return decode_json(f.read())
Example #12
    def fetch_annotation_for_story(self, db: DatabaseHandler,
                                   stories_id: int) -> Union[dict, list, None]:
        """Fetch the annotation from key-value store for the story, or None if story is not annotated."""

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        if not self.story_is_annotated(db=db, stories_id=stories_id):
            log.warning("Story %d is not annotated." % stories_id)
            return None

        json = self.__postgresql_store.fetch_content(db=db,
                                                     object_id=stories_id)
        if json is None:
            raise McJSONAnnotatorException(
                "Fetched annotation is undefined or empty for story %d." %
                stories_id)

        json = json.decode('utf-8')

        try:
            annotation = decode_json(json)
            if annotation is None:
                raise McJSONAnnotatorException(
                    "Annotation is None after decoding from JSON.")
        except Exception as ex:
            raise McJSONAnnotatorException(
                "Unable to parse annotation JSON for story %d: %s\nString JSON: %s"
                % (
                    stories_id,
                    str(ex),
                    json,
                ))

        try:
            annotation = self._preprocess_stored_annotation(annotation)
            if annotation is None:
                raise McJSONAnnotatorException(
                    "Annotation is None after preprocessing.")
        except Exception as ex:
            fatal_error(
                "Unable to preprocess stored annotation for story %d: %s\nString JSON: %s"
                % (
                    stories_id,
                    str(ex),
                    json,
                ))

        return annotation
Example #13
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):

        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:

            ua = UserAgent()
            ua.set_timeout(1)
            response = ua.get(sample_select_url)

            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = decode_json(response.decoded_content())
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(
                    f"Result is not a dictionary: {response.decoded_content()}"
                )

            if 'response' not in result:
                raise Exception(
                    f"Response doesn't have 'response' key: {response.decoded_content()}"
                )

        except Exception as ex:

            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up"
        )
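The startup wait is a standard "poll until healthy" loop; abstracted away from Solr, the same pattern might read as follows (a sketch, not the project's API):

import logging
import time
from typing import Callable

log = logging.getLogger(__name__)

def wait_until(probe: Callable[[], None], retries: int, delay: float = 1.0) -> bool:
    """Hypothetical helper: retry 'probe' until it stops raising, or give up."""
    for attempt in range(retries + 1):
        try:
            probe()
            return True
        except Exception as ex:
            log.warning(f"Probe failed (attempt {attempt + 1}): {ex}")
            time.sleep(delay)
    return False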
Example #14
    def fetch_posts_from_api(self, query: str, start_date: datetime,
                             end_date: datetime) -> list:
        """Return posts from a csv that are within the given date range."""
        if self.mock_enabled:
            googler_json = self._get_mock_json(start_date, end_date)
        else:
            # disabling this for now because googler seems not to return results any more
            log.warning('google support disabled')
            return []
            global _last_google_request_epoch
            now = time.time()
            if now - _last_google_request_epoch < GOOGLE_REQUEST_DELAY:
                delay = GOOGLE_REQUEST_DELAY - (now -
                                                _last_google_request_epoch)
                log.info("waiting %d seconds to make google request..." %
                         delay)
                time.sleep(delay)

            _last_google_request_epoch = time.time()

            start_query = "after:" + start_date.strftime("%Y-%m-%d")
            end_query = "before:" + (
                end_date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

            full_query = "%s %s %s" % (query, start_query, end_query)

            googler_json = subprocess.check_output(
                ["googler", "--json", "-n 100", full_query])

        links = decode_json(googler_json)

        posts = []
        for link in links:
            publish_date = start_date.strftime('%Y-%m-%d')
            domain = mediawords.util.url.get_url_distinctive_domain(
                link['url'])

            posts.append({
                'post_id':
                link['url'],
                'content':
                "%s %s %s" % (link['title'], link['abstract'], link['url']),
                'author':
                domain,
                'channel':
                domain,
                'publish_date':
                publish_date,
                'data':
                link
            })

        return posts
Example #15
    def decoded_json(self) -> Union[dict, list]:
        """Return content as JSON object(s) assuming it's formatted appropriately."""
        if self.content_type() and 'json' not in self.content_type():
            log.warning(
                f"Content-Type header ({self.content_type()}) is not json for {self.__requests_response.url}"
            )
        try:
            return decode_json(self.decoded_content())
        except Exception as ex:
            raise McUserAgentResponseException(
                f"Response from {self.__requests_response.url} not parseable to JSON: "
                f"{str(ex)}")
Example #17
def _authenticated_domains_from_json(
        value: Optional[str]) -> List[AuthenticatedDomain]:
    """Parse the string and return a list of authenticated domains."""

    if value is None:
        return []

    value = value.strip()

    if not value:
        return []

    try:
        entries = decode_json(value)
    except McDecodeJSONException as ex:
        # Don't leak JSON errors into the exception, which might end up in a public error message
        message = "Unable to decode authenticated domains."
        log.error(f"{message}: {ex}")
        raise McConfigAuthenticatedDomainsException(message)

    domains = []

    if not isinstance(entries, collections.abc.Iterable):
        message = "Invalid JSON configuration"
        log.error(f"{message}: root is not an iterable (a list)")
        raise McConfigAuthenticatedDomainsException(message)

    for entry in entries:

        if not callable(getattr(entry, "get", None)):
            message = "Invalid JSON configuration"
            log.error(
                f"{message}: one of the items does not have get() (is not a dictionary)"
            )
            raise McConfigAuthenticatedDomainsException(message)

        domain = entry.get('domain', None)
        username = entry.get('username', None)
        password = entry.get('password', None)

        if not (domain and username and password):
            raise McConfigAuthenticatedDomainsException(
                "Incomplete authentication credentials.")

        domains.append(
            AuthenticatedDomain(domain=domain,
                                username=username,
                                password=password))

    return domains
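The expected configuration value is a JSON list of credential objects; hypothetical usage, assuming AuthenticatedDomain exposes its fields as attributes:

domains = _authenticated_domains_from_json(
    '[{"domain": "example.com", "username": "user", "password": "secret"}]'
)
assert domains[0].username == 'user'  # attribute access is an assumption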
Example #19
    def _fetch_posts_from_api_single_page(self, query: str,
                                          start_date: datetime,
                                          end_date: datetime,
                                          next_cursor: str) -> dict:
        """Fetch a single page of posts data from the Brandwatch API and return the decoded response data."""
        try:
            (project_id, query_id) = query.split('-')
            project_id = int(project_id)
            query_id = int(query_id)
        except Exception:
            raise McPostsBWTwitterQueryException(
                f"Unable to parse query '{query}': should be in the format 123-456, "
                "where 123 is the project ID and 456 is the query ID."
            )

        log.debug("brandwatch_twitter.fetch_posts")

        ua = _get_user_agent()

        api_key = _get_api_key()

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        cursor = next_cursor if next_cursor is not None else ''

        url = (
            f"https://api.brandwatch.com/projects/{project_id}/data/mentions?"
            f"queryId={query_id}&startDate={start_arg}&endDate={end_arg}&"
            f"pageSize={PAGE_SIZE}&orderBy=date&orderDirection=asc&"
            f"access_token={api_key}&cursor={cursor}")

        log.debug("brandwatch url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsBWTwitterDataException(
                f"error fetching posts: {response.code()} {response.status_line()}"
            )

        json = response.decoded_content()

        data = dict(decode_json(json))

        if 'results' not in data:
            raise McPostsBWTwitterDataException("error parsing response: " +
                                                json)

        return data
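Fetching a full result set would mean following the cursor across pages; a sketch of such a driver, assuming the Brandwatch response exposes the next page under a 'nextCursor' key (not confirmed by this snippet):

    def _fetch_all_pages(self, query: str, start_date: datetime, end_date: datetime) -> list:
        """Hypothetical pagination driver for the single-page fetcher above."""
        results = []
        next_cursor = None
        while True:
            data = self._fetch_posts_from_api_single_page(query, start_date, end_date, next_cursor)
            results.extend(data['results'])
            next_cursor = data.get('nextCursor')  # assumed key name
            if not next_cursor:
                break
        return results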
Example #20
    def __update_table_state(self, db: DatabaseHandler,
                             job_state: Dict[str, Any]) -> None:
        """
        Update the state and message fields in the given table for the row whose '<table>_id' field matches that field
        in the job args.
        """
        job_state = decode_object_from_bytes_if_needed(job_state)

        try:

            # job_states.args got changed from JSON to JSONB while sharding the
            # database, and there's no way to disable decoding JSONB (as
            # opposed to JSON) in psycopg2, so "args" might be a JSON string or
            # a pre-decoded dictionary
            maybe_json_args = job_state.get('args', '')
            if isinstance(maybe_json_args, dict):
                args = maybe_json_args
            else:
                args = decode_json(maybe_json_args)

        except Exception as ex:
            log.error(
                f"Unable to decode args from job state {job_state}: {ex}")
            return

        extra_table = self.__state_config.extra_table()
        if extra_table:

            id_field = extra_table.table_name() + '_id'
            id_value = args.get(id_field, None)
            if not id_value:
                # Sometimes there is not a relevant <table>_id until some of the code in run() has run, for instance
                # SnapshotTopic needs to create the snapshot.
                log.warning(
                    f"Unable to get ID value for field '{id_field}' from job state {job_state}"
                )
                return None

            update = {
                extra_table.state_column(): job_state.get('state', None),
                extra_table.message_column(): job_state.get('message', None),
            }

            db.update_by_id(table=extra_table.table_name(),
                            object_id=id_value,
                            update_hash=update)

        else:
            log.debug("Extra table for storing state is not configured.")
Example #21
    def fetch_posts_from_api(
        self,
        query: str,
        start_date: datetime,
        end_date: datetime,
        sample: Optional[int] = None,
        page_size: Optional[int] = None,
    ) -> list:
        """Fetch day of tweets from crimson hexagon and twitter."""
        decoded_content = self._get_content_from_api(query, start_date,
                                                     end_date)

        assert sample is None, "Sampling is not implemented."
        assert page_size is None, "Page size limiting is not supported."

        data = dict(decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.debug("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                publish_date = dateutil.parser.parse(
                    mt['tweet']['created_at']).isoformat()

                post = {
                    'post_id': str(mt['tweet_id']),
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': publish_date,
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
Example #22
def __solr_error_message_from_response(response: Response) -> str:
    """Parse out Solr error message from response."""

    if response.error_is_client_side():
        # UserAgent error (UserAgent wasn't able to connect to the server or something like that)
        error_message = f'UserAgent error: {response.decoded_content()}'

    else:

        status_code_str = str(response.code())

        if status_code_str.startswith('4'):
            # Client error - set default message
            error_message = f'Client error: {response.status_line()} {response.decoded_content()}'

            # Parse out Solr error message if there is one
            solr_response_maybe_json = response.decoded_content()
            if solr_response_maybe_json:

                solr_response_json = {}
                try:
                    solr_response_json = decode_json(solr_response_maybe_json)
                except Exception as ex:
                    log.debug(
                        f"Unable to parse Solr error response: {ex}; raw response: {solr_response_maybe_json}"
                    )

                solr_error_message = solr_response_json.get('error', {}).get('msg', None)
                request_params = solr_response_json.get('responseHeader', {}).get('params', None)

                # If we were able to decode a Solr error message, overwrite the default error message with it;
                # otherwise keep the default so it doesn't get clobbered with an empty value
                if solr_error_message and request_params:
                    request_params_json = encode_json(request_params)
                    error_message = f'Solr error: "{solr_error_message}", params: {request_params_json}'

        elif status_code_str.startswith('5'):
            # Server error or some other error
            error_message = f'Server error: {response.status_line()} {response.decoded_content()}'

        else:
            # Some weird stuff
            error_message = f'Other error: {response.status_line()} {response.decoded_content()}'

    return error_message
Example #23
def fetch_100_tweets(tweet_ids: list) -> list:
    """Fetch data for up to 100 tweets."""
    if len(tweet_ids) > 100:
        raise McFetchTweetsException('tried to fetch more than 100 tweets')

    if len(tweet_ids) == 0:
        return []

    tweets = _get_tweepy_api().statuses_lookup(tweet_ids, include_entities=True, trim_user=False, tweet_mode='extended')

    # return simple list so that this can be mocked. relies on RawParser() in _get_tweepy_api()
    tweets = list(decode_json(tweets))

    for tweet in tweets:
        if 'full_text' in tweet:
            tweet['text'] = tweet['full_text']

    return tweets
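Because the function returns a plain list of dicts, tests can stub it out wholesale; a hypothetical pytest-style sketch ('mymodule' stands in for wherever fetch_100_tweets lives):

from unittest import mock

def test_code_that_uses_tweets():
    fake_tweets = [{'id': 1, 'full_text': 'hello', 'entities': {'urls': []}}]
    with mock.patch('mymodule.fetch_100_tweets', return_value=fake_tweets):
        ...  # exercise code that calls fetch_100_tweets()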
Example #24
def _get_api_key() -> str:
    """Fetch the bw api key or use the cached one.

    To get a bw api key, you have to make an api call with the user and password, but the api key only lasts for
    a year, so we just get it and then cache it in a static variable, assuming that each run time will restart at least
    once a year.
    """
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    log.debug(f"user: {user}")
    log.debug(f"passwod: {password}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client"
        % (quote(user)))

    request = Request(method='POST', url=url)
    request.set_content_type(
        'application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching posts: " +
                                            response.decoded_content())

    json = response.decoded_content()

    data = dict(decode_json(json))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError:
        raise McPostsBWTwitterDataException(
            "error parsing oauth response: '%s'" % json)

    return _get_api_key.api_key
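The hasattr()-based function attribute is a lightweight run-once cache; functools offers the same effect, sketched here generically:

import functools

@functools.lru_cache(maxsize=1)
def _get_cached_value() -> str:
    """Hypothetical sketch: lru_cache(maxsize=1) on a zero-argument function
    reproduces the run-once caching that _get_api_key() builds by hand."""
    return "value-computed-once"  # stand-in for the API call above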
Example #25
    def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

        try:
            db_args = decode_json(job_state.get('args', '{}'))
        except Exception as ex:
            log.error(f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'args': args_json,
        })
Example #27
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls."""
    topic_posts_ids = db.query(
        """
        select tt.topic_posts_id
            from topic_posts tt
                join topic_post_days ttd using ( topic_post_days_id )
            where
                topics_id = %(a)s
        """, {
            'a': topic['topics_id']
        }).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' %
                     (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
Example #28
def validate_topic_tweets(db: DatabaseHandler, topic_tweet_day: dict) -> None:
    """Validate that the topic tweets belonging to the given topic_tweet_day have all of the current data."""
    topic_tweets = db.query(
        "select * from topic_tweets where topic_tweet_days_id = %(a)s", {
            'a': topic_tweet_day['topic_tweet_days_id']
        }).hashes()

    # fetch_topic_tweets should have set num_ch_tweets to the total number of tweets
    assert len(topic_tweets) > 0
    assert len(topic_tweets) == topic_tweet_day['num_ch_tweets']

    for topic_tweet in topic_tweets:
        tweet_data = dict(decode_json(topic_tweet['data']))

        # random field that should be coming from twitter
        assert 'assignedCategoryId' in tweet_data

        expected_date = datetime.datetime.strptime(
            tweet_data['tweet']['created_at'], '%Y-%m-%d')
        got_date = datetime.datetime.strptime(topic_tweet['publish_date'],
                                              '%Y-%m-%d 00:00:00')
        assert got_date == expected_date

        assert topic_tweet['content'] == tweet_data['tweet']['text']
Example #29
    def test_api_request(self):
        """Make an API request, see if it succeeds."""

        credentials = self.univision_credentials()

        handler = DownloadFeedUnivisionHandler(
            crawler_config=self._mock_crawler_config())
        api_request_url = handler._api_request_url_with_signature_from_config(
            api_url=credentials.url)
        assert api_request_url, 'API request URL is not empty'

        ua = UserAgent()
        ua.set_timeout(30)

        response = ua.get(api_request_url)
        assert response.is_success(), 'API request was successful'

        json_string = response.decoded_content()
        assert json_string, 'JSON response is not empty'

        json = decode_json(json_string)
        assert json.get('status',
                        None) == 'success', "JSON response was successful"
        assert 'data' in json, 'JSON response has "data" key'
Example #30
    def __post(self) -> bytes:
        uri = urlparse(self.path)
        if uri.path != self._API_ENDPOINT_PATH:
            return self.__error_response(
                status=HTTPStatus.NOT_FOUND.value,
                message=f"Only {self._API_ENDPOINT_PATH} is implemented.",
            )

        content_length = int(self.headers.get('Content-Length', 0))

        log.info(f"Received extraction request ({content_length} bytes)...")

        if not content_length:
            return self.__error_response(
                status=HTTPStatus.LENGTH_REQUIRED.value,
                message="Content-Length header is not set.",
            )

        if content_length > _MAX_REQUEST_LENGTH:
            return self.__error_response(
                status=HTTPStatus.REQUEST_ENTITY_TOO_LARGE.value,
                message=f"Request is larger than {_MAX_REQUEST_LENGTH} bytes.")

        encoded_body = self.rfile.read(content_length)

        try:
            json_body = encoded_body.decode('utf-8', errors='replace')
        except Exception as ex:
            return self.__error_response(
                status=HTTPStatus.BAD_REQUEST.value,
                message=f"Unable to decode request body: {ex}",
            )

        try:
            body = decode_json(json_body)
        except Exception as ex:
            return self.__error_response(
                status=HTTPStatus.BAD_REQUEST.value,
                message=f"Unable to decode request JSON: {ex}",
            )

        if "html" not in body:
            return self.__error_response(
                status=HTTPStatus.BAD_REQUEST.value,
                message="Request JSON doesn't have 'html' key.",
            )

        html = body["html"]

        try:
            extracted_html = extract_article_from_page(html)
        except Exception as ex:
            return self.__error_response(
                status=HTTPStatus.BAD_REQUEST.value,
                message=f"Unable to extract article HTML from page HTML: {ex}")

        response = {
            'extracted_html': extracted_html,
            'extractor_version': extractor_name(),
        }

        return self.__success_response(
            status=HTTPStatus.OK.value,
            response=response,
        )
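A hypothetical client call against this handler; '/extract' and the port stand in for self._API_ENDPOINT_PATH and the real bind address:

import json
import urllib.request

request = urllib.request.Request(
    'http://localhost:8080/extract',  # hypothetical endpoint
    data=json.dumps({'html': '<html><body><p>Story text</p></body></html>'}).encode('utf-8'),
    headers={'Content-Type': 'application/json'},
)
with urllib.request.urlopen(request) as response:
    extracted_html = json.loads(response.read())['extracted_html']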
Example #31
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit "
                    f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {request.url()}"
        assert port, f"API URL port is not set for URL {request.url()}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            if response.is_success():
                break
            else:
                if response.error_is_client_side():
                    log.error(
                        f"Request failed on the client side: {response.decoded_content()}"
                    )
                    time.sleep(sleep_between_retries)
                else:
                    break

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
Example #32
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {url}"
        assert port, f"API URL port is not set for URL {url}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=request.url(),
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
Example #33
    def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]:
        """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed."""
        content = decode_object_from_bytes_if_needed(content)
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        if not content:
            raise McCrawlerFetcherSoftError("Feed content is empty or undefined.")

        try:
            feed_json = decode_json(content)
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}")

        try:
            # Intentionally raise exception on KeyError:
            if not feed_json['status'] == 'success':
                raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}")
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}")

        try:
            # Intentionally raise exception on KeyError:
            feed_items = feed_json.get('data', None).get('items', None)
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}")

        stories = []

        for item in feed_items:
            url = item.get('url', None)
            if not url:
                # Some items in the feed don't have their URLs set
                log.warning(f"'url' for item is not set: {item}")
                continue

            # sic -- we take "uid" (without "g") and call it "guid" (with "g")
            guid = item.get('uid', None)
            if not guid:
                raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}")

            title = item.get('title', '(no title)')
            description = item.get('description', '')

            try:
                # Intentionally raise exception on KeyError:
                str_publish_date = item['publishDate']
                publish_timestamp = str2time_21st_century(str_publish_date)
                publish_date = get_sql_date_from_epoch(publish_timestamp)
            except Exception as ex:
                # Univision's dates should be pretty predictable, so give up on the feed if one doesn't parse
                raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}")

            log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'")
            stories.append({
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
            })

        return stories
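
For orientation, here is a hand-made sample of the feed shape `_get_stories_from_univision_feed()` expects; only the key names come from the parser above, the values are invented, and `SomeFetcher` stands in for whatever class owns the method:

    sample_feed = {
        'status': 'success',
        'data': {
            'items': [{
                'url': 'https://example.com/story',
                'uid': 'abc123',  # becomes the story's "guid"
                'title': 'Sample story',
                'description': 'Sample description',
                'publishDate': '2020-01-01T00:00:00Z',
            }],
        },
    }
    # stories = SomeFetcher._get_stories_from_univision_feed(encode_json(sample_feed), media_id=1)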
Example #34
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings or lists of strings if
                   multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):

        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # Some stories consistently return empty content, so just raise a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = f"API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- return the decoded response
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")
    return data
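
A hedged usage sketch for `_api_request()`: the empty node plus `id`/`fields` parameters and the `engagement` field follow the Graph API's URL object and are assumptions here, not something this module defines:

# config = FacebookConfig()
# response = _api_request(
#     node='',  # query the root node; the URL being looked up goes into "id"
#     params={'id': 'https://example.com/story/', 'fields': 'engagement'},
#     config=config,
# )
# if 'error' not in response:
#     share_count = response.get('engagement', {}).get('share_count', 0)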
Example #35
def query_solr(db: DatabaseHandler, params: SolrParams) -> Dict[str, Any]:
    """
    Execute a query on the Solr server using the given parameters. Return a maximum of 10 million rows.

    The "params" argument is a dictionary of query parameters to Solr, detailed here:

        https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html.

    The query ("params['q']") is transformed: lower case boolean operators are made uppercase to make Solr recognize
    them as boolean queries.

    Return decoded response in the format described here:

        https://lucene.apache.org/solr/guide/6_6/response-writers.html#ResponseWriters-JSONResponseWriter
    """
    params = decode_object_from_bytes_if_needed(params)

    # Avoid editing the dictionary itself
    params = copy.deepcopy(params)

    if not params:
        raise McQuerySolrInternalErrorException('Parameters must be set.')

    if not isinstance(params, dict):
        raise McQuerySolrInternalErrorException(
            'Parameters must be a dictionary.')

    params['wt'] = 'json'

    if 'rows' in params:
        params['rows'] = int(params['rows'])
    else:
        params['rows'] = 1000

    if 'df' not in params:
        params['df'] = 'text'

    params['rows'] = min(params['rows'], 10_000_000)

    if 'q' not in params:
        params['q'] = ''

    # "fq" might be nonexistent or None
    if not params.get('fq', None):
        params['fq'] = []

    if not isinstance(params['fq'], list):
        params['fq'] = [params['fq']]

    if ':[' in params['q']:
        raise McQuerySolrRangeQueryException(
            "Range queries are not allowed in the main query. Please use a filter query instead for range queries."
        )

    # if params['q']:
    #     params['q'] = f"{{!complexphrase inOrder=false}} {params['q']}"

    params['q'] = _uppercase_boolean_operators(params['q'])
    params['fq'] = _uppercase_boolean_operators(params['fq'])

    params['q'] = _replace_smart_quotes(params['q'])
    params['fq'] = _replace_smart_quotes(params['fq'])

    if params['q']:
        params['q'] = _insert_collection_media_ids(db=db, q=params['q'])
    if params['fq']:
        params['fq'] = [
            _insert_collection_media_ids(db=db, q=_) for _ in params['fq']
        ]

    response_json = solr_request(
        path='select',
        params={},
        content=params,
        content_type='application/x-www-form-urlencoded; charset=utf-8',
    )

    try:
        response = decode_json(response_json)
    except Exception as ex:
        raise McQuerySolrInternalErrorException(
            f"Error parsing Solr JSON: {ex}\nJSON: {response_json}")

    if 'error' in response:
        raise McQuerySolrInvalidQueryException(
            f"Error received from Solr: {response_json}")

    return response
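
A minimal usage sketch, assuming an already-connected DatabaseHandler `db`; the response keys (`response.numFound`, `response.docs`) come from Solr's JSON response writer linked in the docstring:

# results = query_solr(db, {
#     'q': 'text:climate and media_id:1',  # lowercase "and" gets uppercased for Solr
#     'rows': 100,
# })
# num_found = results['response']['numFound']
# docs = results['response']['docs']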