Example #1
    def fetch_for_country(self, country_code):

        with self.input().open('r') as facts_file:
            facts = json.load(facts_file)
            app_id = facts['ids']['apple']['appId']
        url = (f'https://itunes.apple.com/{country_code}/rss/customerreviews/'
               f'page=1/id={app_id}/sortby=mostrecent/xml')
        data_list = []

        while url:
            try:
                data, url = self.fetch_page(url)
                data_list += data
            except requests.exceptions.HTTPError as error:
                if error.response is not None and (
                        error.response.status_code == 503
                        or (error.response.status_code in {403, 404}
                            and country_code not in {'DE', 'US', 'GB'})):
                    logger.error(f"Encountered {error.response.status_code} "
                                 f"server error '{error}' for country code "
                                 f"'{country_code}'")
                    logger.error("Continuing anyway...")
                    break
                else:
                    raise

        if not data_list:
            # no reviews for the given country code
            logger.debug(f"Empty data for country {country_code}")

        result = pd.DataFrame(data_list)
        result['country_code'] = country_code
        result.insert(0, 'app_id', app_id)

        return result
Example #2
    def fetch_all(self):

        data = []
        country_codes = sorted(self.get_country_codes())
        if self.minimal_mode:
            random_num = random.randint(0, len(country_codes) - 2)  # nosec
            country_codes = country_codes[random_num:random_num + 2]
            country_codes.append('CA')

        tbar = self.tqdm(country_codes, desc="Fetching appstore reviews")
        for country_code in tbar:
            tbar.set_description(f"Fetching appstore reviews ({country_code})")
            try:
                data_for_country = self.fetch_for_country(country_code)
                if not data_for_country.empty:
                    data.append(data_for_country)
                logger.debug(f'Fetching appstore reviews for {country_code}')
            except requests.HTTPError as error:
                if (error.response is not None
                        and error.response.status_code == 400):
                    # not all countries are available
                    pass
                else:
                    raise
        try:
            ret = pd.concat(data)
        except ValueError:
            # pd.concat raises ValueError when data is empty; keep the key
            # columns so the drop_duplicates below does not fail.
            ret = pd.DataFrame(columns=['app_id', 'appstore_review_id'])

        return ret.drop_duplicates(subset=['app_id', 'appstore_review_id'])
Example #3
    def guess_language(self):
        try:
            return langdetect.detect(self.text)
        except langdetect.lang_detect_exception.LangDetectException as e:
            # langdetect cannot handle emoji-only and link-only texts
            logger.debug(f'langdetect failed for one Doc. Error: {e}')
            logger.debug(f'Failure happened for Doc {self.to_dict()}')
            return None
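As the comment notes, langdetect raises a LangDetectException when the text gives it no usable features, which is exactly what emoji-only or link-only strings tend to do. A minimal standalone sketch of that failure mode (sample strings invented):

import langdetect

# Emoji-only input typically leaves langdetect with nothing to score, so
# detect() raises; ordinary text returns an ISO 639-1 code such as 'de'.
for text in ["Guten Morgen aus Potsdam", "😀🎨✨"]:
    try:
        print(text, "->", langdetect.detect(text))
    except langdetect.lang_detect_exception.LangDetectException as error:
        print(text, "-> detection failed:", error)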
Example #4
    def extract_capacities(self, html_path):

        with open(html_path) as file:
            src = file.read()
        dom: html.HtmlElement = html.fromstring(src)

        quota_id, min_date = self.extract_header(dom)
        logger.debug("Scraping capacities from quota_id=%s for min_date=%s",
                     quota_id, min_date)

        capacities = self.create_zero_data(min_date)

        def load_data(data):
            return pd.DataFrame(
                data,
                columns=[*capacities.index.names, *capacities.columns],
                dtype=object).set_index(capacities.index.names)

        basic_capacities = load_data(self.extract_basic_capacities(dom))
        capacities.update(basic_capacities)

        detailed_capacities = load_data(
            self.extract_detailed_capacities(src, min_date))
        capacities.update(detailed_capacities)

        capacities = capacities.reset_index()
        capacities.insert(0, 'quota_id', quota_id)

        return capacities
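The create_zero_data / DataFrame.update combination above works because pandas aligns update() on the index and overwrites only matching cells, so the scraped values replace the pre-filled zeros slot by slot. A tiny illustration of that pattern, with made-up index and column names:

import pandas as pd

# Hypothetical miniature: pre-fill every slot with zeros, then let update()
# overwrite only the slots for which data was actually scraped.
zero = pd.DataFrame({'capacity': [0, 0, 0]},
                    index=pd.Index(['09:00', '10:00', '11:00'], name='time'))
scraped = pd.DataFrame({'capacity': [5]},
                       index=pd.Index(['10:00'], name='time'))
zero.update(scraped)  # in place, aligned on the shared index
print(zero)           # only the 10:00 row changed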
Example #5
    def fetch_tweets(self, query, start_date, limit):
        """All searches are limited to German tweets (twitter lang code de)."""
        logger.debug(
            f"Querying Tweets. term \"{query}\", "
            f"limit: {limit}, start_date: {start_date}"
        )

        tweets = []  # tweets go in this list

        # set config options for twint
        c = twint.Config()
        c.Limit = limit
        c.Search = query
        c.Store_object = True
        c.Since = f'{start_date} 00:00:00'
        c.Lang = 'de'
        c.Hide_output = True
        c.Store_object_tweets_list = tweets

        # execute the twitter search
        twint.run.Search(c)

        # create dataframe from search results
        tweets_df = pd.DataFrame([
            {
                'term': query,
                'user_id': t.user_id,
                'tweet_id': t.id,
                'text': t.tweet,
                'response_to': '',
                'post_date': t.datestamp,
                'permalink': t.link,
                'likes': t.likes_count,
                'retweets': t.retweets_count,
                'replies': t.replies_count
            }
            for t in tweets
        ])

        # insert space before links to match hashtags correctly
        if not tweets_df.empty:
            tweets_df['text'] = tweets_df['text']\
                .str.replace('pic.', ' pic.', regex=False)\
                .str.replace('https', ' https', regex=False)\
                .str.replace('http', ' http', regex=False)

        return tweets_df
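The space-insertion step matters because twint returns tweet text with attached links glued onto the preceding token, so a later hashtag match (assumed here to be a plain '#\w+'-style pattern; the project's real matcher is not shown) would swallow the start of the URL:

import re

# Illustration with an invented tweet; only the 'https' case is shown.
raw = 'Neu im Museum #barberinihttps://t.co/abc123'
print(re.findall(r'#\w+', raw))           # ['#barberinihttps']

spaced = raw.replace('https', ' https')   # same trick as above
print(re.findall(r'#\w+', spaced))        # ['#barberini']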
Example #6
def main():  # noqa: D103

    for table in PERFORMANCE_TABLES:
        condenser = PerformanceValueCondenser(CONNECTOR, table)

        key_columns = condenser.get_key_columns()
        performance_columns = condenser.get_performance_columns(key_columns)
        data, header = CONNECTOR.query_with_header(f'SELECT * FROM {table}')
        df = pd.DataFrame(data, columns=header)

        # Special treatment because of multi-column key
        # (pandas unique only works on series -> 1d)
        if table == 'fb_post_performance':
            df.drop(columns='page_id', inplace=True)
            key_columns.remove('page_id')
        key_column = key_columns[0]

        before = len(df)
        to_drop = []
        unique_ids = df[key_column].unique()

        logger.debug("Condensing performance table: %s", table)
        logger.debug(f"Processing {len(unique_ids)} unique ids")
        logger.debug("Before: %s", before)
        for unique_id in unique_ids:
            ordered_entries = df.loc[df[key_column] == unique_id] \
                .sort_values(by=TIMESTAMP_COLUMN, axis='index', ascending=True)

            prev_row = None
            for i, row in ordered_entries.iterrows():
                if prev_row is None:  # could be 0
                    prev_row = row
                    continue

                # if current and previous entries are equal,
                # flag current entry for deletion
                if row[performance_columns] \
                        .equals(prev_row[performance_columns]):
                    to_drop.append(i)
                prev_row = row

        logger.debug("After: %s", before - len(to_drop))

        to_drop_df = df[df.index.isin(to_drop)]

        # Note this could be optimized by using
        # cursor.copy_from and a temporary table.
        queries = []
        for _, row in to_drop_df.iterrows():
            queries.append(f'''
                DELETE FROM {table}
                WHERE {key_column} = '{row[key_column]}'
                AND {TIMESTAMP_COLUMN} = '{row[TIMESTAMP_COLUMN]}'
                ''')
        if queries:
            CONNECTOR.execute(*queries)
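The comment above already names the faster alternative: rather than one DELETE per duplicate row, the flagged (key, timestamp) pairs can be bulk-loaded with cursor.copy_from into a temporary table and removed with a single join. A rough sketch of that idea, assuming a raw psycopg2 connection (the CONNECTOR wrapper's internals are not shown here, and bulk_delete is a made-up helper name):

import io


def bulk_delete(conn, table, key_column, timestamp_column, rows):
    """Delete (key, timestamp) pairs in one statement via a temp table.

    `conn` is assumed to be a psycopg2 connection; copy_from is
    psycopg2-specific.
    """
    buffer = io.StringIO()
    buffer.writelines(f"{key}\t{timestamp}\n" for key, timestamp in rows)
    buffer.seek(0)

    with conn.cursor() as cursor:
        # Temp table with matching column types, dropped at commit.
        cursor.execute(
            f"CREATE TEMP TABLE to_drop ON COMMIT DROP AS "
            f"SELECT {key_column}, {timestamp_column} FROM {table} WHERE false")
        cursor.copy_from(buffer, 'to_drop',
                         columns=(key_column, timestamp_column))
        cursor.execute(
            f"DELETE FROM {table} t USING to_drop d "
            f"WHERE t.{key_column} = d.{key_column} "
            f"AND t.{timestamp_column} = d.{timestamp_column}")
    conn.commit()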
Example #7
    def run(self):
        # Approach: Sequentially fetch all quota IDs, ignoring missing ones.
        # Stop when more than max_missing_ids consecutive IDs were invalid.

        quota_id = last_confirmed_id = 0
        with self.output().open('w') as output:
            print('file_path', file=output)

            while quota_id - last_confirmed_id <= self.max_missing_ids:
                quota_id += 1
                if self.minimal_mode:
                    quota_id += 3  # with the += 1 above: advance in steps of 4

                html = yield FetchGomusHTML(
                    url=f'/admin/quotas/{quota_id}',
                    ignored_status_codes=[404])
                if html.has_error():
                    logger.debug(f"Skipping invalid quota_id={quota_id}")
                    continue
                last_confirmed_id = quota_id
                print(html.path, file=output)
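Stripped of the luigi plumbing (the yield hands FetchGomusHTML back to the scheduler as a dynamic dependency), the stopping rule reduces to: walk the IDs in order, remember the last one that existed, and give up once the gap grows beyond max_missing_ids. A plain-Python sketch with a hypothetical exists predicate standing in for the HTML fetch:

def collect_ids(exists, max_missing_ids):
    """Walk IDs 1, 2, 3, ... and stop after more than max_missing_ids
    consecutive misses. `exists` is a stand-in for the fetch above."""
    found = []
    quota_id = last_confirmed_id = 0
    while quota_id - last_confirmed_id <= max_missing_ids:
        quota_id += 1
        if not exists(quota_id):
            continue
        last_confirmed_id = quota_id
        found.append(quota_id)
    return found


# Suppose only IDs 1-3 and 10 exist:
print(collect_ids({1, 2, 3, 10}.__contains__, 5))   # [1, 2, 3]
print(collect_ids({1, 2, 3, 10}.__contains__, 10))  # [1, 2, 3, 10]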
Example #8
    def run(self):

        current_timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        performances = []
        with self.input().open('r') as csv_in:
            df = pd.read_csv(csv_in)

        if self.minimal_mode:
            df = df.head(5)

        invalid_count = 0
        pbar = self.tqdm(df.index,
                         desc="Fetching performance data for facebook posts")
        for index in pbar:
            page_id, post_id = \
                str(df['page_id'][index]), str(df['post_id'][index])
            fb_post_id = f'{page_id}_{post_id}'
            post_date = self.post_date(df, index)
            if post_date < self.minimum_relevant_date:
                continue

            logger.debug(f"Loading performance data for FB post {fb_post_id}")

            metrics = ','.join([
                'post_reactions_by_type_total',
                'post_activity_by_action_type',
                'post_clicks_by_type',
                'post_negative_feedback',
                'post_impressions_paid',
                'post_impressions',
                'post_impressions_unique'  # "reach"
            ])
            url = f'{API_BASE}/{fb_post_id}/insights?metric={metrics}'

            response = try_request_multiple_times(url)

            if response.status_code == 400:
                invalid_count += 1
                continue
            response.raise_for_status()  # in case of another error
            response_content = response.json()

            post_perf = {
                'timestamp': current_timestamp,
            }

            # Reactions
            reactions = response_content['data'][0]['values'][0]['value']
            post_perf['react_like'] = int(reactions.get('like', 0))
            post_perf['react_love'] = int(reactions.get('love', 0))
            post_perf['react_wow'] = int(reactions.get('wow', 0))
            post_perf['react_haha'] = int(reactions.get('haha', 0))
            post_perf['react_sorry'] = int(reactions.get('sorry', 0))
            post_perf['react_anger'] = int(reactions.get('anger', 0))

            # Activity
            activity = response_content['data'][1]['values'][0]['value']
            post_perf['likes'] = int(activity.get('like', 0))
            post_perf['shares'] = int(activity.get('share', 0))
            post_perf['comments'] = int(activity.get('comment', 0))

            # Clicks
            clicks = response_content['data'][2]['values'][0]['value']
            post_perf['video_clicks'] = int(clicks.get('video play', 0))
            post_perf['link_clicks'] = int(clicks.get('link clicks', 0))
            post_perf['other_clicks'] = int(clicks.get('other clicks', 0))

            # negative feedback (only one field)
            post_perf['negative_feedback'] = \
                response_content['data'][3]['values'][0]['value']

            # number of times the post entered a person's screen through
            # paid distribution such as an ad
            post_perf['paid_impressions'] = \
                response_content['data'][4]['values'][0]['value']

            post_perf['post_impressions'] = \
                response_content['data'][5]['values'][0]['value']

            post_perf['post_impressions_unique'] = \
                response_content['data'][6]['values'][0]['value']

            post_perf.update(page_id=page_id, post_id=post_id)
            performances.append(post_perf)
        if invalid_count:
            logger.warning(f"Skipped {invalid_count} posts")

        df = pd.DataFrame(performances)

        # For some reason, every set of performance values except the first
        # gets inserted twice into the performances list. This deduplication
        # is a workaround; the root cause should be investigated and fixed.
        # TODO: Is this still up to date? Could not reproduce.
        df.drop_duplicates(subset='post_id', inplace=True, ignore_index=True)

        df = self.filter_fkey_violations(df)
        df = self.condense_performance_values(df)

        with self.output().open('w') as output_file:
            df.to_csv(output_file, index=False, header=True)
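For readers tracing the indexing above: the parsing assumes the insights payload carries one entry per requested metric in `data`, in request order, each exposing its reading under `values[0]['value']`. A made-up payload of that shape, trimmed to the fields the code actually reads (all numbers invented):

# Illustrative shape only; not an authoritative Graph API response.
mock_response_content = {
    'data': [
        {'values': [{'value': {'like': 10, 'love': 2, 'wow': 0,
                               'haha': 1, 'sorry': 0, 'anger': 0}}]},
        {'values': [{'value': {'like': 10, 'share': 1, 'comment': 3}}]},
        {'values': [{'value': {'video play': 0, 'link clicks': 4,
                               'other clicks': 7}}]},
        {'values': [{'value': 0}]},    # post_negative_feedback
        {'values': [{'value': 120}]},  # post_impressions_paid
        {'values': [{'value': 800}]},  # post_impressions
        {'values': [{'value': 650}]},  # post_impressions_unique ("reach")
    ]
}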