Python df_fix_columns 예제들, transparency.utils.df_fix_columns Python 예제들

예제 #1

0

파일 보기

    def process(self, df, report_start, report_end):
        """Turn a LinkedIn government-request table into the builder's output frame.

        Args:
            df: Source table. It is modified in place by the ``utils`` helpers
                and must provide the 'accountsimpacted', 'memberdatarequests',
                'percentprovided', 'subjecttorequest', 'reportstart' and
                'reportend' columns once its column names have been fixed.
            report_start: Not read by this method; the builder is given ''.
            report_end: Not read by this method; the builder is given ''.

        Returns:
            The frame returned by ``builder.get_df()`` with 'reportend' and
            'reportstart' copied over from ``df``.
        """
        utils.df_fix_columns(df)

        # The percentage column carries a literal '%' that has to go before
        # the numeric conversion.
        utils.df_strip_char(df, 'percentprovided', '%')
        utils.df_convert_to_numeric(df, [
            'accountsimpacted',
            'memberdatarequests',
            'percentprovided',
            'subjecttorequest',
        ])

        builder = DataFrameBuilder(
            df_in=df,
            platform='LinkedIn',
            platform_property='LinkedIn',
            report_start='',
            report_end='',
        )

        # Derive the complied-request column from the percentage column and
        # the request total.
        utils.df_convert_from_percentage(
            df,
            'percentprovided',
            'memberdatarequests',
            'number where some information produced',
        )

        # Government requests for user data.
        builder.extract_columns(
            request_type='requests for user data',
            request_subtype='all',
            num_requests_col='memberdatarequests',
            num_accounts_specified_col='subjecttorequest',
            num_requests_complied_col='number where some information produced',
            num_accounts_complied_col='accountsimpacted',
        )

        result = builder.get_df()

        # The reporting period comes from the table itself, not the arguments.
        result['reportend'] = df['reportend']
        result['reportstart'] = df['reportstart']
        return result

예제 #2

0

파일 보기

파일: test_utils.py 프로젝트: qut-dmrc/transparency-aggregator

 def test_df_fix_columns(self):
     """df_fix_columns should lower-case the column names of the frame in place."""
     row_labels = ['a', 'b', 'c']
     df = pd.DataFrame({
         'NUMBER requests': pd.Series([1, 2, 3], index=row_labels),
         'number affected': pd.Series([4, 5, 6], index=row_labels),
     })

     utils.df_fix_columns(df)

     expected_columns = ['number requests', 'number affected']
     self.assertEqual(expected_columns, df.columns.tolist())

예제 #3

0

파일 보기

파일: trans_snap_removal.py 프로젝트: qut-dmrc/transparency-aggregator

    def process(self, df, report_start, report_end):
        """Turn a Snap content-removal table into the builder's output frame.

        Args:
            df: Source table, modified in place. Expected columns include
                'country', 'removal requests' and 'percentage of requests
                where some content was removed', plus 'report start from
                table' / 'report end from table'.
            report_start: Expected start of the reporting period, or '' to
                skip the consistency check against the table.
            report_end: Expected end of the reporting period, or '' to skip
                the consistency check against the table.

        Returns:
            The frame returned by ``builder.get_df()`` with 'reportstart' and
            'reportend' filled from the "... from table" columns.
        """
        utils.df_fix_columns(df)

        # Expected source columns:
        # ['country', 'removal requests', 'percentage of requests where some content was removed']

        # TODO - figure out what to do with nulls in the source data, because they are almost certainly zeroes
        df.columns = [utils.strip_punctuation(name.lower()) for name in df.columns.values]
        df.rename(
            columns={
                'percentage of requests where some content was removed': 'percent removal requests complied',
            },
            inplace=True,
        )

        utils.df_strip_char(df, 'percent removal requests complied', '%')

        # Convert strings to numbers.
        utils.df_convert_to_int(df, [
            'removal requests',
            'percent removal requests complied',
        ])

        utils.df_percentage_to_count(
            df,
            'percent removal requests complied',
            'removal requests',
            'removal requests complied',
        )

        builder = DataFrameBuilder(
            df_in=df,
            platform='Snap',
            platform_property='Snapchat',
            report_start=report_start,
            report_end=report_end,
        )

        # Government removal requests.
        builder.extract_columns(
            request_type='content restrictions',
            request_subtype='all',
            num_requests_col='removal requests',
            num_accounts_specified_col='',
            num_requests_complied_col='removal requests complied',
        )

        # The dates passed in must agree with the dates in the table, unless
        # the caller passed '' for that bound.
        start_agrees = (report_start == df['report start from table']) | (report_start == '')
        end_agrees = (report_end == df['report end from table']) | (report_end == '')
        utils.check_assumption(start_agrees.all(), "Start dates in table did not match dates passed in.")
        utils.check_assumption(end_agrees.all(), "End dates in table did not match dates passed in.")

        result = builder.get_df()
        result['reportstart'] = result['report start from table']
        result['reportend'] = result['report end from table']

        return result

예제 #4

0

파일 보기

파일: twitter_removal.py 프로젝트: qut-dmrc/transparency-aggregator

    def process(self, df, report_start, report_end):
        """Turn a Twitter removal-request table into the builder's output frame.

        Args:
            df: Source table. The 'TOTAL' row is dropped in place; later steps
                work on a dash-free replacement of the frame.
            report_start: Reporting period start, forwarded to the builder.
            report_end: Reporting period end, forwarded to the builder.

        Returns:
            The frame returned by ``builder.get_df()``.
        """
        utils.df_fix_columns(df)

        # Drop the summary row so totals are not counted as a country.
        df.query('country != "TOTAL"', inplace=True)

        # TODO - figure out what to do with nulls in the source data, because they are almost certainly zeroes

        col_map = {
            'removal requests govt agency police other': 'removal requests government agency police other',
        }

        df.rename(columns=col_map, inplace=True)

        utils.df_strip_char(df, 'percentage where some content withheld', '%')

        numeric_cols = ['removal requests court orders',
                'removal requests government agency police other',
                'percentage where some content withheld', 'accounts specified',
                'accounts withheld', 'tweets withheld', 'accounts tos',
                'accounts no action']

        # Treat dashes as null, per nic 2018-01-11.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        df = df.replace('-', np.nan)

        utils.df_convert_to_numeric(df, numeric_cols)

        # NOTE(review): the builder is created before the derived columns
        # below are added to df; presumably it keeps a reference to df rather
        # than a copy - confirm against DataFrameBuilder.
        builder = DataFrameBuilder(df_in=df, platform='Twitter', platform_property='Twitter',
                                   report_start=report_start, report_end=report_end)

        # Total removal requests = court orders + other government requests.
        df['removal requests'] = df['removal requests court orders'] + df['removal requests government agency police other']

        utils.df_convert_from_percentage(df, pc_col='percentage where some content withheld',
                                         total_col='removal requests', dest_col='number where some content withheld')

        # Extract requests for content removal from governments:
        builder.extract_columns(
            request_type='removal requests',
            request_subtype='all',
            num_requests_col='removal requests',
            num_accounts_specified_col='accounts specified',
            num_requests_complied_col='number where some content withheld',
            num_accounts_suspended_col='accounts withheld',
            num_content_removed_col='tweets withheld',
        )

        df_out = builder.get_df()
        return df_out

예제 #5

0

파일 보기

    def process(self, df, report_start, report_end):
        """Turn a Twitter account-information-request table into the builder's output frame.

        Args:
            df: Source table. The 'TOTAL' row is dropped in place; later steps
                work on a dash-free replacement of the frame.
            report_start: Reporting period start, forwarded to the builder.
            report_end: Reporting period end, forwarded to the builder.

        Returns:
            The frame returned by ``builder.get_df()``.
        """
        utils.df_fix_columns(df)

        # Drop the summary row so totals are not counted as a country.
        df.query('country != "TOTAL"', inplace=True)

        # Strip the decoration characters that would block numeric parsing.
        utils.df_strip_char(df, 'percentage where some information produced',
                            '%')
        utils.df_strip_char(df, 'account information requests', '*')

        numeric_cols = [
            'account information requests',
            'percentage where some information produced', 'accounts specified'
        ]

        # Treat dashes as null, per nic 2018-01-11.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        df = df.replace('-', np.nan)

        utils.df_convert_to_numeric(df, numeric_cols)

        # NOTE(review): the builder is created before the derived column
        # below is added to df; presumably it keeps a reference to df rather
        # than a copy - confirm against DataFrameBuilder.
        builder = DataFrameBuilder(df_in=df,
                                   platform='Twitter',
                                   platform_property='Twitter',
                                   report_start=report_start,
                                   report_end=report_end)

        utils.df_convert_from_percentage(
            df, 'percentage where some information produced',
            'account information requests',
            'number where some information produced')

        # Extract requests for user data from governments:
        builder.extract_columns(
            request_type='requests for user data',
            request_subtype='all',
            num_requests_col='account information requests',
            num_accounts_specified_col='accounts specified',
            num_requests_complied_col='number where some information produced')

        df_out = builder.get_df()
        return df_out

예제 #6

0

파일 보기

    def process(self, df, report_start, report_end):
        """Turn a Google removal-request table into the builder's output frame.

        Args:
            df: Source table. Column names are fixed in place; all later work
                happens on a filtered copy, so the caller's frame is not
                modified further.
            report_start: Not read by this method; the period is derived from
                the 'period ending' column.
            report_end: Not read by this method; see ``report_start``.

        Returns:
            The frame returned by ``builder.get_df()`` with 'report_end' and
            'report_start' columns added.
        """
        utils.df_fix_columns(df)

        # Google does not report removal requests from some countries where the number is low.
        # This is signified by '<10' in number of requests column
        # And sometimes, Google reports '?' in the number of requests column.
        # We ignore all of these values.
        # The explicit copy keeps the in-place helper calls below from
        # writing to a slice of the caller's frame (SettingWithCopyWarning).
        unreported = df['all requests number of requests'].isin(['<10', '?'])
        df = df[~unreported].copy()

        numeric_cols = [
            'all requests number of requests',
            'all requests fully or partially complied with',
            'all requests items requested to be removed',
            'court orders number of requests',
            'court orders fully or partially complied with',
            'court orders items requested to be removed',
            'other requests executive police etc number of requests',
            'other requests executive police etc fully or partially complied with',
            'other requests executive police etc items requested to be removed',
        ]

        utils.df_convert_to_numeric(df, numeric_cols)

        builder = DataFrameBuilder(df_in=df,
                                   platform='Google',
                                   platform_property='Google',
                                   report_start='',
                                   report_end='')

        utils.df_convert_from_percentage(
            df, 'all requests fully or partially complied with',
            'all requests number of requests',
            'all requests number where some content removed')

        # Extract removal requests from governments:
        builder.extract_columns(
            request_type='removal requests',
            request_subtype='all',
            num_requests_col='all requests number of requests',
            num_content_specified_col=
            'all requests items requested to be removed',
            num_requests_complied_col=
            'all requests number where some content removed')

        df_out = builder.get_df()

        # A period ends at the last second of the 'period ending' day.
        df_out['report_end'] = df['period ending'].apply(
            lambda d: utils.str_to_date(d).replace(
                hour=23, minute=59, second=59))

        # A period starts six months before the day after its end, at midnight.
        df_out['report_start'] = df_out['report_end'].apply(
            lambda end:
            (end + pd.DateOffset(days=1) - pd.DateOffset(months=6)
             ).replace(hour=0, minute=0, second=0))

        # Sanity-check the derived start dates (half-year periods).
        for start in df_out['report_start']:
            utils.check_assumption(
                start.day == 1,
                "Report Start date should be the first of the month")
            utils.check_assumption(
                start.month in [1, 7],
                "Report Start month should be January or July")
        return df_out

예제 #7

0

파일 보기

파일: trans_google_info.py 프로젝트: qut-dmrc/transparency-aggregator

    def process(self, df, report_start, report_end):
        """Turn a Google user-data-request table into the builder's output frame.

        Args:
            df: Source table, modified in place by the ``utils`` helpers. It
                is read via the 'user data requests', 'percentage of requests
                where some data produced', 'usersaccounts specified',
                'legal process' and 'period ending' columns.
            report_start: Not read by this method; the period is derived from
                the 'period ending' column.
            report_end: Not read by this method; see ``report_start``.

        Returns:
            The frame returned by ``builder.get_df()`` with 'reportend' and
            'reportstart' columns added.
        """
        utils.df_fix_columns(df)

        numeric_cols = [
            'user data requests',
            'percentage of requests where some data produced',
            'usersaccounts specified'
        ]

        utils.df_convert_to_numeric(df, numeric_cols)

        builder = DataFrameBuilder(df_in=df,
                                   platform='Google',
                                   platform_property='Google',
                                   report_start='',
                                   report_end='')

        utils.df_convert_from_percentage(
            df, 'percentage of requests where some data produced',
            'user data requests', 'number where some information produced')

        def get_request_type(row):
            """Map the row's 'legal process' value to a request type."""
            legal = row['legal process']
            if legal == 'Preservation Requests':
                return 'preservation requests'
            else:
                return 'requests for user data'

        def get_request_subtype(row):
            """Use the legal process as subtype, except for preservation requests."""
            legal = row['legal process']
            if legal == 'Preservation Requests':
                return 'all'
            else:
                return legal

        # Extract requests for user data from governments:
        builder.extract_columns(
            request_type=get_request_type,
            request_subtype=get_request_subtype,
            num_requests_col='user data requests',
            num_accounts_specified_col='usersaccounts specified',
            num_requests_complied_col='number where some information produced')

        df_out = builder.get_df()

        # A period ends at the last second of the 'period ending' day.
        df_out['reportend'] = df['period ending'].apply(
            lambda d: utils.str_to_date(d).replace(
                hour=23, minute=59, second=59))

        # A period starts six months before the day after its end, at midnight.
        df_out['reportstart'] = df_out['reportend'].apply(lambda end: (
            end + pd.DateOffset(days=1) - pd.DateOffset(
                months=6)).replace(hour=0, minute=0, second=0))

        # Validate the start dates computed above. The column is named
        # 'reportstart' here, so that is the column that must be checked.
        for start in df_out['reportstart']:
            utils.check_assumption(
                start.day == 1,
                "Report Start date should be the first of the month")
            utils.check_assumption(
                start.month in [1, 7],
                "Report Start month should be January or July")
        return df_out

예제 #8

0

파일 보기

파일: trans_facebook.py 프로젝트: qut-dmrc/transparency-aggregator

    def process(self, df, report_start, report_end):
        """Turn a Facebook government-request table into the builder's output frame.

        Three groups of rows are extracted: requests for user data, content
        restrictions, and account preservation requests.

        Args:
            df: Source table, modified in place (columns renamed, values
                converted, one derived column added).
            report_start: Reporting period start, forwarded to the builder.
            report_end: Reporting period end, forwarded to the builder.

        Returns:
            The frame returned by ``builder.get_df()``.
        """
        utils.df_fix_columns(df)

        total_col = 'total requests for user data'
        accounts_col = 'total user accounts referenced'
        pct_col = 'total percentage of requests where some data produced'
        complied_col = 'number of requests where some data produced'

        # Map the un-prefixed column names onto their 'total ...' equivalents.
        df.columns = df.columns.str.lower()
        df.rename(
            columns={
                'requests for user data': total_col,
                'user accounts referenced': accounts_col,
                'percentage of requests where some data produced': pct_col,
            },
            inplace=True,
        )

        utils.df_strip_char(df, pct_col, '%')

        # Convert strings to numbers.
        utils.df_convert_to_numeric(df, [
            total_col,
            accounts_col,
            pct_col,
            'content restrictions',
            'preservations requested',
            'preservations_num_affected',
            'usersaccounts preserved',
        ])

        # Percentage of the total, rounded to a whole number of requests.
        # (Original author's note: .astype(int, errors='ignore') doesn't seem
        # to work here.)
        df[complied_col] = (df[pct_col] * df[total_col] / 100.0).round()

        builder = DataFrameBuilder(
            df_in=df,
            platform='Facebook',
            platform_property='Facebook',
            report_start=report_start,
            report_end=report_end,
        )

        # Government requests for user data.
        builder.extract_columns(
            request_type='requests for user data',
            request_subtype='all',
            num_requests_col=total_col,
            num_accounts_specified_col=accounts_col,
            num_requests_complied_col=complied_col,
            num_accounts_complied_col='',
        )

        # Content restriction requests.
        builder.extract_columns(
            request_type='content restrictions',
            request_subtype='all',
            num_requests_col='content restrictions',
            num_accounts_specified_col='',
            num_requests_complied_col='',
        )

        # Account preservation requests.
        # TODO: check with facebook if all preservation requests are actioned
        builder.extract_columns(
            request_type='preservation requests',
            request_subtype='all',
            num_requests_col='preservations requested',
            num_accounts_specified_col='usersaccounts preserved',
            num_requests_complied_col='',
            num_accounts_complied_col='',
        )

        return builder.get_df()