def test_check_assumption_failed(self):
        with self.assertRaises(utils.AssumptionError) as context:
            with self.assertLogs(level="ERROR") as logger:
                utils.check_assumption(False, "Value should be true")

        # Assert after the `with` blocks: check_assumption raises immediately,
        # so any assertion placed after it inside the blocks would never run.
        self.assertEqual(1, len(logger.output))
        self.assertIn('ERROR:root:Assumption failed: Value should be true',
                      logger.output[0])
        self.assertIn('Value should be true', str(context.exception))
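
# A minimal sketch of the helper these tests exercise, inferred from the
# assertions above (the real transparency.utils implementation may differ):
import logging

class AssumptionError(Exception):
    """Raised when a sanity check on source data fails."""

def check_assumption(condition, message):
    # Log and raise so the failure shows up in the logs and halts processing.
    if not condition:
        logging.error("Assumption failed: %s", message)
        raise AssumptionError(message)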
Example #2
    def check_date_range(self, date_range, report_start, report_end):
        if date_range:
            start = utils.str_to_date(report_start[:10]).strftime('%Y %B')
            end = utils.str_to_date(report_end[:10]).strftime('%B')

            expected = f"{start}-{end}"
            utils.check_assumption(
                date_range.startswith(expected),
                f"Expected '{date_range}' to start with '{expected}'.")
            return 1
        else:
            return 0
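
# Worked example of the expected prefix built above, assuming
# utils.str_to_date parses ISO 'YYYY-MM-DD' strings (hypothetical inputs):
from datetime import datetime

start = datetime(2019, 1, 1).strftime('%Y %B')  # '2019 January'
end = datetime(2019, 6, 30).strftime('%B')      # 'June'
assert f"{start}-{end}" == '2019 January-June'
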
    def process(self, df, report_start, report_end):
        utils.df_fix_columns(df)

        """['country', 'removal requests', 'percentage of requests where some content was removed'],
        """

        col_map = {
            'percentage of requests where some content was removed': 'percent removal requests complied',
        }

        # TODO - figure out what to do with nulls in the source data, because they are almost certainly zeroes
        df.columns = [utils.strip_punctuation(x.lower()) for x in df.columns.values]
        df.rename(columns=col_map, inplace=True)

        utils.df_strip_char(df, 'percent removal requests complied', '%')

        # convert strings to numbers
        numeric_cols = [
            'removal requests',
            'percent removal requests complied',
        ]

        utils.df_convert_to_int(df, numeric_cols)

        utils.df_percentage_to_count(df, 'percent removal requests complied',
                                     'removal requests',
                                     'removal requests complied')

        builder = DataFrameBuilder(df_in=df, platform='Snap', platform_property='Snapchat',
                                   report_start=report_start, report_end=report_end)


        # Extract removal requests from governments:
        builder.extract_columns(
            request_type='content restrictions',
            request_subtype='all',
            num_requests_col='removal requests',
            num_accounts_specified_col='',
            num_requests_complied_col='removal requests complied',
        )

        # Check that the dates passed in match the dates in the table
        # (an empty report_start/report_end skips the corresponding check)
        start_dates_match = ((report_start == df['report start from table']) | (report_start == '')).all()
        end_dates_match = ((report_end == df['report end from table']) | (report_end == '')).all()
        utils.check_assumption(start_dates_match, "Start dates in table did not match dates passed in.")
        utils.check_assumption(end_dates_match, "End dates in table did not match dates passed in.")

        df_out = builder.get_df()
        df_out['reportstart'] = df_out['report start from table']
        df_out['reportend'] = df_out['report end from table']

        return df_out
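
# A minimal sketch of utils.df_percentage_to_count as used above; the real
# helper may differ, this only illustrates deriving a count from a percentage:
import pandas as pd

def df_percentage_to_count(df, percent_col, total_col, out_col):
    # e.g. 45% of 200 removal requests -> 90 requests complied
    df[out_col] = (df[percent_col] / 100.0 * df[total_col]).round()

df = pd.DataFrame({'removal requests': [200],
                   'percent removal requests complied': [45]})
df_percentage_to_count(df, 'percent removal requests complied',
                       'removal requests', 'removal requests complied')
assert df['removal requests complied'].iloc[0] == 90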
Example #4
    def read(self, filename):
        with open(filename, encoding="utf8") as file:
            soup = BeautifulSoup(file, 'html.parser')

        code_block = soup.find('code',
                               {"id": "templates/legal/transparency-content"})
        json_comments = code_block.find_all(
            string=lambda text: isinstance(text, Comment))

        linkedin_json_str = json_comments[0]

        # The contract for Reader is that all elements of the dataframe are
        # strings; parse_int=str makes json.loads apply str() to integer
        # literals, so they are loaded as strings.
        linkedin_json = json.loads(linkedin_json_str, parse_int=str)

        gov_reqs = linkedin_json['governmentRequestsTable']
        data = []
        date_range_check_count = 0

        for (i, dat) in enumerate(gov_reqs):
            (report_start, report_end) = self.linkedin_data_index_to_dates(i)
            date_range = dat.get('dateRange')
            for row in dat['countries']:
                data.append({
                    "report_start": report_start,
                    "report_end": report_end,
                    **row
                })

            date_range_check_count += self.check_date_range(
                date_range, report_start, report_end)

        utils.check_assumption(
            date_range_check_count > 0,
            "No date checks performed. Find another way of checking date assumptions."
        )

        df = pd.DataFrame(data)

        logging.debug("Found {} rows.".format(df.shape[0]))

        return df
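
# Self-contained illustration of the comment-extraction technique used in
# read() above (the HTML snippet is hypothetical, not LinkedIn's real markup):
import json
from bs4 import BeautifulSoup, Comment

html = ('<code id="templates/legal/transparency-content">'
        '<!-- {"governmentRequestsTable": [], "total": 12345678901} -->'
        '</code>')
soup = BeautifulSoup(html, 'html.parser')
block = soup.find('code', {"id": "templates/legal/transparency-content"})
comment = block.find_all(string=lambda text: isinstance(text, Comment))[0]
# parse_int=str keeps large counts as strings, per the Reader contract
parsed = json.loads(comment, parse_int=str)
assert parsed['total'] == '12345678901'
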
    def check(self, df):
        actual_cols = df.columns.values
        actual_cols = np.array(
            [utils.strip_punctuation(x.lower()) for x in actual_cols])

        expected_cols_array = self.config['expected_source_columns_array']
        expected_cols_array = [[utils.strip_punctuation(x.lower()) for x in y]
                               for y in expected_cols_array]

        diffs = [
            self.get_diff(actual_cols, expected_cols)
            for expected_cols in expected_cols_array
        ]

        minimal_diff = min(diffs, key=lambda d: d['diff_count'])

        missing_cols = minimal_diff['missing_cols']
        extra_cols = minimal_diff['extra_cols']

        # Fail loudly if columns are both missing and unexpected at once,
        # since that usually means the source format changed wholesale.
        utils.check_assumption(
            not (len(missing_cols) > 0 and len(extra_cols) > 0),
            f"Unexpected missing columns ({json.dumps(missing_cols)}) " +
            f"and extra columns ({json.dumps(extra_cols)})")

        utils.check_assumption(
            len(extra_cols) == 0,
            "Unexpected extra columns: " + json.dumps(extra_cols))
        utils.check_assumption(
            len(missing_cols) == 0,
            "Unexpected missing columns: " + json.dumps(missing_cols))
    def test_check_assumption(self):
        with mock.patch('transparency.utils.logging.error') as mock_ret:
            utils.check_assumption(True, "Value should be true")
            mock_ret.assert_not_called()
Example #7
    def process(self, df, report_start, report_end):
        utils.df_fix_columns(df)

        # Google does not report removal requests from some countries where
        # the number is low; this is signified by '<10' in the number of
        # requests column. Sometimes Google reports '?' instead. We drop
        # all such rows.
        df = df[~df['all requests number of requests'].isin(['<10', '?'])]

        numeric_cols = [
            'all requests number of requests',
            'all requests fully or partially complied with',
            'all requests items requested to be removed',
            'court orders number of requests',
            'court orders fully or partially complied with',
            'court orders items requested to be removed',
            'other requests executive police etc number of requests',
            'other requests executive police etc fully or partially complied with',
            'other requests executive police etc items requested to be removed',
        ]

        utils.df_convert_to_numeric(df, numeric_cols)

        builder = DataFrameBuilder(df_in=df,
                                   platform='Google',
                                   platform_property='Google',
                                   report_start='',
                                   report_end='')

        utils.df_convert_from_percentage(
            df, 'all requests fully or partially complied with',
            'all requests number of requests',
            'all requests number where some content removed')

        builder.extract_columns(
            request_type='removal requests',
            request_subtype='all',
            num_requests_col='all requests number of requests',
            num_content_specified_col='all requests items requested to be removed',
            num_requests_complied_col='all requests number where some content removed')

        df_out = builder.get_df()
        df_out['report_end'] = df['period ending'].apply(
            lambda d: utils.str_to_date(d).replace(
                hour=23, minute=59, second=59))

        df_out['report_start'] = df_out['report_end'].apply(
            lambda report_end:
            (report_end + pd.DateOffset(days=1) - pd.DateOffset(months=6)
             ).replace(hour=0, minute=0, second=0))

        for report_start in df_out['report_start']:
            utils.check_assumption(
                report_start.day == 1,
                "Report Start date should be the first of the month")
            utils.check_assumption(
                report_start.month in [1, 7],
                "Report Start month should be January or July")
        return df_out
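
# Worked example of the report_start derivation above: for a half-year
# period ending 2018-06-30 23:59:59, the derived start is 2018-01-01.
import pandas as pd

report_end = pd.Timestamp('2018-06-30 23:59:59')
report_start = (report_end + pd.DateOffset(days=1) -
                pd.DateOffset(months=6)).replace(hour=0, minute=0, second=0)
assert report_start == pd.Timestamp('2018-01-01 00:00:00')
assert report_start.day == 1 and report_start.month in (1, 7)
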
    def process(self, df, report_start, report_end):
        utils.df_fix_columns(df)

        numeric_cols = [
            'user data requests',
            'percentage of requests where some data produced',
            'usersaccounts specified'
        ]

        utils.df_convert_to_numeric(df, numeric_cols)

        builder = DataFrameBuilder(df_in=df,
                                   platform='Google',
                                   platform_property='Google',
                                   report_start='',
                                   report_end='')

        utils.df_convert_from_percentage(
            df, 'percentage of requests where some data produced',
            'user data requests', 'number where some information produced')

        def get_request_type(row):
            legal = row['legal process']
            if legal == 'Preservation Requests':
                return 'preservation requests'
            else:
                return 'requests for user data'

        def get_request_subtype(row):
            legal = row['legal process']
            if legal == 'Preservation Requests':
                return 'all'
            else:
                return legal

        builder.extract_columns(
            request_type=get_request_type,
            request_subtype=get_request_subtype,
            num_requests_col='user data requests',
            num_accounts_specified_col='usersaccounts specified',
            num_requests_complied_col='number where some information produced')

        # Extract requests for user data from governments:

        df_out = builder.get_df()
        df_out['reportend'] = df['period ending'].apply(
            lambda d: utils.str_to_date(d).replace(
                hour=23, minute=59, second=59))

        df_out['reportstart'] = df_out['reportend'].apply(
            lambda report_end:
            (report_end + pd.DateOffset(days=1) - pd.DateOffset(months=6)
             ).replace(hour=0, minute=0, second=0))

        for report_start in df_out['reportstart']:
            utils.check_assumption(
                report_start.day == 1,
                "Report Start date should be the first of the month")
            utils.check_assumption(
                report_start.month in [1, 7],
                "Report Start month should be January or July")
        return df_out
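
# extract_columns above accepts either a fixed string or a per-row callable
# for request_type/request_subtype. A minimal sketch of how a builder might
# resolve the two forms (hypothetical, not DataFrameBuilder's actual code):
def resolve(value, row):
    # A callable is evaluated per row; a plain string applies to every row.
    return value(row) if callable(value) else value

row = {'legal process': 'Subpoena'}
assert resolve(lambda r: r['legal process'].lower(), row) == 'subpoena'
assert resolve('requests for user data', row) == 'requests for user data'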