def run(self, input_data):
    """Deduce the type/subtype of every column from a sample of the data.

    Writes results into ``self.transaction.lmd['column_stats']`` (flat v1
    format, typing info merged with ``additional_info``) and
    ``self.transaction.lmd['stats_v2']`` (nested format). Columns detected
    as foreign keys are appended to ``lmd['columns_to_ignore']`` when
    ``lmd['handle_foreign_keys']`` is enabled.

    :param input_data: transaction input holding a ``data_frame`` attribute
    """
    stats = defaultdict(dict)
    stats_v2 = defaultdict(dict)

    # Really bad that these parameters are implicitly passed through lmd.
    # Perhaps sampling can be moved somewhere upwards,
    # so that it can be reused by all downstream phases?
    sample_df = sample_data(
        input_data.data_frame,
        self.transaction.lmd['sample_margin_of_error'],
        self.transaction.lmd['sample_confidence_level'],
        self.log)

    for col_name in sample_df.columns.values:
        col_data = sample_df[col_name].dropna()

        (data_type, data_subtype, data_type_dist,
         data_subtype_dist, additional_info) = self.get_column_data_type(
            col_data, input_data.data_frame[col_name], col_name)

        type_data = {
            'data_type': data_type,
            'data_subtype': data_subtype,
            'data_type_dist': data_type_dist,
            'data_subtype_dist': data_subtype_dist,
        }

        # BUG FIX: the v1 entry must be a copy of type_data. Previously
        # `stats[col_name] = type_data; stats[col_name].update(...)` mutated
        # the same dict object stored below as stats_v2[col_name]['typing'],
        # leaking additional_info keys into the typing report.
        stats[col_name] = dict(type_data)
        stats[col_name].update(additional_info)
        stats_v2[col_name]['typing'] = type_data
        stats_v2[col_name]['additional_info'] = additional_info

        stats_v2[col_name]['is_foreign_key'] = is_foreign_key(
            col_data, col_name, data_subtype,
            additional_info['other_potential_subtypes'])
        stats[col_name]['is_foreign_key'] = stats_v2[col_name]['is_foreign_key']

        if (stats_v2[col_name]['is_foreign_key']
                and self.transaction.lmd['handle_foreign_keys']):
            self.transaction.lmd['columns_to_ignore'].append(col_name)

        if data_subtype_dist:
            self.log.info(f'Data distribution for column "{col_name}" '
                          f'of type "{data_type}" '
                          f'and subtype "{data_subtype}"')
            try:
                self.log.infoChart(
                    data_subtype_dist, type='list',
                    uid=f'Data Type Distribution for column "{col_name}"')
            except Exception:
                # Functionality is specific to mindsdb logger
                pass

    if not self.transaction.lmd.get('column_stats'):
        self.transaction.lmd['column_stats'] = {}
    if not self.transaction.lmd.get('stats_v2'):
        self.transaction.lmd['stats_v2'] = {}

    self.transaction.lmd['column_stats'].update(stats)
    self.transaction.lmd['stats_v2'].update(stats_v2)
def run(self, input_data):
    """Compute per-column data statistics over a sample of the input.

    Requires that type deduction has already populated
    ``lmd['column_stats']`` and ``lmd['stats_v2']``. Enriches both with
    empty-value reports, histograms, entropy-based bias estimates, quality
    scores and LOF outlier reports, then records sampling metadata in
    ``lmd['data_preparation']``.

    :param input_data: transaction input holding a ``data_frame`` attribute
    """
    stats = self.transaction.lmd['column_stats']
    stats_v2 = self.transaction.lmd['stats_v2']
    col_data_dict = {}

    sample_df = sample_data(
        input_data.data_frame,
        self.transaction.lmd['sample_margin_of_error'],
        self.transaction.lmd['sample_confidence_level'],
        self.log)

    # Empty columns get a minimal stats_v2 entry and are skipped below
    # (they are not present in sample_df's columns).
    for col_name in self.transaction.lmd['empty_columns']:
        stats_v2[col_name] = {}
        stats_v2[col_name]['empty'] = {'is_empty': True}
        self.log.warning(f'Column {col_name} is empty.')

    for col_name in sample_df.columns.values:
        self.log.info(f'Analyzing column: {col_name} !')
        data_type = stats_v2[col_name]['typing']['data_type']
        data_subtype = stats_v2[col_name]['typing']['data_subtype']

        col_data = sample_df[col_name].dropna()
        if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
            col_data = clean_int_and_date_data(col_data, self.log)
        col_data_dict[col_name] = col_data

        stats_v2[col_name]['empty'] = get_column_empty_values_report(
            input_data.data_frame[col_name])
        stats[col_name]['empty_cells'] = stats_v2[col_name]['empty']['empty_cells']
        stats[col_name]['empty_percentage'] = stats_v2[col_name]['empty']['empty_percentage']

        if data_type == DATA_TYPES.CATEGORICAL:
            # Categorical histograms are built on the full (unsampled) column
            hist_data = input_data.data_frame[col_name]
            stats_v2[col_name]['unique'] = get_uniq_values_report(
                input_data.data_frame[col_name])
        else:
            hist_data = col_data

        histogram, percentage_buckets = get_histogram(
            hist_data, data_type=data_type, data_subtype=data_subtype)
        stats_v2[col_name]['histogram'] = histogram
        stats_v2[col_name]['percentage_buckets'] = percentage_buckets
        stats[col_name]['histogram'] = histogram
        stats[col_name]['percentage_buckets'] = percentage_buckets

        if histogram:
            S, biased_buckets = compute_entropy_biased_buckets(
                histogram['y'], histogram['x'])
            stats_v2[col_name]['bias'] = {
                'entropy': S,
                'description': 'TBD'
            }
            if biased_buckets:
                stats_v2[col_name]['bias']['biased_buckets'] = biased_buckets
            if S < 0.8:
                # BUG FIX: this literal was broken across a physical newline
                # (a syntax error); re-joined and its grammar corrected.
                if data_type == DATA_TYPES.CATEGORICAL:
                    warning_str = "You may want to check if some categories occur too often or too little in this column."
                else:
                    warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                stats_v2[col_name]['bias']['warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data, it just indicates a higher than usual probability there might be some issue."

        self.compute_scores(col_name, sample_df, col_data_dict, stats)

        # compute_scores may add LOF outlier info; mirror it into stats_v2
        if 'lof_outliers' in stats[col_name]:
            stats_v2[col_name]['outliers'] = {
                'outlier_values': stats[col_name]['lof_outliers'],
                'outlier_score': stats[col_name]['lof_based_outlier_score'],
                'outlier_buckets': compute_outlier_buckets(
                    outlier_values=stats[col_name]['lof_outliers'],
                    hist_x=histogram['x'],
                    hist_y=histogram['y'],
                    percentage_buckets=percentage_buckets,
                    col_stats=stats[col_name]),
                'description': 'TBD'
            }

        stats_v2[col_name]['nr_warnings'] = 0
        for x in stats_v2[col_name].values():
            if isinstance(x, dict) and 'warning' in x:
                self.log.warning(x['warning'])
                stats_v2[col_name]['nr_warnings'] += 1
        self.log.info(f'Finished analyzing column: {col_name} !\n')

    log_interesting_stats(self.log, stats)

    self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']
    self.transaction.lmd['data_preparation']['total_row_count'] = len(input_data.data_frame)
    self.transaction.lmd['data_preparation']['used_row_count'] = len(sample_df)
def run(self, input_data):
    """Run type deduction over a sample of the input data.

    Populates ``lmd['column_stats']`` (flat v1 stats, typing merged with
    ``additional_info``, without the description text) and
    ``lmd['stats_v2']`` (nested stats with a ``'typing'`` section). When
    ``lmd['handle_foreign_keys']`` is set, columns detected as foreign keys
    are added to ``lmd['columns_to_ignore']``.

    :param input_data: transaction input holding a ``data_frame`` attribute
    """
    stats = defaultdict(dict)
    stats_v2 = defaultdict(dict)

    # Sampling parameters travel implicitly through lmd; sampling could
    # arguably be hoisted upstream and shared by all downstream phases.
    sample_df = sample_data(input_data.data_frame,
                            self.transaction.lmd['sample_margin_of_error'],
                            self.transaction.lmd['sample_confidence_level'],
                            self.log)

    for col_name in sample_df.columns.values:
        sampled_col = sample_df[col_name].dropna()
        full_col = input_data.data_frame[col_name]

        deduction = self.get_column_data_type(sampled_col, full_col, col_name)
        (data_type, data_subtype,
         data_type_dist, data_subtype_dist, additional_info) = deduction

        type_data = {
            'data_type': data_type,
            'data_subtype': data_subtype,
            'data_type_dist': data_type_dist,
            'data_subtype_dist': data_subtype_dist,
            'description': """A data type, in programming, is a classification that specifies which type of value a variable has and what type of mathematical, relational or logical operations can be applied to it without causing an error. 
A string, for example, is a data type that is used to classify text and an integer is a data type used to classify whole numbers."""
        }

        # The flat v1 entry is a deep copy so that merging additional_info
        # (and dropping the description) never touches stats_v2's 'typing'.
        flat_entry = deepcopy(type_data)
        flat_entry.update(additional_info)
        del flat_entry['description']
        stats[col_name] = flat_entry

        stats_v2[col_name]['typing'] = type_data
        stats_v2[col_name]['additional_info'] = additional_info

        fk_flag = is_foreign_key(sampled_col, col_name, data_subtype,
                                 additional_info['other_potential_subtypes'])
        stats_v2[col_name]['is_foreign_key'] = fk_flag
        stats[col_name]['is_foreign_key'] = fk_flag

        if fk_flag and self.transaction.lmd['handle_foreign_keys']:
            self.transaction.lmd['columns_to_ignore'].append(col_name)

        if data_subtype_dist:
            self.log.info(f'Data distribution for column "{col_name}" '
                          f'of type "{data_type}" '
                          f'and subtype "{data_subtype}"')
            try:
                self.log.infoChart(data_subtype_dist, type='list',
                                   uid=f'Data Type Distribution for column "{col_name}"')
            except Exception:
                # infoChart is specific to the mindsdb logger; other loggers
                # lack it, so the chart is best-effort only.
                pass

    if not self.transaction.lmd.get('column_stats'):
        self.transaction.lmd['column_stats'] = {}
    if not self.transaction.lmd.get('stats_v2'):
        self.transaction.lmd['stats_v2'] = {}

    self.transaction.lmd['column_stats'].update(stats)
    self.transaction.lmd['stats_v2'].update(stats_v2)
def run(self, input_data):
    """Compute per-column data statistics over a sample of the input.

    Requires that type deduction has already populated
    ``lmd['column_stats']`` and ``lmd['stats_v2']``. Enriches both with
    empty-value reports, histograms, entropy-based bias estimates, quality
    scores and LOF outlier reports, then records sampling metadata in
    ``lmd['data_preparation']``.

    :param input_data: transaction input holding a ``data_frame`` attribute
    """
    stats = self.transaction.lmd['column_stats']
    stats_v2 = self.transaction.lmd['stats_v2']
    col_data_dict = {}

    sample_df = sample_data(
        input_data.data_frame,
        self.transaction.lmd['sample_margin_of_error'],
        self.transaction.lmd['sample_confidence_level'],
        self.log)

    # Empty columns get a minimal stats_v2 entry and are skipped below
    # (they are not present in sample_df's columns).
    for col_name in self.transaction.lmd['empty_columns']:
        stats_v2[col_name] = {}
        stats_v2[col_name]['empty'] = {'is_empty': True}
        self.log.warning(f'Column {col_name} is empty.')

    for col_name in sample_df.columns.values:
        self.log.info(f'Analyzing column: {col_name} !')
        data_type = stats_v2[col_name]['typing']['data_type']
        data_subtype = stats_v2[col_name]['typing']['data_subtype']

        col_data = sample_df[col_name].dropna()
        if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
            col_data = clean_int_and_date_data(col_data, self.log)
        col_data_dict[col_name] = col_data

        stats_v2[col_name]['empty'] = get_column_empty_values_report(
            input_data.data_frame[col_name])
        stats[col_name]['empty_cells'] = stats_v2[col_name]['empty']['empty_cells']
        stats[col_name]['empty_percentage'] = stats_v2[col_name]['empty']['empty_percentage']

        if data_type == DATA_TYPES.CATEGORICAL:
            # Categorical histograms are built on the full (unsampled) column
            hist_data = input_data.data_frame[col_name]
            stats_v2[col_name]['unique'] = get_uniq_values_report(
                input_data.data_frame[col_name])
        else:
            hist_data = col_data

        histogram, percentage_buckets = get_histogram(
            hist_data, data_type=data_type, data_subtype=data_subtype)
        stats_v2[col_name]['histogram'] = histogram
        stats_v2[col_name]['percentage_buckets'] = percentage_buckets
        stats[col_name]['histogram'] = histogram
        stats[col_name]['percentage_buckets'] = percentage_buckets

        if histogram:
            S, biased_buckets = compute_entropy_biased_buckets(
                histogram['y'], histogram['x'])
            stats_v2[col_name]['bias'] = {
                'entropy': S,
                'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls 
this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected."""
            }
            if biased_buckets:
                stats_v2[col_name]['bias']['biased_buckets'] = biased_buckets
            if S < 0.8:
                # BUG FIX: corrected grammar in the user-facing warning
                # ("You may to check ... too often to too little in this
                # columns" -> "... too often or too little in this column").
                if data_type == DATA_TYPES.CATEGORICAL:
                    warning_str = "You may want to check if some categories occur too often or too little in this column."
                else:
                    warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                stats_v2[col_name]['bias']['warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data, it just indicates a higher than usual probability there might be some issue."

        self.compute_scores(col_name, sample_df, col_data_dict, stats)

        # compute_scores may add LOF outlier info; mirror it into stats_v2
        if 'lof_outliers' in stats[col_name]:
            stats_v2[col_name]['outliers'] = {
                'outlier_values': stats[col_name]['lof_outliers'],
                'outlier_score': stats[col_name]['lof_based_outlier_score'],
                'outlier_buckets': compute_outlier_buckets(
                    outlier_values=stats[col_name]['lof_outliers'],
                    hist_x=histogram['x'],
                    hist_y=histogram['y'],
                    percentage_buckets=percentage_buckets,
                    col_stats=stats[col_name]),
                'description': """Potential outliers can be thought as the "extremes", i.e., data points that are far from the center of mass (mean/median/interquartile range) of the data."""
            }

        stats_v2[col_name]['nr_warnings'] = 0
        for x in stats_v2[col_name].values():
            if isinstance(x, dict) and 'warning' in x:
                self.log.warning(x['warning'])
                stats_v2[col_name]['nr_warnings'] += 1
        self.log.info(f'Finished analyzing column: {col_name} !\n')

    log_interesting_stats(self.log, stats)

    self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']
    self.transaction.lmd['data_preparation']['total_row_count'] = len(input_data.data_frame)
    self.transaction.lmd['data_preparation']['used_row_count'] = len(sample_df)