def sample_data(df, sample_margin_of_error, sample_confidence_level, log):
    """Return a random subsample of `df` large enough for the requested margin of error.

    For populations of 50 rows or fewer the whole frame is used; otherwise the
    sample size comes from Cochran's formula via `calculate_sample_size`.

    :param df: pandas DataFrame to sample from
    :param sample_margin_of_error: accepted margin of error for the sample
    :param sample_confidence_level: confidence level for the sample
    :param log: logger used to report how much of the data is being analyzed
    :return: a DataFrame containing the randomly selected rows
    """
    population_size = len(df)
    if population_size > 50:
        sample_size = int(
            calculate_sample_size(population_size, sample_margin_of_error, sample_confidence_level)
        )
    else:
        # Tiny datasets are analyzed in full — sampling them gains nothing.
        sample_size = population_size

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)

    # Guard the percentage computation: an empty frame would divide by zero.
    if population_size > 0:
        sample_size_pct = sample_size * 100 / population_size
        log.info(f'Analyzing a sample of {sample_size} from a total population of {population_size}, this is equivalent to {sample_size_pct}% of your data.')

    return df.iloc[input_data_sample_indexes]
def run(self, input_data, modify_light_metadata, hmd=None, print_logs=True):
    """
    # Runs the stats generation phase
    # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
    # A lot of information about the data distribution and quality will also be logged to the server in this phase
    """
    if print_logs == False:
        # Swallow all output: non-propagating logger with no handlers attached.
        self.log = logging.getLogger('null-logger')
        self.log.propagate = False

    # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error
    population_size = len(input_data.data_frame)
    if population_size < 50:
        sample_size = population_size
    else:
        sample_size = int(calculate_sample_size(population_size=population_size,
                                                margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                                                confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))
        #if sample_size > 3000 and sample_size > population_size/8:
        #    sample_size = min(round(population_size/8),3000)

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.log.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100))

    all_sampled_data = input_data.data_frame.iloc[input_data_sample_indexes]

    stats = {}
    col_data_dict = {}

    # BUGFIX: the malformed-column branch below appended `i` to
    # lmd['malformed_columns']['indices'], but `i` was never the column index —
    # it was only ever bound (as a float) by the histogram-bucket walk further
    # down, so this raised NameError for the first malformed column and recorded
    # a garbage value afterwards. Enumerate so `i` is the column's position.
    for i, col_name in enumerate(all_sampled_data.columns.values):
        col_data = all_sampled_data[col_name].dropna()
        full_col_data = all_sampled_data[col_name]

        data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(col_data, input_data.data_frame, col_name)

        if column_status == 'Column empty':
            if modify_light_metadata:
                self.transaction.lmd['malformed_columns']['names'].append(col_name)
                self.transaction.lmd['malformed_columns']['indices'].append(i)
            continue

        # Timestamp columns are normalized to unix epoch integers before stats.
        new_col_data = []
        if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:  #data_type == DATA_TYPES.DATE:
            for element in col_data:
                if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']:
                    new_col_data.append(None)
                else:
                    try:
                        new_col_data.append(int(parse_datetime(element).timestamp()))
                    except:
                        self.log.warning(f'Could not convert string from col "{col_name}" to date and it was expected, instead got: {element}')
                        new_col_data.append(None)
            col_data = new_col_data

        if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:
            histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
            x = histogram['x']
            y = histogram['y']

            col_data = StatsGenerator.clean_int_and_date_data(col_data)
            # This means the column is all nulls, which we don't handle at the moment
            if len(col_data) < 1:
                return None

            xp = []
            if len(col_data) > 0:
                max_value = max(col_data)
                min_value = min(col_data)
                mean = np.mean(col_data)
                median = np.median(col_data)
                var = np.var(col_data)
                skew = st.skew(col_data)
                kurtosis = st.kurtosis(col_data)

                # Build exponentially widening percentage buckets from min to max.
                # (Renamed the walk variable from `i` so it can never shadow the
                # column index used by the malformed-column bookkeeping above.)
                inc_rate = 0.1
                initial_step_size = abs(max_value - min_value) / 100
                xp += [min_value]
                bucket = min_value + initial_step_size
                while bucket < max_value:
                    xp += [bucket]
                    bucket_inc = abs(bucket - min_value) * inc_rate
                    bucket = bucket + bucket_inc
            else:
                max_value = 0
                min_value = 0
                mean = 0
                median = 0
                var = 0
                skew = 0
                kurtosis = 0
                xp = []

            # True when any sampled value has a fractional part.
            is_float = True if max([1 if int(v) != v else 0 for v in col_data]) == 1 else False

            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "mean": mean,
                "median": median,
                "variance": var,
                "skewness": skew,
                "kurtosis": kurtosis,
                "max": max_value,
                "min": min_value,
                "is_float": is_float,
                "histogram": {
                    "x": x,
                    "y": y
                },
                "percentage_buckets": xp
            }
        elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE:
            # Categorical histograms are computed over the FULL frame, not the sample.
            histogram, _ = StatsGenerator.get_histogram(input_data.data_frame[col_name], data_type=data_type, data_subtype=curr_data_subtype)
            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "histogram": histogram,
                "percentage_buckets": histogram['x']
            }
        elif curr_data_subtype == DATA_SUBTYPES.IMAGE:
            histogram, percentage_buckets = StatsGenerator.get_histogram(col_data, data_subtype=curr_data_subtype)
            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                'percentage_buckets': percentage_buckets,
                'histogram': histogram
            }
        # @TODO This is probably wrong, look into it a bit later
        else:
            # see if its a sentence or a word
            histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
            dictionary = list(histogram.keys())

            # if no words, then no dictionary
            if len(col_data) == 0:
                dictionary_available = False
                dictionary_lenght_percentage = 0
                dictionary = []
            else:
                dictionary_available = True
                dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100

            # if the number of uniques is too large then treat is a text
            is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False
            if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                dictionary = []
                dictionary_available = False

            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "dictionary": dictionary,
                "dictionaryAvailable": dictionary_available,
                "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                "histogram": histogram
            }

        stats[col_name] = col_stats
        stats[col_name]['data_type_dist'] = data_type_dist
        stats[col_name]['data_subtype_dist'] = data_subtype_dist
        stats[col_name]['column'] = col_name

        empty_count = len(full_col_data) - len(col_data)
        stats[col_name]['empty_cells'] = empty_count
        stats[col_name]['empty_percentage'] = empty_count * 100 / len(full_col_data)
        if 'separator' in additional_info:
            stats[col_name]['separator'] = additional_info['separator']
        col_data_dict[col_name] = col_data

    # Second pass: per-column quality scores (skipping malformed columns).
    for col_name in all_sampled_data.columns:
        if col_name in self.transaction.lmd['malformed_columns']['names']:
            continue

        stats[col_name].update(self._compute_duplicates_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_empty_cells_score(stats, all_sampled_data, col_name))
        #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_data_type_dist_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_z_score(stats, col_data_dict, col_name))
        stats[col_name].update(self._compute_lof_score(stats, col_data_dict, col_name))
        stats[col_name].update(self._compute_similariy_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_value_distribution_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_consistency_score(stats, col_name))
        stats[col_name].update(self._compute_redundancy_score(stats, col_name))
        stats[col_name].update(self._compute_variability_score(stats, col_name))
        stats[col_name].update(self._compute_data_quality_score(stats, col_name))

    total_rows = len(input_data.data_frame)

    if modify_light_metadata:
        self.transaction.lmd['column_stats'] = stats
        if 'sample_margin_of_error' in self.transaction.lmd and self.transaction.lmd['sample_margin_of_error'] is not None:
            self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']
        else:
            self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = CONFIG.DEFAULT_MARGIN_OF_ERROR
        self.transaction.lmd['data_preparation']['total_row_count'] = total_rows
        self.transaction.lmd['data_preparation']['used_row_count'] = sample_size
        self.transaction.lmd['data_preparation']['test_row_count'] = len(input_data.test_indexes[KEY_NO_GROUP_BY])
        self.transaction.lmd['data_preparation']['train_row_count'] = len(input_data.train_indexes[KEY_NO_GROUP_BY])
        self.transaction.lmd['data_preparation']['validation_row_count'] = len(input_data.validation_indexes[KEY_NO_GROUP_BY])

    self._log_interesting_stats(stats)
    return stats
def run(self):
    """Split the prepared input DataFrame into train/test/validation index sets.

    Populates `self.transaction.input_data`'s index dictionaries (optionally per
    group-by group), slices the corresponding `train_df`/`test_df`/`validation_df`
    frames, and logs how the data was partitioned for LEARN transactions.
    """
    result = self._get_prepared_input_df()

    self.transaction.input_data.columns = result.columns.values.tolist()
    self.transaction.lmd['columns'] = self.transaction.input_data.columns
    self.transaction.input_data.data_frame = result

    self._validate_input_data_integrity()

    is_time_series = self.transaction.lmd['model_is_time_series']
    group_by = self.transaction.lmd['model_group_by']

    # BUGFIX: `columns` was referenced below but never bound in this scope,
    # raising NameError whenever a group-by was configured.
    columns = self.transaction.input_data.columns

    # create a list of the column numbers (indexes) that make the group by, this is so that we can greate group by hashes for each row
    if len(group_by) > 0:
        group_by_col_indexes = [columns.index(group_by_column) for group_by_column in group_by]

    # create all indexes by group by, that is all the rows that belong to each group by
    self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY] = []

    for i, row in self.transaction.input_data.data_frame.iterrows():
        if len(group_by) > 0:
            group_by_value = '_'.join([str(row[group_by_index]) for group_by_index in group_by_col_indexes])

            if group_by_value not in self.transaction.input_data.all_indexes:
                self.transaction.input_data.all_indexes[group_by_value] = []
            self.transaction.input_data.all_indexes[group_by_value] += [i]

        self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] += [i]

    # move indexes to corresponding train, test, validation, etc and trim input data accordingly
    for key in self.transaction.input_data.all_indexes:
        # If this is a group by, skip the `KEY_NO_GROUP_BY` key
        if len(self.transaction.input_data.all_indexes) > 1 and key == KEY_NO_GROUP_BY:
            continue

        length = len(self.transaction.input_data.all_indexes[key])
        if self.transaction.lmd['type'] == TRANSACTION_LEARN:
            # NOTE(review): `sample_size` is computed but never applied to the
            # splits below — see the @TODO about sampling further down.
            sample_size = int(calculate_sample_size(population_size=length,
                                                    margin_error=self.transaction.lmd['sample_margin_of_error'],
                                                    confidence_level=self.transaction.lmd['sample_confidence_level']))

            # this evals True if it should send the entire group data into test, train or validation as opposed to breaking the group into the subsets
            should_split_by_group = type(group_by) == list and len(group_by) > 0

            if should_split_by_group:
                self.transaction.input_data.train_indexes[key] = self.transaction.input_data.all_indexes[key][0:round(length - length * CONFIG.TEST_TRAIN_RATIO)]
                self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.train_indexes[key])

                self.transaction.input_data.test_indexes[key] = self.transaction.input_data.all_indexes[key][round(length - length * CONFIG.TEST_TRAIN_RATIO):int(round(length - length * CONFIG.TEST_TRAIN_RATIO) + round(length * CONFIG.TEST_TRAIN_RATIO / 2))]
                self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.test_indexes[key])

                self.transaction.input_data.validation_indexes[key] = self.transaction.input_data.all_indexes[key][(round(length - length * CONFIG.TEST_TRAIN_RATIO) + round(length * CONFIG.TEST_TRAIN_RATIO / 2)):]
                self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.validation_indexes[key])
            else:
                # make sure that the last in the time series are also the subset used for test
                train_window = (0, int(length * (1 - 2 * CONFIG.TEST_TRAIN_RATIO)))
                self.transaction.input_data.train_indexes[key] = self.transaction.input_data.all_indexes[key][train_window[0]:train_window[1]]

                validation_window = (train_window[1], train_window[1] + int(length * CONFIG.TEST_TRAIN_RATIO))
                test_window = (validation_window[1], length)

                self.transaction.input_data.test_indexes[key] = self.transaction.input_data.all_indexes[key][test_window[0]:test_window[1]]
                self.transaction.input_data.validation_indexes[key] = self.transaction.input_data.all_indexes[key][validation_window[0]:validation_window[1]]

    self.transaction.input_data.train_df = self.transaction.input_data.data_frame.iloc[self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY]].copy()
    self.transaction.input_data.test_df = self.transaction.input_data.data_frame.iloc[self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY]].copy()
    self.transaction.input_data.validation_df = self.transaction.input_data.data_frame.iloc[self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY]].copy()

    # @TODO: Consider deleting self.transaction.input_data.data_frame here

    # log some stats
    if self.transaction.lmd['type'] == TRANSACTION_LEARN:
        # @TODO I don't think the above works, fix at some point or just remove `sample_margin_of_error` option from the interface
        # BUGFIX: `total_rows_in_input` and `total_rows_used` were used in the
        # log messages below without ever being defined in this version.
        total_rows_in_input = len(self.transaction.input_data.data_frame)
        total_rows_used = sum([
            len(self.transaction.input_data.train_df),
            len(self.transaction.input_data.test_df),
            len(self.transaction.input_data.validation_df)
        ])

        if total_rows_in_input != total_rows_used:
            self.log.info('You requested to sample with a *margin of error* of {sample_margin_of_error} and a *confidence level* of {sample_confidence_level}. Therefore:'.format(sample_confidence_level=self.transaction.lmd['sample_confidence_level'], sample_margin_of_error=self.transaction.lmd['sample_margin_of_error']))
            self.log.info('Using a [Cochran\'s sample size calculator](https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/) we got the following sample sizes:')
            data = {
                'total': [total_rows_in_input, 'Total number of rows in input'],
                'subsets': [[total_rows_used, 'Total number of rows used']],
                'label': 'Sample size for margin of error of ({sample_margin_of_error}) and a confidence level of ({sample_confidence_level})'.format(sample_confidence_level=self.transaction.lmd['sample_confidence_level'], sample_margin_of_error=self.transaction.lmd['sample_margin_of_error'])
            }
            self.log.infoChart(data, type='pie')
        # @TODO Bad code ends here (see @TODO above)

        data = {
            'subsets': [
                [len(self.transaction.input_data.train_df), 'Train'],
                [len(self.transaction.input_data.test_df), 'Test'],
                [len(self.transaction.input_data.validation_df), 'Validation']
            ],
            'label': 'Number of rows per subset'
        }
        self.log.info('We have split the input data into:')
        self.log.infoChart(data, type='pie')
def run(self, input_data, modify_light_metadata, hmd=None, print_logs=True):
    """
    # Runs the stats generation phase
    # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
    # A lot of information about the data distribution and quality will also be logged to the server in this phase
    """
    '''
    @TODO Uncomment when we need multiprocessing, possibly disable on OSX
    no_processes = multiprocessing.cpu_count() - 2
    if no_processes < 1:
        no_processes = 1
    pool = multiprocessing.Pool(processes=no_processes)
    '''
    if print_logs == False:
        # Swallow all output: non-propagating logger with no handlers attached.
        self.log = logging.getLogger('null-logger')
        self.log.propagate = False

    # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error
    population_size = len(input_data.data_frame)
    if population_size < 50:
        sample_size = population_size
    else:
        sample_size = int(calculate_sample_size(population_size=population_size,
                                                margin_error=self.transaction.lmd['sample_margin_of_error'],
                                                confidence_level=self.transaction.lmd['sample_confidence_level']))
        #if sample_size > 3000 and sample_size > population_size/8:
        #    sample_size = min(round(population_size/8),3000)

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.log.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size / population_size) * 100))

    all_sampled_data = input_data.data_frame.iloc[input_data_sample_indexes]

    stats = {}
    col_data_dict = {}

    for col_name in all_sampled_data.columns.values:
        if col_name in self.transaction.lmd['columns_to_ignore']:
            continue

        col_data = all_sampled_data[col_name].dropna()
        full_col_data = all_sampled_data[col_name]

        data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(col_data, input_data.data_frame, col_name)

        if column_status == 'Column empty':
            if modify_light_metadata:
                self.transaction.lmd['columns_to_ignore'].append(col_name)
            continue

        # Timestamp columns are normalized to unix epoch integers before stats.
        new_col_data = []
        if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:  #data_type == DATA_TYPES.DATE:
            for element in col_data:
                if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']:
                    new_col_data.append(None)
                else:
                    try:
                        new_col_data.append(int(parse_datetime(element).timestamp()))
                    except:
                        self.log.warning(f'Could not convert string from col "{col_name}" to date and it was expected, instead got: {element}')
                        new_col_data.append(None)
            col_data = new_col_data

        if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:
            histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
            x = histogram['x']
            y = histogram['y']

            col_data = StatsGenerator.clean_int_and_date_data(col_data)
            # This means the column is all nulls, which we don't handle at the moment
            if len(col_data) < 1:
                return None

            if len(col_data) > 0:
                max_value = max(col_data)
                min_value = min(col_data)
                mean = np.mean(col_data)
                median = np.median(col_data)
                var = np.var(col_data)
                skew = st.skew(col_data)
                kurtosis = st.kurtosis(col_data)
            else:
                max_value = 0
                min_value = 0
                mean = 0
                median = 0
                var = 0
                skew = 0
                kurtosis = 0

            # True when any sampled value has a fractional part.
            is_float = True if max([1 if int(v) != v else 0 for v in col_data]) == 1 else False

            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "mean": mean,
                "median": median,
                "variance": var,
                "skewness": skew,
                "kurtosis": kurtosis,
                "max": max_value,
                "min": min_value,
                "is_float": is_float,
                "histogram": {
                    "x": x,
                    "y": y
                },
                "percentage_buckets": histogram['x']  #xp
            }
        elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE:
            # Categorical histograms are computed over the FULL frame, not the sample.
            histogram, _ = StatsGenerator.get_histogram(input_data.data_frame[col_name], data_type=data_type, data_subtype=curr_data_subtype)
            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "histogram": histogram,
                "percentage_buckets": histogram['x']
            }
        elif curr_data_subtype == DATA_SUBTYPES.IMAGE:
            histogram, percentage_buckets = StatsGenerator.get_histogram(col_data, data_subtype=curr_data_subtype)
            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                'percentage_buckets': percentage_buckets,
                'histogram': histogram
            }
        # @TODO This is probably wrong, look into it a bit later
        else:
            # see if its a sentence or a word
            histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
            dictionary = list(histogram.keys())

            # if no words, then no dictionary
            if len(col_data) == 0:
                dictionary_available = False
                dictionary_lenght_percentage = 0
                dictionary = []
            else:
                dictionary_available = True
                dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100

            # if the number of uniques is too large then treat is a text
            is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False
            if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                dictionary = []
                dictionary_available = False

            col_stats = {
                'data_type': data_type,
                'data_subtype': curr_data_subtype,
                "dictionary": dictionary,
                "dictionaryAvailable": dictionary_available,
                "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                "histogram": histogram
            }

        stats[col_name] = col_stats
        stats[col_name]['data_type_dist'] = data_type_dist
        stats[col_name]['data_subtype_dist'] = data_subtype_dist
        stats[col_name]['column'] = col_name

        empty_count = len(full_col_data) - len(col_data)
        stats[col_name]['empty_cells'] = empty_count
        stats[col_name]['empty_percentage'] = empty_count * 100 / len(full_col_data)
        for k in additional_info:
            stats[col_name][k] = additional_info[k]
        col_data_dict[col_name] = col_data

    # Second pass: per-column quality scores (skipping ignored columns).
    for col_name in all_sampled_data.columns:
        if col_name in self.transaction.lmd['columns_to_ignore']:
            continue

        # Use the multiprocessing pool for computing scores which take a very long time to compute
        # For now there's only one and computing it takes way too long, so this is not enabled
        scores = []
        '''
        scores.append(pool.apply_async(compute_clf_based_correlation_score, args=(stats, all_sampled_data, col_name)))
        '''
        for score_promise in scores:
            # Wait for function on process to finish running
            score = score_promise.get()
            stats[col_name].update(score)

        # CLEANUP: removed dead per-score timing code (`start_time`, `fun_name`,
        # `run_duration`) — the values were computed but never read or logged.
        for score_func in [compute_duplicates_score, compute_empty_cells_score,
                           compute_data_type_dist_score, compute_z_score,
                           compute_lof_score, compute_similariy_score,
                           compute_value_distribution_score]:
            # z-score and LOF operate on the cleaned column data; the rest on the sampled frame.
            if 'compute_z_score' in str(score_func) or 'compute_lof_score' in str(score_func):
                stats[col_name].update(score_func(stats, col_data_dict, col_name))
            else:
                stats[col_name].update(score_func(stats, all_sampled_data, col_name))

        stats[col_name].update(compute_consistency_score(stats, col_name))
        stats[col_name].update(compute_redundancy_score(stats, col_name))
        stats[col_name].update(compute_variability_score(stats, col_name))
        stats[col_name].update(compute_data_quality_score(stats, col_name))

        stats[col_name]['is_foreign_key'] = self.is_foreign_key(col_name, stats[col_name], col_data_dict[col_name])
        if stats[col_name]['is_foreign_key'] and self.transaction.lmd['handle_foreign_keys']:
            self.transaction.lmd['columns_to_ignore'].append(col_name)

    total_rows = len(input_data.data_frame)

    if modify_light_metadata:
        self.transaction.lmd['column_stats'] = stats
        self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']
        self.transaction.lmd['data_preparation']['total_row_count'] = total_rows
        self.transaction.lmd['data_preparation']['used_row_count'] = sample_size

    '''
    @TODO Uncomment when we need multiprocessing, possibly disable on OSX
    pool.close()
    pool.join()
    '''
    self._log_interesting_stats(stats)
    return stats
def run(self):
    """Split the prepared input rows into train/test/validation index sets.

    Materializes the prepared input as `columns` + `data_array`, builds per-group
    row-index lists (keyed by the joined group-by values, plus `KEY_NO_GROUP_BY`
    for everything), slices each group into train/test/validation windows, and —
    for LEARN transactions — logs how the data was partitioned.
    """
    result = self._get_prepared_input_df()

    columns = list(result.columns.values)
    data_array = list(result.values.tolist())

    self.transaction.input_data.columns = columns
    self.transaction.input_data.data_array = data_array

    self._validate_input_data_integrity()

    # NOTE(review): `is_time_series` is read but never used below — the
    # time-series-style windowing is instead selected by `should_split_by_group`.
    is_time_series = self.transaction.lmd['model_is_time_series']
    group_by = self.transaction.lmd['model_group_by']

    # create a list of the column numbers (indexes) that make the group by, this is so that we can greate group by hashes for each row
    if len(group_by) > 0:
        group_by_col_indexes = [columns.index(group_by_column) for group_by_column in group_by]

    # create all indexes by group by, that is all the rows that belong to each group by
    self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY] = []
    self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY] = []

    for i, row in enumerate(self.transaction.input_data.data_array):
        if len(group_by) > 0:
            # Group key is the underscore-joined string of the row's group-by values.
            group_by_value = '_'.join([str(row[group_by_index]) for group_by_index in group_by_col_indexes])

            if group_by_value not in self.transaction.input_data.all_indexes:
                self.transaction.input_data.all_indexes[group_by_value] = []
            self.transaction.input_data.all_indexes[group_by_value] += [i]

        # Every row also lands in the catch-all bucket.
        self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] += [i]

    # move indexes to corresponding train, test, validation, etc and trim input data accordingly
    for key in self.transaction.input_data.all_indexes:
        # When grouping, the catch-all bucket would double-count rows — skip it.
        if len(self.transaction.input_data.all_indexes) > 1 and key == KEY_NO_GROUP_BY:
            continue

        length = len(self.transaction.input_data.all_indexes[key])
        if self.transaction.lmd['type'] == TRANSACTION_LEARN:
            # NOTE(review): `sample_size` is computed but never applied to the
            # splits below — presumably leftover from a sampling feature; confirm
            # before removing.
            sample_size = int(calculate_sample_size(population_size=length,
                                                    margin_error=self.transaction.lmd['sample_margin_of_error'],
                                                    confidence_level=self.transaction.lmd['sample_confidence_level']))

            # this evals True if it should send the entire group data into test, train or validation as opposed to breaking the group into the subsets
            should_split_by_group = type(group_by) == list and len(group_by) > 0

            if should_split_by_group:
                # Grouped split: first (1 - ratio) of the group is train, then half
                # the ratio for test, remainder for validation; each group's indexes
                # are also accumulated into the KEY_NO_GROUP_BY totals.
                self.transaction.input_data.train_indexes[key] = self.transaction.input_data.all_indexes[key][0:round(length - length * CONFIG.TEST_TRAIN_RATIO)]
                self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.train_indexes[key])

                self.transaction.input_data.test_indexes[key] = self.transaction.input_data.all_indexes[key][round(length - length * CONFIG.TEST_TRAIN_RATIO):int(round(length - length * CONFIG.TEST_TRAIN_RATIO) + round(length * CONFIG.TEST_TRAIN_RATIO / 2))]
                self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.test_indexes[key])

                self.transaction.input_data.validation_indexes[key] = self.transaction.input_data.all_indexes[key][(round(length - length * CONFIG.TEST_TRAIN_RATIO) + round(length * CONFIG.TEST_TRAIN_RATIO / 2)):]
                self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY].extend(self.transaction.input_data.validation_indexes[key])
            else:
                # make sure that the last in the time series are also the subset used for test
                train_window = (0, int(length * (1 - 2 * CONFIG.TEST_TRAIN_RATIO)))
                self.transaction.input_data.train_indexes[key] = self.transaction.input_data.all_indexes[key][train_window[0]:train_window[1]]

                validation_window = (train_window[1], train_window[1] + int(length * CONFIG.TEST_TRAIN_RATIO))
                test_window = (validation_window[1], length)

                self.transaction.input_data.test_indexes[key] = self.transaction.input_data.all_indexes[key][test_window[0]:test_window[1]]
                self.transaction.input_data.validation_indexes[key] = self.transaction.input_data.all_indexes[key][validation_window[0]:validation_window[1]]

    # log some stats
    if self.transaction.lmd['type'] == TRANSACTION_LEARN:
        total_rows_used_by_subset = {'train': 0, 'test': 0, 'validation': 0}
        average_number_of_rows_used_per_groupby = {'train': 0, 'test': 0, 'validation': 0}
        number_of_groups_per_subset = {'train': 0, 'test': 0, 'validation': 0}

        for group_key in total_rows_used_by_subset:
            # e.g. 'train' -> self.transaction.input_data.train_indexes
            pointer = getattr(self.transaction.input_data, group_key + '_indexes')
            total_rows_used_by_subset[group_key] = sum([len(pointer[key_i]) for key_i in pointer])
            number_of_groups_per_subset[group_key] = len(pointer)
            #average_number_of_rows_used_per_groupby[group_key] = total_rows_used_by_subset[group_key] / number_of_groups_per_subset[group_key]

        total_rows_used = sum(total_rows_used_by_subset.values())
        total_rows_in_input = len(self.transaction.input_data.data_array)
        total_number_of_groupby_groups = len(self.transaction.input_data.all_indexes)

        # Only mention sampling when the splits don't cover every input row.
        if total_rows_used != total_rows_in_input:
            self.log.info('You requested to sample with a *margin of error* of {sample_margin_of_error} and a *confidence level* of {sample_confidence_level}. Therefore:'.format(sample_confidence_level=self.transaction.lmd['sample_confidence_level'], sample_margin_of_error=self.transaction.lmd['sample_margin_of_error']))
            self.log.info('Using a [Cochran\'s sample size calculator](https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/) we got the following sample sizes:')
            data = {
                'total': [total_rows_in_input, 'Total number of rows in input'],
                'subsets': [[total_rows_used, 'Total number of rows used']],
                'label': 'Sample size for margin of error of ({sample_margin_of_error}) and a confidence level of ({sample_confidence_level})'.format(sample_confidence_level=self.transaction.lmd['sample_confidence_level'], sample_margin_of_error=self.transaction.lmd['sample_margin_of_error'])
            }
            self.log.infoChart(data, type='pie')

        '''
        if total_number_of_groupby_groups > 1:
            self.log.info('You are grouping your data by [{group_by}], we found:'.format(group_by=', '.join(group_by)))
            data = {
                'Total number of groupby groups': total_number_of_groupby_groups,
                'Average number of rows per groupby group': int(sum(average_number_of_rows_used_per_groupby.values())/len(average_number_of_rows_used_per_groupby))
            }
            self.log.infoChart(data, type='list')
        '''

        self.log.info('We have split the input data into:')
        data = {
            'subsets': [[total_rows_used_by_subset['train'], 'Train'],
                        [total_rows_used_by_subset['test'], 'Test'],
                        [total_rows_used_by_subset['validation'], 'Validation']],
            'label': 'Number of rows per subset'
        }
        self.log.infoChart(data, type='pie')
def run(self):
    """
    Run the stats-generation phase over a random sample of the input data.

    Does not alter the columns themselves; instead it builds the `stats`
    metadata object (one entry per column: detected type, distribution
    information and data-quality scores), stores it in
    `self.transaction.lmd['column_stats']`, records row counts under
    `self.transaction.lmd['data_preparation']`, logs interesting stats
    and returns the stats dict.
    """
    header = self.transaction.input_data.columns
    non_null_data = {column: [] for column in header}
    all_sampled_data = {column: [] for column in header}
    empty_count = {}
    column_count = {}

    # We don't need to generate statistics over all of the data, so we
    # subsample based on our accepted margin of error / confidence level.
    population_size = len(self.transaction.input_data.data_array)
    sample_size = int(
        calculate_sample_size(
            population_size=population_size,
            margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
            confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))
    # Cap very large samples at min(population/8, 3000) rows
    if sample_size > 3000 and sample_size > population_size / 8:
        sample_size = min(round(population_size / 8), 3000)

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.log.info(
        'population_size={population_size}, sample_size={sample_size} {percent:.2f}%'
        .format(population_size=population_size,
                sample_size=sample_size,
                percent=(sample_size / population_size) * 100))

    # Bucket every sampled cell per column, tracking empty-cell counts.
    # `non_null_data` holds only non-None values, `all_sampled_data` holds all.
    for sample_i in input_data_sample_indexes:
        row = self.transaction.input_data.data_array[sample_i]
        for col_i, raw_val in enumerate(row):
            column = header[col_i]
            value = cast_string_to_python_type(raw_val)
            if column not in empty_count:
                empty_count[column] = 0
                column_count[column] = 0
            if value is None:
                empty_count[column] += 1
            else:
                non_null_data[column].append(value)
            all_sampled_data[column].append(value)
            column_count[column] += 1

    stats = {}
    col_data_dict = {}

    for col_i, col_name in enumerate(non_null_data):
        col_data = non_null_data[col_name]  # sampled non-null values of this column

        data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info = \
            self._get_column_data_type(col_data, col_i)

        if data_type == DATA_TYPES.DATE:
            # Normalize date strings to unix timestamps (in place)
            self._dates_to_timestamps(col_data)

        if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
            col_data, col_stats = self._numeric_col_stats(col_data, data_type, curr_data_subtype)
        elif data_type == DATA_TYPES.CATEGORICAL:
            col_stats = self._categorical_col_stats(col_i, data_type, curr_data_subtype)
        # @TODO This is probably wrong, look into it a bit later
        else:
            col_stats = self._words_col_stats(col_data, data_type, curr_data_subtype)

        stats[col_name] = col_stats
        stats[col_name]['data_type_dist'] = data_type_dist
        stats[col_name]['data_subtype_dist'] = data_subtype_dist
        stats[col_name]['column'] = col_name
        stats[col_name]['empty_cells'] = empty_count.get(col_name, 0)
        # BUGFIX: a column with no sampled cells previously raised
        # KeyError/ZeroDivisionError here; report 0% instead.
        seen_cells = column_count.get(col_name, 0)
        stats[col_name]['empty_percentage'] = \
            empty_count[col_name] * 100 / seen_cells if seen_cells else 0
        if 'separator' in additional_info:
            stats[col_name]['separator'] = additional_info['separator']
        col_data_dict[col_name] = col_data

    # Second pass: data-quality scores, which may look at other columns' stats
    for col_name in all_sampled_data:
        stats[col_name].update(self._compute_duplicates_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_empty_cells_score(stats, all_sampled_data, col_name))
        #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_data_type_dist_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_z_score(stats, col_data_dict, col_name))
        stats[col_name].update(self._compute_lof_score(stats, col_data_dict, col_name))
        stats[col_name].update(self._compute_similariy_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_value_distribution_score(stats, all_sampled_data, col_name))
        stats[col_name].update(self._compute_consistency_score(stats, col_name))
        stats[col_name].update(self._compute_redundancy_score(stats, col_name))
        stats[col_name].update(self._compute_variability_score(stats, col_name))
        stats[col_name].update(self._compute_data_quality_score(stats, col_name))

    total_rows = len(self.transaction.input_data.data_array)
    test_rows = len(self.transaction.input_data.test_indexes)
    validation_rows = len(self.transaction.input_data.validation_indexes)
    train_rows = len(self.transaction.input_data.train_indexes)

    self.transaction.lmd['column_stats'] = stats
    self.transaction.lmd['data_preparation']['total_row_count'] = total_rows
    self.transaction.lmd['data_preparation']['test_row_count'] = test_rows
    self.transaction.lmd['data_preparation']['train_row_count'] = train_rows
    self.transaction.lmd['data_preparation']['validation_row_count'] = validation_rows

    self._log_interesting_stats(stats)
    return stats

def _dates_to_timestamps(self, col_data):
    """Convert each date string in `col_data` (in place) to an int unix
    timestamp; null-like or unparseable values become None."""
    null_like = ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']
    for elem_i, element in enumerate(col_data):
        if str(element) in null_like:
            col_data[elem_i] = None
        else:
            try:
                col_data[elem_i] = int(parse_datetime(element).timestamp())
            except Exception:
                self.log.warning(
                    'Could not convert string to date and it was expected, current value {value}'
                    .format(value=element))
                col_data[elem_i] = None

def _numeric_col_stats(self, col_data, data_type, curr_data_subtype):
    """Return `(cleaned_col_data, col_stats)` for a NUMERIC/DATE column:
    moments, min/max, a 50-bin histogram and geometric percentage buckets."""
    null_like = ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']
    filtered = [v for v in col_data if v != '' and v != '\r' and v != '\n']
    col_data = [clean_float(v) for v in filtered if str(v) not in null_like]

    y, x = np.histogram(col_data, 50, density=False)
    x = ((x + np.roll(x, -1))[:-1] / 2.0).tolist()  # bin centers from bin edges
    y = y.tolist()

    xp = []
    if len(col_data) > 0:
        max_value = max(col_data)
        min_value = min(col_data)
        mean = np.mean(col_data)
        median = np.median(col_data)
        var = np.var(col_data)
        skew = st.skew(col_data)
        kurtosis = st.kurtosis(col_data)

        # Buckets start at min and each step grows by 10% of the distance
        # from min, giving finer resolution near the minimum.
        inc_rate = 0.1
        initial_step_size = abs(max_value - min_value) / 100
        xp.append(min_value)
        cursor = min_value + initial_step_size
        while cursor < max_value:
            xp.append(cursor)
            cursor = cursor + abs(cursor - min_value) * inc_rate
    else:
        max_value = 0
        min_value = 0
        mean = 0
        median = 0
        var = 0
        skew = 0
        kurtosis = 0
        xp = []

    # BUGFIX: the original `max([... for i in col_data])` raised ValueError
    # on an empty column; any() returns False for the empty case.
    is_float = any(int(v) != v for v in col_data)

    return col_data, {
        'data_type': data_type,
        'data_subtype': curr_data_subtype,
        "mean": mean,
        "median": median,
        "variance": var,
        "skewness": skew,
        "kurtosis": kurtosis,
        "max": max_value,
        "min": min_value,
        "is_float": is_float,
        "histogram": {
            "x": x,
            "y": y
        },
        "percentage_buckets": xp
    }

def _categorical_col_stats(self, col_i, data_type, curr_data_subtype):
    """Return a col_stats dict for a CATEGORICAL column: a frequency
    histogram over *all* input rows (not just the sample)."""
    histogram = Counter(row[col_i] for row in self.transaction.input_data.data_array)
    return {
        'data_type': data_type,
        'data_subtype': curr_data_subtype,
        "histogram": {
            "x": list(histogram.keys()),
            "y": list(histogram.values())
        }
        #"percentage_buckets": list(histogram.keys())
    }

def _words_col_stats(self, col_data, data_type, curr_data_subtype):
    """Return a col_stats dict for a text-like column: the word dictionary,
    its size relative to the column, and a word histogram."""
    # see if its a sentence or a word
    is_full_text = curr_data_subtype == DATA_SUBTYPES.TEXT
    dictionary, histogram = self._get_words_dictionary(col_data, is_full_text)

    # if no words, then no dictionary
    if len(col_data) == 0:
        dictionary_available = False
        dictionary_lenght_percentage = 0
        dictionary = []
    else:
        dictionary_available = True
        dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100
        # if the number of uniques is too large then treat is a text
        if dictionary_lenght_percentage > 10 and len(col_data) > 50 and not is_full_text:
            dictionary = []
            dictionary_available = False

    return {
        'data_type': data_type,
        'data_subtype': curr_data_subtype,
        "dictionary": dictionary,
        "dictionaryAvailable": dictionary_available,
        "dictionaryLenghtPercentage": dictionary_lenght_percentage,
        "histogram": histogram
    }