def get_number_subtype(string): """ Returns the subtype inferred from a number string, or False if its not a number""" string = str(string) python_type = type(cast_string_to_python_type(string)) if python_type is float: return DATA_SUBTYPES.FLOAT elif python_type is int: return DATA_SUBTYPES.INT else: return None
def run(self): """ # Runs the stats generation phase # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column # A lot of information about the data distribution and quality will also be logged to the server in this phase """ header = self.transaction.input_data.columns non_null_data = {} all_sampled_data = {} for column in header: non_null_data[column] = [] all_sampled_data[column] = [] empty_count = {} column_count = {} # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error population_size = len(self.transaction.input_data.data_array) sample_size = int( calculate_sample_size( population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL)) if sample_size > 3000 and sample_size > population_size / 8: sample_size = min(round(population_size / 8), 3000) # get the indexes of randomly selected rows given the population size input_data_sample_indexes = random.sample(range(population_size), sample_size) self.log.info( 'population_size={population_size}, sample_size={sample_size} {percent:.2f}%' .format(population_size=population_size, sample_size=sample_size, percent=(sample_size / population_size) * 100)) for sample_i in input_data_sample_indexes: row = self.transaction.input_data.data_array[sample_i] for i, val in enumerate(row): column = header[i] value = cast_string_to_python_type(val) if not column in empty_count: empty_count[column] = 0 column_count[column] = 0 if value == None: empty_count[column] += 1 else: non_null_data[column].append(value) all_sampled_data[column].append(value) column_count[column] += 1 stats = {} col_data_dict = {} for i, col_name in enumerate(non_null_data): col_data = non_null_data[col_name] # all rows in just one column full_col_data = all_sampled_data[col_name] data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info = self._get_column_data_type( col_data, i) if data_type == DATA_TYPES.DATE: for i, element in enumerate(col_data): if str(element) in [ str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null' ]: col_data[i] = None else: try: col_data[i] = int( parse_datetime(element).timestamp()) except: self.log.warning( 'Could not convert string to date and it was expected, current value {value}' .format(value=element)) col_data[i] = None if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE: newData = [] for value in col_data: if value != '' and value != '\r' and value != '\n': newData.append(value) col_data = [ clean_float(i) for i in newData if str(i) not in [ '', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null' ] ] y, x = np.histogram(col_data, 50, density=False) x = (x + np.roll(x, -1))[:-1] / 2.0 x = x.tolist() y = y.tolist() xp = [] if len(col_data) > 0: max_value = max(col_data) min_value = min(col_data) mean = np.mean(col_data) median = np.median(col_data) var = np.var(col_data) skew = st.skew(col_data) kurtosis = st.kurtosis(col_data) inc_rate = 0.1 initial_step_size = abs(max_value - min_value) / 100 xp += [min_value] i = min_value + initial_step_size while i < max_value: xp += [i] i_inc = abs(i - min_value) * inc_rate i = i + i_inc else: max_value = 0 min_value = 0 mean = 0 median = 0 var = 0 skew = 0 kurtosis = 0 xp = [] is_float = True if max( [1 if int(i) != i else 0 for i in col_data]) == 1 else False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "mean": mean, "median": median, "variance": var, "skewness": skew, "kurtosis": kurtosis, "max": max_value, "min": min_value, "is_float": is_float, "histogram": { "x": x, "y": y }, "percentage_buckets": xp } elif data_type == DATA_TYPES.CATEGORICAL: all_values = [] for row in self.transaction.input_data.data_array: all_values.append(row[i]) histogram = Counter(all_values) all_possible_values = histogram.keys() col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "histogram": { "x": list(histogram.keys()), "y": list(histogram.values()) } #"percentage_buckets": list(histogram.keys()) } # @TODO This is probably wrong, look into it a bit later else: # see if its a sentence or a word is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False dictionary, histogram = self._get_words_dictionary( col_data, is_full_text) # if no words, then no dictionary if len(col_data) == 0: dictionary_available = False dictionary_lenght_percentage = 0 dictionary = [] else: dictionary_available = True dictionary_lenght_percentage = len(dictionary) / len( col_data) * 100 # if the number of uniques is too large then treat is a text if dictionary_lenght_percentage > 10 and len( col_data) > 50 and is_full_text == False: dictionary = [] dictionary_available = False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "dictionary": dictionary, "dictionaryAvailable": dictionary_available, "dictionaryLenghtPercentage": dictionary_lenght_percentage, "histogram": histogram } stats[col_name] = col_stats stats[col_name]['data_type_dist'] = data_type_dist stats[col_name]['data_subtype_dist'] = data_subtype_dist stats[col_name]['column'] = col_name stats[col_name]['empty_cells'] = empty_count[col_name] stats[col_name]['empty_percentage'] = empty_count[ col_name] * 100 / column_count[col_name] if 'separator' in additional_info: stats[col_name]['separator'] = additional_info['separator'] col_data_dict[col_name] = col_data for i, col_name in enumerate(all_sampled_data): stats[col_name].update( self._compute_duplicates_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_empty_cells_score(stats, all_sampled_data, col_name)) #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_data_type_dist_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_z_score(stats, col_data_dict, col_name)) stats[col_name].update( self._compute_lof_score(stats, col_data_dict, col_name)) stats[col_name].update( self._compute_similariy_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_value_distribution_score(stats, all_sampled_data, col_name)) stats[col_name].update( self._compute_consistency_score(stats, col_name)) stats[col_name].update( self._compute_redundancy_score(stats, col_name)) stats[col_name].update( self._compute_variability_score(stats, col_name)) stats[col_name].update( self._compute_data_quality_score(stats, col_name)) total_rows = len(self.transaction.input_data.data_array) test_rows = len(self.transaction.input_data.test_indexes) validation_rows = len(self.transaction.input_data.validation_indexes) train_rows = len(self.transaction.input_data.train_indexes) self.transaction.lmd['column_stats'] = stats self.transaction.lmd['data_preparation'][ 'total_row_count'] = total_rows self.transaction.lmd['data_preparation']['test_row_count'] = test_rows self.transaction.lmd['data_preparation'][ 'train_row_count'] = train_rows self.transaction.lmd['data_preparation'][ 'validation_row_count'] = validation_rows self._log_interesting_stats(stats) return stats