def isNumber(self, string):
    """ Returns True if string is a number. """
    try:
        cleanfloat(string)
        return True
    except ValueError:
        return False
def cast(self, string):
    """ Returns an integer, float or a string from a string. """
    try:
        if string is None:
            return None
        return int(string)
    except ValueError:
        try:
            return cleanfloat(string)
        except ValueError:
            if string == '':
                return None
            else:
                return string
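
# Hedged sketch, not this codebase's implementation: `cleanfloat` is imported
# from elsewhere. A plausible version strips whitespace and thousands
# separators before delegating to float(), so it raises ValueError for
# non-numeric input exactly as isNumber() and cast() above expect.
def cleanfloat_sketch(value):
    text = str(value).strip().replace(',', '')  # drop common formatting noise
    return float(text)  # ValueError propagates for non-numeric text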
def _getRowExtraVector(self, ret, column_name, col_row_index, distances):
    predict_columns = self.train_meta_data.model_predict_columns
    desired_total = self.train_meta_data.window_size
    batch_height = len(ret[column_name])
    remaining_row_count = batch_height - (col_row_index + 1)
    harvest_count = desired_total if desired_total < remaining_row_count else remaining_row_count
    empty_count = desired_total - harvest_count
    # this is the width of the padding
    empty_vector_len = (
        len(ret[column_name][col_row_index])
        + sum([len(ret[predict_col_name][0]) for predict_col_name in predict_columns])
        + 1
    ) * empty_count

    row_extra_vector = []

    for i in range(harvest_count):
        try:
            row_extra_vector += ret[column_name][col_row_index + i + 1]
            row_extra_vector += [distances[col_row_index + i + 1]]
            # append the target values for the harvested row
            for predict_col_name in predict_columns:
                row_extra_vector += [cleanfloat(v) for v in ret[predict_col_name][col_row_index + i + 1]]
        except Exception:
            logging.error(traceback.format_exc())
            logging.error('something is not right, seems like we got here with np arrays and they should not be!')

    if empty_count > 0:
        # complete with zero padding so the vector keeps a fixed width
        row_extra_vector += [0] * empty_vector_len

    return row_extra_vector
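
# Worked example with hypothetical numbers (not from this codebase): the
# harvested rows and the zero padding always add up to window_size rows,
# so _getRowExtraVector returns a fixed-width vector.
def _window_padding_example():
    window_size = 3        # desired_total
    batch_height = 10
    col_row_index = 8      # second-to-last row in the batch
    remaining = batch_height - (col_row_index + 1)  # 1 future row available
    harvest_count = min(window_size, remaining)     # -> 1 real row harvested
    empty_count = window_size - harvest_count       # -> 2 rows of zero padding
    return harvest_count, empty_count               # (1, 2)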
def run(self):
    self.train_meta_data = TransactionMetadata()
    self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

    header = self.transaction.input_data.columns
    origData = {}

    for column in header:
        origData[column] = []

    empty_count = {}
    column_count = {}

    # we don't need to generate statistics over all of the data, so we
    # subsample based on our accepted margin of error
    population_size = len(self.transaction.input_data.data_array)
    sample_size = int(sampleSize(
        population_size=population_size,
        margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
        confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL
    ))

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.logging.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(
        population_size=population_size,
        sample_size=sample_size,
        percent=(sample_size / population_size) * 100
    ))

    for sample_i in input_data_sample_indexes:
        row = self.transaction.input_data.data_array[sample_i]
        for i, val in enumerate(row):
            column = header[i]
            value = tryCastToNumber(val)
            if column not in empty_count:
                empty_count[column] = 0
                column_count[column] = 0
            if value is None:
                empty_count[column] += 1
            else:
                origData[column].append(value)
            column_count[column] += 1

    stats = {}

    for col_name in origData:
        col_data = origData[col_name]  # all sampled rows of just one column
        data_type = self.getColumnDataType(col_data)

        # NOTE: Enable this if you want to assume that some numeric values can be text.
        # We noticed that by default this should not be the behavior.
        # TODO: Evaluate if we want to specify the problem type on the predict
        # statement as regression or classification.
        #
        # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
        #     unique_count = len(set(col_data))
        #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
        #         data_type = DATA_TYPES.TEXT

        if data_type == DATA_TYPES.DATE:
            for j, element in enumerate(col_data):
                if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                    col_data[j] = None
                else:
                    try:
                        col_data[j] = int(parseDate(element).timestamp())
                    except Exception:
                        logging.warning('Could not convert string to date and it was expected, current value {value}'.format(value=element))
                        col_data[j] = None

        if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
            newData = []
            for value in col_data:
                if value != '' and value != '\r' and value != '\n':
                    newData.append(value)

            col_data = [cleanfloat(v) for v in newData if str(v) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

            # histogram over 50 bins; x holds the bin midpoints
            y, x = np.histogram(col_data, 50, density=False)
            x = (x + np.roll(x, -1))[:-1] / 2.0
            x = x.tolist()
            y = y.tolist()

            xp = []

            if len(col_data) > 0:
                max_value = max(col_data)
                min_value = min(col_data)
                mean = np.mean(col_data)
                median = np.median(col_data)
                var = np.var(col_data)
                skew = st.skew(col_data)
                kurtosis = st.kurtosis(col_data)

                # build percentage buckets with a geometrically growing step
                inc_rate = 0.05
                initial_step_size = abs(max_value - min_value) / 100

                xp += [min_value]
                i = min_value + initial_step_size
                while i < max_value:
                    xp += [i]
                    i_inc = abs(i - min_value) * inc_rate
                    i = i + i_inc

                # TODO: Solve inc_rate for a target bucket count N.
                # With d_k = x_k - min_value, the loop above gives the recurrence
                #     d_{k+1} = d_k + d_k * inc_rate = d_k * (1 + inc_rate),
                # so d_n = initial_step_size * (1 + inc_rate)^(n-1).
                # Requiring d_n ~= max_value - min_value, and noting that
                # initial_step_size = (max_value - min_value) / 100, yields
                #     (1 + inc_rate)^(n-1) = 100,
                # which can be solved for n given inc_rate, or for inc_rate
                # given a desired n.
            else:
                max_value = 0
                min_value = 0
                mean = 0
                median = 0
                var = 0
                skew = 0
                kurtosis = 0
                xp = []

            # a column is treated as float if any value has a fractional part;
            # any() also covers the empty-column case, where max([]) would raise
            is_float = any(int(v) != v for v in col_data)

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: data_type,
                # "distribution": best_fit_name,
                # "distributionParams": distribution_params,
                "mean": mean,
                "median": median,
                "variance": var,
                "skewness": skew,
                "kurtosis": kurtosis,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "max": max_value,
                "min": min_value,
                "is_float": is_float,
                "histogram": {
                    "x": x,
                    "y": y
                },
                "percentage_buckets": xp
            }
            stats[col_name] = col_stats

        # else it's text
        else:
            # see if it's a sentence or a word
            is_full_text = data_type == DATA_TYPES.FULL_TEXT
            dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

            # if no words, then no dictionary
            if len(col_data) == 0:
                dictionary_available = False
                dictionary_lenght_percentage = 0
                dictionary = []
            else:
                dictionary_available = True
                dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100
                # if the number of uniques is too large then treat it as text
                if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                    dictionary = []
                    dictionary_available = False

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                "dictionary": dictionary,
                "dictionaryAvailable": dictionary_available,
                "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "histogram": histogram
            }
            stats[col_name] = col_stats

    total_rows = len(self.transaction.input_data.data_array)
    test_rows = len(self.transaction.input_data.test_indexes)
    validation_rows = len(self.transaction.input_data.validation_indexes)
    train_rows = len(self.transaction.input_data.train_indexes)

    self.transaction.persistent_model_metadata.column_stats = stats
    self.transaction.persistent_model_metadata.total_row_count = total_rows
    self.transaction.persistent_model_metadata.test_row_count = test_rows
    self.transaction.persistent_model_metadata.train_row_count = train_rows
    self.transaction.persistent_model_metadata.validation_row_count = validation_rows
    self.transaction.persistent_model_metadata.update()

    return stats
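
# Hedged sketch of the sample-size step in run(): the real sampleSize() helper
# lives elsewhere in this codebase, but a standard way to compute it is
# Cochran's formula with a finite-population correction, assuming worst-case
# variability (p = 0.5). Names and defaults here are illustrative only.
import math

def sample_size_sketch(population_size, margin_error=0.05, confidence_level=0.95):
    z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}  # z for common confidence levels
    z = z_scores[confidence_level]
    p = 0.5  # worst-case proportion maximizes the required sample
    n0 = (z ** 2) * p * (1 - p) / (margin_error ** 2)  # infinite-population estimate
    # finite-population correction shrinks n0 when the population is small
    return math.ceil(n0 / (1 + (n0 - 1) / population_size))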
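
# Hedged sketch for the TODO in the percentage-bucket loop: since the offset
# from min_value grows by a factor of (1 + inc_rate) per step and the initial
# step is 1/100th of the range, (1 + inc_rate)^(n-1) = 100 relates the bucket
# count n to inc_rate. These helper names are illustrative, not from this
# codebase.
def bucket_count_for_inc_rate(inc_rate, steps_in_range=100):
    # approximate number of geometric buckets needed to cross [min, max]
    return 1 + math.log(steps_in_range) / math.log(1 + inc_rate)

def inc_rate_for_bucket_count(n, steps_in_range=100):
    # inverse: growth rate that yields roughly n buckets
    return steps_in_range ** (1.0 / (n - 1)) - 1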