def run(self):
    # Handle transactions differently depending on the type of query.
    # For now we only support LEARN and PREDICT.

    # Train metadata is the metadata that was used when training the model.
    # Note that we need this train metadata even when predicting, so we can
    # understand the model.
    train_metadata = None

    if self.transaction.metadata.type == TRANSACTION_PREDICT:
        # extract it from the persistent_model_metadata
        train_metadata = TransactionMetadata()
        train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)
    elif self.transaction.metadata.type == TRANSACTION_LEARN:
        # pull it straight from the current transaction
        train_metadata = self.transaction.metadata
    else:
        # we cannot proceed without train metadata
        err = 'Do not support transaction {type}'.format(type=self.transaction.metadata.type)
        self.session.logging.error(err)
        self.transaction.error = True
        self.transaction.errorMsg = err  # was traceback.print_exc(1), which returns None
        return

    result = self.getPreparedInputDF(train_metadata)

    columns = list(result.columns.values)
    data_array = result.values.tolist()

    self.transaction.input_data.columns = columns

    # Make sure that the column we are trying to predict is in the input data,
    # else fail, because we cannot predict data we don't have.
    # TODO: Revise this; the source data may not contain the predict column and
    # that may still be ok for a prediction that is not time series.
    if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
        for col_target in self.transaction.metadata.model_predict_columns:
            if col_target not in self.transaction.input_data.columns:
                err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

    self.transaction.input_data.data_array = data_array

    # extract test data if this is a learn transaction and there is a test query
    if self.transaction.metadata.type == TRANSACTION_LEARN:

        # if a test data set was given, use it
        if self.transaction.metadata.test_from_data:
            df = self.transaction.metadata.test_from_data.df
            test_result = df.where((pandas.notnull(df)), None)
            columns = list(test_result.columns.values)
            data_array = test_result.values.tolist()

            # make sure that the test and train sets match column-wise
            if columns != self.transaction.input_data.columns:
                err = 'Trying to get data for test but columns in train set and test set dont match'
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            total_data_array = len(self.transaction.input_data.data_array)
            total_test_array = len(data_array)
            test_indexes = list(range(total_data_array, total_data_array + total_test_array))

            self.transaction.input_data.test_indexes = test_indexes
            self.transaction.input_data.data_array += data_array

            # we later use this to decide whether to regenerate the test set or not
            test_prob = 0
        else:
            test_prob = CONFIG.TEST_TRAIN_RATIO

        validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

        group_by = self.transaction.metadata.model_group_by

        if group_by:
            try:
                group_by_index = self.transaction.input_data.columns.index(group_by)
            except ValueError:
                err = 'Trying to group by {column} but column not in source data'.format(column=group_by)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # get the unique group-by values and their counts
            uniques = result.groupby([group_by]).size()
            all_group_by_values = uniques.index.tolist()
            uniques_counts = uniques.values.tolist()
            all_group_by_counts = {value: uniques_counts[i] for i, value in enumerate(all_group_by_values)}

            max_group_by = max(list(all_group_by_counts.values()))
            self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

            # we will fill these depending on test_prob and validation_prob
            test_group_by_values = []
            validation_group_by_values = []
            train_group_by_values = []

            # Split the data into test and train by group-by value: each group
            # lands in the test set with probability test_prob (remember that
            # test_prob is 0 if a test set was passed explicitly), but only
            # once the train set has at least one group.
            for group_by_value in all_group_by_values:
                if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                    test_group_by_values += [group_by_value]
                # elif float(random.random()) < validation_prob:
                #     validation_group_by_values += [group_by_value]
                else:
                    train_group_by_values += [group_by_value]

        for i, row in enumerate(self.transaction.input_data.data_array):
            in_test = i in self.transaction.input_data.test_indexes
            if not in_test:
                if group_by:
                    group_by_value = row[group_by_index]
                    if group_by_value in test_group_by_values:
                        self.transaction.input_data.test_indexes += [i]
                    elif group_by_value in train_group_by_values:
                        self.transaction.input_data.train_indexes += [i]
                    elif group_by_value in validation_group_by_values:
                        self.transaction.input_data.validation_indexes += [i]
                else:
                    # remember that test_prob is 0 if a test set was passed explicitly
                    if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                        self.transaction.input_data.test_indexes += [i]
                    elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes) == 0:
                        self.transaction.input_data.validation_indexes += [i]
                    else:
                        self.transaction.input_data.train_indexes += [i]

        if len(self.transaction.input_data.test_indexes) == 0:
            logging.debug('Size of test set is zero, last split')
            ratio = CONFIG.TEST_TRAIN_RATIO
            if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                # to avoid overfitting it seems good practice to double the ratio,
                # as time series data tends to be abundant
                ratio = ratio * 2
            test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
            self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
            self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

        logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
        logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))
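# --- Illustrative sketch (not part of the original module) ---
# The split above has two regimes: with a group-by, whole groups are assigned
# to test or train so a time series is never cut in half; without one, rows
# are assigned independently. A minimal, standalone sketch of the row-level
# regime, assuming plain integer indexes and the same probability semantics
# (including the guarantee that test and validation are never empty):
import random

def split_row_indexes(n_rows, test_prob=0.1, validation_prob=0.1):
    """Assign each row index to test/validation/train."""
    test, validation, train = [], [], []
    for i in range(n_rows):
        if random.random() <= test_prob or len(test) == 0:
            test.append(i)
        elif random.random() <= validation_prob or len(validation) == 0:
            validation.append(i)
        else:
            train.append(i)
    return test, validation, train

# e.g. split_row_indexes(1000) yields roughly a 10% / 10% / 80% split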
class StatsGenerator(BaseModule):

    phase_name = PHASE_DATA_STATS

    def isNumber(self, string):
        """ Returns True if string is a number """
        try:
            cleanfloat(string)
            return True
        except ValueError:
            return False

    def isDate(self, string):
        """ Returns True if string is a valid date """
        try:
            parseDate(string)
            return True
        except ValueError:
            return False

    def getColumnDataType(self, data):
        """ Guesses the column data type from the distribution of element types over the sampled data """
        currentGuess = DATA_TYPES.NUMERIC
        type_dist = {}

        for element in data:
            if self.isNumber(element):
                currentGuess = DATA_TYPES.NUMERIC
            elif self.isDate(element):
                currentGuess = DATA_TYPES.DATE
            else:
                currentGuess = DATA_TYPES.TEXT

            if currentGuess not in type_dist:
                type_dist[currentGuess] = 1
            else:
                type_dist[currentGuess] += 1

        curr_data_type = DATA_TYPES.TEXT
        max_data_type = 0

        for data_type in type_dist:
            if type_dist[data_type] > max_data_type:
                curr_data_type = data_type
                max_data_type = type_dist[data_type]

        if curr_data_type == DATA_TYPES.TEXT:
            return self.getTextType(data)

        return curr_data_type

    def getBestFitDistribution(self, data, bins=40):
        """ Model data by finding the best-fit distribution """

        # get the histogram of the original data
        y, x = np.histogram(data, bins=bins, density=False)
        x = (x + np.roll(x, -1))[:-1] / 2.0  # bin centers

        # Distributions to check.
        # NOTE: st.bernoulli and st.poisson are discrete and have no .fit(),
        # so they always fall through to the except branch below.
        DISTRIBUTIONS = [
            st.bernoulli, st.beta, st.cauchy, st.expon, st.gamma,
            st.halfcauchy, st.lognorm, st.norm, st.uniform, st.poisson
        ]

        # best holders
        best_distribution = st.norm
        best_params = (0.0, 1.0)
        best_sse = np.inf

        # estimate distribution parameters from the data
        for distribution in DISTRIBUTIONS:
            try:
                # ignore warnings from data that can't be fit
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore')

                    # fit the distribution to the data
                    params = distribution.fit(data)

                    # separate the parts of the parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # calculate the fitted PDF and its squared error against the histogram
                    pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    # keep this distribution if it fits better
                    if best_sse > sse > 0:
                        best_distribution = distribution
                        best_params = params
                        best_sse = sse
            except Exception:
                pass

        return (best_distribution.name, best_params, x.tolist(), y.tolist())

    def getTextType(self, data):
        total_length = len(data)
        key_count = {}
        max_number_of_words = 0

        for cell in data:
            if cell not in key_count:
                key_count[cell] = 1
            else:
                key_count[cell] += 1

            cell_wseparator = cell
            sep_tag = '{#SEP#}'
            for separator in WORD_SEPARATORS:
                cell_wseparator = str(cell_wseparator).replace(separator, sep_tag)

            words_split = cell_wseparator.split(sep_tag)
            words = len([word for word in words_split if word not in ['', None]])

            if max_number_of_words < words:
                max_number_of_words = words  # was `+= words`, which inflated the maximum

        if max_number_of_words == 1:
            return DATA_TYPES.TEXT
        if max_number_of_words <= 3 and len(key_count) < total_length * 0.8:
            return DATA_TYPES.TEXT
        else:
            return DATA_TYPES.FULL_TEXT

    def getWordsDictionary(self, data, full_text=False):
        """ Returns the distinct words in the dataset and a histogram of how many times each word appears """
        if full_text:
            # get all words in every cell, then calculate the histogram
            words = []
            for cell in data:
                words += splitRecursive(cell, WORD_SEPARATORS)
            hist = {word: words.count(word) for word in words}
        else:
            hist = {cell: data.count(cell) for cell in data}

        x = list(hist.keys())
        histogram = {'x': x, 'y': list(hist.values())}
        return x, histogram

    def getParamsAsDictionary(self, params):
        """ Returns a dictionary with the params of the distribution """
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]
        return {'loc': loc, 'scale': scale, 'shape': arg}

    def run(self):
        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        header = self.transaction.input_data.columns
        origData = {}

        for column in header:
            origData[column] = []

        empty_count = {}
        column_count = {}

        # We don't need to generate statistics over all of the data, so we
        # subsample based on our accepted margin of error.
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(sampleSize(population_size=population_size,
                                     margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                                     confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size), sample_size)
        self.logging.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(
            population_size=population_size, sample_size=sample_size,
            percent=(sample_size / population_size) * 100))

        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]
            for i, val in enumerate(row):
                column = header[i]
                value = tryCastToNumber(val)
                if column not in empty_count:
                    empty_count[column] = 0
                    column_count[column] = 0
                if value is None:
                    empty_count[column] += 1
                else:
                    origData[column].append(value)
                column_count[column] += 1

        stats = {}

        for col_name in origData:
            col_data = origData[col_name]  # all sampled rows of just one column

            data_type = self.getColumnDataType(col_data)

            # NOTE: Enable this if you want to assume that some numeric values can be text.
            # We decided that by default this should not be the behavior.
            # TODO: Evaluate whether the problem type (regression vs classification)
            # should be specified on the predict statement.
            #
            # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
            #     unique_count = len(set(col_data))
            #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
            #         data_type = DATA_TYPES.TEXT

            if data_type == DATA_TYPES.DATE:
                for i, element in enumerate(col_data):
                    if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                        col_data[i] = None
                    else:
                        try:
                            col_data[i] = int(parseDate(element).timestamp())
                        except Exception:
                            logging.warning('Could not convert string to date as expected, current value: {value}'.format(value=element))
                            col_data[i] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                newData = []
                for value in col_data:
                    if value != '' and value != '\r' and value != '\n':
                        newData.append(value)

                col_data = [cleanfloat(i) for i in newData
                            if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0  # bin centers
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

                    # Build the percentage buckets: start with a step of 1% of
                    # the range, then grow each step by inc_rate of the distance
                    # already covered from the minimum.
                    inc_rate = 0.05
                    initial_step_size = abs(max_value - min_value) / 100

                    xp += [min_value]
                    i = min_value + initial_step_size
                    while i < max_value:
                        xp += [i]
                        i_inc = abs(i - min_value) * inc_rate
                        i = i + i_inc
                    # TODO: solve inc_rate for a fixed number of buckets N
                    # (the step growth is geometric in the distance from min_value)
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                # guard against an empty column; max() on an empty list raises
                is_float = any(int(i) != i for i in col_data) if len(col_data) > 0 else False

                col_stats = {
                    'column': col_name,
                    KEYS.DATA_TYPE: data_type,
                    # 'distribution': best_fit_name,
                    # 'distributionParams': distribution_params,
                    'mean': mean,
                    'median': median,
                    'variance': var,
                    'skewness': skew,
                    'kurtosis': kurtosis,
                    'emptyColumns': empty_count[col_name],
                    'emptyPercentage': empty_count[col_name] / column_count[col_name] * 100,
                    'max': max_value,
                    'min': min_value,
                    'is_float': is_float,
                    'histogram': {'x': x, 'y': y},
                    'percentage_buckets': xp
                }
                stats[col_name] = col_stats

            # else it is text
            else:
                # see if it is made of sentences or single words
                is_full_text = data_type == DATA_TYPES.FULL_TEXT
                dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

                # if there are no words, then there is no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_length_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_length_percentage = len(dictionary) / len(col_data) * 100

                # if the number of uniques is too large, drop the dictionary and treat it as plain text
                if dictionary_length_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                    dictionary = []
                    dictionary_available = False

                col_stats = {
                    'column': col_name,
                    KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                    'dictionary': dictionary,
                    'dictionaryAvailable': dictionary_available,
                    'dictionaryLenghtPercentage': dictionary_length_percentage,  # key spelling kept for compatibility
                    'emptyColumns': empty_count[col_name],
                    'emptyPercentage': empty_count[col_name] / column_count[col_name] * 100,
                    'histogram': histogram
                }
                stats[col_name] = col_stats

        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.persistent_model_metadata.column_stats = stats
        self.transaction.persistent_model_metadata.total_row_count = total_rows
        self.transaction.persistent_model_metadata.test_row_count = test_rows
        self.transaction.persistent_model_metadata.train_row_count = train_rows
        self.transaction.persistent_model_metadata.validation_row_count = validation_rows
        self.transaction.persistent_model_metadata.update()

        return stats
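# --- Illustrative sketch (not part of the original module) ---
# sampleSize() is imported from a helper and not shown in this file. A
# plausible implementation, assuming it follows Cochran's formula with a
# finite population correction (p = 0.5 gives the worst-case variance):
import math
from scipy.stats import norm as normal_dist

def sample_size_sketch(population_size, margin_error=0.02, confidence_level=0.95, p=0.5):
    # z-score for a two-sided confidence interval
    z = normal_dist.ppf(1 - (1 - confidence_level) / 2)
    # Cochran's formula for an infinite population
    n0 = (z ** 2) * p * (1 - p) / (margin_error ** 2)
    # finite population correction
    n = n0 / (1 + (n0 - 1) / population_size)
    return math.ceil(n)

# e.g. sample_size_sketch(100000) == 2345 rows at 95% confidence, 2% margin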
class ModelTrainer(BaseModule):

    phase_name = PHASE_MODEL_TRAINER

    def run(self):
        """
        Run the training process; we could iterate over all hyperparameters
        here and spin off model variations.
        TODO: check out the RISELab distributed ML projects for this

        :return: None
        """

        model_name = self.transaction.persistent_model_metadata.model_name
        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by

        # Choose which models to try.
        # NOTE: In server mode more than one can be used; in serverless mode, choose only one.
        # TODO: In serverless mode bring a smarter way to choose.
        if group_by:
            ml_models = [
                ('pytorch.models.ensemble_fully_connected_net', {})
                # ,('pytorch.models.ensemble_conv_net', {})
            ]
        else:
            ml_models = [
                ('pytorch.models.fully_connected_buckets_net', {})
                # ,('pytorch.models.ensemble_conv_net', {})
                # ,('pytorch.models.ensemble_fully_connected_net', {})
            ]

        self.train_start_time = time.time()
        self.session.logging.info('Training: model {model_name}, epoch 0'.format(model_name=model_name))
        self.last_time = time.time()

        # Everything was moved to a worker so we can run many of these in parallel.
        # Train the column encoders and models.
        for ml_model, config in ml_models:
            if not CONFIG.EXEC_LEARN_IN_THREAD or len(ml_models) == 1:
                TrainWorker.start(self.transaction.model_data, model_name=model_name,
                                  ml_model=ml_model, config=config)
            else:
                # TODO: use Ray, https://github.com/ray-project/tutorial
                # Before moving to actual workers we must find an efficient way
                # to send the model data to the worker.
                _thread.start_new_thread(TrainWorker.start,
                                         (self.transaction.model_data, model_name, ml_model, config))

        total_time = time.time() - self.train_start_time
        self.session.logging.info('Trained: model {model_name} [OK], TOTAL TIME: {total_time:.2f} seconds'.format(
            model_name=model_name, total_time=total_time))
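# --- Illustrative sketch (not part of the original module) ---
# Note that with _thread.start_new_thread the loop above returns immediately,
# so the TOTAL TIME log fires before the workers finish. A sketch of the same
# fan-out with the higher-level threading API, which allows joining the
# workers first; start_fn stands in for TrainWorker.start and ml_models is a
# list of (module_path, config) tuples as built above:
import threading

def start_train_workers(model_data, model_name, ml_models, start_fn):
    threads = []
    for ml_model, config in ml_models:
        t = threading.Thread(target=start_fn, args=(model_data, model_name, ml_model, config))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait, so an elapsed-time log after this call is meaningful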
class DataVectorizer(BaseModule):

    phase_name = PHASE_DATA_VECTORIZATION

    def cast(self, string):
        """ Returns an integer, float or string from a string """
        try:
            if string is None:
                return None
            return int(string)
        except ValueError:
            try:
                return cleanfloat(string)
            except ValueError:
                if string == '':
                    return None
                else:
                    return string

    def _getRowExtraVector(self, ret, column_name, col_row_index, distances):
        predict_columns = self.train_meta_data.model_predict_columns

        desired_total = self.train_meta_data.window_size
        batch_height = len(ret[column_name])
        remaining_row_count = batch_height - (col_row_index + 1)

        harvest_count = desired_total if desired_total < remaining_row_count else remaining_row_count
        empty_count = desired_total - harvest_count
        # width of the padding: one column vector, plus the predict-column
        # vectors, plus the distance scalar, for each missing row
        empty_vector_len = (len(ret[column_name][col_row_index])
                            + sum([len(ret[predict_col_name][0]) for predict_col_name in predict_columns])
                            + 1) * empty_count

        row_extra_vector = []

        for i in range(harvest_count):
            try:
                row_extra_vector += ret[column_name][col_row_index + i + 1]
                row_extra_vector += [distances[col_row_index + i + 1]]
                # append the target values of the following row
                for predict_col_name in predict_columns:
                    row_extra_vector += [cleanfloat(v) for v in ret[predict_col_name][col_row_index + i + 1]]
            except Exception:
                logging.error(traceback.format_exc())
                logging.error('Something is not right; it seems we got here with np arrays and they should not be!')

        if empty_count > 0:
            # pad the remainder with zeros
            row_extra_vector += [0] * empty_vector_len

        return row_extra_vector

    def run(self):
        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by
        group_by_index = None
        if group_by:
            # TODO: Consider supporting more than one index column
            group_by_index = self.transaction.input_data.columns.index(group_by)

        # this is a template of how we store columns
        column_packs_template = OrderedDict()

        # create a template of the column packs
        for column_name in self.transaction.input_data.columns:
            column_packs_template[column_name] = []

        if self.transaction.metadata.type == TRANSACTION_LEARN:
            groups = [
                {
                    'name': 'test',
                    'target_set': self.transaction.model_data.test_set,
                    'map': self.transaction.model_data.test_set_map,
                    'indexes': self.transaction.input_data.test_indexes
                },
                {
                    'name': 'train',
                    'target_set': self.transaction.model_data.train_set,
                    'map': self.transaction.model_data.train_set_map,
                    'indexes': self.transaction.input_data.train_indexes
                },
                {
                    'name': 'validation',
                    'target_set': self.transaction.model_data.validation_set,
                    'map': self.transaction.model_data.validation_set_map,
                    'indexes': self.transaction.input_data.validation_indexes
                }
            ]
        else:
            groups = [
                {
                    'name': 'predict',
                    'target_set': self.transaction.model_data.predict_set,
                    'map': self.transaction.model_data.predict_set_map,
                    'indexes': range(0, len(self.transaction.input_data.data_array))  # TODO: measure the impact of this
                }
            ]

        # iterate over all groups and populate the tensors by column
        for group in groups:
            target_set = group['target_set']  # pointer, for ease of use
            set_map = group['map']  # renamed from `map` to avoid shadowing the builtin

            # iterate over all indexes that belong to this group
            for input_row_index in group['indexes']:
                row = self.transaction.input_data.data_array[input_row_index]  # extract the row from the input data

                if group_by is not None:
                    group_by_hash = hashtext(row[group_by_index])
                else:
                    group_by_hash = KEY_NO_GROUP_BY

                # if this group hash has not been seen yet, initialize it from the template
                if group_by_hash not in target_set:
                    target_set[group_by_hash] = copy.deepcopy(column_packs_template)
                    set_map[group_by_hash] = {}

                # now populate the group_hash column pile
                for column_index, cell_value in enumerate(row):
                    column_name = self.transaction.input_data.columns[column_index]
                    value = self.cast(cell_value)
                    stats = self.transaction.persistent_model_metadata.column_stats[column_name]

                    # TODO: provide a framework for custom normalization functions
                    # TODO: FIX: norm always adds an is-null column
                    normalized = norm(value=value, cell_stats=stats)  # returns an already-normalized vector representation

                    # keep track of where it came from in the input data, in case we need to go back
                    position = len(target_set[group_by_hash][column_name])
                    set_map[group_by_hash][position] = input_row_index

                    # append the normalized vector to the column tensor
                    target_set[group_by_hash][column_name] += [normalized]

                    if (self.transaction.persistent_model_metadata.column_stats[column_name][KEYS.DATA_TYPE]
                            in [DATA_TYPES.NUMERIC, DATA_TYPES.DATE]
                            and column_name in self.train_meta_data.model_predict_columns):
                        column_name_expanded = EXTENSION_COLUMNS_TEMPLATE.format(column_name=column_name)
                        if column_name_expanded not in target_set[group_by_hash]:
                            target_set[group_by_hash][column_name_expanded] = []
                        normalized_buckets = norm_buckets(value=value, cell_stats=stats)
                        target_set[group_by_hash][column_name_expanded] += [normalized_buckets]

            # turn the column piles into numpy arrays
            for group_by_hash in target_set:

                distances = None
                # if we have a group by and an order by, calculate a distances vector
                # for each data point in this batch
                if self.train_meta_data.model_group_by is not None and self.train_meta_data.model_order_by is not None:
                    distances = []
                    batch_height = len(target_set[group_by_hash][self.train_meta_data.model_group_by])

                    for j in range(batch_height):
                        # vector for the current row
                        order_by_bottom_vector = np.array(list(itertools.chain.from_iterable(
                            [target_set[group_by_hash][order_by_col][j]
                             for order_by_col in self.train_meta_data.model_order_by])))
                        # vector for the previous row (the row itself when j == 0)
                        if j == 0:
                            order_by_top_vector = order_by_bottom_vector
                        else:
                            order_by_top_vector = np.array(list(itertools.chain.from_iterable(
                                [target_set[group_by_hash][order_by_col][j - 1]
                                 for order_by_col in self.train_meta_data.model_order_by])))

                        # calculate the distance and append it
                        distance = float(np.linalg.norm(order_by_top_vector - order_by_bottom_vector))
                        distances.append(distance)

                # Append the time series data to each column.
                # NOTE: we want the model_predict_columns converted into vectors first,
                # because for a time series query we will want to add the history of
                # the target value (see self._getRowExtraVector).
                columns_in_order = self.train_meta_data.model_predict_columns + [
                    column_name for column_name in target_set[group_by_hash]
                    if column_name not in self.train_meta_data.model_predict_columns
                ]

                for column_name in columns_in_order:
                    # if there is a group by and order by and this is not a column to be
                    # predicted, append a history vector
                    # TODO: encode the history vector if possible
                    non_groupable_columns = (self.train_meta_data.model_predict_columns
                                             + [self.train_meta_data.model_group_by]
                                             + self.train_meta_data.model_order_by)

                    # NOTE: distances is only not None when there is a group by and an
                    # order by, so this only runs for such queries
                    if distances is not None and column_name not in non_groupable_columns:
                        # for each row, create a history vector and append it
                        for col_row_index, col_row in enumerate(target_set[group_by_hash][column_name]):
                            row_extra_vector = self._getRowExtraVector(
                                target_set[group_by_hash], column_name, col_row_index, distances)
                            target_set[group_by_hash][column_name][col_row_index] = \
                                target_set[group_by_hash][column_name][col_row_index] + row_extra_vector

                    target_set[group_by_hash][column_name] = np.array(target_set[group_by_hash][column_name])

        return []
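# --- Illustrative sketch (not part of the original module) ---
# The per-batch `distances` vector above is the L2 norm between each row's
# order-by vector and the previous row's, with 0.0 for the first row. A
# standalone version of that computation on toy data:
import numpy as np

def consecutive_distances(order_by_rows):
    """order_by_rows: list of equal-length numeric vectors, one per row."""
    distances = []
    for j, row in enumerate(order_by_rows):
        bottom = np.array(row)
        top = bottom if j == 0 else np.array(order_by_rows[j - 1])
        distances.append(float(np.linalg.norm(top - bottom)))
    return distances

# consecutive_distances([[0.0], [0.5], [2.0]]) -> [0.0, 0.5, 1.5]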
def run(self):
    # Handle transactions differently depending on the type of query.
    # For now we only support LEARN and PREDICT.

    train_metadata = self.transaction.metadata

    if self.transaction.metadata.type == TRANSACTION_PREDICT:
        self.populatePredictQuery()
        train_metadata = TransactionMetadata()
        train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)
    elif self.transaction.metadata.type not in [TRANSACTION_PREDICT, TRANSACTION_LEARN]:
        err = 'Do not support transaction {type}'.format(type=self.transaction.metadata.type)
        self.session.logging.error(err)
        self.transaction.error = True
        self.transaction.errorMsg = err  # was traceback.print_exc(1), which returns None
        return

    query = self.prepareFullQuery(train_metadata)

    try:
        self.transaction.session.logging.info('About to pull query {query}'.format(query=query))
        conn = sqlite3.connect(self.transaction.metadata.storage_file)
        self.logging.info(self.transaction.metadata.model_query)
        df = pandas.read_sql_query(query, conn)
        result = df.where((pandas.notnull(df)), None)
        df = None  # free memory
    except Exception:
        self.session.logging.error(traceback.format_exc())
        self.transaction.error = True
        self.transaction.errorMsg = traceback.format_exc()
        return

    columns = list(result.columns.values)
    data_array = result.values.tolist()

    self.transaction.input_data.columns = columns

    if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
        for col_target in self.transaction.metadata.model_predict_columns:
            if col_target not in self.transaction.input_data.columns:
                err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

    self.transaction.input_data.data_array = data_array

    # extract test data if this is a learn transaction and there is a test query
    if self.transaction.metadata.type == TRANSACTION_LEARN:
        if self.transaction.metadata.model_test_query:
            try:
                # NOTE: query_wrapper, order_by_string and where_not_null_string are
                # assumed to be defined elsewhere in this module (built alongside
                # prepareFullQuery); they are not defined in this excerpt.
                test_query = query_wrapper.format(
                    orig_query=self.transaction.metadata.model_test_query,
                    order_by_string=order_by_string,
                    where_not_null_string=where_not_null_string)
                self.transaction.session.logging.info('About to pull TEST query {query}'.format(query=test_query))
                df = pandas.read_sql_query(test_query, conn)
                result = df.where((pandas.notnull(df)), None)
                df = None
            except Exception:
                self.session.logging.error(traceback.format_exc())
                self.transaction.error = True
                self.transaction.errorMsg = traceback.format_exc()
                return

            columns = list(result.columns.values)
            data_array = result.values.tolist()

            # make sure that the test and train sets match column-wise
            if columns != self.transaction.input_data.columns:
                err = 'Trying to get data for test but columns in train set and test set dont match'
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            total_data_array = len(self.transaction.input_data.data_array)
            total_test_array = len(data_array)
            test_indexes = list(range(total_data_array, total_data_array + total_test_array))

            self.transaction.input_data.test_indexes = test_indexes
            self.transaction.input_data.data_array += data_array

            # we later use this to decide whether to regenerate the test set or not
            test_prob = 0
        else:
            test_prob = CONFIG.TEST_TRAIN_RATIO

        validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

        group_by = self.transaction.metadata.model_group_by

        if group_by:
            try:
                group_by_index = self.transaction.input_data.columns.index(group_by)
            except ValueError:
                err = 'Trying to group by {column} but column not in source data'.format(column=group_by)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # get the unique group-by values and their counts
            all_group_by_items_query = '''
                select {group_by_column} as grp, count(1) as total
                from ( {query} ) sub
                group by {group_by_column}
            '''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)

            self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))
            df = pandas.read_sql_query(all_group_by_items_query, conn)
            result = df.where((pandas.notnull(df)), None)

            # result is an array of [value, count] rows; flatten it into a dict
            all_group_by_counts = {i[0]: i[1] for i in result.values.tolist()}
            all_group_by_values = all_group_by_counts.keys()

            max_group_by = max(list(all_group_by_counts.values()))
            self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

            # we will fill these depending on test_prob and validation_prob
            test_group_by_values = []
            validation_group_by_values = []
            train_group_by_values = []

            # Split the data into test and train by group-by value: each group
            # lands in the test set with probability test_prob (remember that
            # test_prob is 0 if a test set was passed explicitly), but only
            # once the train set has at least one group.
            for group_by_value in all_group_by_values:
                if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                    test_group_by_values += [group_by_value]
                # elif float(random.random()) < validation_prob:
                #     validation_group_by_values += [group_by_value]
                else:
                    train_group_by_values += [group_by_value]

        for i, row in enumerate(self.transaction.input_data.data_array):
            in_test = i in self.transaction.input_data.test_indexes
            if not in_test:
                if group_by:
                    group_by_value = row[group_by_index]
                    if group_by_value in test_group_by_values:
                        self.transaction.input_data.test_indexes += [i]
                    elif group_by_value in train_group_by_values:
                        self.transaction.input_data.train_indexes += [i]
                    elif group_by_value in validation_group_by_values:
                        self.transaction.input_data.validation_indexes += [i]
                else:
                    # remember that test_prob is 0 if a test set was passed explicitly
                    if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                        self.transaction.input_data.test_indexes += [i]
                    elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes) == 0:
                        self.transaction.input_data.validation_indexes += [i]
                    else:
                        self.transaction.input_data.train_indexes += [i]

        if len(self.transaction.input_data.test_indexes) == 0:
            logging.debug('Size of test set is zero, last split')
            ratio = CONFIG.TEST_TRAIN_RATIO
            if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                # to avoid overfitting it seems good practice to double the ratio,
                # as time series data tends to be abundant
                ratio = ratio * 2
            test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
            self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
            self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

        logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
        logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))
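# --- Illustrative sketch (not part of the original module) ---
# The GROUP BY counting query above wraps the user's model_query as a
# subquery. A self-contained demo of the same pattern against an in-memory
# SQLite database (the table and column names here are made up):
import sqlite3
import pandas

conn = sqlite3.connect(':memory:')
conn.executescript('''
    CREATE TABLE readings (sensor_id TEXT, value REAL);
    INSERT INTO readings VALUES ('a', 1.0), ('a', 2.0), ('b', 3.0);
''')

model_query = 'select * from readings'
group_by_query = '''
    select {group_by_column} as grp, count(1) as total
    from ( {query} ) sub
    group by {group_by_column}
'''.format(group_by_column='sensor_id', query=model_query)

counts = {row[0]: row[1] for row in pandas.read_sql_query(group_by_query, conn).values.tolist()}
# counts == {'a': 2, 'b': 1}; max(counts.values()) is what gets stored
# as max_group_by_count above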