Example #1
    def run(self):
        """
        Run the training process. We could iterate over all hyperparameters here and spin off model variations.
        TODO: check out the RISELab distributed ML projects for this

        :return: None
        """

        model_name = self.transaction.persistent_model_metadata.model_name
        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by

        # Choose which models to try
        # NOTE: In server mode more than one can be used; in serverless mode, choose only one
        # TODO: In serverless mode, add a smarter way to choose
        if group_by:
            ml_models = [('pytorch.models.ensemble_fully_connected_net', {})
                         # ,('pytorch.models.ensemble_conv_net', {})
                         ]
        else:
            ml_models = [('pytorch.models.fully_connected_net', {})
                         #,('pytorch.models.ensemble_fully_connected_net', {})
                         ]

        self.train_start_time = time.time()

        self.session.logging.info(
            'Training: model {model_name}, epoch 0'.format(
                model_name=model_name))

        self.last_time = time.time()
        # We moved everything to a worker so we can run many of these in parallel

        # Train column encoders

        for ml_model, config in ml_models:

            if not CONFIG.EXEC_LEARN_IN_THREAD or len(ml_models) == 1:
                TrainWorker.start(self.transaction.model_data,
                                  model_name=model_name,
                                  ml_model=ml_model,
                                  config=config)

            else:
                # TODO: use Ray https://github.com/ray-project/tutorial
                # Before moving to actual workers we must first find an efficient way to send model data to the worker
                _thread.start_new_thread(TrainWorker.start,
                                         (self.transaction.model_data,
                                          model_name, ml_model, config))
            # return

        total_time = time.time() - self.train_start_time
        self.session.logging.info(
            'Trained: model {model_name} [OK], TOTAL TIME: {total_time:.2f} seconds'
            .format(model_name=model_name, total_time=total_time))
    def predict(self,
                predict,
                from_data=None,
                when={},
                model_name='mdsb_model',
                breakpoint=PHASE_END):
        """

        :param predict:
        :param when:
        :param model_name:
        :return:
        """

        if not predict:
            raise ValueError('Please provide valid predict value.')

        transaction_type = TRANSACTION_PREDICT

        from_ds = None if from_data is None else getDS(from_data)

        predict_columns = predict if isinstance(predict, list) else [predict]

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name
        transaction_metadata.model_predict_columns = predict_columns
        transaction_metadata.model_when_conditions = when
        transaction_metadata.type = transaction_type
        transaction_metadata.storage_file = self.storage_file
        transaction_metadata.from_data = from_ds

        transaction = self.session.newTransaction(transaction_metadata,
                                                  breakpoint)

        return transaction.output_data
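
For orientation, a minimal usage sketch for the predict() entry point above. It assumes the class is importable as MindsDB (the name referenced by the learn() warning later in this collection); the model name, column name and condition are hypothetical.

from mindsdb import MindsDB  # assumed import path

mdb = MindsDB()

# Predict a single column for a hypothetical trained model, constrained by a `when` condition.
result = mdb.predict(predict='rental_price',
                     when={'number_of_rooms': 2},
                     model_name='home_rentals')
print(result)
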
Example #3
    def predict(self,
                predict,
                from_data=None,
                when={},
                model_name='mdsb_model',
                breakpoint=PHASE_END):
        """

        :param predict:
        :param when:
        :param model_name:
        :return:
        """

        transaction_type = TRANSACTION_PREDICT

        predict_columns = predict if isinstance(predict, list) else [predict]

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name
        transaction_metadata.model_predict_columns = predict_columns
        transaction_metadata.model_when_conditions = when
        transaction_metadata.type = transaction_type
        transaction_metadata.storage_file = self.storage_file
        transaction_metadata.from_data = from_data

        transaction = self.session.newTransaction(transaction_metadata,
                                                  breakpoint)

        return transaction.output_data
Example #4
    def predict(self,
                when={},
                from_data=None,
                model_name='mdsb_model',
                breakpoint=PHASE_END,
                **kargs):
        """

        :param predict:
        :param when:
        :param model_name:
        :return:
        """

        transaction_type = TRANSACTION_PREDICT

        from_ds = None if from_data is None else getDS(from_data)

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name

        # This will become irrelevant: once a model has been trained with a predict column, we only need to pass when or from_data
        # predict_columns = [predict] if type(predict) != type([]) else predict
        # transaction_metadata.model_predict_columns = predict_columns

        transaction_metadata.model_when_conditions = when
        transaction_metadata.type = transaction_type
        transaction_metadata.storage_file = self.storage_file
        transaction_metadata.from_data = from_ds

        transaction = self.session.newTransaction(transaction_metadata,
                                                  breakpoint)

        return transaction.output_data
Example #5
    def learn(self,
              predict,
              from_query=None,
              from_file=None,
              model_name='mdsb_model',
              test_query=None,
              group_by=None,
              group_by_limit=MODEL_GROUP_BY_DEAFAULT_LIMIT,
              order_by=[],
              breakpoint=PHASE_END):
        """

        :param from_query:
        :param predict:
        :param model_name:
        :param test_query:
        :return:
        """

        if from_file is not None:
            from_file_dest = os.path.basename(from_file).split('.')[0]
            self.addTable(CSVFileDS(from_file), from_file_dest)
            if from_query is None:
                from_query = 'select * from {from_file_dest}'.format(
                    from_file_dest=from_file_dest)
                logging.info('setting up custom learn query for file. ' +
                             from_query)

        transaction_type = TRANSACTION_LEARN

        predict_columns = predict if isinstance(predict, list) else [predict]

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name
        transaction_metadata.model_query = from_query
        transaction_metadata.model_predict_columns = predict_columns
        transaction_metadata.model_test_query = test_query
        transaction_metadata.model_group_by = group_by
        transaction_metadata.model_order_by = order_by if isinstance(order_by, list) else [order_by]
        transaction_metadata.model_group_by_limit = group_by_limit
        transaction_metadata.type = transaction_type

        self.startInfoServer()
        self.session.newTransaction(transaction_metadata, breakpoint)
Example #6
    def learn(self,
              predict,
              from_file=None,
              from_data=None,
              model_name='mdsb_model',
              test_from_data=None,
              group_by=None,
              window_size=MODEL_GROUP_BY_DEAFAULT_LIMIT,
              order_by=[],
              breakpoint=PHASE_END):
        """

        :param from_query:
        :param predict:
        :param model_name:
        :param test_query:
        :return:
        """

        from_ds = getDS(from_data) if from_file is None else getDS(from_file)
        test_from_ds = None if test_from_data is None else getDS(test_from_data)

        transaction_type = TRANSACTION_LEARN

        predict_columns = predict if isinstance(predict, list) else [predict]

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name
        transaction_metadata.model_predict_columns = predict_columns
        transaction_metadata.model_group_by = group_by
        transaction_metadata.model_order_by = order_by if isinstance(order_by, list) else [order_by]
        transaction_metadata.window_size = window_size
        transaction_metadata.type = transaction_type
        transaction_metadata.from_data = from_ds
        transaction_metadata.test_from_data = test_from_ds

        self.startInfoServer()
        self.session.newTransaction(transaction_metadata, breakpoint)
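
A hedged usage sketch of the learn() signature above, including the grouped/ordered variant that uses group_by, order_by and window_size. The import path, file names and column names are assumptions for illustration, not taken from this code.

from mindsdb import MindsDB  # assumed import path

mdb = MindsDB()

# Plain tabular case: learn to predict one column from a CSV file.
mdb.learn(predict='rental_price',
          from_file='home_rentals.csv',
          model_name='home_rentals')

# Grouped, ordered case: one series per group_by value, ordered by order_by,
# with window_size rows of history made available to the model.
mdb.learn(predict='sales',
          from_file='store_sales.csv',
          model_name='store_sales',
          group_by='store_id',
          order_by='date',
          window_size=10)
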
Example #7
    def run(self):

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        # Train metadata is the metadata that was used when training the model.
        # NOTE: we need this train metadata even when predicting, so we can reason about the model.
        train_metadata = None

        if self.transaction.metadata.type == TRANSACTION_PREDICT:
            # extract this from the persistent_model_metadata
            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(
                self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type == TRANSACTION_LEARN:
            # Pull this straight from the current transaction
            train_metadata = self.transaction.metadata

        else:
            # We cannot proceed without train metadata
            err = 'Transaction type {type} is not supported'.format(
                type=self.transaction.metadata.type)
            self.session.logging.error(err)
            self.transaction.error = True
            self.transaction.errorMsg = err
            return

        result = self.getPreparedInputDF(train_metadata)

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        # Make sure that the column we are trying to predict is in input_data;
        # otherwise fail, because we cannot predict data we don't have.
        # TODO: Revise this. We may pass source data that doesn't have the column we want to predict,
        # and that can still be OK if we are making a prediction that is not time series.
        if len(data_array[0]
               ) > 0 and self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(
                        column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            # if a test_data set was given use it
            if self.transaction.metadata.test_from_data:
                df = self.transaction.metadata.test_from_data.df
                test_result = df.where((pandas.notnull(df)), None)

                columns = list(test_result.columns.values)
                data_array = test_result.values.tolist()

                # Make sure that the test and train sets match column-wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get test data, but the columns in the train set and test set don\'t match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array = len(data_array)
                test_indexes = list(range(total_data_array, total_data_array + total_test_array))

                self.transaction.input_data.test_indexes = test_indexes
                # append the test rows to the end of the input data
                self.transaction.input_data.data_array += data_array

                # an explicit test set was given, so no additional rows need to be randomly assigned to it
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(
                        group_by)
                except ValueError:
                    group_by_index = None
                    err = 'Trying to group by {column}, but the column is not in the source data'.format(
                        column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                #all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                #self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))

                uniques = result.groupby([group_by]).size()
                all_group_by_values = uniques.index.tolist()
                uniques_counts = uniques.values.tolist()

                # build a dict mapping each group_by value to its row count (uniques is a Series indexed by group value)

                all_group_by_counts = {
                    value: uniques_counts[i]
                    for i, value in enumerate(all_group_by_values)
                }

                max_group_by = max(list(all_group_by_counts.values()))

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # a group is assigned to the test split when a random draw falls below test_prob
                    # remember that test_prob is 0 when an explicit test set was passed, otherwise the config value
                    if float(random.random()) < test_prob and len(
                            train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = i in self.transaction.input_data.test_indexes
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values:
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values:
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values:
                            self.transaction.input_data.validation_indexes += [
                                i
                            ]

                    else:
                        # remember that test_prob is 0 when an explicit test set was passed, otherwise the config value
                        if float(random.random()) <= test_prob or len(
                                self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(
                                self.transaction.input_data.validation_indexes
                        ) == 0:
                            self.transaction.input_data.validation_indexes += [
                                i
                            ]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Size of test set is zero, using the last rows of the train set as test')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(
                        self.transaction.input_data.train_indexes) > 2000:
                    # to avoid overfitting it seems good practice to double the ratio, as time series data tends to be abundant
                    ratio = ratio * 2
                test_size = int(
                    len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[
                    -test_size:]
                self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:
                                                                                                      -test_size]

            logging.info('- Test: {size} rows'.format(
                size=len(self.transaction.input_data.test_indexes)))
            logging.info('- Train: {size} rows'.format(
                size=len(self.transaction.input_data.train_indexes)))
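
The split above assigns whole group_by groups to the test or train split, so rows from the same group never end up on both sides of the boundary. Below is a stand-alone sketch of that idea with made-up data, outside the transaction machinery:

import random

def split_groups(rows, group_by_index, test_prob=0.1, seed=0):
    """Assign whole groups to test or train, mirroring the group-by split above."""
    random.seed(seed)
    group_values = sorted({row[group_by_index] for row in rows})

    test_groups, train_groups = [], []
    for value in group_values:
        # a group can only go to test once at least one group is already in train
        if random.random() < test_prob and len(train_groups) > 0:
            test_groups.append(value)
        else:
            train_groups.append(value)

    test_indexes = [i for i, row in enumerate(rows) if row[group_by_index] in test_groups]
    train_indexes = [i for i, row in enumerate(rows) if row[group_by_index] in train_groups]
    return train_indexes, test_indexes

# hypothetical rows: (store_id, date, sales)
rows = [('a', 1, 10), ('a', 2, 12), ('b', 1, 7), ('c', 1, 5), ('c', 2, 6)]
train_idx, test_idx = split_groups(rows, group_by_index=0, test_prob=0.5)
print(train_idx, test_idx)
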
Example #8
    def learn(self,
              predict,
              from_file=None,
              from_data=None,
              model_name='mdsb_model',
              test_from_data=None,
              group_by=None,
              window_size=MODEL_GROUP_BY_DEAFAULT_LIMIT,
              order_by=[],
              breakpoint=PHASE_END,
              ignore_columns=[],
              rename_strange_columns=True):
        """

        :param from_query:
        :param predict:
        :param model_name:
        :param test_query:
        :return:
        """
        if self._from_data is None:
            from_ds = getDS(from_data) if from_file is None else getDS(
                from_file)
        else:
            from_ds = getDS(self._from_data)
        test_from_ds = None if test_from_data is None else getDS(test_from_data)

        transaction_type = TRANSACTION_LEARN

        predict_columns_map = {}
        predict_columns = predict if isinstance(predict, list) else [predict]

        if rename_strange_columns is False:
            for predict_col in predict_columns:
                predict_col_as_in_df = from_ds.getColNameAsInDF(predict_col)
                predict_columns_map[predict_col_as_in_df] = predict_col

            predict_columns = list(predict_columns_map.keys())
        else:
            logging.warning(
                'After version 1.0 the default value of rename_strange_columns in MindsDB().learn will be flipped from True to False'
            )

        transaction_metadata = TransactionMetadata()
        transaction_metadata.model_name = model_name
        transaction_metadata.model_predict_columns = predict_columns
        transaction_metadata.model_columns_map = {} if rename_strange_columns else from_ds._col_map
        transaction_metadata.model_group_by = group_by
        transaction_metadata.model_order_by = order_by if isinstance(order_by, list) else [order_by]
        transaction_metadata.window_size = window_size
        transaction_metadata.type = transaction_type
        transaction_metadata.from_data = from_ds
        transaction_metadata.test_from_data = test_from_ds
        transaction_metadata.ignore_columns = ignore_columns

        self.startInfoServer()
        self.session.newTransaction(transaction_metadata, breakpoint)
Example #9
class StatsGenerator(BaseModule):

    phase_name = PHASE_DATA_STATS

    def isNumber(self, string):
        """ Returns True if string is a number. """
        try:
            cleanfloat(string)
            return True
        except ValueError:
            return False

    def isDate(self, string):
        """ Returns True if string is a valid date format """
        try:
            parseDate(string)
            return True
        except ValueError:
            return False

    def getColumnDataType(self, data):
        """ Returns the column datatype based on a random sample of 15 elements """
        currentGuess = DATA_TYPES.NUMERIC
        type_dist = {}

        for element in data:
            if self.isNumber(element):
                currentGuess = DATA_TYPES.NUMERIC
            elif self.isDate(element):
                currentGuess = DATA_TYPES.DATE
            else:
                currentGuess = DATA_TYPES.TEXT

            if currentGuess not in type_dist:
                type_dist[currentGuess] = 1
            else:
                type_dist[currentGuess] += 1

        curr_data_type = DATA_TYPES.TEXT
        max_data_type = 0

        for data_type in type_dist:
            if type_dist[data_type] > max_data_type:
                curr_data_type = data_type
                max_data_type = type_dist[data_type]

        if curr_data_type == DATA_TYPES.TEXT:
            return self.getTextType(data)

        return curr_data_type

    def getBestFitDistribution(self, data, bins=40):
        """Model data by finding best fit distribution to data"""
        # Get histogram of original data

        y, x = np.histogram(data, bins=bins, density=False)
        x = (x + np.roll(x, -1))[:-1] / 2.0
        # Distributions to check
        DISTRIBUTIONS = [
            st.bernoulli, st.beta,  st.cauchy, st.expon,  st.gamma, st.halfcauchy, st.lognorm,
            st.norm, st.uniform, st.poisson
        ]

        # Best holders
        best_distribution = st.norm
        best_params = (0.0, 1.0)
        best_sse = np.inf
        # Estimate distribution parameters from data
        for i, distribution in enumerate(DISTRIBUTIONS):
            try:
                # Ignore warnings from data that can't be fit
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore')
                    # fit dist to data
                    params = distribution.fit(data)
                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))
                    # identify if this distribution is better
                    if best_sse > sse > 0:
                        best_distribution = distribution
                        best_params = params
                        best_sse = sse

            except Exception:
                pass

        return (best_distribution.name, best_params, x.tolist(), y.tolist())

    def getTextType(self, data):

        total_length = len(data)
        key_count = {}
        max_number_of_words = 0

        for cell in data:

            if cell not in key_count:
                key_count[cell] = 1
            else:
                key_count[cell] += 1

            cell_wseparator = cell
            sep_tag = '{#SEP#}'
            for separator in WORD_SEPARATORS:
                cell_wseparator = str(cell_wseparator).replace(separator,sep_tag)

            words_split = cell_wseparator.split(sep_tag)
            words = len([ word for word in words_split if word not in ['', None] ])

            if max_number_of_words < words:
                # track the maximum number of words seen in a single cell
                max_number_of_words = words

        if max_number_of_words == 1:
            return DATA_TYPES.TEXT
        if max_number_of_words <= 3 and len(key_count) < total_length * 0.8:
            return DATA_TYPES.TEXT
        else:
            return DATA_TYPES.FULL_TEXT



    # def isFullText(self, data):
    #     """
    #     It determines if the column is full text right
    #     Right now we assume its full text if any cell contains any of the WORD_SEPARATORS
    #
    #     :param data: a list containing all the column
    #     :return: Boolean
    #     """
    #     for cell in data:
    #         try:
    #             if any(separator in cell for separator in WORD_SEPARATORS):
    #                 return True
    #         except:
    #             exc_type, exc_value, exc_traceback = sys.exc_info()
    #             error = traceback.format_exception(exc_type, exc_value,
    #                                       exc_traceback)
    #             return False
    #     return False





    def getWordsDictionary(self, data, full_text = False):
        """ Returns an array of all the words that appear in the dataset and the number of times each word appears in the dataset """

        splitter = lambda w, t: [wi.split(t) for wi in w] if isinstance(w, list) else w.split(t)

        if full_text:
            # get all words in every cell and then calculate histograms
            words = []
            for cell in data:
                words += splitRecursive(cell, WORD_SEPARATORS)

            hist = {i: words.count(i) for i in words}
            x = list(hist.keys())
            histogram = {
                'x': x,
                'y': list(hist.values())
            }
            return x, histogram


        else:
            hist = {i: data.count(i) for i in data}
            x = list(hist.keys())
            histogram = {
                'x': x,
                'y': list(hist.values())
            }
            return x, histogram

    def getParamsAsDictionary(self, params):
        """ Returns a dictionary with the params of the distribution """
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]
        ret = {
            'loc': loc,
            'scale': scale,
            'shape': arg
        }
        return ret



    def run(self):

        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        header = self.transaction.input_data.columns
        origData = {}

        for column in header:
            origData[column] = []

        empty_count = {}
        column_count = {}

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(sampleSize(population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size), sample_size)
        self.logging.info('population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100))

        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]
            for i, val in enumerate(row):
                column = header[i]
                value = tryCastToNumber(val)
                if column not in empty_count:
                    empty_count[column] = 0
                    column_count[column] = 0
                if value is None:
                    empty_count[column] += 1
                else:
                    origData[column].append(value)
                column_count[column] += 1
        stats = {}

        for col_name in origData:
            col_data = origData[col_name] # all rows in just one column
            data_type = self.getColumnDataType(col_data)

            # NOTE: Enable this if you want to assume that some numeric values can be text
            # We noticed that by default this should not be the behavior
            # TODO: Evaluate if we want to specify the problem type on predict statement as regression or classification
            #
            # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
            #     unique_count = len(set(col_data))
            #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
            #         data_type = DATA_TYPES.TEXT

            if data_type == DATA_TYPES.DATE:
                for i, element in enumerate(col_data):
                    if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                        col_data[i] = None
                    else:
                        try:
                            col_data[i] = int(parseDate(element).timestamp())
                        except Exception:
                            logging.warning('Could not convert string to date as expected, current value: {value}'.format(value=element))
                            col_data[i] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                newData = []

                for value in col_data:
                    if value != '' and value != '\r' and value != '\n':
                        newData.append(value)


                col_data = [cleanfloat(i) for i in newData if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

                    inc_rate = 0.05
                    initial_step_size = abs(max_value-min_value)/100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:

                        xp += [i]
                        i_inc = abs(i-min_value)*inc_rate
                        i = i + i_inc


                    # TODO: Solve inc_rate for N
                    #    min*inx_rate + (min+min*inc_rate)*inc_rate + (min+(min+min*inc_rate)*inc_rate)*inc_rate ....
                    #
                    #      x_0 = 0
                    #      x_i = (min+x_(i-1)) * inc_rate = min*inc_rate + x_(i-1)*inc_rate
                    #
                    #      sum of x_i_{i=1}^n (x_i) = max_value = inc_rate ( n * min + sum(x_(i-1)) )
                    #
                    #      mx_value/inc_rate = n*min + inc_rate ( n * min + sum(x_(i-2)) )
                    #
                    #     mx_value = n*min*in_rate + inc_rate^2*n*min + inc_rate^2*sum(x_(i-2))
                    #              = n*min(inc_rate+inc_rate^2) + inc_rate^2*sum(x_(i-2))
                    #              = n*min(inc_rate+inc_rate^2) + inc_rate^2*(inc_rate ( n * min + sum(x_(i-3)) ))
                    #              = n*min(sum_(i=1)^(i=n)(inc_rate^i))
                    #    =>  sum_(i=1)^(i=n)(inc_rate^i)) = max_value/(n*min(sum_(i=1)^(i=n))
                    #
                    # # i + i*x

                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []


                is_float = any(int(i) != i for i in col_data)


                col_stats = {
                    "column": col_name,
                    KEYS.DATA_TYPE: data_type,
                    # "distribution": best_fit_name,
                    # "distributionParams": distribution_params,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "emptyColumns": empty_count[col_name],
                    "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
                stats[col_name] = col_stats
            # else if its text
            else:

                # see if it's full text (sentences) or single words
                is_full_text = data_type == DATA_TYPES.FULL_TEXT
                dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(
                        dictionary) / len(col_data) * 100
                    # if the ratio of unique values is too large, treat it as text without a dictionary
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and not is_full_text:
                        dictionary = []
                        dictionary_available = False
                col_stats = {

                    "column": col_name,
                    KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "emptyColumns": empty_count[col_name],
                    "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                    "histogram": histogram
                }
                stats[col_name] = col_stats



        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.persistent_model_metadata.column_stats = stats
        self.transaction.persistent_model_metadata.total_row_count = total_rows
        self.transaction.persistent_model_metadata.test_row_count = test_rows
        self.transaction.persistent_model_metadata.train_row_count = train_rows
        self.transaction.persistent_model_metadata.validation_row_count = validation_rows

        self.transaction.persistent_model_metadata.update()

        return stats
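
sampleSize() itself is not shown in this example. A common way to compute such a sample size, and a plausible reading of what it does, is Cochran's formula with a finite population correction; the sketch below is only an illustration under that assumption, with made-up margin-of-error and confidence defaults, not the actual implementation.

import math
import scipy.stats as st

def sample_size(population_size, margin_error=0.02, confidence_level=0.98, p=0.5):
    """Cochran's sample size with finite population correction (illustrative only)."""
    z = st.norm.ppf(1 - (1 - confidence_level) / 2)     # z-score for a two-sided interval
    n0 = (z ** 2) * p * (1 - p) / (margin_error ** 2)   # infinite-population sample size
    n = n0 / (1 + (n0 - 1) / population_size)           # finite population correction
    return int(math.ceil(n))

print(sample_size(population_size=100000))
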
Example #11
    def run(self):

        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by

        group_by_index = None
        if group_by:
            group_by_index = self.transaction.input_data.columns.index(
                group_by
            )  # TODO: Consider supporting more than one index column

        # this is a template of how we store columns
        column_packs_template = OrderedDict()

        # create a template of the column packs
        for i, column_name in enumerate(self.transaction.input_data.columns):
            column_packs_template[column_name] = []

        if self.transaction.metadata.type == TRANSACTION_LEARN:
            groups = [{
                'name': 'test',
                'target_set': self.transaction.model_data.test_set,
                'map': self.transaction.model_data.test_set_map,
                'indexes': self.transaction.input_data.test_indexes
            }, {
                'name': 'train',
                'target_set': self.transaction.model_data.train_set,
                'map': self.transaction.model_data.train_set_map,
                'indexes': self.transaction.input_data.train_indexes
            }, {
                'name': 'validation',
                'target_set': self.transaction.model_data.validation_set,
                'map': self.transaction.model_data.validation_set_map,
                'indexes': self.transaction.input_data.validation_indexes
            }]
        else:
            groups = [{
                'name':
                'predict',
                'target_set':
                self.transaction.model_data.predict_set,
                'map':
                self.transaction.model_data.predict_set_map,
                'indexes':
                range(0, len(self.transaction.input_data.data_array)
                      )  # TODO: measure impact of this
            }]

        # iterate over all groups and populate tensors by columns

        for group in groups:

            target_set = group['target_set']  # for ease use a pointer

            # iterate over all indexes that belong to this group
            for input_row_index in group['indexes']:

                row = self.transaction.input_data.data_array[
                    input_row_index]  # extract the row from input data
                map = group['map']

                if group_by is not None:
                    group_by_hash = hashtext(row[group_by_index])

                else:
                    group_by_hash = KEY_NO_GROUP_BY

                # if the set group has not been initiated add a new one
                if group_by_hash not in target_set:
                    target_set[group_by_hash] = copy.deepcopy(
                        column_packs_template)
                    map[group_by_hash] = {}

                # Now populate into the group_hash column pile
                for column_index, cell_value in enumerate(row):

                    column_name = self.transaction.input_data.columns[
                        column_index]

                    value = self.cast(cell_value)
                    stats = self.transaction.persistent_model_metadata.column_stats[
                        column_name]

                    # TODO: Provide a framework for custom norm functions
                    # TODO: FIX norm always add column for is null
                    normalized = norm(
                        value=value, cell_stats=stats
                    )  # this should return a vector representation already normalized

                    # keep track of where it came from in the input data in case we need to go back
                    position = len(target_set[group_by_hash][column_name])
                    map[group_by_hash][position] = input_row_index

                    # append normalized vector to column tensor
                    target_set[group_by_hash][column_name] += [normalized]

                    if self.transaction.persistent_model_metadata.column_stats[
                            column_name][KEYS.DATA_TYPE] in [
                                DATA_TYPES.NUMERIC, DATA_TYPES.DATE
                            ] and column_name in self.train_meta_data.model_predict_columns:
                        column_name_expanded = EXTENSION_COLUMNS_TEMPLATE.format(
                            column_name=column_name)
                        if column_name_expanded not in target_set[
                                group_by_hash]:
                            target_set[group_by_hash][
                                column_name_expanded] = []

                        normalized_buckets = norm_buckets(value=value,
                                                          cell_stats=stats)
                        target_set[group_by_hash][column_name_expanded] += [
                            normalized_buckets
                        ]

            # turn into numpy arrays:
            for group_by_hash in target_set:

                distances = None

                # if we have a group by and order by calculate a distances vector for each data point in this batch
                if self.train_meta_data.model_group_by is not None and self.train_meta_data.model_order_by is not None:
                    distances = []
                    batch_height = len(target_set[group_by_hash][
                        self.train_meta_data.model_group_by])

                    # create a vector for the top distance

                    for j in range(batch_height):
                        order_by_bottom_vector = np.array(
                            list(
                                itertools.chain.from_iterable([
                                    target_set[group_by_hash][order_by_col][j]
                                    for order_by_col in
                                    self.train_meta_data.model_order_by
                                ])))
                        if j == 0:
                            order_by_top_vector = order_by_bottom_vector
                        else:
                            order_by_top_vector = np.array(
                                list(
                                    itertools.chain.from_iterable([
                                        target_set[group_by_hash][order_by_col]
                                        [j - 1] for order_by_col in
                                        self.train_meta_data.model_order_by
                                    ])))
                        # create a vector for the current row

                        # calculate distance and append to distances
                        distance = float(
                            np.linalg.norm(order_by_top_vector -
                                           order_by_bottom_vector))
                        distances.append(distance)

                # Append the time series data to each column
                # NOTE: we want to make sure that the self.train_meta_data.model_predict_columns are the first in being converted into vectors
                #       the reason for this is that if there is a time series query then we will want to add the history of the target value (see self._getRowExtraVector)
                columns_in_order = self.train_meta_data.model_predict_columns + [
                    column_name
                    for column_name in target_set[group_by_hash] if column_name
                    not in self.train_meta_data.model_predict_columns
                ]

                for column_name in columns_in_order:

                    # if there is a group by and order by and this is not a column to be predicted, append history vector
                    # TODO: Encode the history vector if possible
                    non_groupable_columns = self.train_meta_data.model_predict_columns + [
                        self.train_meta_data.model_group_by
                    ] + self.train_meta_data.model_order_by
                    # NOTE: since distances is only not None if there is a group by this is only evaluated for group by queries
                    if distances is not None and column_name not in non_groupable_columns:
                        # for each row create a vector of history and append to it
                        prev = 0
                        for col_row_index, col_row in enumerate(
                                target_set[group_by_hash][column_name]):
                            row_extra_vector = self._getRowExtraVector(
                                target_set[group_by_hash], column_name,
                                col_row_index, distances)
                            target_set[group_by_hash][column_name][
                                col_row_index] = target_set[group_by_hash][
                                    column_name][
                                        col_row_index] + row_extra_vector

                    target_set[group_by_hash][column_name] = np.array(
                        target_set[group_by_hash][column_name])

        return []
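
The distances list built above is just the Euclidean distance between the order_by vector of each row and that of the previous row in the same group_by batch (0.0 for the first row). A stripped-down sketch of that step, with hypothetical, already-normalized order_by values:

import numpy as np

def consecutive_distances(order_by_rows):
    """Distance from each row's order_by vector to the previous row's; 0.0 for the first row."""
    distances = []
    for j, current in enumerate(order_by_rows):
        previous = order_by_rows[j] if j == 0 else order_by_rows[j - 1]
        distances.append(float(np.linalg.norm(np.array(previous) - np.array(current))))
    return distances

# e.g. one normalized (timestamp,) vector per row of a single group_by batch
print(consecutive_distances([[0.00], [0.10], [0.25], [0.30]]))  # ~[0.0, 0.1, 0.15, 0.05]
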
Example #12
class DataVectorizer(BaseModule):

    phase_name = PHASE_DATA_VECTORIZATION

    def cast(self, string):
        """ Returns an integer, float or a string from a string"""
        try:
            if string is None:
                return None
            return int(string)
        except ValueError:
            try:
                return float(string)
            except ValueError:
                if string == '':
                    return None
                else:
                    return string

    def _getRowExtraVector(self, ret, column_name, col_row_index, distances):

        predict_columns = self.train_meta_data.model_predict_columns

        desired_total = self.train_meta_data.window_size
        batch_height = len(ret[column_name])
        remaining_row_count = batch_height - (col_row_index + 1)

        harvest_count = desired_total if desired_total < remaining_row_count else remaining_row_count
        empty_count = desired_total - harvest_count
        empty_vector_len = (len(ret[column_name][col_row_index]) + sum([
            len(ret[predict_col_name][0])
            for predict_col_name in predict_columns
        ]) + 1) * empty_count  # this is the width of the padding

        row_extra_vector = []

        for i in range(harvest_count):
            try:
                row_extra_vector += ret[column_name][col_row_index + i + 1]
                row_extra_vector += [distances[col_row_index + i + 1]]

                # append the target values before:
                for predict_col_name in predict_columns:
                    row_extra_vector += [
                        float(v)
                        for v in ret[predict_col_name][col_row_index + i + 1]
                    ]
            except Exception:
                logging.error(traceback.format_exc())
                logging.error(
                    'Something is not right: it seems we got here with numpy arrays, and they should not be here!'
                )

        if empty_count > 0:
            # complete with empty
            row_extra_vector += [0] * empty_vector_len

        return row_extra_vector

    def run(self):

        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by

        group_by_index = None
        if group_by:
            group_by_index = self.transaction.input_data.columns.index(
                group_by
            )  # TODO: Consider supporting more than one index column

        # this is a template of how we store columns
        column_packs_template = OrderedDict()

        # create a template of the column packs
        for i, column_name in enumerate(self.transaction.input_data.columns):
            column_packs_template[column_name] = []

        if self.transaction.metadata.type == TRANSACTION_LEARN:
            groups = [{
                'name': 'test',
                'target_set': self.transaction.model_data.test_set,
                'map': self.transaction.model_data.test_set_map,
                'indexes': self.transaction.input_data.test_indexes
            }, {
                'name': 'train',
                'target_set': self.transaction.model_data.train_set,
                'map': self.transaction.model_data.train_set_map,
                'indexes': self.transaction.input_data.train_indexes
            }, {
                'name': 'validation',
                'target_set': self.transaction.model_data.validation_set,
                'map': self.transaction.model_data.validation_set_map,
                'indexes': self.transaction.input_data.validation_indexes
            }]
        else:
            groups = [{
                'name': 'predict',
                'target_set': self.transaction.model_data.predict_set,
                'map': self.transaction.model_data.predict_set_map,
                'indexes': range(0, len(self.transaction.input_data.data_array))  # TODO: measure impact of this
            }]

        # iterate over all groups and populate tensors by columns

        for group in groups:

            target_set = group['target_set']  # local alias for the destination set

            # iterate over all indexes that belong to this group
            for input_row_index in group['indexes']:

                row = self.transaction.input_data.data_array[
                    input_row_index]  # extract the row from input data
                row_map = group['map']  # avoid shadowing the built-in map()

                if group_by is not None:
                    group_by_hash = hashtext(row[group_by_index])

                else:
                    group_by_hash = KEY_NO_GROUP_BY

                # if this group has not been initialised yet, add a fresh column pack and row map
                if group_by_hash not in target_set:
                    target_set[group_by_hash] = copy.deepcopy(
                        column_packs_template)
                    row_map[group_by_hash] = {}

                # Now populate into the group_hash column pile
                for column_index, cell_value in enumerate(row):

                    column_name = self.transaction.input_data.columns[
                        column_index]

                    value = self.cast(cell_value)
                    stats = self.transaction.persistent_model_metadata.column_stats[
                        column_name]

                    # TODO: Provide a framework for custom norm functions
                    # TODO: FIX norm so that it always adds a column for is-null
                    normalized = norm(
                        value=value, cell_stats=stats
                    )  # this should return a vector representation already normalized

                    # keep track of where it came from in the input data in case we need to go back
                    position = len(target_set[group_by_hash][column_name])
                    row_map[group_by_hash][position] = input_row_index

                    # append normalized vector to column tensor
                    target_set[group_by_hash][column_name] += [normalized]
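                    # resulting layout (illustrative): target_set maps group_by_hash
                    # -> {column_name: [normalized_vector, ...]}, and the per-group map
                    # records which input row index each position came from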

            # turn into numpy arrays:
            for group_by_hash in target_set:

                distances = None

                # if we have a group by and an order by, calculate a distance vector for each data point in this batch
                if self.train_meta_data.model_group_by is not None and self.train_meta_data.model_order_by is not None:
                    distances = []
                    batch_height = len(target_set[group_by_hash][
                        self.train_meta_data.model_group_by])

                    # create a vector for the top distance

                    for j in range(batch_height):
                        order_by_bottom_vector = np.array(
                            list(
                                itertools.chain.from_iterable([
                                    target_set[group_by_hash][order_by_col][j]
                                    for order_by_col in
                                    self.train_meta_data.model_order_by
                                ])))
                        if j == 0:
                            order_by_top_vector = order_by_bottom_vector
                        else:
                            order_by_top_vector = np.array(
                                list(
                                    itertools.chain.from_iterable([
                                        target_set[group_by_hash][order_by_col]
                                        [j - 1] for order_by_col in
                                        self.train_meta_data.model_order_by
                                    ])))
                        # create a vector for the current row

                        # calculate distance and append to distances
                        distance = float(
                            np.linalg.norm(order_by_top_vector -
                                           order_by_bottom_vector))
                        distances.append(distance)
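                    # illustrative (hedged) example: for a single scalar order-by column
                    # encoded as [1, 3, 6], the loop above yields distances = [0.0, 2.0, 3.0]
                    # (0 for the first row, then the L2 distance to the previous row)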

                # Append the time series data to each column
                # NOTE: make sure the self.train_meta_data.model_predict_columns are converted into vectors first,
                #       because for time-series queries we append the history of the target values (see self._getRowExtraVector)
                columns_in_order = self.train_meta_data.model_predict_columns + [
                    column_name
                    for column_name in target_set[group_by_hash] if column_name
                    not in self.train_meta_data.model_predict_columns
                ]
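                # e.g. (hypothetical) predict columns ['price'] and target_set columns
                # ['date', 'store', 'price'] give columns_in_order = ['price', 'date', 'store']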

                for column_name in columns_in_order:

                    # if there is a group by and order by and this is not a column to be predicted, append history vector
                    # TODO: Encode the history vector if possible
                    non_groupable_columns = self.train_meta_data.model_predict_columns + [
                        self.train_meta_data.model_group_by
                    ] + self.train_meta_data.model_order_by
                    # NOTE: distances is not None only when there is both a group by and an order by, so this only runs for grouped time-series queries
                    if distances is not None and column_name not in non_groupable_columns:
                        # for each row create a vector of history and append to it
                        prev = 0
                        for col_row_index, col_row in enumerate(
                                target_set[group_by_hash][column_name]):
                            row_extra_vector = self._getRowExtraVector(
                                target_set[group_by_hash], column_name,
                                col_row_index, distances)
                            target_set[group_by_hash][column_name][
                                col_row_index] = target_set[group_by_hash][
                                    column_name][
                                        col_row_index] + row_extra_vector

                    target_set[group_by_hash][column_name] = np.array(
                        target_set[group_by_hash][column_name])

        return []
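
# --- Hedged, self-contained sketch (not part of the original module) ---
# It mirrors the per-group bucketing done in DataVectorizer.run(): rows are keyed
# by a hash of their group-by value and each key holds one list per column.
#
#     from collections import OrderedDict
#     import copy
#
#     columns = ['store', 'price']               # hypothetical column names
#     rows = [['A', 10], ['A', 12], ['B', 7]]    # hypothetical input rows
#     template = OrderedDict((c, []) for c in columns)
#     target_set, row_map = {}, {}
#     for i, row in enumerate(rows):
#         key = hash(row[0])                     # stand-in for hashtext()
#         if key not in target_set:
#             target_set[key] = copy.deepcopy(template)
#             row_map[key] = {}
#         for col, cell in zip(columns, row):
#             row_map[key][len(target_set[key][col])] = i
#             target_set[key][col] += [cell]
#     # target_set[hash('A')] == OrderedDict([('store', ['A', 'A']), ('price', [10, 12])])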
Example #13
0
    def run(self):

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        train_metadata = self.transaction.metadata

        if self.transaction.metadata.type == TRANSACTION_PREDICT:

            self.populatePredictQuery()

            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type not in [TRANSACTION_PREDICT, TRANSACTION_LEARN]:

            err = 'Transaction type {type} is not supported'.format(type=self.transaction.metadata.type)
            self.session.logging.error(err)
            self.transaction.error = True
            self.transaction.errorMsg = err
            return



        query = self.prepareFullQuery(train_metadata)

        try:
            self.transaction.session.logging.info('About to pull query {query}'.format(query=query))
            conn = sqlite3.connect(self.transaction.metadata.storage_file)
            self.session.logging.info(self.transaction.metadata.model_query)
            df = pandas.read_sql_query(query, conn)
            result = df.where((pandas.notnull(df)), None)
            df = None # clean memory

        except Exception:

            self.session.logging.error(traceback.format_exc())
            self.transaction.error = True
            self.transaction.errorMsg = traceback.format_exc()
            return

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        if data_array and len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            if self.transaction.metadata.model_test_query:
                try:
                    test_query = query_wrapper.format(orig_query = self.transaction.metadata.model_test_query, order_by_string= order_by_string, where_not_null_string=where_not_null_string)
                    self.transaction.session.logging.info('About to pull TEST query {query}'.format(query=test_query))
                    #drill = self.session.drill.query(test_query, timeout=CONFIG.DRILL_TIMEOUT)
                    df = pandas.read_sql_query(test_query, conn)
                    result = df.where((pandas.notnull(df)), None)
                    df = None

                    #result = vars(drill)['data']
                except Exception:

                    # If testing offline, get results from a .cache file
                    self.session.logging.error(traceback.format_exc())
                    self.transaction.error = True
                    self.transaction.errorMsg = traceback.format_exc()
                    return

                columns = list(result.columns.values)
                data_array = result.values.tolist()

                # Make sure that test and train sets match column-wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get data for test but columns in train set and test set do not match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array = len(data_array)
                test_indexes = [i for i in range(total_data_array, total_data_array+total_test_array)]
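                # e.g. (illustrative): 100 training rows plus a 20-row test query give
                # test_indexes = [100, 101, ..., 119]; the test rows are appended to
                # data_array just below, so these indexes point at them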

                self.transaction.input_data.test_indexes = test_indexes
                # append the test rows to the input data
                self.transaction.input_data.data_array += data_array

                # the test set came from an explicit query, so no test split needs to be carved out below
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1-test_prob)
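            # hedged arithmetic: if CONFIG.TEST_TRAIN_RATIO were 0.1 and a test split was
            # carved out (test_prob = 0.1), validation_prob = 0.1 / 0.9 ~= 0.111, i.e.
            # roughly 10% of the remaining rows end up as validation data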

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(group_by)
                except ValueError:
                    group_by_index = None
                    err = 'Trying to group by {column} but column not in source data'.format(column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))
                df = pandas.read_sql_query(all_group_by_items_query, conn)
                result = df.where((pandas.notnull(df)), None)
                # build a dict of group-by value -> row count; the query returns rows of [value, count]

                all_group_by_counts = {i[0]: i[1] for i in result.values.tolist()}
                all_group_by_values = all_group_by_counts.keys()

                max_group_by = max(list(all_group_by_counts.values()))
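                # e.g. (hypothetical) query result [['storeA', 120], ['storeB', 80]] gives
                # all_group_by_counts = {'storeA': 120, 'storeB': 80} and max_group_by = 120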

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # a random draw below test_prob assigns this group-by value to the test split
                    # remember that test_prob can be 0 or the config value, depending on whether a test set was passed as a query
                    if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = i in self.transaction.input_data.test_indexes
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values:
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values:
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values:
                            self.transaction.input_data.validation_indexes += [i]

                    else:
                        # remember that test_prob can be 0 or the config value, depending on whether a test set was passed as a query
                        if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes)==0:
                            self.transaction.input_data.validation_indexes += [i]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Test set is empty; falling back to a tail split of the train set')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                    # to reduce overfitting, double the ratio, since time-series data tends to be abundant
                    ratio = ratio * 2
                test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
                self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]
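                # illustrative fallback (assuming CONFIG.TEST_TRAIN_RATIO = 0.1): with 1000
                # train rows and no group by, test_size = 100, so the last 100 train rows
                # become the test set; with a group by and > 2000 train rows the ratio doubles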

            logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
            logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))