Example No. 1
 def __init__(self, e_c=None, user_id='PoC_gDayF', workflow_id='default'):
     self.timestamp = str(time())
     if e_c is None:
         if workflow_id == 'default':
             self._ec = E_C(user_id=user_id,
                            workflow_id=workflow_id + '_' + self.timestamp)
         else:
             self._ec = E_C(user_id=user_id, workflow_id=workflow_id)
     else:
         self._ec = e_c
     self._config = self._ec.config.get_config()
     self._labels = self._ec.labels.get_config()['messages']['controller']
     self._frameworks = self._ec.config.get_config()['frameworks']
     self._logging = LogsHandler(self._ec)
     self.analysis_list = OrderedDict()  # For future multi-analysis uses
     self.model_handler = OrderedDict()
     self.adviser = importlib.import_module(
         self._config['optimizer']['adviser_classpath'])
     self._logging.log_info('gDayF', "Controller",
                            self._labels["loading_adviser"],
                            self._config['optimizer']['adviser_classpath'])
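
A minimal usage sketch (not part of the scraped snippet; the workflow id is illustrative): when e_c is omitted the constructor builds its own experiment context from the user and workflow ids, and config_checks() (shown in Example No. 3) can then validate the storage setup.

controller = Controller(user_id='PoC_gDayF', workflow_id='churn_poc')
if controller.config_checks():
    pass  # storage configuration is coherent; safe to call exec_analysis()/exec_prediction()
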
Example No. 2
 def __init__(self,
              e_c,
              deep_impact=5,
              metric='accuracy',
              dataframe_name='',
              hash_dataframe=''):
     self._ec = e_c
     self._labels = self._ec.labels.get_config()['messages']['adviser']
     self._config = self._ec.config.get_config()['optimizer']
     self._frameworks = self._ec.config.get_config()['frameworks']
     self._logging = LogsHandler(self._ec)
     self.timestamp = time()
     self.an_objective = None
     self.deep_impact = deep_impact
     self.analysis_recommendation_order = list()
     self.analyzed_models = list()
     self.excluded_models = list()
     self.next_analysis_list = list()
     self.metric = metric
     self.dataframe_name = dataframe_name
     self.hash_dataframe = hash_dataframe
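
The parameter list matches the AdviserAStar calls issued by the Controller in Example No. 3, so a hedged instantiation sketch (the metric comes from the Controller defaults; the dataframe name is illustrative) would be:

adviser = AdviserAStar(e_c=e_c,
                       deep_impact=5,
                       metric='test_accuracy',
                       dataframe_name='churn.csv',
                       hash_dataframe='')
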
Example No. 3
class Controller(object):

    ## Constructor
    def __init__(self, e_c=None, user_id='PoC_gDayF', workflow_id='default'):
        self.timestamp = str(time())
        if e_c is None:
            if workflow_id == 'default':
                self._ec = E_C(user_id=user_id,
                               workflow_id=workflow_id + '_' + self.timestamp)
            else:
                self._ec = E_C(user_id=user_id, workflow_id=workflow_id)
        else:
            self._ec = e_c
        self._config = self._ec.config.get_config()
        self._labels = self._ec.labels.get_config()['messages']['controller']
        self._frameworks = self._ec.config.get_config()['frameworks']
        self._logging = LogsHandler(self._ec)
        self.analysis_list = OrderedDict()  # For future multi-analysis uses
        self.model_handler = OrderedDict()
        self.adviser = importlib.import_module(
            self._config['optimizer']['adviser_classpath'])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["loading_adviser"],
                               self._config['optimizer']['adviser_classpath'])

    ## Method leading configuration coherence checks
    # @param self object pointer
    # @return True if OK / False if wrong
    def config_checks(self):
        storage_conf = self._config['storage']
        grants = storage_conf['grants']
        localfs = (storage_conf['localfs'] is not None) \
                  and self._coherence_fs_checks(storage_conf['localfs'], grants=grants)
        hdfs = (storage_conf['hdfs'] is not None) \
                  and self._coherence_fs_checks(storage_conf['hdfs'], grants=grants)
        mongoDB = (storage_conf['mongoDB'] is not None) \
                  and self._coherence_db_checks(storage_conf['mongoDB'])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["primary_path"],
                               str(storage_conf['primary_path']))
        ''' Checking primary Json storage Paths'''
        primary = False
        #if storage_conf['primary_path'] in ['localfs', 'hdfs']:
        for storage in StorageMetadata(self._ec).get_load_path():
            if storage_conf['primary_path'] == storage[
                    'type'] and storage['type'] != 'mongoDB':
                primary = True
            if storage['type'] == 'mongoDB':
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_file_storage"],
                                           str(storage))
                return False
            elif storage['type'] == 'localfs':
                if not localfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_load"],
                                               str(storage))
                    return False
            elif storage['type'] == 'hdfs':
                if not hdfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_load"],
                                               str(storage))
                    return False

        if not primary:
            self._logging.log_critical(
                'gDayF', "Controller", self._labels["no_primary"],
                str(storage_conf[storage_conf['primary_path']]))
            return False
        ''' Checking Load storage Paths'''
        at_least_on = False
        for storage in StorageMetadata(self._ec).get_json_path():
            if storage['type'] == 'mongoDB':
                if not mongoDB:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_json"],
                                               str(storage))
                    return False
                else:
                    at_least_on = at_least_on or True
            elif storage['type'] == 'localfs':
                if not localfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_json"],
                                               str(storage))
                    return False
                else:
                    at_least_on = at_least_on or True
            elif storage['type'] == 'hdfs':
                if not hdfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_json"],
                                               str(storage))
                    return False
                else:
                    at_least_on = at_least_on or True

        if not at_least_on:
            self._logging.log_critical(
                'gDayF', "Controller", self._labels["no_primary"],
                str(storage_conf[storage_conf['primary_path']]))
            return False
        ''' Checking log storage Paths'''
        at_least_on = False
        for storage in StorageMetadata(self._ec).get_log_path():
            if storage['type'] == 'mongoDB':
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_file_storage"],
                                           str(storage))
                return False
            elif storage['type'] == 'localfs':
                if not localfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_log"],
                                               str(storage))
                    return False
                else:
                    at_least_on = at_least_on or True
            elif storage['type'] == 'hdfs':
                if not hdfs:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_log"],
                                               str(storage))
                    return False
                else:
                    at_least_on = at_least_on or True
        if not at_least_on:
            self._logging.log_critical(
                'gDayF', "Controller", self._labels["no_primary"],
                str(storage_conf[storage_conf['primary_path']]))
            return False
        ''' If all things OK'''
        return True
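
    # Usage sketch (not part of the original source; the error message is illustrative):
    # config_checks() is meant to run right after construction, before any
    # exec_analysis() or exec_prediction() call:
    #   controller = Controller(user_id='PoC_gDayF')
    #   if not controller.config_checks():
    #       raise RuntimeError('incoherent storage configuration')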

    ## Method leading configuration coherence checks on fs engines
    # @param self object pointer
    # @param storage StorageMetadata
    # @param grants Octal grants format
    # @return True if OK / False if wrong
    def _coherence_fs_checks(self, storage, grants):
        persistence = PersistenceHandler(self._ec)
        try:
            if persistence.mkdir(type=storage['type'],
                                 path=str(storage['value']),
                                 grants=grants):
                return False
        except OSError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_json_path"],
                                       str(storage['value']))
            return False
        if storage['hash_type'] not in ['MD5', 'SHA256']:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_hash_method"],
                                       str(storage))
            return False
        return True

    ## Method leading configuration coherence checks on db engines
    # @param self object pointer
    # @param storage StorageMetadata
    # @return True if OK / False if wrong
    def _coherence_db_checks(self, storage):
        if storage['type'] == 'mongoDB':
            try:
                client = MongoClient(host=storage['url'],
                                     port=int(storage['port']),
                                     document_class=OrderedDict)
            except ConnectionFailure as cexecution_error:
                print(repr(cexecution_error))
                return False
            try:
                db = client[storage['value']]
                collection = db[self._ec.get_id_user()]
                test_insert = collection.insert_one({
                    'test':
                    'connection.check.dot.bson'
                }).inserted_id
                collection.delete_one({"_id": test_insert})
            except PyMongoError as wexecution_error:
                print(repr(wexecution_error))
                return False
            finally:
                client.close()
        return True

    ## Method leading and controlling prediction executions on all frameworks
    # @param self object pointer
    # @param datapath String Path indicating file to be analyzed or Dataframe
    # @param armetadata
    # @param model_file String Path indicating model_file ArMetadata.json structure
    def exec_prediction(self, datapath, armetadata=None, model_file=None):

        self._logging.log_info('gDayF', "Controller", self._labels["ana_mode"],
                               'prediction')
        if armetadata is None and model_file is None:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"], datapath)
            return self._labels["failed_model"]
        elif armetadata is not None:
            try:
                assert isinstance(armetadata, ArMetadata)
                base_ar = deep_ordered_copy(armetadata)
            except AssertionError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           armetadata)
                return self._labels["failed_model"]
        elif model_file is not None:
            try:
                #json_file = open(model_file)
                persistence = PersistenceHandler(self._ec)
                invalid, base_ar = persistence.get_ar_from_engine(model_file)
                del persistence

                if invalid:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_model"],
                                               model_file)
                    return self._labels["failed_model"]
            except IOError as iexecution_error:
                print(repr(iexecution_error))
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           model_file)
                return self._labels["failed_model"]
            except OSError as oexecution_error:
                print(repr(oexecution_error))
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           model_file)
                return self._labels["failed_model"]

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
            except (IOError, OSError, JSONDecodeError):
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
        elif isinstance(datapath, DataFrame):
            pd_dataset = datapath
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input']

        fw = get_model_fw(base_ar)

        self.init_handler(fw)

        prediction_frame = None
        try:
            prediction_frame, _ = self.model_handler[fw]['handler'].predict(
                predict_frame=pd_dataset, base_ar=base_ar)
        except TypeError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"],
                                       model_file)

        self.clean_handler(fw)

        self._logging.log_info('gDayF', 'controller', self._labels["pred_end"])

        return prediction_frame
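
    # Usage sketch (file names are illustrative, not from the original source):
    # a prediction can be driven either by an in-memory ArMetadata structure or by a
    # stored model descriptor:
    #   frame = controller.exec_prediction(datapath='new_data.csv', armetadata=best_model)
    #   frame = controller.exec_prediction(datapath='new_data.csv',
    #                                      model_file='models/ar_model.json')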

    ## Method focused on cleaning handler objects
    # @param fw framework
    def clean_handler(self, fw):
        if self.model_handler[fw]['handler'] is not None:
            self.model_handler[fw]['handler'].delete_frames()
            self.model_handler[fw]['handler'] = None

    ## Method oriented to init handler objects
    # @param fw framework
    def init_handler(self, fw):
        try:
            if self.model_handler[fw]['handler'] is None:
                handler = importlib.import_module(
                    self._frameworks[fw]['conf']['handler_module'])
                self.model_handler[fw]['handler'] = \
                    eval('handler.' + self._frameworks[fw]['conf']['handler_class'] + '(e_c=self._ec)')
        except KeyError:
            self.model_handler[fw] = OrderedDict()
            handler = importlib.import_module(
                self._frameworks[fw]['conf']['handler_module'])
            self.model_handler[fw]['handler'] = \
                eval('handler.' + self._frameworks[fw]['conf']['handler_class'] + '(e_c=self._ec)')
            self.model_handler[fw]['initiated'] = False
        if not self.model_handler[fw]['handler'].is_alive():
            initiated = self.model_handler[fw]['handler'].connect()
            self.model_handler[fw]['initiated'] = (
                self.model_handler[fw]['initiated'] or initiated)

    ## Method oriented to shutdown localClusters
    def clean_handlers(self):
        for fw, each_handlers in self.model_handler.items():
            if each_handlers['handler'] is not None:
                #self.model_handler[fw][each_handlers['handler']].clean_handler(fw)
                self.clean_handler(fw)
                self._logging.log_exec('gDayF', "Controller",
                                       self._labels["cleaning"], fw)
                if each_handlers['initiated']:
                    handler = importlib.import_module(
                        self._frameworks[fw]['conf']['handler_module'])
                    self.model_handler[fw]['handler'] = \
                        eval('handler.' + self._frameworks[fw]['conf']['handler_class']
                             + '(e_c=self._ec).shutdown_cluster()')
                    self._logging.log_exec('gDayF', "Controller",
                                           self._labels["shuttingdown"], fw)

    ## Method leading and controlling analysis executions on all frameworks
    # @param self object pointer
    # @param datapath String Path indicating file to be analyzed or DataFrame
    # @param objective_column string indicating objective column
    # @param amode Analysis mode of execution [0,1,2,3,4,5,6]
    # @param metric to evaluate models ['train_accuracy', 'train_rmse', 'test_accuracy', 'combined_accuracy', 'test_rmse', 'cdistance']
    # @param deep_impact  deep analysis
    # @return status, adviser.analysis_recommendation_order
    def exec_analysis(self,
                      datapath,
                      objective_column,
                      amode=POC,
                      metric='test_accuracy',
                      deep_impact=3,
                      **kwargs):
        # Clustering variables
        k = None
        estimate_k = False

        #Force analysis variable
        atype = None

        hash_dataframe = ''

        for pname, pvalue in kwargs.items():
            if pname == 'k':
                assert isinstance(pvalue, int)
                k = pvalue
            elif pname == 'estimate_k':
                assert isinstance(pvalue, bool)
                estimate_k = pvalue
            elif pname == 'atype':
                assert pvalue in atypes
                atype = pvalue

        supervised = True
        if objective_column is None:
            supervised = False

        self._logging.log_info('gDayF', "Controller", self._labels["start"])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["ana_param"], metric)
        self._logging.log_info('gDayF', "Controller",
                               self._labels["dep_param"], deep_impact)
        self._logging.log_info('gDayF', "Controller", self._labels["ana_mode"],
                               amode)

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
                id_datapath = Path(datapath).name
                hash_dataframe = hash_key('MD5', datapath)
            except IOError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
            except OSError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
            except JSONDecodeError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
        elif isinstance(datapath, DataFrame):
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
            pd_dataset = datapath
            id_datapath = 'Dataframe' + \
                          '_' + str(pd_dataset.size) + \
                          '_' + str(pd_dataset.shape[0]) + \
                          '_' + str(pd_dataset.shape[1])
            #hash_dataframe = md5(datapath.to_msgpack()).hexdigest()
            hash_dataframe = md5(
                datapath.to_json().encode('utf-8')).hexdigest()
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input'], None

        pd_test_dataset = None
        ''' Changed 05/04/2018
        if metric == 'combined_accuracy' or 'test_accuracy':'''
        if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
                and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
            pd_dataset, pd_test_dataset = pandas_split_data(
                pd_dataset,
                train_perc=self._config['common']['test_frame_ratio'])

        df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')

        self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                                 '_' + str(time()))
        adviser = self.adviser.AdviserAStar(e_c=self._ec,
                                            metric=metric,
                                            deep_impact=deep_impact,
                                            dataframe_name=id_datapath,
                                            hash_dataframe=hash_dataframe)

        adviser.set_recommendations(dataframe_metadata=df,
                                    objective_column=objective_column,
                                    amode=amode,
                                    atype=atype)

        while adviser.next_analysis_list is not None:
            for each_model in adviser.next_analysis_list:
                fw = get_model_fw(each_model)

                if k is not None:
                    try:
                        each_model["model_parameters"][fw]["parameters"]["k"][
                            "value"] = k
                        each_model["model_parameters"][fw]["parameters"]["k"][
                            "seleccionable"] = True
                        each_model["model_parameters"][fw]["parameters"][
                            "estimate_k"]["value"] = estimate_k
                        each_model["model_parameters"][fw]["parameters"][
                            "estimate_k"]["seleccionable"] = True
                    except KeyError:
                        pass

                self.init_handler(fw)
                if pd_test_dataset is not None:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_test_dataset,
                                                  filtering='STANDARDIZE')
                else:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_dataset,
                                                  filtering='STANDARDIZE')

                if analyzed_model is not None:
                    adviser.analysis_recommendation_order.append(
                        analyzed_model)
            adviser.next_analysis_list.clear()
            adviser.analysis_recommendation_order = adviser.priorize_models(
                model_list=adviser.analysis_recommendation_order)
            adviser.set_recommendations(dataframe_metadata=df,
                                        objective_column=objective_column,
                                        amode=amode)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["ana_models"],
                               str(len(adviser.analyzed_models)))
        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["exc_models"],
                               str(len(adviser.excluded_models)))

        self._logging.log_exec(self._ec.get_id_analysis(), 'controller',
                               self._labels["end"])

        self.clean_handlers()

        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)

        return self._labels[
            'success_op'], adviser.analysis_recommendation_order
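
    # Usage sketch (file and column names are illustrative): a supervised run returns the
    # status label plus the prioritized recommendation list; clustering runs can pass the
    # k / estimate_k kwargs instead of an objective column:
    #   status, recommendations = controller.exec_analysis(datapath='churn.csv',
    #                                                      objective_column='churn',
    #                                                      metric='test_accuracy',
    #                                                      deep_impact=3)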

    ## Method oriented to log leaderboard against selected metrics
    # @param ar_list List of AR models Execution Data
    # @param metric used to order models ['train_accuracy', 'train_rmse', 'test_accuracy', 'combined_accuracy', 'test_rmse', 'cdistance']
    def log_model_list(self, ar_list, metric):
        best_check = True
        ordered_list = self.priorize_list(arlist=ar_list, metric=metric)
        for model in ordered_list:
            if best_check:
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["best_model"], model['model_parameters']
                    [get_model_fw(model)]['parameters']['model_id']['value'])
                best_check = False
            else:
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["res_model"], model['model_parameters']
                    [get_model_fw(model)]['parameters']['model_id']['value'])

            self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                                   self._labels["round_reach"], model['round'])
            if model["normalizations_set"] is None:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller', self._labels["norm_app"],
                                       [])
            else:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller', self._labels["norm_app"],
                                       model["normalizations_set"])

            if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller',
                                       self._labels["ametric_order"],
                                       model['metrics']['accuracy'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["pmetric_order"],
                    model['metrics']['execution']['train']['RMSE'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["pmetric_order"],
                    model['metrics']['execution']['test']['RMSE'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["rmetric_order"],
                    model['metrics']['execution']['train']['r2'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["rmetric_order"],
                    model['metrics']['execution']['test']['r2'])
            if metric in CLUSTERING_METRICS:
                try:
                    self._logging.log_info(
                        self._ec.get_id_analysis(), 'controller',
                        self._labels["ckmetric_order"],
                        model['metrics']['execution']['train']['k'])
                except KeyError:
                    self._logging.log_info(self._ec.get_id_analysis(),
                                           'controller',
                                           self._labels["ckmetric_order"], "0")
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["ctmetric_order"],
                    model['metrics']['execution']['train']['tot_withinss'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["cbmetric_order"],
                    model['metrics']['execution']['train']['betweenss'])

    ## Method oriented to log leaderboard against selected metrics on dataframe
    # @param analysis_id
    # @param ar_list List of AR models Execution Data
    # @param metric used to order models ['train_accuracy', 'train_rmse', 'test_accuracy', 'combined_accuracy', 'test_rmse', 'cdistance']
    # @return Dataframe performance model list

    def table_model_list(self, ar_list, metric):
        dataframe = list()
        normal_cols = [
            'Model', 'Train_accuracy', 'Test_accuracy', 'Combined_accuracy',
            'train_rmse', 'test_rmse'
        ]
        cluster_cols = ['Model', 'k', 'tot_withinss', 'betweenss']

        ordered_list = self.priorize_list(arlist=ar_list, metric=metric)
        for model in ordered_list:
            if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
                try:
                    dataframe.append({
                        'Model':
                        model['model_parameters'][get_model_fw(model)]
                        ['parameters']['model_id']['value'],
                        'Round':
                        model['round'],
                        'train_accuracy':
                        model['metrics']['accuracy']['train'],
                        'test_accuracy':
                        model['metrics']['accuracy']['test'],
                        'combined_accuracy':
                        model['metrics']['accuracy']['combined'],
                        'train_rmse':
                        model['metrics']['execution']['train']['RMSE'],
                        'test_rmse':
                        model['metrics']['execution']['test']['RMSE'],
                        'train_r2':
                        model['metrics']['execution']['train']['r2'],
                        'test_r2':
                        model['metrics']['execution']['test']['r2'],
                        'path':
                        model['json_path'][0]['value']
                    })
                # AutoEncoders metrics
                except KeyError:
                    dataframe.append({
                        'Model':
                        model['model_parameters'][get_model_fw(model)]
                        ['parameters']['model_id']['value'],
                        'Round':
                        model['round'],
                        'train_accuracy':
                        model['metrics']['accuracy']['train'],
                        'test_accuracy':
                        model['metrics']['accuracy']['test'],
                        'combined_accuracy':
                        model['metrics']['accuracy']['combined'],
                        'train_rmse':
                        model['metrics']['execution']['train']['RMSE'],
                        'path':
                        model['json_path'][0]['value']
                    })

            if metric in CLUSTERING_METRICS:
                try:
                    aux = model['metrics']['execution']['train']['k']
                except KeyError:
                    aux = 0

                dataframe.append({
                    'Model':
                    model['model_parameters'][get_model_fw(model)]
                    ['parameters']['model_id']['value'],
                    'Round':
                    model['round'],
                    'k':
                    aux,
                    'tot_withinss':
                    model['metrics']['execution']['train']['tot_withinss'],
                    'betweenss':
                    model['metrics']['execution']['train']['betweenss'],
                    'path':
                    model['json_path'][0]['value']
                })
        return DataFrame(dataframe)

    ## Method leading and controlling analysis executions on a specific analysis
    # @param self object pointer
    # @param datapath String Path indicating file to be analyzed or DataFrame
    # @param list_ar_metadata list of models to execute
    # @param metric to evaluate models
    # @param deep_impact  deep analysis
    # @return status, adviser.analysis_recommendation_order
    def exec_sanalysis(self,
                       datapath,
                       list_ar_metadata,
                       metric='combined_accuracy',
                       deep_impact=1,
                       **kwargs):

        self._logging.log_info('gDayF', "Controller", self._labels["start"])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["ana_param"], metric)
        self._logging.log_info('gDayF', "Controller",
                               self._labels["dep_param"], deep_impact)

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
                id_datapath = Path(datapath).name
                hash_dataframe = hash_key('MD5', datapath)
            except IOError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
            except OSError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
            except JSONDecodeError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
        elif isinstance(datapath, DataFrame):
            hash_dataframe = None
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["input_param"],
                                       str(datapath.shape))
            pd_dataset = datapath
            id_datapath = 'Dataframe' + \
                          '_' + str(pd_dataset.size) + \
                          '_' + str(pd_dataset.shape[0]) + \
                          '_' + str(pd_dataset.shape[1])
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input'], None

        pd_test_dataset = None
        if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
                and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
            pd_dataset, pd_test_dataset = pandas_split_data(
                pd_dataset,
                train_perc=self._config['common']['test_frame_ratio'])

        df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')
        self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                                 '_' + str(time()))
        adviser = self.adviser.AdviserAStar(e_c=self._ec,
                                            metric=metric,
                                            deep_impact=deep_impact,
                                            dataframe_name=id_datapath,
                                            hash_dataframe=hash_dataframe)

        adviser.analysis_specific(dataframe_metadata=df,
                                  list_ar_metadata=list_ar_metadata)

        while adviser.next_analysis_list is not None:

            for each_model in adviser.next_analysis_list:
                fw = get_model_fw(each_model)

                self.init_handler(fw)

                if pd_test_dataset is not None:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_test_dataset,
                                                  filtering='NONE')
                else:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  filtering='NONE')
                if analyzed_model is not None:
                    adviser.analysis_recommendation_order.append(
                        analyzed_model)

            adviser.next_analysis_list.clear()
            adviser.analysis_recommendation_order = adviser.priorize_models(
                model_list=adviser.analysis_recommendation_order)
            adviser.analysis_specific(
                dataframe_metadata=df,
                list_ar_metadata=adviser.analysis_recommendation_order)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["ana_models"],
                               str(len(adviser.analyzed_models)))
        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["exc_models"],
                               str(len(adviser.excluded_models)))

        self.log_model_list(adviser.analysis_recommendation_order, metric)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["end"])

        self.clean_handlers()

        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)

        return self._labels[
            'success_op'], adviser.analysis_recommendation_order

    ## Method leading and controlling conversion to Java model
    # @param self object pointer
    # @param armetadata Armetada object
    # @param type base type if possible
    # @return download_path, hash MD5 key
    def get_external_model(self, armetadata, type='pojo'):
        fw = get_model_fw(armetadata)
        self.init_handler(fw)
        results = self.model_handler[fw]['handler'].get_external_model(
            armetadata, type)
        self.clean_handler(fw)
        return results

    ## Method leading and controlling model savings
    # @param self object pointer
    # @param mode [BEST, BEST_3, EACH_BEST, ALL]
    # @param  arlist List of armetadata
    # @param  metric ['accuracy', 'combined', 'test_accuracy', 'rmse']
    def save_models(self, arlist, mode=BEST, metric='accuracy'):
        if mode == BEST:
            model_list = [arlist[0]]
        elif mode == BEST_3:
            model_list = arlist[0:3]
        elif mode == EACH_BEST:
            exclusion = list()
            model_list = list()
            for model in arlist:
                if (get_model_fw(model), model['model_parameters'][
                        get_model_fw(model)]['model'],
                        model['normalizations_set']) not in exclusion:
                    model_list.append(model)
                    exclusion.append(
                        (get_model_fw(model), model['model_parameters'][
                            get_model_fw(model)]['model'],
                         model['normalizations_set']))
        elif mode == ALL:
            model_list = arlist
        elif mode == NONE:
            model_list = list()
        for fw in self._config['frameworks'].keys():
            self.init_handler(fw)
            for each_model in model_list:
                if fw in each_model['model_parameters'].keys():
                    self.model_handler[fw]['handler'].store_model(
                        each_model, user=self._ec.get_id_user())
            self.clean_handler(fw)
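
    # Usage sketch (assuming recommendations comes from a previous exec_analysis call):
    # keep only the three best-ranked models on storage:
    #   controller.save_models(recommendations, mode=BEST_3, metric='test_accuracy')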

    ## Method leading and controlling model loads
    # @param self object pointer
    # @param  arlist List of armetadata
    # @return  list of ar_descriptors of models correctly loaded
    def load_models(self, arlist):
        model_loaded = list()
        for fw in self._config['frameworks'].keys():
            self.init_handler(fw)
            for each_model in arlist:
                if fw in each_model['model_parameters'].keys():
                    model_load = self.model_handler[fw]['handler'].load_model(
                        each_model)
                    if model_load is not None:
                        model_loaded.append(model_load)
            self.clean_handler(fw)
        return model_loaded

    ## Method leading and controlling model removing from server
    # @param self object pointer
    # @param mode to be kept in memory [BEST, BEST_3, EACH_BEST, ALL, NONE]
    # @param  arlist List of armetadata
    def remove_models(self, arlist, mode=ALL):
        if mode == BEST:
            model_list = arlist[1:]
        elif mode == BEST_3:
            model_list = arlist[3:]
        elif mode == EACH_BEST:
            exclusion = list()
            model_list = list()
            for model in arlist:
                if (get_model_fw(model), model['model_parameters'][
                        get_model_fw(model)]['model'],
                        model['normalizations_set']) not in exclusion:
                    exclusion.append(
                        (get_model_fw(model), model['model_parameters'][
                            get_model_fw(model)]['model'],
                         model['normalizations_set']))
                else:
                    model_list.append(model)
        elif mode == ALL:
            model_list = arlist
        elif mode == NONE:
            model_list = list()
        fw_list = list()
        for models in model_list:
            if get_model_fw(models) not in fw_list:
                fw_list.append(get_model_fw(models))

        for fw in fw_list:
            self.init_handler(fw)
            self.model_handler[fw]['handler'].remove_models(model_list)
            self.clean_handler(fw)

    ## Method oriented to generate execution tree for visualizations and analysis issues
    # @param arlist Priorized ArMetadata list
    # @param  metric ['accuracy', 'combined', 'test_accuracy', 'rmse']
    # @param  store True/False
    # @param experiment analysis_id for mongoDB recovery
    # @param user user_id for mongoDB recovery
    # @return OrderedDict() with execution tree data Analysis
    def reconstruct_execution_tree(self,
                                   arlist=None,
                                   metric='combined',
                                   store=True):
        if (arlist is None
                or len(arlist) == 0) and self._ec.get_id_analysis() is None:
            self._logging.log_critical('gDayF', 'controller',
                                       self._labels["failed_model"])
            return None
        elif self._ec.get_id_analysis(
        ) is not None and self._ec.get_id_user() != 'guest':
            new_arlist = PersistenceHandler(
                self._ec).recover_experiment_mongoDB()
        else:
            analysis_id = arlist[0]['model_id']
            new_arlist = arlist

        ordered_list = self.priorize_list(arlist=new_arlist, metric=metric)

        root = OrderedDict()
        root['data'] = None
        root['ranking'] = 0
        root['successors'] = OrderedDict()
        variable_dict = OrderedDict()
        variable_dict[0] = {'root': root}

        ranking = 1
        for new_tree_structure in ordered_list:
            new_model = deep_ordered_copy(new_tree_structure)
            model_id = new_tree_structure['model_parameters'][get_model_fw(new_tree_structure)]\
                                         ['parameters']['model_id']['value']
            level = new_tree_structure['round']
            if level not in variable_dict.keys():
                variable_dict[level] = OrderedDict()

            new_tree_structure = OrderedDict()
            new_tree_structure['ranking'] = ranking
            new_tree_structure['data'] = new_model
            new_tree_structure['successors'] = OrderedDict()
            variable_dict[level][model_id] = new_tree_structure

            ranking += 1

        level = 1
        max_level = max(variable_dict.keys())
        while level in range(1, max_level + 1):
            for model_id, new_tree_structure in variable_dict[level].items():
                counter = 1
                found = False
                while not found or (level - counter) == 0:
                    if new_tree_structure['data'][
                            'predecessor'] in variable_dict[level -
                                                            counter].keys():
                        container = variable_dict[level - counter][
                            new_tree_structure['data']['predecessor']]
                        container['successors'][model_id] = new_tree_structure
                        found = True
                    counter += 1
                if not found:
                    self._logging.log_debug(self._ec.get_id_analysis(),
                                            'controller',
                                            self._labels['fail_reconstruct'],
                                            model_id)
            level += 1

        #Store_json on primary path
        if store and self._config['storage']['primary_path'] != 'mongoDB':
            primary_path = self._config['storage']['primary_path']
            fstype = self._config['storage'][primary_path]['type']

            datafile = list()
            datafile.append(self._config['storage'][primary_path]['value'])
            datafile.append('/')
            datafile.append(self._ec.get_id_user())
            datafile.append('/')
            datafile.append(self._ec.get_id_workflow())
            datafile.append('/')
            datafile.append(self._config['common']['execution_tree_dir'])
            datafile.append('/')
            datafile.append(self._ec.get_id_analysis())
            datafile.append('.json')

            if self._config['persistence']['compress_json']:
                datafile.append('.gz')

            storage = StorageMetadata(self._ec)
            storage.append(value=''.join(datafile), fstype=fstype)
            PersistenceHandler(self._ec).store_json(storage, root)
        return root

    ## Method oriented to prioritize an ArMetadata list
    # @param self object pointer
    # @param analysis_id
    # @param arlist Priorized ArMetadata list
    # @param  metric ['accuracy', 'combined', 'test_accuracy', 'rmse']
    # @return OrderedDict() with execution tree data Analysis
    def priorize_list(self, arlist, metric):
        adviser = self.adviser.AdviserAStar(e_c=self._ec, metric=metric)
        ordered_list = adviser.priorize_models(arlist)
        del adviser
        return ordered_list

    ## Method base to get an ArMetadata Structure from file
    # @param self object pointer
    # @param path FilePath
    # @return operation status (0 success /1 error, ArMetadata/None)
    def get_ar_from_engine(self, path):
        persistence = PersistenceHandler(self._ec)
        failed, armetadata = persistence.get_ar_from_engine(path=path)
        del persistence
        return failed, armetadata
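
A hedged end-to-end sketch of the Controller API listed above (file names and the objective column are illustrative; POC is the default analysis mode and EACH_BEST one of the documented save/remove modes):

controller = Controller(user_id='PoC_gDayF')
if controller.config_checks():
    status, recommendations = controller.exec_analysis(datapath='churn.csv',
                                                       objective_column='churn',
                                                       metric='test_accuracy',
                                                       deep_impact=3)
    controller.save_models(recommendations, mode=EACH_BEST)
    predictions = controller.exec_prediction(datapath='churn.csv',
                                             armetadata=recommendations[0])
    controller.clean_handlers()
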
Example No. 4
 def __init__(self, user_id='PoC_gDayF'):
     self._ec = E_C(user_id=user_id)
     self._config = self._ec.config.get_config()
     self._labels = self._ec.labels.get_config()['messages']['workflow']
     self._logging = LogsHandler(self._ec)
     self.timestamp = str(time())
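
A short usage sketch (the file paths are illustrative): a Workflow instance drives a complete train/predict pipeline from a workflow definition, using the workflow() method shown in Example No. 5.

wf = Workflow(user_id='PoC_gDayF')
wf.workflow(datapath='churn.csv', workflow='workflows/train_churn.json')
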
Example No. 5
class Workflow(object):
    ## Constructor
    def __init__(self, user_id='PoC_gDayF'):
        self._ec = E_C(user_id=user_id)
        self._config = self._ec.config.get_config()
        self._labels = self._ec.labels.get_config()['messages']['workflow']
        self._logging = LogsHandler(self._ec)
        self.timestamp = str(time())

    ## Method leading workflow executions
    # @param self object pointer
    # @param e_c experiment_configuration
    # @param datapath String Path indicating file to be analyzed or Dataframe
    # @param workflow String Path indicating train workflow definition path
    # @param remove_models [BEST, BEST_3, EACH_BEST, ALL]
    # @param prefix value

    def workflow(self,
                 datapath,
                 workflow,
                 prefix=None,
                 remove_models=EACH_BEST):

        if isinstance(workflow, str):
            with open(workflow, 'r') as file:
                wf = load(file, object_hook=OrderedDict)
            if self._ec.get_id_workflow() == 'default':
                self._ec.set_id_workflow(
                    Path(workflow).stem + '_' + self.timestamp)
        else:
            wf = workflow
            if self._ec.get_id_workflow() == 'default':
                self._ec.set_id_workflow(self._ec.get_id_workflow() + '_' +
                                         self.timestamp)

        for wkey, wvalue in wf.items():
            if prefix is None:
                #_prefix = xstr(wvalue['parameters']['objective_column'])
                _prefix = xstr(
                    wvalue['parameters']['objective_column']) + '_' + wkey
            else:
                #_prefix = prefix + '_' + xstr(wvalue['parameters']['objective_column'])
                _prefix = prefix + '_' + xstr(
                    wvalue['parameters']['objective_column']) + '_' + wkey
            if wvalue['parameters']['mode'] == "train":
                self.train_workflow(datapath=datapath,
                                    wkey=wkey,
                                    workflow=wvalue,
                                    prefix=_prefix,
                                    remove_models=remove_models)
            elif wvalue['parameters']['mode'] == "predict":
                self.predict_workflow(datapath=datapath,
                                      wkey=wkey,
                                      workflow=wvalue,
                                      prefix=_prefix,
                                      remove_models=remove_models)
            else:
                self._logging.log_info('gDayF', "Workflow",
                                       self._labels["nothing_to_do"])

    ## Method leading train workflow executions
    # @param self object pointer
    # @param datapath String Path indicating file to be analyzed or Dataframe
    # @param wkey Step name
    # @param workflow String Path indicating train workflow definition path
    # @param prefix value
    # @param remove_models [BEST, BEST_3, EACH_BEST, ALL]

    def train_workflow(self,
                       datapath,
                       wkey,
                       workflow,
                       prefix='main',
                       remove_models=EACH_BEST):
        set_option('display.max_rows', 500)
        set_option('display.max_columns', 500)
        set_option('display.width', 1000)

        wf = workflow
        pfix = prefix

        error, dataset = self.check_path(datapath)
        if dataset is None:
            return error

        controller = Controller(e_c=self._ec)
        if controller.config_checks():
            variables = dataset.columns.tolist()

            #for wkey, wvalue in wf.items():
            if wf["data"]["filtered_columns"] is not None:
                for delete in wf["data"]["filtered_columns"]:
                    try:
                        variables.remove(delete)
                    except Exception:
                        self._logging.log_info('gDayF', "Workflow",
                                               self._labels["failed_var"],
                                               delete)
            self._logging.log_info('gDayF', "Workflow",
                                   self._labels["variables_desc"], variables)
            if wf["data"]["for_each"] is not None:
                fe_column = wf["data"]["for_each"]
                fe_data_exclusions = wf["data"]["for_each_exclusions"]
                fe_filtered_data = wf["data"]["filtered_data"]
                fe_parameters = wf["parameters"]
                fe_next = wf["Next"]

                for each in dataset[fe_column].unique():
                    if fe_data_exclusions is None or each not in fe_data_exclusions:
                        aux_dataset = dataset[dataset[fe_column] == each]
                        pfix = xstr(prefix + '_' + str(each))

                        if fe_filtered_data is not None:
                            qcolumn = fe_filtered_data["column"]
                            quantile = aux_dataset[qcolumn].quantile(
                                q=fe_filtered_data["quantile"])
                            aux_dataset = aux_dataset.loc[
                                aux_dataset[qcolumn] <= quantile]
                            pfix = xstr(pfix + '_' +
                                        str(fe_filtered_data["quantile"]))

                        if fe_parameters is not None:
                            source_parameters = list()
                            source_parameters.append(
                                'controller.exec_analysis(')
                            source_parameters.append(
                                'datapath=aux_dataset.loc[:, variables]')
                            for ikey, ivalue in fe_parameters.items():
                                source_parameters.append(',')
                                source_parameters.append(ikey)
                                source_parameters.append('=')
                                if isinstance(ivalue, str) and ikey != "amode":
                                    source_parameters.append('\'')
                                    source_parameters.append(ivalue)
                                    source_parameters.append('\'')
                                else:
                                    source_parameters.append(str(ivalue))
                            source_parameters.append(')')

                            self._logging.log_info(
                                'gDayF', "Workflow",
                                self._labels["desc_operation"],
                                ''.join(source_parameters))
                            status, recomendations = eval(
                                ''.join(source_parameters))
                            controller.remove_models(recomendations,
                                                     mode=remove_models)
                            controller.reconstruct_execution_tree(
                                recomendations,
                                metric=fe_parameters['metric'],
                                store=True)

                            #model_id = recomendations[0]['model_id']
                            table_model_list = controller.table_model_list(
                                ar_list=recomendations,
                                metric=fe_parameters['metric'])
                            self._logging.log_info(
                                'gDayF', 'workflow',
                                self._labels["results"] + '\n',
                                table_model_list.to_string(justify="left"))

                            #filename = self.storage_path('train', wkey + '_' + str(pfix) + '_' + 'train_performance'
                            if self._config['common'][
                                    'workflow_summary_enabled']:
                                filename = self.storage_path(
                                    'train',
                                    str(pfix) + '_' + 'train_performance',
                                    'xls')
                                table_model_list.to_excel(
                                    filename,
                                    index=False,
                                    sheet_name='performance')
                                self.replicate_file('train', filename=filename)

                            prediction_frame = controller.exec_prediction(
                                datapath=aux_dataset,
                                model_file=recomendations[0]['json_path'][0]
                                ['value'])
                            try:
                                if 'predict' in prediction_frame.columns.values:
                                    prediction_frame.rename(
                                        columns={"predict": wkey},
                                        inplace=True)
                                elif 'prediction' in prediction_frame.columns.values:
                                    prediction_frame.rename(
                                        columns={"prediction": wkey},
                                        inplace=True)

                                self._logging.log_info(
                                    'gDayF', 'workflow',
                                    self._labels["results"] + '\n',
                                    prediction_frame.to_string(
                                        index_names=False, justify="left"))
                                '''filename = self.storage_path('train', wkey + '_'
                                                             + str(pfix) + '_' + 'prediction', 'xls')'''
                                if self._config['common'][
                                        'workflow_summary_enabled']:
                                    filename = self.storage_path(
                                        'train',
                                        str(pfix) + '_' + 'prediction', 'xls')
                                    prediction_frame.to_excel(
                                        filename,
                                        index=False,
                                        sheet_name='train_prediction')
                                    self.replicate_file('train',
                                                        filename=filename)

                            except AttributeError as oexecution_error:
                                self._logging.log_info(
                                    'gDayF', "Workflow",
                                    self._labels["failed_model"],
                                    str(repr(oexecution_error)))

                            try:
                                if fe_next is not None and prediction_frame is not None:
                                    self.workflow(prediction_frame,
                                                  fe_next,
                                                  pfix,
                                                  remove_models=remove_models)
                            except Exception as oexecution_error:
                                self._logging.log_critical(
                                    'gDayF', "Workflow",
                                    self._labels["failed_wf"], str(fe_next))
                                self._logging.log_critical(
                                    'gDayF', "Workflow",
                                    self._labels["failed_wf"],
                                    repr(oexecution_error))
            else:
                aux_dataset = dataset

                if wf["data"]["filtered_data"] is not None:
                    qcolumn = wf["data"]["filtered_data"]["column"]
                    quantile = aux_dataset[[qcolumn]].quatile(
                        [wf["data"]["filtered_data"]["quantile"]])
                    aux_dataset = aux_dataset.query('%s <= %s' %
                                                    (qcolumn, quantile))

                if wf['parameters'] is not None:
                    source_parameters = list()
                    source_parameters.append('controller.exec_analysis(')
                    source_parameters.append(
                        'datapath=aux_dataset.loc[:, variables]')
                    for ikey, ivalue in wf['parameters'].items():
                        source_parameters.append(',')
                        source_parameters.append(ikey)
                        source_parameters.append('=')
                        if isinstance(ivalue, str) and ikey != "amode":
                            source_parameters.append('\'')
                            source_parameters.append(ivalue)
                            source_parameters.append('\'')
                        else:
                            source_parameters.append(str(ivalue))
                    source_parameters.append(')')
                    self._logging.log_info('gDayF', "Workflow",
                                           self._labels["desc_operation"],
                                           ''.join(source_parameters))
                    status, recomendations = eval(''.join(source_parameters))
                    controller.remove_models(recomendations,
                                             mode=remove_models)
                    controller.reconstruct_execution_tree(
                        recomendations,
                        metric=wf['parameters']['metric'],
                        store=True)

                    model_id = recomendations[0]['model_id']
                    table_model_list = controller.table_model_list(
                        ar_list=recomendations,
                        metric=eval(wf['parameters']['metric']))
                    self._logging.log_info(
                        'gDayF', 'workflow', self._labels["results"] + '\n',
                        table_model_list.to_string(justify="left"))

                    if self._config['common']['workflow_summary_enabled']:
                        '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_'
                                                     + 'train_performance', 'xls')'''
                        filename = self.storage_path(
                            'train',
                            str(pfix) + '_' + 'train_performance', 'xls')
                        table_model_list.to_excel(filename,
                                                  index=False,
                                                  sheet_name="performance")
                        self.replicate_file('train', filename=filename)

                    prediction_frame = controller.exec_prediction(
                        datapath=aux_dataset,
                        model_file=recomendations[0]['json_path'][0]['value'])
                    try:
                        if 'predict' in prediction_frame.columns.values:
                            prediction_frame.rename(columns={"predict": wkey},
                                                    inplace=True)
                        elif 'prediction' in prediction_frame.columns.values:
                            prediction_frame.rename(
                                columns={"prediction": wkey}, inplace=True)

                        self._logging.log_info(
                            'gDayF', 'workflow',
                            self._labels["results"] + '\n',
                            prediction_frame.to_string(index_names=False,
                                                       justify="left"))
                        '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_'
                                                     + 'prediction', 'xls')'''
                        if self._config['common']['workflow_summary_enabled']:
                            filename = self.storage_path(
                                'train',
                                str(pfix) + '_' + 'prediction', 'xls')
                            prediction_frame.to_excel(
                                filename,
                                index=False,
                                sheet_name="train_prediction")
                            self.replicate_file('train', filename=filename)

                    except AttributeError as oexecution_error:
                        self._logging.log_info('gDayF', "Workflow",
                                               self._labels["failed_model"],
                                               str(repr(oexecution_error)))

                    if wf['Next'] is not None and prediction_frame is not None:
                        try:
                            self.workflow(datapath=prediction_frame,
                                          workflow=wf['Next'],
                                          prefix=pfix,
                                          remove_models=remove_models)
                        except Exception as oexecution_error:
                            self._logging.log_critical(
                                'gDayF', "Workflow", self._labels["failed_wf"],
                                str(wf['Next']))
                            self._logging.log_critical(
                                'gDayF', "Workflow", self._labels["failed_wf"],
                                repr(oexecution_error))

        controller.clean_handlers()
        del controller

    ## Method leading predict workflow executions
    # @param self object pointer
    # @param datapath String Path indicating the file to be analyzed, or a DataFrame
    # @param wkey Step name
    # @param workflow String path to the workflow definition file, or an OrderedDict
    # @param prefix prefix value used to label generated artifacts
    # @param workflow_id workflow identifier
    # @param remove_models [BEST, BEST_3, EACH_BEST, ALL]

    def predict_workflow(self,
                         datapath,
                         wkey,
                         workflow,
                         prefix='main',
                         workflow_id='default',
                         remove_models=EACH_BEST):
        set_option('display.height', 1000)
        set_option('display.max_rows', 500)
        set_option('display.max_columns', 500)
        set_option('display.width', 1000)

        error, dataset = self.check_path(datapath)
        if dataset is None:
            return error

        if isinstance(workflow, str):
            file = open(workflow, 'r')
            wf = load(file, object_hook=OrderedDict)
        else:
            wf = workflow
        pfix = xstr(prefix)
        controller = Controller(e_c=self._ec)
        if controller.config_checks():
            variables = dataset.columns.tolist()

            #for wkey, wvalue in wf.items():
            if wf["model"] is not None and \
                    (isinstance(wf["model"], str) or isinstance(wf["model"], dict)):

                if wf["data"]["filtered_columns"] is not None:
                    for delete in wf["data"]["filtered_columns"]:
                        try:
                            variables.remove(delete)
                        except Exception:
                            self._logging.log_info('gDayF', "Workflow",
                                                   self._labels["failed_var"],
                                                   delete)

                self._logging.log_info('gDayF', "Workflow",
                                       self._labels["variables_desc"],
                                       variables)

                if wf["data"]["for_each"] is not None:
                    fe_column = wf["data"]["for_each"]
                    fe_data_exclusions = wf["data"]["for_each_exclusions"]
                    fe_filtered_data = wf["data"]["filtered_data"]
                    fe_next = wf["Next"]

                    for each in eval('dataset.' + fe_column + '.unique()'):
                        if fe_data_exclusions is None or each not in fe_data_exclusions:
                            aux_dataset = eval('dataset[dataset.' + fe_column +
                                               '== each]')
                            pfix = xstr(prefix + '_' + str(each))

                            if fe_filtered_data is not None:
                                qcolumn = fe_filtered_data["column"]
                                quantile = aux_dataset[qcolumn].quantile(
                                    q=fe_filtered_data["quantile"])
                                aux_dataset = eval(
                                    'aux_dataset.loc[aux_dataset.' + qcolumn +
                                    '<= ' + str(quantile) + ']')
                                pfix = xstr(pfix + '_' +
                                            str(fe_filtered_data["quantile"]))

                            prediction_frame = controller.exec_prediction(
                                datapath=aux_dataset,
                                model_file=wf["model"][str(each)])
                            try:
                                if 'predict' in prediction_frame.columns.values:
                                    prediction_frame.rename(
                                        columns={"predict": wkey},
                                        inplace=True)
                                elif 'prediction' in prediction_frame.columns.values:
                                    prediction_frame.rename(
                                        columns={"prediction": wkey},
                                        inplace=True)
                            except AttributeError:
                                self._logging.log_info(
                                    'gDayF', "Workflow",
                                    self._labels["anomalies_operation"])

                            self._logging.log_info(
                                'gDayF', 'workflow',
                                self._labels["results"] + '\n',
                                prediction_frame.to_string(index_names=False,
                                                           justify="left"))

                            try:
                                if isinstance(prediction_frame, DataFrame) \
                                        and self._config['common']['workflow_summary_enabled']:
                                    '''filename = self.storage_path('predict', wkey + '_'
                                                        + str(pfix) + '_' + 'prediction', 'xls')'''
                                    filename = self.storage_path(
                                        'predict',
                                        str(pfix) + '_' + str(self.timestamp) +
                                        '_' + 'prediction', 'xls')
                                    prediction_frame.to_excel(
                                        filename,
                                        index=False,
                                        sheet_name="prediction")
                                    self.replicate_file('predict',
                                                        filename=filename)
                                elif self._config['common'][
                                        'workflow_summary_enabled']:
                                    for ikey, ivalue in prediction_frame[
                                            'columns'].items():
                                        ppDF = decode_ordered_dict_to_dataframe(
                                            ivalue)
                                        if isinstance(ppDF, DataFrame):
                                            '''filename = self.storage_path('predict', wkey + '_'
                                                          + str(pfix) + '_' + 'prediction_' + ikey, 'xls')'''
                                            filename = self.storage_path(
                                                'predict',
                                                str(pfix) + '_' +
                                                str(self.timestamp) + '_' +
                                                'prediction_' + ikey, 'xls')
                                            ppDF.to_excel(
                                                filename,
                                                index=False,
                                                sheet_name="prediction")
                                            self.replicate_file(
                                                'predict', filename=filename)

                                    filename = self.storage_path(
                                        'predict',
                                        str(pfix) + '_' + str(self.timestamp) +
                                        '_' + '_prediction', 'json')
                                    with open(filename, 'w') as f:
                                        f.write(
                                            dumps(
                                                prediction_frame['global_mse'])
                                        )
                                    self.replicate_file('predict',
                                                        filename=filename)
                            except AttributeError:
                                self._logging.log_info(
                                    'gDayF', "Workflow",
                                    self._labels["anomalies_operation"],
                                    prediction_frame)

                            try:
                                if fe_next is not None and prediction_frame is not None:
                                    self.workflow(prediction_frame,
                                                  fe_next,
                                                  pfix,
                                                  remove_models=remove_models)
                            except Exception as oexecution_error:
                                self._logging.log_critical(
                                    'gDayF', "Workflow",
                                    self._labels["failed_wf"], str(fe_next))
                                self._logging.log_critical(
                                    'gDayF', "Workflow",
                                    self._labels["failed_wf"],
                                    repr(oexecution_error))
                else:
                    aux_dataset = dataset

                    prediction_frame = controller.exec_prediction(
                        datapath=aux_dataset, model_file=wf["model"])
                    if 'predict' in prediction_frame.columns.values:
                        prediction_frame.rename(columns={"predict": wkey},
                                                inplace=True)
                    elif 'prediction' in prediction_frame.columns.values:
                        prediction_frame.rename(columns={"prediction": wkey},
                                                inplace=True)

                    self._logging.log_info(
                        'gDayF', 'workflow', self._labels["results"] + '\n',
                        prediction_frame.to_string(index_names=False,
                                                   justify="left"))
                    if isinstance(
                            prediction_frame, DataFrame
                    ) and self._config['common']['workflow_summary_enabled']:
                        filename = self.storage_path(
                            'predict',
                            str(pfix) + str(self.timestamp) + '_' +
                            '_prediction', 'xls')
                        prediction_frame.to_excel(filename,
                                                  index=False,
                                                  sheet_name="prediction")
                        self.replicate_file('predict', filename=filename)
                    elif self._config['common']['workflow_summary_enabled']:
                        for ikey, ivalue in prediction_frame['columns'].items(
                        ):
                            ppDF = decode_ordered_dict_to_dataframe(ivalue)
                            if isinstance(ppDF, DataFrame):
                                filename = self.storage_path(
                                    'predict',
                                    str(pfix) + '_' + str(self.timestamp) +
                                    '_' + 'prediction_' + ikey, 'xls')
                                ppDF.to_excel(filename,
                                              index=False,
                                              sheet_name="prediction")
                                self.replicate_file('predict',
                                                    filename=filename)

                        filename = self.storage_path(
                            'predict',
                            str(pfix) + '_' + str(self.timestamp) + '_' +
                            '_prediction', 'json')
                        with open(filename, 'w') as f:
                            f.write(dumps(prediction_frame))
                        self.replicate_file('predict', filename=filename)

                    if wf['Next'] is not None and prediction_frame is not None:
                        try:
                            self.workflow(datapath=prediction_frame,
                                          workflow=wf['Next'],
                                          prefix=pfix,
                                          remove_models=remove_models)
                        except Exception as oexecution_error:
                            self._logging.log_critical(
                                'gDayF', "Workflow", self._labels["failed_wf"],
                                str(wf['Next']))
                            self._logging.log_critical(
                                'gDayF', "Workflow", self._labels["failed_wf"],
                                repr(oexecution_error))

        controller.clean_handlers()
        del controller

    ## Method managing dataset load from datapath
    # @param datapath String Path indicating the file to be analyzed, or a DataFrame
    # @return (None, DataFrame) if the load succeeds; (error message, None) otherwise
    def check_path(self, datapath):
        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Workflow",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
                return None, pd_dataset.copy()
            except (IOError, OSError, JSONDecodeError):
                self._logging.log_critical('gDayF', "Workflow",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input'], None
        elif isinstance(datapath, DataFrame):
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
            return None, datapath
        else:
            self._logging.log_critical('gDayF', "Workflow",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input'], None
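
    # Illustrative usage sketch (not part of the original source; the path below is
    # hypothetical): check_path accepts either a CSV path or an already-loaded DataFrame.
    #
    #   error, df = self.check_path('/data/example.csv')   # loaded through inputHandlerCSV
    #   if df is None:
    #       return error                                    # load failed, error label returned
    #   error, df = self.check_path(existing_dataframe)     # a DataFrame is passed through as-is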

    ## Method managing storage path
    # @param mode ['train','predict']
    # @param filename filename
    # @param filetype file type
    # @return None if no localfs primary path is found; absolute path otherwise
    def storage_path(self, mode, filename, filetype):
        load_storage = StorageMetadata(self._ec)
        if self._config['common']['workflow_summary_enabled']:
            include = True
        else:
            include = False
        for each_storage_type in load_storage.get_load_path(include=include):
            if each_storage_type['type'] == 'localfs':
                source_data = list()
                primary_path = self._config['storage'][
                    each_storage_type['type']]['value']
                source_data.append(primary_path)
                source_data.append('/')
                source_data.append(self._ec.get_id_user())
                source_data.append('/')
                source_data.append(self._ec.get_id_workflow())
                source_data.append('/')
                source_data.append(
                    self._config['common']['workflow_summary_dir'])
                source_data.append('/')
                source_data.append(mode)
                source_data.append('/')

                PersistenceHandler(self._ec).mkdir(
                    type=each_storage_type['type'],
                    path=''.join(source_data),
                    grants=self._config['storage']['grants'])
                source_data.append(filename)
                source_data.append('.' + filetype)

                return ''.join(source_data)
        return None
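
    # Illustrative sketch (not part of the original source; the concrete values are
    # assumptions): for the localfs primary storage, storage_path composes
    #   <localfs value>/<user_id>/<workflow_id>/<workflow_summary_dir>/<mode>/<filename>.<filetype>
    # creating the directory on the way, e.g.:
    #
    #   xls_path = self.storage_path('train', 'main_train_performance', 'xls')
    #   # -> something like '/gdayf/PoC_gDayF/default_1234/summary/train/main_train_performance.xls'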

    ## Method replicating files from the primary storage to the other configured storages
    # @param mode ['train','predict']
    # @param filename filename
    # @return None
    def replicate_file(self, mode, filename):
        load_storage = StorageMetadata(self._ec).get_json_path()
        persistence = PersistenceHandler(self._ec)
        for each_storage_type in load_storage:
            if each_storage_type['type'] in ['localfs', 'hdfs']:
                source_data = list()
                primary_path = self._config['storage'][
                    each_storage_type['type']]['value']
                source_data.append(primary_path)
                source_data.append('/')
                source_data.append(self._ec.get_id_user())
                source_data.append('/')
                source_data.append(self._ec.get_id_workflow())
                source_data.append('/')
                source_data.append(
                    self._config['common']['workflow_summary_dir'])
                source_data.append('/')
                source_data.append(mode)
                source_data.append('/')
                '''if each_storage_type['type'] == 'hdfs':
                    source_data = self._config['storage'][each_storage_type['type']]['uri'] + ''.join(source_data)'''
                each_storage_type['value'] = ''.join(source_data)

                persistence.mkdir(type=each_storage_type['type'],
                                  path=each_storage_type['value'],
                                  grants=self._config['storage']['grants'])
                each_storage_type['value'] = each_storage_type['value'] + Path(
                    filename).name

        persistence.store_file(storage_json=load_storage, filename=filename)
        del persistence
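
    # Illustrative usage sketch (not part of the original source): replicate_file copies a
    # previously written summary file to every configured 'localfs'/'hdfs' storage under the
    # same relative path, typically right after writing it:
    #
    #   filename = self.storage_path('train', 'main_train_performance', 'xls')
    #   table_model_list.to_excel(filename, index=False, sheet_name='performance')
    #   self.replicate_file('train', filename=filename)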
Exemplo n.º 6
0
 def __init__(self, e_c):
     self._ec = e_c
     self._config = self._ec.config.get_config()['normalizer']
     self._labels = self._ec.labels.get_config()['messages']['normalizer']
     self._logging = LogsHandler(self._ec, __name__)
Exemplo n.º 7
0
class Normalizer(object):

    ## Constructor
    # @param e_c context pointer
    def __init__(self, e_c):
        self._ec = e_c
        self._config = self._ec.config.get_config()['normalizer']
        self._labels = self._ec.labels.get_config()['messages']['normalizer']
        self._logging = LogsHandler(self._ec, __name__)

    ## Method oriented to specify data normalizations
    # @param dataframe_metadata DFMetadata()
    # @param an_objective ATypesMetadata
    # @param objective_column string indicating objective column
    # @return None if there is nothing to do; otherwise the normalization sets
    def define_normalizations(self, dataframe_metadata, an_objective,
                              objective_column):
        if not self._config['non_minimal_normalizations_enabled']:
            return None
        else:
            df_type = dataframe_metadata['type']
            rowcount = dataframe_metadata['rowcount']
            #cols = dataframe_metadata['cols']
            columns = dataframe_metadata['columns']
            norms = list()
            normoption = NormalizationSet()
            if df_type == 'pandas':
                for description in columns:
                    col = description['name']
                    if col != objective_column:
                        if int(description['missed']) > 0 and \
                           (int(description['missed'])/rowcount >= self._config['exclusion_missing_threshold']):
                            normoption.set_ignore_column()
                            norms.append({col: normoption.copy()})
                        if self._config['clustering_standardize_enabled'] and an_objective[0]['type'] in ['clustering'] \
                                and description['type'] in DTYPES \
                                and int(description['cardinality']) > 1 and description['mean'] != 0.0 and \
                                description['std'] != 1.0 \
                                and (
                                float(description['std']) / (float(description['max']) - float(description['min']))) \
                                > self._config['std_threshold']:
                            normoption.set_stdmean(description['mean'],
                                                   description['std'])
                            norms.append({col: normoption.copy()})
                        if self._config['standardize_enabled'] and description['type'] in DTYPES \
                            and an_objective[0]['type'] not in ['clustering']\
                            and int(description['cardinality']) > 1 and description['mean'] != 0.0 and \
                            description['std'] != 1.0 \
                            and(float(description['std']) / (float(description['max']) - float(description['min']))) \
                                 > self._config['std_threshold']:
                            normoption.set_stdmean(description['mean'],
                                                   description['std'])
                            norms.append({col: normoption.copy()})

                self._logging.log_exec('gDayF', "Normalizer",
                                       self._labels["norm_set_establish"],
                                       norms)
                if len(norms) != 0:
                    return norms.copy()
                else:
                    return None
            else:
                return None

    ## Method oriented to specify ignored columns
    # @param dataframe_metadata DFMetadata()
    # @param objective_column string indicating objective column
    # @return None if there is nothing to do; otherwise the normalization sets
    def define_ignored_columns(self, dataframe_metadata, objective_column):
        if not self._config['non_minimal_normalizations_enabled']:
            return None
        else:
            df_type = dataframe_metadata['type']
            rowcount = dataframe_metadata['rowcount']
            # cols = dataframe_metadata['cols']
            columns = dataframe_metadata['columns']
            norms = list()
            normoption = NormalizationSet()
            if df_type == 'pandas':
                for description in columns:
                    col = description['name']
                    if col != objective_column:
                        if int(description['cardinality']) == 1:
                            normoption.set_ignore_column()
                            norms.append({col: normoption.copy()})
                        elif self._config['datetime_columns_management'] is not None \
                                and self._config['datetime_columns_management'] \
                                and description['type'] == 'datetime64[ns]':
                            normoption.set_ignore_column()
                            norms.append({col: normoption.copy()})
                self._logging.log_exec('gDayF', "Normalizer",
                                       self._labels["ignored_set_establish"],
                                       norms)
                if len(norms) != 0:
                    return norms.copy()
                else:
                    return None
            else:
                return None

    ## Method oriented to specify the special non-negative (offset) data normalizations used for the Spark naive case
    # @param dataframe_metadata DFMetadata()
    # @return None if the dataframe is not pandas; otherwise the normalization sets
    def define_special_spark_naive_norm(self, dataframe_metadata):
        df_type = dataframe_metadata['type']
        if df_type == 'pandas':
            norms = list()
            normoption = NormalizationSet()
            columns = dataframe_metadata['columns']
            for description in columns:
                col = description['name']
                if description['min'] is not None and float(
                        description['min']) < 0.0:
                    normoption.set_offset(
                        offset=abs(float(description['min'])) *
                        self._config['special_spark_naive_offset'])
                    norms.append({col: normoption.copy()})

            return norms.copy()
        else:
            return None

    ## Method oriented to specify minimal data normalizations
    # @param dataframe_metadata DFMetadata()
    # @param an_objective ATypesMetadata
    # @param objective_column string indicating objective column
    # @return [None] if minimal normalizations are disabled; otherwise the normalization sets
    def define_minimal_norm(self, dataframe_metadata, an_objective,
                            objective_column):
        df_type = dataframe_metadata['type']
        if not self._config['minimal_normalizations_enabled']:
            return [None]
        elif objective_column is None:
            norms = list()
            normoption = NormalizationSet()
            columns = dataframe_metadata['columns']
            for description in columns:
                col = description['name']
                if description['type'] == "object" and self._config[
                        'base_normalization_enabled']:
                    normoption.set_base(datetime=False)
                    norms.append({col: normoption.copy()})
            return norms.copy()
        else:
            if df_type == 'pandas':
                rowcount = dataframe_metadata['rowcount']
                norms = list()
                normoption = NormalizationSet()
                normoption.set_drop_missing()
                norms.append({objective_column: normoption.copy()})

                columns = dataframe_metadata['columns']
                for description in columns:
                    col = description['name']
                    if col != objective_column:
                        if description['type'] == "object" and self._config[
                                'base_normalization_enabled']:
                            normoption.set_base()
                            norms.append({col: normoption.copy()})
                        if int(description['missed']) > 0 and \
                           (int(description['missed'])/rowcount >= self._config['exclusion_missing_threshold']):
                            if an_objective[0]['type'] in [
                                    'binomial', 'multinomial'
                            ] and self._config['manage_on_train_errors']:
                                normoption.set_mean_missing_values(
                                    objective_column, full=False)
                                norms.append({col: normoption.copy()})
                            elif an_objective[0]['type'] in [
                                    'regression'
                            ] and self._config['manage_on_train_errors']:
                                normoption.set_progressive_missing_values(
                                    objective_column)
                                norms.append({col: normoption.copy()})
                            elif an_objective[0]['type'] in ['anomalies']:
                                normoption.set_mean_missing_values(
                                    objective_column, full=True)
                                norms.append({col: normoption.copy()})
                            else:
                                normoption.set_mean_missing_values(
                                    objective_column, full=True)
                                norms.append({col: normoption.copy()})
                return norms.copy()

    ## Method oriented to filter stdmean operations out for algorithms that must not be standardized
    # @param normalizemd OrderedDict() compatible structure
    # @param model_id Model_identification
    # @return normalizemd OrderedDict() compatible structure
    def filter_standardize(self, normalizemd, model_id):
        filter_normalized = list()
        for norm_set in normalizemd:
            if norm_set is not None:
                col = list(norm_set.keys())[0]
                norms = norm_set.get(col)
                if norms['class'] == 'stdmean' and model_id in NO_STANDARDIZE:
                    self._logging.log_info('gDayF', "Normalizer",
                                           self._labels["excluding"],
                                           col + ' - ' + norms['class'])
                else:
                    filter_normalized.append(norm_set)
        return filter_normalized

    ## Method oriented to filter drop_missing operations out of the normalization set
    # @param normalizemd OrderedDict() compatible structure
    # @return normalizemd OrderedDict() compatible structure
    def filter_drop_missing(self, normalizemd):
        filter_normalized = list()
        for norm_set in normalizemd:
            if norm_set is not None:
                col = list(norm_set.keys())[0]
                norms = norm_set.get(col)
                if norms['class'] == 'drop_missing':
                    self._logging.log_exec('gDayF', "Normalizer",
                                           self._labels["excluding"],
                                           col + ' - ' + norms['class'])
                else:
                    filter_normalized.append(norm_set)

        return filter_normalized

    ## Method oriented to filter out missing-value filling operations that depend on the objective_column
    # @param normalizemd OrderedDict() compatible structure
    # @return normalizemd OrderedDict() compatible structure
    def filter_objective_base(self, normalizemd):
        filter_normalized = list()
        for norm_set in normalizemd:
            if norm_set is not None:
                col = list(norm_set.keys())[0]
                norms = norm_set.get(col)
                if norms['class'] == 'progressive_missing_values' or \
                   (norms['class'] == 'mean_missing_values' and not norms['objective']['full']):
                    self._logging.log_exec('gDayF', "Normalizer",
                                           self._labels["excluding"],
                                           col + ' - ' + norms['class'])
                else:
                    filter_normalized.append(norm_set)

        return filter_normalized

    ## Main method oriented to manage normalization sets, applying each normalization to the dataframe
    # @param self object pointer
    # @param df dataframe
    # @param normalizemd OrderedDict() compatible structure
    # @return dataframe
    def normalizeDataFrame(self, df, normalizemd):
        self._logging.log_info('gDayF', "Normalizer",
                               self._labels["start_data_norm"], str(df.shape))
        if isinstance(df, pd.DataFrame):
            dataframe = df.copy()
            for norm_set in normalizemd:
                if norm_set is not None:
                    col = list(norm_set.keys())[0]
                    norms = norm_set.get(col)
                    if norms['class'] == 'base':
                        dataframe.loc[:, col] = self.normalizeBase(
                            dataframe.loc[:, col])
                        self._logging.log_info('gDayF', "Normalizer",
                                               self._labels["applying"],
                                               col + ' - ' + norms['class'])
                        if dataframe[col].dtype == '<M8[ns]' and norms[
                                'datetime']:
                            dataframe = self.normalizeDateTime(
                                dataframe=dataframe, date_column=col)
                            if self._config['datetime_columns_management'] is not None \
                                    and self._config['datetime_columns_management']['enable']:
                                self._logging.log_info(
                                    'gDayF', "Normalizer",
                                    self._labels["applying"], col + ' - ' +
                                    str(self.
                                        _config['datetime_columns_management']
                                        ['filter']))
                    elif norms['class'] == 'drop_missing':
                        try:
                            dataframe = self.normalizeDropMissing(
                                dataframe, col)
                            self._logging.log_info(
                                'gDayF', "Normalizer",
                                self._labels["applying"],
                                col + ' - ' + norms['class'])
                        except KeyError:
                            self._logging.log_info(
                                'gDayF', "Normalizer",
                                self._labels["excluding"],
                                col + ' - ' + norms['class'])
                    elif norms['class'] == 'stdmean':
                        dataframe.loc[:, col] = self.normalizeStdMean(
                            dataframe.loc[:, col], norms['objective']['mean'],
                            norms['objective']['std'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['mean']) + ',' +
                            str(norms['objective']['std']) + ' ) ')
                    elif norms['class'] == 'working_range':
                        dataframe.loc[:, col] = self.normalizeWorkingRange(
                            dataframe.loc[:,
                                          col], norms['objective']['minval'],
                            norms['objective']['maxval'],
                            norms['objective']['minrange'],
                            norms['objective']['maxrange'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['minval']) + ',' +
                            str(norms['objective']['maxval']) + ' ) ')
                    elif norms['class'] == 'offset':
                        dataframe.loc[:, col] = self.normalizeOffset(
                            dataframe.loc[:, col],
                            norms['objective']['offset'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['offset']) + ' )')
                    elif norms['class'] == 'discretize':
                        dataframe.loc[:, col] = self.normalizeDiscretize(
                            dataframe.loc[:, col],
                            norms['objective']['buckets_number'],
                            norms['objective']['fixed_size'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['buckets_number']) + ',' +
                            str(norms['objective']['fixed_size']) + ' ) ')
                    elif norms['class'] == 'aggregation':
                        dataframe.loc[:, col] = self.normalizeAgregation(
                            dataframe.loc[:, col],
                            norms['objective']['bucket_ratio'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['bucket_ratio']) + ' ) ')
                    elif norms['class'] == 'fixed_missing_values':
                        dataframe.loc[:, col] = self.fixedMissingValues(
                            dataframe.loc[:, col], norms['objective']['value'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            str(norms['objective']['value']) + ' ) ')
                    elif norms['class'] == 'mean_missing_values':
                        dataframe = self.meanMissingValues(
                            dataframe, col,
                            norms['objective']['objective_column'],
                            norms['objective']['full'])
                        if norms['objective']['objective_column'] is None:
                            norms['objective']['objective_column'] = 'None'
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            norms['objective']['objective_column'] + ',' +
                            str(norms['objective']['full']) + ' ) ')
                    elif norms['class'] == 'progressive_missing_values':
                        dataframe = self.progressiveMissingValues(
                            dataframe, col,
                            norms['objective']['objective_column'])
                        self._logging.log_info(
                            'gDayF', "Normalizer", self._labels["applying"],
                            col + ' - ' + norms['class'] + ' ( ' +
                            norms['objective']['objective_column'] + ' ) ')
                    elif norms['class'] == 'ignore_column':
                        pass
                    #elif norms['class'] == 'binary_encoding':
                    #self.normalizeBinaryEncoding(dataframe[col])
                else:
                    self._logging.log_info('gDayF', "Normalizer",
                                           self._labels["nothing_to_do"])
            return dataframe
        else:
            return df
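
    # Illustrative sketch (not part of the original source; e_c denotes an existing context
    # object and the set shape is assumed from the NormalizationSet usage above):
    # normalizeDataFrame walks the normalization set and applies each entry to its column.
    #
    #   normalizemd = [{'x': {'class': 'offset', 'objective': {'offset': 5.0}}},
    #                  {'y': {'class': 'fixed_missing_values', 'objective': {'value': 0.0}}}]
    #   normalized_df = Normalizer(e_c).normalizeDataFrame(df, normalizemd)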

    ## Method oriented to generate the ignored column list for issues where missed > exclusion_missing_threshold
    # @param normalizemd normalizations_set_metadata
    # @return updated ignored_list
    def ignored_columns(self, normalizemd):
        ignored_list = list()
        if normalizemd is not None:
            for elements in normalizemd:
                for col, value in elements.items():
                    if value['class'] == 'ignore_column':
                        ignored_list.append(col)
        self._logging.log_info('gDayF', "Normalizer",
                               self._labels["ignored_list"], ignored_list)
        return ignored_list.copy()
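
    # Illustrative sketch (not part of the original source; the entry shape is assumed from
    # NormalizationSet): only entries whose class is 'ignore_column' are collected.
    #
    #   normalizemd = [{'constant_col': {'class': 'ignore_column', 'objective': {}}},
    #                  {'price': {'class': 'stdmean', 'objective': {'mean': 3.2, 'std': 1.1}}}]
    #   Normalizer(e_c).ignored_columns(normalizemd)   # -> ['constant_col']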

    ## Internal method oriented to manage base type conversion of object columns
    # (numeric first, then datetime, otherwise categorical)
    # @param self object pointer
    # @param dataframe single column dataframe
    # @return dataframe
    def normalizeBase(self, dataframe):
        if dataframe.dtype == np.object:
            try:
                return pd.to_numeric(dataframe)
            except ValueError:
                try:
                    return pd.to_datetime(dataframe)
                except ValueError:
                    return pd.Categorical(dataframe)
        # Non-object columns are returned unchanged
        return dataframe

    ## Internal method oriented to drop rows with NaN values in a reference column
    # @param self object pointer
    # @param dataframe dataframe
    # @param col reference column used to drop NaN rows
    # @return dataframe
    def normalizeDropMissing(self, dataframe, col):
        return dataframe.dropna(axis=0, subset=[col])
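
    # Illustrative sketch (not part of the original source): rows whose reference column
    # is missing are dropped.
    #
    #   df = pd.DataFrame({'target': [1.0, None, 3.0], 'x': [10, 20, 30]})
    #   Normalizer(e_c).normalizeDropMissing(df, 'target')   # keeps rows 0 and 2 only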

    ## Internal method oriented to manage working range normalizations on a [closed, closed] interval
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param minval lower bound of the source interval
    # @param maxval upper bound of the source interval
    # @param minrange lower bound of the target interval
    # @param maxrange upper bound of the target interval
    # @return dataframe
    def normalizeWorkingRange(self,
                              dataframe,
                              minval=-1.0,
                              maxval=1.0,
                              minrange=-1.0,
                              maxrange=1.0):
        assert (maxval > minval)
        if dataframe.dtype != np.object:
            convert_factor = (maxrange - minrange) / (maxval - minval)
            dataframe = dataframe.astype(np.float16)
            dataframe = dataframe.apply(
                lambda x: (x - minval) * convert_factor + minrange)
        return dataframe.copy()
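
    # Illustrative sketch (not part of the original source): a linear rescaling from
    # [minval, maxval] into [minrange, maxrange].
    #
    #   s = pd.Series([0.0, 5.0, 10.0])
    #   Normalizer(e_c).normalizeWorkingRange(s, minval=0.0, maxval=10.0,
    #                                         minrange=-1.0, maxrange=1.0)
    #   # -> approximately [-1.0, 0.0, 1.0] (float16 precision)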

    ## Internal method oriented to manage offset normalizations (adds a constant to numeric columns)
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param offset value added to every element
    # @return dataframe
    def normalizeOffset(self, dataframe, offset=0):
        if dataframe.dtype != np.object:
            dataframe = offset + dataframe
        return dataframe.copy()

    ## Internal method oriented to manage bucket ratio normalizations head - tail
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param br bucket ratio
    # @return dataframe
    def normalizeAgregation(self, dataframe, br=0.25):
        if (dataframe.dtype != np.object):
            buckets = int(1 / (br / 2))
            q, bins = pd.qcut(dataframe.iloc[:], buckets, retbins=True)
            if dataframe.dtype != np.int:
                dataframe[dataframe <= bins[1]] = np.int(
                    dataframe[dataframe <= bins[1]].mean().copy())
                dataframe[dataframe >= bins[-2]] = np.int(
                    dataframe[dataframe >= bins[-2]].mean().copy())
            else:
                dataframe[dataframe <= bins[1]] = dataframe[
                    dataframe <= bins[1]].mean().copy()
                dataframe[dataframe >= bins[-2]] = dataframe[
                    dataframe >= bins[-2]].mean().copy()
        return dataframe.copy()
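
    # Illustrative note (not part of the original source): with br=0.25 the series is cut into
    # int(1 / 0.125) = 8 quantile buckets, and values falling in the head bucket (<= bins[1])
    # or the tail bucket (>= bins[-2]) are replaced by the mean of those extreme values,
    # damping outliers at both ends.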

    ## Internal method oriented to manage Binary encodings
    # @param self object pointer
    # @param dataframe single column dataframe
    # @return dataframe
    def normalizeBinaryEncoding(self, dataframe):
        return dataframe.copy()

    ## Internal method oriented to manage mean and std normalizations. Default mean=0 std=1
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param mean mean value to center
    # @param std standard deviation value to be normalized
    # @return dataframe
    def normalizeStdMean(self, dataframe, mean, std):
        if dataframe.dtype != np.object and dataframe.dtype != "datetime64[ns]":
            try:
                dataframe = dataframe.astype(np.float64)
                dataframe = dataframe.apply(lambda x: x - float(mean))
                dataframe = dataframe.apply(lambda x: x / float(std))
            except ZeroDivisionError:
                dataframe = dataframe.apply(lambda x: x + float(mean))
            #dataframe = preprocessing.scale(dataframe)
        return dataframe.copy()
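
    # Illustrative sketch (not part of the original source): standardization around a
    # precomputed mean/std taken from the dataframe metadata.
    #
    #   s = pd.Series([1.0, 2.0, 3.0])
    #   Normalizer(e_c).normalizeStdMean(s, mean=2.0, std=1.0)   # -> [-1.0, 0.0, 1.0]
    #   # with std == 0 the centering is reverted and the original values are returned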

    ## Internal method oriented to manage bucketing for discretization
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param buckets_number Int
    # @param fixed_size Boolean (True = fixed size, False = fixed frequency)
    # @return dataframe
    def normalizeDiscretize(self, dataframe, buckets_number, fixed_size):
        # A fixed number of buckets, sized according to fixed_size
        if fixed_size:
            return pd.qcut(dataframe, buckets_number)
        else:
            return pd.cut(dataframe, buckets_number)
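
    # Illustrative sketch (not part of the original source): pd.qcut produces equal-frequency
    # buckets while pd.cut produces equal-width buckets.
    #
    #   s = pd.Series([1, 2, 3, 4, 100])
    #   pd.qcut(s, 5)   # every bucket holds one value
    #   pd.cut(s, 5)    # 1-4 share the first ~20-wide bucket, 100 sits alone in the last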

    ## Internal method oriented to manage imputation for missing values to fixed value
    # @param self object pointer
    # @param dataframe single column dataframe
    # @param value int
    # @return dataframe
    def fixedMissingValues(self, dataframe, value=0.0):
        return dataframe.fillna(value)

    ## Internal method oriented to manage imputation for missing values to mean value
    # @param self object pointer
    # @param dataframe full column dataframe
    # @param col column name for imputation
    # @param objective_col objective_column
    # @param full True means full dataframe.mean(), False means imputation grouped by objective_col values
    # @return dataframe
    def meanMissingValues(self, dataframe, col, objective_col, full=False):
        if full:
            return dataframe.fillna(dataframe.mean())
        else:
            nullfalse = dataframe[dataframe[:][col].notnull()][[
                col, objective_col
            ]]
            if objective_col in DTYPES:
                nullfalse_gb = nullfalse.groupby(objective_col).mean()
            else:
                nullfalse_gb = nullfalse.groupby(objective_col).agg(
                    lambda x: x.value_counts().index[0])
            for index, row in dataframe[dataframe[:][col].isnull()].iterrows():
                row = row.copy()
                if nullfalse_gb.index.isin([row[objective_col]]).any():
                    dataframe.loc[index,
                                  col] = nullfalse_gb.loc[row[objective_col],
                                                          col]
            return dataframe.copy()
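
    # Illustrative sketch (not part of the original source): with full=False the missing
    # values of 'col' are imputed per objective-column group (group mean, or the most
    # frequent value depending on the objective dtype check above).
    #
    #   df = pd.DataFrame({'label': ['a', 'a', 'b', 'b'], 'x': [1.0, None, 10.0, 12.0]})
    #   Normalizer(e_c).meanMissingValues(df, 'x', 'label', full=False)
    #   # -> the missing x in the second 'a' row is filled with 1.0 (taken from label == 'a')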

    ## Internal method oriented to manage progressive imputations for missing values.
    # ([right_not_nan] - [left_not_nan])/Cardinality(is_nan)
    # @param self object pointer
    # @param dataframe full column dataframe
    # @param col column name for imputation
    # @param objective_col objective_column
    # @return dataframe
    def progressiveMissingValues(self, dataframe, col, objective_col):
        nullfalse = dataframe[dataframe[:][col].notnull()].sort_values(
            objective_col, axis=0, ascending=True)[[col, objective_col]]
        nullfalse_gb = nullfalse.groupby(objective_col).mean()
        for index, row in dataframe[dataframe[:][col].isnull()].iterrows():
            row = row.copy()
            if nullfalse_gb.index.isin([row[objective_col]]).any():
                dataframe.loc[index,
                              col] = nullfalse_gb.loc[row[objective_col], col]
            else:
                index_max = nullfalse_gb.index.where(
                    nullfalse_gb.index > row[objective_col]).min()
                index_min = nullfalse_gb.index.where(
                    nullfalse_gb.index < row[objective_col]).max()
                try:
                    if index_min is np.nan and index_max is np.nan \
                       or index_min is None or index_max is None:
                        pass
                    if index_min is np.nan or index_min is None:
                        dataframe.loc[index, col] = nullfalse_gb.loc[index_max,
                                                                     col]
                    elif index_max is np.nan or index_max is None:
                        dataframe.loc[index, col] = nullfalse_gb.loc[index_min,
                                                                     col]
                    else:
                        minimal = min(nullfalse_gb.loc[index_min, col],
                                      nullfalse_gb.loc[index_max, col])
                        maximal = max(nullfalse_gb.loc[index_min, col],
                                      nullfalse_gb.loc[index_max, col])
                        b = maximal - minimal
                        a = index_max - index_min
                        x = (row[objective_col] - index_min) / a
                        offset = b * x
                        dataframe.loc[index, col] = minimal + offset
                except TypeError:
                    pass
        return dataframe.copy()
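
    # Illustrative sketch (not part of the original source): missing values are interpolated
    # between the group means of the nearest objective values below and above the row's
    # objective value.
    #
    #   df = pd.DataFrame({'t': [1, 2, 3], 'x': [10.0, None, 30.0]})
    #   Normalizer(e_c).progressiveMissingValues(df, 'x', 't')
    #   # -> x at t == 2 becomes 20.0, halfway between the means at t == 1 and t == 3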

    ## Internal method oriented to expand datetime columns into derived feature columns
    # @param self object pointer
    # @param dataframe full column dataframe to be expanded
    # @param date_column Date_Column name to be transformed
    # @return dataframe
    def normalizeDateTime(self, dataframe, date_column=None):
        datetime_columns_management = self._config[
            'datetime_columns_management']
        if date_column is not None:
            if datetime_columns_management is not None and datetime_columns_management[
                    'enable']:
                for element in datetime_columns_management['filter']:
                    try:
                        if element not in ['weekday', 'weeknumber']:
                            dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                                .transform(lambda x: eval('x.' + element))
                        elif element == 'weekday':
                            dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                                .transform(lambda x: x.isoweekday())
                        elif element == 'weeknumber':
                            dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                                .transform(lambda x: x.isocalendar()[1])
                    except AttributeError:
                        print('TRC: invalid configuration: ' + element)
                        pass
        return dataframe.copy()
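
The same kind of expansion can be written directly against the pandas datetime accessor. A small sketch (the column name and derived features are assumptions, and isocalendar() in this form needs pandas >= 1.1):

import pandas as pd

# Illustrative sketch: derive the features the 'filter' list can request.
df = pd.DataFrame({'ts': pd.to_datetime(['2017-08-31', '2017-09-20'])})
df['ts_year'] = df['ts'].dt.year
df['ts_month'] = df['ts'].dt.month
df['ts_weekday'] = df['ts'].dt.dayofweek + 1          # ISO weekday, Monday == 1
df['ts_weeknumber'] = df['ts'].dt.isocalendar().week  # ISO week number
print(df)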
Exemplo n.º 8
0
class Adviser(object):
    deepness = 1

    ## Constructor
    # @param self object pointer
    # @param e_c context pointer
    # @param deep_impact A* max_deep
    # @param metric metric for prioritizing models ['accuracy', 'rmse', 'test_accuracy', 'combined'] on train
    # @param dataframe_name dataframe_name or id
    # @param hash_dataframe MD5 hash value

    def __init__(self,
                 e_c,
                 deep_impact=5,
                 metric='accuracy',
                 dataframe_name='',
                 hash_dataframe=''):
        self._ec = e_c
        self._labels = self._ec.labels.get_config()['messages']['adviser']
        self._config = self._ec.config.get_config()['optimizer']
        self._frameworks = self._ec.config.get_config()['frameworks']
        self._logging = LogsHandler(self._ec)
        self.timestamp = time()
        self.an_objective = None
        self.deep_impact = deep_impact
        self.analysis_recommendation_order = list()
        self.analyzed_models = list()
        self.excluded_models = list()
        self.next_analysis_list = list()
        self.metric = metric
        self.dataframe_name = dataframe_name
        self.hash_dataframe = hash_dataframe

    ## Main method oriented to execute smart analysis
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param amode [POC, NORMAL, FAST, PARANOIAC, FAST_PARANOIAC]
    # @param objective_column string indicating objective column
    # @param atype atypes constants or None
    # @return ArMetadata()'s Prioritized queue
    def set_recommendations(self,
                            dataframe_metadata,
                            objective_column,
                            amode=POC,
                            atype=None):
        supervised = True
        if objective_column is None:
            supervised = False
        self._logging.log_exec(self._ec.get_id_analysis(), 'AdviserAStar',
                               self._labels["ana_type"],
                               str(atype) + ' (' + str(self.deepness) + ')')
        if supervised:
            if self.deepness == 1:
                self.an_objective = self.get_analysis_objective(
                    dataframe_metadata,
                    objective_column=objective_column,
                    atype=atype)
            if amode == POC:
                return self.analysispoc(dataframe_metadata,
                                        objective_column,
                                        amode=FAST)
            if amode in [FAST, NORMAL]:
                return self.analysisnormal(dataframe_metadata,
                                           objective_column,
                                           amode=amode)
            elif amode in [FAST_PARANOIAC, PARANOIAC]:
                return self.analysisparanoiac(dataframe_metadata,
                                              objective_column,
                                              amode=amode)
        else:
            if amode in [ANOMALIES]:
                self.an_objective = ATypesMetadata(anomalies=True)
                return self.analysisanomalies(dataframe_metadata,
                                              objective_column,
                                              amode=amode)
            elif amode in [CLUSTERING]:
                self.an_objective = ATypesMetadata(clustering=True)
                return self.analysisclustering(dataframe_metadata,
                                               objective_column,
                                               amode=amode)

    ## Method oriented to execute smart normal and fast analysis
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param objective_column string indicating objective column
    # @param amode [POC, NORMAL, FAST, PARANOIAC, FAST_PARANOIAC]
    # @return analysis_id, Ordered[(algorithm_metadata.json, normalizations_sets.json)]
    def analysisnormal(self, dataframe_metadata, objective_column, amode):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            self.base_iteration(amode, dataframe_metadata, objective_column)
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.deepness == 2:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        if model_type not in best_models and len(
                                best_models) < self._config['adviser_L2_wide']:
                            fw_model_list.extend(
                                self.optimize_models(
                                    self.analysis_recommendation_order[indexer]
                                ))
                            best_models.append(model_type)
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    register the type as evaluated and selected '''
                    best_models.append(model_type)
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        if model_type not in best_models and len(
                                best_models
                        ) < self._config['adviser_normal_wide']:
                            fw_model_list.extend(
                                self.optimize_models(
                                    self.analysis_recommendation_order[indexer]
                                ))
                            #print("Trace:%s-%s" % (model_type, best_models))
                            best_models.append(model_type)
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    register the type as evaluated and selected '''
                    best_models.append(model_type)
            '''' Modified 20/09/2017
            # Get two most potential best models
            fw_model_list = list()
            for indexer in range(0, 2):
                try:
                    fw_model_list.extend(self.optimize_models(self.analysis_recommendation_order[indexer]))
                except TypeError:
                    pass
            #if fw_model_list is not None:'''
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to execute poc analysis
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param objective_column string indicating objective column
    # @param amode [POC, NORMAL, FAST, PARANOIAC, FAST_PARANOIAC]
    # @return analysis_id, Ordered[(algorithm_metadata.json, normalizations_sets.json)]
    def analysispoc(self, dataframe_metadata, objective_column, amode):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            self.base_iteration(amode, dataframe_metadata, objective_column)
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            # Get two most potential best models
            fw_model_list = list()
            for indexer in range(0, 1):
                try:
                    if self.analysis_recommendation_order[indexer][
                            'status'] == 'Executed':
                        fw_model_list.extend(
                            self.optimize_models(
                                self.analysis_recommendation_order[indexer]))
                except TypeError:
                    pass
            # if fw_model_list is not None:
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to execute new analysis
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param list_ar_metadata list of AR-JSON compatible model descriptors
    # @return analysis_id, Ordered[(algorithm_metadata.json, normalizations_sets.json)]
    def analysis_specific(self, dataframe_metadata, list_ar_metadata):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            # Check dataframe_metadata compatibility
            self.base_specific(dataframe_metadata, list_ar_metadata)
        # Added 22/09/1974
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    # Modified 31/08/2017
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        if model_type not in best_models:
                            fw_model_list.extend(
                                self.optimize_models(
                                    self.analysis_recommendation_order[indexer]
                                ))
                            #print("Trace:%s-%s" % (model_type, best_models))
                            best_models.append(model_type)
                            # End - Modified 31/08/2017
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    pass and look for the next best model of this type '''
                    pass
            # if fw_model_list is not None:
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to execute smart paranoiac and fast-paranoiac analysis
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param amode [POC, NORMAL, FAST, PARANOIAC, FAST_PARANOIAC]
    # @param objective_column string indicating objective column
    # @return analysis_id,(framework, Ordered[(algorithm_metadata.json, normalizations_sets.json)])
    def analysisparanoiac(self, dataframe_metadata, objective_column, amode):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            self.base_iteration(amode, dataframe_metadata, objective_column)
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    # Modified 31/08/2017
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        if model_type not in best_models:
                            fw_model_list.extend(
                                self.optimize_models(
                                    self.analysis_recommendation_order[indexer]
                                ))
                            #print("Trace:%s-%s" % (model_type, best_models))
                            best_models.append(model_type)
                            # End - Modified 31/08/2017
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    pass and look for the next best model of this type '''
                    pass
            #if fw_model_list is not None:
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to execute unsupervised anomaly detection models
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param amode [ANOMALIES]
    # @param objective_column string indicating objective column
    # @return analysis_id,(framework, Ordered[(algorithm_metadata.json, normalizations_sets.json)])

    def analysisanomalies(self, dataframe_metadata, objective_column, amode):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            self.base_iteration(amode, dataframe_metadata, objective_column)
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    # Modified 31/08/2017
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        if model_type not in best_models:
                            #print("Trace:%s-%s"%(model_type, best_models))
                            fw_model_list.extend(
                                self.optimize_models(
                                    self.analysis_recommendation_order[indexer]
                                ))
                            best_models.append(model_type)
                            # End - Modified 31/08/2017
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    pass and look for the next best model of this type '''
                    pass
            #if fw_model_list is not None:
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to execute unsupervised clustering models
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param amode [CLUSTERING]
    # @param objective_column string indicating objective column
    # @return analysis_id,(framework, Ordered[(algorithm_metadata.json, normalizations_sets.json)])

    def analysisclustering(self, dataframe_metadata, objective_column, amode):
        self.next_analysis_list.clear()
        if self.deepness == 1:
            self.base_iteration(amode, dataframe_metadata, objective_column)
        elif self.deepness > self.deep_impact:
            self.next_analysis_list = None
        elif self.next_analysis_list is not None:
            fw_model_list = list()
            # Added 31/08/2017
            best_models = list()
            # End - Added 31/08/2017
            aux_loop_controller = len(self.analysis_recommendation_order)
            for indexer in range(0, aux_loop_controller):
                try:
                    # Modified 31/08/2017
                    model = self.analysis_recommendation_order[indexer]
                    if model['status'] == 'Executed':
                        model_type = model['model_parameters'][get_model_fw(
                            model)]['model']
                        #if model_type not in best_models:
                        #print("Trace:%s-%s"%(model_type, best_models))
                        fw_model_list.extend(
                            self.optimize_models(
                                self.analysis_recommendation_order[indexer]))
                        #best_models.append(model_type)
                        # End - Modified 31/08/2017
                except TypeError:
                    ''' If optimize_models doesn't return new models,
                    register the type as evaluated and selected '''
                    best_models.append(model_type)
            #if fw_model_list is not None:
            self.next_analysis_list.extend(fw_model_list)
            if len(self.next_analysis_list) == 0:
                self.next_analysis_list = None
        self.deepness += 1
        return self._ec.get_id_analysis(), self.next_analysis_list

    ## Method oriented to generate specific candidate metadata
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param list_ar_metadata
    def base_specific(self, dataframe_metadata, list_ar_metadata):
        version = self._ec.config.get_config()['common']['version']
        for ar_metadata in list_ar_metadata:

            ar_structure = ArMetadata()
            if ar_metadata['dataset_hash_value'] == self.hash_dataframe:
                self._ec.set_id_analysis(ar_metadata['model_id'])
                ar_structure['predecessor'] = ar_metadata['model_parameters'][get_model_fw(ar_metadata)] \
                    ['parameters']['model_id']['value']
                ar_structure['round'] = int(ar_metadata['round']) + 1
            else:
                ar_structure['predecessor'] = 'root'

            ar_structure['model_id'] = self._ec.get_id_analysis()
            ar_structure['version'] = version
            ar_structure['user_id'] = self._ec.get_id_user()
            ar_structure['workflow_id'] = ar_metadata['workflow_id']
            ar_structure['objective_column'] = ar_metadata['objective_column']
            ar_structure['timestamp'] = self.timestamp
            ar_structure['normalizations_set'] = ar_metadata[
                'normalizations_set']
            ar_structure['dataset'] = self.dataframe_name
            ar_structure['dataset_hash_value'] = self.hash_dataframe
            ar_structure['data_initial'] = dataframe_metadata
            ar_structure['data_normalized'] = None
            ar_structure['model_parameters'] = ar_metadata['model_parameters']
            ar_structure['ignored_parameters'] = None
            ar_structure['full_parameters_stack'] = None
            ar_structure['status'] = -1
            self.next_analysis_list.append(ar_structure)
            self.analyzed_models.append(
                self.generate_vectors(ar_structure,
                                      ar_metadata['normalizations_set']))

    ## Method oriented to select initial candidate models
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param amode [POC, NORMAL, FAST, PARANOIAC, FAST_PARANOIAC]
    # @param objective_column string indicating objective column
    def base_iteration(self, amode, dataframe_metadata, objective_column):
        version = self._ec.config.get_config()['common']['version']
        supervised = True
        if objective_column is None:
            supervised = False

        increment = self.get_size_increment(dataframe_metadata)
        fw_model_list = self.get_candidate_models(self.an_objective,
                                                  amode,
                                                  increment=increment)

        aux_model_list = list()
        norm = Normalizer(self._ec)
        #modified 11/09/2017
        #minimal_nmd = [norm.define_minimal_norm(objective_column=objective_column)]
        minimal_nmd = norm.define_minimal_norm(
            dataframe_metadata=dataframe_metadata,
            objective_column=objective_column,
            an_objective=self.an_objective)
        for fw, model, _ in fw_model_list:
            aux_model_list.append((fw, model, deepcopy(minimal_nmd)))
        fw_model_list = aux_model_list

        self.applicability(fw_model_list,
                           nrows=dataframe_metadata['rowcount'],
                           ncols=dataframe_metadata['cols'])

        nmd = norm.define_normalizations(dataframe_metadata=dataframe_metadata,
                                         objective_column=objective_column,
                                         an_objective=self.an_objective)

        if nmd is not None:
            nmdlist = list()
            for fw, model, _ in fw_model_list:
                if minimal_nmd is not None and len(minimal_nmd) > 0:
                    whole_nmd = deepcopy(minimal_nmd)
                    whole_nmd.extend(deepcopy(nmd))
                    nmdlist.append((fw, model, whole_nmd))
                else:
                    nmdlist.append((fw, model, deepcopy(nmd)))

            fw_model_list.extend(nmdlist)

        for fw, model_params, norm_sets in fw_model_list:
            #Included 26/05/2018: Changeset: "only_standardize"
            if not(norm_sets is not None and len(norm_sets) > 0 and compare_sorted_list_dict(norm_sets, minimal_nmd) \
                   and model_params['only_standardize'])\
               or ((norm_sets is None or len(norm_sets) == 0) and model_params['only_standardize']):
                ar_structure = ArMetadata()
                ar_structure['model_id'] = self._ec.get_id_analysis()
                ar_structure['version'] = version
                ar_structure['user_id'] = self._ec.get_id_user()
                ar_structure['workflow_id'] = self._ec.get_id_workflow()
                ar_structure['objective_column'] = objective_column
                ar_structure['timestamp'] = self.timestamp
                ar_structure['normalizations_set'] = norm_sets
                ar_structure['dataset'] = self.dataframe_name
                ar_structure['dataset_hash_value'] = self.hash_dataframe
                ar_structure['data_initial'] = dataframe_metadata
                ar_structure['data_normalized'] = None
                ar_structure['model_parameters'] = OrderedDict()
                ar_structure['model_parameters'][fw] = model_params
                ar_structure['ignored_parameters'] = None
                ar_structure['full_parameters_stack'] = None
                ar_structure['predecessor'] = 'root'
                ar_structure['status'] = -1
                self.next_analysis_list.append(ar_structure)
                self.analyzed_models.append(
                    self.generate_vectors(ar_structure, norm_sets))

    ## Method oriented to get frameworks default values from config
    # @param self object pointer
    # @return FrameWorkMetadata
    def load_frameworks(self):
        return FrameworkMetadata(self._ec)

    ## Method oriented to analyze DFMetadata and select the analysis objective
    # @param self object pointer
    # @param dataframe_metadata DFMetadata()
    # @param objective_column string indicating objective column
    # @param atype atypes constants or None
    # @return ArType or None if objective_column not found
    def get_analysis_objective(self,
                               dataframe_metadata,
                               objective_column,
                               atype=None):
        config = self._config['AdviserStart_rules']['common']
        for each_column in dataframe_metadata['columns']:
            if each_column['name'] == objective_column:

                if each_column['missed'] != 0:
                    cardinality = int(each_column['cardinality']) - 1
                else:
                    cardinality = int(each_column['cardinality'])

                if cardinality == 2 and (atype == 'binomial' or atype is None):
                    if atype is not None:
                        self._logging.log_info(self._ec.get_id_analysis(),
                                               'AdviserAStar',
                                               self._labels["sucess_specific"],
                                               '%s-%s' % (cardinality, atype))
                    return ATypesMetadata(binomial=True)
                elif atype is not None:
                    if atype == 'regression':
                        self._logging.log_info(self._ec.get_id_analysis(),
                                               'AdviserAStar',
                                               self._labels["sucess_specific"],
                                               '%s-%s' % (cardinality, atype))
                        return ATypesMetadata(regression=True)
                    if atype == 'multinomial':
                        self._logging.log_info(self._ec.get_id_analysis(),
                                               'AdviserAStar',
                                               self._labels["sucess_specific"],
                                               '%s-%s' % (cardinality, atype))
                        return ATypesMetadata(multinomial=True)
                    else:
                        self._logging.log_info(self._ec.get_id_analysis(),
                                               'AdviserAStar',
                                               self._labels["failed_specific"],
                                               '%s-%s' % (cardinality, atype))

                if each_column['type'] not in DTYPES:
                    if cardinality > 2:
                        return ATypesMetadata(multinomial=True)
                elif cardinality <= config['multi_cardinality_limit'] \
                        and cardinality <= (dataframe_metadata['rowcount']*config['multi_limit']):
                    return ATypesMetadata(multinomial=True)
                else:
                    return ATypesMetadata(regression=True)

                self._logging.log_critical(self._ec.get_id_analysis(),
                                           'AdviserAStar',
                                           self._labels["failed_mselection"],
                                           '%s-%s' % (cardinality, atype))
        return None
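
Reduced to its essentials, the rule above maps the objective column's cardinality and type to an analysis type. A compact standalone sketch of that decision (the threshold values here are assumptions, not the shipped AdviserStart_rules config):

# Illustrative sketch: cardinality-driven analysis-type selection.
def guess_analysis_type(cardinality, is_numeric, rowcount,
                        multi_cardinality_limit=50, multi_limit=0.1):
    if cardinality == 2:
        return 'binomial'
    if not is_numeric:
        return 'multinomial' if cardinality > 2 else None
    if cardinality <= multi_cardinality_limit and cardinality <= rowcount * multi_limit:
        return 'multinomial'
    return 'regression'

print(guess_analysis_type(2, True, 1000))    # binomial
print(guess_analysis_type(8, True, 1000))    # multinomial
print(guess_analysis_type(900, True, 1000))  # regression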

    ## Method oriented to get effort increments based on the DFMetadata structure
    # @param self object pointer
    # @param df_metadata DFMetadata
    # @return float increment
    def get_size_increment(self, df_metadata):
        base = self._config['common']['base_increment']
        increment = 1.0
        variabilizations = df_metadata['rowcount'] * df_metadata['cols']
        for _, pvalue in base.items():
            if variabilizations > pvalue['base'] and increment < pvalue[
                    'increment']:
                increment = pvalue['increment']
        self._logging.log_info(self._ec.get_id_analysis(), 'AdviserAStar',
                               self._labels["inc_application"], increment)
        return increment
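
The increment lookup is just a threshold table over rowcount * cols. A standalone sketch (the 'base_increment' values below are made up, not the real common config):

# Illustrative sketch: pick the largest increment whose threshold is exceeded.
def size_increment(rowcount, cols, base_increment):
    variabilizations = rowcount * cols
    increment = 1.0
    for rule in base_increment.values():
        if variabilizations > rule['base'] and increment < rule['increment']:
            increment = rule['increment']
    return increment

base = {'medium': {'base': 10000, 'increment': 1.5},
        'large': {'base': 1000000, 'increment': 2.0}}
print(size_increment(500, 10, base))     # 1.0
print(size_increment(5000, 10, base))    # 1.5
print(size_increment(200000, 10, base))  # 2.0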

    ## Method oriented to choose candidate models for the selected analysis objective
    # @param self object pointer
    # @param atype ATypesMetadata
    # @param amode Analysis mode
    # @param increment increment x size
    # @return list of (framework, model, normalization_set) candidate tuples
    def get_candidate_models(self, atype, amode, increment=1.0):
        defaultframeworks = self.load_frameworks()
        model_list = list()
        for fw, fw_value in defaultframeworks.items():
            if fw_value['conf']['enabled']:
                wfw_module = importlib.import_module(
                    self._frameworks[fw]['conf']['framework_metadata_module'])
                wfw = eval(
                    'wfw_module.' +
                    self._frameworks[fw]['conf']['framework_metadata_class'] +
                    '(defaultframeworks)')
                for each_base_model in wfw.get_default():
                    if each_base_model['enabled']:
                        for each_type in each_base_model['types']:
                            if each_type['active'] and each_type[
                                    'type'] == atype[0]['type']:
                                model_module = importlib.import_module(
                                    self._frameworks[fw]['conf']
                                    ['model_metadata_module'])
                                modelbase = eval('model_module.' +
                                                 self._frameworks[fw]['conf']
                                                 ['model_metadata_class'] +
                                                 '(self._ec)')
                                model = modelbase.generate_models(
                                    each_base_model['model'], atype, amode,
                                    increment)
                                wfw.models.append(model)
                                model_list.append((fw, model, None))
        return model_list
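
The framework and model classes above are resolved at runtime from configured module and class names. A minimal sketch of that pattern, using getattr() rather than eval() and stand-in names (collections.OrderedDict is only a placeholder, not a gDayF class):

import importlib

# Illustrative sketch: load a class from a configured "module path" + "class name".
module_path, class_name = 'collections', 'OrderedDict'  # stand-in configuration values
cls = getattr(importlib.import_module(module_path), class_name)
instance = cls()
print(type(instance).__name__)  # OrderedDict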

    ## Method oriented to check model applicability against min_rows / max_cols limits
    # @param self object pointer
    # @param model_list List[ArMetadata]
    # @param nrows number of rows of dataframe
    # @param ncols number of cols of dataframe
    # @return implicit: model_list is filtered in place
    def applicability(self, model_list, nrows, ncols):
        fw_config = self._ec.config.get_config()['frameworks']
        exclude_model = list()
        for iterator in range(0, len(model_list)):
            fw = model_list[iterator][0]
            model = model_list[iterator][1]
            if fw_config[fw]['conf']['min_rows_enabled'] and (
                    nrows < model['min_rows_applicability']):
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'AdviserAStar',
                    self._labels["exc_applicability"], model['model'] + ' - ' +
                    'rows < ' + str(model['min_rows_applicability']))
                exclude_model.append(model_list[iterator])
            if fw_config[fw]['conf']['max_cols_enabled'] and model['max_cols_applicability'] is not None \
                    and(ncols > model['max_cols_applicability']):
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'AdviserAStar',
                    self._labels["exc_applicability"], model['model'] + ' - ' +
                    'cols > ' + str(model['max_cols_applicability']))
                exclude_model.append(model_list[iterator])
        for model in exclude_model:
            model_list.remove(model)
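
The applicability rule reads as a simple shape filter over candidate metadata. A toy sketch (field names mirror the model metadata keys above; the values are invented):

# Illustrative sketch: drop candidates whose row/column limits don't fit the dataset.
candidates = [
    {'model': 'deep_net', 'min_rows_applicability': 10000, 'max_cols_applicability': None},
    {'model': 'glm', 'min_rows_applicability': 100, 'max_cols_applicability': 500},
]
nrows, ncols = 2000, 40
kept = [m for m in candidates
        if nrows >= m['min_rows_applicability']
        and (m['max_cols_applicability'] is None or ncols <= m['max_cols_applicability'])]
print([m['model'] for m in kept])  # ['glm']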

    ## Method to get train accuracy for a generic model
    # @param model
    # @return accuracy metric, inverse RMSE, objective; or -1.0, -1.0, 1.0 if not available
    @staticmethod
    def get_train_accuracy(model):
        try:
            return float(model['metrics']['accuracy']['train']),\
                   1/float(model['metrics']['execution']['train']['RMSE']),\
                   1.0
        except ZeroDivisionError:
            return float(model['metrics']['accuracy']['train']), \
                   -1.0, \
                   1.0
        except KeyError:
            return -1.0, -1.0, 1.0
        except Exception:
            return -1.0, -1.0, 1.0

    ## Method to get test accuracy for a generic model
    # @param model
    # @return accuracy metric, inverse RMSE, objective; or -1.0, -1.0, 1.0 if not available
    @staticmethod
    def get_test_accuracy(model):
        try:
            return float(model['metrics']['accuracy']['test']),\
                   1/float(model['metrics']['execution']['test']['RMSE']),\
                   1.0
        except ZeroDivisionError:
            return float(model['metrics']['accuracy']['test']), \
                   -1.0, \
                   1.0
        except KeyError:
            return -1.0, -1.0, 1.0
        except Exception:
            return -1.0, -1.0, 1.0

    ## Method to get averaged train and test accuracy for a generic model
    # @param model
    # @return combined accuracy metric, inverse RMSE, objective; or -1.0, -1.0, 1.0 if not available
    @staticmethod
    def get_combined_accuracy(model):
        try:
            return float(model['metrics']['accuracy']['combined']),\
                   1/float(model['metrics']['execution']['train']['RMSE']),\
                   1.0
        except ZeroDivisionError:
            return float(model['metrics']['accuracy']['combined']), \
                   -1.0, \
                   1.0
        except KeyError:
            return -1.0, -1.0, 1.0
        except Exception:
            return -1.0, -1.0, 1.0

    ## Method to get train RMSE for a generic model
    # @param model
    # @return RMSE metric, inverse combined accuracy, objective; or 1e+16, 1e+16, 0.0 if not available
    @staticmethod
    def get_train_rmse(model):
        try:
            # NaN or missing RMSE is treated as a very large error value
            if str(float(model['metrics']['execution']['train']
                         ['RMSE'])).lower() == 'nan':
                rmse = 1e+16
            else:
                rmse = float(model['metrics']['execution']['train']['RMSE'])
            return rmse,\
                   1/float(model['metrics']['accuracy']['combined']),\
                   0.0
        except ZeroDivisionError:
            return rmse,\
                   1e+16, \
                   0.0
        except KeyError:
            return 1e+16, 1e+16, 0.0
        except Exception:
            return 1e+16, 1e+16, 0.0

    ## Method to get test RMSE for a generic model
    # @param model
    # @return RMSE metric, inverse combined accuracy, objective; or 1e+16, 1e+16, 0.0 if not available
    @staticmethod
    def get_test_rmse(model):
        try:
            # NaN or missing RMSE is treated as a very large error value
            if str(float(model['metrics']['execution']['test']
                         ['RMSE'])).lower() == 'nan':
                rmse = 1e+16
            else:
                rmse = float(model['metrics']['execution']['test']['RMSE'])
            return rmse,\
                   1/float(model['metrics']['accuracy']['combined']),\
                   0.0
        except ZeroDivisionError:
            return rmse,\
                   1e+16, \
                   0.0
        except KeyError:
            return 1e+16, 1e+16, 0.0
        except Exception:
            return 1e+16, 1e+16, 0.0

    ## Method to get clustering distance for a generic model
    # @param model
    # @return Total Within-Cluster Sum-of-Squares metric, inverse Between-Cluster Sum-of-Squares,
    # objective; or 1e+16, 1e+16, 0.0 if not available
    @staticmethod
    def get_cdistance(model):
        try:
            return float(model['metrics']['execution']['train']['tot_withinss']), \
                   1/float(model['metrics']['execution']['train']['betweenss']), \
                   0.0
        except ZeroDivisionError:
            return float(model['metrics']['execution']['train']['tot_withinss']), \
                   1e+16, \
                   0.0
        except TypeError:
            return float(model['metrics']['execution']['train']['tot_withinss']), \
                   1e+16, \
                   0.0
        except KeyError:
            return 1e+16, 1e+16, 0.0

    ## Method to get train r2 for a generic model
    # @param model
    # @return r2 metric, inverse RMSE, objective; or -1.0, -1.0, 1.0 if not available
    @staticmethod
    def get_train_r2(model):
        try:
            return float(model['metrics']['execution']['train']['r2']),\
                   1/float(model['metrics']['execution']['train']['RMSE']),\
                   1.0
        except ZeroDivisionError:
            return float(model['metrics']['execution']['train']['r2']), \
                   -1.0, \
                   1.0
        except KeyError:
            return -1.0, -1.0, 1.0
        except Exception:
            return -1.0, -1.0, 1.0

    ## Method to get test r2 for a generic model
    # @param model
    # @return r2 metric, inverse RMSE, objective; or -1.0, -1.0, 1.0 if not available
    @staticmethod
    def get_test_r2(model):
        try:
            return float(model['metrics']['execution']['test']['r2']),\
                   1/float(model['metrics']['execution']['test']['RMSE']),\
                   1.0
        except ZeroDivisionError:
            return float(model['metrics']['execution']['test']['r2']), \
                   -1.0, \
                   1.0
        except KeyError:
            return -1.0, -1.0, 1.0
        except Exception:
            return -1.0, -1.0, 1.0

    ## Method managing scoring algorithm results
    # @param self object pointer
    # @param model_list list of analyzed models (ArMetadata, normalization_set)
    # @return model_list ordered by the configured metric
    def priorize_models(self, model_list):
        if self.metric == 'train_accuracy':
            return sorted(model_list,
                          key=self.get_train_accuracy,
                          reverse=True)
        elif self.metric == 'test_accuracy':
            return sorted(model_list, key=self.get_test_accuracy, reverse=True)
        elif self.metric == 'combined_accuracy':
            return sorted(model_list,
                          key=self.get_combined_accuracy,
                          reverse=True)
        elif self.metric == 'cdistance':
            return sorted(model_list, key=self.get_cdistance)
        elif self.metric == 'train_rmse':
            return sorted(model_list, key=self.get_train_rmse)
        elif self.metric == 'test_rmse':
            return sorted(model_list, key=self.get_test_rmse)
        elif self.metric == 'train_r2':
            return sorted(model_list, key=self.get_train_r2, reverse=True)
        elif self.metric == 'test_r2':
            return sorted(model_list, key=self.get_test_r2, reverse=True)
        else:
            return model_list
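
Because each metric getter returns a tuple, sorted() compares models element by element, so the second value acts as a tie-breaker. A small sketch of that mechanism (toy records, not ArMetadata):

# Illustrative sketch: tuple keys give "accuracy first, inverse RMSE as tie-breaker".
models = [
    {'name': 'm1', 'accuracy': 0.90, 'rmse': 0.30},
    {'name': 'm2', 'accuracy': 0.92, 'rmse': 0.25},
    {'name': 'm3', 'accuracy': 0.92, 'rmse': 0.20},
]
ranked = sorted(models, key=lambda m: (m['accuracy'], 1 / m['rmse']), reverse=True)
print([m['name'] for m in ranked])  # ['m3', 'm2', 'm1']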

    ## Store executed model base parameters to check past executions
    # @param model - ArMetadata to be stored as executed
    # @param normalization_set
    # @return model_vector (fw, model_id, vector, normalization_set)
    def generate_vectors(self, model, normalization_set):
        vector = list()
        norm_vector = list()
        fw = get_model_fw(model)
        for parm, parm_value in model['model_parameters'][fw][
                'parameters'].items():
            if isinstance(parm_value, OrderedDict) and parm != 'model_id':
                vector.append(parm_value['value'])
        #added 31/08/2017
        if normalization_set == [None]:
            norm_vector = normalization_set
        else:
            for normalization in normalization_set:
                norm_vector.append(
                    md5(dumps(normalization).encode('utf8')).hexdigest())
        #print("Trace:%s-%s-%s-%s"%(fw, model['model_parameters'][fw]['model'], vector, norm_vector))
        return fw, model['model_parameters'][fw]['model'], vector, norm_vector
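
Hashing each normalization descriptor keeps the execution vector short while still being comparable. A self-contained sketch of the same fingerprinting step (the descriptor content is invented):

from hashlib import md5
from json import dumps

# Illustrative sketch: fingerprint a normalization descriptor as generate_vectors does.
normalization = {'class': 'standardize', 'objective': {'column': 'price'}}
fingerprint = md5(dumps(normalization).encode('utf8')).hexdigest()
print(fingerprint)  # a 32-character hex digest identifying this normalization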

    ## Check if a model has been executed or is planned to be executed
    # @param vector - model vector
    # @return True if executed, False otherwise
    def is_executed(self, vector):
        aux_analyzed_models = deepcopy(self.analyzed_models)
        analyzed = False
        while not analyzed and len(aux_analyzed_models) > 0:
            analyzed = analyzed or self.compare_vectors(
                vector, aux_analyzed_models.pop())
        return analyzed

    ## Compare two execution vectors
    # @param vector1 - model_execution vector
    # @param vector2 - model_execution vector
    # @return True if equal, False otherwise
    @staticmethod
    def compare_vectors(vector1, vector2):
        return vector1[0] == vector2[0] and vector1[1] == vector2[1] \
               and vector1[2] == vector2[2] and vector1[3] == vector2[3]

    ## Check if a model was previously executed. If not, append it to the list
    # @param model_list
    # @param model json compatible
    def safe_append(self, model_list, model):
        vector = self.generate_vectors(model, model['normalizations_set'])
        if not self.is_executed(vector):
            model_list.append(model)
            self.analyzed_models.append(vector)
            self._logging.log_info(self._ec.get_id_analysis(), 'AdviserAStar',
                                   self._labels["new_vector"], str(vector))
        else:
            self.excluded_models.append(vector)
            self._logging.log_info(self._ec.get_id_analysis(), 'AdviserAStar',
                                   self._labels["exc_vector"], str(vector))