Example #1
 def save_models(self, arlist, mode=BEST, metric='accuracy'):
     if mode == BEST:
         model_list = [arlist[0]]
     elif mode == BEST_3:
         model_list = arlist[0:3]
     elif mode == EACH_BEST:
         exclusion = list()
         model_list = list()
         for model in arlist:
             if (get_model_fw(model), model['model_parameters'][
                     get_model_fw(model)]['model'],
                     model['normalizations_set']) not in exclusion:
                 model_list.append(model)
                 exclusion.append(
                     (get_model_fw(model), model['model_parameters'][
                         get_model_fw(model)]['model'],
                      model['normalizations_set']))
     elif mode == ALL:
         model_list = arlist
     elif mode == NONE:
         model_list = list()
     else:
         # Guard against unrecognized modes; otherwise model_list is unbound below.
         model_list = list()
     for fw in self._config['frameworks'].keys():
         self.init_handler(fw)
         for each_model in model_list:
             if fw in each_model['model_parameters'].keys():
                 self.model_handler[fw]['handler'].store_model(
                     each_model, user=self._ec.get_id_user())
         self.clean_handler(fw)
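
The EACH_BEST branch keeps only the first, i.e. best-ranked, entry per (framework, algorithm, normalizations) combination. A minimal standalone sketch of that de-duplication pattern, using hypothetical ranked entries and a set in place of the exclusion list:

    # Illustrative only: keep the first occurrence per identity key.
    ranked = [
        {'fw': 'h2o', 'algo': 'GBM', 'norm': None, 'score': 0.91},
        {'fw': 'h2o', 'algo': 'GBM', 'norm': None, 'score': 0.88},
        {'fw': 'spark', 'algo': 'RF', 'norm': 'standardize', 'score': 0.87},
    ]
    seen = set()
    each_best = list()
    for entry in ranked:
        key = (entry['fw'], entry['algo'], entry['norm'])
        if key not in seen:
            seen.add(key)
            each_best.append(entry)
    # each_best now holds the 0.91 GBM and the RF: one model per combination.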
Example #2
    def base_specific(self, dataframe_metadata, list_ar_metadata):
        version = self._ec.config.get_config()['common']['version']
        for ar_metadata in list_ar_metadata:

            ar_structure = ArMetadata()
            if ar_metadata['dataset_hash_value'] == self.hash_dataframe:
                self._ec.set_id_analysis(ar_metadata['model_id'])
                ar_structure['predecessor'] = ar_metadata['model_parameters'][get_model_fw(ar_metadata)] \
                    ['parameters']['model_id']['value']
                ar_structure['round'] = int(ar_metadata['round']) + 1
            else:
                ar_structure['predecessor'] = 'root'

            ar_structure['model_id'] = self._ec.get_id_analysis()
            ar_structure['version'] = version
            ar_structure['user_id'] = self._ec.get_id_user()
            ar_structure['workflow_id'] = ar_metadata['workflow_id']
            ar_structure['objective_column'] = ar_metadata['objective_column']
            ar_structure['timestamp'] = self.timestamp
            ar_structure['normalizations_set'] = ar_metadata[
                'normalizations_set']
            ar_structure['dataset'] = self.dataframe_name
            ar_structure['dataset_hash_value'] = self.hash_dataframe
            ar_structure['data_initial'] = dataframe_metadata
            ar_structure['data_normalized'] = None
            ar_structure['model_parameters'] = ar_metadata['model_parameters']
            ar_structure['ignored_parameters'] = None
            ar_structure['full_parameters_stack'] = None
            ar_structure['status'] = -1
            self.next_analysis_list.append(ar_structure)
            self.analyzed_models.append(
                self.generate_vectors(ar_structure,
                                      ar_metadata['normalizations_set']))
Example #3
 def get_external_model(self, armetadata, type='pojo'):
     fw = get_model_fw(armetadata)
     self.init_handler(fw)
     results = self.model_handler[fw]['handler'].get_external_model(
         armetadata, type)
     self.clean_handler(fw)
     return results
Example #4
    def remove_models(self, arlist, mode=ALL):
        if mode == BEST:
            model_list = arlist[1:]
        elif mode == BEST_3:
            model_list = arlist[3:]
        elif mode == EACH_BEST:
            exclusion = list()
            model_list = list()
            for model in arlist:
                if (get_model_fw(model), model['model_parameters'][
                        get_model_fw(model)]['model'],
                        model['normalizations_set']) not in exclusion:
                    exclusion.append(
                        (get_model_fw(model), model['model_parameters'][
                            get_model_fw(model)]['model'],
                         model['normalizations_set']))
                else:
                    model_list.append(model)
        elif mode == ALL:
            model_list = arlist
        elif mode == NONE:
            model_list = list()
        else:
            # Guard against unrecognized modes; otherwise model_list is unbound below.
            model_list = list()
        fw_list = list()
        for models in model_list:
            if get_model_fw(models) not in fw_list:
                fw_list.append(get_model_fw(models))

        for fw in fw_list:
            self.init_handler(fw)
            self.model_handler[fw]['handler'].remove_models(model_list)
            self.clean_handler(fw)
Example #5
    def optimize_models(self, armetadata):
        # Resolve the metric accessor (get_accuracy, get_rmse, ...) dynamically
        # with getattr instead of eval.
        metric_value, _, objective = getattr(self, 'get_' + self.metric)(armetadata)
        engine = get_model_fw(armetadata)
        if engine in self._frameworks:
            optimizer_engine = importlib.import_module(self._frameworks[engine]['conf']['optimization_method'])
            model_list = optimizer_engine.Optimizer(self._ec).optimize_models(armetadata=armetadata,
                                                                              metric_value=metric_value,
                                                                              objective=objective,
                                                                              deepness=self.deepness,
                                                                              deep_impact=self.deep_impact)
            optimized_model_list = list()
            for model in model_list:
                self.safe_append(optimized_model_list, model)

            return optimized_model_list
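
The getattr call above resolves the metric accessor by name at runtime. A standalone sketch of that dispatch pattern, with a hypothetical Scorer class standing in for the adviser:

    class Scorer:
        # Hypothetical accessor; the real ones read armetadata['metrics'].
        def get_accuracy(self, armetadata):
            return 0.93, None, 1.0

    metric = 'accuracy'
    metric_value, _, objective = getattr(Scorer(), 'get_' + metric)(None)
    assert (metric_value, objective) == (0.93, 1.0)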
Example #6
 def generate_vectors(self, model, normalization_set):
     vector = list()
     norm_vector = list()
     fw = get_model_fw(model)
     for parm, parm_value in model['model_parameters'][fw][
             'parameters'].items():
         if isinstance(parm_value, OrderedDict) and parm != 'model_id':
             vector.append(parm_value['value'])
     # Added 31/08/2017
     if normalization_set == [None]:
         norm_vector = normalization_set
     else:
         for normalization in normalization_set:
             norm_vector.append(
                 md5(dumps(normalization).encode('utf8')).hexdigest())
     #print("Trace:%s-%s-%s-%s"%(fw, model['model_parameters'][fw]['model'], vector, norm_vector))
     return fw, model['model_parameters'][fw]['model'], vector, norm_vector
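
Each normalization in the set is identified by the MD5 digest of its JSON serialization, so equal pipelines always produce equal vector entries. A standalone sketch, assuming OrderedDict-based normalization entries like the ones gDayF stores:

    from collections import OrderedDict
    from hashlib import md5
    from json import dumps

    norm_a = OrderedDict([('standardize', True), ('objective', 'mean')])
    norm_b = OrderedDict([('standardize', True), ('objective', 'mean')])

    def digest(norm):
        return md5(dumps(norm).encode('utf8')).hexdigest()

    # Identical OrderedDicts serialize identically, hence identical digests.
    assert digest(norm_a) == digest(norm_b)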
Example #7
 def copy_template(self, increment=1):
     new_model = ArMetadata()
     new_model['model_id'] = deepcopy(self['model_id'])
     new_model['version'] = deepcopy(self['version'])
     new_model['workflow_id'] = deepcopy(self['workflow_id'])
     new_model['user_id'] = deepcopy(self['user_id'])
     new_model['type'] = deepcopy(self['type'])
     new_model['objective_column'] = deepcopy(self['objective_column'])
     new_model['timestamp'] = deepcopy(self['timestamp'])
     new_model['round'] = self['round'] + increment
     new_model['execution_seconds'] = 0.0
     new_model['tolerance'] = 0.0
     new_model['predecessor'] = self['model_parameters'][get_model_fw(
         self)]['parameters']['model_id']['value']
     new_model['normalizations_set'] = deepcopy(self['normalizations_set'])
     new_model['dataset'] = deepcopy(self['dataset'])
     new_model['dataset_hash_value'] = deepcopy(self['dataset_hash_value'])
     new_model['data_initial'] = deepcopy(self['data_initial'])
     new_model['data_normalized'] = deepcopy(self['data_normalized'])
     new_model['model_parameters'] = deepcopy(self['model_parameters'])
     new_model['ignored_parameters'] = deepcopy(self['ignored_parameters'])
     return new_model
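
copy_template is what lets the optimizers spawn children: the copy's 'predecessor' points at the parent's model_id and 'round' is incremented, which reconstruct_execution_tree (Example #10) later uses to rebuild the lineage. A plain-dict sketch of that parent/child relationship:

    from copy import deepcopy

    parent = {'model_id': 'm-001', 'round': 1,
              'params': {'ntrees': {'value': 50}}}
    child = deepcopy(parent)
    child['predecessor'] = parent['model_id']
    child['round'] = parent['round'] + 1
    child['params']['ntrees']['value'] *= 2   # mutate the copy only
    assert parent['params']['ntrees']['value'] == 50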
Example #8
 def analysis_specific(self, dataframe_metadata, list_ar_metadata):
     self.next_analysis_list.clear()
     if self.deepness == 1:
         # Check dataframe_metadata compatibility
         self.base_specific(dataframe_metadata, list_ar_metadata)
     # Added 22/09/1974
     elif self.deepness > self.deep_impact:
         self.next_analysis_list = None
     elif self.next_analysis_list is not None:
         fw_model_list = list()
         # Added 31/08/2017
         best_models = list()
         # End - Added 31/08/2017
         aux_loop_controller = len(self.analysis_recommendation_order)
         for indexer in range(0, aux_loop_controller):
             try:
                 # Modified 31/08/2017
                 model = self.analysis_recommendation_order[indexer]
                 if model['status'] == 'Executed':
                     model_type = model['model_parameters'][get_model_fw(
                         model)]['model']
                     if model_type not in best_models:
                         fw_model_list.extend(
                             self.optimize_models(
                                 self.analysis_recommendation_order[indexer]
                             ))
                         #print("Trace:%s-%s" % (model_type, best_models))
                         best_models.append(model_type)
                         # End - Modified 31/08/2017
             except TypeError:
                 ''' If optimize_models doesn't return new models,
                 skip it and look for the next best model of this type '''
                 pass
         # if fw_model_list is not None:
         self.next_analysis_list.extend(fw_model_list)
         if len(self.next_analysis_list) == 0:
             self.next_analysis_list = None
     self.deepness += 1
     return self._ec.get_id_analysis(), self.next_analysis_list
Example #9
    def optimize_models(self, armetadata, metric_value, objective, deepness, deep_impact):
        model_list = list()
        model = armetadata['model_parameters'][get_model_fw(armetadata)]
        config = self._config
    
        if get_model_fw(armetadata) == 'spark' and metric_value != objective \
                and armetadata['status'] != self._labels['failed_op']:
            try:
                scoring_metric = decode_ordered_dict_to_dataframe(armetadata['metrics']['scoring'])
            except ValueError:
                print("TRACE: Not scoring: " + model)
            min_rows_limit = config['min_rows_limit']
            min_rows_increment = config['min_rows_increment']
            max_interactions_increment = config['max_interactions_increment']
            interactions_increment = config['interactions_increment']
            max_depth_increment = config['max_depth_increment']
            ntrees_increment = config['ntrees_increment']
            stepSize = config['stepSize']
            aggregationDepth_increment = config['aggregationDepth_increment']
            regParam = config['regParam']
            elastic_variation = config['elastic_variation']
            nv_smoothing = config['nv_smoothing']
            nv_improvement = config['nv_improvement']
            nv_divisor = config['nv_divisor']
            clustering_increment = config['clustering_increment']
            initstep_increment = config['initstep_increment']
    
            if model['model'] == 'LinearSVC':
                if deepness == 2 and len(regParam) != 0:
                    for elastic in regParam:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['regParam']['value'] = elastic['value']
                        model_list.append(new_armetadata)
    
                try:
                    if model['parameters']['maxIter']['value'] \
                            >= scoring_metric['totalIterations'][0] and \
                            scoring_metric['totalIterations'][0] <= max_interactions_increment:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['maxIter']['value'] *= interactions_increment
                        model_list.append(new_armetadata)
                except KeyError:
                    if model['parameters']['maxIter']['value'] \
                            <= max_interactions_increment:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['maxIter']['value'] *= interactions_increment
                        model_list.append(new_armetadata)
    
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['aggregationDepth']['value'] *= aggregationDepth_increment
                model_list.append(new_armetadata)
    
            elif model['model'] == 'LogisticRegression' or model['model'] == 'LinearRegression':
                if deepness == 2 and len(regParam) != 0:
                    for elastic in regParam:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['regParam']['value'] = elastic['value']
                        model_list.append(new_armetadata)
    
                if model['parameters']['elasticNetParam']['value'] \
                        * (1 + elastic_variation) <= 1.0:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['elasticNetParam']['value'] = \
                        model_aux['parameters']['elasticNetParam']['value'] * (1 + elastic_variation)
                    model_list.append(new_armetadata)
                if model['parameters']['elasticNetParam']['value'] \
                        * (1 - elastic_variation) >= 0.0:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['elasticNetParam']['value'] = \
                        model_aux['parameters']['elasticNetParam']['value'] * (1 - elastic_variation)
                    model_list.append(new_armetadata)
    
                try:
                    if model['parameters']['maxIter']['value'] \
                            >= scoring_metric['totalIterations'][0] and \
                            scoring_metric['totalIterations'][0] <= max_interactions_increment:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['maxIter']['value'] *= interactions_increment
                        model_list.append(new_armetadata)
                except KeyError:
                    if model['parameters']['maxIter']['value'] \
                            <= max_interactions_increment:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['maxIter']['value'] *= interactions_increment
                        model_list.append(new_armetadata)
    
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['aggregationDepth']['value'] *= aggregationDepth_increment
                model_list.append(new_armetadata)
    
            elif model['model'] == 'DecisionTreeClassifier' or model['model'] == 'DecisionTreeRegressor':

                if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['minInstancesPerNode']['value'] = round(
                        model_aux['parameters']['minInstancesPerNode']['value']
                        / min_rows_increment, 0)
                    model_list.append(new_armetadata)
    
                if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxDepth']['value'] = \
                        model_aux['parameters']['maxDepth']['value'] * max_depth_increment
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'GBTRegressor':
                if deepness == 2 and len(stepSize) != 0 and len(eval(model['parameters']['lossType']['type'])) != 0:
                    for stepsize in stepSize:
                        for element in eval(model['parameters']['lossType']['type']):
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters']['spark']
                            model_aux['parameters']['lossType']['value'] = element
                            model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                            model_list.append(new_armetadata)
                elif deepness == 2 and len(stepSize) != 0:
                    for stepsize in stepSize:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                        model_list.append(new_armetadata)
                elif deepness == 2 and len(eval(model['parameters']['lossType']['type'])) != 0:
                    for element in eval(model['parameters']['lossType']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['lossType']['value'] = element
                        model_list.append(new_armetadata)
    
                if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['minInstancesPerNode']['value'] = round(
                        model_aux['parameters']['minInstancesPerNode']['value']
                        / min_rows_increment, 0)
                    model_list.append(new_armetadata)
    
                # 05/07/2018. Included platform base restriction maxDepth <=30
                if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value'] \
                        and model['parameters']['maxDepth']['value'] != 30:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    if model_aux['parameters']['maxDepth']['value'] * max_depth_increment > 30:
                        model_aux['parameters']['maxDepth']['value'] = 30
                    else:
                        model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
    
                    model_list.append(new_armetadata)
    
                if scoring_metric['trees'][0] >= model['parameters']['maxIter']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= ntrees_increment
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'GBTClassifier':
                if deepness == 2 and len(stepSize) != 0 and len(eval(model['parameters']['lossType']['type'])) != 0:
                    for stepsize in stepSize:
                        for element in eval(model['parameters']['lossType']['type']):
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters']['spark']
                            model_aux['parameters']['lossType']['value'] = element
                            model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                            model_list.append(new_armetadata)
                elif deepness == 2 and len(stepSize) != 0:
                    for stepsize in stepSize:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                        model_list.append(new_armetadata)
                elif deepness == 2 and len(eval(model['parameters']['lossType']['type'])) != 0:
                    for element in eval(model['parameters']['lossType']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['lossType']['value'] = element
                        model_list.append(new_armetadata)
    
                if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['minInstancesPerNode']['value'] = round(
                        model_aux['parameters']['minInstancesPerNode']['value']
                        / min_rows_increment, 0)
                    model_list.append(new_armetadata)
    
                if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
                    model_list.append(new_armetadata)
    
                if scoring_metric['trees'][0] >= model['parameters']['maxIter']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= ntrees_increment
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'RandomForestClassifier' or model['model'] == 'RandomForestRegressor':
    
                if deepness == 2 and len(eval(model['parameters']['featureSubsetStrategy']['type'])) != 0 \
                        and len(eval(model['parameters']['impurity']['type'])) != 0:
                    for featuresubsetstrategy in eval(model['parameters']['featureSubsetStrategy']['type']):
                        for element in eval(model['parameters']['impurity']['type']):
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters']['spark']
                            model_aux['parameters']['impurity']['value'] = element
                            model_aux['parameters']['featureSubsetStrategy']['value'] = featuresubsetstrategy
                            model_list.append(new_armetadata)
                elif deepness == 2 and len(eval(model['parameters']['featureSubsetStrategy']['type'])) != 0:
                    for featuresubsetstrategy in eval(model['parameters']['featureSubsetStrategy']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['featureSubsetStrategy']['value'] = featuresubsetstrategy
                        model_list.append(new_armetadata)
                elif deepness == 2 and len(eval(model['parameters']['impurity']['type'])) != 0:
                    for element in eval(model['parameters']['impurity']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['impurity']['value'] = element
                        model_list.append(new_armetadata)
    
                if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['minInstancesPerNode']['value'] = round(
                        model_aux['parameters']['minInstancesPerNode']['value']
                        / min_rows_increment, 0)
                    model_list.append(new_armetadata)
    
                if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
                    model_list.append(new_armetadata)
    
                if scoring_metric['trees'][0] >= model['parameters']['numTrees']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['numTrees']['value'] *= ntrees_increment
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'GeneralizedLinearRegression':
                if deepness == 2 and len(regParam) != 0:
                    for elastic in regParam:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['regParam']['value'] = elastic['value']
                        model_list.append(new_armetadata)
                if deepness == 2:
                    if model['parameters']['family']['value'] in ['gaussian', 'gamma']:
                        linklist = ['log', 'inverse']
                    elif model['parameters']['family']['value'] in ['poisson']:
                        linklist = ['log', 'sqrt']
                    else:
                        # Any other family (e.g. 'tweedie'): no alternative links.
                        linklist = []
                    for linkin in linklist:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['link']['value'] = linkin
                        model_list.append(new_armetadata)
    
                if model['parameters']['maxIter']['value'] \
                        <= max_interactions_increment:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= interactions_increment
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'NaiveBayes':
                if deepness == 2 and len(nv_smoothing) != 0:
                    for elastic in nv_smoothing:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['smoothing']['value'] = elastic['value']
                        model_list.append(new_armetadata)
    
                for adjusting in ['improvement', 'decrement']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    if adjusting == 'improvement':
                        model_aux['parameters']['smoothing']['value'] = model_aux['parameters']['smoothing'][
                                                                            'value'] * (1 + nv_improvement)
                    else:
                        model_aux['parameters']['smoothing']['value'] = model_aux['parameters']['smoothing'][
                                                                            'value'] * (1 - nv_divisor)
                    model_list.append(new_armetadata)
    
            elif model['model'] == 'BisectingKMeans':
    
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxIter']['value'] = \
                    int(model_aux['parameters']['maxIter']['value'] * clustering_increment)
                model_list.append(new_armetadata)
    
            elif model['model'] == 'KMeans':
    
                if deepness == 2 and len(eval(model['parameters']['initMode']['type'])) != 0:
                    for element in eval(model['parameters']['initMode']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['initMode']['value'] = element
                        model_list.append(new_armetadata)
    
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxIter']['value'] = \
                    int(model_aux['parameters']['maxIter']['value'] * clustering_increment)
                model_list.append(new_armetadata)

            else:
                return None
    
            if len(model_list) == 0:
                return None
            else:
                return model_list
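
Every branch above follows the same copy-then-mutate pattern: clone the parent metadata with copy_template() and change one or two hyperparameters per candidate. A standalone sketch with plain dicts, where deepcopy stands in for copy_template and the growth factors are hypothetical:

    from copy import deepcopy

    base = {'parameters': {'maxIter': {'value': 100}}}
    candidates = []
    for factor in (2, 4):                   # hypothetical increments
        cand = deepcopy(base)
        cand['parameters']['maxIter']['value'] *= factor
        candidates.append(cand)
    assert [c['parameters']['maxIter']['value'] for c in candidates] == [200, 400]
    assert base['parameters']['maxIter']['value'] == 100   # parent untouched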
Example #10
    def reconstruct_execution_tree(self,
                                   arlist=None,
                                   metric='combined',
                                   store=True):
        if (arlist is None
                or len(arlist) == 0) and self._ec.get_id_analysis() is None:
            self._logging.log_critical('gDayF', 'controller',
                                       self._labels["failed_model"])
            return None
        elif self._ec.get_id_analysis(
        ) is not None and self._ec.get_id_user() != 'guest':
            new_arlist = PersistenceHandler(
                self._ec).recover_experiment_mongoDB()
        else:
            analysis_id = arlist[0]['model_id']
            new_arlist = arlist

        ordered_list = self.priorize_list(arlist=new_arlist, metric=metric)

        root = OrderedDict()
        root['data'] = None
        root['ranking'] = 0
        root['successors'] = OrderedDict()
        variable_dict = OrderedDict()
        variable_dict[0] = {'root': root}

        ranking = 1
        for new_tree_structure in ordered_list:
            new_model = deep_ordered_copy(new_tree_structure)
            model_id = new_tree_structure['model_parameters'][get_model_fw(new_tree_structure)]\
                                         ['parameters']['model_id']['value']
            level = new_tree_structure['round']
            if level not in variable_dict.keys():
                variable_dict[level] = OrderedDict()

            new_tree_structure = OrderedDict()
            new_tree_structure['ranking'] = ranking
            new_tree_structure['data'] = new_model
            new_tree_structure['successors'] = OrderedDict()
            variable_dict[level][model_id] = new_tree_structure

            ranking += 1

        level = 1
        max_level = max(variable_dict.keys())
        while level in range(1, max_level + 1):
            for model_id, new_tree_structure in variable_dict[level].items():
                counter = 1
                found = False
                while not found and (level - counter) >= 0:
                    if new_tree_structure['data'][
                            'predecessor'] in variable_dict[level -
                                                            counter].keys():
                        container = variable_dict[level - counter][
                            new_tree_structure['data']['predecessor']]
                        container['successors'][model_id] = new_tree_structure
                        found = True
                    counter += 1
                if not found:
                    self._logging.log_debug(self._ec.get_id_analysis(),
                                            'controller',
                                            self._labels['fail_reconstruct'],
                                            model_id)
            level += 1

        # Store JSON on the primary path
        if store and self._config['storage']['primary_path'] != 'mongoDB':
            primary_path = self._config['storage']['primary_path']
            fstype = self._config['storage'][primary_path]['type']

            datafile = list()
            datafile.append(self._config['storage'][primary_path]['value'])
            datafile.append('/')
            datafile.append(self._ec.get_id_user())
            datafile.append('/')
            datafile.append(self._ec.get_id_workflow())
            datafile.append('/')
            datafile.append(self._config['common']['execution_tree_dir'])
            datafile.append('/')
            datafile.append(self._ec.get_id_analysis())
            datafile.append('.json')

            if self._config['persistence']['compress_json']:
                datafile.append('.gz')

            storage = StorageMetadata(self._ec)
            storage.append(value=''.join(datafile), fstype=fstype)
            PersistenceHandler(self._ec).store_json(storage, root)
        return root
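
The linking pass walks each node's 'predecessor' id backwards through lower rounds until the parent, or the synthetic 'root', is found. A standalone sketch of that step with plain dicts:

    levels = {
        0: {'root': {'successors': {}}},
        1: {'m-001': {'data': {'predecessor': 'root'}, 'successors': {}}},
        2: {'m-002': {'data': {'predecessor': 'm-001'}, 'successors': {}}},
    }
    for level in (1, 2):
        for model_id, node in levels[level].items():
            counter, found = 1, False
            while not found and (level - counter) >= 0:
                parents = levels[level - counter]
                if node['data']['predecessor'] in parents:
                    parents[node['data']['predecessor']]['successors'][model_id] = node
                    found = True
                counter += 1
    assert 'm-002' in levels[1]['m-001']['successors']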
Example #11
 def analysisnormal(self, dataframe_metadata, objective_column, amode):
     self.next_analysis_list.clear()
     if self.deepness == 1:
         self.base_iteration(amode, dataframe_metadata, objective_column)
     elif self.deepness > self.deep_impact:
         self.next_analysis_list = None
     elif self.deepness == 2:
         fw_model_list = list()
         # Added 31/08/2017
         best_models = list()
         # End - Added 31/08/2017
         aux_loop_controller = len(self.analysis_recommendation_order)
         for indexer in range(0, aux_loop_controller):
             try:
                 model = self.analysis_recommendation_order[indexer]
                 if model['status'] == 'Executed':
                     model_type = model['model_parameters'][get_model_fw(
                         model)]['model']
                     if model_type not in best_models and len(
                             best_models) < self._config['adviser_L2_wide']:
                         fw_model_list.extend(
                             self.optimize_models(
                                 self.analysis_recommendation_order[indexer]
                             ))
                         best_models.append(model_type)
             except TypeError:
                 ''' If optimize_models doesn't return new models,
                 register the type as evaluated and selected '''
                 best_models.append(model_type)
         self.next_analysis_list.extend(fw_model_list)
         if len(self.next_analysis_list) == 0:
             self.next_analysis_list = None
     elif self.next_analysis_list is not None:
         fw_model_list = list()
         # Added 31/08/2017
         best_models = list()
         # End - Added 31/08/2017
         aux_loop_controller = len(self.analysis_recommendation_order)
         for indexer in range(0, aux_loop_controller):
             try:
                 model = self.analysis_recommendation_order[indexer]
                 if model['status'] == 'Executed':
                     model_type = model['model_parameters'][get_model_fw(
                         model)]['model']
                     if model_type not in best_models and len(
                             best_models
                     ) < self._config['adviser_normal_wide']:
                         fw_model_list.extend(
                             self.optimize_models(
                                 self.analysis_recommendation_order[indexer]
                             ))
                         #print("Trace:%s-%s" % (model_type, best_models))
                         best_models.append(model_type)
             except TypeError:
                 ''' If optimize_models doesn't return new models,
                 register the type as evaluated and selected '''
                 best_models.append(model_type)
         ''' Modified 20/09/2017
         # Get two most potential best models
         fw_model_list = list()
         for indexer in range(0, 2):
             try:
                 fw_model_list.extend(self.optimize_models(self.analysis_recommendation_order[indexer]))
             except TypeError:
                 pass
         #if fw_model_list is not None:'''
         self.next_analysis_list.extend(fw_model_list)
         if len(self.next_analysis_list) == 0:
             self.next_analysis_list = None
     self.deepness += 1
     return self._ec.get_id_analysis(), self.next_analysis_list
Example #12
    def table_model_list(self, ar_list, metric):
        dataframe = list()
        normal_cols = [
            'Model', 'train_accuracy', 'test_accuracy', 'combined_accuracy',
            'train_rmse', 'test_rmse'
        ]
        cluster_cols = ['Model', 'k', 'tot_withinss', 'betweenss']

        ordered_list = self.priorize_list(arlist=ar_list, metric=metric)
        for model in ordered_list:
            if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
                try:
                    dataframe.append({
                        'Model':
                        model['model_parameters'][get_model_fw(model)]
                        ['parameters']['model_id']['value'],
                        'Round':
                        model['round'],
                        'train_accuracy':
                        model['metrics']['accuracy']['train'],
                        'test_accuracy':
                        model['metrics']['accuracy']['test'],
                        'combined_accuracy':
                        model['metrics']['accuracy']['combined'],
                        'train_rmse':
                        model['metrics']['execution']['train']['RMSE'],
                        'test_rmse':
                        model['metrics']['execution']['test']['RMSE'],
                        'train_r2':
                        model['metrics']['execution']['train']['r2'],
                        'test_r2':
                        model['metrics']['execution']['test']['r2'],
                        'path':
                        model['json_path'][0]['value']
                    })
                # AutoEncoders metrics
                except KeyError:
                    dataframe.append({
                        'Model':
                        model['model_parameters'][get_model_fw(model)]
                        ['parameters']['model_id']['value'],
                        'Round':
                        model['round'],
                        'train_accuracy':
                        model['metrics']['accuracy']['train'],
                        'test_accuracy':
                        model['metrics']['accuracy']['test'],
                        'combined_accuracy':
                        model['metrics']['accuracy']['combined'],
                        'train_rmse':
                        model['metrics']['execution']['train']['RMSE'],
                        'path':
                        model['json_path'][0]['value']
                    })

            if metric in CLUSTERING_METRICS:
                try:
                    aux = model['metrics']['execution']['train']['k']
                except KeyError:
                    aux = 0

                dataframe.append({
                    'Model':
                    model['model_parameters'][get_model_fw(model)]
                    ['parameters']['model_id']['value'],
                    'Round':
                    model['round'],
                    'k':
                    aux,
                    'tot_withinss':
                    model['metrics']['execution']['train']['tot_withinss'],
                    'betweenss':
                    model['metrics']['execution']['train']['betweenss'],
                    'path':
                    model['json_path'][0]['value']
                })
        return DataFrame(dataframe)
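
DataFrame accepts the list of per-model dicts directly; the columns are the union of all keys, and any key a row lacks (for instance the autoencoder rows without test metrics) shows up as NaN. A tiny standalone sketch:

    from pandas import DataFrame

    rows = [
        {'Model': 'gbm-1', 'Round': 1, 'test_rmse': 0.30},
        {'Model': 'ae-1', 'Round': 1},       # no test metrics recorded
    ]
    table = DataFrame(rows)
    assert table['test_rmse'].isna().tolist() == [False, True]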
Example #13
    def log_model_list(self, ar_list, metric):
        best_check = True
        ordered_list = self.priorize_list(arlist=ar_list, metric=metric)
        for model in ordered_list:
            if best_check:
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["best_model"], model['model_parameters']
                    [get_model_fw(model)]['parameters']['model_id']['value'])
                best_check = False
            else:
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["res_model"], model['model_parameters']
                    [get_model_fw(model)]['parameters']['model_id']['value'])

            self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                                   self._labels["round_reach"], model['round'])
            if model["normalizations_set"] is None:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller', self._labels["norm_app"],
                                       [])
            else:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller', self._labels["norm_app"],
                                       model["normalizations_set"])

            if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
                self._logging.log_info(self._ec.get_id_analysis(),
                                       'controller',
                                       self._labels["ametric_order"],
                                       model['metrics']['accuracy'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["pmetric_order"],
                    model['metrics']['execution']['train']['RMSE'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["pmetric_order"],
                    model['metrics']['execution']['test']['RMSE'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["rmetric_order"],
                    model['metrics']['execution']['train']['r2'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["rmetric_order"],
                    model['metrics']['execution']['test']['r2'])
            if metric in CLUSTERING_METRICS:
                try:
                    self._logging.log_info(
                        self._ec.get_id_analysis(), 'controller',
                        self._labels["ckmetric_order"],
                        model['metrics']['execution']['train']['k'])
                except KeyError:
                    self._logging.log_info(self._ec.get_id_analysis(),
                                           'controller',
                                           self._labels["ckmetric_order"], "0")
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["ctmetric_order"],
                    model['metrics']['execution']['train']['tot_withinss'])
                self._logging.log_info(
                    self._ec.get_id_analysis(), 'controller',
                    self._labels["cbmetric_order"],
                    model['metrics']['execution']['train']['betweenss'])
Example #14
    def exec_analysis(self,
                      datapath,
                      objective_column,
                      amode=POC,
                      metric='test_accuracy',
                      deep_impact=3,
                      **kwargs):
        # Clustering variables
        k = None
        estimate_k = False

        # Force analysis variable
        atype = None

        hash_dataframe = ''

        for pname, pvalue in kwargs.items():
            if pname == 'k':
                assert isinstance(pvalue, int)
                k = pvalue
            elif pname == 'estimate_k':
                assert isinstance(pvalue, bool)
                estimate_k = pvalue
            elif pname == 'atype':
                assert pvalue in atypes
                atype = pvalue

        supervised = True
        if objective_column is None:
            supervised = False

        self._logging.log_info('gDayF', "Controller", self._labels["start"])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["ana_param"], metric)
        self._logging.log_info('gDayF', "Controller",
                               self._labels["dep_param"], deep_impact)
        self._logging.log_info('gDayF', "Controller", self._labels["ana_mode"],
                               amode)

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
                id_datapath = Path(datapath).name
                hash_dataframe = hash_key('MD5', datapath)
            except (IOError, OSError, JSONDecodeError):
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                # Return a 2-tuple, matching every other exit path of this method.
                return self._labels['failed_input'], None
        elif isinstance(datapath, DataFrame):
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
            pd_dataset = datapath
            id_datapath = 'Dataframe' + \
                          '_' + str(pd_dataset.size) + \
                          '_' + str(pd_dataset.shape[0]) + \
                          '_' + str(pd_dataset.shape[1])
            #hash_dataframe = md5(datapath.to_msgpack()).hexdigest()
            hash_dataframe = md5(
                datapath.to_json().encode('utf-8')).hexdigest()
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input'], None

        pd_test_dataset = None
        ''' Changed 05/04/2018
        if metric == 'combined_accuracy' or 'test_accuracy':'''
        if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
                and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
            pd_dataset, pd_test_dataset = pandas_split_data(
                pd_dataset,
                train_perc=self._config['common']['test_frame_ratio'])

        df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')

        self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                                 '_' + str(time()))
        adviser = self.adviser.AdviserAStar(e_c=self._ec,
                                            metric=metric,
                                            deep_impact=deep_impact,
                                            dataframe_name=id_datapath,
                                            hash_dataframe=hash_dataframe)

        adviser.set_recommendations(dataframe_metadata=df,
                                    objective_column=objective_column,
                                    amode=amode,
                                    atype=atype)

        while adviser.next_analysis_list is not None:
            for each_model in adviser.next_analysis_list:
                fw = get_model_fw(each_model)

                if k is not None:
                    try:
                        each_model["model_parameters"][fw]["parameters"]["k"][
                            "value"] = k
                        each_model["model_parameters"][fw]["parameters"]["k"][
                            "seleccionable"] = True
                        each_model["model_parameters"][fw]["parameters"][
                            "estimate_k"]["value"] = estimate_k
                        each_model["model_parameters"][fw]["parameters"][
                            "estimate_k"]["seleccionable"] = True
                    except KeyError:
                        pass

                self.init_handler(fw)
                if pd_test_dataset is not None:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_test_dataset,
                                                  filtering='STANDARDIZE')
                else:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_dataset,
                                                  filtering='STANDARDIZE')

                if analyzed_model is not None:
                    adviser.analysis_recommendation_order.append(
                        analyzed_model)
            adviser.next_analysis_list.clear()
            adviser.analysis_recommendation_order = adviser.priorize_models(
                model_list=adviser.analysis_recommendation_order)
            adviser.set_recommendations(dataframe_metadata=df,
                                        objective_column=objective_column,
                                        amode=amode)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["ana_models"],
                               str(len(adviser.analyzed_models)))
        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["exc_models"],
                               str(len(adviser.excluded_models)))

        self._logging.log_exec(self._ec.get_id_analysis(), 'controller',
                               self._labels["end"])

        self.clean_handlers()

        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)

        return self._labels[
            'success_op'], adviser.analysis_recommendation_order
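
A hedged driver sketch for this training entry point: only the exec_analysis signature comes from the code above, while the Controller class name, import path, and dataset details are assumptions to be adjusted to the installed package:

    # Assumed import path and construction.
    from gdayf.core.controller import Controller

    controller = Controller()
    status, recommendations = controller.exec_analysis(
        datapath='/data/churn.csv',       # a CSV path or a pandas DataFrame
        objective_column='churned',       # None switches to unsupervised modes
        metric='test_accuracy',
        deep_impact=3)
    best_model = recommendations[0]       # list arrives prioritized best-first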
Example #15
    def exec_prediction(self, datapath, armetadata=None, model_file=None):

        self._logging.log_info('gDayF', "Controller", self._labels["ana_mode"],
                               'prediction')
        if armetadata is None and model_file is None:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"], datapath)
            return self._labels["failed_model"]
        elif armetadata is not None:
            try:
                assert isinstance(armetadata, ArMetadata)
                base_ar = deep_ordered_copy(armetadata)
            except AssertionError:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           armetadata)
                return self._labels["failed_model"]
        elif model_file is not None:
            try:
                #json_file = open(model_file)
                persistence = PersistenceHandler(self._ec)
                invalid, base_ar = persistence.get_ar_from_engine(model_file)
                del persistence

                if invalid:
                    self._logging.log_critical('gDayF', "Controller",
                                               self._labels["failed_model"],
                                               model_file)
                    return self._labels["failed_model"]
            except IOError as iexecution_error:
                print(repr(iexecution_error))
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           model_file)
                return self._labels["failed_model"]
            except OSError as oexecution_error:
                print(repr(oexecution_error))
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           model_file)
                return self._labels["failed_model"]

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
            except (IOError, OSError, JSONDecodeError):
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input']
        elif isinstance(datapath, DataFrame):
            pd_dataset = datapath
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input']

        fw = get_model_fw(base_ar)

        self.init_handler(fw)

        prediction_frame = None
        try:
            prediction_frame, _ = self.model_handler[fw]['handler'].predict(
                predict_frame=pd_dataset, base_ar=base_ar)
        except TypeError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"],
                                       model_file)

        self.clean_handler(fw)

        self._logging.log_info('gDayF', 'controller', self._labels["pred_end"])

        return prediction_frame
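
Continuing the hedged sketch from Example #14, prediction takes either the in-memory ArMetadata of a trained model or a stored model file; everything except the exec_prediction signature is an assumption:

    predictions = controller.exec_prediction(
        datapath='/data/churn_new.csv',   # path or DataFrame to score
        armetadata=best_model)            # or model_file='<path to stored ar json>'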
Example #16
    def optimize_models(self, armetadata, metric_value, objective, deepness,
                        deep_impact):
        model_list = list()
        model = armetadata['model_parameters'][get_model_fw(armetadata)]
        config = self._config
        if get_model_fw(armetadata) == 'h2o' and metric_value != objective \
                and armetadata['status'] != self._labels['failed_op']:
            try:
                model_metric = decode_ordered_dict_to_dataframe(
                    armetadata['metrics']['model'])
                if model['model'] not in ['H2ONaiveBayesEstimator']:
                    scoring_metric = decode_ordered_dict_to_dataframe(
                        armetadata['metrics']['scoring'])
                nfold_limit = config['nfold_limit']
                min_rows_limit = config['min_rows_limit']
                cols_breakdown = config['cols_breakdown']
                nfold_increment = config['nfold_increment']
                min_rows_increment = config['min_rows_increment']
                max_interactions_rows_breakdown = config[
                    'max_interactions_rows_breakdown']
                max_interactions_increment = config[
                    'max_interactions_increment']
                max_depth_increment = config['max_depth_increment']
                ntrees_increment = config['ntrees_increment']
                dpl_rcount_limit = config['dpl_rcount_limit']
                dpl_divisor = config['dpl_divisor']
                h_dropout_ratio = config['h_dropout_ratio']
                epochs_increment = config['epochs_increment']
                dpl_min_batch_size = config['dpl_min_batch_size']
                dpl_batch_reduced_divisor = config['dpl_batch_reduced_divisor']
                deeper_increment = config['deeper_increment']
                wider_increment = config['wider_increment']
                learning_conf = config['learning_conf']
                rho_conf = config['rho_conf']
                nv_laplace = config['nv_laplace']
                nv_min_prob = config['nv_min_prob']
                nv_min_sdev = config['nv_min_sdev']
                nv_improvement = config['nv_improvement']
                nv_divisor = config['nv_divisor']
                clustering_increment = config['clustering_increment']
                sample_rate = config['sample_rate']

                if model['model'] == 'H2OGradientBoostingEstimator':
                    if (deepness == 2
                        ) and model['types'][0]['type'] == 'regression':
                        for tweedie_power in [1.1, 1.5, 1.9]:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['distribution'][
                                'value'] = 'tweedie'
                            model_aux['parameters'][
                                'tweedie_power'] = ParameterMetadata()
                            model_aux['parameters']['tweedie_power'].set_value(
                                tweedie_power)
                            model_list.append(new_armetadata)
                    if deepness == 2:
                        for learning in learning_conf:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['learn_rate'][
                                'value'] = learning['learn']
                            model_aux['parameters']['learn_rate_annealing'][
                                'value'] = learning['improvement']
                            model_list.append(new_armetadata)
                    if model_metric['number_of_trees'][0] >= model[
                            'parameters']['ntrees']['value']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['ntrees'][
                            'value'] *= ntrees_increment
                        model_list.append(new_armetadata)
                    if model_metric['max_depth'][0] >= model['parameters'][
                            'max_depth']['value']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['max_depth'][
                            'value'] *= max_depth_increment
                        model_list.append(new_armetadata)
                    if model['parameters']['nfolds']['value'] < nfold_limit:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['nfolds'][
                            'value'] += nfold_increment
                        model_list.append(new_armetadata)
                    if model['parameters']['min_rows'][
                            'value'] > min_rows_limit:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['min_rows']['value'] = round(
                            model_aux['parameters']['min_rows']['value'] /
                            min_rows_increment, 0)
                        model_list.append(new_armetadata)

                elif model['model'] == 'H2OGeneralizedLinearEstimator':

                    if model_metric['number_of_iterations'][0] >= model[
                            'parameters']['max_iterations']['value']:

                        if deepness == 2:
                            max_iterations = model['parameters']['max_iterations']['value'] * \
                                             max(round(
                                                 armetadata['data_initial']['rowcount'] / max_interactions_rows_breakdown),
                                                 1)
                        else:
                            max_iterations = model['parameters'][
                                'max_iterations'][
                                    'value'] * max_interactions_increment
                    else:
                        max_iterations = model['parameters']['max_iterations'][
                            'value']

                    if (deepness == 2
                        ) and model['types'][0]['type'] == 'regression':
                        for tweedie_power in [1.0, 1.5, 2.0, 2.5, 3.0]:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['tweedie_variance_power'][
                                'value'] = tweedie_power
                            model_aux['parameters']['max_iterations'][
                                'value'] = max_iterations
                            model_list.append(new_armetadata)
                    if deepness == 2:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['alpha']['value'] = 0.0
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['alpha']['value'] = 1.0
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)
                        if armetadata['data_initial']['cols'] > cols_breakdown:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['solver'][
                                'value'] = 'L_BFGS'
                            model_aux['parameters']['max_iterations'][
                                'value'] = max_iterations
                            model_list.append(new_armetadata)
                    if deepness == 2:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['balance_classes']['value'] = \
                            not model_aux['parameters']['balance_classes']['value']
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)
                    if model['parameters']['nfolds']['value'] < nfold_limit:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['nfolds'][
                            'value'] += nfold_increment
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)

                elif model['model'] == 'H2ODeepLearningEstimator':

                    if scoring_metric.shape[0] == 0 or \
                            (scoring_metric['epochs'].max() >=
                             model['parameters']['epochs']['value']):
                        epochs = model['parameters']['epochs'][
                            'value'] * epochs_increment
                    else:
                        epochs = model['parameters']['epochs']['value']

                    if deepness == 2:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        if armetadata['data_initial'][
                                'rowcount'] > dpl_rcount_limit:
                            model_aux['parameters']['hidden']['value'] = \
                                round(armetadata['data_initial']['rowcount'] / (dpl_divisor * 0.5))
                        else:
                            model_aux['parameters']['hidden']['value'][0] = \
                                round(model['parameters']['hidden']['value'][0] * wider_increment)
                        model_list.append(new_armetadata)

                        for learning in rho_conf:
                            new_armetadata = new_armetadata.copy_template(
                                increment=0)

                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['rho']['value'] = learning[
                                'learn']
                            model_aux['parameters']['epsilon'][
                                'value'] = learning['improvement']
                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']

                        if armetadata['data_initial'][
                                'rowcount'] > dpl_rcount_limit:
                            model_aux['parameters']['hidden']['value'] = \
                                [round(armetadata['data_initial']['rowcount'] / (dpl_divisor * 0.5)),
                                 round(armetadata['data_initial']['rowcount'] / (dpl_divisor * deep_impact))]
                        else:
                            model_aux['parameters']['hidden']['value'] = [
                                model['parameters']['hidden']['value'][0],
                                round(
                                    model['parameters']['hidden']['value'][0] /
                                    wider_increment)
                            ]
                        model_aux['parameters']['hidden_dropout_ratios'][
                            'value'] = [h_dropout_ratio, h_dropout_ratio]
                        model_list.append(new_armetadata)

                        for learning in rho_conf:
                            new_armetadata = new_armetadata.copy_template(
                                increment=0)
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['rho']['value'] = learning[
                                'learn']
                            model_aux['parameters']['epsilon'][
                                'value'] = learning['improvement']
                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                    if (deepness == 3
                        ) and model['types'][0]['type'] == 'regression':
                        for tweedie_power in [1.1, 1.5, 1.9]:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['distribution'][
                                'value'] = 'tweedie'
                            model_aux['parameters'][
                                'tweedie_power'] = ParameterMetadata()
                            model_aux['parameters']['tweedie_power'].set_value(
                                tweedie_power)
                            model_aux['parameters']['activation'][
                                'value'] = 'tanh_with_dropout'
                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                    if deepness == 3 and not model['parameters']['sparse'][
                            'value']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['sparse'][
                            'value'] = not model_aux['parameters']['sparse'][
                                'value']
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    '''Removed 19/09/2017
                    if deepness == 3 and model['parameters']['activation']['value'] == "rectifier_with_dropout":
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['activation']['value'] = 'tanh_with_dropout'
                        model_list.append(new_armetadata)'''

                    if deepness == 3 and model['parameters'][
                            'initial_weight_distribution']['value'] == "normal":
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['initial_weight_distribution'][
                            'value'] = "uniform"
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    elif deepness == 3:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['initial_weight_distribution'][
                            'value'] = "normal"
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)

                    if deepness > 2 and deepness <= deep_impact:
                        if len(armetadata['model_parameters']['h2o']
                               ['parameters']['hidden']['value']) < 4:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']

                            if len(model_aux['parameters']['hidden']['value']) > 1 \
                                    and model_aux['parameters']['hidden']['value'][0] > \
                                    model_aux['parameters']['hidden']['value'][1]:
                                model_aux['parameters']['hidden'][
                                    'value'].insert(
                                        0,
                                        round(model_aux['parameters']['hidden']
                                              ['value'][0] * deeper_increment))
                                model_aux['parameters'][
                                    'hidden_dropout_ratios']['value'].insert(
                                        0, h_dropout_ratio)
                            elif len(model_aux['parameters']['hidden']['value']) > 1 \
                                    and model_aux['parameters']['hidden']['value'][0] < \
                                    model_aux['parameters']['hidden']['value'][1]:
                                model_aux['parameters']['hidden'][
                                    'value'].append(
                                        round(model_aux['parameters']['hidden']
                                              ['value'][-1] *
                                              deeper_increment))
                                model_aux['parameters'][
                                    'hidden_dropout_ratios']['value'].append(
                                        h_dropout_ratio)
                            elif len(model_aux['parameters']['hidden']
                                     ['value']) == 1:
                                model_aux['parameters']['hidden']['value'][0] = \
                                    round(model_aux['parameters']['hidden']['value'][0] * deeper_increment)

                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']

                            for iterador in range(
                                    0,
                                    len(model_aux['parameters']['hidden']
                                        ['value'])):
                                model_aux['parameters']['hidden']['value'][iterador] = \
                                    int(round(model_aux['parameters']['hidden']['value'][iterador] * wider_increment))

                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                    if model['parameters']['mini_batch_size'][
                            'value'] >= dpl_min_batch_size:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['mini_batch_size']['value'] = \
                            round(model_aux['parameters']['mini_batch_size']['value'] / dpl_batch_reduced_divisor)
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)

                elif model['model'] == 'H2ORandomForestEstimator':
                    if deepness == 2:
                        for size in sample_rate:
                            for size2 in sample_rate:
                                new_armetadata = armetadata.copy_template()
                                model_aux = new_armetadata['model_parameters'][
                                    'h2o']
                                model_aux['parameters']['sample_rate'][
                                    'value'] = size['size']
                                model_aux['parameters'][
                                    'col_sample_rate_per_tree'][
                                        'value'] = size2['size']
                                model_list.append(new_armetadata)

                    if model_metric['number_of_trees'][0] == model[
                            'parameters']['ntrees']['value']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['ntrees'][
                            'value'] *= ntrees_increment
                        model_list.append(new_armetadata)
                    if model_metric['max_depth'][0] == model['parameters'][
                            'max_depth']['value']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['max_depth'][
                            'value'] *= max_depth_increment
                        model_list.append(new_armetadata)
                    if model['parameters']['nfolds']['value'] < nfold_limit:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['nfolds'][
                            'value'] += nfold_increment
                        model_list.append(new_armetadata)
                    if model['parameters']['mtries']['value'] not in [
                            round(armetadata['data_initial']['cols'] / 2),
                            round(armetadata['data_initial']['cols'] * 3 / 4)
                    ]:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['mtries']['value'] = round(
                            armetadata['data_initial']['cols'] / 2)
                        model_list.append(new_armetadata)
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['mtries']['value'] = round(
                            armetadata['data_initial']['cols'] * 3 / 4)
                        model_list.append(new_armetadata)
                    if model['parameters']['min_rows']['value'] > (
                            min_rows_limit / 2):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['min_rows']['value'] = round(
                            model_aux['parameters']['min_rows']['value'] /
                            min_rows_increment, 0)
                        model_list.append(new_armetadata)

                elif model['model'] == 'H2ONaiveBayesEstimator':
                    if deepness == 2:
                        for laplace in nv_laplace:
                            for min_prob in nv_min_prob:
                                for min_sdev in nv_min_sdev:
                                    new_armetadata = armetadata.copy_template()
                                    model_aux = new_armetadata[
                                        'model_parameters']['h2o']
                                    model_aux['parameters']['laplace'][
                                        'value'] = laplace
                                    model_aux['parameters']['min_prob'][
                                        'value'] = min_prob
                                    model_aux['parameters']['min_sdev'][
                                        'value'] = min_sdev
                                    model_list.append(new_armetadata)
                    elif deepness >= 2:
                        if deepness == deep_impact:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['balance_classes']['value'] = \
                                not model_aux['parameters']['balance_classes']['value']
                            model_list.append(new_armetadata)
                        if model['parameters']['nfolds']['value'] < nfold_limit:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['nfolds'][
                                'value'] += nfold_increment
                            model_list.append(new_armetadata)

                        for laplace in ['improvement', 'decrement']:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            if laplace == 'improvement':
                                model_aux['parameters']['laplace'][
                                    'value'] = model_aux['parameters'][
                                        'laplace']['value'] * (1 +
                                                               nv_improvement)
                            else:
                                model_aux['parameters']['laplace'][
                                    'value'] = model_aux['parameters'][
                                        'laplace']['value'] * (1 - nv_divisor)
                            model_list.append(new_armetadata)

                elif model['model'] == 'H2OAutoEncoderEstimator':
                    if scoring_metric.shape[0] == 0 or \
                            (scoring_metric['epochs'].max() >=
                             model['parameters']['epochs']['value']):
                        epochs = model['parameters']['epochs'][
                            'value'] * epochs_increment
                    else:
                        epochs = model['parameters']['epochs']['value']

                    if deepness == 2:
                        for learning in rho_conf:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['rho']['value'] = learning[
                                'learn']
                            model_aux['parameters']['epsilon'][
                                'value'] = learning['improvement']
                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                    if deepness == 3:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['sparse'][
                            'value'] = not model_aux['parameters']['sparse'][
                                'value']
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    if deepness > 1 and model['parameters']['activation'][
                            'value'] == "rectifier_with_dropout":
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['activation'][
                            'value'] = 'tanh_with_dropout'
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    if deepness == 3 and model['parameters'][
                            'initial_weight_distribution']['value'] == "normal":
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['initial_weight_distribution'][
                            'value'] = "uniform"
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    elif deepness == 3:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['initial_weight_distribution'][
                            'value'] = "normal"
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)

                    if deepness <= deep_impact:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']

                        for iterador in range(
                                0,
                                len(model_aux['parameters']['hidden']
                                    ['value'])):
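                            # skip what looks like the central (bottleneck) layer of the autoencoder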
                            if iterador != int((float(
                                    len(model_aux['parameters']['hidden']
                                        ['value'])) / 2) - 0.5):
                                model_aux['parameters']['hidden']['value'][iterador] = \
                                    int(round(model_aux['parameters']['hidden']['value'][iterador] * wider_increment, 0))
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                        if len(model_aux['parameters']['hidden']['value']) < 5:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            next_hidden = int(
                                round(
                                    model_aux['parameters']['hidden']['value']
                                    [0] * deeper_increment, 0))
                            model_aux['parameters']['hidden']['value'].insert(
                                0, next_hidden)
                            model_aux['parameters']['hidden_dropout_ratios'][
                                'value'].insert(0, h_dropout_ratio)
                            model_aux['parameters']['hidden']['value'].append(
                                next_hidden)
                            model_aux['parameters']['hidden_dropout_ratios'][
                                'value'].append(h_dropout_ratio)
                            model_aux['parameters']['epochs']['value'] = epochs
                            model_list.append(new_armetadata)

                    if model['parameters']['mini_batch_size'][
                            'value'] >= dpl_min_batch_size:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['mini_batch_size']['value'] = \
                            round(model_aux['parameters']['mini_batch_size']['value'] / dpl_batch_reduced_divisor)
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)

                elif model['model'] == 'H2OKMeansEstimator':
                    if scoring_metric.shape[0] == 0 or \
                            (int(scoring_metric['number_of_reassigned_observations'][-1:]) >= 0):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['h2o']
                        model_aux['parameters']['max_iterations']['value'] = \
                            int(model_aux['parameters']['max_iterations']['value'] * clustering_increment)
                        model_list.append(new_armetadata)
            except KeyError:
                return None
        else:
            return None
        if len(model_list) == 0:
            return None
        else:
            return model_list
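
The recurring pattern in optimize_models (copy the ArMetadata template, mutate one hyperparameter, append the copy to model_list) can be shown in isolation. A minimal sketch with plain dicts, where copy.deepcopy stands in for copy_template():

import copy

def widen_hidden_layers(armetadata, wider_increment=2):
    # Sketch of one mutation step: scale every hidden-layer size and
    # return the mutated copy, leaving the original metadata untouched.
    new_armetadata = copy.deepcopy(armetadata)   # stand-in for copy_template()
    params = new_armetadata['model_parameters']['h2o']['parameters']
    params['hidden']['value'] = [int(round(size * wider_increment))
                                 for size in params['hidden']['value']]
    return new_armetadata

base = {'model_parameters': {'h2o': {'parameters': {'hidden': {'value': [64, 32]}}}}}
print(widen_hidden_layers(base)['model_parameters']['h2o']['parameters']['hidden']['value'])
# prints [128, 64]; base keeps [64, 32]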
Example no. 17
def generate_json_path(e_c, armetadata, json_type='json'):
    config = e_c.config.get_config()
    fw = get_model_fw(armetadata)

    model_id = armetadata['model_parameters'][fw]['parameters']['model_id'][
        'value']
    compress = config['persistence']['compress_json']
    json_storage = StorageMetadata(e_c)

    # resolve the get_<json_type>_path() accessor dynamically instead of using eval()
    path_getter = getattr(json_storage, 'get_' + json_type + '_path')
    for each_storage_type in path_getter():
        if each_storage_type['type'] in ['localfs', 'hdfs']:
            primary_path = config['storage'][
                each_storage_type['type']]['value']
            source_data = list()
            source_data.append(primary_path)
            source_data.append('/')
            source_data.append(armetadata['user_id'])
            source_data.append('/')
            source_data.append(armetadata['workflow_id'])
            source_data.append('/')
            source_data.append(armetadata['model_id'])
            source_data.append('/')
            source_data.append(fw)
            source_data.append('/')
            source_data.append(armetadata['type'])
            source_data.append('/')
            source_data.append(str(armetadata['timestamp']))
            source_data.append('/')

            specific_data = list()
            specific_data.append(each_storage_type['value'])
            specific_data.append('/')
            specific_data.append(model_id)
            specific_data.append('.json')
            if compress:
                specific_data.append('.gz')

            json_path = ''.join(source_data)
            json_path += ''.join(specific_data)
            json_storage.append(value=json_path,
                                fstype=each_storage_type['type'],
                                hash_type=each_storage_type['hash_type'])

        else:
            if json_type == 'json':
                source_data = list()
                source_data.append('/')
                source_data.append(armetadata['user_id'])
                source_data.append('/')
                source_data.append(armetadata['workflow_id'])
                source_data.append('/')
                source_data.append(armetadata['model_id'])
                source_data.append('/')
                source_data.append(model_id)
                json_path = ''.join(source_data)
                json_storage.append(value=json_path,
                                    fstype=each_storage_type['type'],
                                    hash_type=each_storage_type['hash_type'])
            else:
                json_storage.append(value=each_storage_type['value'],
                                    fstype=each_storage_type['type'],
                                    hash_type=each_storage_type['hash_type'])

    armetadata[json_type + '_path'] = json_storage
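
The localfs/hdfs branch above assembles the storage path from an ordered list of fragments. A condensed sketch of the same layout, with all field values illustrative:

def build_json_path(primary_path, ar, fw, storage_value, model_id, compress=False):
    # Sketch of the localfs/hdfs branch: the same segments, joined with '/'.
    parts = [primary_path, ar['user_id'], ar['workflow_id'], ar['model_id'],
             fw, ar['type'], str(ar['timestamp']), storage_value, model_id]
    path = '/'.join(parts) + '.json'
    return path + '.gz' if compress else path

ar = {'user_id': 'user_01', 'workflow_id': 'wf_01', 'model_id': 'analysis_01',
      'type': 'train', 'timestamp': 1505000000.0}
print(build_json_path('/gdayf/store', ar, 'h2o', 'json', 'gbm_model', compress=True))
# prints /gdayf/store/user_01/wf_01/analysis_01/h2o/train/1505000000.0/json/gbm_model.json.gz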
Example no. 18
    def exec_sanalysis(self,
                       datapath,
                       list_ar_metadata,
                       metric='combined_accuracy',
                       deep_impact=1,
                       **kwargs):

        self._logging.log_info('gDayF', "Controller", self._labels["start"])
        self._logging.log_info('gDayF', "Controller",
                               self._labels["ana_param"], metric)
        self._logging.log_info('gDayF', "Controller",
                               self._labels["dep_param"], deep_impact)

        if isinstance(datapath, str):
            try:
                self._logging.log_info('gDayF', "Controller",
                                       self._labels["input_param"], datapath)
                pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
                id_datapath = Path(datapath).name
                hash_dataframe = hash_key('MD5', datapath)
            except (IOError, OSError, JSONDecodeError):
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_input"],
                                           datapath)
                return self._labels['failed_input'], None
        elif isinstance(datapath, DataFrame):
            hash_dataframe = None
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
            pd_dataset = datapath
            id_datapath = 'Dataframe' + \
                          '_' + str(pd_dataset.size) + \
                          '_' + str(pd_dataset.shape[0]) + \
                          '_' + str(pd_dataset.shape[1])
        else:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"], datapath)
            return self._labels['failed_input'], None

        pd_test_dataset = None
        if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
                and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
            pd_dataset, pd_test_dataset = pandas_split_data(
                pd_dataset,
                train_perc=self._config['common']['test_frame_ratio'])

        df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')
        self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                                 '_' + str(time()))
        adviser = self.adviser.AdviserAStar(e_c=self._ec,
                                            metric=metric,
                                            deep_impact=deep_impact,
                                            dataframe_name=id_datapath,
                                            hash_dataframe=hash_dataframe)

        adviser.analysis_specific(dataframe_metadata=df,
                                  list_ar_metadata=list_ar_metadata)

        while adviser.next_analysis_list is not None:

            for each_model in adviser.next_analysis_list:
                fw = get_model_fw(each_model)

                self.init_handler(fw)

                if pd_test_dataset is not None:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  test_frame=pd_test_dataset,
                                                  filtering='NONE')
                else:
                    _, analyzed_model = self.model_handler[fw][
                        'handler'].order_training(training_pframe=pd_dataset,
                                                  base_ar=each_model,
                                                  filtering='NONE')
                if analyzed_model is not None:
                    adviser.analysis_recommendation_order.append(
                        analyzed_model)

            adviser.next_analysis_list.clear()
            adviser.analysis_recommendation_order = adviser.priorize_models(
                model_list=adviser.analysis_recommendation_order)
            adviser.analysis_specific(
                dataframe_metadata=df,
                list_ar_metadata=adviser.analysis_recommendation_order)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["ana_models"],
                               str(len(adviser.analyzed_models)))
        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["exc_models"],
                               str(len(adviser.excluded_models)))

        self.log_model_list(adviser.analysis_recommendation_order, metric)

        self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                               self._labels["end"])

        self.clean_handlers()

        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)

        return self._labels[
            'success_op'], adviser.analysis_recommendation_order
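
A hedged invocation sketch for exec_sanalysis; the controller instance, dataset path, metric value and prior metadata list are illustrative assumptions:

# Hypothetical usage sketch; all inputs below are illustrative.
status, ordered_models = controller.exec_sanalysis(
    datapath='/data/readings.csv',         # CSV path or a pandas DataFrame
    list_ar_metadata=previous_ar_list,     # ArMetadata records from a prior run
    metric='test_rmse',                    # assumed to be a supported metric label
    deep_impact=3)
print(status, len(ordered_models))         # ordered best-first via priorize_models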