 def test_attribute_selected_classifier_1(self):
     algorithm = "AttributeSelectedClassifier"
     service = "classification"
     params = {'-D': False, '-U': False, '-R': False, '-B': False, '-L': False, '-A': False}
     # The original if/else asserted whatever hyper_parameter_check returned,
     # so the test could never fail; assert the expected result directly.
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_naivebayesbinomial_1(self):
     algorithm = "NaiveBayesBinomial"
     params = {
         'auc_type': 'auto',
         'balance_classes': False,
         'compute_metrics': True,
         'eps_prob': 30,
         'eps_sdev': 0,
         'fold_assignment': 'auto',
         'gainslift_bins': -1,
         'ignore_const_cols': True,
         'keep_cross_validation_fold_assignment': False,
         'keep_cross_validation_models': True,
         'keep_cross_validation_predictions': False,
         'laplace': 0,
         'max_after_balance_size': 5,
         'max_confusion_matrix_size': 0,
         'max_runtime_secs': 0,
         'min_prob': 0.001,
         'min_sdev': 0.001,
         'score_each_iteration': False,
         'seed': -1
     }
     self.assertTrue(
         hyper_parameter_check(self.library, self.service, algorithm,
                               params))
 def test_bagging_1(self):
     algorithm = "Bagging"
     service = "classification"
     params = {
         '-P': 50,
         '-O': True,
         '-print': False,
         '-store-out-of-bag-predictions': False,
         '-output-out-of-bag-complexity-statistics': True,
         '-represent-copies-using-weights': True,
         '-S': 2,
         '-num-slots': 1,
         '-I': 2,
         '-D': False,
         '-R': False
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_gradientboostingmachines_1(self):
     algorithm = "GradientBoostingMachines"
     params = {
         'auc_type': 'auto',
         'balance_classes': False,
         'build_tree_one_node': False,
         'calibrate_model': False,
         'categorical_encoding': 'auto',
         'check_constant_response': True,
         'col_sample_rate': 1,
         'col_sample_rate_change_per_level': 1,
         'col_sample_rate_per_tree': 1,
         'distribution': 'auto',
         'fold_assignment': 'auto',
         'gainslift_bins': -1,
         'histogram_type': 'auto',
         'huber_alpha': 0.9,
         'ignore_const_cols': True,
         'keep_cross_validation_fold_assignment': False,
         'keep_cross_validation_models': True,
         'keep_cross_validation_predictions': False,
         'learn_rate': 0.1,
         'learn_rate_annealing': 1,
         'max_abs_leafnode_pred': 1.7976,
         'max_after_balance_size': 5.0,
         'max_confusion_matrix_size': 20,
         'max_depth': 5,
         'max_runtime_secs': 0,
         'min_rows': 10,
         'min_split_improvement': 1e-05,
         'nbins': 20,
         'nbins_cats': 1024,
         'nbins_top_level': 1024,
         'ntrees': 50,
         'pred_noise_bandwidth': 0,
         'quantile_alpha': 0.5,
         'r2_stopping': 1.7976,
         'sample_rate': 1,
         'score_each_iteration': False,
         'score_tree_interval': 0,
         'seed': -1,
         'stopping_metric': 'auto',
         'stopping_rounds': 0,
         'stopping_tolerance': 0.001,
         'tweedie_power': 1.5
     }
     self.assertTrue(
         hyper_parameter_check(self.library, self.service, algorithm,
                               params))
 def test_linearregression_1(self):
     algorithm = "LinearRegression"
     params = {
         'HGLM': False,
         'auc_type': 'auto',
         'balance_classes': False,
         'beta_epsilon': 0.0001,
         'calc_like': False,
         'cold_start': False,
         'compute_p_values': False,
         'early_stopping': True,
         'family': 'auto',
         'fold_assignment': 'auto',
         'gradient_epsilon': -1,
         'ignore_const_cols': True,
         'intercept': True,
         'keep_cross_validation_fold_assignment': False,
         'keep_cross_validation_models': True,
         'keep_cross_validation_predictions': False,
         'lambda_min_ratio': -1,
         'lambda_search': False,
         'link': 'family_default',
         'max_active_predictors': -1,
         'max_after_balance_size': 5.0,
         'max_confusion_matrix_size': 20,
         'max_iterations': -1,
         'max_runtime_secs': 0,
         'missing_values_handling': 'mean_imputation',
         'nlambdas': -1,
         'non_negative': False,
         'obj_reg': -1,
         'objective_epsilon': -1,
         'prior': -1,
         'remove_collinear_columns': False,
         'score_each_iteration': False,
         'score_iteration_interval': -1,
         'seed': -1,
         'solver': 'auto',
         'standardize': True,
         'stopping_metric': 'auto',
         'stopping_rounds': 0,
         'stopping_tolerance': 0.001,
         'theta': 1e-10,
         'tweedie_link_power': 1,
         'tweedie_variance_power': 0
     }
     self.assertTrue(
         hyper_parameter_check(self.library, self.service, algorithm,
                               params))
 def test_randomforest_1(self):
     algorithm = "RandomForest"
     params = {
         'auc_type': 'auto',
         'balance_classes': False,
         'binomial_double_trees': False,
         'build_tree_one_node': False,
         'calibrate_model': False,
         'categorical_encoding': 'auto',
         'check_constant_response': True,
         'col_sample_rate_change_per_level': 1,
         'col_sample_rate_per_tree': 1,
         'distribution': 'auto',
         'fold_assignment': 'auto',
         'histogram_type': 'auto',
         'keep_cross_validation_fold_assignment': False,
         'keep_cross_validation_models': True,
         'keep_cross_validation_predictions': False,
         'max_after_balance_size': 1.0,
         'max_confusion_matrix_size': 0,
         'max_depth': 0,
         'max_runtime_secs': 0,
         'min_rows': 1,
         'min_split_improvement': 1e-05,
         'mtries': -1,
         'nbins': 2,
         'nbins_cats': 1024,
         'nbins_top_level': 1024,
         'ntrees': 50,
         'r2_stopping': -1.7976,
         'sample_rate': 0.632,
         'score_each_iteration': False,
         'score_tree_interval': 10,
         'stopping_metric': 'auto',
         'stopping_rounds': 10,
         'stopping_tolerance': 0.001,
         'class_sampling_factors': None,
         'sample_rate_per_class': None
     }

     self.assertTrue(
         hyper_parameter_check(self.library, self.service, algorithm,
                               params))
 def test_kmeans_1(self):
     algorithm = "SimpleKMeans"
     service = "clustering"
     params = {
         '-init': 0,
         '-C': False,
         '-max-candidates': 100,
         '-periodic-pruning': 10000,
         '-min-density': 2,
         '-t2': -1.0,
         '-t1': -1.5,
         '-V': False,
         '-M': False,
         '-I': 1,
         '-O': False,
         '-fast': False,
         '-num-slots': 1,
         '-S': 10,
         '-output-debug-info': False,
         '-do-not-check-capabilities': False
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
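The test methods above reference self.library and self.service fixtures that this excerpt does not show. A minimal sketch of the missing scaffolding, with assumed fixture values (the class name and values are illustrative: flag-style params such as '-S' and '-I' suggest library='weka', while keyword params such as 'ntrees' and 'seed' suggest library='h2o'):

import unittest

# Hypothetical scaffolding for the test methods above; the real suites
# presumably define their own fixtures. Values below are assumptions.
class HyperParameterCheckTests(unittest.TestCase):
    def setUp(self):
        self.library = "weka"            # assumed; 'h2o' for the keyword-style tests
        self.service = "classification"  # assumed default service

    # ... test methods from above go here ...

if __name__ == "__main__":
    unittest.main()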
Example #8
    def cluster(self,
                service,
                algorithm,
                dataset,
                features,
                lib='weka',
                number_of_clusters=2,
                cluster_type="Centroid",
                model_name=None,
                params=None,
                dataset_source=None):
        """
        :param lib: Library for clustering the model. Currently we are supporting DLTK, weka, H2O, scikit-learn
                    libraries. Valid values for this parameter: DLTK, weka, h2o, scikit
        :param service: Valid parameter values are CLUSTER.
        :param model_name: Model name and with this name model will be saved.
        :param algorithm: algorithm by which model will be trained.
        :param dataset: dataset file location in DLTK storage.
        :param features: column name list which is used to train classification model.
        :param number_of_clusters: the dataset will be clustered into number of clusters.
        :param dataset_source : metabase address for dataset
        :param params:
        :return:
            obj: A json obj containing model info.

        Args:
            dataset_source:
            dataset_source:
            features: Feature list used while model training
            dataset_source: To specify data source,
                None: Dataset file will from DLTK storage will be used
                database: Query from connected database will be used
        """
        service, library, algorithm, features, label, train_percentage = validate_parameters(
            service, lib, algorithm, features, "None", cluster=True)

        # if additional parameters passed, check whether those are valid or not
        if params is not None:
            hyper_parameter_flag = hyper_parameter_check(
                library, service, algorithm, params)
            assert hyper_parameter_flag, "Please check the params, clustering failed due to incorrect values"

        url = self.base_url + '/machine/cluster/'
        headers = {'ApiKey': self.api_key, 'Content-type': 'application/json'}
        if params is None:
            params = {}
        if model_name is None:
            model_name = algorithm
        if dataset_source == "database":
            body = {
                'library': lib,
                'task': 'CLUSTER',
                'service': service,
                "jobType": "DATABASE",
                "queryId": dataset,
                'config': {
                    'name': model_name,
                    'algorithm': algorithm,
                    'numOfClusters': int(number_of_clusters),
                    'epsilon': 0.1,
                    'features': features,
                    'params': params,
                    'clusterType': cluster_type
                }
            }
        else:
            body = {
                'library': lib,
                'task': 'CLUSTER',
                'service': service,
                'config': {
                    'name': model_name,
                    'algorithm': algorithm,
                    'datasetUrl': dataset,
                    'numOfClusters': int(number_of_clusters),
                    'epsilon': 0.1,
                    'features': features,
                    'params': params,
                    'clusterType': cluster_type
                }
            }
        body = json.dumps(body)
        response = requests.post(url=url, data=body, headers=headers)
        response = response.json()
        return response
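For context, a hedged usage sketch of the cluster method above. The client class name, API key, base URL, dataset path, and feature names are illustrative assumptions; the SimpleKMeans flag params mirror the weka test earlier in this section:

# Minimal usage sketch; DltkAiClient, the key, URL and paths are assumptions.
client = DltkAiClient(api_key="YOUR_API_KEY", base_url="https://dltk.example.com")

response = client.cluster(
    service="CLUSTER",
    algorithm="SimpleKMeans",            # weka clusterer used in the tests above
    dataset="/storage/iris.csv",         # assumed dataset location in DLTK storage
    features=["sepal_length", "sepal_width"],
    lib="weka",
    number_of_clusters=3,
    params={"-I": 1, "-S": 10},          # optional; checked by hyper_parameter_check
)
print(response)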
 def test_kstar_1(self):
     algorithm = "KStar"
     service = "classification"
     params = {'-B': 50, '-E': True, '-M': 'a'}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_libsvm_1(self):
     algorithm = "LibSVM"
     service = "classification"
     params = {
         '-S': 3,
         '-K': 1,
         '-D': 1,
         '-R': 0,
         '-C': 0.5,
         '-N': 1,
         '-Z': True,
         '-J': True,
         '-V': True,
         '-P': 0.5,
         '-M': 20,
         '-E': 0.1,
         '-H': False,
         '-W': 1,
         '-B': False,
         '-seed': 1
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_naive_bayes_multinomial_1(self):
     algorithm = "NaiveBayesMultinomial"
     service = "classification"
     params = {'-output-debug-info': True,'-do-not-check-capabilities': False,'-num-decimal-places': 3,'-batch-size': 50}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_canopy_1(self):
     algorithm = "Canopy"
     service = "clustering"
     params = {
         '-max-candidates': 100,
         '-periodic-pruning': 10000,
         '-min-density': 2,
         '-t2': -1.0,
         '-t1': -1.5,
         '-M': False,
         '-S': 1,
         '-output-debug-info': False,
         '-do-not-check-capabilities': False
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_random_tree_1(self):
     algorithm = "RandomTree"
     service = "classification"
     params = {
         '-K': 0,
         '-M': 2,
         '-V': 0.1,
         '-S': 2,
         '-depth': 1,
         '-N': 1,
         '-U': True,
         '-B': True,
         '-output-debug-info': False,
         '-do-not-check-capabilities': True,
         '-num-decimal-places': 1
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_ibk_1(self):
     algorithm = "IBk"
     service = "classification"
     params = {'-I': False, '-F': True, '-K': 2, '-E': True, '-W': True, '-X': False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_deeplearning_1(self):
     algorithm = "DeepLearning"
     params = {
         'activation': 'rectifier',
         'adaptive_rate': True,
         'auc_type': 'auto',
         'autoencoder': False,
         'average_activation': 0.0,
         'balance_classes': False,
         'categorical_encoding': 'auto',
         'classification_stop': 0,
         'col_major': False,
         'diagnostics': True,
         'distribution': 'auto',
         'elastic_averaging': False,
         'elastic_averaging_moving_rate': 0.9,
         'elastic_averaging_regularization': 0.001,
         'epochs': 10,
         'epsilon': 1e-08,
         'export_weights_and_biases': False,
         'fast_mode': True,
         'fold_assignment': 'auto',
         'force_load_balance': True,
         'huber_alpha': 0.9,
         'ignore_const_cols': True,
         'initial_weight_distribution': 'uniform_adaptive',
         'initial_weight_scale': 0,
         'input_dropout_ratio': 0,
         'keep_cross_validation_fold_assignment': False,
         'keep_cross_validation_models': True,
         'keep_cross_validation_predictions': False,
         'l1': 0,
         'l2': 0,
         'loss': 'automatic',
         'max_after_balance_size': 5.0,
         'max_categorical_features': 2147483647,
         'max_confusion_matrix_size': 20,
         'max_runtime_secs': 0.0,
         'max_w2': 3.4028235e+38,
         'mini_batch_size': 1,
         'missing_values_handling': 'mean_imputation',
         'momentum_ramp': 1000000,
         'momentum_stable': 0,
         'momentum_start': 0,
         'nesterov_accelerated_gradient': True,
         'overwrite_with_best_model': True,
         'quantile_alpha': 0.5,
         'quiet_mode': False,
         'rate': 0.005,
         'rate_annealing': 1e-06,
         'rate_decay': 1,
         'regression_stop': 1e-06,
         'replicate_training_data': True,
         'reproducible': False,
         'rho': 0.99,
         'score_duty_cycle': 0.1,
         'score_each_iteration': False,
         'score_interval': 5,
         'score_training_samples': 10000,
         'score_validation_samples': 0,
         'score_validation_sampling': 'uniform',
         'seed': -1,
         'shuffle_training_data': False,
         'single_node_mode': False,
         'sparse': False,
         'sparsity_beta': 0,
         'standardize': True,
         'stopping_metric': 'auto',
         'stopping_rounds': 5,
         'stopping_tolerance': 0,
         'target_ratio_comm_to_comp': 0.05,
         'train_samples_per_iteration': -2,
         'tweedie_power': 1.5,
         'use_all_factor_levels': True,
         'variable_importances': True
     }
     self.assertTrue(
         hyper_parameter_check(self.library, self.service, algorithm,
                               params))
 def test_make_density_based_clusterer_1(self):
     algorithm = "MakeDensityBasedClusterer"
     service = "clustering"
     params = {'-M':1e-06,'-S':10,'-V':False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_farthest_first_1(self):
     algorithm = "FarthestFirst"
     service = "clustering"
     params = {'-S':1}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_smo_1(self):
     algorithm = "SMO"
     service = "classification"
     params = {
         '-no-checks': True,
         '-C': 2,
         '-N': 1,
         '-L': 0.1,
         '-P': 0.001,
         '-M': False,
         '-V': 1,
         '-W': 1,
         '-output-debug-info': False,
         '-do-not-check-capabilities': True,
         '-num-decimal-places': 2
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_multilayer_perceptron_1(self):
     algorithm = "MultilayerPerceptron"
     service = "classification"
     params = {
         '-L': 0.4,
         '-M': 0.5,
         '-N': 200,
         '-V': 50,
         '-S': 1,
         '-E': 21,
         '-A': False,
         '-B': False,
         '-H': "a",
         '-C': False,
         '-I': False,
         '-R': False,
         '-D': False
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_linear_regression_1(self):
     algorithm = "LinearRegression"
     service = "regression"
     params = {
         '-S': 2,
         '-C': True,
         '-R': 0.00001,
         '-minimal': False,
         '-additional-stats': True,
         '-output-debug-info': False,
         '-do-not-check-capabilities': False
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_random_forest_1(self):
     algorithm = "RandomForest"
     service = "classification"
     params = {
         '-P': 99,
         '-O': False,
         '-store-out-of-bag-predictions': False,
         '-output-out-of-bag-complexity-statistics': False,
         '-print': False,
         '-attribute-importance': False,
         '-I': 80,
         '-num-slots': 1,
         '-K': 0,
         '-M': 2,
         '-V': 0.1,
         '-S': 1,
         '-depth': 1,
         '-N': 0,
         '-U': True,
         '-B': True,
         '-output-debug-info': True,
         '-do-not-check-capabilities': False,
         '-num-decimal-places': 1
     }
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_logistic_1(self):
     algorithm = "Logistic"
     service = "classification"
     params = {'-S': False, '-M': 2}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_adaboostm1_1(self):
     algorithm = "AdaBoostM1"
     service = "classification"
     params = {'-P': 99, '-Q': False, '-S': 2,'-I': 20,'-D':False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_additive_regression_1(self):
     algorithm = "AdditiveRegression"
     service = "regression"
     params = {'-S': 0.4, '-I': 5, '-A': False, '-D': False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_hierarchical_clusterer_1(self):
     algorithm = "HierarchicalClusterer"
     service = "clustering"
     params = {'-L':'SINGLE','-P':False,'-D':False,'-B':False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_em_1(self):
     algorithm = "EM"
     service = "clustering"
     params = {'-X':5,'-K':10,'-max':-1,'-ll-cv':1e-06,'-I':100,'-ll-iter':1e-06,'-V':False,'-M':1e-06,'-O':False,'-num-slots':1,'-S':100,'-output-debug-info':False,'-do-not-check-capabilities':False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
 def test_decision_table_1(self):
     algorithm = "DecisionTable"
     service = "classification"
     params = {'-X': 2, '-E': 'mae', '-I': True, '-R': True, '-P': False}
     self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
Example #28
    def train(self,
              task,
              algorithm,
              dataset,
              label,
              features,
              model_name=None,
              lib="weka",
              train_percentage=80,
              folds=5,
              cross_validation=False,
              params=None,
              dataset_source=None,
              evaluation_plots=False):
        """
        :param task: Training task to perform. Valid parameter values are classification, regression.
        :param algorithm: Algorithm used for training the model.
        :param dataset: dataset file location in DLTK storage.
        :param label: Target variable.
        :param features: List of features used for training the model.
        :param model_name: Model will be saved with the name specified in this parameter.
        :param lib: Library for training the model. Currently we are supporting scikit, h2o and weka.
        :param train_percentage: Percentage of data used for training the model. Rest of the data will be used to test the model.
        :param dataset_source: To specify data source,
                None: Dataset file from DLTK storage will be used
                database: Query from connected database will be used
        :param folds: number of folds for cross validation
        :param cross_validation: Evaluates model using crossvalidation if set to True.
        :rtype: A json object containing the file path in storage.
        
        """

        task, library, algorithm, features, label, train_percentage = validate_parameters(
            task, lib, algorithm, features, label, train_percentage)

        # if additional parameters passed, check whether those are valid or not
        if params is not None:
            hyper_parameter_flag = hyper_parameter_check(
                library, task, algorithm, params)
            assert hyper_parameter_flag, "Please check the params, training failed due to incorrect values"

        url = self.base_url + '/machine/' + task + '/train/'
        headers = {"ApiKey": self.api_key, "Content-type": "application/json"}
        if params is None:
            params = {}
        if model_name is None:
            model_name = algorithm

        if dataset_source == "database":
            body = {
                "library": lib,
                "task": "train",
                "jobType": "DATABASE",
                "queryId": dataset,
                "config": {
                    "name": model_name,
                    "algorithm": algorithm,
                    "label": label,
                    "trainPercentage": train_percentage,
                    "features": features,
                    "params": params,
                    "folds": folds,
                    "crossValidation": cross_validation,
                    "evalPlots": evaluation_plots
                }
            }
        else:
            body = {
                "library": lib,
                "task": "train",
                "config": {
                    "name": model_name,
                    "algorithm": algorithm,
                    "datasetUrl": dataset,
                    "label": label,
                    "trainPercentage": train_percentage,
                    "features": features,
                    "params": params,
                    "folds": folds,
                    "crossValidation": cross_validation,
                    "evalPlots": evaluation_plots
                }
            }
        body = json.dumps(body)
        response = requests.post(url=url, data=body, headers=headers)
        response = response.json()
        return response
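Likewise, a hedged usage sketch of the train method. The client class name, API key, base URL, dataset path, label, and feature names are illustrative assumptions, not values from the source:

# Minimal usage sketch; DltkAiClient, the key, URL and paths are assumptions.
client = DltkAiClient(api_key="YOUR_API_KEY", base_url="https://dltk.example.com")

response = client.train(
    task="classification",
    algorithm="RandomForest",
    dataset="/storage/iris.csv",          # assumed dataset location in DLTK storage
    label="species",                      # assumed target column
    features=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    lib="weka",
    train_percentage=80,
    cross_validation=True,
    folds=5,
)
print(response)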