def landmark_decision_tree(X, y):  # pylint: disable=C0103
    """Compute the decision-tree landmark (10-fold cross-validated accuracy).

    A ``DecisionTreeClassifier`` is fitted on every training fold and scored
    on the matching test fold. When ``y`` is 1-D or has a single column the
    folds are stratified; otherwise plain ``KFold`` is used and the tree is
    wrapped in ``OneVsRestClassifier``.

    Args:
        X: Feature matrix, indexed with arrays of row indices.
        y: Target array; must expose ``shape``.

    Returns:
        float: Mean accuracy over the folds, ``np.nan`` when ``X`` is a
        scipy sparse matrix, or ``0.`` when any exception is raised (the
        exception is logged as a warning).
    """
    try:
        if scipy.sparse.issparse(X):
            # np.NaN was removed in NumPy 2.0; np.nan works on every version.
            return np.nan

        # Import every sklearn submodule used below explicitly instead of
        # relying on them being loaded as a side effect of sklearn.tree.
        import sklearn.metrics
        import sklearn.model_selection
        import sklearn.tree
        import sklearn.utils

        n_splits = 10
        single_target = len(y.shape) == 1 or y.shape[1] == 1
        if single_target:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)
            if not single_target:
                tree = OneVsRestClassifier(tree)
            tree.fit(X[train], y[train])
            predictions = tree.predict(X[test])
            # accuracy_score is documented as (y_true, y_pred).
            accuracy += sklearn.metrics.accuracy_score(y[test], predictions)
        return accuracy / n_splits
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark Decision Tree could not be computed. Returning 0 "
            "instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def class_probability_std(y):  # pylint: disable=C0103
    """Compute the standard deviation of the class probabilities.

    Class counts come from ``StatisticalInformation.class_ocurrences`` and
    are divided by the number of rows in ``y``. For a 2-D ``y`` the standard
    deviation is computed per column and the mean of those values is
    returned.

    Args:
        y: Target array; must expose ``shape``.

    Returns:
        float: The (mean) standard deviation, or ``0.`` when any exception is
        raised (the exception is logged as a warning).
    """
    try:
        counts = StatisticalInformation.class_ocurrences(y)
        if len(y.shape) != 2:
            frequencies = np.array(list(counts.values()), dtype=np.float64)
            return (frequencies / y.shape[0]).std()

        per_label = []
        for label in range(y.shape[1]):
            frequencies = np.array(
                list(counts[label].values()), dtype=np.float64)
            per_label.append((frequencies / y.shape[0]).std())
        return np.mean(per_label)
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Class probability std could not be computed. Returning 0 "
            "instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def landmark_random_node_learner(X, y):  # pylint: disable=C0103
    """Compute the random-node landmark (10-fold cross-validated accuracy).

    The learner is a depth-1 ``DecisionTreeClassifier`` using the entropy
    criterion and ``max_features=1``, so each split considers a single
    feature. When ``y`` is 1-D or has a single column the folds are
    stratified; otherwise plain ``KFold`` is used.

    Args:
        X: Feature matrix, indexed with arrays of row indices.
        y: Target array; must expose ``shape``.

    Returns:
        float: Mean accuracy over the folds, ``np.nan`` when ``X`` is a
        scipy sparse matrix, or ``0.`` when any exception is raised (the
        exception is logged as a warning).
    """
    try:
        if scipy.sparse.issparse(X):
            # np.NaN was removed in NumPy 2.0; np.nan works on every version.
            return np.nan

        # Import every sklearn submodule used below explicitly instead of
        # relying on them being loaded as a side effect of sklearn.tree.
        import sklearn.metrics
        import sklearn.model_selection
        import sklearn.tree
        import sklearn.utils

        n_splits = 10
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            node = sklearn.tree.DecisionTreeClassifier(
                criterion="entropy",
                max_depth=1,
                random_state=random_state,
                min_samples_split=2,
                min_samples_leaf=1,
                max_features=1)
            node.fit(X[train], y[train])
            predictions = node.predict(X[test])
            # accuracy_score is documented as (y_true, y_pred).
            accuracy += sklearn.metrics.accuracy_score(y[test], predictions)
        return accuracy / n_splits
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark Random Tree Node Learner could not be computed. "
            "Returning 0 instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def add_attribute_key(self, column=None):
    """Register a column as one of the dataset's key attributes.

    A key identifies rows in the dataset, much like a primary key in a
    database. A column that is already registered is left untouched and a
    warning is logged.

    Args:
        column (str): Name of the column to register. Defaults to `None`.

    Raises:
        ValueError: If `column` is None or is not one of the names returned
            by ``self.attribute_names()``.
    """
    if column is None:
        raise ValueError("column cannot be None")

    if column not in self.attribute_names():
        raise ValueError("Invalid column. Column is not in the dataset")

    if column not in self.key_attributes:
        self.key_attributes.append(column)
        return

    # Already a key: nothing to add, just tell the user.
    automl.automl_log(
        "Column '{col}' already existed in key_attributes. "
        "Skipping ...".format(col=column),
        'WARNING')
def skewnesses(X, categorical_indicators):  # pylint: disable=C0103
    """Compute the skewness of every non-categorical column of ``X``.

    For a scipy sparse matrix the data is converted to CSC and only the
    explicitly stored values of each column are passed to
    ``scipy.stats.skew``. Otherwise the column slice ``X[:, i]`` is used.
    A column whose skewness raises contributes ``0`` and a warning is logged.

    Args:
        X: Feature matrix (scipy sparse matrix or 2-D indexable array).
        categorical_indicators: Per-column truthy flags; flagged columns are
            skipped.

    Returns:
        list: One skewness value per non-categorical column, in column order.
    """
    if scipy.sparse.issparse(X):
        csc = X.tocsc()
        n_columns = csc.shape[1]

        def column_values(col):
            # Stored (non-implicit) entries of column `col` in CSC layout.
            return csc.data[csc.indptr[col]:csc.indptr[col + 1]]
    else:
        n_columns = X.shape[1]

        def column_values(col):
            return X[:, col]

    result = []
    for col in range(n_columns):
        if categorical_indicators[col]:
            continue
        try:
            value = scipy.stats.skew(column_values(col))
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Skewness of row {i} could not be computed. "
                "Returning 0 instead. Originally failed with exception "
                "'{ex}'".format(i=col, ex=ex), 'WARNING')
            value = 0
        result.append(value)
    return result
def landmark_naive_bayes(X, y):  # pylint: disable=C0103
    """Compute the naive-Bayes landmark (10-fold cross-validated accuracy).

    A ``GaussianNB`` model is fitted on every training fold and scored on the
    matching test fold. When ``y`` is 1-D or has a single column the folds
    are stratified; otherwise plain ``KFold`` is used and the model is
    wrapped in ``OneVsRestClassifier``.

    Args:
        X: Feature matrix, indexed with arrays of row indices.
        y: Target array; must expose ``shape``.

    Returns:
        float: Mean accuracy over the folds, ``np.nan`` when ``X`` is a
        scipy sparse matrix, or ``0.`` when any exception is raised (the
        exception is logged as a warning).
    """
    try:
        if scipy.sparse.issparse(X):
            # np.NaN was removed in NumPy 2.0; np.nan works on every version.
            return np.nan

        # Import every sklearn submodule used below explicitly instead of
        # relying on them being loaded as a side effect of another import.
        import sklearn.metrics
        import sklearn.model_selection
        import sklearn.naive_bayes

        n_splits = 10
        single_target = len(y.shape) == 1 or y.shape[1] == 1
        if single_target:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        for train, test in kf.split(X, y):
            nb = sklearn.naive_bayes.GaussianNB()  # pylint: disable=C0103
            if not single_target:
                nb = OneVsRestClassifier(nb)  # pylint: disable=C0103
            nb.fit(X[train], y[train])
            predictions = nb.predict(X[test])
            # accuracy_score is documented as (y_true, y_pred).
            accuracy += sklearn.metrics.accuracy_score(y[test], predictions)
        return accuracy / n_splits
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark Naive Bayes could not be computed. Returning 0 "
            "instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def landmark_1NN(X, y):  # pylint: disable=C0103
    """Compute the 1-nearest-neighbour landmark (10-fold CV accuracy).

    A ``KNeighborsClassifier(n_neighbors=1)`` is fitted on every training
    fold and scored on the matching test fold. When ``y`` is 1-D or has a
    single column the folds are stratified and the training targets are
    flattened with ``ravel()``; otherwise plain ``KFold`` is used and the
    classifier is wrapped in ``OneVsRestClassifier``.

    Args:
        X: Feature matrix, indexed with arrays of row indices.
        y: Target array; must expose ``shape``.

    Returns:
        float: Summed fold accuracy divided by 10, or ``0.`` when any
        exception is raised (the exception is logged as a warning).
    """
    try:
        import sklearn.neighbors

        flat_target = len(y.shape) == 1 or y.shape[1] == 1
        if flat_target:
            folds = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            folds = sklearn.model_selection.KFold(n_splits=10)

        total = 0.
        for train_idx, test_idx in folds.split(X, y):
            model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
            if flat_target:
                model.fit(X[train_idx], y[train_idx].ravel())
            else:
                model = OneVsRestClassifier(model)
                model.fit(X[train_idx], y[train_idx])
            predicted = model.predict(X[test_idx])
            total += sklearn.metrics.accuracy_score(predicted, y[test_idx])
        return total / 10
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark 1NN could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def number_of_categorical_features(categorical_indicators):
    """Count the features flagged as categorical.

    Args:
        categorical_indicators: Per-feature flags; they are summed with
            ``np.sum``, so truthy flags are expected to be booleans or 0/1.

    Returns:
        The sum of the indicators, or ``0.`` when ``np.sum`` raises (the
        exception is logged as a warning).
    """
    try:
        return np.sum(categorical_indicators)
    except Exception as ex:  # pylint: disable=W0703
        # The message used to name "Landmark Decision Node Learner", a
        # copy-paste leftover that pointed at the wrong metafeature.
        automl_log(
            "Number of categorical features could not be computed. "
            "Returning 0 instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def kurtosis_min(X, categorical_indicators):  # pylint: disable=C0103
    """Compute the minimum kurtosis over the dataset's columns.

    The per-column values come from ``StatisticalInformation.kurtosisses``;
    NaN entries are ignored through ``np.nanmin``.

    Args:
        X: Feature matrix, forwarded to ``kurtosisses``.
        categorical_indicators: Per-column flags, forwarded to
            ``kurtosisses``.

    Returns:
        The minimum kurtosis, or ``0`` when there are no values, the minimum
        is not finite, or any exception is raised (logged as a warning).
    """
    try:
        values = StatisticalInformation.kurtosisses(X, categorical_indicators)
        # pylint: disable=C1801
        if len(values) > 0:
            lowest = np.nanmin(values)
        else:
            lowest = 0
        if np.isfinite(lowest):
            return lowest
        return 0
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Kurtosis min could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def skewness_std(X, categorical_indicators):  # pylint: disable=C0103
    """Compute the standard deviation of the per-column skewness values.

    The per-column values come from ``StatisticalInformation.skewnesses``;
    NaN entries are ignored through ``np.nanstd``.

    Args:
        X: Feature matrix, forwarded to ``skewnesses``.
        categorical_indicators: Per-column flags, forwarded to ``skewnesses``.

    Returns:
        The standard deviation, or ``0`` when there are no values, the result
        is not finite, or any exception is raised (logged as a warning).
    """
    try:
        values = StatisticalInformation.skewnesses(X, categorical_indicators)
        # pylint: disable=C1801
        if len(values) > 0:
            spread = np.nanstd(values)
        else:
            spread = 0
        if np.isfinite(spread):
            return spread
        return 0
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Skewness std could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def metafeatures_vector(self):
    """Return the metafeatures of this dataset as a vector (numpy array).

    NaN entries (metafeatures that could not be computed) are replaced
    in place, and a warning is logged when at least one is present.

    Returns:
        np.array: The metafeatures as a numpy array without NaN values.
    """
    res = MetaFeaturesManager(self).metafeatures_as_numpy_array()

    # The previous check counted the NON-NaN entries, so the warning fired
    # whenever anything had been computed instead of when something failed.
    if np.isnan(res).any():
        automl_log(
            "It was not possible to compute all metafeatures (see "
            "log messages). We will replace the NaN values in the "
            "meta-features vector with 0", 'WARNING')

    # In-place replacement (copy=False). Note that nan_to_num also maps
    # +/-inf to large finite numbers.
    np.nan_to_num(res, copy=False)
    return res
def _internal_validation(self): # Check for missing values nan_sum_x = self.X.isna().sum().sum() nan_sum_y = self.y.isna().sum().sum() # Log missing values messages if nan_sum_x > 0: automl_log("Features set (X) in dataset '{d_id}' contains {n_na} \ missing values. Please not that with missing data, results can be innacurate. \ Fix the missing values yourself.".format(d_id=self.dataset_id, n_na=nan_sum_x), 'WARNING') if nan_sum_y > 0: automl_log("Target set (y) in dataset '{d_id}' contains {n_na} \ missing values. Please not that with missing data, results can be innacurate. \ Fix the missing values yourself.".format(d_id=self.dataset_id, n_na=nan_sum_y), 'WARNING') # Check for inifinite values inf_sum_x = np.isinf(self.X.values).ravel().sum() inf_sum_y = np.isinf(self.y.values).ravel().sum() # Log infinite values messages if inf_sum_x > 0: automl_log("Features set (X) in dataset '{d_id}' contains {n_na} \ missing values. Please not that with missing data, results can be innacurate. \ Fix the missing values yourself.".format(d_id=self.dataset_id, n_na=nan_sum_x)) if inf_sum_y > 0: automl_log("Target set (y) in dataset '{d_id}' contains {n_na} \ missing values. Please not that with missing data, results can be innacurate. \ Fix the missing values yourself.".format(d_id=self.dataset_id, n_na=nan_sum_y))
def build_configuration(self):
    """Build an ML suggestion from the row given at instantiation time.

    For each of the five expected columns (imputation strategy, and the
    classifier / preprocessor / rescaler / categorical-encoding choices) that
    is present in ``self.model_row.index``, the value resolved by
    ``self._from_internal_list`` is stored under the column's prefix. Missing
    columns are reported with a warning and leave that list empty.

    Returns:
        MLSuggestion: A suggestion with classifiers, pre-processors,
            rescalers, encoders and imputation methods.
    """
    # (prefix, suffix) of every column looked up, in lookup order.
    column_parts = (
        (_PF_IMPUTATION, _PF_STRATEGY),
        (_PF_CLASSIFIER, _CSV_CHOICE),
        (_PF_PREPROCESSOR, _CSV_CHOICE),
        (_PF_RESCALING, _CSV_CHOICE),
        (_PF_CATEGORICAL_ENCODING, _CSV_CHOICE),
    )
    columns = [prefix + _CSV_COL_SEP + suffix
               for prefix, suffix in column_parts]

    collected = {
        key: []
        for key in (_PF_CLASSIFIER, _PF_PREPROCESSOR, _PF_RESCALING,
                    _PF_CATEGORICAL_ENCODING, _PF_IMPUTATION)
    }

    for column in columns:
        if column not in self.model_row.index:
            automl.automl_log(
                "No attribute '{attr}' in current element".format(
                    attr=column),
                'WARNING')
            continue
        # The part before the separator names the target list.
        bucket = column.split(_CSV_COL_SEP)[0]
        collected[bucket].append(self._from_internal_list(column))

    return MLSuggestion(
        classifiers=collected[_PF_CLASSIFIER],
        preprocessors=collected[_PF_PREPROCESSOR],
        encoders=collected[_PF_CATEGORICAL_ENCODING],
        rescalers=collected[_PF_RESCALING],
        imputations=collected[_PF_IMPUTATION],
    )
def landmark_lda(X, y):  # pylint: disable=C0103
    """Compute the LDA landmark (10-fold cross-validated accuracy).

    A ``LinearDiscriminantAnalysis`` model is fitted on every training fold
    and scored on the matching test fold. When ``y`` is 1-D or has a single
    column the folds are stratified; otherwise plain ``KFold`` is used and
    the model is wrapped in ``OneVsRestClassifier``.

    Args:
        X: Feature matrix, indexed with arrays of row indices.
        y: Target array; must expose ``shape``.

    Returns:
        float: Mean accuracy over the folds; ``np.nan`` when ``X`` is a scipy
        sparse matrix or when the cross-validation raises ``LinAlgError`` or
        ``ValueError``; ``0.`` when any other exception is raised (the
        exception is logged as a warning).
    """
    try:
        if scipy.sparse.issparse(X):
            # np.NaN was removed in NumPy 2.0; np.nan works on every version.
            return np.nan

        # pylint: disable=C0103
        import sklearn.metrics
        import sklearn.model_selection
        from sklearn.discriminant_analysis \
            import LinearDiscriminantAnalysis

        n_splits = 10
        single_target = len(y.shape) == 1 or y.shape[1] == 1
        if single_target:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        try:
            for train, test in kf.split(X, y):
                lda = LinearDiscriminantAnalysis()
                if not single_target:
                    lda = OneVsRestClassifier(lda)
                lda.fit(X[train], y[train])
                predictions = lda.predict(X[test])
                # accuracy_score is documented as (y_true, y_pred).
                accuracy += sklearn.metrics.accuracy_score(
                    y[test], predictions)
            return accuracy / n_splits
        except (scipy.linalg.LinAlgError, ValueError) as ex:
            # The message used to claim 0 was returned although this path
            # returns NaN; arguments are passed lazily to the logger.
            logging.warning("LDA failed: %s Returning NaN instead!", ex)
            return np.nan
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Landmark LDA could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def _init_attributes(self, arff_dataset): """Initialize the attributes of the class. We map each of the arff dict keys into an attribute. """ self.description = arff_dataset['description'] self.name = arff_dataset['relation'] self.data = self._parse_data(arff_dataset) if arff_dataset['attributes']: if isinstance(arff_dataset['attributes'][0], tuple): self.key_attributes = [arff_dataset['attributes'][0][0]] else: automl.automl_log( "First element in 'attributes' is not a tuple'. Skipping \ 'key_attributes' assignment.", 'WARNING') else: automl.automl_log( "'attributes' field is an empty list. Errors may occur. \ Skipping 'key_attributes' assignment.", 'WARNING')
def class_ocurrences(y):  # pylint: disable=C0103
    """Count how often each class value occurs in ``y``.

    Args:
        y: Target array; must expose ``shape``.

    Returns:
        For a 2-D ``y``: a list with one result of this same function per
        column. Otherwise: a ``defaultdict(float)`` mapping each value to its
        count. ``0.`` is returned when any exception is raised (the exception
        is logged as a warning).
    """
    try:
        if len(y.shape) != 2:
            tally = defaultdict(float)
            for label in y:
                tally[label] += 1
            return tally

        # One occurrence table per target column.
        return [
            StatisticalInformation.class_ocurrences(y[:, column])
            for column in range(y.shape[1])
        ]
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Class ocurrences could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def class_probability_min(y):  # pylint: disable=C0103
    """Compute the probability of the least frequent class.

    Class counts come from ``StatisticalInformation.class_ocurrences``. For a
    2-D ``y`` the smallest count across all columns is used. The count is
    divided by the number of rows in ``y``.

    Args:
        y: Target array; must expose ``shape``.

    Returns:
        float: Smallest class count divided by ``y.shape[0]``, or ``0.`` when
        any exception is raised (the exception is logged as a warning).
    """
    try:
        counts = StatisticalInformation.class_ocurrences(y)
        # Start from the largest int64 so any real count replaces it.
        smallest = np.iinfo(np.int64).max

        if len(y.shape) == 2:
            tables = [counts[column] for column in range(y.shape[1])]
        else:
            tables = [counts]

        for table in tables:
            for count in table.values():
                if count < smallest:
                    smallest = count
        return float(smallest) / float(y.shape[0])
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Class probability min could not be computed. Returning 0 "
            "instead. Originally failed with exception "
            "'{ex}'".format(ex=ex), 'WARNING')
        return 0.
def pca(X):  # pylint: disable=C0103
    """Fit a decomposition of ``X`` for the PCA-based metafeatures.

    Dense input is fitted with ``sklearn.decomposition.PCA``; scipy sparse
    input is cast to float64 and fitted with ``TruncatedSVD`` using
    ``X.shape[1] - 1`` components. The rows are reshuffled with a fixed-seed
    ``RandomState(42)`` before each of up to 10 attempts, and an attempt that
    raises ``LinAlgError`` is retried.

    Args:
        X: Feature matrix (2-D array or scipy sparse matrix).

    Returns:
        The fitted ``PCA`` / ``TruncatedSVD`` estimator, or ``None`` when all
        attempts fail or any other exception is raised (logged as a warning).
    """
    try:
        import sklearn.decomposition

        rs = np.random.RandomState(42)  # pylint: disable=C0103
        indices = np.arange(X.shape[0])

        # The branches used to be swapped: sparse matrices were handed to
        # PCA and dense arrays went through TruncatedSVD.
        if not scipy.sparse.issparse(X):
            estimator = sklearn.decomposition.PCA(copy=True)
            for _ in range(10):
                try:
                    rs.shuffle(indices)
                    estimator.fit(X[indices])
                    return estimator
                except LinAlgError:
                    pass
            automl_log("Failed to compute a Principle Component Analysis",
                       'WARNING')
            return None

        # This is expensive, but necessary with scikit-learn 0.15
        Xt = X.astype(np.float64)  # pylint: disable=C0103
        for i in range(10):
            try:
                rs.shuffle(indices)
                truncated_svd = sklearn.decomposition.TruncatedSVD(
                    n_components=X.shape[1] - 1,
                    random_state=i,
                    algorithm="randomized")
                truncated_svd.fit(Xt[indices])
                return truncated_svd
            except LinAlgError:
                pass
        logging.warning("Failed to compute a Truncated SVD")
        return None
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "PCA could not be computed. Returning None instead. Originally "
            "failed with exception '{ex}'".format(ex=ex), 'WARNING')
        return None
def class_entropy(y):  # pylint: disable=C0103
    """Compute the mean base-2 entropy of the class distribution(s).

    A 1-D ``y`` is treated as a single label column. For each column the
    value counts are passed to ``scipy.stats.entropy`` with ``base=2`` and
    the mean over the columns is returned.

    Args:
        y: Target array; must expose ``shape`` and ``reshape``.

    Returns:
        float: Mean entropy over the label columns, or ``0.`` when any
        exception is raised (the exception is logged as a warning).
    """
    try:
        if len(y.shape) == 1:
            n_labels = 1
        else:
            n_labels = y.shape[1]
        if n_labels == 1:
            y = y.reshape((-1, 1))

        per_label = []
        for column in range(n_labels):
            # Insertion-ordered tally of the values in this column.
            tally = {}
            for value in y[:, column]:
                tally[value] = tally.get(value, 0.0) + 1
            per_label.append(
                scipy.stats.entropy(list(tally.values()), base=2))
        return np.mean(per_label)
    except Exception as ex:  # pylint: disable=W0703
        automl_log(
            "Class entropy could not be computed. Returning 0 instead. "
            "Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def get_openml_dataset(openml_id, problem_type):
    """Fetch a dataset from OpenML and wrap it in a Dataset object.

    Args:
        openml_id (int): ID of the dataset, as stored in OpenML.
        problem_type (int): Type of problem to solve in the dataset. 0 for
            classification, 1 for regression.

    Returns:
        Dataset: An auto-ml Dataset, as defined in this module. Its ID is
            "<OpenML dataset id>-<OpenML dataset name>".

    Raises:
        CurrentlyNonSupportedError: If the fetched features are a scipy
            sparse matrix.
    """
    automl_log(
        "Loading dataset {d_id} from OpenML:".format(d_id=openml_id), 'INFO')

    openml_dataset = oml.datasets.get_dataset(openml_id)
    fetched = openml_dataset.get_data(
        target=openml_dataset.default_target_attribute,
        return_attribute_names=True,
        return_categorical_indicator=True
    )
    features, target, categorical_indicators, attribute_names = fetched

    if scipy.sparse.issparse(features):
        raise CurrentlyNonSupportedError(
            "Sparse datasets are not supported yet in Achmea's auto-ml "
            "solution")

    features_frame = pd.DataFrame(features, columns=attribute_names)
    target_frame = pd.DataFrame(target, columns=[_TARGET_NAME])
    identifier = "{}-{}".format(openml_dataset.dataset_id,
                                openml_dataset.name)

    return Dataset(dataset_id=identifier,
                   X=features_frame,
                   y=target_frame,
                   categorical_indicators=categorical_indicators,
                   problem_type=problem_type)
def models_by_metric(instances_ids=None, dataset=None, metric='accuracy'):
    """Return the stored models for a list of instances and a given metric.

    The metric name is combined with the kind of classification
    (binary/multiclass) and the data type (sparse/dense) of `dataset` to
    locate the meta-learning files; each instance id is then resolved to its
    associated configuration.

    Args:
        instances_ids (list): List of integers with the ids of the instances
            (datasets).
        dataset (Dataset): The dataset to work with.
        metric (str): Name of the metric to use. Combined with the
            classification kind it must be one of the metrics returned by
            LandmarkModelParser.metrics_available().

    Returns:
        list: List of models, one per instance that has meta-knowledge.
            Instances for which the lookup raises ValueError are skipped
            with a warning.

    Raises:
        ValueError: If `instances_ids` or `dataset` is None, the metric is
            not available, or a meta-learning file cannot be found.
        TypeError: If `dataset` is not a Dataset.
        CurrentlyNonSupportedError: If `dataset` is a regression problem.
    """
    # --- Argument validation -------------------------------------------
    if instances_ids is None:
        raise ValueError("A list of instances' ids must be specified.")
    if dataset is None:
        raise ValueError("Please provide a valid dataset.")
    if not isinstance(dataset, Dataset):
        raise TypeError("Dataset must be of type Dataset (automl pkg)")
    if dataset.is_regression_problem():
        raise CurrentlyNonSupportedError(
            "Meta-learning for regression is not supported yet")

    # --- Resolve the name of the meta-learning directory ------------------
    if dataset.is_classification_problem():
        problem_type = "classification"
        if dataset.n_labels > 2:
            classif_type = "multiclass"
        else:
            classif_type = "binary"

    if dataset.is_sparse():
        data_type = "sparse"
    else:
        data_type = "dense"

    # `metric`_`binary/multiclass`, e.g. accuracy_binary
    internal_metric = "{me}_{c_type}".format(me=metric, c_type=classif_type)
    # classification_`sparse/dense`, e.g. classification_sparse
    problem_desc = "{p_type}_{d_type}".format(p_type=problem_type,
                                              d_type=data_type)
    # Both parts joined with a dot, e.g.
    # accuracy_binary.classification_sparse
    basename_dir = "{metric}.{problem}".format(metric=internal_metric,
                                               problem=problem_desc)

    # The resolved metric has to be one of the available ones
    metrics_available = LandmarkModelParser.metrics_available()
    if internal_metric not in metrics_available:
        raise ValueError(
            "Metric '{argument}' is not supported. \nTry any "
            "of the following metrics: {available}".format(
                argument=metric, available=metrics_available))

    # --- Locate the meta-learning files -----------------------------------
    configs_csv = LandmarkModelParser._configs_file_by_metric(basename_dir)
    algoruns_arff = \
        LandmarkModelParser._algorithm_runs_file_by_metric(basename_dir)
    if configs_csv is None or algoruns_arff is None:
        raise ValueError(
            "Some of the meta-learning files was not found "
            "in the database for metric '{metric}'".format(
                metric=basename_dir))

    # --- Resolve one configuration per requested instance -----------------
    suggestions = []
    for instance_id in instances_ids:
        try:
            runs = AlgorithmRunsFile(algoruns_arff)
            config_id = runs.get_associated_configuration_id(instance_id)
            configurations = ConfigurationsFile(configs_csv)
            builder = ConfigurationBuilder(
                configurations.get_configuration(config_id))
            suggestions.append(builder.build_configuration())
        except ValueError:
            automl_log(
                "Instance (dataset) with id={inst_id} has no "
                "meta-knowledge associated for metric '{metric}'. We will "
                "ignore this dataset and you should expect fewer ML "
                "Suggestions.".format(inst_id=instance_id,
                                      metric=basename_dir),
                'WARNING')
    return suggestions