def get_target_type(targets_data):
    """Return sklearn's target-type string for ``targets_data``.

    :param targets_data: target values as a pandas DataFrame or an array-like.
        For a DataFrame with several columns, only the FIRST column is
        inspected (the others are ignored).
    :return: the string produced by ``type_of_target`` ('binary',
        'multiclass', 'continuous', ...).
    """
    # `sklearn.metrics.classification` was removed in scikit-learn 0.24;
    # `type_of_target` lives in `sklearn.utils.multiclass`.
    from sklearn.utils.multiclass import type_of_target

    if isinstance(targets_data, pd.DataFrame):
        if len(targets_data.columns) > 1:
            # Only the first target column determines the type.
            targets_data = targets_data[[targets_data.columns[0]]]
        # `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()`
        # yields the same underlying ndarray.
        target_type = type_of_target(targets_data.to_numpy().ravel())
    else:
        target_type = type_of_target(targets_data.ravel())
    return target_type
def fit(self, X, y, metric=None, loss=None, feat_type=None, dataset_name=None):
    """Validate the inputs, infer the classification task and delegate fitting.

    :param X: feature matrix; validated with ``check_array`` (sparse CSR
        accepted, non-finite values allowed).
    :param y: target values; must be of a classification target type.
    :param metric: scorer to optimize; defaults to ``f1_macro`` for
        multilabel tasks and ``accuracy`` otherwise.
    :param loss: accepted for interface compatibility; not used here.
    :param feat_type: per-feature categorical/numerical flags, forwarded.
    :param dataset_name: name forwarded to the AutoML backend.
    :raises ValueError: if ``y`` is not a supported classification target.
    """
    X = sklearn.utils.check_array(X, accept_sparse="csr",
                                  force_all_finite=False)
    y = sklearn.utils.check_array(y, ensure_2d=False)

    if scipy.sparse.issparse(X):
        # CSR indices must be sorted for the downstream pipeline.
        X.sort_indices()

    target_kind = type_of_target(y)
    known_tasks = {
        'multilabel-indicator': MULTILABEL_CLASSIFICATION,
        'multiclass': MULTICLASS_CLASSIFICATION,
        'binary': BINARY_CLASSIFICATION,
    }
    if target_kind not in known_tasks:
        raise ValueError('Cannot work on data of type %s' % target_kind)
    task = known_tasks[target_kind]

    if metric is None:
        metric = f1_macro if task == MULTILABEL_CLASSIFICATION else accuracy

    y = self._process_target_classes(y)
    return self._automl.fit(X, y, task, metric, feat_type, dataset_name)
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    metric: Optional[Scorer] = None,
    feat_type: Optional[List[bool]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: bool = False,
    load_models: bool = True,
):
    """Fit the classifier, inferring the task type from ``y``.

    Inputs are validated via ``self._perform_input_checks``; the task is
    looked up in ``self._task_mapping`` from sklearn's target-type string.
    Training targets are re-encoded by ``self._process_target_classes``
    (which also stores ``self._classes`` / ``self._n_classes``), and test
    targets, if given, are mapped into the same class-index encoding before
    delegating to the parent ``fit``.

    :raises ValueError: if ``y`` and ``y_test`` have different numbers of
        dimensions, or if ``y`` is not a supported classification target.
    """
    X, y = self._perform_input_checks(X, y)
    if X_test is not None:
        X_test, y_test = self._perform_input_checks(X_test, y_test)
        # Train and test targets must have the same dimensionality.
        if len(y.shape) != len(y_test.shape):
            raise ValueError('Target value shapes do not match: %s vs %s'
                             % (y.shape, y_test.shape))

    y_task = type_of_target(y)
    task = self._task_mapping.get(y_task)
    if task is None:
        raise ValueError('Cannot work on data of type %s' % y_task)

    if metric is None:
        # Default metric depends on the task kind.
        if task == MULTILABEL_CLASSIFICATION:
            metric = f1_macro
        else:
            metric = accuracy

    y, self._classes, self._n_classes = self._process_target_classes(y)

    if y_test is not None:
        # Map test values to actual values - TODO: copy to all kinds of
        # other parts in this code and test it!!!
        y_test_new = []
        for output_idx in range(len(self._classes)):
            # Class label -> class index, per output column.
            mapping = {
                self._classes[output_idx][idx]: idx
                for idx in range(len(self._classes[output_idx]))
            }
            # Single-output targets are a flat array; multi-output targets
            # are indexed per output column.
            enumeration = y_test if len(
                self._classes) == 1 else y_test[output_idx]
            y_test_new.append(
                np.array([mapping[value] for value in enumeration]))
        y_test = np.array(y_test_new)
        if self._n_outputs == 1:
            # Collapse the (1, n_samples) stack back to a flat array.
            y_test = y_test.flatten()

    return super().fit(
        X,
        y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        only_return_configuration_space=only_return_configuration_space,
        load_models=load_models,
    )
def detect_task(y, type='none'):
    """Map target values (plus an optional explicit hint) to a task constant.

    :param y: target values; inspected with ``type_of_target``.
    :param type: optional explicit task hint ('regression', 'multiclass',
        'multilabel') that overrides/complements the inferred target type.
        (Name kept for interface compatibility although it shadows the
        builtin.)
    :return: one of REGRESSION, MULTICLASS_CLASSIFICATION,
        MULTILABEL_CLASSIFICATION, BINARY_CLASSIFICATION.
    """
    task = type_of_target(y)
    if 'continuous' in task or type == 'regression':
        return REGRESSION
    # BUG FIX: the original tested the misspelled string 'multicalss', so a
    # multiclass target could only be detected via the explicit `type` hint.
    elif 'multiclass' in task or type == 'multiclass':
        return MULTICLASS_CLASSIFICATION
    elif 'multilabel' in task or type == 'multilabel':
        return MULTILABEL_CLASSIFICATION
    else:
        return BINARY_CLASSIFICATION
def fit(
    self,
    X,
    y,
    X_test=None,
    y_test=None,
    metric=None,
    feat_type=None,
    dataset_name=None,
    only_return_configuration_space=False,
):
    """Fit the classifier after inferring the task type from ``y``.

    Validates the inputs, resolves the task via ``self._task_mapping``,
    picks a default metric when none is given, re-encodes the training
    targets (storing ``self._classes`` / ``self._n_classes``), maps the
    optional test targets into the same class-index encoding, and then
    delegates to the parent ``fit``.

    :raises ValueError: if ``y`` and ``y_test`` differ in dimensionality,
        or if ``y`` is not a supported classification target.
    """
    X, y = self._perform_input_checks(X, y)
    if X_test is not None:
        X_test, y_test = self._perform_input_checks(X_test, y_test)
        if len(y.shape) != len(y_test.shape):
            raise ValueError('Target value shapes do not match: %s vs %s'
                             % (y.shape, y_test.shape))

    target_kind = type_of_target(y)
    task = self._task_mapping.get(target_kind)
    if task is None:
        raise ValueError('Cannot work on data of type %s' % target_kind)

    if metric is None:
        metric = f1_macro if task == MULTILABEL_CLASSIFICATION else accuracy

    y, self._classes, self._n_classes = self._process_target_classes(y)

    if y_test is not None:
        # Re-encode the test targets with the class indices learned from y.
        remapped_outputs = []
        for out_idx in range(len(self._classes)):
            index_of = {label: pos
                        for pos, label in enumerate(self._classes[out_idx])}
            column = y_test if len(self._classes) == 1 else y_test[out_idx]
            remapped_outputs.append(
                np.array([index_of[value] for value in column]))
        y_test = np.array(remapped_outputs)
        if self._n_outputs == 1:
            # Single-output: collapse back to a flat array.
            y_test = y_test.flatten()

    return super().fit(
        X,
        y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        only_return_configuration_space=only_return_configuration_space,
    )
def get_summary(self, fields_formatting=None, do_language_detection=True):
    """Build a per-column summary of ``self.df``.

    :param fields_formatting: optional mapping column name -> formatting
        hint, forwarded to ``resolve_type``.
    :param do_language_detection: when True, run language detection over the
        detected text columns and attach per-column language stats.
    :return: dict with 'scheme' (column -> resolved type) and 'desc'
        (column -> frequency/histogram/range description, each enriched with
        its 'target_type' and, for text columns, 'languages').
    """
    # `sklearn.metrics.classification` was removed in scikit-learn 0.24;
    # `type_of_target` is provided by `sklearn.utils.multiclass`.
    # Hoisted out of the per-column loop: one import, not one per column.
    from sklearn.utils.multiclass import type_of_target

    dtypes = dict(self.df.dtypes)
    scheme = {
        k: resolve_type(
            v,
            fields_formatting[k]
            if (fields_formatting is not None and k in fields_formatting)
            else None)
        for k, v in dtypes.items()
    }
    desc = {}
    text_cols = []
    for col in self.df:
        # `DataFrame.as_matrix()` was removed in pandas 1.0;
        # `Series.to_numpy()` yields the same flat array for one column.
        col_data = self.df[col].to_numpy().ravel()
        target_type = type_of_target(col_data)
        col_meta = {'target_type': target_type}
        # Non-binary string columns are candidates for language detection.
        if scheme[col] == 'str' and target_type != 'binary':
            text_cols.append(col)
        if scheme[col] in ('bool', 'object', 'str'):
            desc[col] = freq(self.df, col)
        elif 'datetime' in scheme[col]:
            desc[col] = {
                'type': 'range',
                'values': {
                    'min': str(self.df[col].min()),
                    'max': str(self.df[col].max())
                }
            }
        else:
            # Numeric columns get a Freedman-Diaconis histogram.
            desc[col] = hist(self.df, col, 'fd')
        # Attach the metadata computed above to the column description.
        for meta_prop in col_meta:
            desc[col][meta_prop] = col_meta[meta_prop]
    if do_language_detection:
        text_stats = enrich_text_columns(self.df, text_cols)
        columns_lang_stats = text_stats['lang']['by_cols']
        for txt_col in columns_lang_stats:
            desc[txt_col]['languages'] = columns_lang_stats[txt_col]
    else:
        for txt_col in text_cols:
            desc[txt_col]['languages'] = {}
    return {'scheme': scheme, 'desc': desc}
def fit(self, X, y, metric=None, loss=None, feat_type=None,
        dataset_name=None):
    """Infer the classification task from ``y`` and fit via the parent class.

    :param metric: scorer to optimize; defaults to ``f1_macro`` for
        multilabel tasks and ``accuracy`` otherwise.
    :param loss: accepted for interface compatibility; not used here.
    :raises ValueError: if ``y`` is not a supported classification target.
    """
    X, y = self._perform_input_checks(X, y)

    target_kind = type_of_target(y)
    task = self._task_mapping.get(target_kind)
    if task is None:
        raise ValueError('Cannot work on data of type %s' % target_kind)

    if metric is None:
        metric = f1_macro if task == MULTILABEL_CLASSIFICATION else accuracy

    # Re-encode targets and remember the class bookkeeping on self.
    y, self._classes, self._n_classes = self._process_target_classes(y)
    return super().fit(X, y, task, metric, feat_type, dataset_name)
def pac_score(solution, prediction):
    """
    Probabilistic Accuracy based on log_loss metric.

    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.

    :param solution: ground-truth targets; 1-d or 2-d array depending on
        the task (see the per-task reshaping below).
    :param prediction: predicted probabilities in [0, 1], same layout.
    :return: normalized score — 0 for a prior (random) predictor, 1 for a
        perfect one.
    :raises ValueError: on inconsistent binary/multiclass shapes.
    :raises NotImplementedError: for unsupported target types.
    """

    def normalize_array(solution, prediction):
        """
        Use min and max of solution as scaling factors to normalize prediction,
        then threshold it to [0, 1].

        Binarize solution to {0, 1}. This allows applying classification
        scores to all cases. In principle, this should not do anything to
        properly formatted classification inputs and outputs.

        Note: mutates both arguments in place.

        :param solution:
        :param prediction:
        :return: [solution, prediction]
        """
        # Binarize solution
        sol = np.ravel(solution)  # convert to 1-d array
        maxi = np.nanmax(sol[np.isfinite(sol)])
        mini = np.nanmin(sol[np.isfinite(sol)])
        if maxi == mini:
            # Degenerate solution (a single value everywhere): no scale.
            logger.debug('Warning: cannot normalize array')
            return [solution, prediction]
        diff = maxi - mini
        mid = (maxi + mini) / 2.
        solution[solution >= mid] = 1
        solution[solution < mid] = 0
        # Normalize and threshold predictions (takes effect only if solution not
        # in {0, 1})
        prediction -= float(mini)
        prediction /= float(diff)
        # and if predictions exceed the bounds [0, 1]
        prediction[prediction > 1] = 1
        prediction[prediction < 0] = 0
        # Make probabilities smoother
        # new_prediction = np.power(new_prediction, (1./10))
        return [solution, prediction]

    def log_loss(solution, prediction, task):
        """Log loss for binary and multiclass."""
        [sample_num, label_num] = solution.shape
        # Lower gives problems with float32!
        eps = 0.00000003

        if (task == 'multiclass') and (label_num > 1):
            # Make sure the lines add up to one for multi-class classification
            norma = np.sum(prediction, axis=1)
            for k in range(sample_num):
                prediction[k, :] /= np.maximum(norma[k], eps)
            # Keep only the arg-max label per row (one-hot solution).
            sample_num = solution.shape[0]
            for i in range(sample_num):
                j = np.argmax(solution[i, :])
                solution[i, :] = 0
                solution[i, j] = 1
            solution = solution.astype(np.int32, copy=False)
            # For the base prediction, this solution is ridiculous in the
            # multi-label case
        # Bounding of predictions to avoid log(0),1/0,...
        prediction = np.minimum(1 - eps, np.maximum(eps, prediction))
        # Compute the log loss
        pos_class_log_loss = -np.mean(solution * np.log(prediction), axis=0)
        if (task != 'multiclass') or (label_num == 1):
            # The multi-label case is a bunch of binary problems.
            # The second class is the negative class for each column.
            neg_class_log_loss = -np.mean(
                (1 - solution) * np.log(1 - prediction), axis=0)
            log_loss = pos_class_log_loss + neg_class_log_loss
            # Each column is an independent problem, so we average.
            # The probabilities in one line do not add up to one.
            # log_loss = mvmean(log_loss)
            # print('binary {}'.format(log_loss))
            # In the multilabel case, the right thing is to AVERAGE not sum.
            # We return all the scores so we can normalize correctly later on
        else:
            # For the multiclass case the probabilities in one line add up one.
            log_loss = pos_class_log_loss
            # We sum the contributions of the columns.
            log_loss = np.sum(log_loss)
            # print('multiclass {}'.format(log_loss))
        return log_loss

    def prior_log_loss(frac_pos, task):
        """Baseline log loss.

        For multiple classes or labels return the values for each column.

        :param frac_pos: prior probability of the positive class, per column.
        :param task: sklearn target-type string ('multiclass', ...).
        """
        eps = 1e-15
        frac_pos_ = np.maximum(eps, frac_pos)
        if task != 'multiclass':  # binary case
            frac_neg = 1 - frac_pos
            frac_neg_ = np.maximum(eps, frac_neg)
            pos_class_log_loss_ = -frac_pos * np.log(frac_pos_)
            neg_class_log_loss_ = -frac_neg * np.log(frac_neg_)
            base_log_loss = pos_class_log_loss_ + neg_class_log_loss_
            # base_log_loss = mvmean(base_log_loss)
            # print('binary {}'.format(base_log_loss))
            # In the multilabel case, the right thing is to AVERAGE not sum.
            # We return all the scores so we can normalize correctly later on
        else:  # multiclass case
            fp = frac_pos_ / sum(
                frac_pos_)  # Need to renormalize the lines in multiclass case
            # Only ONE label is 1 in the multiclass case active for each line
            pos_class_log_loss_ = -frac_pos * np.log(fp)
            base_log_loss = np.sum(pos_class_log_loss_)
        return base_log_loss

    y_type = type_of_target(solution)

    if y_type == 'binary':
        # Work on column vectors; keep only P(class 1) from 2-column input.
        if len(solution.shape) == 1:
            solution = solution.reshape((-1, 1))
        if len(prediction.shape) == 1:
            prediction = prediction.reshape((-1, 1))
        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError(
                    f'A prediction array with probability values '
                    f'for {prediction.shape[1]} classes is not a binary '
                    f'classification problem')
            # Prediction will be copied into a new binary array - no copy
            prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError(f'Invalid prediction shape {prediction.shape}')

    elif y_type == 'multiclass':
        if len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError(f'Solution array must only contain one class '
                                 f'label, but contains {solution.shape[1]}')
        elif len(solution.shape) == 1:
            pass
        else:
            raise ValueError('Solution.shape %s' % solution.shape)

        # Need to create a multiclass solution and a multiclass predictions
        max_class = int(np.max((np.max(solution), np.max(prediction))))
        solution_binary = np.zeros((len(solution), max_class + 1))
        for i in range(len(solution)):
            # One-hot encode the class label of each sample.
            solution_binary[i, int(solution[i])] = 1
        solution = solution_binary

    elif y_type == 'multilabel-indicator':
        # Copy so that normalize_array's in-place edits don't leak out.
        solution = solution.copy()

    else:
        raise NotImplementedError(f'pac_score does not support task {y_type}')

    solution, prediction = normalize_array(solution, prediction.copy())

    sample_num, _ = solution.shape

    eps = 1e-7
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * np.sum(solution, axis=0, dtype=float)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, y_type)
    the_log_loss = log_loss(solution, prediction, y_type)

    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = np.mean(np.exp(-the_log_loss))
    base_pac = np.mean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score
def pac_score(solution, prediction):
    """
    Probabilistic Accuracy based on log_loss metric.

    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.

    :param solution: ground-truth targets; 1-d or 2-d array depending on
        the task (see the per-task reshaping below).
    :param prediction: predicted probabilities in [0, 1], same layout.
    :return: normalized score — 0 for a prior (random) predictor, 1 for a
        perfect one.
    :raises ValueError: on inconsistent binary/multiclass shapes.
    :raises NotImplementedError: for unsupported target types.
    """
    # NOTE: the deprecated NumPy aliases `sp.maximum`/`sp.minimum` were
    # removed from the SciPy top-level namespace; this version uses the
    # `np.*` functions throughout (matching their original semantics).

    def normalize_array(solution, prediction):
        """
        Use min and max of solution as scaling factors to normalize prediction,
        then threshold it to [0, 1].

        Binarize solution to {0, 1}. This allows applying classification
        scores to all cases. In principle, this should not do anything to
        properly formatted classification inputs and outputs.

        Note: mutates both arguments in place.

        :param solution:
        :param prediction:
        :return: [solution, prediction]
        """
        # Binarize solution
        sol = np.ravel(solution)  # convert to 1-d array
        maxi = np.nanmax(sol[np.isfinite(sol)])
        mini = np.nanmin(sol[np.isfinite(sol)])
        if maxi == mini:
            # Degenerate solution (a single value everywhere): no scale.
            print('Warning, cannot normalize')
            return [solution, prediction]
        diff = maxi - mini
        mid = (maxi + mini) / 2.
        solution[solution >= mid] = 1
        solution[solution < mid] = 0
        # Normalize and threshold predictions (takes effect only if solution not
        # in {0, 1})
        prediction -= float(mini)
        prediction /= float(diff)
        # and if predictions exceed the bounds [0, 1]
        prediction[prediction > 1] = 1
        prediction[prediction < 0] = 0
        # Make probabilities smoother
        # new_prediction = np.power(new_prediction, (1./10))
        return [solution, prediction]

    def log_loss(solution, prediction, task):
        """Log loss for binary and multiclass."""
        [sample_num, label_num] = solution.shape
        # Lower gives problems with float32!
        eps = 0.00000003

        if (task == 'multiclass') and (label_num > 1):
            # Make sure the lines add up to one for multi-class classification
            norma = np.sum(prediction, axis=1)
            for k in range(sample_num):
                prediction[k, :] /= np.maximum(norma[k], eps)
            # Keep only the arg-max label per row (one-hot solution).
            sample_num = solution.shape[0]
            for i in range(sample_num):
                j = np.argmax(solution[i, :])
                solution[i, :] = 0
                solution[i, j] = 1
            solution = solution.astype(np.int32, copy=False)
            # For the base prediction, this solution is ridiculous in the
            # multi-label case
        # Bounding of predictions to avoid log(0),1/0,...
        prediction = np.minimum(1 - eps, np.maximum(eps, prediction))
        # Compute the log loss
        pos_class_log_loss = -np.mean(solution * np.log(prediction), axis=0)
        if (task != 'multiclass') or (label_num == 1):
            # The multi-label case is a bunch of binary problems.
            # The second class is the negative class for each column.
            neg_class_log_loss = -np.mean(
                (1 - solution) * np.log(1 - prediction), axis=0)
            log_loss = pos_class_log_loss + neg_class_log_loss
            # Each column is an independent problem, so we average.
            # The probabilities in one line do not add up to one.
            # log_loss = mvmean(log_loss)
            # In the multilabel case, the right thing is to AVERAGE not sum.
            # We return all the scores so we can normalize correctly later on
        else:
            # For the multiclass case the probabilities in one line add up one.
            log_loss = pos_class_log_loss
            # We sum the contributions of the columns.
            log_loss = np.sum(log_loss)
        return log_loss

    def prior_log_loss(frac_pos, task):
        """Baseline log loss.

        For multiple classes or labels return the values for each column.

        :param frac_pos: prior probability of the positive class, per column.
        :param task: sklearn target-type string ('multiclass', ...).
        """
        eps = 1e-15
        frac_pos_ = np.maximum(eps, frac_pos)
        if task != 'multiclass':  # binary case
            frac_neg = 1 - frac_pos
            frac_neg_ = np.maximum(eps, frac_neg)
            pos_class_log_loss_ = -frac_pos * np.log(frac_pos_)
            neg_class_log_loss_ = -frac_neg * np.log(frac_neg_)
            base_log_loss = pos_class_log_loss_ + neg_class_log_loss_
            # base_log_loss = mvmean(base_log_loss)
            # In the multilabel case, the right thing is to AVERAGE not sum.
            # We return all the scores so we can normalize correctly later on
        else:  # multiclass case
            fp = frac_pos_ / sum(
                frac_pos_)  # Need to renormalize the lines in multiclass case
            # Only ONE label is 1 in the multiclass case active for each line
            pos_class_log_loss_ = -frac_pos * np.log(fp)
            base_log_loss = np.sum(pos_class_log_loss_)
        return base_log_loss

    y_type = type_of_target(solution)

    if y_type == 'binary':
        # Work on column vectors; keep only P(class 1) from 2-column input.
        if len(solution.shape) == 1:
            solution = solution.reshape((-1, 1))
        if len(prediction.shape) == 1:
            prediction = prediction.reshape((-1, 1))
        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem' % prediction.shape[1])
            # Prediction will be copied into a new binary array - no copy
            prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif y_type == 'multiclass':
        if len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
        elif len(solution.shape) == 1:
            pass
        else:
            raise ValueError('Solution.shape %s' % solution.shape)

        # Need to create a multiclass solution and a multiclass predictions
        max_class = int(np.max((np.max(solution), np.max(prediction))))
        solution_binary = np.zeros((len(solution), max_class + 1))
        for i in range(len(solution)):
            # One-hot encode the class label of each sample.
            solution_binary[i, int(solution[i])] = 1
        solution = solution_binary

    elif y_type == 'multilabel-indicator':
        # Copy so that normalize_array's in-place edits don't leak out.
        solution = solution.copy()

    else:
        raise NotImplementedError('pac_score does not support task type %s' %
                                  y_type)

    solution, prediction = normalize_array(solution, prediction.copy())

    sample_num, _ = solution.shape

    eps = 1e-7
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * np.sum(solution, axis=0, dtype=float)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, y_type)
    the_log_loss = log_loss(solution, prediction, y_type)

    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = np.mean(np.exp(-the_log_loss))
    base_pac = np.mean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score