def binary_roc_graph(y_true, y_pred, **kwargs):
    """
    Plot the ROC curve of a binary-class predictor, with its AUC score
    shown in the curve's legend label.

    Data can be either:
    (1) one-dimensional, where the values of y_true are the true class
        (0 or 1) and y_pred holds the predicted probability of class 1, or
    (2) two-dimensional, where each row of y_true is a one-hot-encoding of
        the true class and y_pred holds the predicted probabilities of each
        class. Only the second column of y_pred (class 1) is used as score.
    A two-dimensional input with a single column is handled like case (1).

    For example, consider a data-set of two data-points where the true class
    of the first row is class 0, predicted with a probability of 0.6, and the
    second row's true class is 1, predicted with a probability of 0.8.
    In the first configuration, the input will be:
    y_true = [0,1], y_pred = [0.4,0.8] (probability of class 1).
    In the second configuration, the input will be:
    y_true = [[1,0],[0,1]], y_pred = [[0.6,0.4],[0.2,0.8]].

    Based on sklearn examples (as was seen on April 2018):
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    Parameters
    ----------
    y_true : list / NumPy ndarray
        The true classes of the predicted data
    y_pred : list / NumPy ndarray
        The predicted probabilities
    kwargs : any key-value pairs
        Recognized options: `color` (default 'darkorange'), `lw` (default 2),
        `ls` (default '-'), `fmt` (AUC format, default '.2f'),
        `class_label` (appended to the legend label), `new_figure`
        (default True), `show_graphs` (default True) and `return_pr`
        (default False).

    Returns
    -------
    A dict with the keys 'fpr' and 'tpr' if `return_pr` is True,
    otherwise None.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    y_true = convert(y_true, 'array')
    y_pred = convert(y_pred, 'array')
    if y_pred.shape != y_true.shape:
        raise ValueError('y_true and y_pred must have the same shape')
    elif len(y_pred.shape) == 1:
        y_t = y_true
        y_p = y_pred
    elif y_pred.shape[1] == 1:
        # A single column carries the labels/scores directly; indexing
        # column 1 here would raise an IndexError.
        y_t = y_true[:, 0]
        y_p = y_pred[:, 0]
    else:
        # One-hot rows -> class index; column 1 is the positive-class score.
        y_t = np.argmax(y_true, axis=1)
        y_p = y_pred[:, 1]
    fpr, tpr, _ = roc_curve(y_t, y_p)
    auc_score = auc(fpr, tpr)

    color = kwargs.get('color', 'darkorange')
    lw = kwargs.get('lw', 2)
    ls = kwargs.get('ls', '-')
    fmt = kwargs.get('fmt', '.2f')
    if 'class_label' in kwargs:
        class_label = ': {}'.format(kwargs['class_label'])
    else:
        class_label = ''

    if kwargs.get('new_figure', True):
        plt.figure()
    plt.plot(fpr, tpr, color=color, lw=lw, ls=ls,
             label='ROC curve{class_label} (AUC = {auc:{fmt}})'.format(
                 class_label=class_label, auc=auc_score, fmt=fmt))
    if kwargs.get('show_graphs', True):
        _display_plot()
    if kwargs.get('return_pr', False):
        return {'fpr': fpr, 'tpr': tpr}
def correlation_ratio(categories, measurements):
    """
    Calculates the Correlation Ratio (sometimes marked by the greek letter
    Eta) for categorical-continuous association.
    Answers the question - given a continuous value of a measurement, is it
    possible to know which category is it associated with?
    Value is in the range [0,1], where 0 means a category cannot be
    determined by a continuous measurement, and 1 means a category can be
    determined with absolute certainty.

    Eta is the square root of the ratio between the weighted variance of the
    per-category means and the total variance of the measurements.

    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    :param categories: list / NumPy ndarray / Pandas DataFrame
        A sequence of categorical measurements
    :param measurements: list / NumPy ndarray / Pandas DataFrame
        A sequence of continuous measurements
    :return: float in the range of [0,1]
    """
    categories = convert(categories, 'array')
    measurements = convert(measurements, 'array')
    # Map every category to an integer code 0..cat_num-1
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    # Size-weighted mean of the per-category means
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(
        np.multiply(n_array,
                    np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        # Also covers constant measurements, where the denominator is 0 too
        eta = 0.0
    else:
        # numerator / denominator is eta squared; eta is its square root
        eta = np.sqrt(numerator / denominator)
    return eta
def roc_graph(y_true, y_pred, micro=True, macro=True, **kwargs):
    """
    Plot a ROC graph of predictor's results (including AUC scores), where
    each row of y_true and y_pred represent a single example.
    If there are 1 or two columns only, the data is treated as a binary
    classification, in which the result is similar to the
    `binary_roc_graph` method, see its documentation for more information.
    If there are more than 2 columns, each column is considered a unique
    class, and a ROC graph and AUC score will be computed for each.
    A Macro-ROC and Micro-ROC are computed and plotted too by default.

    Based on sklearn examples (as was seen on April 2018):
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    **Example:** See `roc_graph_example` under `dython.examples`

    Parameters
    ----------
    y_true : list / NumPy ndarray
        The true classes of the predicted data
    y_pred : list / NumPy ndarray
        The predicted classes
    micro : Boolean, default = True
        Whether to calculate a Micro ROC graph (not applicable for binary
        cases)
    macro : Boolean, default = True
        Whether to calculate a Macro ROC graph (not applicable for binary
        cases)
    kwargs : any key-value pairs
        Different options and configurations, forwarded to
        `binary_roc_graph`. In the multi-class case the options `color`,
        `class_label`, `new_figure`, `show_graphs` and `return_pr` are set
        per curve by this function (as is `ls` for the micro curve), so
        values supplied for them are overridden.

    Returns
    -------
    The result of `binary_roc_graph` in the binary case, otherwise None.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    all_fpr = list()
    all_tpr = list()
    y_true = convert(y_true, 'array')
    y_pred = convert(y_pred, 'array')
    if y_pred.shape != y_true.shape:
        raise ValueError('y_true and y_pred must have the same shape')
    if len(y_pred.shape) == 1 or y_pred.shape[1] <= 2:
        return binary_roc_graph(y_true, y_pred, **kwargs)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    n = y_pred.shape[1]
    plt.figure()
    # All curves share one figure that is displayed once at the end, and
    # each per-class call must hand back its fpr/tpr for the macro curve.
    shared = dict(kwargs, new_figure=False, show_graphs=False,
                  return_pr=True)
    for i in range(0, n):
        # Merging into a dict (rather than passing explicit keywords next to
        # **kwargs) avoids a TypeError when the caller supplied the same key.
        curve_options = dict(shared, color=colors[i % len(colors)],
                             class_label=i)
        pr = binary_roc_graph(y_true[:, i], y_pred[:, i], **curve_options)
        all_fpr.append(pr['fpr'])
        all_tpr.append(pr['tpr'])
    if micro:
        micro_options = dict(shared, ls=':', color='deeppink',
                             class_label='micro')
        binary_roc_graph(y_true.ravel(), y_pred.ravel(), **micro_options)
    if macro:
        _plot_macro_roc(all_fpr, all_tpr, n)
    _display_plot()
def correlation_ratio(categories, measurements, nan_strategy=REPLACE,
                      nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Compute the Correlation Ratio (often written as the greek letter Eta),
    a measure of association between a categorical and a continuous
    variable.
    It answers the question: knowing the continuous measurement, how well
    can the category it belongs to be determined? The value lies in [0,1]:
    0 means the measurement tells nothing about the category, 1 means the
    category is fully determined by it.

    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    **Returns:** float in the range of [0,1]

    Parameters
    ----------
    categories : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    measurements : list / NumPy ndarray / Pandas Series
        A sequence of continuous measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.
    """
    if nan_strategy == REPLACE:
        categories, measurements = replace_nan_with_value(
            categories, measurements, nan_replace_value)
    elif nan_strategy == DROP:
        categories, measurements = remove_incomplete_samples(
            categories, measurements)
    categories = convert(categories, 'array')
    measurements = convert(measurements, 'array')

    # Integer code per sample; codes run from 0 to group_count - 1
    codes, _ = pd.factorize(categories)
    group_count = np.max(codes) + 1
    group_means = np.zeros(group_count)
    group_sizes = np.zeros(group_count)
    for code in range(group_count):
        members = measurements[np.argwhere(codes == code).flatten()]
        group_sizes[code] = len(members)
        group_means[code] = np.average(members)

    # Size-weighted mean of the group means
    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)
    between_groups = np.sum(
        group_sizes * np.power(group_means - grand_mean, 2))
    total = np.sum(np.power(measurements - grand_mean, 2))

    if numerator_is_zero(between_groups):
        return 0.0
    return np.sqrt(between_groups / total)


def numerator_is_zero(value):
    """Tell whether the between-groups sum of squares equals zero."""
    return value == 0
def numerical_encoding(dataset, nominal_columns='all',
                       drop_single_label=False, drop_fact_dict=True):
    """
    Encoding a data-set with mixed data (numerical and categorical) to a
    numerical-only data-set, using the following logic:
    * categorical with only a single value will be marked as zero (or
      dropped, if requested)
    * categorical with two values will be replaced with the result of
      Pandas `factorize`
    * categorical with more than two values will be replaced with the result
      of Pandas `get_dummies`
    * numerical columns will not be modified

    **Returns:** DataFrame or (DataFrame, dict). If `drop_fact_dict` is
    True, returns the encoded DataFrame. Else, returns a tuple of the
    encoded DataFrame and dictionary, where each key is a two-value column,
    and the value is the original labels, as supplied by Pandas `factorize`.
    Will be empty if no two-value columns are present in the data-set

    Parameters
    ----------
    dataset : NumPy ndarray / Pandas DataFrame
        The data-set to encode
    nominal_columns : sequence / string
        A sequence of the nominal (categorical) columns in the dataset. If
        string, must be 'all' to state that all columns are nominal. If
        None, nothing happens. Default: 'all'
    drop_single_label : Boolean, default = False
        If True, nominal columns with a only a single value will be dropped.
    drop_fact_dict : Boolean, default = True
        If True, the return value will be the encoded DataFrame alone. If
        False, it will be a tuple of the DataFrame and the dictionary of the
        binary factorization (originating from pd.factorize)
    """
    dataset = convert(dataset, 'dataframe')
    if nominal_columns is None:
        return dataset
    # Only compare against 'all' for real strings: comparing an ndarray
    # with == yields an array whose truth value is ambiguous.
    if isinstance(nominal_columns, str) and nominal_columns == 'all':
        nominal_columns = dataset.columns

    # Start from the data-set's own index so scalar and ndarray columns
    # (which carry no index of their own) line up with the Series columns.
    converted_dataset = pd.DataFrame(index=dataset.index)
    binary_columns_dict = dict()
    for col in dataset.columns:
        if col not in nominal_columns:
            converted_dataset[col] = dataset[col]
            continue
        unique_values = pd.unique(dataset[col])
        if len(unique_values) == 1:
            # A single-valued column is either marked as zero or left out
            if not drop_single_label:
                converted_dataset[col] = 0
        elif len(unique_values) == 2:
            codes, labels = pd.factorize(dataset[col])
            converted_dataset[col] = codes
            binary_columns_dict[col] = labels
        else:
            dummies = pd.get_dummies(dataset[col], prefix=col)
            converted_dataset = pd.concat([converted_dataset, dummies],
                                          axis=1)
    if drop_fact_dict:
        return converted_dataset
    else:
        return converted_dataset, binary_columns_dict
def identify_nominal_columns(dataset, include=None):
    """Given a dataset, identify categorical columns.

    Parameters:
    -----------
    dataset : a pandas dataframe
    include : which column types to filter by; when None (default),
        ['object', 'category'] is used

    Returns:
    --------
    categorical_columns : a list of categorical columns

    Example:
    --------
    >> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]})
    >> identify_nominal_columns(df)
    ['col1']

    """
    # Build the default per call instead of sharing one mutable list object
    # between all calls through the signature.
    if include is None:
        include = ['object', 'category']
    dataset = convert(dataset, 'dataframe')
    nominal_columns = list(dataset.select_dtypes(include=include).columns)
    return nominal_columns
def associations(dataset, nominal_columns=None, mark_columns=False,
                 theil_u=False, plot=True, return_results=False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in
    data-set with both categorical (nominal) and continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases

    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or
        '(con)' based on their type (nominal or continuous), as provided by
        nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U
        instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed
        associations
    :param kwargs:
        Arguments to be passed to used function and methods. The keys
        'figsize', 'annot' (default True) and 'fmt' (default '.2f') are read
        for the heat-map.
    :return: Pandas DataFrame
        A DataFrame of the correlation/strength-of-association between all
        features, only if return_results is True (otherwise None)
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif isinstance(nominal_columns, str) and nominal_columns == 'all':
        # The isinstance guard keeps ndarray input away from ==, whose
        # element-wise result has no single truth value.
        nominal_columns = columns

    corr = pd.DataFrame(index=columns, columns=columns)
    # Cells are written with corr.loc[row, column]: chained indexing
    # (corr[column][row] = ...) may assign into a temporary copy.
    for i in range(0, len(columns)):
        col_i = columns[i]
        i_is_nominal = col_i in nominal_columns
        corr.loc[col_i, col_i] = 1.0
        for j in range(i + 1, len(columns)):
            col_j = columns[j]
            j_is_nominal = col_j in nominal_columns
            if i_is_nominal and j_is_nominal:
                if theil_u:
                    # Theil's U is asymmetric, so each direction gets its
                    # own cell: row x, column y holds theils_u(x, y).
                    corr.loc[col_i, col_j] = theils_u(dataset[col_i],
                                                      dataset[col_j])
                    corr.loc[col_j, col_i] = theils_u(dataset[col_j],
                                                      dataset[col_i])
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j])
            elif i_is_nominal:
                cell = correlation_ratio(dataset[col_i], dataset[col_j])
            elif j_is_nominal:
                # correlation_ratio takes the categorical column first
                cell = correlation_ratio(dataset[col_j], dataset[col_i])
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            corr.loc[col_j, col_i] = cell
            corr.loc[col_i, col_j] = cell
    corr.fillna(value=np.nan, inplace=True)

    if mark_columns:
        marked_columns = [
            '{} (nom)'.format(col)
            if col in nominal_columns else '{} (con)'.format(col)
            for col in columns
        ]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', None))
        sns.heatmap(corr, annot=kwargs.get('annot', True),
                    fmt=kwargs.get('fmt', '.2f'))
        plt.show()
    if return_results:
        return corr
def numerical_encoding(dataset, nominal_columns='auto',
                       drop_single_label=False, drop_fact_dict=True,
                       nan_strategy=REPLACE,
                       nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Encoding a data-set with mixed data (numerical and categorical) to a
    numerical-only data-set using the following logic:
    * categorical with only a single value will be marked as zero (or
      dropped, if requested)
    * categorical with two values will be replaced with the result of
      Pandas `factorize`
    * categorical with more than two values will be replaced with the result
      of Pandas `get_dummies`
    * numerical columns will not be modified

    **Returns:** DataFrame or (DataFrame, dict). If `drop_fact_dict` is
    True, returns the encoded DataFrame. Else, returns a tuple of the
    encoded DataFrame and dictionary, where each key is a two-value column,
    and the value is the original labels, as supplied by Pandas `factorize`.
    Will be empty if no two-value columns are present in the data-set

    Parameters
    ----------
    dataset : NumPy ndarray / Pandas DataFrame
        The data-set to encode
    nominal_columns : sequence / string. default = 'auto'
        A sequence of the nominal (categorical) columns in the dataset. If
        string, must be 'all' to state that all columns are nominal, or
        'auto' to identify categorical columns based on dtype. If None,
        nothing happens.
    drop_single_label : Boolean, default = False
        If True, nominal columns with a only a single value will be dropped.
    drop_fact_dict : Boolean, default = True
        If True, the return value will be the encoded DataFrame alone. If
        False, it will be a tuple of the DataFrame and the dictionary of the
        binary factorization (originating from pd.factorize)
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop_samples' to remove
        samples with missing values, 'drop_features' to remove features
        (columns) with missing values, or 'replace' to replace all missing
        values with the nan_replace_value. Missing values are None and
        np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    """
    dataset = convert(dataset, 'dataframe')
    # Rebind instead of editing in place: `convert` may hand back the
    # caller's own DataFrame (not verifiable from here), which must not be
    # modified as a side effect.
    if nan_strategy == REPLACE:
        dataset = dataset.fillna(nan_replace_value)
    elif nan_strategy == DROP_SAMPLES:
        dataset = dataset.dropna(axis=0)
    elif nan_strategy == DROP_FEATURES:
        dataset = dataset.dropna(axis=1)

    if nominal_columns is None:
        return dataset
    # Only compare real strings: comparing an ndarray with == yields an
    # array whose truth value is ambiguous.
    if isinstance(nominal_columns, str):
        if nominal_columns == 'all':
            nominal_columns = dataset.columns
        elif nominal_columns == 'auto':
            nominal_columns = identify_nominal_columns(dataset)

    # Start from the data-set's own index so scalar and ndarray columns
    # (which carry no index of their own) line up with the Series columns.
    converted_dataset = pd.DataFrame(index=dataset.index)
    binary_columns_dict = dict()
    for col in dataset.columns:
        if col not in nominal_columns:
            converted_dataset[col] = dataset[col]
            continue
        unique_values = pd.unique(dataset[col])
        if len(unique_values) == 1:
            # A single-valued column is either marked as zero or left out
            if not drop_single_label:
                converted_dataset[col] = 0
        elif len(unique_values) == 2:
            codes, labels = pd.factorize(dataset[col])
            converted_dataset[col] = codes
            binary_columns_dict[col] = labels
        else:
            dummies = pd.get_dummies(dataset[col], prefix=col)
            converted_dataset = pd.concat([converted_dataset, dummies],
                                          axis=1)
    if drop_fact_dict:
        return converted_dataset
    else:
        return converted_dataset, binary_columns_dict
def associations(dataset, nominal_columns='auto', mark_columns=False,
                 theil_u=False, plot=True, return_results=False,
                 nan_strategy=REPLACE,
                 nan_replace_value=DEFAULT_REPLACE_VALUE, ax=None,
                 **kwargs):
    """
    Calculate the correlation/strength-of-association of features in
    data-set with both categorical (nominal) and continuous features using:
     * Pearson's R for continuous-continuous cases
     * Correlation Ratio for categorical-continuous cases
     * Cramer's V or Theil's U for categorical-categorical cases

    **Returns:** a DataFrame of the correlation/strength-of-association
    between all features, only if `return_results` is True (otherwise None)

    **Example:** see `associations_example` under `dython.examples`

    Parameters
    ----------
    dataset : NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    nominal_columns : string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        'auto' (default) to try to identify nominal columns, or None to
        state none are categorical
    mark_columns : Boolean, default = False
        if True, output's columns' names will have a suffix of '(nom)' or
        '(con)' based on their type (nominal or continuous), as provided by
        nominal_columns
    theil_u : Boolean, default = False
        In the case of categorical-categorical features, use Theil's U
        instead of Cramer's V
    plot : Boolean, default = True
        If True, plot a heat-map of the correlation matrix
    return_results : Boolean, default = False
        If True, the function will return a Pandas DataFrame of the computed
        associations
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop_samples' to remove
        samples with missing values, 'drop_features' to remove features
        (columns) with missing values, or 'replace' to replace all missing
        values with the nan_replace_value. Missing values are None and
        np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the heat-map will be plotted. When given,
        no new figure is created and the plot is not shown by this function.
    kwargs : any key-value pairs
        Arguments to be passed to used function and methods. The keys
        'figsize', 'annot' (default True) and 'fmt' (default '.2f') are read
        for the heat-map.
    """
    dataset = convert(dataset, 'dataframe')
    # Rebind instead of editing in place: `convert` may hand back the
    # caller's own DataFrame (not verifiable from here), which must not be
    # modified as a side effect.
    if nan_strategy == REPLACE:
        dataset = dataset.fillna(nan_replace_value)
    elif nan_strategy == DROP_SAMPLES:
        dataset = dataset.dropna(axis=0)
    elif nan_strategy == DROP_FEATURES:
        dataset = dataset.dropna(axis=1)

    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif isinstance(nominal_columns, str):
        # The isinstance guard keeps ndarray input away from ==, whose
        # element-wise result has no single truth value.
        if nominal_columns == 'all':
            nominal_columns = columns
        elif nominal_columns == 'auto':
            nominal_columns = identify_nominal_columns(dataset)

    corr = pd.DataFrame(index=columns, columns=columns)
    # Cells are written with corr.loc[row, column]: chained indexing
    # (corr[column][row] = ...) may assign into a temporary copy.
    # Missing values were dealt with above, so the pairwise helpers are
    # called with nan_strategy=SKIP (presumably "do no further handling").
    for i in range(0, len(columns)):
        col_i = columns[i]
        i_is_nominal = col_i in nominal_columns
        corr.loc[col_i, col_i] = 1.0
        for j in range(i + 1, len(columns)):
            col_j = columns[j]
            j_is_nominal = col_j in nominal_columns
            if i_is_nominal and j_is_nominal:
                if theil_u:
                    # Theil's U is asymmetric, so each direction gets its
                    # own cell: row x, column y holds theils_u(x, y).
                    corr.loc[col_i, col_j] = theils_u(
                        dataset[col_i], dataset[col_j], nan_strategy=SKIP)
                    corr.loc[col_j, col_i] = theils_u(
                        dataset[col_j], dataset[col_i], nan_strategy=SKIP)
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j],
                                 nan_strategy=SKIP)
            elif i_is_nominal:
                cell = correlation_ratio(dataset[col_i], dataset[col_j],
                                         nan_strategy=SKIP)
            elif j_is_nominal:
                # correlation_ratio takes the categorical column first
                cell = correlation_ratio(dataset[col_j], dataset[col_i],
                                         nan_strategy=SKIP)
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            corr.loc[col_j, col_i] = cell
            corr.loc[col_i, col_j] = cell
    corr.fillna(value=np.nan, inplace=True)

    if mark_columns:
        marked_columns = [
            '{} (nom)'.format(col)
            if col in nominal_columns else '{} (con)'.format(col)
            for col in columns
        ]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        # Only manage the figure life-cycle when the caller gave no axis
        if ax is None:
            plt.figure(figsize=kwargs.get('figsize', None))
        sns.heatmap(corr, annot=kwargs.get('annot', True),
                    fmt=kwargs.get('fmt', '.2f'), ax=ax)
        if ax is None:
            plt.show()
    if return_results:
        return corr