Example No. 1
File: cw1.py Project: arlyon/dmml
def variance_sweep(features, labels, seed, save_plot, show_plot, step=500):
    """
    Performs a sweep across the range of variance to establish the value of removing
    low variance pixels
    """
    model = KMeans(n_clusters=10)
    variance_analysis = []
    selector = VarianceThreshold()
    selector.fit(features)

    # Sweeps through variance range
    print("Performing sweep of variance thresholding...")
    # The bounds correspond to approximately 0 features selected and all features selected
    # lower bound 2900
    # upper bound 6450
    with click.progressbar(range(2900, 6450, step)) as variance_range:
        for variance in variance_range:
            selector.set_params(threshold=variance)
            selected_features = selector.transform(features)
            numpy.random.set_state(seed)  # reset the RNG to the same state before each clustering run
            predictions = model.fit_predict(selected_features)
            variance_analysis.append(
                (variance, score_clustering(labels, predictions)))

    # Plots results from variance sweep
    if show_plot or save_plot:
        data = list(zip(*[(x, *y.values()) for x, y in variance_analysis]))
        name = "Variance"
        handles = plt.plot(data[0], data[3], '-b', label=name + " V Score")
        handles += plt.plot(data[0], data[4], '--b', label=name + " Rand")
        plt.legend(handles=handles, loc="lower left")
        plt.xlabel("Variance Threshold for Feature Selection")
        plt.title("Effect of Variance Threshold Feature Selection")
        if save_plot is not None:
            path = os.path.join(save_plot, "variance_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Plots the variances of each pixel as a heatmap
    plt.imshow(selector.variances_.reshape(48, -1),
               cmap='hot',
               interpolation='lanczos')
    plt.title("Heatmap of Variances between Images")
    if save_plot is not None:
        path = os.path.join(save_plot, "variance_heatmap.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()
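A note on the hard-coded bounds above (2900 and 6450): per the in-code comment, the lower bound keeps roughly all features and the upper bound roughly none, i.e. they approximate the minimum and maximum pixel variances of that dataset. A minimal sketch of how equivalent bounds could be derived from the data itself, assuming features is a 2-D array of flattened images (the function name and the choice of 10 steps are illustrative, not part of the original project):

import numpy
from sklearn.feature_selection import VarianceThreshold

def variance_bounds(features, steps=10):
    """Derive sweep bounds and a step size from the observed pixel variances."""
    selector = VarianceThreshold()
    selector.fit(features)
    low = int(selector.variances_.min())   # below this threshold, no feature is removed
    high = int(selector.variances_.max())  # above this threshold, every feature is removed
    return low, high, max((high - low) // steps, 1)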
Example No. 2
class VarianceThreshold(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using variance threshold.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Variance Threshold'

    def __init__(self, **kwargs):
        r"""Initialize VarianceThreshold feature selection algorithm.
        """
        self._params = dict(
            threshold=ParameterDefinition(MinMax(0, 0.1), np.float))
        self.__variance_threshold = VarThr()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.
        """
        self.__variance_threshold.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        self.__variance_threshold.fit(x)
        return self.__variance_threshold.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(
                self.__variance_threshold.get_params()))
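For context, select_features above delegates to scikit-learn's VarianceThreshold (fit followed by get_support), so it returns a boolean mask over the columns of x. A minimal standalone sketch of that behaviour, using a small illustrative DataFrame:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

x = pd.DataFrame({'a': [1.0, 1.0, 1.0], 'b': [0.1, 0.5, 0.9]})
selector = VarianceThreshold(threshold=0.01)
selector.fit(x)                # computes the per-column variances
print(selector.get_support())  # [False  True] -- 'a' has zero variance and is dropped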
Example No. 3
def main():
    # get_ipython().run_line_magic('matplotlib', 'inline')
    data_dir = 'data'
    file_name = 'credit_card_default.csv'
    #column_names = ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
    file_url = ''
    if not is_running_from_ipython():
        abspath = os.path.abspath('.')
        file_url = 'file://' + abspath + os.path.sep + data_dir + os.path.sep + file_name
    else:
        file_url = os.path.join('../', data_dir, file_name)
    # logging.debug('abspath %s', abspath)
    pp = PredictiveProcessor()
    pp.file_url = file_url
    df = pp.data_read_csv()
    pp.df = df
    # pp.problem_understanding()
    pp.data_preparation()
    pp.data_analysis_exploratory()
    pp.data_model_building()
    pp.model_evaluation()
    pp.model_deployment()
    logger.info("LL: -----------------------------------------------")

    # check whether any column contains null values
    for column in df.columns:
        # logger.info("LL: column %s datatype %s is having null : %s , about %s", column, df.dtypes[column], df[column].isnull().values.any(), df[column].isnull().values.sum())
        # logger.info("LL: column %s datatype %s is having NA : %s , about %s", column, df.dtypes[column], df[column].isna().values.any(), df[column].isna().values.sum())
        if (df[column].isnull().values.any()
                or df[column].isna().values.any()):
            if (df.dtypes[column] == 'int64'):
                logger.info("LL: yes int64")
                # df_example = df[column].fillna(0)
            else:
                logger.info("LL: not int64")
    #alternatively use missingno
    #-----------missingno
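    # A minimal sketch of the missingno alternative mentioned above (assumption:
    # the optional missingno package is installed; it is not used elsewhere in
    # this script, so the lines are left commented out):
    # import missingno as msno
    # msno.matrix(df)   # nullity matrix: one column per feature, gaps mark missing values
    # msno.bar(df)      # bar chart of non-null counts per column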

    logger.info("LL: df.shape \n%s", df.shape)

    logger.info("LL: df correlation \n%s", df.corr().round(2))
    logger.info("LL: df correlation columns \n%s", df.corr().columns)
    logger.info("LL: df correlation index \n%s", df.corr().index)
    logger.info("LL: df covariance \n%s", df.cov().round(2))
    logger.info("LL: df covariance columns \n%s", df.cov().columns)
    logger.info("LL: df covariance index \n%s", df.cov().index)
    # drop rows with null values
    df_dropped_rows_na = df.dropna(axis=0)
    logger.info("LL: df_dropped_rows_na.shape %s", df_dropped_rows_na.shape)
    # drop columns with null values
    df_dropped_cols_na = df.dropna(axis=1)
    logger.info("LL: df_dropped_cols_na.shape %s", df_dropped_cols_na.shape)

    #impute nan with new values
    # missing_values = the form missing values take in your data (for example nan, 0, or "n/a").
    # strategy = how to impute (choices are "mean", "median", "most_frequent", and "constant").
    # If you pass strategy="constant", you can use the optional argument fill_value to pass your constant.

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    cols_to_impute = df.columns
    out_imp = imputer.fit_transform(df[cols_to_impute])
    df_new = pd.DataFrame(data=out_imp, columns=cols_to_impute)
    # df_new = pd.concat([df_new, df[['species']]], axis = 1)

    minmax_scaler = MinMaxScaler()
    cols_to_minmaxscale = df.columns
    out_scaled_minmax = minmax_scaler.fit_transform(df[cols_to_minmaxscale])

    standard_scaler = StandardScaler()
    cols_to_standardscale = df.columns
    out_scaled_standard = standard_scaler.fit_transform(
        df[cols_to_standardscale])

    # encode categorical nonint features
    categorical_features_nonint = []
    for column in df.columns:
        if df.dtypes[column] != 'int64' and df.dtypes[column] != 'int32':
            logger.info("LL: not int64 or int32")
            categorical_features_nonint.append(column)
            # df_example = df[column].fillna(0)
        else:
            logger.info("LL: yes int64 or int32")

    from sklearn.preprocessing import OrdinalEncoder
    enc_ordinal = OrdinalEncoder()
    out_enc_ord_catg_feat_nonint = enc_ordinal.fit_transform(
        df[categorical_features_nonint])
    logger.info("LL: out_enc categories \n%s", enc_ordinal.categories_)
    logger.info("LL: out_enc  \n%s", out_enc_ord_catg_feat_nonint)
    df[categorical_features_nonint] = out_enc_ord_catg_feat_nonint
    logger.info("LL: df_new  \n%s", df.head())

    # One-hot Enconding
    from sklearn.preprocessing import OneHotEncoder
    enc_onehot = OneHotEncoder(sparse=False)
    out_enc_onehot_catg_feat_nonint = enc_onehot.fit_transform(
        df[categorical_features_nonint])
    new_cols_onehot_catg_feat_nonint = enc_onehot.get_feature_names(
        categorical_features_nonint).tolist()
    logger.info("LL: new_cols \n%s", new_cols_onehot_catg_feat_nonint)

    # Label encoding (LabelEncoder works on one column of values at a time)
    from sklearn import preprocessing
    enc_label = preprocessing.LabelEncoder()
    out_enc_label = df[categorical_features_nonint].apply(enc_label.fit_transform)

    # Dimension Reduction
    #  Feature Selection
    #  Feature Filtering
    #   Variance Threshold
    #   Correlation Coefficient

    # Variance Threshold
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold()
    cols = df.columns
    # cols = categorical_features_nonint
    selector.fit(df[cols])

    # check feature variances before selection
    logger.info("LL: variance treshold \n%s", selector.variances_)

    # set threshold into selector object
    selector.set_params(threshold=1.0)
    out_sel = selector.fit_transform(df[cols])

    logger.info("LL: selector.get_support() \n%s", selector.get_support)
    df_sel = df.iloc[:, selector.get_support()]

    # add labels to new dataframe and sanity check
    df_sel = pd.concat([df_sel, df[['default payment next month']]], axis=1)
    logger.info("LL: df_sel.head() \n%s", df_sel.head())

    #Correlation Coefficient
    cor = df.corr()
    sns.heatmap(cor, annot=False, cmap=plt.cm.Blues)
    logger.info("LL: plt.show() \n%s", plt.show())

    # get correlation values with target variable
    cor_target = abs(cor['default payment next month'])
    logger.info("LL: cor_target \n%s", cor_target)

    #For demonstration purposes, we will choose 0.6 as the threshold and then filter. From the output, you should expect columns 5 and 12 (0.69 and 0.74) to be selected:
    selected_cols = cor_target[cor_target > 0.6]
    logger.info("LL: selected columns, correlation with target > 0.6")
    logger.info("LL: selected_cols \n%s", selected_cols)
    # filter in the selected features
    df_sel = df[selected_cols.index]
    logger.info("LL: df_sel.head() \n%s", def_sel.head())

    # Wrapper Methods
    #  Sequential Feature Selection
    #   Forward Sequential Selection and Backward Sequential Selection
    #   LinearRegression() for continuous target variables and RandomForestClassifier() for categorical target variables
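    # A minimal sketch of the forward sequential selection described above
    # (assumptions: scikit-learn >= 0.24 for SequentialFeatureSelector; the
    # estimator and n_features_to_select values are illustrative, not taken
    # from the original script):
    from sklearn.feature_selection import SequentialFeatureSelector
    from sklearn.ensemble import RandomForestClassifier

    sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10),
                                    n_features_to_select=5, direction='forward')
    X_sfs = df.drop(columns=['default payment next month'])
    sfs.fit(X_sfs, df['default payment next month'])
    logger.info("LL: sequentially selected columns \n%s",
                X_sfs.columns[sfs.get_support()])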

    # We will use the Support Vector Machine Classifier ("SVC") as the estimator for our example RFE.
    # Now let's import our modules and define the independent (X) and dependent (y) variables for the SVC:
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    cols = df.columns.drop('default payment next month')
    X = df[cols]
    y = df['default payment next month']

    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=2, step=1)
    rfe.fit(X, y)

    logger.info("LL: cols \n%s", cols)
    logger.info("LL: rfe.ranking_")

    logger.info("LL: -----------------------------------------------")