예제 #1
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def stripplot_(self, adj_left=.1, adj_bottom=.1):
     """Draw a strip plot of every feature with a point-plot overlay.

     ``self.df`` is melted into long form (one row per feature value,
     keeping ``self.key`` as the identifier column) and plotted twice on
     the same axes: once as semi-transparent jittered strips and once as
     diamond markers produced by ``sns.pointplot``.

     Args:
         adj_left: Forwarded to ``customplot`` as ``adj_left``.
         adj_bottom: Forwarded to ``customplot`` as ``adj_bottom``.

     Returns:
         The figure that owns the axes returned by ``sns.pointplot``.
     """
     long_df = pd.melt(self.df, [self.key], var_name='Features')
     customplot(adj_bottom=adj_bottom, adj_left=adj_left)

     # Both layers share the same data mapping.
     mapping = dict(x="value", y="Features", hue=self.key, data=long_df)

     sns.stripplot(dodge=True, jitter=True, alpha=.25, zorder=1, **mapping)
     axes = sns.pointplot(dodge=.532, join=False, palette="dark",
                          markers="d", scale=.75, ci=None, **mapping)

     # Keep only the first n legend entries, n being the number of distinct
     # values in self.y (presumably to drop the entries duplicated by
     # layering two hue-coloured plots).
     handles, labels = axes.get_legend_handles_labels()
     n_classes = len(np.unique(self.y))
     axes.legend(handles[:n_classes], labels[:n_classes],
                 loc='best', handletextpad=0, columnspacing=1, frameon=True)
     return axes.get_figure()
예제 #2
0
    def plotregression_(self, **options):
        """Scatter predicted values against actual values.

        Keyword options:
            adj_left (default 0.12): forwarded to ``customplot``.
            adj_bottom (default 0.12): forwarded to ``customplot``.
            cross_val_predict (default False): when truthy the training
                points come from ``self.predictCVDF`` instead of
                ``self.predictTrainDF``.

        The test points (``self.predictTestDF``) and the legend are drawn
        only when ``self.intercv`` is falsy. ``self.abline(1, 0)`` is then
        called before the axis labels are set.

        Returns:
            The figure created by ``customplot``.
        """
        left = options.get('adj_left', 0.12)
        bottom = options.get('adj_bottom', 0.12)
        use_cv = options.get('cross_val_predict', False)

        fig, ax = customplot(adj_left=left, adj_bottom=bottom)

        # Both sources are labelled 'Training'; only the data frame differs.
        train_df = self.predictCVDF if use_cv else self.predictTrainDF
        ax.scatter(train_df.Actual, train_df.Prediction,
                   s=70, c='b', marker='o', label='Training')

        if not self.intercv:
            test_df = self.predictTestDF
            ax.scatter(test_df.Actual, test_df.Prediction,
                       s=70, c='r', marker='v', label='Testing')
            ax.legend()

        self.abline(1, 0)
        plt.xlabel('Actual')
        plt.ylabel('Prediction')
        return fig
예제 #3
0
파일: optimize.py 프로젝트: Shidiq/nanolib
    def plot_(self, adj=None, yerr=True):
        """Draw a bar chart of the feature scores in ``self.important``.

        Bars are ordered by ``self.indices`` and the x tick labels show
        those indices.

        Args:
            adj: Optional sequence whose first two items are used as
                ``adj_left`` and ``adj_bottom`` for ``customplot``; both
                default to 0.12 when ``adj`` is None.
            yerr: When truthy, ``self.std[self.indices]`` is drawn as
                error bars.

        Returns:
            The figure created by ``customplot``.
        """
        from torclib2 import customplot
        import matplotlib.pyplot as plt

        aleft, abottom = (.12, .12) if adj is None else (adj[0], adj[1])

        fig, ax = customplot(adj_left=aleft, adj_bottom=abottom)

        n_features = self.X.shape[1]
        positions = range(n_features)
        heights = self.important[self.indices]

        bar_options = {'color': 'r'}
        if yerr:
            bar_options['yerr'] = self.std[self.indices]
            bar_options['align'] = 'center'
        plt.bar(positions, heights, **bar_options)

        plt.xticks(range(n_features), self.indices)
        plt.xlim([-1, n_features])
        plt.xlabel('Feature')
        plt.ylabel('Score')
        return fig
예제 #4
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def boxplot_(self, separate=False, adj_left=.1, adj_bottom=.1):
     """Draw box plots of the features, grouped by ``self.key``.

     Args:
         separate: When truthy, pandas draws one box-plot panel per group
             of ``self.key`` (``DataFrame.groupby(...).boxplot()``).
             Otherwise a single seaborn box plot is drawn with
             ``self.key`` on the x axis and one hue per feature.
         adj_left: Forwarded to ``customplot``; only used when
             ``separate`` is falsy.
         adj_bottom: Forwarded to ``customplot``; only used when
             ``separate`` is falsy.

     Returns:
         The matplotlib figure holding the plot, for both modes.

     Note:
         The ``separate`` mode updates ``matplotlib.rcParams`` globally;
         the settings stay in effect after this method returns.
     """
     if separate:
         params = {'font.family': 'serif',
                   'font.serif': 'DejaVu Serif',
                   'xtick.labelsize': 20,
                   'ytick.labelsize': 20,
                   'axes.labelsize': 28,
                   'figure.figsize': [10.72, 8.205],
                   'legend.loc': 'best',
                   'legend.fontsize': 18,
                   'legend.fancybox': False}
         matplotlib.rcParams.update(params)
         self.df.groupby(self.key).boxplot()
         # This branch used to fall through and return None. pandas draws
         # the panels on a figure it creates itself, so hand back pyplot's
         # current figure to match the non-separate branch.
         return matplotlib.pyplot.gcf()

     customplot(adj_bottom=adj_bottom, adj_left=adj_left)
     # Only the first self.feature columns of self.df are melted as values.
     dd = pd.melt(self.df, id_vars=[self.key],
                  value_vars=list(self.df)[0:self.feature],
                  var_name='Features')
     ax = sns.boxplot(x=self.key, y='value', data=dd, hue='Features')
     return ax.get_figure()
예제 #5
0
    def pointplot_(self, **options):
        """Plot actual and predicted values sample by sample.

        Rows are sorted by their ``Actual`` column; the actual values are
        drawn as a dashed red line and the predictions as blue connected
        markers. The y ticks are the integers spanning ``self.y_train``.

        Keyword options:
            adj_left (default 0.12): forwarded to ``customplot``.
            adj_bottom (default 0.12): forwarded to ``customplot``.
            cross_val_predict (default False): when truthy the training
                data come from ``self.predictCVDF`` instead of
                ``self.predictTrainDF``.

        When ``self.intercv`` is truthy only the training data are drawn;
        otherwise training and test data (``self.predictTestDF``) are drawn
        in two stacked subplots.

        Returns:
            The figure created by ``customplot``.
        """
        import math

        # range() rejects non-integer floats, so floor the lower bound just
        # as the upper bound is ceiled; integer labels are unaffected.
        yint = range(math.floor(min(self.y_train)),
                     math.ceil(max(self.y_train)) + 1)
        a = options.get('adj_left', 0.12)
        b = options.get('adj_bottom', 0.12)
        mode = options.get('cross_val_predict', False)

        trainDF = self.predictCVDF if mode else self.predictTrainDF
        trainDF = trainDF.sort_values(by='Actual')

        if self.intercv:
            fig, ax = customplot(adj_left=a, adj_bottom=b)
            plt.plot(trainDF.Actual.values, '--r')
            plt.plot(trainDF.Prediction.values, '-bo')
            plt.xlabel('Samples')
            plt.ylabel('Class prediction')
            plt.yticks(yint)
            return fig

        testDF = self.predictTestDF.sort_values(by='Actual')
        fig, ax = customplot(adj_bottom=b, adj_left=a)

        # Upper panel: training data.
        plt.subplot(2, 1, 1)
        plt.plot(trainDF.Actual.values, '--r')
        plt.plot(trainDF.Prediction.values, '-bo')
        plt.ylabel('Train prediction')
        plt.yticks(yint)

        # Lower panel: test data; only this panel carries the x label.
        plt.subplot(2, 1, 2)
        plt.plot(testDF.Actual.values, '--r')
        plt.plot(testDF.Prediction.values, '-bo')
        plt.ylabel('Test prediction')
        plt.xlabel('Samples')
        plt.yticks(yint)
        return fig
예제 #6
0
    def plotcm(self, external=False, intercv=False, **options):
        """Plot a confusion matrix for the fitted classifier.

        Args:
            external: When truthy, compare ``self.y_test`` with
                ``self.clf.predict(self.X_test)``.
            intercv: Only consulted when ``external`` is falsy. When
                truthy, ``self.crossValPredict`` is used as the prediction
                for ``self.y_train``; otherwise
                ``self.clf.predict(self.X_train)`` is used.

        Keyword options:
            adj_left (default 0.1), adj_bottom (default 0.2),
            figsize (default [5, 5]), axes_size (default 22):
                forwarded to ``customplot``.
            print_stats (default False): print ``self.CM`` when truthy.
            title (default False): add a title naming the data source.

        Side effects:
            Stores a ``ConfusionMatrix`` in ``self.CM``, sets numpy's print
            precision to 2 and calls ``plt.show()``.

        Returns:
            The figure created by ``customplot``, or None (after printing
            a notice) when ``self.mode`` is not ``'classification'``.
        """
        from nanolib.utils import plot_confusion_matrix

        left = options.get('adj_left', 0.1)
        bottom = options.get('adj_bottom', 0.2)
        show_stats = options.get('print_stats', False)
        title = options.get('title', False)
        figsize = options.get('figsize', [5, 5])
        axes_size = options.get('axes_size', 22)

        if self.mode != 'classification':
            print('Just for classification')
            return None

        # Choose the reference labels and the matching predictions.
        if external:
            y_true = self.y_test
            y_pred = self.clf.predict(self.X_test)
        elif intercv:
            y_true = self.y_train
            y_pred = self.crossValPredict
        else:
            y_true = self.y_train
            y_pred = self.clf.predict(self.X_train)

        matrix = confusion_matrix(y_true, y_pred)

        np.set_printoptions(precision=2)
        class_names = np.unique(y_true)

        self.CM = ConfusionMatrix(actual_vector=y_true,
                                  predict_vector=y_pred)
        if show_stats:
            print(self.CM)

        fig, _ = customplot(adj_bottom=bottom,
                            adj_left=left,
                            figsize=figsize,
                            axes_size=axes_size)

        if title:
            if external:
                heading = 'External-Validation'
            elif intercv:
                heading = 'Internal-Validation'
            else:
                heading = 'Training Results'
            plt.title(heading)

        plot_confusion_matrix(matrix, classes=class_names)
        plt.show()
        return fig
예제 #7
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def hierarchical_clustering(self, max_d=0):
     """Draw a Ward-linkage dendrogram of the rows of ``self.X``.

     Leaves are labelled with the row positions 0..n-1 and a horizontal
     black line is drawn at height ``max_d``.

     Args:
         max_d: y position of the horizontal reference line.

     Returns:
         The figure that owns the axes created by ``customplot``.
     """
     from scipy.cluster.hierarchy import dendrogram, linkage

     samples = self.X
     n_samples = samples.shape[0]

     tree = linkage(samples, 'ward')
     leaf_labels = np.arange(n_samples)

     fig, ax = customplot()
     # p equals the number of samples, so 'lastp' keeps as many leaves as
     # there are rows.
     dendrogram(tree,
                truncate_mode='lastp',
                p=n_samples,
                orientation='top',
                labels=leaf_labels,
                distance_sort='descending',
                show_leaf_counts=True)
     plt.axhline(y=max_d, c='k')
     return ax.get_figure()
예제 #8
0
    def screenplot(self, **options):
        """Draw a bar chart of the variance percentage per component.

        The bars use the ``'PC'`` and ``'Var (%)'`` columns of
        ``self.vardf``.

        Keyword options:
            adj_left (default 0.1): forwarded to ``customplot``.
            adj_bottom (default 0.2): forwarded to ``customplot``.
            PC (default None): when given, only the rows selected by
                ``self.vardf.loc[:PC, :]`` are plotted (a label-based
                slice of the index).

        Returns:
            The figure created by ``customplot``.
        """
        left = options.get('adj_left', 0.1)
        bottom = options.get('adj_bottom', 0.2)
        last_row = options.get('PC', None)

        table = self.vardf if last_row is None else self.vardf.loc[:last_row, :]

        fig, _ = customplot(adj_bottom=bottom, adj_left=left)
        plt.bar(x='PC', height='Var (%)', data=table)
        plt.xticks(rotation='vertical')
        plt.xlabel('Principal Component')
        plt.ylabel('Percentage of Variance')
        return fig
예제 #9
0
    def plotpc(self, **options):
        """Scatter two component columns of ``self.pcadf``, one style per label.

        ``self.pcadf`` is re-sorted by its ``'label'`` column and stored
        back on the instance. Each distinct label gets its own colour and
        marker, and the axis labels include the value read from the first
        column of ``self.vardf`` for the matching ``'PC'`` row.

        Keyword options:
            PC (default ['PC1', 'PC2']): the two column names to plot.
            adj_left (default 0.1): forwarded to ``customplot``.
            adj_bottom (default 0.15): forwarded to ``customplot``.
            size (default 90): marker size.
            ascending (default True): sort direction for the labels.

        Returns:
            The figure created by ``customplot``.

        Raises:
            TooMuchUnique: if there are more than 10 distinct labels.
        """
        PC = options.get('PC', ['PC1', 'PC2'])
        left = options.get('adj_left', 0.1)
        bottom = options.get('adj_bottom', 0.15)
        s = options.get('size', 90)
        ascending = options.get('ascending', True)
        self.pcadf = self.pcadf.sort_values(by=['label'], ascending=ascending)

        targets = list(self.pcadf['label'].unique())
        if len(targets) > 10:
            raise TooMuchUnique(str(targets))

        # Only ten colour/marker pairs are defined, hence the limit above.
        n_targets = len(targets)
        palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'C0', 'C1', 'C2'][:n_targets]
        shapes = ["o", "v", "s", "p", "P", "*", "h", "H", "X", "D"][:n_targets]

        xlabs = f'{PC[0]} ({float(self.vardf.values[self.vardf["PC"] == PC[0], 0])}%)'
        ylabs = f'{PC[1]} ({float(self.vardf.values[self.vardf["PC"] == PC[1], 0])}%)'

        fig, ax = customplot(adj_left=left, adj_bottom=bottom)
        for target, color, mark in zip(targets, palette, shapes):
            rows = self.pcadf['label'] == target
            ax.scatter(
                self.pcadf.loc[rows, PC[0]],
                self.pcadf.loc[rows, PC[1]],
                c=color,
                marker=mark,
                s=s,
            )

        plt.xlabel(xlabs)
        plt.ylabel(ylabs)
        plt.legend(targets)
        return fig
예제 #10
0
    def plotlda(self, **options):
        """Plot the discriminant scores stored in ``self.ldadf``.

        ``self.ldadf`` is re-sorted by ``'label'`` and stored back. The
        layout depends on the number of distinct values in ``self.y``:

        * fewer than 3: a strip plot of ``LD1`` per label with a dashed
          horizontal line at 0;
        * 3 or more: an ``LD1``/``LD2`` scatter with one colour and marker
          per label; axis labels carry the first-column values of the
          first two rows of ``self.vardf``.

        When ``self.dual`` is truthy, ``self.ldaval`` is also sorted
        (and stored back) and drawn on the same axes: as red triangles in
        the strip plot, or as hollow markers in the scatter plot.

        Keyword options:
            adj_left (default 0.1): forwarded to ``customplot``.
            adj_bottom (default 0.15): forwarded to ``customplot``.
            ascending (default True): sort direction for the labels.
            size: marker size; defaults to 10 for the strip plot and 90
                for the scatter plot.

        Returns:
            The figure created by ``customplot``.

        Raises:
            TooMuchUnique: in the scatter layout, if ``self.ldadf`` has
                more than 10 distinct labels.
        """
        import seaborn as sns

        left = options.get('adj_left', 0.1)
        bottom = options.get('adj_bottom', 0.15)
        ascending = options.get('ascending', True)

        self.ldadf = self.ldadf.sort_values(by=['label'], ascending=ascending)
        n_classes = len(np.unique(self.y))

        if n_classes < 3:
            # One discriminant axis only: strip plot of LD1 per class.
            fig, _ = customplot(adj_left=left, adj_bottom=bottom)
            s = options.get('size', 10)

            if not self.dual:
                sns.stripplot(x="label", y="LD1", size=s, data=self.ldadf)
            else:
                self.ldaval = self.ldaval.sort_values(by=['label'],
                                                      ascending=ascending)
                sns.stripplot(x="label", y="LD1", color='k',
                              size=s, data=self.ldadf)
                sns.stripplot(x="label", y="LD1", marker='^', color='red',
                              size=s, data=self.ldaval)

            plt.xlabel('Classes')
            plt.axhline(y=0, linewidth=1.5, color='black', linestyle='--')
            return fig

        # Three or more classes: LD1 vs LD2 scatter.
        targets = list(self.ldadf['label'].unique())
        s = options.get('size', 90)
        if len(targets) > 10:
            raise TooMuchUnique(str(targets))

        n_targets = len(targets)
        palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'C0', 'C1', 'C2'][:n_targets]
        shapes = ["o", "v", "s", "p", "P", "*", "h", "H", "X", "D"][:n_targets]

        xlabs = f'LD1 ({self.vardf.values[0, 0]}%)'
        ylabs = f'LD2 ({self.vardf.values[1, 0]}%)'
        fig, ax = customplot(adj_left=left, adj_bottom=bottom)

        for target, color, mark in zip(targets, palette, shapes):
            rows = self.ldadf['label'] == target
            ax.scatter(
                self.ldadf.loc[rows, 'LD1'],
                self.ldadf.loc[rows, 'LD2'],
                c=color,
                marker=mark,
                s=s,
            )

        if self.dual:
            # The plot came out unordered without this sort (added 14/05).
            self.ldaval = self.ldaval.sort_values(by=['label'],
                                                  ascending=ascending)
            for target, color, mark in zip(targets, palette, shapes):
                rows = self.ldaval['label'] == target
                # Hollow markers in the class colour set these points apart
                # from the filled self.ldadf points.
                ax.scatter(
                    self.ldaval.loc[rows, 'LD1'],
                    self.ldaval.loc[rows, 'LD2'],
                    marker=mark,
                    s=s,
                    facecolors='none',
                    edgecolors=color,
                )

        plt.legend(targets)
        plt.xlabel(xlabs)
        plt.ylabel(ylabs)

        return fig
예제 #11
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def corrplot_(self, adj_left=.1, adj_bottom=.1, size=20):
     """Draw an annotated heat map of the Pearson correlations of ``self.df``.

     Args:
         adj_left: Forwarded to ``customplot`` as ``adj_left``.
         adj_bottom: Forwarded to ``customplot`` as ``adj_bottom``.
         size: Font size of the cell annotations.

     Returns:
         The figure that owns the heat-map axes.
     """
     pearson = self.df.corr(method='pearson')
     customplot(adj_bottom=adj_bottom, adj_left=adj_left)
     # Colour scale is fixed to the full correlation range [-1, 1].
     heat_axes = sns.heatmap(
         pearson,
         vmax=1,
         vmin=-1,
         cmap='YlGnBu',
         annot=True,
         annot_kws={"size": size},
     )
     return heat_axes.get_figure()
예제 #12
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def swarmplot_(self, adj_left=.1, adj_bottom=.1):
     """Draw a swarm plot of every feature, coloured by ``self.key``.

     ``self.df`` is melted into long form with ``self.key`` kept as the
     identifier column; features go on the x axis, values on the y axis.

     Args:
         adj_left: Forwarded to ``customplot`` as ``adj_left``.
         adj_bottom: Forwarded to ``customplot`` as ``adj_bottom``.

     Returns:
         The figure that owns the swarm-plot axes.
     """
     customplot(adj_bottom=adj_bottom, adj_left=adj_left)
     long_df = pd.melt(self.df, [self.key], var_name='Features')
     swarm_axes = sns.swarmplot(
         x='Features',
         y='value',
         data=long_df,
         hue=self.key,
     )
     return swarm_axes.get_figure()
예제 #13
0
파일: eda.py 프로젝트: Shidiq/nanolib
 def matrixplot_(self, adj_left=.1, adj_bottom=.1):
     """Draw a seaborn pair plot of ``self.df`` coloured by ``self.key``.

     ``customplot`` is called first and the current pyplot figure is
     closed straight away, so that figure is not the one returned
     (presumably the call is kept for whatever style settings
     ``customplot`` applies -- confirm against its implementation).

     Args:
         adj_left: Forwarded to ``customplot`` as ``adj_left``.
         adj_bottom: Forwarded to ``customplot`` as ``adj_bottom``.

     Returns:
         The object returned by ``sns.pairplot``.
     """
     customplot(adj_bottom=adj_bottom, adj_left=adj_left)
     matplotlib.pyplot.close()
     return sns.pairplot(self.df, hue=self.key)
예제 #14
0
파일: optimize.py 프로젝트: Shidiq/nanolib
    def plot_learning(self):
        """Plot learning-curve diagnostics for ``self.grid.best_estimator_``.

        ``sklearn.model_selection.learning_curve`` is run on ``self.X`` /
        ``self.y`` with ``self.cv`` folds at five training-set fractions
        from 0.6 to 1.0, and three figures are produced:

        1. training and cross-validation score vs. training-set size,
        2. fit time vs. training-set size,
        3. cross-validation score vs. fit time.

        Every curve is the mean over folds, with a shaded band of one
        standard deviation on either side.

        Returns:
            A tuple ``(fig1, fig2, fig3)`` of the three figures.
        """
        from sklearn.model_selection import learning_curve

        def _mean_std(per_fold):
            # Collapse the fold axis into its mean and standard deviation.
            return np.mean(per_fold, axis=1), np.std(per_fold, axis=1)

        sizes, train_scores, test_scores, fit_times, _ = learning_curve(
            self.grid.best_estimator_,
            self.X,
            self.y,
            cv=self.cv,
            n_jobs=-1,
            train_sizes=np.linspace(.6, 1.0, 5),
            return_times=True,
        )

        train_mean, train_std = _mean_std(train_scores)
        test_mean, test_std = _mean_std(test_scores)
        time_mean, time_std = _mean_std(fit_times)

        # Figure 1: score against number of training examples.
        fig1, ax1 = customplot(adj_bottom=.13, adj_left=.12)
        plt.xlabel('Training examples')
        plt.ylabel('Score')
        plt.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                         alpha=0.1, color="r")
        plt.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                         alpha=0.1, color="g")
        plt.plot(sizes, train_mean, 'o-', color="r", lw=2,
                 label="Training score")
        plt.plot(sizes, test_mean, 'o-', color="g", lw=2,
                 label="Cross-validation score")
        plt.legend(loc="best")

        # Figure 2: fit time against number of training examples.
        fig2, ax2 = customplot(adj_bottom=.13, adj_left=.15)
        plt.xlabel('Training examples')
        plt.ylabel('Fit times')
        plt.plot(sizes, time_mean, 'o-', lw=2)
        plt.fill_between(sizes, time_mean - time_std, time_mean + time_std,
                         alpha=0.1)

        # Figure 3: cross-validation score against fit time.
        fig3, ax3 = customplot(adj_bottom=.13, adj_left=.12)
        plt.xlabel('Fit times')
        plt.ylabel('Score')
        plt.plot(time_mean, test_mean, 'o-', lw=2)
        plt.fill_between(time_mean, test_mean - test_std, test_mean + test_std,
                         alpha=0.1)

        return fig1, fig2, fig3
예제 #15
0
파일: optimize.py 프로젝트: Shidiq/nanolib
    def plot_validation_curve(self, ylim=None):
        """Plot validation curves for the tuned SVM hyper-parameters.

        For each swept parameter, ``sklearn``'s ``validation_curve`` is run
        on ``self.grid.best_estimator_`` over the values listed in
        ``self.params[0]``, scored by accuracy with ``self.cv`` folds. The
        mean training and cross-validation scores are drawn on a log x
        axis with shaded bands of one standard deviation; the best value
        of the parameter is marked by a dashed vertical line and a point
        annotated with ``self.grid.best_score_`` rounded to 2 decimals.

        Which parameters are swept depends on
        ``self.grid.best_params_['svm__kernel']``:

        * ``'linear'``: only ``svm__C``;
        * anything else: ``svm__C`` and then ``svm__gamma``.

        The kernel branch taken is printed (``'linear'`` or ``'radial'``).

        Args:
            ylim: Two-item sequence with the lower and upper y limits;
                defaults to ``[0.5, 1.1]``.

        Returns:
            A single figure for the linear kernel, otherwise a tuple
            ``(fig1, fig2)`` for the cost and gamma curves.
        """

        if ylim is None:
            ylim = [0.5, 1.1]

        from sklearn.model_selection import validation_curve

        mode = self.grid.best_params_['svm__kernel']

        def _draw_curve(param_name, make_xlabel):
            # Sweep one parameter and draw its curve on a new figure.
            # make_xlabel is a callable so the label is built only once
            # the figure exists.
            param_range = self.params[0].get(param_name)
            train_scores, test_scores = validation_curve(
                self.grid.best_estimator_,
                self.X,
                self.y,
                param_name=param_name,
                param_range=param_range,
                scoring='accuracy',
                cv=self.cv,
                n_jobs=-1)
            train_mean = np.mean(train_scores, axis=1)
            train_std = np.std(train_scores, axis=1)
            test_mean = np.mean(test_scores, axis=1)
            test_std = np.std(test_scores, axis=1)

            figure, axes = customplot(adj_bottom=.13, adj_left=.12)
            plt.xlabel(make_xlabel())
            plt.ylabel("Score")
            plt.ylim(ylim[0], ylim[1])
            lw = 2
            series = (
                (train_mean, train_std, "Training score", "r"),
                (test_mean, test_std, "Cross-validation score", "g"),
            )
            for mean, std, label, color in series:
                plt.semilogx(param_range, mean, label=label,
                             color=color, lw=lw)
                plt.fill_between(param_range, mean - std, mean + std,
                                 alpha=0.1, color=color, lw=lw)

            # Mark the grid-search optimum for this parameter.
            plt.axvline(x=self.grid.best_params_[param_name],
                        color='k',
                        linestyle='--')
            plt.plot(self.grid.best_params_[param_name],
                     self.grid.best_score_, 'ok')
            axes.text(self.grid.best_params_[param_name],
                      self.grid.best_score_,
                      f' {round(self.grid.best_score_, 2)}',
                      fontsize=20)
            plt.legend(loc="best")
            return figure

        def _cost_label():
            return f"Cost (with Gamma: {self.grid.best_params_['svm__gamma']})"

        def _gamma_label():
            return f"Gamma (with Cost: {self.grid.best_params_['svm__C']})"

        if mode == 'linear':
            print('linear')
            return _draw_curve('svm__C', _cost_label)

        print('radial')
        fig1 = _draw_curve('svm__C', _cost_label)
        fig2 = _draw_curve('svm__gamma', _gamma_label)
        return fig1, fig2