Python RandomUnderSampler示例，unbalanced_dataset.under_sampling.RandomUnderSampler Python示例

示例#1

0

显示文件

def test_rus_fit_transform():
    """Check fit_transform output against the stored reference arrays"""

    # Under-sample with a fixed seed so the result can be compared exactly
    sampler = RandomUnderSampler(random_state=RND_SEED)
    X_out, y_out = sampler.fit_transform(X, Y)

    # Reference arrays live in the 'data' folder next to this test module
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)

示例#2

0

显示文件

文件： test_random_under_sampler.py 项目： StefanKal/CancerDataChallenge

def test_rus_fit_transform():
    """Check fit_transform output against the stored reference arrays"""

    # Under-sample with a fixed seed so the result can be compared exactly
    sampler = RandomUnderSampler(random_state=RND_SEED)
    X_out, y_out = sampler.fit_transform(X, Y)

    # Reference arrays live in the 'data' folder next to this test module
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)

示例#3

0

显示文件

def test_rus_fit():
    """Check the statistics recorded on the sampler by fit"""

    # Build the sampler and fit it on the module-level data
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded by fit
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts recorded by fit
    for label, count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], count)

示例#4

0

显示文件

文件： test_random_under_sampler.py 项目： StefanKal/CancerDataChallenge

def test_rus_fit():
    """Check the statistics recorded on the sampler by fit"""

    # Build the sampler and fit it on the module-level data
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded by fit
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts recorded by fit
    for label, count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], count)

示例#5

0

显示文件

def test_rus_fit_transform_with_indices():
    """Check fit_transform output, including indices, against references"""

    # With return_indices=True the sampler hands back a third value
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_transform(X, Y)

    # Reference arrays live in the 'data' folder next to this test module
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)

示例#6

0

显示文件

文件： test_random_under_sampler.py 项目： StefanKal/CancerDataChallenge

def test_rus_fit_transform_with_indices():
    """Check fit_transform output, including indices, against references"""

    # With return_indices=True the sampler hands back a third value
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_transform(X, Y)

    # Reference arrays live in the 'data' folder next to this test module
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)

示例#7

0

显示文件

def test_rus_transform_wt_fit():
    """Check that transform raises RuntimeError when called on an unfitted
    sampler"""

    # The sampler is deliberately left unfitted
    sampler = RandomUnderSampler(random_state=RND_SEED)
    unfitted_transform = sampler.transform
    assert_raises(RuntimeError, unfitted_transform, X, Y)

示例#8

0

显示文件

文件： classifier.py 项目： dvtailor/dal-emotion-speech

    def train(self, data):
        """Fit a linear SVM on ``data``, optionally re-balanced and scaled.

        ``data`` must expose ``X`` and ``y``. Depending on
        ``self._resample_type`` the classes are re-weighted ('cls-wgt'),
        randomly over-sampled ('over') or randomly under-sampled ('under');
        any other value leaves the data untouched. The fitted classifier is
        stored on ``self._cls`` and the fitted scaler on ``self._scaler``.
        """
        self._cls = svm.SVC(C=self._complexity,
                            kernel='linear',
                            probability=self._prob_enabled)

        # Start from the raw data; only 'over' / 'under' replace it below
        features, labels = data.X, data.y

        mode = self._resample_type
        if mode == 'cls-wgt':
            self._cls.class_weight = 'balanced'
        elif mode == 'over':
            sampler = RandomOverSampler(ratio='auto',
                                        random_state=self._resample_seed)
            # Set after construction: req. due to mistake in library
            sampler.verbose = False
            features, labels = sampler.fit_transform(data.X, data.y)
        elif mode == 'under':
            sampler = RandomUnderSampler(ratio='auto',
                                         random_state=self._resample_seed,
                                         verbose=False)
            features, labels = sampler.fit_transform(data.X, data.y)

        # 'std' selects standardisation; anything else falls back to min-max
        scaler_cls = (preprocessing.StandardScaler
                      if self._norm_type == 'std'
                      else preprocessing.MinMaxScaler)
        self._scaler = scaler_cls().fit(features)

        self._cls.fit(self._scaler.transform(features), labels)

示例#9

0

显示文件

def test_rus_fit_invalid_ratio():
    """Check that fit raises RuntimeError when the requested balancing ratio
    is smaller than the one of the data"""

    # A deliberately tiny ratio
    sampler = RandomUnderSampler(ratio=1. / 10000., random_state=RND_SEED)
    # Fitting is expected to fail
    assert_raises(RuntimeError, sampler.fit, X, Y)

示例#10

0

显示文件

def test_rus_fit_single_class():
    """Check that fit raises RuntimeError when y holds a single class"""

    sampler = RandomUnderSampler(random_state=RND_SEED)
    # Build an all-zero target with one entry per row of X
    single_label_y = np.zeros(X.shape[0])
    assert_raises(RuntimeError, sampler.fit, X, single_label_y)

示例#11

0

显示文件

def test_rus_init():
    """Check the attributes set by the constructor"""

    want_verbose = True
    sampler = RandomUnderSampler(ratio='auto',
                                 random_state=RND_SEED,
                                 verbose=want_verbose)

    # Constructor arguments are stored on the instance
    assert_equal(sampler.rs_, RND_SEED)
    assert_equal(sampler.verbose, want_verbose)
    # Class information is still empty before any fit
    for attr in ('min_c_', 'maj_c_'):
        assert_equal(getattr(sampler, attr), None)
    assert_equal(sampler.stats_c_, {})

示例#12

0

显示文件

                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random under-sampling
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],

示例#13

0

显示文件

文件： featureFunctions.py 项目： iven5880/Insight

def preProcess(theFileName):
    """Run attribute selection on a labelled CSV file and build the report.

    The CSV is read with pandas; an ``'Unnamed: 0'`` column is dropped when
    present, the ``'y'`` column is binarised and the remaining columns are
    one-hot encoded and standardised. If the smaller class makes up less
    than a third of the rows, the data are randomly under-sampled before
    modelling. RFECV (L1 logistic regression) picks the number of
    attributes, a random forest ranks them, and four classifiers are
    cross-validated on the RFECV-reduced data.

    Side effects: saves ``thePlot.jpg``, ``theHist.jpg`` and
    ``theNegPosHist.jpg`` under ``flask_files/static/`` and writes
    ``flask_files/static/Report.html``.

    Parameters
    ----------
    theFileName
        Path of the CSV file (converted with ``str`` before reading).

    Returns
    -------
    tuple
        ``(introReport, tableOne, optimalNumberOfFeatures,
        classifierComparisons, theJPGs)`` where ``introReport`` is a list
        of two strings, ``tableOne`` is a list of
        ``{'Feature', 'Weight', 'Rank'}`` dicts, ``classifierComparisons``
        is a list of ``{'Classifier', 'Score'}`` dicts and ``theJPGs`` lists
        the three image paths relative to ``flask_files/``.
    """
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)
    X = dp.drop('y', axis=1)
    y = dp[['y']]

    # get the features
    theFeatures = X.columns

    # convert the dataframes to arrays
    X = X.values
    y = y.values
    y.shape = np.shape(y)[0]

    # Keep the un-resampled labels for the impact plots, which index the
    # full frame ``dp``. A real copy (``y[:]`` is only a view on an ndarray)
    # guards against any later in-place change to ``y``.
    yOrig = y.copy()

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================

    # apply random undersampling if labels are imbalanced
    labelSkewness = 100 * np.sum(y) * 1. / np.shape(y)[0]
    if np.min([labelSkewness, 100 - labelSkewness]) < (100. / 3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_sample(X, y)

    #=======================================================================

    # select optimal number of features
    thisModel = LogisticRegression(penalty='l1', C=1)
    rfecv = RFECV(estimator=thisModel,
                  step=1,
                  cv=StratifiedKFold(y, n_folds=3),
                  scoring='f1')
    Xt = rfecv.fit_transform(X, y)

    optimalNumberOfFeatures = rfecv.n_features_
    introReport = [
        'Optimal Number of Attributes: ' + str(optimalNumberOfFeatures),
        'The following attributes are the most influential to the outcome'
    ]

    #=======================================================================

    # plot number of selected features VS cross-validation scores
    plt.figure(figsize=(12, 8))

    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/' + imgOne, dpi=300)

    #=======================================================================

    # get the feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X, y)
    theImportances = list(model.feature_importances_)

    # Rank column positions by importance, largest first. Ranking positions
    # (rather than looking each sorted value up again with ``list.index``)
    # keeps tied importances mapped to distinct features; the sort is
    # stable, so ties keep their original column order.
    rankedIdx = sorted(range(len(theImportances)),
                       key=theImportances.__getitem__,
                       reverse=True)
    sortedImportances = [theImportances[k] for k in rankedIdx]
    rankedFeatures = [theFeatures[k] for k in rankedIdx]

    # ...and list the selected features along with their weights and ranks
    tableOne = [
        dict(Feature=str(name), Weight=str(weight), Rank=str(rank))
        for rank, (name, weight) in enumerate(
            zip(rankedFeatures[:optimalNumberOfFeatures],
                sortedImportances[:optimalNumberOfFeatures]), 1)
    ]

    #=======================================================================

    # plot histogram of the most important feature
    thisFeature = 0
    topFeature = rankedFeatures[thisFeature]
    allThoseFeatures = dp[topFeature]

    plt.figure(figsize=(12, 8))

    # (counts, bin edges, patches) over all rows; the edges are reused below
    combinedOutcomes = plt.hist(allThoseFeatures, bins=10)

    plt.xlabel('Attribute: ' + topFeature, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)

    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/' + imgTwo, dpi=300)

    #=======================================================================

    # plot impact of the most important feature
    positiv = allThoseFeatures[yOrig == 1]
    negativ = allThoseFeatures[yOrig == 0]

    # This figure is never saved; the hist calls are used for their counts
    plt.figure(figsize=(12, 8))

    negA = plt.hist(negativ, bins=combinedOutcomes[1])
    posA = plt.hist(positiv, bins=combinedOutcomes[1])

    #=======================================================================

    # Per-bin share of positive / negative rows. Empty bins give 0/0 = NaN,
    # which is mapped to 0; the warnings for those divisions are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        posImpact = np.divide(posA[0], combinedOutcomes[0])
        negImpact = np.divide(negA[0], combinedOutcomes[0])
    posImpact[np.isnan(posImpact)] = 0
    negImpact[np.isnan(negImpact)] = 0

    binEdges = combinedOutcomes[1]
    midPoints = [(left + right) / 2.
                 for left, right in zip(binEdges[:-1], binEdges[1:])]

    plt.figure(figsize=(12, 8))
    # NOTE(review): plt.hold was removed in matplotlib 3.0 -- confirm the
    # pinned matplotlib version before upgrading.
    plt.hold(True)
    plt.plot(midPoints, posImpact, '.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size': 20})
    plt.xlabel(topFeature, fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()

    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/' + imgThree, dpi=300)

    #=======================================================================

    # generate plots for report (this is saved to an "html" file)

    from bokeh.charts import Histogram, output_file, show, save, gridplot
    from bokeh.plotting import figure

    plotList = []

    for thisFeatureIs in rankedFeatures[:optimalNumberOfFeatures]:
        allThoseFeatures = dp[thisFeatureIs]
        # plt.hist is used here for its (counts, edges) return value; it
        # draws on the current figure, which has already been saved above.
        combinedOutcomes = plt.hist(allThoseFeatures, bins=10)

        positiv = allThoseFeatures[yOrig == 1]
        negativ = allThoseFeatures[yOrig == 0]
        negA = plt.hist(negativ, bins=combinedOutcomes[1])
        posA = plt.hist(positiv, bins=combinedOutcomes[1])
        with np.errstate(divide='ignore', invalid='ignore'):
            posImpact = np.divide(posA[0], combinedOutcomes[0])
            negImpact = np.divide(negA[0], combinedOutcomes[0])
        posImpact[np.isnan(posImpact)] = 0
        negImpact[np.isnan(negImpact)] = 0

        binEdges = combinedOutcomes[1]
        midPoints = [(left + right) / 2.
                     for left, right in zip(binEdges[:-1], binEdges[1:])]

        hist0 = Histogram(dp,
                          values=thisFeatureIs,
                          color='blue',
                          title="Impact of " + thisFeatureIs,
                          bins=10)
        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        plot0.circle(midPoints,
                     list(negImpact),
                     size=10,
                     color="red",
                     alpha=0.9,
                     legend='Negative')
        plot0.circle(midPoints,
                     list(posImpact),
                     size=10,
                     color="green",
                     alpha=0.9,
                     legend='Positive')
        plotList.append([hist0, plot0])

    output_file("flask_files/static/Report.html", title="Report")
    hist = gridplot(plotList)
    save(hist)

    #=======================================================================

    # specify the models to run tests with
    theModels = {
        'Logistic Regression': LogisticRegression(penalty='l1'),
        'LDA': LinearDiscriminantAnalysis(),
        'SVM': SVC(kernel='linear'),
        'Random Forest': RandomForestClassifier(n_estimators=300)
    }

    # ...then collect the best fold score of each model
    classifierComparisons = []
    for aModel, model in theModels.items():
        results = cross_validation.cross_val_score(model,
                                                   Xt,
                                                   y,
                                                   scoring='f1',
                                                   cv=StratifiedKFold(
                                                       y, n_folds=3))
        classifierComparisons.append(
            dict(Classifier=aModel, Score=np.max(results)))

    #=======================================================================

    # image paths, relative to flask_files/, for the caller to display
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================

    return introReport, tableOne, optimalNumberOfFeatures, classifierComparisons, theJPGs

示例#14

0

显示文件

def preProcess(theFileName):
    """Run attribute selection on a labelled CSV file and build the report.

    The CSV is read with pandas; an ``'Unnamed: 0'`` column is dropped when
    present, the ``'y'`` column is binarised and the remaining columns are
    one-hot encoded and standardised. If the smaller class makes up less
    than a third of the rows, the data are randomly under-sampled before
    modelling. RFECV (L1 logistic regression) picks the number of
    attributes, a random forest ranks them, and four classifiers are
    cross-validated on the RFECV-reduced data.

    Side effects: saves ``thePlot.jpg``, ``theHist.jpg`` and
    ``theNegPosHist.jpg`` under ``flask_files/static/`` and writes
    ``flask_files/static/Report.html``.

    Parameters
    ----------
    theFileName
        Path of the CSV file (converted with ``str`` before reading).

    Returns
    -------
    tuple
        ``(introReport, tableOne, optimalNumberOfFeatures,
        classifierComparisons, theJPGs)`` where ``introReport`` is a list
        of two strings, ``tableOne`` is a list of
        ``{'Feature', 'Weight', 'Rank'}`` dicts, ``classifierComparisons``
        is a list of ``{'Classifier', 'Score'}`` dicts and ``theJPGs`` lists
        the three image paths relative to ``flask_files/``.
    """
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)
    X = dp.drop('y', axis=1)
    y = dp[['y']]

    # get the features
    theFeatures = X.columns

    # convert the dataframes to arrays
    X = X.values
    y = y.values
    y.shape = np.shape(y)[0]

    # Keep the un-resampled labels for the impact plots, which index the
    # full frame ``dp``. A real copy (``y[:]`` is only a view on an ndarray)
    # guards against any later in-place change to ``y``.
    yOrig = y.copy()

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================

    # apply random undersampling if labels are imbalanced
    labelSkewness = 100 * np.sum(y) * 1. / np.shape(y)[0]
    if np.min([labelSkewness, 100 - labelSkewness]) < (100. / 3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_sample(X, y)

    #=======================================================================

    # select optimal number of features
    thisModel = LogisticRegression(penalty='l1', C=1)
    rfecv = RFECV(estimator=thisModel,
                  step=1,
                  cv=StratifiedKFold(y, n_folds=3),
                  scoring='f1')
    Xt = rfecv.fit_transform(X, y)

    optimalNumberOfFeatures = rfecv.n_features_
    introReport = [
        'Optimal Number of Attributes: ' + str(optimalNumberOfFeatures),
        'The following attributes are the most influential to the outcome'
    ]

    #=======================================================================

    # plot number of selected features VS cross-validation scores
    plt.figure(figsize=(12, 8))

    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/' + imgOne, dpi=300)

    #=======================================================================

    # get the feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X, y)
    theImportances = list(model.feature_importances_)

    # Rank column positions by importance, largest first. Ranking positions
    # (rather than looking each sorted value up again with ``list.index``)
    # keeps tied importances mapped to distinct features; the sort is
    # stable, so ties keep their original column order.
    rankedIdx = sorted(range(len(theImportances)),
                       key=theImportances.__getitem__,
                       reverse=True)
    sortedImportances = [theImportances[k] for k in rankedIdx]
    rankedFeatures = [theFeatures[k] for k in rankedIdx]

    # ...and list the selected features along with their weights and ranks
    tableOne = [
        dict(Feature=str(name), Weight=str(weight), Rank=str(rank))
        for rank, (name, weight) in enumerate(
            zip(rankedFeatures[:optimalNumberOfFeatures],
                sortedImportances[:optimalNumberOfFeatures]), 1)
    ]

    #=======================================================================

    # plot histogram of the most important feature
    thisFeature = 0
    topFeature = rankedFeatures[thisFeature]
    allThoseFeatures = dp[topFeature]

    plt.figure(figsize=(12, 8))

    # (counts, bin edges, patches) over all rows; the edges are reused below
    combinedOutcomes = plt.hist(allThoseFeatures, bins=10)

    plt.xlabel('Attribute: ' + topFeature, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)

    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/' + imgTwo, dpi=300)

    #=======================================================================

    # plot impact of the most important feature
    positiv = allThoseFeatures[yOrig == 1]
    negativ = allThoseFeatures[yOrig == 0]

    # This figure is never saved; the hist calls are used for their counts
    plt.figure(figsize=(12, 8))

    negA = plt.hist(negativ, bins=combinedOutcomes[1])
    posA = plt.hist(positiv, bins=combinedOutcomes[1])

    #=======================================================================

    # Per-bin share of positive / negative rows. Empty bins give 0/0 = NaN,
    # which is mapped to 0; the warnings for those divisions are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        posImpact = np.divide(posA[0], combinedOutcomes[0])
        negImpact = np.divide(negA[0], combinedOutcomes[0])
    posImpact[np.isnan(posImpact)] = 0
    negImpact[np.isnan(negImpact)] = 0

    binEdges = combinedOutcomes[1]
    midPoints = [(left + right) / 2.
                 for left, right in zip(binEdges[:-1], binEdges[1:])]

    plt.figure(figsize=(12, 8))
    # NOTE(review): plt.hold was removed in matplotlib 3.0 -- confirm the
    # pinned matplotlib version before upgrading.
    plt.hold(True)
    plt.plot(midPoints, posImpact, '.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size': 20})
    plt.xlabel(topFeature, fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()

    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/' + imgThree, dpi=300)

    #=======================================================================

    # generate plots for report (this is saved to an "html" file)

    from bokeh.charts import Histogram, output_file, show, save, gridplot
    from bokeh.plotting import figure

    plotList = []

    for thisFeatureIs in rankedFeatures[:optimalNumberOfFeatures]:
        allThoseFeatures = dp[thisFeatureIs]
        # plt.hist is used here for its (counts, edges) return value; it
        # draws on the current figure, which has already been saved above.
        combinedOutcomes = plt.hist(allThoseFeatures, bins=10)

        positiv = allThoseFeatures[yOrig == 1]
        negativ = allThoseFeatures[yOrig == 0]
        negA = plt.hist(negativ, bins=combinedOutcomes[1])
        posA = plt.hist(positiv, bins=combinedOutcomes[1])
        with np.errstate(divide='ignore', invalid='ignore'):
            posImpact = np.divide(posA[0], combinedOutcomes[0])
            negImpact = np.divide(negA[0], combinedOutcomes[0])
        posImpact[np.isnan(posImpact)] = 0
        negImpact[np.isnan(negImpact)] = 0

        binEdges = combinedOutcomes[1]
        midPoints = [(left + right) / 2.
                     for left, right in zip(binEdges[:-1], binEdges[1:])]

        hist0 = Histogram(dp,
                          values=thisFeatureIs,
                          color='blue',
                          title="Impact of " + thisFeatureIs,
                          bins=10)
        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        plot0.circle(midPoints,
                     list(negImpact),
                     size=10,
                     color="red",
                     alpha=0.9,
                     legend='Negative')
        plot0.circle(midPoints,
                     list(posImpact),
                     size=10,
                     color="green",
                     alpha=0.9,
                     legend='Positive')
        plotList.append([hist0, plot0])

    output_file("flask_files/static/Report.html", title="Report")
    hist = gridplot(plotList)
    save(hist)

    #=======================================================================

    # specify the models to run tests with
    theModels = {
        'Logistic Regression': LogisticRegression(penalty='l1'),
        'LDA': LinearDiscriminantAnalysis(),
        'SVM': SVC(kernel='linear'),
        'Random Forest': RandomForestClassifier(n_estimators=300)
    }

    # ...then collect the best fold score of each model
    classifierComparisons = []
    for aModel, model in theModels.items():
        results = cross_validation.cross_val_score(model,
                                                   Xt,
                                                   y,
                                                   scoring='f1',
                                                   cv=StratifiedKFold(
                                                       y, n_folds=3))
        classifierComparisons.append(
            dict(Classifier=aModel, Score=np.max(results)))

    #=======================================================================

    # image paths, relative to flask_files/, for the caller to display
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================

    return introReport, tableOne, optimalNumberOfFeatures, classifierComparisons, theJPGs

示例#15

0

显示文件

文件： plot_random_under_sampler.py 项目： JFanZhao/UnbalancedDataset

from unbalanced_dataset.under_sampling import RandomUnderSampler

# Generate a two-class dataset with a 10% / 90% class weighting
# (5000 samples, 20 features, fixed random_state for reproducibility)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit the PCA on X and project it into a 2D feature space for plotting
X_vis = pca.fit_transform(X)

# Apply the random under-sampling with default settings
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_transform(X, y)
# Project the resampled data with the PCA already fitted on the full set,
# so both panels share the same 2D axes
X_res_vis = pca.transform(X_resampled)

# Two side-by-side subplots; unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: class 0 of the under-sampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)