예제 #1
0
def balance_class_balance(path="images/class_balance.png"):
    """
    Draw a class balance chart for the "outcome" column of ``load_game()``.

    Parameters
    ----------
    path : str
        Forwarded to the visualizer's ``poof`` as ``outpath``.

    Returns
    -------
    The value returned by ``ClassBalance.poof``.
    """
    outcomes = load_game()["outcome"]

    visualizer = ClassBalance(labels=["draw", "loss", "win"])
    visualizer.fit(outcomes)
    return visualizer.poof(outpath=path)
예제 #2
0
def balance():
    """
    Plot train-vs-test class balance for the occupancy data and save it
    through ``savefig`` under the name "class_balance".
    """
    features, target = load_occupancy()

    # Only the target halves of the 80/20 split are needed for the chart.
    _, _, target_train, target_test = tts(features, target, test_size=0.2)

    class_names = ["unoccupied", "occupied"]
    visualizer = ClassBalance(ax=newfig(), labels=class_names)
    visualizer.fit(target_train, target_test)
    savefig(visualizer, "class_balance")
예제 #3
0
def balance_class_balance(path="images/class_balance.png"):
    """
    Fit a ClassBalance visualizer on the "outcome" values from ``load_game()``
    and render it.

    Parameters
    ----------
    path : str
        Output path handed to ``poof`` as ``outpath``.

    Returns
    -------
    Whatever ``poof`` returns.
    """
    game = load_game()
    target = game["outcome"]

    balance = ClassBalance(labels=["draw", "loss", "win"])
    balance.fit(target)

    return balance.poof(outpath=path)
예제 #4
0
def compare_class_balance(path="images/class_balance_compare.png"):
    """
    Compare the class balance of the train and test targets of the
    occupancy data.

    Parameters
    ----------
    path : str
        Forwarded to the visualizer's ``poof`` as ``outpath``.

    Returns
    -------
    The value returned by ``ClassBalance.poof``.
    """
    occupancy = load_occupancy()

    feature_names = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    class_names = ['unoccupied', 'occupied']

    # Select the feature columns and the target column
    features = occupancy[feature_names]
    target = occupancy["occupancy"]

    # 80/20 split; the feature halves are discarded
    _, _, target_train, target_test = train_test_split(
        features, target, test_size=0.2
    )

    # Passing two targets to fit() puts the visualizer in compare mode
    balance = ClassBalance(labels=class_names)
    balance.fit(target_train, target_test)

    return balance.poof(outpath=path)
예제 #5
0
def balance_yellowbrick(
    X,
    y,
    features,
):
    """
    Draw a class balance chart from a 1% stratified hold-out of ``y``.

    Parameters
    ----------
    X : indexable by ``features``
        Feature table; only ``X[features]`` is passed to the split.
    y : target values
        Used both as the split target and as the stratification key.
    features : list
        Column names selected from ``X``.

    Returns
    -------
    The ``plt`` module, after the visualizer has been finalized on the
    current figure.
    """
    # Use the non-interactive backend and start from an empty figure.
    plt.switch_backend('agg')
    plt.clf()

    parts = train_test_split(X[features], y, stratify=y, test_size=0.01)
    _, held_out_features, _, held_out_target = parts

    # NOTE(review): this frame is constructed but never read afterwards.
    pd.DataFrame(held_out_features, columns=features)
    target = pd.Series(held_out_target)

    balance = ClassBalance()
    balance.fit(target)
    balance.finalize()

    return plt
예제 #6
0
def compare_class_balance(path="images/class_balance_compare.png"):
    """
    Render a train-vs-test class balance comparison for the occupancy data.

    Parameters
    ----------
    path : str
        Output path handed to ``poof`` as ``outpath``.

    Returns
    -------
    Whatever ``poof`` returns.
    """
    frame = load_occupancy()

    columns = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    labels = ['unoccupied', 'occupied']

    # Feature columns and target column, in that order
    inputs = frame[columns]
    outputs = frame["occupancy"]

    # Split 80/20 and keep just the two target partitions
    _, _, outputs_train, outputs_test = train_test_split(
        inputs, outputs, test_size=0.2
    )

    # Fit on both partitions so they are drawn side by side
    chart = ClassBalance(labels=labels)
    chart.fit(outputs_train, outputs_test)
    return chart.poof(outpath=path)
예제 #7
0
def Imbalance(y):
    """ Imbalance between the labels

    Parameters
    ----------
    y: vector of labels

    Returns
    -------

    - Nothing; a plot of the class balance is displayed, with bars named
      'Ebola negative' and 'Ebola positive'

    """
    # Build the visualizer and fit it on the label vector
    balance_chart = ClassBalance(labels=['Ebola negative', 'Ebola positive'])
    balance_chart.fit(y)

    # Display goes through pyplot directly; the visualizer's own show()
    # call was disabled in the original.
    plt.show()
예제 #8
0
def Imbalance_out(y):
    """ Imbalance between the outcome labels

    Parameters
    ----------
    y: vector of labels

    Returns
    -------

    - Nothing; a plot of the class balance is displayed, with bars named
      'Survival' and 'Death'

    """
    # Build the visualizer and fit it on the label vector
    outcome_chart = ClassBalance(labels=['Survival', 'Death'])
    outcome_chart.fit(y)

    # Display goes through pyplot directly; the visualizer's own show()
    # call was disabled in the original.
    plt.show()
예제 #9
0
            bs[index] = features
        else:
            o[(index * n_ngrams) + i] = features
# Drop every row that is entirely zero (np.all(... == 0, axis=1) marks them).
bs = bs[~np.all(bs == 0, axis=1)]
o = o[~np.all(o == 0, axis=1)]

binding_sites = bs
other = o
# Label each binding-site row 1 and each other row 0.
binding_sites_labels = np.ones(binding_sites.shape[0], dtype=np.uint8)
other_labels = np.zeros(other.shape[0], dtype=np.uint8)
# Stack both groups: binding-site rows first, then the remaining rows,
# with the label vector built in the same order.
X = np.concatenate((binding_sites, other))
y = np.concatenate((binding_sites_labels, other_labels))

# %%
# Class balance of the combined label vector.
visualizer = ClassBalance(labels=class_names)
visualizer.fit(y)
visualizer.poof()

# %%
# Parallel-coordinates view of the features.
visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
# One-dimensional feature ranking.
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# Two-dimensional feature ranking (only the instantiation is visible here).
visualizer = Rank2D()
예제 #10
0
# ready-made code

# Apply top_p() to the test target and take the mean of the result.
y_top25 = top_p(test_y)
y_top25.mean()


# In[69]:


# ready-made code

from yellowbrick.target import ClassBalance

# Class balance of the top_p() result, with bars named "75%" and "25%".
visualizer = ClassBalance(labels=["75%", "25%"])
visualizer.fit(y_top25)
visualizer.show()


# ## To learn more: binning
# 
# `yellowbrick` has a function for visualizing possible binnings. The code below shows 4 suggested split points for binning. We will not use yellowbrick's suggestion because in our case the client has already decided that they want the top X%.

# In[70]:


# ready-made code

from yellowbrick.target import BalancedBinningReference

visualizer = BalancedBinningReference()
                           max_depth=5, max_feature='auto', max_leaf_nodes = None,
                           min_impurity_decrease=0.0, min_impurity_split= None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                           oob_score=False, random_state=1, verbose=False,
                           warn_start=False)
# Wrap the `rf` estimator in a FeatureImportances visualizer, fit it on the
# training split and display the chart (trailing ';' is kept as in the original).
viz= FeatureImportances(rf)
viz.fit(X_train,y_train)
viz.show();


from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree used for the feature-importance chart below.
# The class is scikit-learn's DecisionTreeClassifier; its keywords are
# `max_features` (None, not the string 'None') and `splitter`.
# `min_impurity_split` and `presort` are not passed: both were given at their
# default values and have since been removed from scikit-learn, so omitting
# them gives the same tree on old releases and avoids a TypeError on new ones.
dt = DecisionTreeClassifier(class_weight=None, criterion='gini',
                            max_depth=3, max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, random_state=0,
                            splitter='best')

# Fit the visualizer on the training split and display the importances.
viz = FeatureImportances(dt)
viz.fit(X_train, y_train)
viz.show();


# The visualizer lives in `yellowbrick.classifier`.
from yellowbrick.classifier import ROCAUC

# ROC/AUC curves for the `rf` estimator, with classes named stayed / quit.
visualizer = ROCAUC(rf, classes=['stayed', 'quit'])

visualizer.fit(X_train, y_train)    # fit on the training split
visualizer.score(X_test, y_test)    # evaluate on the held-out split
visualizer.show();                  # render, as the other charts in this script do
예제 #12
0
def main():
    """Preprocess the input table and run the classifier named by the user.

    The file name and model name come from ``argcheck()`` and the table is
    loaded by ``filecheck()``. The function then, in order:

    1. prints basic statistics and checks for N/A / null values,
    2. oversamples with SMOTE on the ``'Activity'`` target, saving a
       class-balance chart before and after,
    3. label-encodes the target,
    4. prints the covariance and correlation matrices,
    5. makes an 80/20 train/test split (``random_state=1``),
    6. prints the mean/std of the standardized splits,

    and finally dispatches to the module that matches ``model``. An
    unrecognised model name prints a message and calls ``sys.exit()``.

    Returns
    -------
    None
    """
    # The first value returned by argcheck() is not used here.
    _script, fname, model = argcheck()

    df = filecheck(fname)

    print(df.head(5))

    # Data stats
    # Printing the number of rows and columns
    print(df.info())

    print("The number of rows")
    print(len(df))

    print("The number of columns")
    print(len(df.columns))

    print("Dataframe shape")
    print(df.shape)

    # Data preprocessing - step 1 (check for any null / N/A values)

    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")

    print("Checking for any N/A values")
    print(df.isna().values.any())

    # Check for any null values
    print("Checking for any null values")
    print(df.isnull().values.any())

    # Data preprocessing - step 2 (addressing the class imbalance problem)

    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")

    # Target is kept as a one-column frame; everything else is a feature.
    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)

    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())

    # The unique labels found here name the bars of both charts below.
    unique_labels, frequency = np.unique(Y, return_counts=True)
    # Class balance chart before SMOTE, written to
    # 'output/Class-balance-Before-SMOTE.png'
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")

    # Solving the class imbalance problem by oversampling the data
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)

    print("After applying SMOTE algorithm")
    # Rebuild frames with the original column names around the resampled data.
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)

    print("The new shape of the X dataframe")
    print(X_1_df.shape)

    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)

    unique, frequency = np.unique(Y_1, return_counts=True)
    # print unique values array
    print("Unique Values of new Y dataframe:", unique)

    # print frequency array
    print("Frequency Values of new Y dataframe:", frequency)

    # Class balance chart after SMOTE, written to
    # 'output/Class-balance-After-SMOTE.png'
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")

    # Data preprocessing - step 3 (label encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")

    # Convert the string labels to integers
    # 0 - 'LAYING'
    # 1 - 'SITTING'
    # 2 - 'STANDING'
    # 3 - 'WALKING'
    # 4 - 'WALKING_DOWNSTAIRS'
    # 5 - 'WALKING_UPSTAIRS'
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    # unique() keeps order of first appearance, not sorted order.
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])

    # Data preprocessing - step 4 (covariance / correlation)

    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    # Covariance and correlation - Task 1(Preeti)
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)

    # Pearson product-moment correlation coefficients. `bias` is not passed:
    # np.corrcoef documents it as deprecated and without effect, so the
    # result is the same and no DeprecationWarning is raised.
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False)
    print("Correlation coefficient obtained : ", dfCorr)

    # Data preprocessing - step 5 (train/test split) (JunYong or Preeti)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)

    # Data preprocessing - step 6 (standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    # The scaler is fitted on the training split only and reused on the test
    # split.
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    X_testscaled = sc_X.transform(X_test)

    print("Mean of the standardized training set : ",
          X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))

    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))

    # NOTE(review): the scaled arrays are only printed above; the model
    # modules below receive the unscaled X_train / X_test - confirm intended.

    # Execute a different model module based on input from the user
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)

    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes, X_1_df,
                          Y_1_df)

    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)

    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)

    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)

    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df,
                                    Y_1_df)
    else:
        print("please enter the correct classifier name")
        sys.exit()
예제 #13
0
#morg_2_train_strs_broken
#morg_2_test_strs_broken
#activ_inact_train
#activ_inact_test

# Join the two training frames column-wise (axis=1), then drop every row
# that contains a missing value.
frames = [morg_2_train_strs_broken, activ_inact_train]
import pandas as pd
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
#dfrad.iloc[:,[2048]]
#dfrad.iloc[:,:100]

# CLASS BALANCE - not balanced
# NOTE(review): labels are given as [1, 0]; if ClassBalance orders classes
# ascending (0 then 1) the bar names would be swapped - confirm against the
# yellowbrick `labels` documentation.
from yellowbrick.target import ClassBalance
visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  #Fit the data to the visualizer
visCB.show()  #Finalize and render the figure

# RANK 2D "Pearson correlation" - not balanced
# Only the first 50 columns are ranked.
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(dfrad.iloc[:, :50],
               dfrad['activities'])  # Fit the data to the visualizer
visualizer.transform(dfrad.iloc[:, :50])  # Transform the data
visualizer.show()  # Finalize and render the figure

#MANIFOLD - No balanced
from yellowbrick.features import Manifold
classes = [1, 0]
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder(
예제 #14
0
def visualizeClassImbalance(labels_train, lables_test=None):
    """
    Show the class balance of the training labels, optionally compared
    with a second label set.

    Parameters
    ----------
    labels_train : label vector passed as the first ``fit`` argument
    lables_test : optional label vector passed as the second ``fit``
        argument (``None`` by default)
    """
    balance = ClassBalance(labels=["boring", "interesting"])
    balance.fit(labels_train, lables_test)
    balance.poof()
import pandas as pd
import datetime
from yellowbrick.target import ClassBalance

# Log the start time.
print(datetime.datetime.now())

# Candidate input files; only `pathr4` is read below.
path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'
randomState = 42  # not used in the visible part of this script
classLabels = ['Not Bankrupt', 'Bankrupt']

# The first CSV column is used as the index.
df = pd.read_csv(pathr4, index_col=0)

print('Import done.')

# Extract labels from features
y = df['BK']
X = df.drop('BK', axis=1)

# Instantiate Visualizer
viz = ClassBalance(labels=classLabels)

# Fit on the target, display the chart, then print the visualizer's
# `support_` attribute.
viz.fit(y)
viz.show()
print(viz.support_)
예제 #16
0
 def draw_class_balance(self):
     """Plot the class balance of ``self.training_labels``, naming the
     bars from ``self.le.classes_``."""
     balance = ClassBalance(labels=self.le.classes_)
     balance.fit(self.training_labels)
     balance.poof()
예제 #17
0
def class_balance(classes, y):
    """
    Draw a class balance chart for ``y``.

    Parameters
    ----------
    classes : passed to ``ClassBalance`` as ``labels``
    y : target values the visualizer is fitted on
    """
    # Imported lazily so yellowbrick is only required when this is called.
    from yellowbrick.target import ClassBalance

    chart = ClassBalance(labels=classes)
    chart.fit(y)
    chart.poof()
예제 #18
0
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str],
    key: str = "table-summary",
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:         the function context
    :param table:           pandas dataframe
    :param key:             key of table summary in artifact store
    """
    _gcf_clear(plt)

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path + "/plots", exist_ok=True)

    print(f'TABLE {table}')
    table = pd.read_parquet(str(table))
    header = table.columns.values

    # describe table
    sumtbl = table.describe()
    sumtbl = sumtbl.append(len(table.index) - table.count(), ignore_index=True)
    sumtbl.insert(
        0, "metric",
        ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"])

    sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False)
    context.log_artifact(key, local_path=key + ".csv")

    # plot class balance, record relative class weight
    _gcf_clear(plt)

    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    scale_pos_weight = class_balance_model.support_[
        0] / class_balance_model.support_[1]
    #context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact("scale_pos_weight", str(scale_pos_weight))

    class_balance_model.show(
        outpath=os.path.join(base_path, "plots/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path="plots/imbalance.html")

    # plot feature correlation
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, "plots/corr.png"))
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path="plots/corr.html")

    # plot histogram
    _gcf_clear(plt)
    """
예제 #19
0
display(X.shape)
display(y.shape)

#%%
import matplotlib.pyplot as plt
from yellowbrick.target import ClassBalance

# Count how often each distinct value occurs in y (the values themselves
# are discarded).
_, y_counts = np.unique(y, return_counts=True)
# NOTE(review): assumes the sorted order of y's values corresponds to
# "survived" then "deceased" - confirm against the data.
class_labels = ["survived", "deceased"]

# Two side-by-side axes: a pie chart on the left, the ClassBalance bars on
# the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9,4.5))
ax1.pie(y_counts, explode=(0, 0.05), labels = class_labels)

# Draw onto the second axes; finalize() is used instead of show() so both
# panels are displayed together by the plt.show() below.
visualizer = ClassBalance(labels = class_labels, ax = ax2)
visualizer.fit(y)
visualizer.finalize()

plt.show()

#%%
print("Number of missing values:", X.isna().sum().sum())

#%%
X["timerecurrence"].describe()

#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
#     plt.show()
예제 #20
0
# Remove the 'department' and 'salary' columns in place.
df.drop(columns=['department', 'salary'], axis=1, inplace=True)
df.head()

"""### Now, it's really important to check for Class Imbalance in our dataset here

### Visualize Class Imbalance
---
"""

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

# Class balance of the `quit` column, with bars named stayed / quit.
# NOTE(review): no show()/poof() call follows in this fragment.
visualizer = ClassBalance(labels=["stayed", "quit"])
visualizer.fit(df.quit)

"""###  Create Training and Test Sets
---
"""

# Features are every column except 'quit'; the target is 'quit'.
x = df.loc[:,df.columns != 'quit']
y = df.quit

from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state =0, test_size=0.2, stratify=y)

"""### Building an Interactive Decision Tree Classifier
---
예제 #21
0
df.head()

# Dropping categorical variables ('department' and 'salary'), in place

df.drop(columns=['department','salary'], axis=1, inplace=True)

"""# Step 4: Visualize Class Imbalance
---
"""

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

# Class balance of the `quit` column, with bars named stayed / quit.
# NOTE(review): no show()/poof() call follows in this fragment.
visualizer = ClassBalance(labels=["stayed", "quit"])
visualizer.fit(df.quit)

"""# Step 5: Create Training and Test Sets
---
"""

# Features are every column except 'quit'; the target is 'quit'.
X = df.loc[:, df.columns != 'quit']
y = df.quit

from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0,
                                                    stratify=y)

"""# Step 6: Classification using Decision Tree Classifier
---
예제 #22
0
# Bar chart of quit counts per salary bracket.
pd.crosstab(hr.salary,hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of TurnOver')
plt.show()
# %%
# Bar chart of quit counts per department.
pd.crosstab(hr.department,hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of TurnOver')
plt.show()
# %%
# Remove the two columns plotted above, in place.
hr.drop(columns=['department','salary'],axis=1,inplace=True)
# %%
# Class balance of the `quit` column, with bars named stayed / quit.
visualizer = ClassBalance(labels=['stayed','quit'])
visualizer.fit(hr.quit)
visualizer.show()
# %%
# Features are every column except 'quit'; 80/20 split, stratified on the
# target, with a fixed seed.
X = hr.loc[:,hr.columns!='quit']
y = hr.quit
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)
# %%
@interact
def plot_tree(crit=['gini','entropy'],
              split=['best','random'],
              depth = IntSlider(min=1,max=30,value=2,continous_update=False),
              min_split=IntSlider(min=2,max=5,value=2,continous_update=False),
              min_leaf=IntSlider(min=1,max=5,value=1,continous_update=False)):
    estimator = DecisionTreeClassifier(random_state=0,criterion=crit,splitter=split,max_depth=depth,min_samples_split=min_split,min_samples_leaf=min_leaf)
    estimator.fit(X_train,y_train)
    print(accuracy_score(y_train,estimator.predict(X_train)))