Exemplo n.º 1
0
def balance_class_balance(path="images/class_balance.png"):
    data = load_game()
    y = data["outcome"]

    oz = ClassBalance(labels=["draw", "loss", "win"])
    oz.fit(y)
    return oz.poof(outpath=path)
Exemplo n.º 2
0
def balance():
    X, y = load_occupancy()
    _, _, y_train, y_test = tts(X, y, test_size=0.2)

    oz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"])
    oz.fit(y_train, y_test)
    savefig(oz, "class_balance")
Exemplo n.º 3
0
def balance_class_balance(path="images/class_balance.png"):
    data = load_game()
    y = data["outcome"]

    oz = ClassBalance(labels=["draw", "loss", "win"])
    oz.fit(y)
    return oz.poof(outpath=path)
Exemplo n.º 4
0
 def target_visualizer(self,
                       classes=None,
                       params={'BalancedBinningReference': {
                           'bins': 5
                       }}):
     LOGGER.info('Initializing target visualizer')
     if os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')) == False:
         os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
     visualizers = []
     y = self.y.squeeze()
     try:
         LOGGER.info('Visualizer BalancedBinningReference')
         visualizer = BalancedBinningReference()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = BalancedBinningReference(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR BalancedBinning')
     try:
         LOGGER.info('Visualizer CLassBalance')
         visualizer = ClassBalance()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = ClassBalance(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR ClassBalance')
     try:
         LOGGER.info('Visualizer Feature Correlation')
         visualizer = FeatureCorrelation(
             method='mutual_info-classification',
             feature_names=self.X.columns.tolist(),
             sort=True)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = FeatureCorrelation(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(self.X, y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR FeatureCorrelation')
Exemplo n.º 5
0
def compare_class_balance(path="images/class_balance_compare.png"):
    data = load_occupancy()

    features = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    classes = ['unoccupied', 'occupied']

    # Extract the numpy arrays from the data frame
    X = data[features]
    y = data["occupancy"]

    # Create the train and test data
    _, _, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the classification model and visualizer
    visualizer = ClassBalance(labels=classes)

    visualizer.fit(y_train, y_test)
    return visualizer.poof(outpath=path)
Exemplo n.º 6
0
def compare_class_balance(path="images/class_balance_compare.png"):
    data = load_occupancy()

    features = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    classes = ['unoccupied', 'occupied']

    # Extract the numpy arrays from the data frame
    X = data[features]
    y = data["occupancy"]

    # Create the train and test data
    _, _, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the classification model and visualizer
    visualizer = ClassBalance(labels=classes)

    visualizer.fit(y_train, y_test)
    return visualizer.poof(outpath=path)
Exemplo n.º 7
0
def Imbalance(y):
    """ Imabalance between the labels

    Parameters
    ----------
    y: vector of labels

    Returns
    -------
    
    - A plot with the class imbalances for Ebola positive or negative
    
    """
    # Instantiate the visualizer
    visualizer = ClassBalance(labels=['Ebola negative', 'Ebola positive'])

    visualizer.fit(y)  # Fit the data to the visualizer
    #visualizer.show('class_balance')        # Finalize and render the figure
    plt.show()
Exemplo n.º 8
0
def Imbalance_out(y):
    """ Imabalance between the labels

    Parameters
    ----------
    y: vector of labels

    Returns
    -------
    
    - A plot with the class imbalances for the outcome
    
    """
    # Instantiate the visualizer
    visualizer = ClassBalance(labels=['Survival', 'Death'])

    visualizer.fit(y)  # Fit the data to the visualizer
    #visualizer.show('class_balance')        # Finalize and render the figure
    plt.show()
Exemplo n.º 9
0
def balance_yellowbrick(
    X,
    y,
    features,
):
    plt.switch_backend('agg')
    plt.clf()
    X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                        y,
                                                        stratify=y,
                                                        test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    visualizer = ClassBalance()
    visualizer.fit(y)
    visualizer.finalize()

    return plt
Exemplo n.º 10
0
hr = pd.read_csv("F:/COLLEGE/ML Project/Employee Turnover/Turnover.csv")
hr.profile_report(title="Data Report")

pd.crosstab(hr.department, hr.quit).plot(kind='bar')
plt.title("Turnover Frequency on Salary Bracket")
plt.xlabel('Department')
plt.ylabel('Frequency of Turnover')
cat_vars = ['department', 'salary']
for var in cat_vars:
    cat_list = pd.get_dummies(hr[var], prefix=var)
    hr = hr.join(cat_list)
print(hr.head())
hr.drop(columns=['department', 'salary'], axis=1, inplace=True)

#balance class
visualizer = ClassBalance(labels=['stayed', 'left']).fit(hr.quit)
visualizer.show()
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.2,
                                                    stratify=y)


@interact
def plot_tree(crit=['gini', 'entropy'],
              split=['best', 'random'],
              depth=IntSlider(min=1, max=30, value=2, continuous_update=False),
              min_split=IntSlider(min=2,
Exemplo n.º 11
0
cat_vars = ['department', 'salary']
for var in cat_vars:
  cat_list = pd.get_dummies(hr[var], prefix = var)
  hr = hr.join(cat_list)

hr.head()

hr.drop(columns = ['department', 'salary'], axis = 1, inplace = True)

#Task 4: Visualize class imbalance

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

visualizer =  ClassBalance(labels = ['stayed','quit']).fit(hr.quit)
visualizer.show()

#Task 5: Create traning and tests sets

X = hr.loc[:, hr.columns != 'quit']
y = hr.quit

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 0, test_size=0.2,stratify=y)

#Task 6 : Build an interactive decision tree classifer

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
Exemplo n.º 12
0
def main():

    script, fname, model = argcheck()

    df = filecheck(fname)

    print(df.head(5))

    #Data stats
    #Printing the number of rows and columns
    print(df.info())

    print("The number of rows")
    print(len(df))

    print("The number of columns")
    print(len(df.columns))

    print("Dataframe shape")
    print(df.shape)

    #Data preprocessing - step 1(Check for any null - N/A values)

    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")

    print("Checking for any N/A values")
    print(df.isna().values.any())

    #Check for any Null values
    print("Checking for any null values")
    print(df.isnull().values.any())

    #Data Preprocessing - step 2(Addressing class imbalance problem)

    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")

    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)

    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())

    unique_labels, frequency = np.unique(Y, return_counts=True)
    #Generating class balance chart before applying SMOTE. The chart is generated as 'Class-balance-Before-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")

    #Solving the class imbalance problem by oversampling the data
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)

    print("After applying SMOTE algorithm")
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)

    print("The new shape of the X dataframe")
    print(X_1_df.shape)

    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)

    unique, frequency = np.unique(Y_1, return_counts=True)
    # print unique values array
    print("Unique Values of new Y dataframe:", unique)

    # print frequency array
    print("Frequency Values of new Y dataframe:", frequency)

    #Generating class balance chart after applying SMOTE. The chart is generated as 'Class-balance-After-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")

    #Data Preprocessing - step 3(Label Encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")

    #Convert the string labels to integers
    # 0- 'LAYING'
    # 1 - 'SITTING'
    # 2 - 'STANDING'
    # 3 - 'WALKING'
    # 4 - 'WALKING_DOWNSTAIRS'
    # 5 - 'WALKING_UPSTAIRS'
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])

    #Data Preprocessing - step 4(Covariance/Correlation, standardization)

    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    #Covariance and correlation - Task 1(Preeti)
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)

    #Calculates Pearson product-moment correlation coefficients
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False, bias=True)
    print("Correlation coefficient obtained : ", dfCorr)

    #Data preprocessing - Step 5(Splitting the training and testing dataset) (JunYong or Preeti)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)

    #Data preprocessing - Step 6(Standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    X_testscaled = sc_X.transform(X_test)

    print("Mean of the standardized training set : ",
          X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))

    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))

    # Execute different model module based on input from user
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)

    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes, X_1_df,
                          Y_1_df)

    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)

    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)

    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)

    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df,
                                    Y_1_df)
    else:
        print("please enter the correct classifier name")
        sys.exit()
Exemplo n.º 13
0
# =============================================================================
#morg_2_train_strs_broken
#morg_2_test_strs_broken
#activ_inact_train
#activ_inact_test

frames = [morg_2_train_strs_broken, activ_inact_train]
import pandas as pd
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
#dfrad.iloc[:,[2048]]
#dfrad.iloc[:,:100]

#CLASS BALANCE - No balanced
from yellowbrick.target import ClassBalance
visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  #Fit the data to the visualizer
visCB.show()  #Finalize and render the figure

#RANK 2D "Pearson correlation" -No balanced
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(dfrad.iloc[:, :50],
               dfrad['activities'])  # Fit the data to the visualizer
visualizer.transform(dfrad.iloc[:, :50])  # Transform the data
visualizer.show()  # Finalize and render the figure

#MANIFOLD - No balanced
from yellowbrick.features import Manifold
classes = [1, 0]
from sklearn import preprocessing
Exemplo n.º 14
0
def visualizeClassImbalance(labels_train, lables_test=None):
    visualizer = ClassBalance(labels=["boring", "interesting"])
    visualizer.fit(labels_train, lables_test)
    visualizer.poof()
Exemplo n.º 15
0
 def draw_class_balance(self):
     visualizer = ClassBalance(labels=self.le.classes_)
     visualizer.fit(self.training_labels)
     visualizer.poof()
Exemplo n.º 16
0
y = df['eventdeath']

display(X.shape)
display(y.shape)

#%%
import matplotlib.pyplot as plt
from yellowbrick.target import ClassBalance

_, y_counts = np.unique(y, return_counts=True)
class_labels = ["survived", "deceased"]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9,4.5))
ax1.pie(y_counts, explode=(0, 0.05), labels = class_labels)

visualizer = ClassBalance(labels = class_labels, ax = ax2)
visualizer.fit(y)
visualizer.finalize()

plt.show()

#%%
print("Number of missing values:", X.isna().sum().sum())

#%%
X["timerecurrence"].describe()

#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
Exemplo n.º 17
0
# %%
pd.crosstab(hr.salary,hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of TurnOver')
plt.show()
# %%
pd.crosstab(hr.department,hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of TurnOver')
plt.show()
# %%
hr.drop(columns=['department','salary'],axis=1,inplace=True)
# %%
visualizer = ClassBalance(labels=['stayed','quit'])
visualizer.fit(hr.quit)
visualizer.show()
# %%
X = hr.loc[:,hr.columns!='quit']
y = hr.quit
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)
# %%
@interact
def plot_tree(crit=['gini','entropy'],
              split=['best','random'],
              depth = IntSlider(min=1,max=30,value=2,continous_update=False),
              min_split=IntSlider(min=2,max=5,value=2,continous_update=False),
              min_leaf=IntSlider(min=1,max=5,value=1,continous_update=False)):
    estimator = DecisionTreeClassifier(random_state=0,criterion=crit,splitter=split,max_depth=depth,min_samples_split=min_split,min_samples_leaf=min_leaf)
    estimator.fit(X_train,y_train)
Exemplo n.º 18
0

# código pronto

y_top25 = top_p(test_y)
y_top25.mean()


# In[69]:


# código pronto

from yellowbrick.target import ClassBalance

visualizer = ClassBalance(labels=["75%", "25%"])
visualizer.fit(y_top25)
visualizer.show()


# ## Para saber mais: agrupando
# 
# O `yellowbrick` possui uma função para visualizar possíveis binnings. O código a seguir mostra 4 sugestões de pontos para agrupamento. Não usaremos a sugestão do yellowbrick pois no nosso caso o cliente já definiu que queria os X% do topo.

# In[70]:


# código pronto

from yellowbrick.target import BalancedBinningReference
#

# ### Task 4: Visualize Class Imbalance
# ---

# In[16]:

from yellowbrick.target import ClassBalance  # machine learning diagnostic library

plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12, 8)

# In[17]:

# SHOW THE TARGET VARIABLES IN 1 AND 0 IN THE FORM OF STAYED OR QUIT
visualizer = ClassBalance(labels=['stayed', 'quit']).fit(
    data.quit)  # fit(target_variable)
visualizer.show()

# ### Task 5: Create Training and Test Sets
# ---

# In[18]:

x = data.loc[:, data.columns != 'quit']
y = data.quit

# In[19]:

from sklearn.model_selection import train_test_split

# since the dataset is unbalanced, lets stratify the target variables
Exemplo n.º 20
0
def class_balance(classes, y):
    from yellowbrick.target import ClassBalance
    visualizer = ClassBalance(labels=classes)
    visualizer.fit(y)
    visualizer.poof()
Exemplo n.º 21
0
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str],
    key: str = "table-summary",
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:         the function context
    :param table:           pandas dataframe
    :param key:             key of table summary in artifact store
    """
    _gcf_clear(plt)

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path + "/plots", exist_ok=True)

    print(f'TABLE {table}')
    table = pd.read_parquet(str(table))
    header = table.columns.values

    # describe table
    sumtbl = table.describe()
    sumtbl = sumtbl.append(len(table.index) - table.count(), ignore_index=True)
    sumtbl.insert(
        0, "metric",
        ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"])

    sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False)
    context.log_artifact(key, local_path=key + ".csv")

    # plot class balance, record relative class weight
    _gcf_clear(plt)

    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    scale_pos_weight = class_balance_model.support_[
        0] / class_balance_model.support_[1]
    #context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact("scale_pos_weight", str(scale_pos_weight))

    class_balance_model.show(
        outpath=os.path.join(base_path, "plots/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path="plots/imbalance.html")

    # plot feature correlation
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, "plots/corr.png"))
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path="plots/corr.html")

    # plot histogram
    _gcf_clear(plt)
    """
Exemplo n.º 22
0
        if (ngram.isupper()):
            bs[index] = features
        else:
            o[(index * n_ngrams) + i] = features
bs = bs[~np.all(bs == 0, axis=1)]
o = o[~np.all(o == 0, axis=1)]

binding_sites = bs
other = o
binding_sites_labels = np.ones(binding_sites.shape[0], dtype=np.uint8)
other_labels = np.zeros(other.shape[0], dtype=np.uint8)
X = np.concatenate((binding_sites, other))
y = np.concatenate((binding_sites_labels, other_labels))

# %%
visualizer = ClassBalance(labels=class_names)
visualizer.fit(y)
visualizer.poof()

# %%
visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
import pandas as pd
import datetime
from yellowbrick.target import ClassBalance

print(datetime.datetime.now())

path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'
randomState = 42
classLabels = ['Not Bankrupt', 'Bankrupt']

df = pd.read_csv(pathr4, index_col=0)

print('Import done.')

# Extract labels from features
y = df['BK']
X = df.drop('BK', axis=1)

# Instantiate Visualizer
viz = ClassBalance(labels=classLabels)

viz.fit(y)
viz.show()
print(viz.support_)
#Task 3: Encode Categorical Features

cat_vars = ['department', 'salary']
for var in cat_vars:
    cat_list = pd.get_dummies(hr[var], prefix=var)
    hr = hr.join(cat_list)
hr.head()
hr.drop(columns=['department', 'salary'], axis=1, inplace=True)
hr.head()

#Task 4: Visualize Class Imbalance

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12, 8)
visualizer = ClassBalance(label=['satyed', 'quit']).fit(hr.quit)
visualizer.show()

#Task 5: Create Training and Test Sets

X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, Y, stratify=Y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.2,
                                                    stratify=y)
Exemplo n.º 25
0
df.head()

df.drop(columns=['department', 'salary'], axis=1, inplace=True)
df.head()

"""### Now, it's really important to check for Class Imbalance in our dataset here

### Visualize Class Imbalance
---
"""

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

visualizer = ClassBalance(labels=["stayed", "quit"])
visualizer.fit(df.quit)

"""###  Create Training and Test Sets
---
"""

x = df.loc[:,df.columns != 'quit']
y = df.quit

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state =0, test_size=0.2, stratify=y)

"""### Building an Interactive Decision Tree Classifier
---
Exemplo n.º 26
0
df.head()

# Dropping categorical variables

df.drop(columns=['department','salary'], axis=1, inplace=True)

"""# Step 4: Visualize Class Imbalance
---
"""

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

visualizer = ClassBalance(labels=["stayed", "quit"])
visualizer.fit(df.quit)

"""# Step 5: Create Training and Test Sets
---
"""

X = df.loc[:, df.columns != 'quit']
y = df.quit

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0,
                                                    stratify=y)

"""# Step 6: Classification using Decision Tree Classifier
Exemplo n.º 27
0
top_10 = top_p(pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), p=0.90).values
if not np.array_equal(top_10, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]):
    print("Não retornou o top 10% corretamente, deveria ser", top_10)


# código pronto

y_top25 = top_p(test_y)
y_top25.mean()


# código pronto

from yellowbrick.target import ClassBalance

visualizer = ClassBalance(labels=["75get_ipython().run_line_magic("",", " \"25%\"])")
visualizer.fit(y_top25)
visualizer.poof()


# código pronto

from yellowbrick.target import BalancedBinningReference

visualizer = BalancedBinningReference()
visualizer.fit(train_y)
visualizer.poof()


# código pronto
from sklearn.dummy import DummyClassifier