def balance_class_balance(path="images/class_balance.png"):
    """Render a class-balance chart for the game dataset outcomes.

    Parameters
    ----------
    path : str
        Output path for the rendered figure.

    Returns
    -------
    The axes returned by the visualizer's ``poof`` call.
    """
    dataset = load_game()
    outcomes = dataset["outcome"]
    viz = ClassBalance(labels=["draw", "loss", "win"])
    viz.fit(outcomes)
    return viz.poof(outpath=path)
def balance():
    """Plot the train/test class balance for the occupancy dataset and save it."""
    X, y = load_occupancy()
    # Only the label halves of the split are needed by the visualizer.
    _, _, y_train, y_test = tts(X, y, test_size=0.2)
    viz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"])
    viz.fit(y_train, y_test)
    savefig(viz, "class_balance")
def compare_class_balance(path="images/class_balance_compare.png"):
    """Compare train/test class balance on the occupancy dataset.

    Parameters
    ----------
    path : str
        Where the rendered comparison figure is written.

    Returns
    -------
    The axes returned by the visualizer's ``poof`` call.
    """
    frame = load_occupancy()
    feature_names = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    class_names = ['unoccupied', 'occupied']

    # Pull the feature matrix and target vector out of the data frame.
    X = frame[feature_names]
    y = frame["occupancy"]

    # Split the data; only the label portions feed the visualizer.
    _, _, y_train, y_test = train_test_split(X, y, test_size=0.2)

    viz = ClassBalance(labels=class_names)
    viz.fit(y_train, y_test)
    return viz.poof(outpath=path)
def balance_yellowbrick(
    X,
    y,
    features,
):
    """Draw a class-balance plot over a small stratified sample of ``y``.

    Switches matplotlib to the non-interactive 'agg' backend, then fits a
    ClassBalance visualizer on a 1% stratified hold-out of the labels.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module carrying the finalized figure.
    """
    plt.switch_backend('agg')
    plt.clf()
    # Keep only the 1% stratified hold-out; the visualizer just needs labels.
    _, held_X, _, held_y = train_test_split(
        X[features], y, stratify=y, test_size=0.01
    )
    X = pd.DataFrame(held_X, columns=features)
    y = pd.Series(held_y)
    viz = ClassBalance()
    viz.fit(y)
    viz.finalize()
    return plt
def Imbalance(y):
    """Plot the class imbalance between Ebola-negative and -positive labels.

    Parameters
    ----------
    y : array-like
        Vector of labels.

    Returns
    -------
    None
        Renders a bar chart of the per-class support counts.
    """
    # Build the visualizer, fit the labels, then render the figure.
    balance = ClassBalance(labels=['Ebola negative', 'Ebola positive'])
    balance.fit(y)
    plt.show()
def Imbalance_out(y):
    """Plot the class imbalance of the outcome labels (Survival vs. Death).

    Parameters
    ----------
    y : array-like
        Vector of labels.

    Returns
    -------
    None
        Renders a bar chart of the per-class support counts.
    """
    # Build the visualizer, fit the labels, then render the figure.
    balance = ClassBalance(labels=['Survival', 'Death'])
    balance.fit(y)
    plt.show()
bs[index] = features else: o[(index * n_ngrams) + i] = features bs = bs[~np.all(bs == 0, axis=1)] o = o[~np.all(o == 0, axis=1)] binding_sites = bs other = o binding_sites_labels = np.ones(binding_sites.shape[0], dtype=np.uint8) other_labels = np.zeros(other.shape[0], dtype=np.uint8) X = np.concatenate((binding_sites, other)) y = np.concatenate((binding_sites_labels, other_labels)) # %% visualizer = ClassBalance(labels=class_names) visualizer.fit(y) visualizer.poof() # %% visualizer = ParallelCoordinates() visualizer.fit_transform(X, y) visualizer.poof() # %% visualizer = Rank1D() visualizer.fit(X, y) visualizer.transform(X) visualizer.poof() # %% visualizer = Rank2D()
# código pronto y_top25 = top_p(test_y) y_top25.mean() # In[69]: # código pronto from yellowbrick.target import ClassBalance visualizer = ClassBalance(labels=["75%", "25%"]) visualizer.fit(y_top25) visualizer.show() # ## Para saber mais: agrupando # # O `yellowbrick` possui uma função para visualizar possíveis binnings. O código a seguir mostra 4 sugestões de pontos para agrupamento. Não usaremos a sugestão do yellowbrick pois no nosso caso o cliente já definiu que queria os X% do topo. # In[70]: # código pronto from yellowbrick.target import BalancedBinningReference visualizer = BalancedBinningReference()
max_depth=5, max_feature='auto', max_leaf_nodes = None, min_impurity_decrease=0.0, min_impurity_split= None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=1, verbose=False, warn_start=False) viz= FeatureImportances(rf) viz.fit(X_train,y_train) viz.show(); dt = DecisionForestClassifer(class_weight = None, criterion='gini', max_depth=3, max_feature='None', max_leaf_nodes = None, min_impurity_decrease=0.0, min_impurity_split= None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False,random_state=0, splitters='best') viz= FeatureImportances(dt) viz.fit(X_train,y_train) viz.show(); from yellowbrick.classifer import ROCAUC visualizer = ROCAUC(rf, classes=['stayed','quit']) visualizer.fit(X_train, y_train) visulizer.score(X_test, y_test) visualizer.pool();
def main():
    """Entry point: load an activity dataset, preprocess it, and dispatch to
    the classifier module selected on the command line.

    Pipeline (mirrors the printed step banners):
      1. Null / N-A checks
      2. Class-imbalance handling via SMOTE oversampling
      3. Label encoding of the 'Activity' target
      4. Covariance / correlation statistics
      5. Train/test split
      6. Standardization, then model dispatch

    Side effects: prints diagnostics to stdout and writes class-balance
    charts under the 'output' directory.
    """
    # argcheck()/filecheck() are project helpers — presumably parse argv and
    # load the CSV into a DataFrame; TODO confirm against their definitions.
    script, fname, model = argcheck()
    df = filecheck(fname)
    print(df.head(5))
    #Data stats
    #Printing the number of rows and columns
    print(df.info())
    print("The number of rows")
    print(len(df))
    print("The number of columns")
    print(len(df.columns))
    print("Dataframe shape")
    print(df.shape)
    #Data preprocessing - step 1(Check for any null - N/A values)
    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")
    print("Checking for any N/A values")
    print(df.isna().values.any())
    #Check for any Null values
    print("Checking for any null values")
    print(df.isnull().values.any())
    #Data Preprocessing - step 2(Addressing class imbalance problem)
    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")
    # Separate the 'Activity' target from the feature columns.
    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)
    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())
    unique_labels, frequency = np.unique(Y, return_counts=True)
    #Generating class balance chart before applying SMOTE. The chart is
    #generated as 'Class-balance-Before-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    # size is in pixels (width, height) per yellowbrick's ``size`` parameter.
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")
    #Solving the class imbalance problem by oversampling the data
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)
    print("After applying SMOTE algorithm")
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)
    print("The new shape of the X dataframe")
    print(X_1_df.shape)
    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)
    unique, frequency = np.unique(Y_1, return_counts=True)
    # print unique values array
    print("Unique Values of new Y dataframe:", unique)
    # print frequency array
    print("Frequency Values of new Y dataframe:", frequency)
    #Generating class balance chart after applying SMOTE. The chart is
    #generated as 'Class-balance-After-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")
    #Data Preprocessing - step 3(Label Encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")
    #Convert the string labels to integers
    # 0- 'LAYING'
    # 1 - 'SITTING'
    # 2 - 'STANDING'
    # 3 - 'WALKING'
    # 4 - 'WALKING_DOWNSTAIRS'
    # 5 - 'WALKING_UPSTAIRS'
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    # ``classes`` (encoded label values) is later passed to some model modules.
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])
    #Data Preprocessing - step 4(Covariance/Correlation, standardization)
    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    #Covariance and correlation - Task 1(Preeti)
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)
    #Calculates Pearson product-moment correlation coefficients
    # NOTE(review): numpy deprecated ``bias`` for corrcoef (it has no effect);
    # kept as-is to avoid changing behavior here.
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False, bias=True)
    print("Correlation coefficient obtained : ", dfCorr)
    #Data preprocessing - Step 5(Splitting the training and testing dataset) (JunYong or Preeti)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    # 80/20 split, fixed seed for reproducibility (no stratification).
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)
    #Data preprocessing - Step 6(Standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    # Fit the scaler on the training set only; apply it to both splits.
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    X_testscaled = sc_X.transform(X_test)
    print("Mean of the standardized training set : ", X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))
    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))
    # Execute different model module based on input from user
    # NOTE(review): the unscaled X_train/X_test are passed to the models, not
    # the standardized arrays computed above — confirm this is intentional.
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)
    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes, X_1_df,
                          Y_1_df)
    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)
    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)
    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df,
                              Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df,
                                    Y_1_df)
    else:
        # Unknown classifier name: report and exit with the default status.
        print("please enter the correct classifier name")
        sys.exit()
#morg_2_train_strs_broken #morg_2_test_strs_broken #activ_inact_train #activ_inact_test frames = [morg_2_train_strs_broken, activ_inact_train] import pandas as pd dfrad = pd.concat(frames, axis=1) dfrad = dfrad.dropna() #dfrad.iloc[:,[2048]] #dfrad.iloc[:,:100] #CLASS BALANCE - No balanced from yellowbrick.target import ClassBalance visCB = ClassBalance(labels=[1, 0]) visCB.fit(dfrad['activities']) #Fit the data to the visualizer visCB.show() #Finalize and render the figure #RANK 2D "Pearson correlation" -No balanced from yellowbrick.features import Rank2D visualizer = Rank2D(algorithm='pearson') visualizer.fit(dfrad.iloc[:, :50], dfrad['activities']) # Fit the data to the visualizer visualizer.transform(dfrad.iloc[:, :50]) # Transform the data visualizer.show() # Finalize and render the figure #MANIFOLD - No balanced from yellowbrick.features import Manifold classes = [1, 0] from sklearn import preprocessing label_encoder = preprocessing.LabelEncoder(
def visualizeClassImbalance(labels_train, lables_test=None):
    """Show a class-balance chart for training (and, optionally, test) labels.

    NOTE(review): the second parameter is spelled ``lables_test`` in the
    public signature; kept unchanged for keyword-caller compatibility.
    """
    balance_viz = ClassBalance(labels=["boring", "interesting"])
    balance_viz.fit(labels_train, lables_test)
    balance_viz.poof()
import pandas as pd
import datetime

from yellowbrick.target import ClassBalance

# Timestamp the start of the run.
print(datetime.datetime.now())

# Candidate input files; only ``pathr4`` (Tomek-links resampled) is read below.
path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'

randomState = 42
classLabels = ['Not Bankrupt', 'Bankrupt']

df = pd.read_csv(pathr4, index_col=0)
print('Import done.')

# Separate the bankruptcy flag from the feature columns.
y = df['BK']
X = df.drop(columns='BK')

# Plot the class balance, then report the per-class support counts.
viz = ClassBalance(labels=classLabels)
viz.fit(y)
viz.show()
print(viz.support_)
def draw_class_balance(self):
    """Render a class-balance chart of the training labels.

    Axis labels come from the fitted label-encoder's ``classes_``.
    """
    balance = ClassBalance(labels=self.le.classes_)
    balance.fit(self.training_labels)
    balance.poof()
def class_balance(classes, y):
    """Display a bar chart of per-class support for the target vector ``y``.

    Parameters
    ----------
    classes : list
        Label names shown on the chart.
    y : array-like
        Target values whose balance is plotted.
    """
    from yellowbrick.target import ClassBalance

    balance_plot = ClassBalance(labels=classes)
    balance_plot.fit(y)
    balance_plot.poof()
def describe( context: MLClientCtx, table: Union[DataItem, str], label_column: str, class_labels: List[str], key: str = "table-summary", ) -> None: """Summarize a table TODO: merge with dask version :param context: the function context :param table: pandas dataframe :param key: key of table summary in artifact store """ _gcf_clear(plt) base_path = context.artifact_path os.makedirs(base_path, exist_ok=True) os.makedirs(base_path + "/plots", exist_ok=True) print(f'TABLE {table}') table = pd.read_parquet(str(table)) header = table.columns.values # describe table sumtbl = table.describe() sumtbl = sumtbl.append(len(table.index) - table.count(), ignore_index=True) sumtbl.insert( 0, "metric", ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"]) sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False) context.log_artifact(key, local_path=key + ".csv") # plot class balance, record relative class weight _gcf_clear(plt) labels = table.pop(label_column) class_balance_model = ClassBalance(labels=class_labels) class_balance_model.fit(labels) scale_pos_weight = class_balance_model.support_[ 0] / class_balance_model.support_[1] #context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}") context.log_artifact("scale_pos_weight", str(scale_pos_weight)) class_balance_model.show( outpath=os.path.join(base_path, "plots/imbalance.png")) context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path="plots/imbalance.html") # plot feature correlation _gcf_clear(plt) tblcorr = table.corr() ax = plt.axes() sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds) ax.set_title("features correlation") plt.savefig(os.path.join(base_path, "plots/corr.png")) context.log_artifact(PlotArtifact("correlation", body=plt.gcf()), local_path="plots/corr.html") # plot histogram _gcf_clear(plt) """
# Show the dimensions of the feature matrix and target vector.
display(X.shape)
display(y.shape)

#%%
import matplotlib.pyplot as plt
from yellowbrick.target import ClassBalance

# Count occurrences of each class (labels themselves are discarded here).
_, y_counts = np.unique(y, return_counts=True)
class_labels = ["survived", "deceased"]
# Side-by-side figure: pie chart on the left, ClassBalance bars on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9,4.5))
ax1.pie(y_counts, explode=(0, 0.05), labels = class_labels)
visualizer = ClassBalance(labels = class_labels, ax = ax2)
visualizer.fit(y)
# finalize() (instead of show()) so the shared figure is rendered once below.
visualizer.finalize()
plt.show()

#%%
print("Number of missing values:", X.isna().sum().sum())

#%%
X["timerecurrence"].describe()

#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
#     plt.show()
df.drop(columns=['department', 'salary'], axis=1, inplace=True) df.head() """### Now, it's really important to check for Class Imbalance in our dataset here ### Visualize Class Imbalance --- """ from yellowbrick.target import ClassBalance plt.style.use("ggplot") plt.rcParams['figure.figsize'] = (12,8) visualizer = ClassBalance(labels=["stayed", "quit"]) visualizer.fit(df.quit) """### Create Training and Test Sets --- """ x = df.loc[:,df.columns != 'quit'] y = df.quit from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(x, y, random_state =0, test_size=0.2, stratify=y) """### Building an Interactive Decision Tree Classifier ---
df.head() # Dropping categorical variables df.drop(columns=['department','salary'], axis=1, inplace=True) """# Step 4: Visualize Class Imbalance --- """ from yellowbrick.target import ClassBalance plt.style.use("ggplot") plt.rcParams['figure.figsize'] = (12,8) visualizer = ClassBalance(labels=["stayed", "quit"]) visualizer.fit(df.quit) """# Step 5: Create Training and Test Sets --- """ X = df.loc[:, df.columns != 'quit'] y = df.quit from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y) """# Step 6: Classification using Decision Tree Classifier ---
# Turnover frequency broken down by salary bracket.
pd.crosstab(hr.salary, hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of TurnOver')
plt.show()

# %%
# Turnover frequency broken down by department.
pd.crosstab(hr.department, hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of TurnOver')
plt.show()

# %%
# Drop the two categorical columns in place; ``axis=1`` is redundant (and
# ignored by pandas) when ``columns=`` is used, so it has been removed.
hr.drop(columns=['department', 'salary'], inplace=True)

# %%
# Visualize the class balance of the target before modeling.
visualizer = ClassBalance(labels=['stayed', 'quit'])
visualizer.fit(hr.quit)
visualizer.show()

# %%
# Stratified 80/20 train/test split on everything except the target.
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# %%
@interact
def plot_tree(crit=['gini', 'entropy'],
              split=['best', 'random'],
              # fixed: the IntSlider keyword is ``continuous_update`` — it was
              # misspelled ``continous_update``, which newer ipywidgets reject
              # and older versions silently ignore.
              depth=IntSlider(min=1, max=30, value=2, continuous_update=False),
              min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
              min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    """Interactively fit a decision tree and print its training accuracy."""
    estimator = DecisionTreeClassifier(random_state=0, criterion=crit,
                                       splitter=split, max_depth=depth,
                                       min_samples_split=min_split,
                                       min_samples_leaf=min_leaf)
    estimator.fit(X_train, y_train)
    # NOTE(review): accuracy is computed on the TRAINING set only.
    print(accuracy_score(y_train, estimator.predict(X_train)))