def __init__(self, y_true, y_pred, labels=None, target_names=None,
             sample_weight=None, digits=2, output_dict=False,
             zero_division='warn'):
    self.zero_division = zero_division
    self.output_dict = output_dict
    self.y_pred = y_pred
    self.target_names = target_names
    self.digits = digits
    Metrics.__init__(self, sample_weight=sample_weight, y_true=y_true,
                     labels=labels)
    self.value = CR(y_true=self.y_true,
                    y_pred=self.y_pred,
                    labels=self.labels,
                    target_names=self.target_names,
                    sample_weight=self.sample_weight,
                    digits=self.digits,
                    output_dict=self.output_dict,
                    zero_division=self.zero_division)
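# A hedged sketch of the Metrics base class implied above. Its definition is
# not in the source; the only constraint visible is that after __init__ the
# instance exposes y_true, labels, and sample_weight as attributes.
class Metrics:
    def __init__(self, y_true, labels=None, sample_weight=None):
        self.y_true = y_true
        self.labels = labels
        self.sample_weight = sample_weight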
def evaluateModel(self, test_x, test_y, model):
    """Return the classification report, accuracy, and ROC AUC of a fitted model."""
    from sklearn.metrics import classification_report as CR, roc_auc_score as ROC
    predict_result = model.predict(test_x)
    predict_prob = model.predict_proba(test_x)[:, 1]  # positive-class probabilities
    crReport = CR(test_y, predict_result)
    acc = model.score(test_x, test_y)
    roc = ROC(test_y, predict_prob)
    return crReport, acc, roc
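# A minimal, self-contained check of the evaluateModel logic on a toy binary
# dataset. The estimator choice (LogisticRegression) and the synthetic data
# are assumptions for illustration, not taken from the source.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(classification_report(y_te, clf.predict(X_te)))
print("ROC AUC:", roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))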
def log_anomalyPRF_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters (read for interface consistency; the
    # one-class SVM below does not use them)
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    km = OCS(kernel='linear')
    # A dataset passed as a path is loaded from disk first
    if isinstance(dataset, str):
        dataset = np.load(dataset)
    pred = km.fit_predict(dataset)
    # Map the one-class SVM's outlier label (-1) to class 0
    pred[np.where(pred == -1)[0]] = 0
    # pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
def log_anomalyPRF_AC(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # KMeans alternative:
    # km = KMeans(n_clusters=n_clusters, n_init=cluster_init, n_jobs=-1,
    #             random_state=SEED)
    km = AC(n_clusters=n_clusters)
    # A dataset passed as a path is loaded from disk first
    if isinstance(dataset, str):
        dataset = np.load(dataset)
    pred = km.fit_predict(dataset)
    # Align arbitrary cluster ids with the ground-truth label ids
    pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
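# assign_labels is called above but never defined in the source. A hedged
# sketch of what it plausibly does: map each cluster id to the most frequent
# ground-truth label within that cluster (majority vote). This is an
# assumption, not the original implementation.
import numpy as np

def assign_labels(pred, ground_truth):
    pred = np.asarray(pred)
    ground_truth = np.asarray(ground_truth)
    mapped = np.empty_like(pred)
    for cluster_id in np.unique(pred):
        mask = pred == cluster_id
        labels, counts = np.unique(ground_truth[mask], return_counts=True)
        mapped[mask] = labels[np.argmax(counts)]  # majority label wins
    return mapped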
# sns.kdeplot(iris[['sepal_width', 'sepal_length']][iris['species'] == "setosa"])
# plt.show()

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(iris.drop('species', axis=1),
                                       iris['species'],
                                       test_size=0.3,
                                       random_state=101)

# TRAIN MODEL
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred), CM(y_test, pred))
print(model)

# GRID SEARCH - "THIS IS NOT NECESSARY, THE MODEL IS PERFECT"
param_grid = {
    'C': list(np.arange(0.1, 10, 0.1)),
    'gamma': [1, 0.1, 0.001, 0.0001]
}
grid = GSCV(SVC(), param_grid, verbose=3, n_jobs=4)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)
gpred = grid.predict(X_test)
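# gpred is computed above but never scored. A natural follow-up, added here as
# an assumption rather than source code, is to report the grid's predictions
# with the same metrics as the baseline model:
print(CR(y_test, gpred))
print(CM(y_test, gpred))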
# SCALING
scaler = SS()
scaler.fit(df.drop('TARGET CLASS', axis=1))
scaled = scaler.transform(df.drop('TARGET CLASS', axis=1))
df_scale = pd.DataFrame(scaled, columns=df.columns[:-1])
print(df_scale.head())

# SPLIT DATA INTO TRAINING AND TESTING
X_train, X_test, y_train, y_test = TTS(df_scale,
                                       df['TARGET CLASS'],
                                       test_size=0.3,
                                       random_state=101)

# KNN
model = KNC(n_neighbors=1)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred))
print(CM(y_test, pred))

# CHOOSE K VALUE (ELBOW METHOD)
error_rate = []
for i in range(1, 40):
    model = KNC(n_neighbors=i)
    model.fit(X_train, y_train)
    pred_i = model.predict(X_test)
    error_rate.append(np.mean(y_test != pred_i))
sns.lineplot(x=np.arange(1, 40), y=np.array(error_rate))
plt.show()

# RERUN WITH NEW K (see the sketch below)
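# A hedged completion of the "RERUN WITH NEW K" step. k=23 is a placeholder;
# the actual value should come from the elbow plot above.
model = KNC(n_neighbors=23)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred))
print(CM(y_test, pred))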
model = SVC(kernel=kernel, C=C, gamma=gamma)
clf = model.fit(X_train, Y_train)
end = T()
pred = clf.predict(X_test)
mScore = clf.score(X_test, Y_test)
print(f'Score against Testing Data: {mScore * 100:.3f}%')
print(f'Model took {(end - start) * 1000:.3f}ms to train')

# ### Generate Classification Report

# In[11]:

from sklearn.metrics import classification_report as CR

print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5
start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()
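# The fold scores are computed above but never summarized. A hedged follow-up
# (an assumption, mirroring the timing style used earlier):
print(f'Fold accuracies: {scores}')
print(f'Mean CV accuracy: {scores.mean() * 100:.3f}% '
      f'(+/- {scores.std() * 100:.3f}%)')
print(f'Cross validation took {(end - start) * 1000:.3f}ms')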
def evaluate_model(self):
    """
    Evaluate the model.

    The model is restored from models_path/model_name; if it cannot be
    loaded, the process exits. Evaluation runs on the testing data. Once
    the whole dataset has been forwarded, the classification report and
    confusion matrix are computed.
    """
    # Initialize the variables
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Load variables
    if self.SAVE:
        try:
            modelname = self.MODELS_PATH + self.name
            saver = tf.train.import_meta_graph(modelname + ".meta")
            saver.restore(sess, modelname)
        except Exception:
            print("Failed to restore model. Exiting.")
            exit()

    #### TESTING ####
    Y_true = []
    Y_pred = []
    testing_time = time.time()
    testing_acc = 0
    testing_loss = 0
    # Map integer class ids back to their phonetic labels
    tophonetic = np.vectorize(lambda t: sorted(self.labels)[t])

    for batch_id in range(self.nb_batch_test):
        batch_time = time.time()  # per-batch timer (currently unused)
        # Get batch
        batch_X = self.X_test[batch_id]
        batch_Y = self.Y_test[batch_id]
        lengths = self.lengths_test[batch_id]

        # Get loss and accuracy
        loss, acc, predictions = sess.run(
            fetches=[self.loss, self.acc, self.predictions],
            feed_dict={
                self.X_: batch_X,
                self.Y_: batch_Y,
                self.seq_lengths: lengths
            })

        # Update global variables
        testing_acc += acc
        testing_loss += loss
        for i in range(self.batchsize):
            true = batch_Y[i, :lengths[i]]
            true = np.argmax(true, axis=1)
            Y_true += list(true)
            pred = predictions[i, :lengths[i]]
            pred = np.argmax(pred, axis=1)
            Y_pred += list(pred)

    testing_time = time.time() - testing_time
    testing_acc /= self.nb_batch_test
    testing_loss /= self.nb_batch_test
    self.logger.write_log(
        "\n\nAccuracy:\t%.2f%%\nLoss:\t\t%s\nTime:\t\t%.2fs\n" %
        (100 * testing_acc, testing_loss, testing_time))

    Y_true = tophonetic(Y_true)
    Y_pred = tophonetic(Y_pred)

    # Classification Report (CR)
    self.logger.write_log(CR(Y_true, Y_pred))

    # Confusion Matrix (CM)
    mat = CM(Y_true, Y_pred)
    # Header line ("sum" labels the row-total column appended below)
    CONFMAT = "\t" + "\t".join(
        [lbl[:5] for lbl in sorted(self.labels)]) + "\tsum\n"
    for i, phonetic in enumerate(sorted(self.labels)):
        CONFMAT += phonetic[:5] + "\t" + "\t".join(
            map(str, mat[i].tolist() + [np.sum(mat[i])])) + "\n\n"
    # Footer line: per-column sums
    CONFMAT += "\t" + "\t".join(map(str, np.sum(mat, axis=0).tolist()))
    self.logger.write_log(CONFMAT)
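# A hedged hardening note, not in the source: confusion_matrix orders its rows
# by the sorted labels that actually occur in Y_true/Y_pred, so if a phonetic
# label never appears in the test set the rows drift out of line with the
# header built from sorted(self.labels). Pinning the label order avoids that:
mat = CM(Y_true, Y_pred, labels=sorted(self.labels))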
# NAIVE BAYES
spam_detect_model = MNB().fit(messages_tfidf, df['label'])
pred4 = spam_detect_model.predict(tfidf4)[0]
print(pred4)
pred = spam_detect_model.predict(messages_tfidf)
rate = np.mean(pred == df['label'])
print("Rate: {}\n".format(rate))

# TRAIN AND TEST
msg_train, msg_test, label_train, label_test = TTS(df['msg'],
                                                   df['label'],
                                                   test_size=0.3,
                                                   random_state=64)

# PIPELINE - A WAY TO STORE THE DATA PREPARATION PIPELINE
pipe = Pipeline([
    ('bow', CV(analyzer=text_process)),  # COUNT VECTORIZER
    ('tfidf', TT()),                     # TFIDF TRANSFORMER
    ('classifier', MNB())
])

# FIT AND PREDICT - BUSINESS AS USUAL
pipe.fit(msg_train, label_train)
pred_pipe = pipe.predict(msg_test)
rate = np.mean(pred_pipe == label_test)
print("Rate: {}\n".format(rate))
print(CR(label_test, pred_pipe))
y = yelp_class['stars']

# CREATE COUNT VECTORIZER AND FIT TO X
cv = CV().fit(X)
# OVERWRITE X WITH TRANSFORM
X = cv.transform(X)

# TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# CREATE NAIVE BAYES OBJECT AND FIT
nb = MNB().fit(X_train, y_train)
pred = nb.predict(X_test)
print(CR(y_test, pred))

# PIPELINE
pipe = Pipeline([('CV', CV()), ('TFIDF', TT()), ('BAYES', MNB())])

# REDO SPLIT ON THE RAW TEXT SO THE PIPELINE HANDLES VECTORIZATION
X = yelp_class['text']
y = yelp_class['stars']
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# FIT THE PIPE
pipe.fit(X_train, y_train)

# PREDICT WITH PIPE
pred_pipe = pipe.predict(X_test)
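# pred_pipe is never evaluated above; a hedged follow-up (an assumption)
# scores the pipeline the same way as the unpipelined model:
print(CR(y_test, pred_pipe))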
feat_cols = [tf.feature_column.numeric_column('x', shape=[13])]

# MODEL
deep_model = estimator.DNNClassifier(
    hidden_units=[20, 20, 20, 20],
    feature_columns=feat_cols,
    n_classes=3,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))

# INPUT FUNCTION
input_func = estimator.inputs.numpy_input_fn(x={'x': scaled_x_train},
                                             y=y_train,
                                             shuffle=True,
                                             batch_size=50,
                                             num_epochs=100)

# TRAINING
deep_model.train(input_fn=input_func, steps=500)

# EVALUATION
input_func_eval = estimator.inputs.numpy_input_fn(x={'x': scaled_x_test},
                                                  shuffle=False)
preds = list(deep_model.predict(input_fn=input_func_eval))
predictions = [p['class_ids'][0] for p in preds]
print(CR(y_test, predictions))
print(CM(y_test, predictions))
    test_size=0.3, random_state=64)

# SCALE
scaler = MMS()
scaled_x_train = scaler.fit_transform(X_train)
scaled_x_test = scaler.transform(X_test)

# KERAS LAYERS
dnn_keras_model = models.Sequential()
dnn_keras_model.add(layers.Dense(units=20, input_dim=13, activation='relu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=3, activation='softmax'))

# COMPILE MODEL
dnn_keras_model.compile(optimizer='Adadelta',
                        loss='sparse_categorical_crossentropy',
                        metrics=['accuracy'])

# TRAIN
dnn_keras_model.fit(scaled_x_train, y_train, epochs=50)

# PREDICTIONS
predictions = dnn_keras_model.predict_classes(scaled_x_test)

# EVALUATION
print(CR(y_true=y_test, y_pred=predictions))
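# predict_classes was removed from Keras Sequential models in TF 2.6; an
# equivalent for newer versions (an assumption about the reader's
# environment, not part of the source) is:
import numpy as np
predictions = np.argmax(dnn_keras_model.predict(scaled_x_test), axis=-1)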
# plt.show()

# DUMMY VARIABLES
final = pd.get_dummies(data=df, columns=['purpose'], drop_first=True)
print(final.head())

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(final.drop('not.fully.paid', axis=1),
                                       final['not.fully.paid'],
                                       test_size=0.3,
                                       random_state=101)

# DECISION TREE CLASSIFIER
tree = DTC()
tree.fit(X_train, y_train)

# PREDICT
tpred = tree.predict(X_test)
print(CR(y_test, tpred))
print(CM(y_test, tpred))

# RANDOM FOREST CLASSIFIER
forest = RFC(n_estimators=500)
forest.fit(X_train, y_train)
fpred = forest.predict(X_test)
print(CR(y_test, fpred))
print(CM(y_test, fpred))

# RANDOM FOREST PERFORMED BETTER OVERALL - BUT THE FALSE NEGATIVES INCREASED
# COMPARED TO A SINGLE TREE
        c, hash_bucket_size=n)
    feat = tf.feature_column.embedding_column(cat, dimension=n)
    fcols.append(feat)

# INPUT FUNCTION
input_func_train = tf.estimator.inputs.pandas_input_fn(x=X_train,
                                                       y=y_train,
                                                       batch_size=1000,
                                                       num_epochs=100,
                                                       shuffle=True)

# MODEL
model = tf.estimator.DNNClassifier(hidden_units=[10, 10, 10, 10],
                                   feature_columns=fcols)

# TRAINING
model.train(input_fn=input_func_train, steps=None)

# EVALUATION
input_func_eval = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                      shuffle=False,
                                                      num_epochs=1)
preds = model.predict(input_fn=input_func_eval)
lpreds = list(preds)
cpreds = [pred['class_ids'][0] for pred in lpreds]
print(CM(y_true=y_test, y_pred=cpreds))
print(CR(y_true=y_test, y_pred=cpreds))
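# The snippet above opens mid-statement. A hedged reconstruction of the loop
# that fragment plausibly belongs to, judging from how c, n, cat, and fcols
# are used; the iterable name cat_cols and the shared bucket/embedding size n
# are assumptions:
fcols = []
for c in cat_cols:
    cat = tf.feature_column.categorical_column_with_hash_bucket(
        c, hash_bucket_size=n)
    feat = tf.feature_column.embedding_column(cat, dimension=n)
    fcols.append(feat)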
train = pd.concat([train, sex, embark], axis=1)
train = pd.concat([train, pclass], axis=1)  # SEPARATE LINE FOR EASY ON/OFF SWITCH

# DROP THE NO-LONGER-NEEDED COLUMNS
train.drop(['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId'],
           inplace=True, axis=1)
train.drop(['Pclass'], inplace=True, axis=1)  # SEPARATE LINE FOR EASY ON/OFF SWITCH

# PART 3 - TRAINING AND PREDICTION
X = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=101)
lg = LGR()
lg.fit(X_train, y_train)

# EVALUATE THE RESULTS
pred = pd.DataFrame({'P': lg.predict(X_test), 'R': y_test})
print(pred.corr())
print(CR(pred['R'], pred['P']))
print(CM(pred['R'], pred['P']))
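# For two binary columns the Pearson correlation printed above equals the phi
# coefficient, i.e. the Matthews correlation. A hedged cross-check (an
# addition, not in the source):
from sklearn.metrics import matthews_corrcoef
print(matthews_corrcoef(pred['R'], pred['P']))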
# sns.distplot(df['Outstate'][df['Private'] == 'Yes'], bins=30, kde=False)
# plt.show()
# sns.distplot(df['Grad.Rate'][df['Private'] == 'No'], bins=30, kde=False)
# sns.distplot(df['Grad.Rate'][df['Private'] == 'Yes'], bins=30, kde=False)
# plt.show()

# INFO
print("Name of private school with grad rate higher than 100: {}\n".format(
    df.loc[df['Grad.Rate'] > 100].index.values[0]))
# Cap the impossible >100% graduation rate at 100
df.loc[df['Grad.Rate'] > 100, 'Grad.Rate'] = 100
# sns.distplot(df['Grad.Rate'][df['Private'] == 'No'], bins=30, kde=False)
# sns.distplot(df['Grad.Rate'][df['Private'] == 'Yes'], bins=30, kde=False)
# plt.show()

# K MEANS
# FIT
model = KM(n_clusters=2)
model.fit(df.drop('Private', axis=1))

# CENTER VECTORS
print("Fitted model's cluster centers:\n{}\n".format(model.cluster_centers_))

# CREATE NUMERICAL CLUSTER COLUMN
df['Cluster'] = df['Private'].map({'Yes': 1, 'No': 0})
print(CR(df['Cluster'], model.labels_))
print(CM(df['Cluster'], model.labels_))
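# KMeans assigns cluster ids arbitrarily, so the report above may look
# inverted. A hedged check (not in the source): score the flipped assignment
# too, and take the better of the two as the fair comparison.
flipped = 1 - model.labels_
print(CR(df['Cluster'], flipped))
print(CM(df['Cluster'], flipped))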