Example #1
    def __init__(self,
                 y_true,
                 y_pred,
                 labels=None,
                 target_names=None,
                 sample_weight=None,
                 digits=2,
                 output_dict=False,
                 zero_division='warn'):
        self.zero_division = zero_division
        self.output_dict = output_dict
        self.y_pred = y_pred
        self.target_names = target_names
        self.digits = digits
        Metrics.__init__(self,
                         sample_weight=sample_weight,
                         y_true=y_true,
                         labels=labels)
        self.value = CR(sample_weight=self.sample_weight,
                        zero_division=self.zero_division,
                        digits=self.digits,
                        y_true=self.y_true,
                        labels=self.labels,
                        output_dict=self.output_dict,
                        y_pred=self.y_pred,
                        target_names=self.target_names)
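The enclosing class name is not shown in this snippet; a minimal usage sketch, assuming the class is called ClassificationReport (a hypothetical name), that CR aliases sklearn.metrics.classification_report, and that the Metrics base class stores sample_weight, y_true, and labels on self:

# Hypothetical usage; ClassificationReport is an assumed class name
report = ClassificationReport(y_true=[0, 1, 1, 0],
                              y_pred=[0, 1, 0, 0],
                              target_names=['neg', 'pos'])
print(report.value)  # the formatted report string from classification_report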
Example #2
    def evaluateModel(self, test_x, test_y, model):
        from sklearn.metrics import classification_report as CR, roc_auc_score as ROC

        # Hard class predictions and positive-class probabilities
        predict_result = model.predict(test_x)
        predict_prob = model.predict_proba(test_x)[:, 1]

        crReport = CR(test_y, predict_result)
        acc = model.score(test_x, test_y)
        roc = ROC(test_y, predict_prob)
        # print("Accuracy %f" % model.score(test_x, test_y))
        return crReport, acc, roc
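A usage sketch under stated assumptions: the enclosing object is available as `evaluator` (a hypothetical name) and the model is a fitted binary classifier with predict_proba, since roc_auc_score here consumes positive-class probabilities:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
report, acc, roc = evaluator.evaluateModel(X_te, y_te, clf)  # evaluator is hypothetical
print(report)
print("accuracy:", acc, "ROC AUC:", roc)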
Example #3
def log_anomalyPRF_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    km = OCS(kernel='linear')
    if isinstance(dataset, str):  # Python 3: basestring no longer exists
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    pred[pred == -1] = 0  # map the one-class SVM's -1 (outlier) label to 0
    #  pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
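The aliases in this snippet are defined outside of it; the imports below are an assumption consistent with how they are used (a one-class SVM labels points -1/+1, hence the remap of -1 to 0 above):

# Presumed imports for the aliases above (an assumption, not shown in the snippet)
import numpy as np
from sklearn.svm import OneClassSVM as OCS
from sklearn.metrics import classification_report as CR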
Example #4
def log_anomalyPRF_AC(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # KMeans model
    #  km = KMeans(n_clusters=n_clusters, n_init=cluster_init, n_jobs=-1,
    #  random_state=SEED)
    km = AC(n_clusters=n_clusters)
    if isinstance(dataset, str):  # Python 3: basestring no longer exists
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
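The presumed imports for this variant, again an assumption based on usage (assign_labels is a project helper not shown here):

from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.metrics import classification_report as CR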
Example #5
# sns.kdeplot(iris[['sepal_width','sepal_length']][iris['species'] == "setosa"])
# plt.show()

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(iris.drop('species', axis=1),
                                       iris['species'],
                                       test_size=0.3,
                                       random_state=101)

# TRAIN MODEL
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(CR(y_test, pred), CM(y_test, pred))
print(model)

# GRID SEARCH - "THIS IS NOT NECESSARY, THE MODEL IS PERFECT"
param_grid = {
    'C': list(np.arange(0.1, 10, 0.1)),
    'gamma': [1, 0.1, 0.001, 0.0001]
}

grid = GSCV(SVC(), param_grid, verbose=3, n_jobs=4)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

gpred = grid.predict(X_test)
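The snippet stops after predicting with the tuned model; scoring it the same way as the baseline closes the comparison:

print(CR(y_test, gpred), CM(y_test, gpred))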
Example #6
# SCALING
scaler = SS()
scaler.fit(df.drop('TARGET CLASS',axis=1))
scaled = scaler.transform(df.drop('TARGET CLASS',axis=1))
df_scale = pd.DataFrame(scaled,columns=df.columns[:-1])
print(df_scale.head())

# SPLIT DATA INTO TRAINING AND TESTING
X_train,X_test,y_train,y_test = TTS(df_scale,df['TARGET CLASS'],test_size=0.3,random_state=101)

# KNN
model = KNC(n_neighbors=1)
model.fit(X_train,y_train)
pred = model.predict(X_test)

print(CR(y_test,pred))
print(CM(y_test,pred))

# CHOOSE K VALUE (ELBOW METHOD)
error_rate = []

for i in range(1,40):
	model = KNC(n_neighbors=i)
	model.fit(X_train,y_train)
	pred_i = model.predict(X_test)
	error_rate.append(np.mean(y_test != pred_i))

sns.lineplot(x=np.arange(1,40),y=np.array(error_rate))
plt.show()

# RERUN WITH NEW K
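The rerun itself did not survive in this snippet; a minimal sketch, assuming the best k is the one that minimizes the error rate computed above:

best_k = int(np.argmin(error_rate)) + 1  # +1 because the loop starts at k=1
model = KNC(n_neighbors=best_k)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred))
print(CM(y_test, pred))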
Example #7
# kernel, C, gamma are hyperparameters defined earlier in the source notebook
start = T()  # the timer start was cut from the snippet; T presumably aliases a timer such as time.time
model = SVC(kernel=kernel, C=C, gamma=gamma)
clf = model.fit(X_train, Y_train)
end = T()

pred = clf.predict(X_test)
mScore = clf.score(X_test, Y_test)
print(f'Score against Testing Data: {mScore * 100:.3f}%')
print(f'Model took {(end-start)*1000:.3f}ms to train')

# ### Generate Classification Report

# In[11]:

from sklearn.metrics import classification_report as CR

print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5

start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()
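The cross-validation scores are computed but never reported in the snippet; for instance:

print(f'{folds}-fold accuracy: {scores.mean() * 100:.3f}% (std {scores.std() * 100:.3f})')
print(f'Cross validation took {(end - start) * 1000:.3f}ms')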
Example #8
    def evaluate_model(self):
        """ Evaluate the model.
            Model is restored from models_path/model_name
            If model could not be loaded, exits.
            Data used for evaluation is the testing data.
            Once the whole dataset is forwarded, classification report and
            confusion matrix are computed
        """
        # Initialize the variables
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Load variables
        if self.SAVE:
            try:
                modelname = self.MODELS_PATH + self.name
                saver = tf.train.import_meta_graph(modelname + ".meta")
                saver.restore(sess, modelname)
            except Exception:
                print("Failed to restore model. Exiting.")
                exit()

        #### TESTING ####
        Y_true = []
        Y_pred = []
        testing_time = time.time()
        testing_acc = 0
        testing_loss = 0
        tophonetic = np.vectorize(lambda t: sorted(self.labels)[t])
        for batch_id in range(self.nb_batch_test):
            batch_time = time.time()

            # Get batch
            batch_X = self.X_test[batch_id]
            batch_Y = self.Y_test[batch_id]
            lengths = self.lengths_test[batch_id]

            # Get loss and accuracy
            loss, acc, predictions = sess.run(
                fetches=[self.loss, self.acc, self.predictions],
                feed_dict={
                    self.X_: batch_X,
                    self.Y_: batch_Y,
                    self.seq_lengths: lengths
                })

            # Update global variables
            testing_acc += acc
            testing_loss += loss

            for i in range(self.batchsize):
                true = batch_Y[i, :lengths[i]]
                true = np.argmax(true, axis=1)
                Y_true += list(true)
                pred = predictions[i, :lengths[i]]
                pred = np.argmax(pred, axis=1)
                Y_pred += list(pred)

        testing_time = time.time() - testing_time
        testing_acc /= self.nb_batch_test
        testing_loss /= self.nb_batch_test
        self.logger.write_log(
            "\n\nAccuracy:\t%.2f%%\nLoss:\t\t%s\nTime:\t\t%.2fs\n" %
            (100 * testing_acc, testing_loss, testing_time))

        Y_true = tophonetic(Y_true)
        Y_pred = tophonetic(Y_pred)

        # Classification Report (CR)
        self.logger.write_log(CR(Y_true, Y_pred))

        # Confusion Matrix (CM)
        mat = CM(Y_true, Y_pred)

        # header line (each row below also appends a trailing row-sum column)
        CONFMAT = "\t" + "\t".join([lbl[:5]
                                    for lbl in sorted(self.labels)]) + "\tsum\n"

        for i, phonetic in enumerate(sorted(self.labels)):
            CONFMAT += phonetic[:5] + "\t" + "\t".join(
                map(str, mat[i].tolist() + [np.sum(mat[i])])) + "\n\n"

        # footer line, sums
        CONFMAT += "\t" + "\t".join(map(str, np.sum(mat, axis=0).tolist()))
        self.logger.write_log(CONFMAT)
Example #9
# NAIVE BAYES
spam_detect_model = MNB().fit(messages_tfidf,df['label'])
pred4 = spam_detect_model.predict(tfidf4)[0]
print(pred4)

pred = spam_detect_model.predict(messages_tfidf)

rate = np.mean(pred == df['label'])
print("Rate: {}\n".format(rate))

# TRAIN AND TEST
msg_train,msg_test,label_train,label_test = TTS(df['msg'],df['label'],test_size=0.3,random_state=64)

# PIPELINE - A WAY TO STORE DATA PREPARATION PIPELINE
pipe = Pipeline([
	('bow',CV(analyzer=text_process)), # COUNT VECTORIZER
	('tfidf',TT()), # TFIDF TRANSFORMER
	('classifier',MNB())
])

# FIT AND PREDICT - BUSINESS AS USUAL
pipe.fit(msg_train,label_train)

pred_pipe = pipe.predict(msg_test)

rate = np.mean(pred_pipe == label_test)
print("Rate: {}\n".format(rate))

print(CR(label_test,pred_pipe))
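As elsewhere on this page, the aliases are defined outside the snippet; the imports below are an assumption consistent with usage (text_process is a user-defined tokenizer not shown here):

import numpy as np
from sklearn.model_selection import train_test_split as TTS
from sklearn.feature_extraction.text import CountVectorizer as CV, TfidfTransformer as TT
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report as CR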
Example #10
y = yelp_class['stars']

# CREATE COUNT VECTORIZER AND FIT TO X
cv = CV().fit(X)

# OVERWRITE X WITH TRANSFORM
X = cv.transform(X)

# TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# CREATE NAIVE BAYES OBJECT AND FIT
nb = MNB().fit(X_train, y_train)
pred = nb.predict(X_test)

print(CR(y_test, pred))

# PIPELINE
pipe = Pipeline([('CV', CV()), ('TFIDF', TT()), ("BAYES", MNB())])

# REDO SPLIT
X = yelp_class['text']
y = yelp_class['stars']
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# FIT THE PIPE
pipe.fit(X_train, y_train)

# PREDICT WITH PIPE
pred_pipe = pipe.predict(X_test)
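The pipeline's predictions are never scored in the snippet; the same report applies:

print(CR(y_test, pred_pipe))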
Example #11
feat_cols = [tf.feature_column.numeric_column('x', shape=[13])]

# MODEL
deep_model = estimator.DNNClassifier(
    hidden_units=[20, 20, 20, 20],
    feature_columns=feat_cols,
    n_classes=3,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))

# INPUT FUNCTION
input_func = estimator.inputs.numpy_input_fn(x={'x': scaled_x_train},
                                             y=y_train,
                                             shuffle=True,
                                             batch_size=50,
                                             num_epochs=100)

# TRAINING
deep_model.train(input_fn=input_func, steps=500)

# EVALUATION
input_func_eval = estimator.inputs.numpy_input_fn(x={'x': scaled_x_test},
                                                  shuffle=False)

preds = list(deep_model.predict(input_fn=input_func_eval))

predictions = [p['class_ids'][0] for p in preds]

print(CR(y_test, predictions))
print(CM(y_test, predictions))
Example #12
                                       test_size=0.3,
                                       random_state=64)

# SCALE
scaler = MMS()
scaled_x_train = scaler.fit_transform(X_train)
scaled_x_test = scaler.transform(X_test)

# KERAS LAYERS
dnn_keras_model = models.Sequential()

dnn_keras_model.add(layers.Dense(units=20, input_dim=13, activation='relu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=20, activation='elu'))
dnn_keras_model.add(layers.Dense(units=3, activation='softmax'))

# COMPILE MODEL
dnn_keras_model.compile(optimizer='Adadelta',
                        loss='sparse_categorical_crossentropy',
                        metrics=['accuracy'])

# TRAIN
dnn_keras_model.fit(scaled_x_train, y_train, epochs=50)

# PREDICTIONS
predictions = dnn_keras_model.predict_classes(scaled_x_test)

# EVALUATION
print(CR(y_true=y_test, y_pred=predictions))
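predict_classes was removed from Keras Sequential models in TensorFlow 2.6; on newer versions the equivalent is an argmax over the softmax outputs:

import numpy as np
predictions = np.argmax(dnn_keras_model.predict(scaled_x_test), axis=1)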
Example #13
# plt.show()

# DUMMY VARIABLES
final = pd.get_dummies(data=df, columns=['purpose'], drop_first=True)
print(final.head())

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(final.drop('not.fully.paid', axis=1),
                                       final['not.fully.paid'],
                                       test_size=0.3,
                                       random_state=101)

# DECISION TREE CLASSIFIER
tree = DTC()
tree.fit(X_train, y_train)

# PREDICT
tpred = tree.predict(X_test)

print(CR(y_test, tpred))
print(CM(y_test, tpred))

# RANDOM FOREST CLASSIFIER
forest = RFC(n_estimators=500)
forest.fit(X_train, y_train)
fpred = forest.predict(X_test)

print(CR(y_test, fpred))
print(CM(y_test, fpred))

# RANDOM FOREST PERFORMED BETTER OVERALL - BUT THE FALSE NEGATIVES INCREASED COMPARED TO A SINGLE TREE
Example #14
            c, hash_bucket_size=n)
        feat = tf.feature_column.embedding_column(cat, dimension=n)

    fcols.append(feat)

# INPUT FUNCTION
input_func_train = tf.estimator.inputs.pandas_input_fn(x=X_train,
                                                       y=y_train,
                                                       batch_size=1000,
                                                       num_epochs=100,
                                                       shuffle=True)

# MODEL
model = tf.estimator.DNNClassifier(hidden_units=[10, 10, 10, 10],
                                   feature_columns=fcols)

# TRAINING
model.train(input_fn=input_func_train, steps=None)

# EVALUATION
input_func_eval = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                      shuffle=False,
                                                      num_epochs=1)
preds = model.predict(input_fn=input_func_eval)
lpreds = list(preds)
cpreds = [pred['class_ids'][0] for pred in lpreds]  # lpreds is already a list

print(CM(y_true=y_test, y_pred=cpreds))

print(CR(y_true=y_test, y_pred=cpreds))
Example #15
train = pd.concat([train, sex, embark], axis=1)
train = pd.concat([train, pclass],
                  axis=1)  # IN SEPARATE LINE FOR EASY ON/OFF SWITCH

# DROP THE NO LONGER NEEDED COLUMNS
train.drop(['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId'],
           inplace=True,
           axis=1)
train.drop(['Pclass'], inplace=True,
           axis=1)  # IN SEPARATE LINE FOR EASY ON/OFF SWITCH

# PART 3 - TRAINING AND PREDICTION
X = train.drop('Survived', axis=1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=101)

lg = LGR()
lg.fit(X_train, y_train)

# EVALUATE THE RESULTS
pred = pd.DataFrame({'P': lg.predict(X_test), 'R': y_test})

print(pred.corr())
print(CR(pred['R'], pred['P']))
print(CM(pred['R'], pred['P']))
Example #16
# sns.distplot(df['Outstate'][df['Private'] == 'Yes'],bins=30,kde=False)
# plt.show()

# sns.distplot(df['Grad.Rate'][df['Private'] == 'No'],bins=30,kde=False)
# sns.distplot(df['Grad.Rate'][df['Private'] == 'Yes'],bins=30,kde=False)
# plt.show()

# INFO
print("Name of private school with grad rate higher than 100: {}\n".format(
    df.loc[df['Grad.Rate'] > 100].index.values[0]))

df.loc[df['Grad.Rate'] > 100, 'Grad.Rate'] = 100

# sns.distplot(df['Grad.Rate'][df['Private'] == 'No'],bins=30,kde=False)
# sns.distplot(df['Grad.Rate'][df['Private'] == 'Yes'],bins=30,kde=False)
# plt.show()

# K MEANS
# FIT
model = KM(n_clusters=2)
model.fit(df.drop('Private', axis=1))

# CENTER VECTORS
print("Fitted models center vectors:\n{}\n".format(model.cluster_centers_))

# CREATE NUMERICAL CLUSTER COLUMN
df['Cluster'] = df['Private'].map({'Yes': 1, 'No': 0})

print(CR(df['Cluster'], model.labels_))
print(CM(df['Cluster'], model.labels_))
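K-means assigns cluster ids arbitrarily, so 0/1 may be flipped relative to the Private mapping and make the report look far worse than the clustering actually is; a minimal alignment check (assuming numpy is imported as np):

labels = model.labels_
if np.mean(labels == df['Cluster']) < 0.5:  # below-chance agreement means the ids are flipped
    labels = 1 - labels
print(CR(df['Cluster'], labels))
print(CM(df['Cluster'], labels))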