def main(): data = scipy.io.loadmat('spamData.mat') xtrain = preprocess.binarize(data['Xtrain']) # xtrain = data['Xtrain'] ytrain = data['ytrain'] xtest = preprocess.binarize(data['Xtest']) ytest = data['ytest'] threshold = 0.0001 reg_learn_pairs = [(0.01, 0.0001)] for regularization_weight, learning_rate in reg_learn_pairs: print 'Regularization_weight %s learning_rate %s' % (regularization_weight, learning_rate) xplot = [] yplot = [] beta = batch(xtrain, ytrain, threshold, regularization_weight, learning_rate, xplot, yplot) train = test_error(xtrain, ytrain, beta) test = test_error(xtest, ytest, beta) with open('res.txt', 'a') as f: f.write('%s\t%s\t%s\t%s\n' % (regularization_weight, learning_rate, train, test)) f.flush() # plot xplot vs yplot pyplot.plot(xplot, yplot) pyplot.title('Training Loss vs Number of Iterations.\nregularization_weight %s learning_rate %s' % ( regularization_weight, learning_rate)) pyplot.xlabel("Number of Iterations") pyplot.ylabel("Negative Log Likelihood") pyplot.show()
def generate_auc_roc(X_test, y_test):
    """Evaluate the persisted RNN classifier on the test split and plot
    per-class ROC curves from its predicted probabilities."""
    model_loaded = load_model(model_file_h5)
    generate_classification_report(model_loaded, X_test, y_test)

    # NOTE(review): predict_classes/predict_proba are legacy Keras Sequential
    # APIs (removed in newer TensorFlow releases) -- confirm the pinned version.
    predicted_classes = model_loaded.predict_classes(X_test)
    print("Predicted Classes")
    print(predicted_classes)

    score, acc = model_loaded.evaluate(X_test, y_test, batch_size=BATCH_SIZE)
    print(score)
    print(acc)

    y_score = model_loaded.predict_proba(X_test)
    print("Predicted Probabilities")
    print(y_score)

    # Binarize the labels (presumably one-hot, matching y_score's columns --
    # verify against preprocess.binarize) before computing the ROC curves.
    bin_output = preprocess.binarize(y_test)
    multiclassROC.calculate_roc(bin_output, y_score, "RnnClassifierModel", CLASSES)
# Telco-churn preprocessing fragment (whitespace-mangled; the trailing
# read_csv call is truncated mid-argument-list, and the closing `else:`
# pairs with an `if` that lies outside this chunk -- left byte-identical
# rather than guessing at the missing pieces).
# Flow: load the raw CSV, coerce columns 5/18/19 to numeric (19 with
# errors='coerce'), fill missing labels/attributes via tp.*, label-encode
# the string columns, binarize the three continuous columns, drop column 0,
# renumber rows/columns, then persist Preprocessed_Telco.csv and report
# elapsed minutes. The `else:` branch presumably reloads the cached CSV.
# NOTE(review): pandas treats na_values='\s+' as a literal token, not a
# regex -- confirm whitespace sentinels are actually caught.
df_telco = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', delimiter=',', header=None, na_values='\s+', skiprows=1) df_telco.iloc[:, 5] = pd.to_numeric(df_telco.iloc[:, 5]) df_telco.iloc[:, 18] = pd.to_numeric(df_telco.iloc[:, 18]) df_telco.iloc[:, 19] = pd.to_numeric(df_telco.iloc[:, 19], errors='coerce') df_telco = tp.process_missing_label(df_telco) df_telco = tp.process_missing_attribute( df_telco, [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]) print('Missing values handled') df_telco = tp.process_string_to_int( df_telco, [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20]) print('String converted to integer labels') df_telco, binarizers_telco, binarizers_telco_columns = tp.binarize( df_telco, [5, 18, 19]) print('Continuous values binarized') df_telco = df_telco.reset_index(drop=True) df_telco = df_telco.drop([0], axis=1) df_telco = df_telco.T.reset_index(drop=True).T df_telco.to_csv('Preprocessed_Telco.csv', sep=',') print('Preprocessing Finished...') end_preprocessing = time.time() print("Preprocessing took " + str(float(end_preprocessing - start_preprocessing) / 60) + " min") print() print() else: df_telco = pd.read_csv('Preprocessed_Telco.csv', delimiter=',',
import preprocess as ap
import decision_tree
import adaboost

# Toggle: 1 = preprocess the raw adult CSVs, 0 = reuse the cached outputs.
prepro = 0
if prepro == 1:
    start_preprocessing = time.time()
    print('Preprocessing started...')

    # --- training split --------------------------------------------------
    df_adult_train = pd.read_csv('adult_data.csv', delimiter=',',
                                 header=None, na_values=' ?')
    df_adult_train = ap.process_missing_label(df_adult_train)
    df_adult_train = ap.process_missing_attribute(
        df_adult_train, [1, 3, 5, 6, 7, 8, 9, 13])
    print('Missing values handled')
    df_adult_train = ap.process_string_to_int(
        df_adult_train, [1, 3, 5, 6, 7, 8, 9, 13, 14])
    print('String converted to integer labels')
    df_adult_train, binarizers_adult, binarizers_adult_columns = ap.binarize(
        df_adult_train, [0, 2, 4, 10, 11, 12])
    print('Continuous values binarized')
    df_adult_train = df_adult_train.reset_index(drop=True)
    df_adult_train.to_csv('Preprocessed_Adult_Train.csv', sep=',')
    end_preprocessing = time.time()
    print("Preprocessing training data took " +
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")

    # --- test split (reuses the binarizers fitted on the training data) ---
    start_preprocessing = time.time()
    df_adult_test = pd.read_csv('adult_test.csv', delimiter=',',
                                header=None, na_values=' ?')
    df_adult_test = ap.process_missing_label(df_adult_test)
    df_adult_test = ap.process_missing_attribute(
        df_adult_test, [1, 3, 5, 6, 7, 8, 9, 13])
    df_adult_test = ap.process_string_to_int(
        df_adult_test, [1, 3, 5, 6, 7, 8, 9, 13, 14])
    df_adult_test = ap.binarize_test(
        binarizers_adult, binarizers_adult_columns, df_adult_test)
    df_adult_test = df_adult_test.reset_index(drop=True)
    df_adult_test.to_csv('Preprocessed_Adult_Test.csv', sep=',')
import sys import cv2 import preprocess import segmentbeautify # File paths filepath = "public/img/" file = sys.argv[1] print "\n This is the filename: " + filepath + file + "\n" # Read image img = cv2.imread(filepath + file, 0) print img # Preprocess the image img = preprocess.binarize(img) img = preprocess.removeSaltnPepperNoise(img) # Segment words and lines words_mapping, word_spacing, line_spacing = segmentbeautify.extractLines( img, file) print word_spacing, line_spacing # Beautify the text beautified = segmentbeautify.beautify(img, words_mapping, word_spacing, line_spacing, file, filepath) print "beautified", beautified
if label_split[0] != "a01-117-05-02": labels.append([label_split[0], label_split[-1]]) print labels i = 0 filepath = "Datasets/words/" target = open('sequence1.csv', "w") for label in labels: if (113624 <= i): print label[1] navi = label[0].split("-") word = cv2.imread( filepath + navi[0] + "/" + navi[0] + "-" + navi[1] + "/" + label[0] + ".png", 0) cv2.imshow("original", word) word = preprocess.binarize(word) cv2.imshow("binarized", word) print word.shape word = preprocess.resizeImage(word) print word.shape word = preprocess.sharpen(word) word = cv2.bitwise_not(word) word = word / 255 for row in range(word.shape[0]): for col in range(word.shape[1]): target.write(str(word[row, col]) + ",") target.write(label[1]) target.write("\n") i += 1 target.close()
# Riboswitch pipeline: load the train/test splits, standardize the features,
# then build models and generate ROC curves.
Path = 'processed_datasets/final_32train.csv'
# Load the training split, then cast it to float.
Data_train, Output_train = preprocess.Load_Data_baseModel(
    Path, Data_train, Output_train)
Data_train, Output_train = preprocess.Convert_to_Float(
    Data_train, Output_train)

Path = 'processed_datasets/final_32test.csv'
# Load the test split, then cast it to float.
Data_test, Output_test = preprocess.Load_Data_baseModel(
    Path, Data_test, Output_test)
Data_test, Output_test = preprocess.Convert_to_Float(
    Data_test, Output_test)

# Binarized labels (presumably one-hot -- verify against preprocess.binarize)
# are what the ROC computation consumes.
bin_output = preprocess.binarize(Output_test)

# Standardize with statistics fitted on the training data only.
scaler = StandardScaler()
scaler.fit(Data_train)
Data_train = scaler.transform(Data_train)
Data_test = scaler.transform(Data_test)

construct_models(Data_train, Data_test, Output_train, Output_test, bin_output)
total_class = preprocess.get_totalclass('processed_datasets/final_32test.csv')
generate_roc(Data_train, Data_test, Output_train, Output_test,
             bin_output, total_class)
import decision_tree
import adaboost

# Toggle: 1 = preprocess the raw creditcard.csv, 0 = load the cached CSV.
prepro = 0
if prepro == 1:
    start_preprocessing = time.time()
    print('Preprocessing started...')
    # NOTE(review): pandas treats na_values='\s+' as a literal token, not a
    # regex -- confirm whitespace sentinels are actually matched.
    df_credit_temp = pd.read_csv('creditcard.csv', delimiter=',',
                                 header=None, na_values='\s+', skiprows=1)
    # Down-sample the negative class to 20k rows while keeping every positive
    # (fraud) row; the label lives in the last column.
    label_col = df_credit_temp.shape[1] - 1
    df_credit_pos = df_credit_temp.loc[df_credit_temp.iloc[:, label_col] == 1]
    df_credit_neg = df_credit_temp.loc[df_credit_temp.iloc[:, label_col] == 0]
    df_credit_neg = df_credit_neg.sample(n=20000, replace=False)
    df_credit = pd.concat([df_credit_neg, df_credit_pos], axis=0)

    df_credit = cp.process_missing_label(df_credit)
    df_credit = cp.process_missing_attribute(
        df_credit, range(0, df_credit.shape[1] - 1))
    print('Missing values handled')
    df_credit, binarizers_credit, binarizers_credit_columns = cp.binarize(
        df_credit, range(0, df_credit.shape[1] - 1))
    print('Continuous values binarized')

    df_credit = df_credit.reset_index(drop=True)
    df_credit.to_csv('Preprocessed_Credit.csv', sep=',')
    print('Preprocessing Finished...')
    end_preprocessing = time.time()
    print("Preprocessing took " +
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_credit = pd.read_csv('Preprocessed_Credit.csv', delimiter=',',
                            header=None)

# 80/20 train/test split, then start the training clock.
df_credit_train, df_credit_test = model_selection.train_test_split(
    df_credit, test_size=0.20)
start_training = time.time()
print 'Logistic Regression with Binarized Features:' # print 'refreshing dataset...' # parse train and test text files train_x = get_features('spam_train.txt') train_y = get_classification('spam_train.txt') test_x = get_features('spam_test.txt') test_y = get_classification('spam_test.txt') # print 'binarizing features...' # standardize features train_x = binarize(train_x) test_x = binarize(test_x) # add 1 y-intercept column train_x = add_ones(train_x) test_x = add_ones(test_x) # print 'calculating weights...' # find W for logistic regression with gradient descent w = logistic_regression(train_x, train_y) # print 'predicting...' # make predictions train_predictions = predict_y(train_x, w)