import sys

import numpy as np

import func


def main():
    # simple arguments check
    if len(sys.argv) != 6:
        print("wrong number of arguments")
        sys.exit(0)

    # parse args
    learning_rate = sys.argv[1]
    num_hidden_units = sys.argv[2]
    num_epoches = sys.argv[3]
    training_set_path = sys.argv[4]
    test_set_path = sys.argv[5]

    # load data
    training_set, test_set = func.load_data(training_set_path, test_set_path)

    # find indices of numeric and categorical features
    num_feature_index_list, cate_feature_index_list = func.classify_features(
        training_set, test_set)

    # get training and test set matrices
    training_set_num_feature_matrix, training_set_cate_feature_matrix, training_set_label_matrix = func.get_matrices(
        training_set, num_feature_index_list, cate_feature_index_list)
    test_set_num_feature_matrix, test_set_cate_feature_matrix, test_set_label_matrix = func.get_matrices(
        test_set, num_feature_index_list, cate_feature_index_list)

    # total number of training and test instances
    total_num_training = training_set_num_feature_matrix.shape[0]
    total_num_test = test_set_num_feature_matrix.shape[0]

    # fill up the feature status list: [num, cate]
    feature_status = [0, 0]
    if training_set_num_feature_matrix.size != 0:
        feature_status[0] = 1
    if training_set_cate_feature_matrix.size != 0:
        feature_status[1] = 1

    # standardize numeric features
    normed_training_set_num_feature_matrix = np.zeros((total_num_training, 0))
    normed_test_set_num_feature_matrix = np.zeros((total_num_test, 0))
    if feature_status[0] == 1:
        normed_training_set_num_feature_matrix, normed_test_set_num_feature_matrix = func.standardize(
            training_set_num_feature_matrix, test_set_num_feature_matrix)

    # combine the feature matrices and restore the original column order
    combined_index_list = num_feature_index_list + cate_feature_index_list
    sorted_index_list = np.argsort(combined_index_list)
    combined_training_set_feature_matrix = np.hstack(
        (normed_training_set_num_feature_matrix,
         training_set_cate_feature_matrix))
    combined_test_set_feature_matrix = np.hstack(
        (normed_test_set_num_feature_matrix, test_set_cate_feature_matrix))
    ordered_training_set_feature_matrix = combined_training_set_feature_matrix[:, sorted_index_list]
    ordered_test_set_feature_matrix = combined_test_set_feature_matrix[:, sorted_index_list]

    # one-hot encode categorical features
    num_to_skip = 0
    for idx, original_idx in enumerate(cate_feature_index_list):
        variant_list = training_set["metadata"]["features"][:-1][original_idx][1]
        cur_training_col = training_set_cate_feature_matrix[:, idx]
        cur_test_col = test_set_cate_feature_matrix[:, idx]
        # map each category value to its index in the metadata list
        for jdx, variant in enumerate(variant_list):
            cur_training_col[cur_training_col == variant] = jdx
            cur_test_col[cur_test_col == variant] = jdx
        cur_training_col = cur_training_col.astype(int)
        cur_test_col = cur_test_col.astype(int)
        # expand the integer-coded column into one indicator column per value
        expanded_training_cols = np.zeros((total_num_training, len(variant_list)))
        expanded_training_cols[np.arange(total_num_training),
                               cur_training_col.flatten()] = 1
        expanded_test_cols = np.zeros((total_num_test, len(variant_list)))
        expanded_test_cols[np.arange(total_num_test), cur_test_col.flatten()] = 1
        # replace the original column with its indicator columns
        ordered_training_set_feature_matrix = np.delete(
            ordered_training_set_feature_matrix, original_idx + num_to_skip,
            axis=1)
        ordered_training_set_feature_matrix = np.insert(
            ordered_training_set_feature_matrix, [original_idx + num_to_skip],
            expanded_training_cols, axis=1)
        ordered_test_set_feature_matrix = np.delete(
            ordered_test_set_feature_matrix, original_idx + num_to_skip, axis=1)
        ordered_test_set_feature_matrix = np.insert(
            ordered_test_set_feature_matrix, [original_idx + num_to_skip],
            expanded_test_cols, axis=1)
        num_to_skip += (len(variant_list) - 1)

    # prepend a bias entry
    ordered_training_set_feature_matrix = np.insert(
        ordered_training_set_feature_matrix, 0, 1, axis=1).astype(float)
    ordered_test_set_feature_matrix = np.insert(
        ordered_test_set_feature_matrix, 0, 1, axis=1).astype(float)

    # initialize weights
    w_i_h = np.random.uniform(
        low=-0.01, high=0.01,
        size=(int(num_hidden_units), ordered_training_set_feature_matrix.shape[1]))
    w_h_o = np.random.uniform(
        low=-0.01, high=0.01, size=(1, int(num_hidden_units) + 1))

    # train the network with SGD
    class_list = training_set["metadata"]["features"][-1][1]
    for epoch in range(int(num_epoches)):
        num_corr = 0
        num_incorr = 0
        sum_E = 0
        for idx in range(total_num_training):
            # forward pass; each index is one hidden unit
            net_i_h = np.dot(ordered_training_set_feature_matrix[idx, :],
                             np.transpose(w_i_h))
            h = func.sigmoid(net_i_h)
            # add a bias entry to the hidden layer
            h_o = np.insert(h, 0, 1).astype(float)
            net_h_o = np.dot(w_h_o, h_o)
            o = func.sigmoid(net_h_o)
            y = training_set_label_matrix[idx, 0]
            y = 0 if class_list.index(y) == 0 else 1
            # cross-entropy loss
            E = -y * np.log(o) - (1 - y) * np.log(1 - o)
            sum_E += E
            # backward pass and weight updates
            d_o = y - o
            d_h = h_o * (1 - h_o) * d_o * w_h_o
            update_h_o = float(learning_rate) * d_o * h_o
            update_i_h = float(learning_rate) * d_h[:, 1] * ordered_training_set_feature_matrix[idx, :]
            for curcol in range(2, d_h.shape[1]):
                temp = float(learning_rate) * d_h[:, curcol] * ordered_training_set_feature_matrix[idx, :]
                update_i_h = np.vstack((update_i_h, temp))
            w_i_h += update_i_h
            w_h_o += update_h_o
            pred = 1 if o > 0.5 else 0
            if pred == y:
                num_corr += 1
            else:
                num_incorr += 1
        print(str(epoch + 1) + " {:.12f}".format(sum_E[0]) + " " +
              str(num_corr) + " " + str(num_incorr))

    # prediction on the test set
    num_corr = 0
    num_incorr = 0
    tp = 0  # true positives
    pp = 0  # predicted positives
    for idx in range(total_num_test):
        net_i_h = np.dot(ordered_test_set_feature_matrix[idx, :],
                         np.transpose(w_i_h))
        h = func.sigmoid(net_i_h)
        h_o = np.insert(h, 0, 1).astype(float)
        net_h_o = np.dot(w_h_o, h_o)
        o = func.sigmoid(net_h_o)
        y = test_set_label_matrix[idx, 0]
        y = 0 if class_list.index(y) == 0 else 1
        if o > 0.5:
            pred = 1
            pp += 1
        else:
            pred = 0
        if pred == y:
            num_corr += 1
            if pred == 1:
                tp += 1
        else:
            num_incorr += 1
        print("{:.12f} ".format(o[0]) + str(pred) + " " + str(y))
    print(str(num_corr) + " " + str(num_incorr))

    # F1 on the test set
    actual_pos = np.sum(test_set_label_matrix == class_list[1])
    recall = tp / actual_pos
    precision = tp / pp
    F1 = 2 * precision * recall / (precision + recall)
    print("{:.12f}".format(F1))


if __name__ == "__main__":
    main()
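# Note: the vstack loop in main() assembles the input-to-hidden update one
# hidden unit at a time. A minimal equivalent sketch as a standalone helper
# (not part of the original script), assuming d_h keeps the
# (1, num_hidden_units + 1) shape produced above, with column 0 holding the
# bias delta, and x being one input row:
def input_to_hidden_update(learning_rate, d_h, x):
    # one row per hidden unit, identical to the result of the vstack loop
    return float(learning_rate) * np.outer(d_h[0, 1:], x)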
import sys

import matplotlib.pyplot as plt
import numpy as np

import func


def main():
    # simple arguments check
    if len(sys.argv) != 5:
        print("wrong number of arguments")
        sys.exit(0)

    # parse args
    learning_rate = sys.argv[1]
    max_epoch = sys.argv[2]
    training_set_path = sys.argv[3]
    test_set_path = sys.argv[4]

    # load data
    training_set, test_set = func.load_data(training_set_path, test_set_path)

    # find indices of numeric and categorical features
    num_feature_index_list, cate_feature_index_list = func.classify_features(
        training_set, test_set)

    # get training and test set matrices
    training_set_num_feature_matrix, training_set_cate_feature_matrix, training_set_label_matrix = func.get_matrices(
        training_set, num_feature_index_list, cate_feature_index_list)
    test_set_num_feature_matrix, test_set_cate_feature_matrix, test_set_label_matrix = func.get_matrices(
        test_set, num_feature_index_list, cate_feature_index_list)

    # total number of training and test instances
    total_num_training = training_set_num_feature_matrix.shape[0]
    total_num_test = test_set_num_feature_matrix.shape[0]

    # fill up the feature status list: [num, cate]
    feature_status = [0, 0]
    if training_set_num_feature_matrix.size != 0:
        feature_status[0] = 1
    if training_set_cate_feature_matrix.size != 0:
        feature_status[1] = 1

    # standardize numeric features
    normed_training_set_num_feature_matrix = np.zeros((total_num_training, 0))
    normed_test_set_num_feature_matrix = np.zeros((total_num_test, 0))
    if feature_status[0] == 1:
        normed_training_set_num_feature_matrix, normed_test_set_num_feature_matrix = func.standardize(
            training_set_num_feature_matrix, test_set_num_feature_matrix)

    # combine the feature matrices and restore the original column order
    combined_index_list = num_feature_index_list + cate_feature_index_list
    sorted_index_list = np.argsort(combined_index_list)
    combined_training_set_feature_matrix = np.hstack(
        (normed_training_set_num_feature_matrix,
         training_set_cate_feature_matrix))
    combined_test_set_feature_matrix = np.hstack(
        (normed_test_set_num_feature_matrix, test_set_cate_feature_matrix))
    ordered_training_set_feature_matrix = combined_training_set_feature_matrix[:, sorted_index_list]
    ordered_test_set_feature_matrix = combined_test_set_feature_matrix[:, sorted_index_list]

    # one-hot encode categorical features
    num_to_skip = 0
    for idx, original_idx in enumerate(cate_feature_index_list):
        variant_list = training_set["metadata"]["features"][:-1][original_idx][1]
        cur_training_col = training_set_cate_feature_matrix[:, idx]
        cur_test_col = test_set_cate_feature_matrix[:, idx]
        # map each category value to its index in the metadata list
        for jdx, variant in enumerate(variant_list):
            cur_training_col[cur_training_col == variant] = jdx
            cur_test_col[cur_test_col == variant] = jdx
        cur_training_col = cur_training_col.astype(int)
        cur_test_col = cur_test_col.astype(int)
        # expand the integer-coded column into one indicator column per value
        expanded_training_cols = np.zeros((total_num_training, len(variant_list)))
        expanded_training_cols[np.arange(total_num_training),
                               cur_training_col.flatten()] = 1
        expanded_test_cols = np.zeros((total_num_test, len(variant_list)))
        expanded_test_cols[np.arange(total_num_test), cur_test_col.flatten()] = 1
        # replace the original column with its indicator columns
        ordered_training_set_feature_matrix = np.delete(
            ordered_training_set_feature_matrix, original_idx + num_to_skip,
            axis=1)
        ordered_training_set_feature_matrix = np.insert(
            ordered_training_set_feature_matrix, [original_idx + num_to_skip],
            expanded_training_cols, axis=1)
        ordered_test_set_feature_matrix = np.delete(
            ordered_test_set_feature_matrix, original_idx + num_to_skip, axis=1)
        ordered_test_set_feature_matrix = np.insert(
            ordered_test_set_feature_matrix, [original_idx + num_to_skip],
            expanded_test_cols, axis=1)
        num_to_skip += (len(variant_list) - 1)

    # prepend a bias entry
    ordered_training_set_feature_matrix = np.insert(
        ordered_training_set_feature_matrix, 0, 1, axis=1).astype(float)
    ordered_test_set_feature_matrix = np.insert(
        ordered_test_set_feature_matrix, 0, 1, axis=1).astype(float)

    # SGD: for each epoch budget from 1 to max_epoch, retrain from scratch
    # and record the resulting F1 scores
    F1_training = []
    F1_test = []
    class_list = training_set["metadata"]["features"][-1][1]
    for num_epoches in range(1, int(max_epoch) + 1):
        # initialize weights
        w = np.random.uniform(
            low=-0.01, high=0.01,
            size=(1, ordered_training_set_feature_matrix.shape[1]))
        for epoch in range(num_epoches):
            for idx in range(total_num_training):
                net = np.dot(w, ordered_training_set_feature_matrix[idx, :])
                o = func.sigmoid(net)
                y = training_set_label_matrix[idx, 0]
                y = 0 if class_list.index(y) == 0 else 1
                E = -y * np.log(o) - (1 - y) * np.log(1 - o)
                # gradient of the cross-entropy loss w.r.t. w
                grad = (o - y) * ordered_training_set_feature_matrix[idx, :]
                update = -float(learning_rate) * grad
                w += update

        # prediction on the test set
        num_corr = 0
        num_incorr = 0
        tp = 0  # true positives
        pp = 0  # predicted positives
        for idx in range(total_num_test):
            net = np.dot(w, ordered_test_set_feature_matrix[idx, :])
            o = func.sigmoid(net)
            y = test_set_label_matrix[idx, 0]
            y = 0 if class_list.index(y) == 0 else 1
            if o > 0.5:
                pred = 1
                pp += 1
            else:
                pred = 0
            if pred == y:
                num_corr += 1
                if pred == 1:
                    tp += 1
            else:
                num_incorr += 1
        actual_pos = np.sum(test_set_label_matrix == class_list[1])
        recall = tp / actual_pos
        precision = tp / pp
        F1 = 2 * precision * recall / (precision + recall)
        F1_test.append(F1)

        # prediction on the training set
        num_corr = 0
        num_incorr = 0
        tp = 0
        pp = 0
        for idx in range(total_num_training):
            net = np.dot(w, ordered_training_set_feature_matrix[idx, :])
            o = func.sigmoid(net)
            y = training_set_label_matrix[idx, 0]
            y = 0 if class_list.index(y) == 0 else 1
            if o > 0.5:
                pred = 1
                pp += 1
            else:
                pred = 0
            if pred == y:
                num_corr += 1
                if pred == 1:
                    tp += 1
            else:
                num_incorr += 1
        actual_pos = np.sum(training_set_label_matrix == class_list[1])
        recall = tp / actual_pos
        precision = tp / pp
        F1 = 2 * precision * recall / (precision + recall)
        F1_training.append(F1)

    plt.plot(range(1, int(max_epoch) + 1), F1_training, label="on training set")
    plt.plot(range(1, int(max_epoch) + 1), F1_test, label="on test set")
    plt.title("F1 vs #epochs on heart dataset, learning rate = " + learning_rate)
    plt.ylabel("F1")
    plt.xlabel("#epochs")
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()
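# Both scripts above import a small func module that is not shown in this
# section. The sketch below is not the actual module, just a minimal guess at
# its two numeric helpers, assuming plain NumPy feature matrices; load_data,
# classify_features and get_matrices parse the dataset format and are omitted.
import numpy as np


def sigmoid(x):
    # elementwise logistic function, as used for the hidden and output
    # activations above
    return 1.0 / (1.0 + np.exp(-x))


def standardize(train, test):
    # scale both sets with the training set's column means and stds
    mean = train.mean(axis=0)
    std = train.std(axis=0)
    std[std == 0] = 1.0  # guard against constant columns
    return (train - mean) / std, (test - mean) / std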
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

from func import load_data

# outline
# 1. EDA: Exploratory Data Analysis with Visualization
# 2. Feature Extraction
# 3. Data Modeling
# 4. Model Evaluation

# 1. EDA: Exploratory Data Analysis with Visualization
# 1.1 load data
train, test, combine = load_data()

# 1.2 data structure
print(train.shape)  # (891, 12)
print(train.describe())  # statistics on numerical variables
print(train.describe(include=['O']))  # categorical variables
train.info()  # check data types and missing values
print(train.isnull().sum())
print(train['Embarked'].value_counts(normalize=True))

# 1.3 relationship between features and the target variable
# target variable distribution
survived = train['Survived'][train['Survived'] == 1]
not_survived = train['Survived'][train['Survived'] == 0]
print("Survived: %i (%.1f%%)" % (len(survived),
                                 float(len(survived)) / len(train) * 100.0))
print("Not Survived: %i (%.1f%%)" % (len(not_survived),
                                     float(len(not_survived)) / len(train) * 100.0))
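# seaborn is imported above but not yet used at this point; a typical next
# plot for section 1.3, assuming the standard Titanic columns 'Sex', 'Pclass'
# and 'Survived':
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.countplot(x='Sex', hue='Survived', data=train, ax=axes[0])
sns.countplot(x='Pclass', hue='Survived', data=train, ax=axes[1])
plt.show()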
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.utils import np_utils

from func import load_data, chord2index

(x_train, y_train) = load_data('../data/train_note.csv',
                               '../data/train_chord.csv')

# data pre-processing: one-hot encode the 24 chord classes
y_train = np_utils.to_categorical(y_train, num_classes=24)
x_test = x_train
y_test = y_train

# vocabulary size for the embedding layer
max_features = 1024

model = Sequential()
model.add(Embedding(max_features, output_dim=13))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(24, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=16, epochs=10)
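# x_test/y_test are defined above (as aliases of the training data) but the
# model is never scored; a minimal evaluation step using the same Keras API:
score = model.evaluate(x_test, y_test, batch_size=16)
print('loss: %.4f, accuracy: %.4f' % (score[0], score[1]))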
        default=256)
    parser.add_argument("--epochs", type=int, help="Epochs", default=7)
    parser.add_argument('--gpu', action="store_true", help='Enable GPU')
    arguments = parser.parse_args()
    return arguments


if __name__ == "__main__":
    arguments = arg_parser()

    data_dir = arguments.data_dir
    checkpoint_dir = arguments.save_dir
    architecture = arguments.arch
    learning_rate = arguments.learning_rate
    hidden_units = arguments.hidden_units
    epochs = arguments.epochs
    gpu = arguments.gpu

    device = torch.device("cuda" if gpu else "cpu")

    dataloaders, validloaders, testloader, image_datasets = load_data(data_dir)
    model, criterion, optimizer = build_model(architecture, learning_rate,
                                              hidden_units, epochs, device,
                                              image_datasets.class_to_idx)
    train_model(epochs, dataloaders, validloaders, model, criterion,
                optimizer, device)
    calculate_acc(model, testloader, device)
    save_checkpoint(architecture, model, image_datasets, optimizer, epochs,
                    checkpoint_dir)
tensor_out = Flatten()(tensor_out)
tensor_out = Dropout(0.5)(tensor_out)
# one 10-way softmax head per digit of the 4-digit label
tensor_out = [Dense(10, name='digit1', activation='softmax')(tensor_out),
              Dense(10, name='digit2', activation='softmax')(tensor_out),
              Dense(10, name='digit3', activation='softmax')(tensor_out),
              Dense(10, name='digit4', activation='softmax')(tensor_out)]

model = Model(inputs=tensor_in, outputs=tensor_out)
model.compile(loss='categorical_crossentropy',
              optimizer='Adamax',
              metrics=['accuracy'])
model.summary()

x_train, y_train, x_val, y_val = load_data('label.txt', split_threshold=800)
'''
# earlier manual loading of label.txt, replaced by load_data above
data = pd.read_csv('label.txt', header=None)
di = dict()
for index, row in data.iterrows():
    # zero-pad labels shorter than 4 digits
    if len(str(row[1])) < 4:
        row[1] = ('0000' + str(row[1]))[-4:]
    di[row[0]] = str(row[1])
# print(di)

split_th = 800
x_train = []
y_train = []
yListData = [[] for _ in range(4)]
yListVal = [[] for _ in range(4)]
for data_idx, key in enumerate(di.keys()):