def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers"""
    # Parse the essays and the function-word vocabulary, then turn the
    # attributed essays into a count-feature matrix X.
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)
    X = load_features(essays, function_words)

    # Map each author string to an integer class id (0/1) and build the
    # gold-standard label vector y, used for both training and evaluation.
    labels_map = labels_to_key(authors)
    print(labels_map)
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")

    # split_data shuffles internally, so no separate shuffle step is needed;
    # hold out 25% of the rows for evaluation.
    train, test = split_data(X, y, 0.25)
    train_X, train_y = train[0], train[1]
    test_X, test_y = test[0], test[1]

    # Multinomial NB on the raw count features.
    multinomial_model = MultinomialNB()
    multinomial_model.fit(train_X, train_y)
    accuracy = calculate_accuracy(multinomial_model.predict(test_X), test_y)
    print(f" the accuracy for multinomial NB model is {accuracy}")

    # Bernoulli NB on the same features (sklearn binarizes at 0 by default).
    bernoulli_model = BernoulliNB()
    bernoulli_model.fit(train_X, train_y)
    accuracy = calculate_accuracy(bernoulli_model.predict(test_X), test_y)
    print(f" the accuracy for Bernoulli NB model is {accuracy}")

    # Zero-rule baseline: always predict the most frequent training class.
    most_frequent_class = find_zero_rule_class(train_y)
    print(f"the most frequent class is {most_frequent_class}")
    baseline_predictions = apply_zero_rule(test_X, most_frequent_class)
    test_accuracy = calculate_accuracy(baseline_predictions, test_y)
    print(f" the accuracy for the baseline is {test_accuracy}")
def _format_report(acc, cm, cm_norm, stats):
    """Render the evaluation results as a list of text lines.

    Shared by the stdout and optional file outputs so the two cannot drift.
    """
    return [
        'Accuracy: {:.2f}%'.format(acc * 100.0),
        'Confusion matrix (rows are actual, columns are predicted):',
        str(cm),
        'Normalized confusion matrix:',
        str(cm_norm),
        'Recall: {}'.format(stats['recall']),
        'Precision: {}'.format(stats['precision']),
        'F1 score: {}'.format(stats['f1']),
        'Matthews correlation coefficient: {}'.format(stats['mcc']),
    ]


def main():
    """Evaluate a saved Keras model on stored inputs/targets and report metrics.

    Loads the model and the .npy inputs/targets given on the command line,
    predicts, then prints accuracy, (normalized) confusion matrix and
    recall/precision/F1/MCC; optionally mirrors the report to a file (-f).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('model_path', type=str, metavar='MODEL',
                        help='Path to Keras model')
    parser.add_argument('x_path', type=str, metavar='X', help='Path to inputs')
    parser.add_argument('y_path', type=str, metavar='Y', help='Path to targets')
    parser.add_argument('-f', '--output_file', type=str, default=None,
                        metavar='FILE', help='File to print output to')
    args = parser.parse_args()

    model = load_model(args.model_path)
    x = np.load(args.x_path)
    y = np.load(args.y_path)
    y_pred = model.predict(x)

    acc = calculate_accuracy(y, y_pred)
    cm = generate_confusion_matrix(y, y_pred)
    # Normalize each row by its total so entries are per-class fractions.
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    cm_norm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
    stats = calculate_recall_precision_f1_mcc(cm)

    lines = _format_report(acc, cm, cm_norm, stats)
    for line in lines:
        print(line)
    if args.output_file is not None:
        with open(args.output_file, 'w') as f:
            f.write('\n'.join(lines) + '\n')
def quantize(model_name, out_path):
    """Convert a TF-Hub classifier to TFLite variants and report top-1/top-5.

    Produces fp32, weight-int8/act-fp32 and full-integer-except-io models
    under `out_path`, then evaluates the fp32 and full-integer variants.
    """
    # Wrap the hub module in a Sequential so the TFLite converter can trace it.
    keras_model = tf.keras.Sequential(
        [hub.KerasLayer(tf_hub_links[model_name], output_shape=[1001])])

    # Inception v3 expects 299x299 inputs; every other model here uses 224x224.
    is_inception = "inception_v3" in model_name
    if is_inception:
        keras_model._set_inputs(preprocess_cv_dataset_299[0])  # Batch input shape.
    else:
        keras_model._set_inputs(preprocess_cv_dataset_224[0])  # Batch input shape.

    converter = TFLiteConvertor(saved_model_dir=keras_model,
                                base_path=out_path,
                                model_name=model_name)
    converter.fp32()
    converter.weight_int8_act_fp32()
    # Full-integer quantization needs a representative dataset at the right size.
    if is_inception:
        converter.full_integer_except_io(representative_data_gen_299)
    else:
        converter.full_integer_except_io(representative_data_gen_224)

    im_height = 299 if is_inception else 224
    im_width = im_height

    # Evaluate the two variants we care about and print one table row each.
    for suffix, label in (("_fp32.tflite", "fp32"),
                          ("_full_integer_except_io.tflite", "full_integer")):
        tflite_model_path = out_path + "/" + model_name + suffix
        top1, top5 = calculate_accuracy(test_dataset, tflite_model_path,
                                        im_height, im_width, preprocess,
                                        postprocess[model_name], 10)
        print("{:15} {:20} {:10} {:10}".format(model_name, label, top1, top5))
def main():
    """Fit a decision tree on the OpenML bank-marketing data and report accuracy."""
    # Bank marketing dataset (OpenML data id 1461); only the first 5000 of the
    # available 45211 instances are used to keep runtime manageable.
    bank_marketing = fetch_openml(data_id=1461)
    data = bank_marketing.data[:5000]
    labels = bank_marketing.target[:5000]

    # These column indices hold nominal values and are converted to strings;
    # numeric columns pass through unchanged. The label is appended last.
    nominal_features = {1, 2, 3, 4, 6, 7, 8, 10, 15}
    num_rows, num_cols = data.shape[0], data.shape[1]
    converted_data = []
    for row in range(num_rows):
        new_row = [
            str(data[row, col]) if col in nominal_features else data[row, col]
            for col in range(num_cols)
        ]
        new_row.append(labels[row])
        converted_data.append(new_row)

    # Shuffle, then take an 80/20 train/test split.
    train_n = int(0.8 * num_rows)
    random.shuffle(converted_data)
    train = converted_data[:train_n]
    test = converted_data[train_n:]
    print("Number of training rows: {}".format(train_n))
    print("Number of testing rows: {}".format(len(test)))

    # Separate features (all but last column) from the label (last column).
    train_X = [row[:-1] for row in train]
    train_y = [row[-1] for row in train]
    test_X = [row[:-1] for row in test]
    test_y = [row[-1] for row in test]

    dt = DecisionTree()
    start = time.time()
    dt.fit(train_X, train_y)
    elapsed = time.time() - start
    print("Time to fit the decision tree: {:.2f} seconds".format(elapsed))

    predictions = [dt.predict(record) for record in test_X]
    accuracy = calculate_accuracy(predictions, test_y)
    print("Accuracy on the test set: {:.2f}%".format(accuracy * 100))
def load_and_train(model_path, x_train, x_val, x_test, y_train, y_val, y_test,
                   max_epochs=1000, batch_size=32, patience=20):
    """Resume training a saved Keras model, then report train/test accuracy.

    Checkpoints the best model (by validation loss) back to `model_path`,
    stops early after `patience` epochs without improvement, and reloads the
    best checkpoint before evaluating. Returns the reloaded model.
    """
    model = load_model(model_path)
    callbacks = [
        EarlyStopping(patience=patience),
        ModelCheckpoint(model_path, save_best_only=True),
    ]
    model.fit(x_train, y_train,
              epochs=max_epochs,
              batch_size=batch_size,
              verbose=1,
              validation_data=(x_val, y_val),
              callbacks=callbacks)

    # Reload the best checkpoint — with patience > 0 the in-memory model may
    # have trained past the best epoch.
    model = load_model(model_path)

    train_pred = model.predict(x_train)
    print('Train accuracy: {:.2f}%'.format(
        calculate_accuracy(y_train, train_pred) * 100.0))

    # x_test/y_test appear to be three separate test splits — TODO confirm
    # against the callers (matches the bgru() convention elsewhere).
    predictions = []
    for idx in range(3):
        predictions.append(model.predict(x_test[idx]))
        print('Test accuracy: {:.2f}%'.format(
            calculate_accuracy(y_test[idx], predictions[idx]) * 100.0))
    return model
def main(data_file):
    """Run the zero-rule baseline on the federalist papers dataset."""
    print(data_file)

    # load the data
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    num_essays = len(essays)
    print(f"Working with {num_essays} reviews")

    # create a key that links author id string -> integer
    author_key = labels_to_key(authors)
    print(len(author_key))
    print(author_key)

    # convert all the labels using the key
    y = labels_to_y(authors, author_key)
    assert y.size == len(
        authors
    ), f"Size of label array (y.size) must equal number of labels {len(authors)}"

    # shuffle and split the data (30% held out)
    train, test = split_data(essays, y, 0.3)
    data_size_after = len(train[1]) + len(test[1])
    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # learn zero rule on train: the most frequent class wins
    train_y = train[1]
    most_frequent_class = find_zero_rule_class(train_y)
    print(most_frequent_class)

    # lookup label string from class.
    # BUG FIX: this assignment was commented out in the original, so the
    # print below raised a NameError; the reverse mapping must be built first.
    reverse_author_key = {v: k for k, v in author_key.items()}
    print(
        f"The most frequent class is {reverse_author_key[most_frequent_class]}"
    )

    # apply zero rule to test reviews
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    print(f"Zero rule predictions on held-out data: {test_predictions}")

    # score accuracy
    test_y = test[1]
    test_accuracy = calculate_accuracy(test_predictions, test_y)
    print(f"Accuracy of zero rule: {test_accuracy:0.03f}")
def val_epoch(epoch, data_loader, model, criterion, opt, logger):
    """Run one validation epoch and return the average loss.

    Per-batch timing/loss/accuracy are printed; one epoch-level record of
    {epoch, loss, acc} is written through `logger`. No parameters are updated.
    """
    print('validation at epoch {}'.format(epoch))
    model.eval()

    batch_time = AverageMeter()   # seconds per batch (load + forward)
    data_time = AverageMeter()    # seconds waiting on the data loader
    losses = AverageMeter()
    accuracies = AverageMeter()

    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        if not opt.no_cuda:
            # BUG FIX: `targets.cuda(async=True)` is a SyntaxError on
            # Python 3.7+ (`async` became a keyword); non_blocking=True is
            # the modern equivalent.
            targets = targets.cuda(non_blocking=True)

        # BUG FIX: volatile Variables were removed in PyTorch 0.4;
        # torch.no_grad() disables autograd for validation instead
        # (matches the other val/train loops in this file).
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)

        # BUG FIX: loss.data[0] was removed in PyTorch 0.5; .item() extracts
        # the Python scalar.
        losses.update(loss.item(), inputs.size(0))
        accuracies.update(acc, inputs.size(0))

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch, i + 1, len(data_loader),
                  batch_time=batch_time,
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies))

    logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})
    return losses.avg
def val_epoch(epoch, data_loader, model, criterion, opt, vis, vallogwindow):
    """Run one validation epoch with torchnet meters and visdom logging.

    Computes loss/accuracy plus mAP, per-class AP and top-1/3/5 scores over
    the whole validation set, appends an epoch summary to the visdom text
    window `vallogwindow`, and returns (average loss, mAP).
    """
    print('validation at epoch {}'.format(epoch))
    model.eval()
    batch_time = AverageMeter()   # seconds per batch (load + forward)
    data_time = AverageMeter()    # seconds waiting on the data loader
    losses = AverageMeter()
    accuracies = AverageMeter()
    mmap = meter.mAPMeter()       # mean average precision over all classes
    AP = meter.APMeter()          # per-class average precision
    top = meter.ClassErrorMeter(topk=[1, 3, 5], accuracy=True)
    mmap.reset()
    AP.reset()
    top.reset()
    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)
        # inputs may be a list of tensors (multiple clips/views) or one tensor;
        # move whichever form arrives onto the GPU.
        if type(inputs) is list:
            inputs = [Variable(inputs[ii].cuda()) for ii in range(len(inputs))]
        else:
            inputs = Variable(inputs.cuda())
        targets = targets.cuda()
        # Validation: no gradients needed for the forward pass or metrics.
        with torch.no_grad():
            #inputs = Variable(inputs)
            targets = Variable(targets)
            # model returns (class scores, context); context is unused here
            # except by the commented-out keypoint visualization below.
            outputs, context = model(inputs)
            #if i %5==0:
            #for jj in range(num):
            #    org_img = inverse_normalize(inputs[0,jj,:,:,:].detach().cpu().numpy())
            #    show_keypoint(org_img, context[0].detach().cpu(),vis=vis,title = str(jj+1))
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)
            losses.update(loss.data.item(), targets.detach().size(0))
            accuracies.update(acc, targets.detach().size(0))
            # The AP/mAP meters expect one-hot targets; scatter builds them
            # from the integer class indices.
            one_hot = torch.zeros_like(outputs).cuda().scatter_(1, targets.view(-1, 1), 1)
            mmap.add(outputs.detach(), one_hot.detach())
            top.add(outputs.detach(), targets.detach())
            AP.add(outputs.detach(), one_hot.detach())
        batch_time.update(time.time() - end_time)
        end_time = time.time()
        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
              'mmap {mmap}\t'
              'top1 3 5: {top}\t'.format(
                  epoch, i + 1, len(data_loader),
                  batch_time=batch_time,
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies,
                  mmap=mmap.value(),
                  top=top.value()))
    # Epoch summary appended to the visdom log window.
    vis.text("gpu:{}, epoch: {},loss: {},accu:{},mAP:{}, top135 {}\nAP:{}".format(torch.cuda.current_device(), epoch, losses.avg, accuracies.avg, mmap.value(), top.value(), AP.value()),
             win=vallogwindow, append=True)
    #exit()
    #if epoch==10:
    #    exit()
    return losses.avg, mmap.value()
def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, vis,
                trainlogwindow):
    """Train `model` for one epoch with torchnet meters and visdom logging.

    Performs the optimizer step per batch, tracks loss/accuracy/mAP/top-k,
    appends per-batch and per-epoch summaries to the visdom text window
    `trainlogwindow`, checkpoints every `opt.MODEL.CKP_DURING` epochs on
    GPU 0, and returns (average loss, mAP).
    """
    print('train at epoch {}'.format(epoch))
    model.train()
    batch_time = AverageMeter()   # seconds per batch (load + fwd/bwd/step)
    data_time = AverageMeter()    # seconds waiting on the data loader
    losses = AverageMeter()
    accuracies = AverageMeter()
    mmap = meter.mAPMeter()       # mean average precision over all classes
    top = meter.ClassErrorMeter(topk=[1, 3, 5], accuracy=True)
    mmap.reset()
    top.reset()
    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)
        targets = targets.cuda()
        # inputs may be a list of tensors (multiple clips/views) or one tensor.
        if type(inputs) is list:
            inputs = [Variable(inputs[ii]).cuda() for ii in range(len(inputs))]
        else:
            inputs = inputs.cuda()
        # Disabled mixup augmentation, kept for reference:
        #inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, opt.DATASET.ALPHA, True)
        #inputs, targets_a, targets_b = Variable(inputs), Variable(targets_a), Variable(targets_b)
        inputs = Variable(inputs)
        #print(targets)
        targets = Variable(targets)
        # model returns (class scores, context); context is unused here.
        outputs, context = model(inputs)
        #loss_func = mixup_criterion(targets_a, targets_b, lam)
        #loss = loss_func(criterion, outputs)
        loss = criterion(outputs, targets)
        #print(outputs.shape)
        #print(targets)
        acc = calculate_accuracy(outputs, targets)
        # The mAP meter expects one-hot targets; scatter builds them from
        # the integer class indices.
        one_hot = torch.zeros_like(outputs).cuda().scatter_(
            1, targets.view(-1, 1), 1)
        mmap.add(outputs.detach(), one_hot.detach())
        top.add(outputs.detach(), targets.detach())
        losses.update(loss.data.item(), targets.detach().size(0))
        accuracies.update(acc, targets.detach().size(0))
        # Standard optimization step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time.update(time.time() - end_time)
        end_time = time.time()
        # Per-batch line appended to the visdom log window.
        vis.text(
            "gpu{}, epoch: {},batch:{},iter: {},loss: {},acc:{},lr: {}\n".format(
                torch.cuda.current_device(), epoch, i + 1,
                (epoch - 1) * len(data_loader) + (i + 1), losses.val,
                accuracies.val, optimizer.param_groups[0]['lr']),
            win=trainlogwindow, append=True)
        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
              'mmap {mmap}\t'
              'top1 3 5: {top}\t'.format(
                  epoch, i + 1, len(data_loader),
                  batch_time=batch_time,
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies,
                  mmap=mmap.value(),
                  top=top.value()))
    # Epoch summary appended to the visdom log window.
    vis.text(
        "total:\n gpu:{} epoch: {},loss: {},lr: {}, accu:{},mAP:{}, top135 {}\n"
        .format(torch.cuda.current_device(), epoch, losses.avg,
                optimizer.param_groups[0]['lr'], accuracies.avg, mmap.value(),
                top.value()),
        win=trainlogwindow,
        append=True)
    # Only the process driving GPU 0 writes checkpoints (avoids duplicate
    # files in multi-GPU runs).
    if torch.cuda.current_device() == 0:
        print("saveing ckp ########################################")
        if epoch % opt.MODEL.CKP_DURING == 0:
            save_file_path = os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME,
                                          'save_{}.pth'.format(epoch))
            if not os.path.exists(
                    os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME)):
                os.makedirs(os.path.join(opt.MODEL.RESULT, opt.MODEL.NAME))
            states = {
                'epoch': epoch + 1,
                'arch': opt.MODEL.NAME,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_file_path)
    return losses.avg, mmap.value()
def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers"""
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)

    # load the attributed essays into a feature matrix
    X = load_features(essays, function_words)

    # map author names to integer classes, then build the label vector y
    labels_map = labels_to_key(authors)
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"X has shape {X.shape} and dtype {X.dtype}")
    print(f"y has shape {y.shape} and dtype {y.dtype}")

    # shuffle, then split the data (split_data shuffles internally)
    train, test = split_data(X, y, 0.25)
    data_size_after = len(train[1]) + len(test[1])
    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # Multinomial NB on the raw count features
    nb_mul = MultinomialNB()
    nb_mul.fit(train[0], train[1])
    pred_mul = nb_mul.predict(test[0])
    acc_mul = metrics.accuracy_score(test[1], pred_mul)
    print(f"Accuracy of Multinomial NB method: {acc_mul:0.03f}")

    # Bernoulli NB on binarized (presence/absence) features.
    # The original binarized row-by-row in a Python loop; np.where over the
    # whole matrix is equivalent, and astype preserves the original dtype.
    train_bi = np.where(train[0] > 0, 1, 0).astype(train[0].dtype)
    test_bi = np.where(test[0] > 0, 1, 0).astype(test[0].dtype)
    nb_ber = BernoulliNB()
    nb_ber.fit(train_bi, train[1])
    pred_ber = nb_ber.predict(test_bi)
    acc_ber = metrics.accuracy_score(test[1], pred_ber)
    print(f"Accuracy of Bernoulli NB method: {acc_ber:0.03f}")

    # Zero-rule baseline: always predict the most frequent training class
    most_frequent_class = find_zero_rule_class(train[1])
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    test_accuracy = calculate_accuracy(test_predictions, test[1])
    print(f"Accuracy of Zero rule: {test_accuracy:0.03f}")

    # lookup label string from class.
    # BUG FIX: the original referenced `author_key`, which only existed as a
    # commented-out assignment -> NameError at runtime. `labels_map` (built
    # above via labels_to_key) holds the same author -> class mapping.
    reverse_author_key = {v: k for k, v in labels_map.items()}
    print(
        f"The author predicted by the Zero rule is {reverse_author_key[most_frequent_class]}"
    )
def bgru(x_train, x_val, x_test, y_train, y_val, y_test, out_dir,
         name='bgru_model', hidden_units=10, layers=1, max_epochs=1000,
         batch_size=32, patience=20, dropout=0.0, recurrent_dropout=0.0):
    """ Bidirectional GRU model for protein secondary structure prediction. """
    num_samples = x_train.shape[0]
    max_seq_len = x_train.shape[1]
    num_features = x_train.shape[2]
    num_classes = y_train.shape[2]

    # Stack: masking -> `layers` bidirectional GRUs -> per-timestep softmax.
    model = Sequential()
    model.add(Masking(mask_value=0, input_shape=(max_seq_len, num_features)))
    for depth in range(layers):
        if depth == 0:
            gru = GRU(hidden_units,
                      return_sequences=True,
                      input_shape=(max_seq_len, num_features),
                      dropout=dropout,
                      recurrent_dropout=recurrent_dropout)
        else:
            gru = GRU(hidden_units,
                      return_sequences=True,
                      dropout=dropout,
                      recurrent_dropout=recurrent_dropout)
        model.add(Bidirectional(gru))
    model.add(TimeDistributed(Dense(num_classes)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())

    # Train with early stopping on validation loss; the best model is
    # checkpointed to disk as it appears.
    model_path = os.path.join(out_dir, name + '.h5')
    callbacks = [
        EarlyStopping(patience=patience),
        ModelCheckpoint(model_path, save_best_only=True),
    ]
    model.fit(x_train, y_train,
              epochs=max_epochs,
              batch_size=batch_size,
              verbose=1,
              validation_data=(x_val, y_val),
              callbacks=callbacks)

    # Reload the best checkpoint — with patience != 0 the in-memory model is
    # not necessarily the best one.
    model = load_model(model_path)

    train_pred = model.predict(x_train)
    print('Train accuracy: {:.2f}%'.format(
        calculate_accuracy(y_train, train_pred) * 100.0))

    # x_test/y_test appear to hold three separate test splits — TODO confirm
    # against the data-loading code.
    test_preds = []
    for idx in range(3):
        test_preds.append(model.predict(x_test[idx]))
        print('Test accuracy: {:.2f}%'.format(
            calculate_accuracy(y_test[idx], test_preds[idx]) * 100.0))
    return model
def train_epoch(epoch, data_loader, model, criterion, optimizer, opt,
                epoch_logger, batch_logger):
    """Train `model` for one epoch.

    Performs the optimizer step per batch, logs per-batch and per-epoch
    loss/accuracy/lr through `batch_logger` / `epoch_logger`, and saves a
    checkpoint every `opt.checkpoint` epochs into `opt.result_path`.
    """
    print('train at epoch {}'.format(epoch))
    model.train()

    batch_time = AverageMeter()   # seconds per batch (load + fwd/bwd/step)
    data_time = AverageMeter()    # seconds waiting on the data loader
    losses = AverageMeter()
    accuracies = AverageMeter()

    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        if not opt.no_cuda:
            # BUG FIX: `targets.cuda(async=True)` is a SyntaxError on
            # Python 3.7+ (`async` became a keyword); non_blocking=True is
            # the modern equivalent.
            targets = targets.cuda(non_blocking=True)
        # NOTE: Variable wrappers are no-ops since PyTorch 0.4 and were
        # dropped; autograd tracks the tensors directly.
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        acc = calculate_accuracy(outputs, targets)

        # BUG FIX: loss.data[0] was removed in PyTorch 0.5; .item() extracts
        # the Python scalar.
        losses.update(loss.item(), inputs.size(0))
        accuracies.update(acc, inputs.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        batch_logger.log({
            'epoch': epoch,
            'batch': i + 1,
            'iter': (epoch - 1) * len(data_loader) + (i + 1),
            'loss': losses.val,
            'acc': accuracies.val,
            'lr': optimizer.param_groups[0]['lr']
        })
        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch, i + 1, len(data_loader),
                  batch_time=batch_time,
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies))

    epoch_logger.log({
        'epoch': epoch,
        'loss': losses.avg,
        'acc': accuracies.avg,
        'lr': optimizer.param_groups[0]['lr']
    })

    if epoch % opt.checkpoint == 0:
        save_file_path = os.path.join(opt.result_path,
                                      'save_{}.pth'.format(epoch))
        states = {
            'epoch': epoch + 1,
            'arch': opt.arch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(states, save_file_path)