def train(from_scratch=False):
    """Train the big-tag models on the refined data set.

    The refined train/eval features and labels are read once.  One model is
    then trained for every entry of ``TagManager.BIG_TAG_TO_SEQ`` whose
    ``SUBTAG_COUNT`` is non-zero, each on the subset returned by
    ``filter_x_y`` for that tag.

    Args:
        from_scratch: when true, the ``all_big_tags`` model is trained first,
            before the per-big-tag models.
    """
    rp = refined_preprocessing

    x_train = rp.read_refined_x(rp.REFINED_X_TRAIN_SP)
    x_eval = rp.read_refined_x(rp.REFINED_X_EVAL_SP)
    y_train = rp.read_refined_y(rp.REFINED_Y_TRAIN_SP)
    y_eval = rp.read_refined_y(rp.REFINED_Y_EVAL_SP)
    print(u'all refined x && y loaded..')

    def subset_for(tag):
        # Train split is filtered first, then the eval split.
        tag_x_train, tag_y_train = rp.filter_x_y(x_train, y_train, tag)
        tag_x_eval, tag_y_eval = rp.filter_x_y(x_eval, y_eval, tag)
        return tag_x_train, tag_y_train, tag_x_eval, tag_y_eval

    if from_scratch:
        print(u'filtering data set for all big tags')
        train_big_tag_model(u'all_big_tags', *subset_for(u'all_big_tags'))

    for big_tag, big_tag_seq in rp.TagManager.BIG_TAG_TO_SEQ.items():
        # Big tags without any sub-tag get no model.
        if rp.TagManager.SUBTAG_COUNT[big_tag_seq] == 0:
            continue

        print(u'filtering data set for %s' % big_tag)
        tag_x_train, tag_y_train, tag_x_eval, tag_y_eval = subset_for(big_tag)
        print(u'train sample count: %d, eval sample count: %d' %
              (len(tag_x_train), len(tag_x_eval)))
        train_big_tag_model(big_tag, tag_x_train, tag_y_train, tag_x_eval,
                            tag_y_eval)
def train_baseline():
    """Train the baseline model on the refined data, then fit its thresholds.

    Builds the baseline network sized by ``REFINED_TAG_DICT_SIZE``, fits it
    for 4 epochs (batch size 64) with early stopping on ``val_loss``
    (patience 2) and a weights-only checkpoint at ``MODEL_PATH``, saves the
    final weights to ``MODEL_WEIGHTS_PATH``, dumps the training history to
    ``HISTORY_PATH`` and finally runs ``lsq`` on the training data.
    """
    import refined_preprocessing as rp

    model = new_model(rp.TagManager.REFINED_TAG_DICT_SIZE, baseline=True)

    x_train = rp.read_refined_x(rp.REFINED_X_TRAIN_SP)
    y_train = rp.read_refined_y(rp.REFINED_Y_TRAIN_SP, return_idx=True)
    print(u'train data loaded')

    x_eval = rp.read_refined_x(rp.REFINED_X_EVAL_SP)
    y_eval = rp.read_refined_y(rp.REFINED_Y_EVAL_SP, return_idx=True)
    print(u'eval data loaded')

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=2),
        ModelCheckpoint(MODEL_PATH, save_weights_only=True),
    ]
    history = model.fit(
        x_train,
        y_train,
        callbacks=callbacks,
        validation_data=(x_eval, y_eval),
        nb_epoch=4,
        batch_size=64,
        sample_weight=None)

    model.save_weights(MODEL_WEIGHTS_PATH)
    print(u'model saved to %s' % MODEL_WEIGHTS_PATH)

    # Keep a copy of the per-epoch metrics both on stdout and on disk.
    metrics = history.history
    print(metrics)
    with codecs.open(HISTORY_PATH, 'w', 'utf8') as history_output:
        history_output.write(unicode(metrics))

    lsq(MODEL_WEIGHTS_PATH,
        x_train,
        y_train,
        on_refined_data=True,
        baseline=True)
def evaluation_sp(evaluators, size=None):
    """Score the A*-based predictions on the refined evaluation set.

    For each evaluator the mean of ``evaluator(prediction, truth)`` over all
    predictions is printed, preceded by the evaluator's ``__doc__``.

    Args:
        evaluators: iterable of callables taking ``(prediction, truth)`` and
            returning a number.
        size: forwarded as ``size=`` to both data readers.
    """
    features = refined_preprocessing.read_refined_x(
        refined_preprocessing.REFINED_X_EVAL_SP, size=size)
    print(u'x_eval_sp loaded..start predict')

    predictions = predict_eval_data_based_on_a_star(features)
    print(u'prediction done..')

    truths = read_refined_eval_y_for_evaluation(size=size)

    for evaluator in evaluators:
        score_sum = 0.0
        for position, predicted in enumerate(predictions):
            score_sum += evaluator(predicted, truths[position])
        mean_score = score_sum / len(predictions)
        print(evaluator.__doc__)
        print(mean_score)
def evaluation_baseline(model_weights_path, evaluators):
    """Score the baseline model on the refined evaluation set.

    Args:
        model_weights_path: a path ending in ``hdf5`` is loaded as a complete
            model with ``keras.models.load_model``; any other path is loaded
            as weights into a freshly built baseline network.
        evaluators: iterable of callables taking ``(prediction, truth)`` and
            returning a number; the mean over all predictions is printed
            after the evaluator's ``__doc__``.
    """
    import refined_preprocessing
    my_model.read_threshold_lsq_coefficient()

    if not model_weights_path.endswith(u'hdf5'):
        model = my_model.new_model(
            refined_preprocessing.TagManager.REFINED_TAG_DICT_SIZE,
            baseline=True)
        model.load_weights(model_weights_path)
    else:
        import keras
        model = keras.models.load_model(model_weights_path)

    x_eval = refined_preprocessing.read_refined_x(
        refined_preprocessing.REFINED_X_EVAL_SP)
    y_eval = refined_preprocessing.read_refined_y(
        refined_preprocessing.REFINED_Y_EVAL_SP, return_idx=True)
    print(u'eval data loaded')

    predictions = model.predict(x_eval)
    print(u'all prediction done')

    for evaluator in evaluators:
        score_sum = 0.0
        for position, predicted in enumerate(predictions):
            score_sum += evaluator(predicted, y_eval[position])
        mean_score = score_sum / len(predictions)
        print(evaluator.__doc__)
        print(mean_score)
def lsq(trained_model_path,
        x_train=None,
        y_train=None,
        sample_size=None,
        on_refined_data=False,
        baseline=False):
    """Fit a linear map from model scores to a per-sample decision threshold.

    For every training sample the highest-scoring predictions (at most 10)
    are scanned for the cut-off that minimises false positives plus false
    negatives.  The sample's target threshold is the midpoint between the
    score at that cut-off and the next lower score.  A least-squares problem
    ``[y_pred, 1] . w ~= threshold`` is then solved and ``w`` is written to
    ``THRESHOLD_LSQ_COEFFICIENT_PATH``.

    Args:
        trained_model_path: a path ending in ``hdf5`` is loaded as a complete
            model with ``keras.models.load_model``; any other path is loaded
            as weights into a network built by ``new_model``.
        x_train: training features.  When ``None`` both features and labels
            are read from disk and ``y_train`` is ignored.
        y_train: training labels matching ``x_train``.
        sample_size: forwarded to the readers when data is loaded here.
        on_refined_data: use the refined data set and size the network with
            ``TagManager.REFINED_TAG_DICT_SIZE``.
        baseline: build the baseline variant of the network.

    Returns:
        None.  The coefficients are saved with ``numpy.savetxt``.
    """
    import refined_preprocessing
    from numpy.linalg import lstsq

    if x_train is None:
        if on_refined_data:
            x_train = refined_preprocessing.read_refined_x(
                refined_preprocessing.REFINED_X_TRAIN_SP, sample_size)
            y_train = refined_preprocessing.read_refined_y(
                refined_preprocessing.REFINED_Y_TRAIN_SP,
                sample_size,
                return_idx=True)
        else:
            x_train = preprocessing.read_x(
                preprocessing.X_TRAIN_IGNORE_STOP_PATH, sample_size)
            y_train = preprocessing.read_y(
                preprocessing.Y_TRAIN_IGNORE_STOP_PATH, sample_size)
        print(u'train data loaded')

    print(u'lsq loading model %s' % trained_model_path)
    if trained_model_path.endswith(u'hdf5'):
        # A full model file carries its own architecture; nothing to build.
        model = keras.models.load_model(trained_model_path)
    else:
        # The architecture must match the one the weights were trained with:
        # the baseline flag and the refined tag-dictionary size are
        # independent choices and both have to be honoured.
        model_kwargs = {'baseline': True} if baseline else {}
        if on_refined_data:
            model = new_model(
                refined_preprocessing.TagManager.REFINED_TAG_DICT_SIZE,
                **model_kwargs)
        else:
            model = new_model(**model_kwargs)
        model.load_weights(trained_model_path)
    print(u'model %s loaded, prediction start..' % trained_model_path)

    y_pred = model.predict(x_train)
    # Ascending sort: the best-scoring tag sits at the last index of a row.
    sorted_y_pred = numpy.sort(y_pred)
    sorted_y_pred_args = numpy.argsort(y_pred)
    print(u'prediction and sort done..generate optimal threshold vector..')

    optimal_threshold_v = numpy.zeros(len(x_train))
    # NOTE(review): assumes every y row has one entry per tag -- confirm for
    # labels read with return_idx=True.
    nb_tags = len(y_train[0])
    # Only this many top-ranked candidates are examined, for speed.
    top_k = 10
    aver_precision, aver_recall = 0.0, 0.0

    for i in range(len(x_train)):
        true_tags = derive_tag_indices_from_y(y_train[i], is_y_true=True)
        # pos stays -1 when none of the examined candidates is a true tag;
        # the threshold then falls between the two highest scores.
        precision, recall, pos = 0.0, 0.0, -1
        correct_tags_predicted = 0.0
        nb_misclassified = float('inf')

        # Walk from the best score downwards, at most top_k positions.
        lowest_idx = max(nb_tags - top_k, 0)
        for idx in range(nb_tags - 1, lowest_idx - 1, -1):
            if sorted_y_pred_args[i][idx] not in true_tags:
                continue
            correct_tags_predicted += 1
            nb_selected = nb_tags - idx
            # false positives + false negatives if the cut is placed at idx
            cur_misclassified = (nb_selected - correct_tags_predicted) + (
                len(true_tags) - correct_tags_predicted)
            if cur_misclassified < nb_misclassified:
                precision = correct_tags_predicted / nb_selected
                recall = correct_tags_predicted / len(true_tags)
                pos = idx
                nb_misclassified = cur_misclassified

        # Running means over the samples processed so far.
        aver_precision = (aver_precision * i + precision) / (i + 1)
        aver_recall = (aver_recall * i + recall) / (i + 1)

        if pos != 0:
            next_lower_score = sorted_y_pred[i][pos - 1]
        else:
            next_lower_score = sorted_y_pred[i][pos]
        optimal_threshold_v[i] = (sorted_y_pred[i][pos] +
                                  next_lower_score) / 2.0

    print(
        u'optimal threshold retrieve done, average precision: %lf, average recall: %lf..'
        u'starting least square..' % (aver_precision, aver_recall))

    # Extra column of ones gives the linear map a bias term.
    amend_y_pred = numpy.column_stack((y_pred, numpy.ones(len(y_pred))))
    ans = lstsq(amend_y_pred, optimal_threshold_v)
    residual = amend_y_pred.dot(ans[0]) - optimal_threshold_v
    print(u'square loss: %lf' % numpy.sum(residual**2))
    numpy.savetxt(THRESHOLD_LSQ_COEFFICIENT_PATH, ans[0])
    print(u'least square done..')