Example #1
def train(from_scratch=False):
    x_train, x_eval = \
        refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_TRAIN_SP), \
        refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_EVAL_SP)
    y_train, y_eval = \
        refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_TRAIN_SP), \
        refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_EVAL_SP)
    print(u'all refined x && y loaded..')
    if from_scratch:
        print(u'filtering data set for all big tags')
        all_big_tag_x_train, all_big_tag_y_train = refined_preprocessing.filter_x_y(
            x_train, y_train, u'all_big_tags')
        all_big_tag_x_eval, all_big_tag_y_eval = refined_preprocessing.filter_x_y(
            x_eval, y_eval, u'all_big_tags')
        train_big_tag_model(u'all_big_tags', all_big_tag_x_train,
                            all_big_tag_y_train, all_big_tag_x_eval,
                            all_big_tag_y_eval)
    for big_tag, big_tag_seq in refined_preprocessing.TagManager.BIG_TAG_TO_SEQ.items():
        if refined_preprocessing.TagManager.SUBTAG_COUNT[big_tag_seq] == 0:
            continue
        print(u'filtering data set for %s' % big_tag)
        cur_x_train, cur_y_train = refined_preprocessing.filter_x_y(
            x_train, y_train, big_tag)
        cur_x_eval, cur_y_eval = refined_preprocessing.filter_x_y(
            x_eval, y_eval, big_tag)
        print(u'train sample count: %d, eval sample count: %d' %
              (len(cur_x_train), len(cur_x_eval)))
        train_big_tag_model(big_tag, cur_x_train, cur_y_train, cur_x_eval,
                            cur_y_eval)
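
The function above optionally retrains the shared all_big_tags model, then trains one dedicated model per big tag that actually has subtags, each on the slice of the data belonging to that tag. A minimal, self-contained sketch of the same filter-then-train-per-big-tag pattern; filter_x_y_sketch and train_fn are hypothetical stand-ins for filter_x_y and train_big_tag_model, and the data shapes are assumptions for illustration only:

def filter_x_y_sketch(x, y, tag):
    # keep only the samples whose label set contains the given big tag
    keep = [i for i, labels in enumerate(y) if tag in labels]
    return [x[i] for i in keep], [y[i] for i in keep]

def train_per_big_tag(x, y, subtag_count_by_tag, train_fn):
    for tag, n_subtags in subtag_count_by_tag.items():
        if n_subtags == 0:  # a big tag without subtags has nothing to learn
            continue
        x_tag, y_tag = filter_x_y_sketch(x, y, tag)
        train_fn(tag, x_tag, y_tag)
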
Example #2
def train_baseline():
    import refined_preprocessing
    model = new_model(refined_preprocessing.TagManager.REFINED_TAG_DICT_SIZE,
                      baseline=True)
    x_train, y_train = \
        refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_TRAIN_SP), \
        refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_TRAIN_SP, return_idx=True)
    print(u'train data loaded')
    x_eval, y_eval = \
        refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_EVAL_SP), \
        refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_EVAL_SP, return_idx=True)
    print(u'eval data loaded')
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    check_point = ModelCheckpoint(MODEL_PATH, save_weights_only=True)
    history = model.fit(x_train,
                        y_train,
                        callbacks=[early_stopping, check_point],
                        validation_data=(x_eval, y_eval),
                        nb_epoch=4,
                        batch_size=64,
                        sample_weight=None)
    model.save_weights(MODEL_WEIGHTS_PATH)
    print(u'model saved to %s' % MODEL_WEIGHTS_PATH)
    print(history.history)
    with codecs.open(HISTORY_PATH, 'w', 'utf8') as history_output:
        history_output.write(unicode(history.history))
    lsq(MODEL_WEIGHTS_PATH,
        x_train,
        y_train,
        on_refined_data=True,
        baseline=True)
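
Note that nb_epoch is the Keras 1.x spelling of the epoch count; Keras 2 renamed it to epochs, while the EarlyStopping/ModelCheckpoint/fit pattern stays the same. A hedged sketch of the equivalent call under the Keras 2 API, assuming an already compiled model and a placeholder weights path (this is not the project's own code):

from keras.callbacks import EarlyStopping, ModelCheckpoint

def fit_baseline_sketch(model, x_train, y_train, x_eval, y_eval, weights_path):
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=2),           # stop after 2 stagnant epochs
        ModelCheckpoint(weights_path, save_weights_only=True),   # checkpoint weights each epoch
    ]
    history = model.fit(x_train, y_train,
                        validation_data=(x_eval, y_eval),
                        epochs=4,
                        batch_size=64,
                        callbacks=callbacks)
    model.save_weights(weights_path)
    return history
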
Example #3
def evaluation_sp(evaluators, size=None):
    x_eval_sp = refined_preprocessing.read_refined_x(
        refined_preprocessing.REFINED_X_EVAL_SP, size=size)
    print(u'x_eval_sp loaded..start predict')
    pred_lists = predict_eval_data_based_on_a_star(x_eval_sp)
    print(u'prediction done..')
    eval_y_lists = read_refined_eval_y_for_evaluation(size=size)
    for evaluator in evaluators:
        total_score = 0.0
        for i in range(len(pred_lists)):
            total_score += evaluator(pred_lists[i], eval_y_lists[i])
        total_score /= len(pred_lists)
        print(evaluator.__doc__)
        print(total_score)
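
evaluation_sp only assumes that each evaluator is a callable taking one sample's predicted tag list and true tag list and returning a score, with its docstring used as the printed label. An illustrative precision-style evaluator matching that contract (not one of the project's own evaluators):

def precision_evaluator(pred_tags, true_tags):
    u"""precision: fraction of predicted tags that are correct"""
    if not pred_tags:
        return 0.0
    true_set = set(true_tags)
    return float(sum(1 for tag in pred_tags if tag in true_set)) / len(pred_tags)

# hypothetical call: evaluation_sp([precision_evaluator], size=1000)
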
Example #4
def evaluation_baseline(model_weights_path, evaluators):
    import refined_preprocessing
    my_model.read_threshold_lsq_coefficient()
    if model_weights_path.endswith(u'hdf5'):
        import keras
        model = keras.models.load_model(model_weights_path)
    else:
        model = my_model.new_model(refined_preprocessing.TagManager.REFINED_TAG_DICT_SIZE, baseline=True)
        model.load_weights(model_weights_path)
    x_eval, y_eval = \
        refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_EVAL_SP), \
        refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_EVAL_SP, return_idx=True)
    print(u'eval data loaded')
    y_pred = model.predict(x_eval)
    print(u'all prediction done')
    for evaluator in evaluators:
        total_score = 0.0
        for idx in range(len(y_pred)):
            total_score += evaluator(y_pred[idx], y_eval[idx])
        total_score /= len(y_pred)
        print(evaluator.__doc__)
        print(total_score)
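
Here the evaluators receive a raw score vector (y_pred[idx]) and, since the labels were read with return_idx=True, presumably the true tag indices for that sample. A sketch of an evaluator written against that assumed contract, using a fixed 0.5 cutoff purely for illustration:

def recall_at_threshold(scores, true_tag_indices, threshold=0.5):
    u"""recall: fraction of true tags scored above the threshold"""
    true_tag_indices = list(true_tag_indices)
    if not true_tag_indices:
        return 1.0
    hits = sum(1 for idx in true_tag_indices if scores[idx] > threshold)
    return float(hits) / len(true_tag_indices)
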
Example #5
def lsq(trained_model_path,
        x_train=None,
        y_train=None,
        sample_size=None,
        on_refined_data=False,
        baseline=False):
    if x_train is None:
        if on_refined_data:
            import refined_preprocessing
            x_train, y_train = \
                refined_preprocessing.read_refined_x(refined_preprocessing.REFINED_X_TRAIN_SP, sample_size), \
                refined_preprocessing.read_refined_y(refined_preprocessing.REFINED_Y_TRAIN_SP, sample_size,
                                                     return_idx=True)
        else:
            x_train, y_train = \
                preprocessing.read_x(preprocessing.X_TRAIN_IGNORE_STOP_PATH, sample_size), \
                preprocessing.read_y(preprocessing.Y_TRAIN_IGNORE_STOP_PATH, sample_size)
        print(u'train data loaded')
    from scipy.optimize import leastsq
    import refined_preprocessing
    from numpy.linalg import lstsq
    model = (new_model(refined_preprocessing.TagManager.REFINED_TAG_DICT_SIZE)
             if on_refined_data else new_model())
    if baseline:
        model = new_model(baseline=True)
    print(u'lsq loading model %s' % trained_model_path)
    if trained_model_path.endswith(u'hdf5'):
        model = keras.models.load_model(trained_model_path)
    else:
        model.load_weights(trained_model_path)
    print(u'model %s loaded, prediction start..' % trained_model_path)
    y_pred = model.predict(x_train)
    sorted_y_pred = numpy.sort(y_pred)
    sorted_y_pred_args = numpy.argsort(y_pred)
    print(u'prediction and sort done..generate optimal threshold vector..')
    optimal_threshold_v = numpy.zeros(len(x_train))
    nb_tags = len(y_train[0])
    aver_precision, aver_recall = 0.0, 0.0
    for i in range(len(x_train)):
        true_tags = derive_tag_indices_from_y(y_train[i], is_y_true=True)
        precision, recall, pos = 0.0, 0.0, -1
        correct_tags_predicted = 0.0
        nb_misclassified = sys.maxint
        # search for the score threshold that minimizes this sample's misclassified tags
        for idx in range(nb_tags - 1, -1, -1):
            # for speed, only consider the 10 highest-scored tags
            if nb_tags - idx > 10:
                break
            if sorted_y_pred_args[i][idx] not in true_tags:
                continue
            correct_tags_predicted += 1
            this_precision, this_recall = \
                correct_tags_predicted / (nb_tags - idx), correct_tags_predicted / len(true_tags)
            if (nb_tags - idx - correct_tags_predicted) + (len(
                    true_tags) - correct_tags_predicted) < nb_misclassified:
                precision, recall, pos = this_precision, this_recall, idx
                nb_misclassified = (nb_tags - idx - correct_tags_predicted) + (
                    len(true_tags) - correct_tags_predicted)
        aver_precision, aver_recall = (aver_precision * i + precision) / (
            i + 1), (aver_recall * i + recall) / (i + 1)
        # place the threshold halfway between the lowest correctly-kept score and the next score below it
        optimal_threshold_v[i] = \
            (sorted_y_pred[i][pos] + (sorted_y_pred[i][pos - 1] if pos != 0 else sorted_y_pred[i][pos])) / 2.0
    print(
        u'optimal threshold retrieve done, average precision: %lf, average recall: %lf..'
        u'starting least square..' % (aver_precision, aver_recall))
    amend_y_pred = numpy.column_stack((y_pred, numpy.ones(len(y_pred))))

    def lsq_func(threshold_lsq_coefficient):
        return amend_y_pred.dot(
            threshold_lsq_coefficient) - optimal_threshold_v

    #### ans = leastsq(lsq_func, numpy.random.rand(nb_tags + 1))
    ans = lstsq(amend_y_pred, optimal_threshold_v)
    print(u'square loss: %lf' % numpy.sum(lsq_func(ans[0])**2))
    numpy.savetxt(THRESHOLD_LSQ_COEFFICIENT_PATH, ans[0])
    print(u'least square done..')
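
The least-squares fit learns a coefficient vector w such that, for each training sample, [scores, 1] . w approximates that sample's optimal decision threshold. A sketch of how the saved coefficients would presumably be applied at prediction time (Example #4 loads them via my_model.read_threshold_lsq_coefficient() before predicting); the helper name below is hypothetical:

import numpy

def apply_lsq_threshold(y_pred, coefficients):
    # y_pred: (n_samples, nb_tags) score matrix; coefficients: length nb_tags + 1
    amended = numpy.column_stack((y_pred, numpy.ones(len(y_pred))))
    thresholds = amended.dot(coefficients)  # one scalar threshold per sample
    return [numpy.where(row > threshold)[0].tolist()
            for row, threshold in zip(y_pred, thresholds)]
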