def evaluate(session, ops, previous_ops, dataset):
    losses = []
    accuracies = []
    f1_scores = []
    masked_predictions = []
    aspects = []
    ground_truths = []
    masks = []
    cm = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)
    test_metrics = {}
    if args.mode == 'test':
        if args.task == 'semeval16-restaurant':
            aspect_word_index_map = RESTAURANT_ASPECT_WORD_INDEX_MAP
        elif args.task == 'semeval16-laptops':
            # TODO: change this
            aspect_word_index_map = LAPTOPS_ASPECT_WORD_INDEX_MAP
        else:
            raise ValueError('unsupported task for test mode: %s' % args.task)
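        # n_aspect excludes one entry of the index map (presumably a
        # 'none'/'other' aspect); n_total_classes enumerates every
        # (aspect, sentiment) pair excluding the 'not applicable' sentiment,
        # plus one class for sentences with no aspect (assumed interpretation).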
        n_sentiment_classes = args.num_classes
        n_aspect = len(aspect_word_index_map) - 1
        n_total_classes = n_aspect * (n_sentiment_classes - 1) + 1
        n_multilabel_success = 0
        n_multilabel_failure = 0
        n_sentence = 0
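        # in test mode the batch size is tied to the number of aspects
        # (assumed so that one batch covers all aspects of a single review)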
        args.batch_size = n_aspect

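        # confusion matrices accumulated over the whole test set: a sentiment
        # confusion matrix per aspect, a binary detected/not-detected matrix
        # per aspect (plus one extra slot, presumably for 'no aspect'), and a
        # binary matrix per joint aspect+sentiment class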
        per_aspect_sentiments_cm = np.zeros(shape=(n_aspect,
                                                   n_sentiment_classes,
                                                   n_sentiment_classes),
                                            dtype=np.int32)
        per_aspect_aspect_detection_cm = np.zeros(shape=(n_aspect + 1, 2, 2),
                                                  dtype=np.int32)
        joint_aspect_sentiment_cm = np.zeros(shape=(n_total_classes, 2, 2),
                                             dtype=np.int32)

    for x1, x2, y in batch_iterator(dataset, args.batch_size, 1):
        # get feed_dicts
        fd = get_feed_data(x1, x2, y, is_training=False, args=args)

        # get previous feed_dicts
        previous_fd = fd.copy()

        # execute previous models
        word_level_inputs, aspect_embedded_encoder_output, aspect_embedded_sentence_inputs, birnn_output = session.run(
            [
                previous_ops['word_level_inputs'],
                previous_ops['aspect_embedded_encoder_output'],
                previous_ops['aspect_embedded_sentence_inputs'],
                previous_ops['birnn_output']
            ], previous_fd)

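        # feed the previously trained model's intermediate activations,
        # together with the shared inputs, into the current model via its
        # PREFIX-ed placeholders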
        fd[PREFIX + PREVIOUS_WORD_LEVEL_INPUTS_TENSOR_NAME] = word_level_inputs
        fd[PREFIX +
           PREVIOUS_ASPECT_EMBEDDED_ENCODER_OUTPUT_TENSOR_NAME] = aspect_embedded_encoder_output
        fd[PREFIX +
           PREVIOUS_ASPECT_EMBEDDED_SENTENCE_INPUTS_TENSOR_NAME] = aspect_embedded_sentence_inputs
        fd[PREFIX +
           PREVIOUS_SENTENCE_ENCODER_OUTPUT_TENSOR_NAME] = birnn_output
        fd[PREFIX + ASPECTS_TENSOR_NAME] = fd[ASPECTS_TENSOR_NAME]
        fd[PREFIX +
           PADDED_REVIEWS_TENSOR_NAME] = fd[PADDED_REVIEWS_TENSOR_NAME]
        fd[PREFIX + ACTUAL_SENTENCE_COUNT_TENSOR_NAME] = fd[
            ACTUAL_SENTENCE_COUNT_TENSOR_NAME]
        fd[PREFIX +
           ACTUAL_WORD_COUNT_TENSOR_NAME] = fd[ACTUAL_WORD_COUNT_TENSOR_NAME]
        fd[PREFIX + SENTENCE_MASK_TENSOR_NAME] = fd[SENTENCE_MASK_TENSOR_NAME]
        fd[PREFIX + WORD_MASK_TENSOR_NAME] = fd[WORD_MASK_TENSOR_NAME]
        fd[PREFIX + PADDED_LABELS_TENSOR_NAME] = fd[PADDED_LABELS_TENSOR_NAME]
        fd[PREFIX + LABLE_WEIGHTS_TENSOR_NAME] = fd[LABLE_WEIGHTS_TENSOR_NAME]
        fd[PREFIX + IS_TRAINING_TENSOR_NAME] = fd[IS_TRAINING_TENSOR_NAME]

        # run evaluation
        val_accuracy, loss, f1_score, confusion_matrix, masked_prediction = session.run(
            [
                ops['accuracy'], ops['loss'], ops['f1_score'],
                ops['confusion_matrix'], ops['masked_predictions']
            ], fd)

        losses.append(loss)
        accuracies.append(val_accuracy)
        f1_scores.append(f1_score)
        cm += confusion_matrix
        masked_predictions.append(masked_prediction)
        aspects.append(x1)
        ground_truths.append(y)
        masks.append(fd[SENTENCE_MASK_TENSOR_NAME])
        if args.mode == 'test':
            eval_results = evaluation_metrics(
                fd[PREFIX + ASPECTS_TENSOR_NAME],
                fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME],
                fd[PREFIX + PADDED_LABELS_TENSOR_NAME], masked_prediction,
                aspect_word_index_map)
            per_aspect_sentiments_cm += eval_results[
                'per_aspect_sentiments_cm']
            per_aspect_aspect_detection_cm += eval_results[
                'per_aspect_aspect_detection_cm']
            joint_aspect_sentiment_cm += eval_results[
                'joint_aspect_sentiment_cm']
            n_multilabel_success += eval_results['n_multilabel_success']
            n_multilabel_failure += eval_results['n_multilabel_failure']
            n_sentence += eval_results['count']
            test_metrics = {
                'per_aspect_sentiments_cm': per_aspect_sentiments_cm,
                'per_aspect_aspect_detection_cm':
                per_aspect_aspect_detection_cm,
                'joint_aspect_sentiment_cm': joint_aspect_sentiment_cm,
                'n_multilabel_success': n_multilabel_success,
                'n_multilabel_failure': n_multilabel_failure,
                'n_sentence': n_sentence
            }

    df = {
        'loss': losses,
        'accuracy': accuracies,
        'f1_score': f1_scores,
        'confusion_matrix': cm,
        'masked_predictions': masked_predictions,
        'aspects': aspects,
        'ground_truths': ground_truths,
        'masks': masks,
        'test_metrics': test_metrics
    }
    return df
def train(config=None, reporter=None):
    """
    Main method to start training.

    :param config: grid-searched values for the hyperparameters being tuned
        (only used when args.hyperparam_tune is set)
    :param reporter: callback used to report metrics such as the f1-score back
        to the hyperparameter tuner
    :return: global step at which the best validation macro F1 score was reached
    """
    # set values according to hyperparameter tuner
    if args.hyperparam_tune:
        print('Data dir : ' + DATA_DIR)
        args.lr = config['learning_rate']
        args.batch_size = config['batch_size']
        args.dropout_keep_prob = config['dropout_keep_prob']

    print(args)
    write_experiment_parameters(args)

    # https://stackoverflow.com/questions/44873273/what-do-the-options-in-configproto-like-allow-soft-placement-and-log-device-plac
    config = tf.ConfigProto(allow_soft_placement=True)

    # Clears the default graph stack and resets the global default graph.
    tf.reset_default_graph()

    with tf.Session(config=config) as session:
        # attach the tf session to Keras so that Keras layers can be used together with tf ops
        K.set_session(session)

        # load previous trained model
        previous_ops = model.get_previous_model(session=session, args=args)

        # get model and saver instances
        _, saver, ops = model.get_model(session=session,
                                        args=args,
                                        restore_only=False)

        # get label weights for handling class imbalance
        class_weights = calculate_class_weights()

        # create a training summary writer
        train_writer = tf.summary.FileWriter(TFLOG_DIR, graph=session.graph)

        # initializations
        val_accuracies = []
        val_per_class_accuracies = []
        val_per_class_f1_scores = []
        val_macro_f1_scores = []
        train_accuracies = []
        train_per_class_f1_scores = []
        train_per_class_accuracies = []
        train_macro_f1_scores = []
        train_confusion_matrix = np.zeros(shape=(args.num_classes,
                                                 args.num_classes),
                                          dtype=np.int32)
        best_macro_f1_score = 0
        best_step_number = 0

        # start training
        for i, (x1, x2, y) in enumerate(
                batch_iterator(train_loader(args.epochs), args.batch_size)):

            t0 = time.clock()

            # calculate dynamic class weights
            if args.dynamic_class_weights:
                class_weights = calculate_class_weights(classes=y)

            # get feed_dicts
            fd = get_feed_data(x1,
                               x2,
                               y,
                               class_weights=class_weights,
                               is_training=True,
                               args=args)

            # get previous feed_dicts
            previous_fd = fd.copy()
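            # run the previous model in inference mode (presumably disabling
            # dropout) even though the current model is being trained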
            previous_fd[IS_TRAINING_TENSOR_NAME] = False

            # execute previous model
            word_level_inputs, aspect_embedded_encoder_output, aspect_embedded_sentence_inputs, birnn_output = session.run(
                [
                    previous_ops['word_level_inputs'],
                    previous_ops['aspect_embedded_encoder_output'],
                    previous_ops['aspect_embedded_sentence_inputs'],
                    previous_ops['birnn_output']
                ], previous_fd)

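            # as in evaluate(): forward the previous model's activations and
            # the shared inputs to the current model through its PREFIX-ed
            # placeholders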
            fd[PREFIX +
               PREVIOUS_WORD_LEVEL_INPUTS_TENSOR_NAME] = word_level_inputs
            fd[PREFIX +
               PREVIOUS_ASPECT_EMBEDDED_ENCODER_OUTPUT_TENSOR_NAME] = aspect_embedded_encoder_output
            fd[PREFIX +
               PREVIOUS_ASPECT_EMBEDDED_SENTENCE_INPUTS_TENSOR_NAME] = aspect_embedded_sentence_inputs
            fd[PREFIX +
               PREVIOUS_SENTENCE_ENCODER_OUTPUT_TENSOR_NAME] = birnn_output
            fd[PREFIX + ASPECTS_TENSOR_NAME] = fd[ASPECTS_TENSOR_NAME]
            fd[PREFIX +
               PADDED_REVIEWS_TENSOR_NAME] = fd[PADDED_REVIEWS_TENSOR_NAME]
            fd[PREFIX + ACTUAL_SENTENCE_COUNT_TENSOR_NAME] = fd[
                ACTUAL_SENTENCE_COUNT_TENSOR_NAME]
            fd[PREFIX + ACTUAL_WORD_COUNT_TENSOR_NAME] = fd[
                ACTUAL_WORD_COUNT_TENSOR_NAME]
            fd[PREFIX +
               SENTENCE_MASK_TENSOR_NAME] = fd[SENTENCE_MASK_TENSOR_NAME]
            fd[PREFIX + WORD_MASK_TENSOR_NAME] = fd[WORD_MASK_TENSOR_NAME]
            fd[PREFIX +
               PADDED_LABELS_TENSOR_NAME] = fd[PADDED_LABELS_TENSOR_NAME]
            fd[PREFIX +
               LABLE_WEIGHTS_TENSOR_NAME] = fd[LABLE_WEIGHTS_TENSOR_NAME]
            fd[PREFIX + IS_TRAINING_TENSOR_NAME] = fd[IS_TRAINING_TENSOR_NAME]

            # run session
            step, summaries, loss, accuracy, f1_score, f1_score_0, f1_score_1, f1_score_2, f1_score_3, \
            confusion_matrix, labels, predictions, label_weights, _ = session.run(
                [
                    ops['global_step'],
                    ops['summary_op'],
                    ops['loss'],
                    ops['accuracy'],
                    ops['f1_score'],
                    ops['f1_score_0'],
                    ops['f1_score_1'],
                    ops['f1_score_2'],
                    ops['f1_score_3'],
                    ops['confusion_matrix'],
                    ops['padded_labels'],
                    ops['predictions'],
                    ops['label_weights'],
                    ops['train_op']
                ], fd)

            train_writer.add_summary(summaries, global_step=step)
            td = time.clock() - t0

            if args.hyperparam_tune:
                reporter(f1_score=f1_score)

            if step % args.print_frequency == 0:
                train_confusion_matrix += confusion_matrix
                print(
                    'step %s, loss=%s, accuracy=%s, f1_score=%s, t=%s, inputs=%s'
                    % (step, loss, accuracy, f1_score, round(
                        td, 2), fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME].shape))
            if step != 0 and step % args.eval_frequency == 0:
                # run validation
                val_results = evaluate(session=session,
                                       ops=ops,
                                       previous_ops=previous_ops,
                                       dataset=val_loader(epochs=1))
                print_results(val_results, args, 'VALIDATION RESULTS',
                              val_accuracies, val_per_class_accuracies,
                              val_macro_f1_scores, val_per_class_f1_scores)
                # save a checkpoint if best f1 score
                if val_macro_f1_scores[-1] >= best_macro_f1_score:
                    best_macro_f1_score = val_macro_f1_scores[-1]
                    best_step_number = step
                    print('Best Macro F1 Score : %.2f' % best_macro_f1_score)
                    print('Best step at : ' + str(best_step_number))
                    saver.save(session, CHECKPOINT_PATH, global_step=step)
                    print('checkpoint saved')
                train_results = {
                    'loss': loss,
                    'accuracy': accuracy,
                    'f1_score': f1_score,
                    'confusion_matrix': train_confusion_matrix
                }
                print_results(train_results, args, 'TRAINING RESULTS',
                              train_accuracies, train_per_class_accuracies,
                              train_macro_f1_scores, train_per_class_f1_scores)
                # reset train confusion matrix
                train_confusion_matrix = np.zeros(shape=(args.num_classes,
                                                         args.num_classes),
                                                  dtype=np.int32)

        val_per_class_accuracies = np.asarray(val_per_class_accuracies)
        train_per_class_accuracies = np.asarray(train_per_class_accuracies)
        val_per_class_f1_scores = np.asarray(val_per_class_f1_scores)
        train_per_class_f1_scores = np.asarray(train_per_class_f1_scores)

        plot_accuracy(val_accuracies, train_accuracies, title='Accuracy')
        plot_accuracy(val_per_class_accuracies[:, 0],
                      train_per_class_accuracies[:, 0],
                      title='Accuracy Class 0 Positive Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 1],
                      train_per_class_accuracies[:, 1],
                      title='Accuracy Class 1 Negative Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 2],
                      train_per_class_accuracies[:, 2],
                      title='Accuracy Class 2 Neutral Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 3],
                      train_per_class_accuracies[:, 3],
                      title='Accuracy Class 3 Not Applicable Sentiment')

        plot_f1_score(val_macro_f1_scores,
                      train_macro_f1_scores,
                      title='Macro F1 Score')
        plot_f1_score(val_per_class_f1_scores[:, 0],
                      train_per_class_f1_scores[:, 0],
                      title='F1 Score Class 0 Positive Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 1],
                      train_per_class_f1_scores[:, 1],
                      title='F1 Score Class 1 Negative Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 2],
                      train_per_class_f1_scores[:, 2],
                      title='F1 Score Class 2 Neutral Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 3],
                      train_per_class_f1_scores[:, 3],
                      title='F1 Score Class 3 Not Applicable Sentiment')

        return best_step_number
Example #3
def evaluate(session, ops, dataset):
    losses = []
    accuracies = []
    f1_scores = []
    masked_predictions = []
    aspects = []
    ground_truths = []
    masks = []
    cm = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)
    test_metrics = {}
    if args.mode == 'test':
        n_sentiment_classes = args.num_classes
        n_aspect = len(GERMEVAL_ASPECT_WORD_INDEX_MAP) - 1
        n_total_classes = n_aspect * (n_sentiment_classes - 1) + 1
        n_multilabel_success = 0
        n_multilabel_failure = 0
        n_sentence = 0
        args.batch_size = n_aspect

        per_aspect_sentiments_cm = np.zeros(shape=(n_aspect, n_sentiment_classes, n_sentiment_classes),
                                            dtype=np.int32)
        per_aspect_aspect_detection_cm = np.zeros(shape=(n_aspect + 1, 2, 2), dtype=np.int32)
        joint_aspect_sentiment_cm = np.zeros(shape=(n_total_classes, 2, 2), dtype=np.int32)
    for x1, x2, y in batch_iterator(dataset, args.batch_size, 1):
        # get feed_dicts
        fd = get_feed_data(x1, x2, y, is_training=False, args=args)

        # run evaluation
        val_accuracy, loss, f1_score, confusion_matrix, masked_prediction = session.run(
            [ops['accuracy'], ops['loss'], ops['f1_score'], ops['confusion_matrix'], ops['masked_predictions']], fd)

        losses.append(loss)
        accuracies.append(val_accuracy)
        f1_scores.append(f1_score)
        cm += confusion_matrix
        masked_predictions.append(masked_prediction)
        aspects.append(x1)
        ground_truths.append(y)
        masks.append(fd[SENTENCE_MASK_TENSOR_NAME])
        if args.mode == 'test':
            eval_results = evaluation_metrics(fd[ASPECTS_TENSOR_NAME], fd[PADDED_REVIEWS_TENSOR_NAME],
                                              fd[PADDED_LABELS_TENSOR_NAME], masked_prediction,
                                              GERMEVAL_ASPECT_WORD_INDEX_MAP)
            per_aspect_sentiments_cm += eval_results['per_aspect_sentiments_cm']
            per_aspect_aspect_detection_cm += eval_results['per_aspect_aspect_detection_cm']
            joint_aspect_sentiment_cm += eval_results['joint_aspect_sentiment_cm']
            n_multilabel_success += eval_results['n_multilabel_success']
            n_multilabel_failure += eval_results['n_multilabel_failure']
            n_sentence += eval_results['count']
            test_metrics = {
                'per_aspect_sentiments_cm': per_aspect_sentiments_cm,
                'per_aspect_aspect_detection_cm': per_aspect_aspect_detection_cm,
                'joint_aspect_sentiment_cm': joint_aspect_sentiment_cm,
                'n_multilabel_success': n_multilabel_success,
                'n_multilabel_failure': n_multilabel_failure,
                'n_sentence': n_sentence
            }

    df = {'loss': losses,
          'accuracy': accuracies,
          'f1_score': f1_scores,
          'confusion_matrix': cm,
          'masked_predictions': masked_predictions,
          'aspects': aspects,
          'ground_truths': ground_truths,
          'masks': masks,
          'test_metrics': test_metrics
          }
    return df