Example #1
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint

# Project-specific classes and constants used below are assumed to be imported
# from the surrounding project.
def main_training():
    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    tr_tweets_loader = LabeledTweetsLoader(TRAINING_INPUT_FILENAME)
    tr_labeled_tweets = tr_tweets_loader.parse_tokens_and_labels(
        tr_tweets_loader.load_lines())

    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)

    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    tr_feature_vectors = []  # list of per-tweet feature vectors; stacked into a 2D array below
    for labeled_tweet in tr_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        tr_feature_vectors.append(feature_vector)
    tr_network_input = np.array(tr_feature_vectors)
    tr_targets = [labeled_tweet[1] for labeled_tweet in tr_labeled_tweets]
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_targets)

    dev_tweets_loader = LabeledTweetsLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_tweets_loader.parse_tokens_and_labels(
        dev_tweets_loader.load_lines())
    dev_feature_vectors = []  # list of per-tweet feature vectors; stacked into a 2D array below
    for labeled_tweet in dev_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        dev_feature_vectors.append(feature_vector)
    dev_network_input = np.array(dev_feature_vectors)
    dev_targets = [labeled_tweet[1] for labeled_tweet in dev_labeled_tweets]
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_targets)

    # Every epoch is cheap (< 1ms), so we don't need the ability to continue training from a previous model.
    print("Commencing new training run")
    model_creator = ModelCreator(vu)
    model = model_creator.create_two_dense_model(hidden_layer_size=HIDDEN_SIZE)

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    model.fit(tr_network_input,
              tr_targets_one_hot_encoded,
              batch_size=32,
              epochs=MAX_EPOCHS,
              validation_data=(dev_network_input, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])
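
ModelCreator.create_two_dense_model is project-specific and not shown here. The following is only a sketch of what such a model plausibly looks like, assuming a small fixed-length feature vector (3 features, matching the reshape in Example #2), a softmax output over vu.get_output_vocab_size() classes, and categorical cross-entropy with an accuracy metric (consistent with the one-hot targets and the val_accuracy monitor above); the real hidden size, optimizer, and regularization may differ.

import tensorflow as tf

def create_two_dense_model_sketch(input_dim: int,
                                  hidden_layer_size: int,
                                  output_dim: int) -> tf.keras.Model:
    # Two fully connected layers: a hidden layer and a softmax over the output classes.
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(output_dim, activation='softmax'),
    ])
    # One-hot targets imply categorical cross-entropy; the 'accuracy' metric supplies
    # the val_accuracy value referenced by the checkpoint filename pattern.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

Because save_best_only=False, the ModelCheckpoint callback writes one file per epoch to BASE_DIR, named e.g. ep_3_valacc_0.71250.h5.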
Example #2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tqdm import tqdm

# Project-specific classes and constants used below are assumed to be imported
# from the surrounding project.
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')

    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    trained_model.summary()

    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)
    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    for input_filename in [DEV_INPUT_FILENAME]:
        tweets_loader = LabeledTweetsLoader(input_filename)
        labeled_tweets = tweets_loader.parse_tokens_and_labels(
            tweets_loader.load_lines())
        feature_vectors = []  # list of per-tweet feature vectors; stacked into a 2D array below
        for labeled_tweet in labeled_tweets:
            known_token_sequence = token_summarizer.get_known_tokens(
                labeled_tweet[0])
            feature_vector = feature_extractor.compute_feature_vector(
                known_token_sequence)
            feature_vectors.append(feature_vector)
        network_input = np.array(feature_vectors)
        print('network_input.shape:', network_input.shape)
        targets = [labeled_tweet[1] for labeled_tweet in labeled_tweets]
        targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
            targets)

        trained_model.evaluate(network_input, targets_one_hot_encoded)

        argmax_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()),
            dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()))

        expected_sampling_accuracy_sum = 0.0
        num_correct_argmax_predictions = 0
        for rectangular_input, target_human in tqdm(
                zip(network_input, targets), total=len(targets)):
            # Each feature vector has a fixed length (3 in this project); reshape it
            # into a batch of size one for the model call.
            rectangular_input = rectangular_input.reshape(1, -1)
            target_index = vu.nn_rsl_to_int[target_human]
            predicted_probabilities = trained_model(rectangular_input)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # The probability assigned to target_index is the chance of guessing it if we
            # sample from the predicted distribution (see the worked example after this function).
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(vu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][
                    i] += predicted_probabilities[i]
        num_tweets_in_dataset = len(targets)

        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)

        print(
            f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(
            f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)