Example #1
    def test_loads_expected_number_of_tweets(self):
        # The dev set serves as the test set, so we optimize only on this training set.
        loader = LabeledDataLoader('../data/pos/train.conll')
        train_tweets_with_labeled_tokens = loader.parse_raw_tokens_and_labels(loader.load_lines())
        self.assertEqual(27893, len(train_tweets_with_labeled_tokens))

        # The test data is NOT labeled, so this dev set serves as the test set.
        loader = LabeledDataLoader('../data/pos/dev.conll')
        dev_tweets_with_labeled_tokens = loader.parse_raw_tokens_and_labels(loader.load_lines())
        self.assertEqual(4298, len(dev_tweets_with_labeled_tokens))
Example #2
    def test_has_one_of_three_labels(self):
        # The dev set serves as the test set, so we optimize only on this training set.
        loader = LabeledDataLoader('../data/sa/train.conll')
        train_tweets_with_labels = loader.parse_tokens_and_labels(
            loader.load_lines())
        train_labels_set = {
            labeled_tweet[1] for labeled_tweet in train_tweets_with_labels}
        self.assertEqual({'positive', 'negative', 'neutral'}, train_labels_set)

        loader = LabeledDataLoader('../data/sa/dev.conll')
        dev_tweets_with_labels = loader.parse_tokens_and_labels(
            loader.load_lines())
        dev_labels_set = {
            labeled_tweet[1] for labeled_tweet in dev_tweets_with_labels}
        self.assertEqual({'positive', 'negative', 'neutral'}, dev_labels_set)
Example #3
def create_vocab_util_from_training_set(tr_input_filename: str) -> VocabUtil:
    """To keep things simple, we use the entire training set without filtering."""
    tr_loader = LabeledDataLoader(tr_input_filename)
    tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_unique_tokens = {item[0] for tweet in tr_tweets for item in tweet}
    sorted_tr_tokens = sorted(tr_unique_tokens)
    return VocabUtil(sorted_tr_tokens)
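
A note on this helper: the VocabUtil class itself does not appear in these examples, only its usage (nn_input_token_to_int, nn_pos_to_int, and the '<OOV>' entry in the later examples). A minimal, hypothetical sketch consistent with that usage might look like the following; the reserved indices and attribute layout are assumptions, not the project's actual implementation.

# Hypothetical sketch of VocabUtil, inferred from how the examples use it.
class VocabUtil:
    def __init__(self, sorted_input_tokens):
        # Assumption: index 0 is reserved for padding, index 1 for out-of-vocabulary tokens.
        self.nn_input_tokens = ['<PAD>', '<OOV>'] + list(sorted_input_tokens)
        self.nn_input_token_to_int = {
            token: i for i, token in enumerate(self.nn_input_tokens)}

        # Placeholder tag inventory; the real POS tag set comes from the corpus.
        self.nn_pos_tuple = ('<PAD>', 'N', 'V', 'A', 'O')
        self.nn_pos_to_int = {tag: i for i, tag in enumerate(self.nn_pos_tuple)}
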
Example #4
    def test_loads_expected_number_of_tokens(self):
        # The dev set serves as the test set, so we optimize only on this training set.
        loader = LabeledDataLoader('../data/pos/train.conll')
        train_tweets_with_labeled_tokens = loader.parse_tokens_and_labels(
            loader.load_lines())
        self.assertEqual(
            217068,
            sum((len(tweet) for tweet in train_tweets_with_labeled_tokens)))

        # The test data is NOT labeled, so this dev set serves as the test set.
        loader = LabeledDataLoader('../data/pos/dev.conll')
        dev_tweets_with_labeled_tokens = loader.parse_tokens_and_labels(
            loader.load_lines())
        self.assertEqual(
            33345, sum(
                (len(tweet) for tweet in dev_tweets_with_labeled_tokens)))
Example #5
def create_vocab_util_from_training_set(tr_input_filename: str) -> VocabUtil:
    tr_loader = LabeledDataLoader(tr_input_filename)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(
        tr_loader.load_lines())
    tr_unique_tokens = {
        token for labeled_tweet in tr_labeled_tweets
        for token in labeled_tweet[0]
    }
    tr_sorted_tokens = sorted(tr_unique_tokens)
    print(f"Creating VocabUtil from {len(tr_sorted_tokens)} unique tokens.")
    return VocabUtil(tr_sorted_tokens)
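
The two versions of create_vocab_util_from_training_set iterate different data shapes: in the POS-tagging setup a parsed tweet is a sequence of (token, tag) pairs, while in the sentiment setup a labeled tweet is a (token_list, label) pair. A tiny illustration with made-up values:

# Made-up values illustrating the two shapes the comprehensions above assume.
pos_tweet = [('I', 'O'), ('love', 'V'), ('it', 'O')]    # Example #3: item[0] is a token
sa_labeled_tweet = (['I', 'love', 'it'], 'positive')    # Example #5: labeled_tweet[0] is the token list

pos_tokens = {item[0] for item in pos_tweet}            # {'I', 'love', 'it'}
sa_tokens = set(sa_labeled_tweet[0])                    # {'I', 'love', 'it'}
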
Example #6
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(
        f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens'
    )
    tr_irregular_inputs = [[
        vu.nn_input_token_to_int[item[0]] for item in tweet
    ] for tweet in tr_tweets]
    tr_irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet]
                            for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(
        tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = LstmModelCreator(vu,
                                         embedding_dim=EMBEDDING_DIM,
                                         lstm_dim=LSTM_DIM,
                                         mask_zero=MASK_ZERO)
        model = model_creator.create_bi_lstm_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    rectangular_inputs, _, targets_one_hot_encoded = \
        prep_validation_set(DEV_INPUT_FILENAME, nn_input_preparer, vu)

    model.fit(x=tr_rectangular_inputs,
              y=tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(rectangular_inputs, targets_one_hot_encoded),
              callbacks=[checkpoint])
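
The NNInputPreparer methods used above (rectangularize_inputs, rectangularize_targets) are not shown in these examples. Assuming they simply zero-pad the ragged integer sequences to MAX_SEQ_LEN, a plausible stand-in is Keras' pad_sequences; this is a sketch under that assumption, not the project's actual code.

# Hypothetical stand-in for NNInputPreparer.rectangularize_inputs: zero-pad
# (or truncate) each sequence of token ids to a fixed length.
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQ_LEN = 128  # placeholder value

def rectangularize(irregular_sequences) -> np.ndarray:
    return pad_sequences(irregular_sequences, maxlen=MAX_SEQ_LEN,
                         padding='post', truncating='post', value=0)

# e.g. rectangularize([[5, 7, 2], [9]]) -> int array of shape (2, MAX_SEQ_LEN)
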
Example #7
def prep_validation_set(input_filename: str,
                        nn_input_preparer: NNInputPreparer, vu: VocabUtil,
                        upsample: bool):
    loader = LabeledDataLoader(input_filename)
    labeled_tweets = loader.parse_tokens_and_labels(loader.load_lines())
    labeled_tweets = nn_input_preparer.filter_out_long_tweets(labeled_tweets)
    if upsample:
        labeled_tweets = nn_input_preparer.crude_upsample(labeled_tweets)
    irregular_inputs = [[
        vu.nn_input_token_to_int[token] if token in vu.nn_input_token_to_int
        else vu.nn_input_token_to_int['<OOV>'] for token in labeled_tweet[0]
    ] for labeled_tweet in labeled_tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        irregular_inputs)
    rectangular_targets = [tweet[1] for tweet in labeled_tweets]
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded
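
A call site for this upsampling-aware version might look like the sketch below; the surrounding names (DEV_INPUT_FILENAME, nn_input_preparer, vu, model, the checkpoint callback, and the training tensors) are assumed to be set up as in the training example above.

# Hypothetical call site: prepare the dev set without upsampling and use it
# as validation data during training.
dev_inputs, dev_targets, dev_targets_one_hot = prep_validation_set(
    DEV_INPUT_FILENAME, nn_input_preparer, vu, upsample=False)

model.fit(x=tr_rectangular_inputs,
          y=tr_targets_one_hot_encoded,
          validation_data=(dev_inputs, dev_targets_one_hot),
          epochs=MAX_EPOCHS,
          callbacks=[checkpoint])
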
Example #8
def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    loader = LabeledDataLoader(input_filename)
    tweets = loader.parse_tokens_and_labels(loader.load_lines())
    tweets = nn_input_preparer.filter_out_long_sequences(tweets)
    print(
        f'processing all {len(tweets)} not-too-long tweets from {input_filename}'
    )
    irregular_inputs = [[
        vu.nn_input_token_to_int[item[0]] if item[0]
        in vu.nn_input_token_to_int else vu.nn_input_token_to_int['<OOV>']
        for item in tweet
    ] for tweet in tweets]
    irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet]
                         for tweet in tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        irregular_inputs)
    rectangular_targets = nn_input_preparer.rectangularize_targets(
        irregular_targets)
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        rectangular_targets)
    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded
Example #9
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TR_INPUT_FILENAME)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(
        tr_loader.load_lines())
    tr_labeled_tweets = btc.convert(tr_labeled_tweets)
    tr_labeled_tweets = btc.convert_to_ids(tr_labeled_tweets)
    tr_labeled_tweets = btc.prepend_cls(tr_labeled_tweets)
    tr_labeled_tweets = nn_input_preparer.filter_out_long_sequences(
        tr_labeled_tweets)
    print(
        f'Processing all {len(tr_labeled_tweets)} not-too-long tweets from {TR_INPUT_FILENAME}'
    )
    tr_irregular_inputs = [tweet[0] for tweet in tr_labeled_tweets]
    tr_rectangular_targets = [tweet[1] for tweet in tr_labeled_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                         tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_loader.parse_tokens_and_labels(
        dev_loader.load_lines())
    dev_labeled_tweets = btc.convert(dev_labeled_tweets)
    dev_labeled_tweets = btc.convert_to_ids(dev_labeled_tweets)
    dev_labeled_tweets = btc.prepend_cls(dev_labeled_tweets)
    dev_labeled_tweets = nn_input_preparer.filter_out_long_sequences(
        dev_labeled_tweets)
    print(
        f'Processing all {len(dev_labeled_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}'
    )
    dev_irregular_inputs = [tweet[0] for tweet in dev_labeled_tweets]
    dev_rectangular_targets = [tweet[1] for tweet in dev_labeled_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        dev_irregular_inputs)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_rectangular_targets)

    model.fit(tr_rectangular_inputs,
              tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs,
                               dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)
Example #10
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')
    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_raw_tokens_and_labels(tr_loader.load_lines())

    tvu = TargetVocabUtil()

    tokenizer = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                   tvu=tvu)
    tr_tweets = tokenizer.convert_to_tokens(tr_tweets)
    tr_tweets = tokenizer.convert_to_ids(tr_tweets)

    nn_input_preparer = NNInputPreparer(tvu, max_seq_len=MAX_SEQ_LEN)
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)

    print(
        f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens'
    )

    tr_irregular_inputs = [[item[0] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[item[1] for item in tweet] for tweet in tr_tweets]

    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(
        tr_irregular_targets)

    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE,
                           custom_objects={"BertModelLayer": BertModelLayer})
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                         tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN,
                                         freeze_bert_layer=True)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_tweets = dev_loader.parse_raw_tokens_and_labels(
        dev_loader.load_lines())
    dev_tweets = tokenizer.convert_to_tokens(dev_tweets)
    dev_tweets = tokenizer.convert_to_ids(dev_tweets)
    dev_tweets = nn_input_preparer.filter_out_long_sequences(dev_tweets)
    print(
        f'processing all {len(dev_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}'
    )
    dev_irregular_inputs = [[item[0] for item in tweet]
                            for tweet in dev_tweets]
    # print('proportion of non-pad in dev:', sum(len(tweet) for tweet in dev_tweets) / (MAX_SEQ_LEN * len(dev_tweets)))
    dev_irregular_targets = [[item[1] for item in tweet]
                             for tweet in dev_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        dev_irregular_inputs)
    dev_rectangular_targets = nn_input_preparer.rectangularize_targets(
        dev_irregular_targets)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_rectangular_targets)

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    model.fit(tr_rectangular_inputs,
              tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs,
                               dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)
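
BertModelCreator is constructed here with freeze_bert_layer=True but is not shown. In Keras, freezing a layer generally means setting its trainable attribute to False before compiling; the snippet below is a self-contained, hypothetical illustration of that mechanism using a stand-in dense layer rather than the real BERT layer.

# Hypothetical illustration of layer freezing in Keras (not the project's BertModelCreator).
import tensorflow as tf

inputs = tf.keras.Input(shape=(16,))
frozen_layer = tf.keras.layers.Dense(8, name='pretrained_stand_in')
frozen_layer.trainable = False  # its weights are excluded from gradient updates
outputs = tf.keras.layers.Dense(3, activation='softmax')(frozen_layer(inputs))

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()  # reports the frozen weights under "Non-trainable params"
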
Example #11
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(
        TRAINING_MODEL_FILENAME,
        custom_objects={"BertModelLayer": BertModelLayer})
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_tokens_and_labels(loader.load_lines())
        tweets = btc.convert(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = btc.prepend_cls(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(
            f'Processing all {len(tweets)} not-too-long tweets from {input_filename}'
        )

        irregular_inputs = [tweet[0] for tweet in tweets]
        rectangular_targets = [tweet[1] for tweet in tweets]

        argmax_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()),
            dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()))

        num_correct_argmax_predictions = 0
        expected_sampling_accuracy_sum = 0.0

        for irregular_input, target_index in tqdm(
                zip(irregular_inputs, rectangular_targets)):
            rectangular_input_singleton = nn_input_preparer.rectangularize_inputs(
                [irregular_input])
            predicted_probabilities = trained_model(
                rectangular_input_singleton)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1

            # rhs is the probability of guessing target if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(tvu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][
                    i] += predicted_probabilities[i]

        num_tweets_in_dataset = len(rectangular_targets)

        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)

        print(
            f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{tvu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(
            f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{tvu.raw_sentiment_labels}\n",
            expected_sampling_confusion_matrix)
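
The "expected sampling accuracy" accumulated above is simply the model's predicted probability of the true class, averaged over tweets, i.e. the chance of being right if we sampled a label from the predicted distribution instead of taking the argmax. A toy calculation with made-up numbers:

# Toy illustration (made-up numbers) of the two accuracy notions used above.
import numpy as np

predicted_probabilities = np.array([0.2, 0.5, 0.3])  # model output for one tweet
target_index = 2                                     # true class

argmax_correct = int(np.argmax(predicted_probabilities) == target_index)  # 0 here
expected_sampling_accuracy = predicted_probabilities[target_index]        # 0.3 here
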
Example #12
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(
        TRAINING_MODEL_FILENAME,
        custom_objects={"BertModelLayer": BertModelLayer})

    print('Loaded fine-tuned model:')
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(BERT_PRETRAINED_MODEL_DIR, tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_raw_tokens_and_labels(loader.load_lines())
        tweets = btc.convert_to_tokens(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(
            f'processing all {len(tweets)} not-too-long tweets from {input_filename}'
        )
        irregular_inputs = [[item[0] for item in tweet] for tweet in tweets]
        irregular_targets = [[item[1] for item in tweet] for tweet in tweets]

        num_tokens_in_dataset = 0
        num_token_level_correct_argmax_predictions = 0
        num_token_level_correct_argmax_predictions_incl_pads = 0
        tweet_level_argmax_accuracy_sum = 0.0
        tweet_level_expected_sampling_accuracy_sum = 0.0
        token_level_expected_sampling_accuracy_sum = 0.0
        token_level_expected_sampling_accuracy_sum_incl_pads = 0.0

        for (irregular_input, irregular_target_indices) in tqdm(
                zip(irregular_inputs, irregular_targets)):
            rectangular_inputs = nn_input_preparer.rectangularize_inputs(
                [irregular_input])
            rectangular_targets = nn_input_preparer.rectangularize_targets(
                [irregular_target_indices])
            num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0
            current_tweet_expected_sampling_accuracy_sum = 0.0
            predicted_probabilities_sequence = trained_model(
                rectangular_inputs)
            for predicted_probabilities, target_index in zip(
                    predicted_probabilities_sequence[0],
                    rectangular_targets[0]):
                # the predicted index if we take the class with the largest probability
                argmax_index = np.argmax(predicted_probabilities)
                # probability of guessing target_index if we sample according to predicted probabilities
                prob_sampling_success_on_token = tf.keras.backend.get_value(
                    predicted_probabilities[target_index])
                if argmax_index == target_index:
                    num_token_level_correct_argmax_predictions_incl_pads += 1
                token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token

                if target_index != 0:
                    if argmax_index == target_index:
                        num_token_level_correct_argmax_predictions += 1
                        num_current_tweet_correct_argmax_predictions += 1
                    current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    num_tokens_in_current_tweet += 1
                    num_tokens_in_dataset += 1

            # every tweet has at least one non-padding token, so we don't worry about division by zero
            current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet
            current_tweet_expected_sampling_accuracy = \
                current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet

            tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy
            tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy

        num_tweets_in_dataset = len(tweets)
        num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * num_tweets_in_dataset

        print(
            f'Argmax accuracy for {input_filename} including padding:',
            num_token_level_correct_argmax_predictions_incl_pads /
            num_tokens_in_dataset_incl_pads)
        print(
            f'Expected sampling accuracy for {input_filename} including padding:',
            token_level_expected_sampling_accuracy_sum_incl_pads /
            num_tokens_in_dataset_incl_pads)

        print(
            f'Token-level argmax accuracy for {input_filename}:',
            num_token_level_correct_argmax_predictions / num_tokens_in_dataset)
        print(
            f'Token-level expected sampling accuracy for {input_filename}:',
            token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset)

        print(f'Tweet-level argmax accuracy for {input_filename}:',
              tweet_level_argmax_accuracy_sum / num_tweets_in_dataset)
        print(
            f'Tweet-level expected sampling accuracy for {input_filename}:',
            tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)
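
The target_index != 0 check above treats label index 0 as padding. Assuming that convention, the token-level argmax accuracy computed in the loop can also be expressed with a vectorized NumPy mask; this is an equivalent sketch with made-up values, not a drop-in replacement for the loop (which also accumulates the sampling-based and tweet-level metrics).

# Hedged sketch: token-level argmax accuracy ignoring padding, assuming
# label index 0 is the padding class as in the loop above.
import numpy as np

predicted_probs = np.array([[0.1, 0.2, 0.7],    # one tweet: 4 token positions,
                            [0.6, 0.3, 0.1],    # 3 output classes (made-up values)
                            [0.8, 0.1, 0.1],
                            [0.9, 0.05, 0.05]])
target_indices = np.array([2, 1, 0, 0])         # trailing zeros are padding

mask = target_indices != 0                      # keep only real tokens
argmax_indices = predicted_probs.argmax(axis=-1)
token_level_argmax_accuracy = (argmax_indices[mask] == target_indices[mask]).mean()
print(token_level_argmax_accuracy)              # 0.5 for these made-up values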