Example #1
    def add_predictions(self, dataset, name="set", logs={}):
        X = dataset[0]
        y = dataset[1]

        if self.regression:
            # Regression: keep the raw predictions, reshaped to the target shape.
            y_pred = self.model.predict(X, batch_size=2048, verbose=0)
            y_pred = numpy.reshape(y_pred, y.shape)
            y_test = y

        else:
            # Test whether the labels are one-hot (categorical) or singular.
            if len(y.shape) > 1:
                try:
                    y_pred = self.model.predict_classes(X,
                                                        batch_size=2048,
                                                        verbose=0)
                except Exception:
                    # Fall back to a helper that maps the raw probabilities
                    # to class labels.
                    y_pred = self.predic_classes(
                        self.model.predict(X, batch_size=2048, verbose=0))

                y_test = onehot_to_categories(y)

            else:
                # Binary labels: threshold the single sigmoid output at 0.5.
                y_pred = self.model.predict(X, batch_size=2048, verbose=0)
                y_pred = numpy.array([int(_y > 0.5) for _y in y_pred])
                y_test = y

        # Compute every registered metric and record it in the Keras logs dict.
        for k, metric in self.metrics.items():
            score = numpy.squeeze(metric(y_test, y_pred))
            entry = ".".join([name, k])
            self.params['metrics'].append(entry)
            logs[entry] = score
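
The helper `onehot_to_categories` is defined elsewhere in the project and is not shown on this page. A minimal sketch of what it presumably does, assuming it simply maps one-hot (or probability) rows back to integer class indices:

import numpy

def onehot_to_categories(y_onehot):
    # Assumed behaviour: map each row of a (n_samples, n_classes) one-hot or
    # probability matrix back to its integer class index.
    return numpy.argmax(numpy.asarray(y_onehot), axis=-1)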
Example #2
                                   mode="min",
                                   verbose=1,
                                   save_best_only=True)

_callbacks = []
_callbacks.append(metrics_callback)
_callbacks.append(plotting)
_callbacks.append(weights)

############################################################################
# APPLY CLASS WEIGHTS
############################################################################
if TASK == "BD":
    # Labels for this task are passed as-is (not one-hot); no smoothing.
    class_weights = get_class_weights2(training[1], smooth_factor=0)
else:
    # One-hot labels are converted back to categories first, and a small
    # smoothing factor is applied to the resulting weights.
    class_weights = get_class_weights2(onehot_to_categories(training[1]),
                                       smooth_factor=0.1)

print("Class weights:",
      {cat_to_class_mapping[c]: w
       for c, w in class_weights.items()})

# Note: `nb_epoch` is the Keras 1.x argument name; Keras 2 renamed it to `epochs`.
history = nn_model.fit(training[0],
                       training[1],
                       validation_data=((validation[0], validation[1])
                                        if not FINAL
                                        else (testing[0], testing[1])),
                       nb_epoch=50,
                       batch_size=64,
                       class_weight=class_weights,
                       callbacks=_callbacks)
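
`get_class_weights2` is another project helper that is not shown here. A plausible sketch, assuming it returns inverse-frequency class weights with optional additive smoothing (the exact formula in the project may differ):

from collections import Counter

def get_class_weights2(y, smooth_factor=0):
    # Assumed behaviour: weight each class by majority_count / class_count so
    # that rare classes get larger weights; `smooth_factor` adds a fraction of
    # the majority count to every class to keep the weights from exploding.
    counts = Counter(y)
    if smooth_factor > 0:
        pad = max(counts.values()) * smooth_factor
        for cls in counts:
            counts[cls] += pad
    majority = max(counts.values())
    return {cls: majority / count for cls, count in counts.items()}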
Example #3
    def run(self,
            train,
            test,
            features=None,
            test_features=None,
            extra_train=None,
            callbacks=True):
        self.tokenizer.fit_on_texts(train.text.values)

        features_dim = features.shape[1] if features is not None else None

        X_train, Y_train = self.get_features_targets(train)
        X_test, Y_test = self.get_features_targets(
            test, features_dim=X_train.shape[1])

        if extra_train is not None:
            # Extend the tokenizer vocabulary with the extra texts and make
            # sure all sequence matrices share the same (longest) length.
            self.tokenizer.fit_on_texts(extra_train.text.values)
            X_extra_train, Y_extra_train \
                = self.get_features_targets(extra_train)
            if X_extra_train.shape[1] > X_train.shape[1]:
                X_train = pad_sequences(X_train, maxlen=X_extra_train.shape[1])
                X_test = pad_sequences(X_test, maxlen=X_extra_train.shape[1])

        # +1 because Keras tokenizer indices start at 1 (0 is reserved for padding).
        vocab_size = len(self.tokenizer.word_index) + 1

        class_count = 3 if self.ternary else 2

        embedding_matrix = None

        if self.use_embeddings:
            embedding_manager = EmbeddingManager()
            embedding_matrix = embedding_manager.get_embedding_matrix(
                self.tokenizer.word_index, self.embedding_dim)

        base_model_params = {
            'input_dim': X_train.shape[1],
            'class_count': class_count,
            'features_dim': features_dim,
            'dropout': self.dropout
        }

        if self.model_type == "elmo":
            params = {
                **base_model_params, 'index_word': self.tokenizer.index_word
            }
            self.model = ElmoModel().compile(**params)
        elif self.model_type == "bid_attent":
            params = {
                **base_model_params, 'vocab_size': vocab_size,
                'embedding_matrix': embedding_matrix,
                'embedding_dim': self.embedding_dim
            }
            self.model = BidirectionalAttention().compile(**params)
        else:
            params = {
                **base_model_params, 'vocab_size': vocab_size,
                'embedding_matrix': embedding_matrix,
                'embedding_dim': self.embedding_dim
            }
            self.model = BaselineWithFeatures().compile(**params)

        self.logger.setup(ternary=self.ternary,
                          embeddings=self.use_embeddings,
                          train_set=X_train,
                          test_set=X_test,
                          vocab_size=vocab_size,
                          epochs=self.epochs,
                          batch_size=self.batch_size,
                          dropout=self.dropout,
                          extra_train=extra_train is not None)

        self.model.summary(print_fn=self.logger.write)

        fit_params = {
            'batch_size': self.batch_size,
            'callbacks': self.get_callbacks() if callbacks else [],
            'epochs': self.epochs,
            'validation_split': self.validation_split,
            'verbose': 1,
            'class_weight': get_class_weights2(onehot_to_categories(Y_train),
                                               smooth_factor=0)
        }

        if extra_train is not None:
            # Pre-train on the extra data before fitting on the main training set.
            training = self.model.fit(X_extra_train, Y_extra_train,
                                      **fit_params)
            self.logger.write_history(training)

        train_input = [X_train, features] if features is not None else X_train
        test_input = ([X_test, test_features]
                      if features is not None else X_test)

        training = self.model.fit(train_input, Y_train, **fit_params)
        pred_classes = self.model.predict(test_input, verbose=1).argmax(axis=1)

        self.logger.write_history(training)
        self.print_results(pred_classes, Y_test, class_count=class_count)
        self.save_output_for_scoring(test.tweet_id, pred_classes)
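
The method `get_features_targets` used above is also not shown. A rough sketch of what it might look like, assuming it turns the tweet texts into padded token-index sequences with the already-fitted tokenizer and one-hot encodes the labels; the `label` column name and integer label encoding are assumptions:

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

    def get_features_targets(self, data, features_dim=None):
        # Assumed behaviour: tokenize the texts with the fitted tokenizer and
        # pad them to a common length. When `features_dim` is given (as for
        # the test set above), pad to that length so train and test line up.
        sequences = self.tokenizer.texts_to_sequences(data.text.values)
        X = pad_sequences(sequences, maxlen=features_dim)
        Y = to_categorical(data.label.values)  # 'label' column is an assumption
        return X, Y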