Example #1
import matplotlib.pyplot as plt


def display_image_grid(images, titles, num_rows, num_cols, cmap='gray'):
    # Reconstructed enclosure: the original fragment started inside this
    # nested loop, so the function name and signature are assumptions.
    figure, axis_array = plt.subplots(num_rows, num_cols)
    image_arg = 0
    for row_arg in range(num_rows):
        for col_arg in range(num_cols):
            image = images[image_arg]
            axis_array[row_arg, col_arg].axis('off')
            axis_array[row_arg, col_arg].imshow(image, cmap=cmap)
            axis_array[row_arg, col_arg].set_title(titles[image_arg])
            image_arg = image_arg + 1
    plt.tight_layout()


if __name__ == '__main__':
    # from utils.data_manager import DataManager
    from utils.utils import get_labels
    # from keras.models import load_model  # unused while the model code below stays commented out
    import pickle

    # dataset_name = 'fer2013'
    # model_path = '../trained_models/emotion_models/simple_CNN.985-0.66.hdf5'
    dataset_name = 'fer2013'
    class_decoder = get_labels(dataset_name)
    # data_manager = DataManager(dataset_name)
    # faces, emotions = data_manager.get_data()
    with open('faces.pkl', 'rb') as faces_file:
        faces = pickle.load(faces_file)
    with open('emotions.pkl', 'rb') as emotions_file:
        emotions = pickle.load(emotions_file)
    # pretty_imshow and make_mosaic are helpers defined earlier in this module.
    pretty_imshow(plt.gca(), make_mosaic(faces[:4], 2, 2), cmap='gray')
    plt.show()
    """
    image_arg = 0
    face = faces[image_arg:image_arg + 1]
    emotion = emotions[image_arg:image_arg + 1]
    display_image(face, emotion, class_decoder)
    plt.show()

    normal_imshow(plt.gca(), make_mosaic(faces[:4], 3, 3), cmap='gray')
    plt.show()
    """
import json
import logging
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance

# get_labels and scoring are project-local helpers; this import path is an
# assumption.
from utils import get_labels, scoring


def main(hparams):
    logger_name = "SVM - Cross Validation"
    logger = logging.getLogger(logger_name)
    # Clear handlers left over from a previous call so log lines are not
    # duplicated.
    logger.handlers.clear()
    logger.setLevel(logging.INFO)

    stream_handler = logging.StreamHandler(sys.stdout)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(log_format)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    metrics = {
        "eval": {
            "acc": [],
            "f1": [],
            "precision": [],
            "recall": [],
            "mcc": []
        },
        "test": {
            "acc": [],
            "f1": [],
            "precision": [],
            "recall": [],
            "mcc": []
        }
    }

    logger.info("Hyperparameter:")

    print(json.dumps(vars(hparams), indent=4))

    df_test = pd.read_csv("{}/gold-standard-testset.csv".format(
        hparams.data_path))

    if hparams.amount_labels == 1:
        label_cols = ["GH"]
    else:
        label_cols = get_labels(hparams.amount_labels)

    for seed in range(5):
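        # One classifier per pre-generated train/eval split (seeds 0-4).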
        logger.info(
            "Starting Classification for training split {}".format(seed))

        logger.info("Loading Text and Labels - use augmentations: {}".format(
            hparams.augment))

        data_path_split = "{}/{}/".format(hparams.data_path, seed)
        if hparams.augment:
            df_train = pd.read_csv(
                "{}df_train_{}_augmented_{}labels.csv".format(
                    data_path_split, seed, hparams.amount_labels))
        else:
            assert hparams.amount_labels == 21
            df_train = pd.read_csv("{}df_train_{}.csv".format(
                data_path_split, seed))

        df_eval = pd.read_csv("{}df_eval_{}.csv".format(data_path_split, seed))

        text_train = df_train.text.values
        text_eval = df_eval.text.values
        text_test = df_test.text.values
        labels_train = df_train.loc[:, label_cols].values
        labels_eval = df_eval.loc[:, label_cols].values
        labels_test = df_test.loc[:, label_cols].values

        logger.info("Computing Features")
        vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
        features_train = vectorizer.fit_transform(text_train)
        features_eval = vectorizer.transform(text_eval)
        features_test = vectorizer.transform(text_test)

        logger.info("Starting Training")
        strategy = BinaryRelevance
        model = SVC(verbose=True, class_weight="balanced")
        classifier = strategy(model)
        classifier.fit(features_train, labels_train)
        print("\n")
        logger.info("Starting Prediction for Evaluation and Testset")
        predictions_eval = classifier.predict(features_eval).todense()
        predictions_test = classifier.predict(features_test).todense()

        logger.info("Metrics on Evaluationset")
        metrics_eval = scoring(labels_eval, predictions_eval)

        logger.info("Metrics on Testset")
        metrics_test = scoring(labels_test, predictions_test)

        for metric, value in metrics_eval.items():
            metrics["eval"][metric].append(value)
        for metric, value in metrics_test.items():
            metrics["test"][metric].append(value)

        print("-" * 80)

    logger.info("Cross Validation complete")
    logger.info("Averaged Metrics on Evaluationset")

    metrics_eval = metrics["eval"]
    for metric, value in metrics_eval.items():
        logger.info("{}: {}".format(metric, np.array(value).mean()))

    logger.info("Averaged Metrics on Testset")

    metrics_test = metrics["test"]
    for metric, value in metrics_test.items():
        logger.info("{}: {}".format(metric, np.array(value).mean()))

    logger.handlers.clear()
Example #4
    def prepare_data(self):
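        # The last component of data_path names the CV split seed (or "test").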
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            train_data_path = "{}/df_train_{}_augmented_{}labels.csv".format(
                self.hparams.data_path, split_seed, self.hparams.amount_labels)

            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)

        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(train_eval_path)

        training_set = pd.read_csv(train_data_path)
        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            # strip("/") above drops the leading slash of an absolute path;
            # restore it and retry.
            test_set = pd.read_csv("/" + test_data_path)

        labels = get_labels(self.hparams.amount_labels)

        max_length = 4096
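        # 4096-token inputs: every article is padded/truncated to this length.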
        print("Computing Input")
        training_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(training_set.text.values, total=len(training_set))
        ]

        training_input_ids = [
            training_input["input_ids"] for training_input in training_inputs
        ]
        training_attention_mask = [
            training_input["attention_mask"]
            for training_input in training_inputs
        ]
        training_labels = training_set.loc[:, labels].values

        evaluation_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(evaluation_set.text.values,
                             total=len(evaluation_set))
        ]

        evaluation_input_ids = [
            evaluation_input["input_ids"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_attention_mask = [
            evaluation_input["attention_mask"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_labels = evaluation_set.loc[:, labels].values

        test_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(test_set.text.values, total=len(test_set))
        ]

        test_input_ids = [
            test_input["input_ids"] for test_input in test_inputs
        ]
        test_attention_mask = [
            test_input["attention_mask"] for test_input in test_inputs
        ]
        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)

        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # Per-label positive rate on the evaluation split.
        self.label_weights = (evaluation_labels.sum(dim=0) /
                              evaluation_labels.shape[0])

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)

        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)
Example #5
    def prepare_data(self):
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            if self.hparams.label is not None:
                train_data_path = "{}/df_train_{}{}_augmented_{}labels.csv".format(
                    self.hparams.data_path, self.hparams.label.lower(),
                    split_seed, self.hparams.amount_labels)

            else:
                train_data_path = "{}/df_train_{}_augmented_{}labels.csv".format(
                    self.hparams.data_path, split_seed,
                    self.hparams.amount_labels)

            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)

        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(train_eval_path)

        try:
            training_set = pd.read_csv(train_data_path)
        except FileNotFoundError:
            # No pre-augmented training split on disk: build it, or fall back
            # to the unaugmented split.
            if self.hparams.augment:
                print("Augmenting Data...")
                train_data_path_unaug = "{}/df_train_{}.csv".format(
                    self.hparams.data_path, split_seed)
                unaug_training_set = pd.read_csv(train_data_path_unaug)
                augmentations_path = "/".join(
                    self.hparams.data_path.strip("/").split("/")[:-2])

                augmentations_df = pd.read_csv(
                    "/{}/article_confirmed_summary_augmentations.csv".format(
                        augmentations_path))
                training_set = augment_dataframe(
                    df=unaug_training_set,
                    augmentations_df=augmentations_df,
                    categories=[self.hparams.label.upper()])
                training_set.to_csv(train_data_path, index=False)
            else:
                print("Loading Data without Augmentation...")
                train_data_path = "{}/df_train_{}.csv".format(
                    self.hparams.data_path, split_seed)
                training_set = pd.read_csv(train_data_path)
                training_set.loc[:, "none"] = 0
                for index, row in training_set.iterrows():
                    if row.loc["none"] == 0:
                        training_set.loc[index, "none"] = 1

        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            # Restore the leading slash lost by strip("/") and retry.
            test_set = pd.read_csv("/" + test_data_path)

        if self.hparams.label is not None:
            not_label = "Not{}".format(self.hparams.label.upper())
            training_set.rename(columns={"none": not_label}, inplace=True)
            evaluation_set.loc[:, not_label] = 0
            evaluation_set.loc[evaluation_set.UD == 0, not_label] = 1
            test_set.loc[:, not_label] = 0
            test_set.loc[test_set.UD == 0, not_label] = 1
            labels = [self.hparams.label.upper(), not_label]
        else:
            labels = get_labels(self.hparams.amount_labels)

        max_length = 512

        data_column = "summary_{}".format(self.hparams.summary_type)

        print("Computing Input")
        training_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(training_set.loc[:, data_column].values,
                             total=len(training_set))
        ]

        training_input_ids = [
            training_input["input_ids"] for training_input in training_inputs
        ]
        training_attention_mask = [
            training_input["attention_mask"]
            for training_input in training_inputs
        ]
        training_labels = training_set.loc[:, labels].values

        evaluation_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(evaluation_set.loc[:, data_column].values,
                             total=len(evaluation_set))
        ]

        evaluation_input_ids = [
            evaluation_input["input_ids"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_attention_mask = [
            evaluation_input["attention_mask"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_labels = evaluation_set.loc[:, labels].values

        test_inputs = [
            self.tokenizer(text,
                           max_length=max_length,
                           padding="max_length",
                           truncation=True)
            for text in tqdm(test_set.loc[:, data_column].values,
                             total=len(test_set))
        ]

        test_input_ids = [
            test_input["input_ids"] for test_input in test_inputs
        ]
        test_attention_mask = [
            test_input["attention_mask"] for test_input in test_inputs
        ]
        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)

        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # Per-label positive rate on the evaluation split.
        self.label_weights = (evaluation_labels.sum(dim=0) /
                              evaluation_labels.shape[0])

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)

        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)
    def prepare_data(self):
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            train_data_path = "{}/df_train_{}_augmented_{}labels.csv".format(
                self.hparams.data_path, split_seed, self.hparams.amount_labels)

            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)

        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(train_eval_path)

        training_set = pd.read_csv(train_data_path)
        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            # Restore the leading slash lost by strip("/") and retry.
            test_set = pd.read_csv("/" + test_data_path)

        labels = get_labels(self.hparams.amount_labels)

        split_length = self.hparams.split_size
        shift = self.hparams.shift
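        # Articles are later cut into split_length-token windows advanced by
        # `shift` tokens (see split_articles below).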
        self.tokenizer.padding_side = "right"
        self.tokenizer.pad_token = "<pad>"

        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
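            # Raising this logger's level suppresses the tokenizer's
            # long-sequence warnings while measuring raw article lengths.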
            max_length_training = max([
                len(self.tokenizer(text)["input_ids"])
                for text in training_set.text.values
            ])
            max_length_evaluation = max([
                len(self.tokenizer(text)["input_ids"])
                for text in evaluation_set.text.values
            ])
            max_length_test = max([
                len(self.tokenizer(text)["input_ids"])
                for text in test_set.text.values
            ])

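        # Round each split's longest article up to a whole number of windows,
        # plus one extra window of headroom.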
        max_padding_training = ceil(max_length_training/split_length) * \
            split_length + split_length
        max_padding_evaluation = ceil(max_length_evaluation/split_length) * \
            split_length + split_length
        max_padding_test = ceil(max_length_test/split_length) * \
            split_length + split_length

        print("Computing Input")
        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            training_inputs = [
                self.tokenizer(text,
                               max_length=max_padding_training,
                               padding="max_length",
                               truncation=True)
                for text in tqdm(training_set.text.values,
                                 total=len(training_set))
            ]

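        # split_articles chops each padded sequence into overlapping
        # split_length-token windows.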
        training_input_ids, training_attention_mask = split_articles(
            training_inputs,
            max_length=max_padding_training,
            split_length=split_length,
            shift=shift)

        training_labels = training_set.loc[:, labels].values

        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            evaluation_inputs = [
                self.tokenizer(text,
                               max_length=max_padding_evaluation,
                               padding="max_length",
                               truncation=True)
                for text in tqdm(evaluation_set.text.values,
                                 total=len(evaluation_set))
            ]

        evaluation_input_ids, evaluation_attention_mask = split_articles(
            evaluation_inputs,
            max_length=max_padding_evaluation,
            split_length=split_length,
            shift=shift)

        evaluation_labels = evaluation_set.loc[:, labels].values

        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            test_inputs = [
                self.tokenizer(text,
                               max_length=max_padding_test,
                               padding="max_length",
                               truncation=True)
                for text in tqdm(test_set.text.values, total=len(test_set))
            ]

        test_input_ids, test_attention_mask = split_articles(
            test_inputs,
            max_length=max_padding_test,
            split_length=split_length,
            shift=shift)

        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)

        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # Per-label positive rate on the evaluation split.
        self.label_weights = (evaluation_labels.sum(dim=0) /
                              evaluation_labels.shape[0])

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)
        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)