            # ...inside the nested row/column loop of the mosaic-plotting
            # helper: draw each image in its own subplot cell
            axis_array[row_arg, col_arg].axis('off')
            axis_array[row_arg, col_arg].imshow(image, cmap=cmap)
            axis_array[row_arg, col_arg].set_title(titles[image_arg])
            image_arg = image_arg + 1
    plt.tight_layout()


if __name__ == '__main__':
    # from utils.data_manager import DataManager
    from utils.utils import get_labels
    from keras.models import load_model
    import pickle

    # dataset_name = 'fer2013'
    # model_path = '../trained_models/emotion_models/simple_CNN.985-0.66.hdf5'
    dataset_name = 'fer2013'
    class_decoder = get_labels(dataset_name)
    # data_manager = DataManager(dataset_name)
    # faces, emotions = data_manager.get_data()
    with open('faces.pkl', 'rb') as faces_file:
        faces = pickle.load(faces_file)
    with open('emotions.pkl', 'rb') as emotions_file:
        emotions = pickle.load(emotions_file)
    pretty_imshow(plt.gca(), make_mosaic(faces[:4], 2, 2), cmap='gray')
    plt.show()
    """
    image_arg = 0
    face = faces[image_arg:image_arg + 1]
    emotion = emotions[image_arg:image_arg + 1]
    display_image(face, emotion, class_decoder)
    plt.show()

    normal_imshow(plt.gca(), make_mosaic(faces[:4], 3, 3), cmap='gray')
    plt.show()
    """
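# `make_mosaic`, `pretty_imshow`, `normal_imshow`, and `display_image` are
# defined earlier in this module. As a rough illustration of the interface
# the demo above relies on (a sketch under assumed array shapes, not the
# module's exact code), make_mosaic tiles a batch of single-channel images
# into one grid image:
import numpy as np


def make_mosaic_sketch(images, num_rows, num_cols, border=1):
    """Tile `images` of shape (N, H, W) or (N, H, W, 1) into a
    (num_rows, num_cols) grid separated by `border` pixels."""
    height, width = images.shape[1], images.shape[2]
    mosaic = np.zeros((num_rows * (height + border) - border,
                       num_cols * (width + border) - border))
    for image_arg, image in enumerate(images[:num_rows * num_cols]):
        row_arg, col_arg = divmod(image_arg, num_cols)
        y = row_arg * (height + border)
        x = col_arg * (width + border)
        mosaic[y:y + height, x:x + width] = np.squeeze(image)
    return mosaic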
import json
import logging
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance

# `get_labels` and `scoring` are project-local helpers.


def main(hparams):
    logger = logging.getLogger("SVM - Cross Validation")
    logger.handlers.clear()  # drop handlers left over from a previous run
    logger.setLevel("INFO")
    stream_handler = logging.StreamHandler(sys.stdout)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    stream_handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(stream_handler)

    metrics = {
        "eval": {"acc": [], "f1": [], "precision": [], "recall": [], "mcc": []},
        "test": {"acc": [], "f1": [], "precision": [], "recall": [], "mcc": []},
    }

    logger.info("Hyperparameters:")
    print(json.dumps(vars(hparams), indent=4))

    df_test = pd.read_csv("{}/gold-standard-testset.csv".format(
        hparams.data_path))
    if hparams.amount_labels == 1:
        label_cols = ["GH"]
    else:
        label_cols = get_labels(hparams.amount_labels)

    for seed in range(5):
        logger.info(
            "Starting classification for training split {}".format(seed))
        logger.info("Loading text and labels - use augmentations: {}".format(
            hparams.augment))
        data_path_split = "{}/{}/".format(hparams.data_path, seed)
        if hparams.augment:
            df_train = pd.read_csv(
                "{}df_train_{}_augmented_{}labels.csv".format(
                    data_path_split, seed, hparams.amount_labels))
        else:
            assert hparams.amount_labels == 21
            df_train = pd.read_csv("{}df_train_{}.csv".format(
                data_path_split, seed))
        df_eval = pd.read_csv("{}df_eval_{}.csv".format(data_path_split,
                                                        seed))

        text_train = df_train.text.values
        text_eval = df_eval.text.values
        text_test = df_test.text.values
        labels_train = df_train.loc[:, label_cols].values
        labels_eval = df_eval.loc[:, label_cols].values
        labels_test = df_test.loc[:, label_cols].values

        logger.info("Computing features")
        vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
        features_train = vectorizer.fit_transform(text_train)
        features_eval = vectorizer.transform(text_eval)
        features_test = vectorizer.transform(text_test)

        logger.info("Starting training")
        # Binary relevance: one independent binary SVM per label.
        classifier = BinaryRelevance(SVC(verbose=True,
                                         class_weight="balanced"))
        classifier.fit(features_train, labels_train)
        print("\n")

        logger.info("Starting prediction for evaluation and test set")
        predictions_eval = classifier.predict(features_eval).todense()
        predictions_test = classifier.predict(features_test).todense()

        logger.info("Metrics on evaluation set")
        metrics_eval = scoring(labels_eval, predictions_eval)
        logger.info("Metrics on test set")
        metrics_test = scoring(labels_test, predictions_test)

        for metric, value in metrics_eval.items():
            metrics["eval"][metric].append(value)
        for metric, value in metrics_test.items():
            metrics["test"][metric].append(value)
        print("-" * 80)

    logger.info("Cross validation complete")
    logger.info("Averaged metrics on evaluation set")
    for metric, value in metrics["eval"].items():
        logger.info("{}: {}".format(metric, np.array(value).mean()))
    logger.info("Averaged metrics on test set")
    for metric, value in metrics["test"].items():
        logger.info("{}: {}".format(metric, np.array(value).mean()))

    logger.handlers.clear()
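# `scoring` is a project-local helper whose implementation is not shown in
# this excerpt. A minimal sketch of what the metric keys collected above
# (acc, f1, precision, recall, mcc) imply, assuming macro averaging over the
# label columns -- the project may average differently:
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score)


def scoring_sketch(labels, predictions):
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    return {
        # exact-match (subset) accuracy across all label columns
        "acc": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="macro",
                       zero_division=0),
        "precision": precision_score(labels, predictions, average="macro",
                                     zero_division=0),
        "recall": recall_score(labels, predictions, average="macro",
                               zero_division=0),
        # MCC is defined for a single label; average it over the columns
        "mcc": np.mean([matthews_corrcoef(labels[:, i], predictions[:, i])
                        for i in range(labels.shape[1])]),
    }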
    def prepare_data(self):
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            train_data_path = "{}/df_train_{}_augmented_{}labels.csv".format(
                self.hparams.data_path, split_seed,
                self.hparams.amount_labels)
            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)
        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(
            train_eval_path)

        training_set = pd.read_csv(train_data_path)
        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            # joining the path components drops the leading slash of an
            # absolute data_path; retry with it restored
            test_set = pd.read_csv("/" + test_data_path)

        labels = get_labels(self.hparams.amount_labels)
        max_length = 4096  # input limit of the long-document transformer

        print("Computing Input")
        training_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(training_set.text.values,
                             total=len(training_set))
        ]
        training_input_ids = [
            training_input["input_ids"] for training_input in training_inputs
        ]
        training_attention_mask = [
            training_input["attention_mask"]
            for training_input in training_inputs
        ]
        training_labels = training_set.loc[:, labels].values

        evaluation_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(evaluation_set.text.values,
                             total=len(evaluation_set))
        ]
        evaluation_input_ids = [
            evaluation_input["input_ids"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_attention_mask = [
            evaluation_input["attention_mask"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_labels = evaluation_set.loc[:, labels].values

        test_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(test_set.text.values, total=len(test_set))
        ]
        test_input_ids = [
            test_input["input_ids"] for test_input in test_inputs
        ]
        test_attention_mask = [
            test_input["attention_mask"] for test_input in test_inputs
        ]
        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)
        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)
        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # per-label positive rate on the evaluation split
        self.label_weights = 1 / \
            evaluation_labels.shape[0] * evaluation_labels.sum(dim=0)

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)
        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)
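# `self.label_weights` holds the per-label positive rate on the evaluation
# split. How the training step consumes it is not shown in this excerpt; a
# plausible (hypothetical) use is to upweight rare labels in the multi-label
# loss, sketched here:
import torch


def weighted_bce_sketch(logits, targets, label_weights):
    # rarer labels (small positive rate) get a larger positive-class weight
    pos_weight = (1 - label_weights) / label_weights.clamp(min=1e-6)
    loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    return loss_fn(logits, targets.float())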
    def prepare_data(self):
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            if self.hparams.label is not None:
                train_data_path = \
                    "{}/df_train_{}{}_augmented_{}labels.csv".format(
                        self.hparams.data_path, self.hparams.label.lower(),
                        split_seed, self.hparams.amount_labels)
            else:
                train_data_path = \
                    "{}/df_train_{}_augmented_{}labels.csv".format(
                        self.hparams.data_path, split_seed,
                        self.hparams.amount_labels)
            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)
        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(
            train_eval_path)

        try:
            training_set = pd.read_csv(train_data_path)
        except FileNotFoundError:
            if self.hparams.augment:
                # augmented split does not exist yet; build and cache it
                print("Augmenting Data...")
                train_data_path_unaug = "{}/df_train_{}.csv".format(
                    self.hparams.data_path, split_seed)
                unaug_training_set = pd.read_csv(train_data_path_unaug)
                augmentations_path = "/".join(
                    self.hparams.data_path.strip("/").split("/")[:-2])
                augmentations_df = pd.read_csv(
                    "/{}/article_confirmed_summary_augmentations.csv".format(
                        augmentations_path))
                training_set = augment_dataframe(
                    df=unaug_training_set,
                    augmentations_df=augmentations_df,
                    categories=[self.hparams.label.upper()])
                training_set.to_csv(train_data_path, index=False)
            else:
                print("Loading Data without Augmentation...")
                train_data_path = "{}/df_train_{}.csv".format(
                    self.hparams.data_path, split_seed)
                training_set = pd.read_csv(train_data_path)

        if self.hparams.label is not None:
            # mark every training row that does not carry the target label
            # as the negative class
            training_set.loc[:, "none"] = 0
            for index, row in training_set.iterrows():
                if row[self.hparams.label.upper()] == 0:
                    training_set.loc[index, "none"] = 1

        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            test_set = pd.read_csv("/" + test_data_path)

        if self.hparams.label is not None:
            not_label = "Not{}".format(self.hparams.label.upper())
            training_set.rename(columns={"none": not_label}, inplace=True)
            evaluation_set.loc[:, not_label] = 0
            for index, row in evaluation_set.iterrows():
                if row[self.hparams.label.upper()] == 0:
                    evaluation_set.loc[index, not_label] = 1
            test_set.loc[:, not_label] = 0
            for index, row in test_set.iterrows():
                if row[self.hparams.label.upper()] == 0:
                    test_set.loc[index, not_label] = 1
            labels = [self.hparams.label.upper(), not_label]
        else:
            labels = get_labels(self.hparams.amount_labels)

        max_length = 512
        data_column = "summary_{}".format(self.hparams.summary_type)

        print("Computing Input")
        training_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(training_set.loc[:, data_column].values,
                             total=len(training_set))
        ]
        training_input_ids = [
            training_input["input_ids"] for training_input in training_inputs
        ]
        training_attention_mask = [
            training_input["attention_mask"]
            for training_input in training_inputs
        ]
        training_labels = training_set.loc[:, labels].values

        evaluation_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(evaluation_set.loc[:, data_column].values,
                             total=len(evaluation_set))
        ]
        evaluation_input_ids = [
            evaluation_input["input_ids"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_attention_mask = [
            evaluation_input["attention_mask"]
            for evaluation_input in evaluation_inputs
        ]
        evaluation_labels = evaluation_set.loc[:, labels].values

        test_inputs = [
            self.tokenizer(text, max_length=max_length,
                           padding="max_length", truncation=True)
            for text in tqdm(test_set.loc[:, data_column].values,
                             total=len(test_set))
        ]
        test_input_ids = [
            test_input["input_ids"] for test_input in test_inputs
        ]
        test_attention_mask = [
            test_input["attention_mask"] for test_input in test_inputs
        ]
        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)
        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)
        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # per-label positive rate on the evaluation split
        self.label_weights = 1 / \
            evaluation_labels.shape[0] * evaluation_labels.sum(dim=0)

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)
        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)
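# `augment_dataframe` is a project-local helper not shown in this excerpt.
# A rough sketch of the behaviour its call site implies: append augmentation
# rows for articles that carry one of the requested category labels. The
# `article_id` join key is a hypothetical column name; the project's actual
# implementation may differ.
import pandas as pd


def augment_dataframe_sketch(df, augmentations_df, categories):
    extra_rows = []
    for category in categories:
        positives = df[df[category] == 1]  # articles tagged with this label
        extra_rows.append(augmentations_df[
            augmentations_df.article_id.isin(positives.article_id)])
    return pd.concat([df] + extra_rows, ignore_index=True)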
    def prepare_data(self):
        split_seed = self.hparams.data_path.strip("/").split("/")[-1]
        if split_seed != "test":
            train_data_path = "{}/df_train_{}_augmented_{}labels.csv".format(
                self.hparams.data_path, split_seed,
                self.hparams.amount_labels)
            eval_data_path = "{}/df_eval_{}.csv".format(
                self.hparams.data_path, split_seed)
        else:
            train_data_path = "{}/df_test.csv".format(self.hparams.data_path)
            eval_data_path = "{}/df_test.csv".format(self.hparams.data_path)
        train_eval_path = "/".join(
            self.hparams.data_path.strip("/").split("/")[:-1])
        test_data_path = "{}/gold-standard-testset.csv".format(
            train_eval_path)

        training_set = pd.read_csv(train_data_path)
        evaluation_set = pd.read_csv(eval_data_path)
        try:
            test_set = pd.read_csv(test_data_path)
        except FileNotFoundError:
            test_set = pd.read_csv("/" + test_data_path)

        labels = get_labels(self.hparams.amount_labels)
        split_length = self.hparams.split_size
        shift = self.hparams.shift

        self.tokenizer.padding_side = "right"
        self.tokenizer.pad_token = "<pad>"

        # Measure the longest article per split; the context manager
        # silences the tokenizer's over-length warnings while measuring.
        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            max_length_training = max(
                len(self.tokenizer(text)["input_ids"])
                for text in training_set.text.values)
            max_length_evaluation = max(
                len(self.tokenizer(text)["input_ids"])
                for text in evaluation_set.text.values)
            max_length_test = max(
                len(self.tokenizer(text)["input_ids"])
                for text in test_set.text.values)
        # Pad to a whole number of windows, plus one spare window so the
        # shifted pass also fits.
        max_padding_training = (ceil(max_length_training / split_length)
                                * split_length + split_length)
        max_padding_evaluation = (ceil(max_length_evaluation / split_length)
                                  * split_length + split_length)
        max_padding_test = (ceil(max_length_test / split_length)
                            * split_length + split_length)

        print("Computing Input")
        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            training_inputs = [
                self.tokenizer(text, max_length=max_padding_training,
                               padding="max_length", truncation=True)
                for text in tqdm(training_set.text.values,
                                 total=len(training_set))
            ]
        training_input_ids, training_attention_mask = split_articles(
            training_inputs, max_length=max_padding_training,
            split_length=split_length, shift=shift)
        training_labels = training_set.loc[:, labels].values

        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            evaluation_inputs = [
                self.tokenizer(text, max_length=max_padding_evaluation,
                               padding="max_length", truncation=True)
                for text in tqdm(evaluation_set.text.values,
                                 total=len(evaluation_set))
            ]
        evaluation_input_ids, evaluation_attention_mask = split_articles(
            evaluation_inputs, max_length=max_padding_evaluation,
            split_length=split_length, shift=shift)
        evaluation_labels = evaluation_set.loc[:, labels].values

        with IncreasedLoggingLevel("transformers.tokenization_utils_base"):
            test_inputs = [
                self.tokenizer(text, max_length=max_padding_test,
                               padding="max_length", truncation=True)
                for text in tqdm(test_set.text.values, total=len(test_set))
            ]
        test_input_ids, test_attention_mask = split_articles(
            test_inputs, max_length=max_padding_test,
            split_length=split_length, shift=shift)
        test_labels = test_set.loc[:, labels].values

        training_input_ids = torch.tensor(training_input_ids)
        training_attention_mask = torch.tensor(training_attention_mask)
        training_labels = torch.tensor(training_labels)
        evaluation_input_ids = torch.tensor(evaluation_input_ids)
        evaluation_attention_mask = torch.tensor(evaluation_attention_mask)
        evaluation_labels = torch.tensor(evaluation_labels)
        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_labels)

        # per-label positive rate on the evaluation split
        self.label_weights = 1 / \
            evaluation_labels.shape[0] * evaluation_labels.sum(dim=0)

        self.train_data = TensorDataset(training_input_ids,
                                        training_attention_mask,
                                        training_labels)
        self.valid_data = TensorDataset(evaluation_input_ids,
                                        evaluation_attention_mask,
                                        evaluation_labels)
        self.test_data = TensorDataset(test_input_ids, test_attention_mask,
                                       test_labels)
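# `IncreasedLoggingLevel` and `split_articles` are project-local helpers not
# shown in this excerpt. The sketches below follow what their call sites
# imply; names suffixed with `_sketch` are hypothetical, not the project's
# actual code.
import logging
from contextlib import contextmanager


@contextmanager
def increased_logging_level_sketch(logger_name, level=logging.ERROR):
    """Temporarily raise a logger's level, e.g. to silence the tokenizer's
    'sequence longer than the model maximum length' warnings."""
    logger = logging.getLogger(logger_name)
    previous_level = logger.level
    logger.setLevel(level)
    try:
        yield
    finally:
        logger.setLevel(previous_level)


def split_articles_sketch(inputs, max_length, split_length, shift):
    """Cut each padded article into windows of `split_length` tokens; a
    second pass offset by `shift` yields overlapping windows. Because
    `max_length` is a multiple of `split_length` plus one spare window,
    both passes fit and every article yields the same number of windows."""
    all_input_ids, all_attention_masks = [], []
    for encoded in inputs:
        ids = encoded["input_ids"]
        mask = encoded["attention_mask"]
        window_ids, window_masks = [], []
        for offset in sorted({0, shift}):  # one pass only when shift == 0
            for start in range(offset, max_length - split_length + 1,
                               split_length):
                window_ids.append(ids[start:start + split_length])
                window_masks.append(mask[start:start + split_length])
        all_input_ids.append(window_ids)
        all_attention_masks.append(window_masks)
    return all_input_ids, all_attention_masks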