def save_model(self):
    if self.results_path is not None:
        if self.save_type == "tf":
            print("Saving model. ")

            # Use model's graph
            with self._tf_graph.as_default():
                # Complete path
                checkpoint_path = Path(self.results_path, "Checkpoint", 'model.checkpoint')

                # Create folder if needed
                ensure_folder(checkpoint_path)

                # Save session to path
                tf.train.Saver(tf.trainable_variables()).save(self._sess, str(checkpoint_path))

        if self.save_type == "sk":
            print("Saving model. ")
            pickle.dump(self.model, Path(self.results_path, "model.p").open("wb"))
def __init__(self, results_path, save_type=None, summary_ignore=set(), name_formatter="{}"):
    # Make graph and session
    self._tf_graph = tf.Graph()
    self._sess = tf.Session(graph=self._tf_graph)

    self.save_type = save_type
    self.model = None
    self._auto_summary_keys = None
    name_formatter = name_formatter if name_formatter is not None else "{}"
    self._name = name_formatter.format(self._class_name())

    # Create automatic summary dictionary
    self._create_autosummary_dict(summary_ignore)

    # Set path
    if results_path is not None:
        self.results_path = self.create_model_path(results_path=results_path)
        ensure_folder(self.results_path)
    else:
        self.results_path = None
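# Note (added for clarity; it summarises the two methods above rather than adding new behaviour):
# save_model() supports two persistence paths selected by save_type.  With save_type == "tf" it
# re-enters self._tf_graph and writes a TensorFlow checkpoint under <results_path>/Checkpoint/,
# while with save_type == "sk" it pickles whatever estimator the subclass has assigned to
# self.model into <results_path>/model.p.  A concrete subclass is therefore expected either to
# build its variables inside self._tf_graph / self._sess (for "tf"), or to set self.model to a
# fitted scikit-learn-style estimator (for "sk"), before save_model() is called.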
print("\nSingle training Results -- TRAINING --\n" + "-" * 75) print( classification_results_training._to_dataset_split("Model").to_dataframe()) print("\nSingle training Results -- TEST --\n" + "-" * 75) print(classification_results_test._to_dataset_split("Model").to_dataframe()) print("\nModel Summary --\n" + "-" * 75) print(model.summary_to_string()) ################################### # Storage # Make path for database database_path = Path(overfit_like_crazy_directory, "results.db") ensure_folder(database_path) # Data for results-database headers = [ "name", "n_train_programs", "n_test_programs", *["{}_train".format(val.name()) for val in eval_functions], *["{}_test".format(val.name()) for val in eval_functions], "model_str" ] results_data = [ model.name, n_train_programs, n_test_programs, *classification_results_training.data.tolist()[0], *classification_results_test.data.tolist()[0], model.autosummary_str() ] # Append results
def leave_one_program_out_cv(tensor_provider, model_list, path,
                             eval_functions=None, limit=None, return_predictions=False,
                             save_ranked_sentences=True, save_full_predictions=True,
                             save_model_weights=True):
    """
    :param TensorProvider tensor_provider: Class providing all data to models.
    :param list[DetektorModel] model_list: List of model-classes for testing.
    :param list[Evaluation] eval_functions: List of evaluation functions used to test models.
    :param bool return_predictions: If True, the method stores all model test-predictions and returns them as well.
                                    Can be used to determine whether errors are the same across models.
    :param int | None limit: Only perform analysis on some programs (for testing).
                             If None - run on all programs.
    :param Path path: Path for storing results.
    :return:
    """
    ensure_folder(path)

    # TODO: Consider also looping over loss-functions: classic ones and weighed ones
    n_models = len(model_list)

    # Default evaluation score
    if eval_functions is None:
        eval_functions = [Accuracy(), F1(), TruePositives(), TrueNegatives(),
                          FalsePositives(), FalseNegatives(), Samples(),
                          AreaUnderROC(), ROC()]

    # Elements keys
    keys = list(sorted(tensor_provider.accessible_annotated_keys))

    # Get program ids and number of programs
    program_ids = np.array(list(zip(*keys))[0])
    unique_programs = np.array(sorted(set(program_ids)))
    n_programs = len(unique_programs)
    program_names = ["P{:02d}".format(val + 1) for val in range(n_programs)]

    # Dictionary for holding actual predictions (they vary in length which discourages an array)
    test_predictions = dict()

    # Initialize array for holding results
    special_results = dict()
    evaluation_names = [val.name() for val in eval_functions if val.is_single_value]
    classification_results = np.full((n_programs, n_models, len(evaluation_names)), np.nan)
    classification_results = xr.DataArray(classification_results,
                                          name="Loo Results",
                                          dims=["Program", "Model", "Evaluation"],
                                          coords=dict(Program=program_names,
                                                      Model=[model.name for model in model_list],
                                                      Evaluation=evaluation_names))

    # Initialize file for storing ranked sentences
    if save_ranked_sentences:
        rank_file = Path(path, "ranked_sentences.txt").open("w")

    # Loop over programs
    loo = LeaveOneOut()
    limit = len(unique_programs) if limit is None else limit
    print("\n\nRunning Leave-One-Out Tests.\n" + "-" * 75)
    for program_nr, (train, test) in enumerate(list(loo.split(unique_programs))[:limit]):
        program_name = program_names[program_nr]

        # Get split indices
        train_idx = np.where(program_ids != unique_programs[test])[0]
        test_idx = np.where(program_ids == unique_programs[test])[0]

        # Convert to keys
        train_idx = [keys[val] for val in train_idx]
        test_idx = [keys[val] for val in test_idx]

        # Report
        print("Program {}, using {} training samples and {} test samples.".format(
            program_nr + 1, len(train_idx), len(test_idx)))

        # Make and set BoW-vocabulary
        bow_vocabulary = tensor_provider.extract_programs_vocabulary(train_idx)
        tensor_provider.set_bow_vocabulary(bow_vocabulary)

        # Get truth of test-set
        y_true = tensor_provider.load_labels(data_keys_or_idx=test_idx)

        # Go through models
        for model_nr, model in enumerate(model_list):
            model_name = model.name

            # Initialize model
            model.initialize_model(tensor_provider=tensor_provider)

            # Fit model
            model.fit(tensor_provider=tensor_provider,
                      train_idx=train_idx,
                      verbose=2)

            # Predict on test-data for performance
            y_pred, y_pred_binary = model.predict(tensor_provider=tensor_provider,
                                                  predict_idx=test_idx)
            y_pred = np.squeeze(y_pred)
            y_pred_binary = np.squeeze(y_pred_binary)

            # Store predictions
            if return_predictions:
                test_predictions.setdefault(model_name, dict())[program_name] = y_pred

            # Save the best-ranked sentences (in terms of claim)
            if save_ranked_sentences:
                rank_file.write("Test program: %s \n" % program_names[program_nr])
                rank_file.write(model.summary_to_string())
                ranked_sentences, rank_score, rank_indices \
                    = tensor_provider.get_ranked_predictions(y_pred, test_idx)
                rank_file.write("Sentence, Probability of claim, Truth \n")
                ranked_labels = tensor_provider.load_labels(rank_indices)
                for r in range(len(ranked_sentences)):
                    rank_file.write("%s , %.5f, %i \n"
                                    % (ranked_sentences[r], rank_score[r], ranked_labels[r]))
                rank_file.write("\n")

            # Save predictions on full test set
            if save_full_predictions:
                with Path(path, "%s_predictions.txt" % program_names[program_nr]).open("w") as file:
                    all_sentences = tensor_provider.load_original_sentences(test_idx)
                    for r in range(len(all_sentences)):
                        file.write("%i;%.5f;%s\n" % (y_true[r], y_pred[r], all_sentences[r]))

            # Save model weights in case of logistic regression
            if save_model_weights and model_name == "LogisticRegressionSKLEARN":
                # TODO: Save most important weights in classification
                print(' ')

            # Evaluate with eval_functions
            evaluation_nr = 0
            for evalf in eval_functions:
                assert y_pred.shape == y_true.shape, \
                    "y_pred ({}) and y_true ({}) do not have same shape".format(
                        y_pred.shape, y_true.shape)

                if evalf.is_single_value:
                    evaluation_result = evalf(y_true=y_true,
                                              y_pred=y_pred,
                                              y_pred_binary=y_pred_binary)
                    classification_results[program_nr, model_nr, evaluation_nr] = evaluation_result
                    evaluation_nr += 1
                else:
                    special_results[(model.name, evalf.name(), program_nr)] = evalf(
                        y_true=y_true, y_pred=y_pred, y_pred_binary=y_pred_binary)

    ###
    # Plot ROC curves if wanted

    # Go through models
    models_mean_rocs = []
    for model in model_list:
        rocs = []
        labels = []

        # Go through programs
        for program_nr in range(len(unique_programs)):
            key = (model.name, "ROC", program_nr)
            if key in special_results:
                rocs.append(special_results[key])
                labels.append("Program {}".format(program_nr))

        # Plot ROCs for each program for this model
        plot_multiple_rocs(rocs=rocs, labels=labels, center_line=False)
        mean = mean_rocs(rocs)
        models_mean_rocs.append(mean)
        plot_roc(*mean, title=model.name, label="Mean", color="black", linestyle="--")
        plt.legend()

        # Store figure
        file_name = "ROC_{}".format(model.name)
        save_fig(Path(path, file_name))
        plt.close()

    # Plot mean-ROCs for models
    names = [model.name for model in model_list]
    plot_multiple_rocs(rocs=models_mean_rocs, labels=names, center_line=True,
                       title="Models Mean-ROC")
    plt.legend()
    save_fig(Path(path, "Models_ROC"))
    plt.close()

    if save_ranked_sentences:
        rank_file.close()

    if return_predictions:
        return classification_results, special_results, test_predictions
    return classification_results, special_results
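# Usage sketch (added for illustration; not part of the original source).  A driver script
# would typically call the function roughly like the following, where TensorProvider,
# ProjectPaths and ensure_folder come from this repository and model_list holds initialised
# DetektorModel instances (their constructors are model-specific and not shown here):
#
#     the_tensor_provider = TensorProvider(verbose=True)
#     loo_path = Path(ProjectPaths.results, "loo_cv")
#     ensure_folder(loo_path)
#     results, specials = leave_one_program_out_cv(
#         tensor_provider=the_tensor_provider,
#         model_list=model_list,           # list of DetektorModel instances
#         path=loo_path,
#         limit=2,                         # restrict to two folds for a quick test run
#         return_predictions=False)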
    if not return_pad_mask:
        return square
    else:
        mask_square = np.zeros(a_matrix.shape).flatten()
        mask_square = pad_method(mask_square, 1, pad_elements)
        mask_square = mask_square.reshape((sides, sides))  # type: np.ndarray
        return square, mask_square


if __name__ == "__main__":
    sparse = {"bow"}
    plt.close("all")

    # Save tests
    results_dir = Path(ProjectPaths.results, "tensor_provider_tests")
    ensure_folder(results_dir)
    redirect_stdout_to_file(Path(results_dir, "log.txt"))

    # Initialize
    the_tensor_provider = TensorProvider(verbose=True)

    # Get accessible keys
    all_keys = list(sorted(the_tensor_provider.accessible_annotated_keys))

    print("\nTesting tensor provider.")
    the_test_keys = random.sample(all_keys, 20)
    test = the_tensor_provider.load_data_tensors(the_test_keys,
                                                 word_counts=True,
                                                 char_counts=True,
                                                 word_embedding=True,
import json
import sqlite3
from io import TextIOWrapper
from pathlib import Path
from tempfile import NamedTemporaryFile

import fastText

from project_paths import ProjectPaths
from util.utilities import ensure_folder

# Paths
database_path = Path(ProjectPaths.nlp_data_dir, "nlp_data.db")
storage_folder = ProjectPaths.fast_text_dir
out_path = Path(storage_folder, "vectors.db")
ensure_folder(storage_folder)
print("CWD: {}".format(Path.cwd()))

print("Creating temporary file.")
temp_file = NamedTemporaryFile(mode="w+", delete=False)  # type: TextIOWrapper
print("Temporary file at: {}".format(temp_file.name))

print("Loading all programs' tokens")
connection = sqlite3.connect(str(database_path))
cursor = connection.cursor()
rows = cursor.execute("SELECT tokens FROM tagger").fetchall()

print("Loading programs into temporary file")
for sentence in rows:
    sentence = json.loads(sentence[0])
import shutil
from pathlib import Path

from models.PositiveLearningElkan.pu_learning import PULogisticRegressionSK
from project_paths import ProjectPaths
from run_files.single_train import single_training
from util.learning_rate_utilities import linear_geometric_curve
from util.tensor_provider import TensorProvider
from util.utilities import ensure_folder

if __name__ == "__main__":
    # Initialize tensor-provider (data-source)
    the_tensor_provider = TensorProvider(verbose=True)

    # Results path
    base_path = Path(ProjectPaths.results, "final_model_comparison")
    shutil.rmtree(str(base_path), ignore_errors=True)
    ensure_folder(base_path)

    # Get program IDs
    all_program_ids = the_tensor_provider.annotated_program_ids(access_restricted_data=True)
    training_programs = the_tensor_provider.accessible_annotated_program_ids
    test_programs = set(all_program_ids).difference(set(training_programs))

    # Settings
    n_runs = 1

    ################
    # MULTIPLE RUNS

    # Paths for all models
    model_paths = []
print("") # Run model on unlabelled data print("Running trained model on unlabeled data") predictions, binary_predictions = model.predict( tensor_provider=the_tensor_provider, predict_idx=unlabelled) predictions = predictions.tolist() binary_predictions = [int(val) for val in binary_predictions] # Get unlabelled sentences unlabelled_sentences = the_tensor_provider.load_original_sentences(unlabelled) # Make directory for results results_path = Path(ProjectPaths.results, "active_learning_{}".format(model.name)) ensure_folder(results_path) # Make data for sql-database sql_data = [ *zip(*unlabelled), predictions, binary_predictions, unlabelled_sentences ] assert all([len(val) == len(sql_data[0]) for val in sql_data]) # Make database database_path = Path(results_path, "results.db") rows2sql_table(data=sql_data, database_path=database_path, table_name="predictions", column_headers=[ "program_id", "sentence_id", "predictions", "binary_predictions", "sentence"
def random_sequence_batch(character_list, min_length, max_length, batch_size):
    character_list = [val for val in character_list if val != end_of_word_char]
    lengths = np.random.randint(low=min_length, high=max_length + 1, size=batch_size).tolist()

    sequences = []
    for length in lengths:
        sequence = "".join(np.random.choice(list(character_list), size=length, replace=True))
        sequences.append(sequence)

    return sequences


# Output directory
output_dir = Path("data", "spelling_model")
ensure_folder(output_dir)

# Model settings
cells = 100

#####

print("Getting data.")

# Get data
texts = []
matrix_path = Path("data", "DeepFactData", "annotated", "data_matrix_sample_programs.csv")
with matrix_path.open("r") as file:
    csv_reader = csv.reader(file, delimiter=",")
    for row in csv_reader:
def single_training(tensor_provider, model, test_split, training_split, base_path,
                    eval_functions=None, return_predictions=False,
                    split_is_keys=False, access_restricted_data=False):
    """
    :param TensorProvider tensor_provider: Class providing all data to models.
    :param DetektorModel model: Model-class to train and test.
    :param list | np.ndarray test_split: List of program IDs or sentence-keys used for testing
                                         (depending on split_is_keys).
    :param list | np.ndarray training_split: List of program IDs or sentence-keys used for training
                                             (depending on split_is_keys).
    :param Path base_path: Path of directory where we can put results
                           (in a subdirectory with the model's name).
    :param list[Evaluation] eval_functions: List of evaluation functions used to test models.
    :param bool return_predictions: If True, the method stores all model test-predictions and returns them as well.
                                    Can be used to determine whether errors are the same across models.
    :param bool split_is_keys:
        False: test_split and training_split are program numbers.
        True: test_split and training_split are sentence KEYS (list of (program_id, sentence_id)-tuples).
    :param bool access_restricted_data: If True, build the split from all annotated keys, including
                                        restricted programs; otherwise only accessible annotated keys are used.
    """
    # Create model-specific path and ensure directory
    results_path = model.results_path
    if results_path is None:
        results_path = model.create_model_path(results_path=base_path)
    ensure_folder(results_path)

    # Write name
    with Path(results_path, "name.txt").open("w") as file:
        file.write(model.generate_settings_name())

    # Redirect prints to a file and denote script start-time
    redirect_stdout_to_file(Path(results_path, "log.txt"))
    print("Script starting at: {}".format(datetime.now().strftime("%d-%m-%Y %H:%M:%S")))

    # Default evaluation score
    if eval_functions is None:
        eval_functions = [Accuracy(), F1(), TruePositives(), TrueNegatives(),
                          FalsePositives(), FalseNegatives(), Samples(),
                          AreaUnderROC(), ROC()]

    # Initialize array for holding results
    special_results_train = dict()
    evaluation_names = [val.name() for val in eval_functions if val.is_single_value]
    classification_results_train = np.full((1, len(evaluation_names)), np.nan)
    classification_results_train = SDataArray(classification_results_train,
                                              name="Training Results",
                                              dims=["Model", "Evaluation"],
                                              coords=dict(Evaluation=evaluation_names,
                                                          Model=[model.name]))
    special_results_test = dict()
    classification_results_test = np.full((1, len(evaluation_names)), np.nan)
    classification_results_test = SDataArray(classification_results_test,
                                             name="Test Results",
                                             dims=["Model", "Evaluation"],
                                             coords=dict(Evaluation=evaluation_names,
                                                         Model=[model.name]))

    # Check if split is in keys and not programs
    if split_is_keys:
        train_idx = training_split
        test_idx = test_split

    # Otherwise use program-indices to get keys for training and test (the correct and default way)
    else:
        # Sentences keys
        if not access_restricted_data:
            keys = list(sorted(tensor_provider.accessible_annotated_keys))
        else:
            keys = list(sorted(tensor_provider.annotated_keys(access_restricted_data=True)))

        # Get program ids and number of programs
        program_ids = np.array(list(zip(*keys))[0])

        # Get test-indices
        test_idx = np.sum([program_ids == val for val in test_split], axis=0)
        test_idx = np.where(test_idx > 0.5)[0]

        # Get training-indices
        train_idx = np.sum([program_ids == val for val in training_split], axis=0)
        train_idx = np.where(train_idx > 0.5)[0]

        # Convert to keys
        train_idx = [keys[val] for val in train_idx]
        test_idx = [keys[val] for val in test_idx]

        # Sanity check
        assert not set(test_idx).intersection(set(train_idx)), \
            "Overlap between training and test set."
    # Report
    if not split_is_keys:
        print("Test programs {}, using {} training samples and {} test samples.".format(
            test_split, len(train_idx), len(test_idx)))
    else:
        print("Training and testing with specifically selected keys. {} training and {} test.".format(
            len(train_idx), len(test_idx)))

    # Make and set BoW-vocabulary
    bow_vocabulary = tensor_provider.extract_programs_vocabulary(train_idx)
    tensor_provider.set_bow_vocabulary(bow_vocabulary)

    # Get truth of train-set
    y_true_train = tensor_provider.load_labels(data_keys_or_idx=train_idx)

    # Get truth of test-set
    y_true = tensor_provider.load_labels(data_keys_or_idx=test_idx)

    # Initialize model
    model.initialize_model(tensor_provider=tensor_provider)

    # Number of parameters
    if model.save_type == "tf":
        with model._tf_graph.as_default():
            print("Number of trainable parameters: {}".format(tf_number_of_trainable_parameters()))

    # Fit model
    model.fit(tensor_provider=tensor_provider,
              train_idx=train_idx,
              verbose=2)

    # Predict on training-data
    print("\tPredicting on training data")
    y_pred_train, y_pred_train_binary = model.predict(tensor_provider=tensor_provider,
                                                      predict_idx=train_idx)
    y_pred_train = np.squeeze(y_pred_train)
    y_pred_train_binary = np.squeeze(y_pred_train_binary)
    train_predictions = y_pred_train

    # Predict on test-data for performance
    print("\tPredicting on test data")
    y_pred, y_pred_binary = model.predict(tensor_provider=tensor_provider,
                                          predict_idx=test_idx)
    y_pred = np.squeeze(y_pred)
    y_pred_binary = np.squeeze(y_pred_binary)

    # Store predictions
    test_predictions = y_pred

    # Evaluate with eval_functions
    print("\tRunning evaluation functions")
    evaluation_nr = 0
    for evalf in eval_functions:
        # Training evaluation
        assert y_pred_train.shape == y_true_train.shape, \
            "y_pred ({}) and y_true ({}) do not have same shape".format(
                y_pred_train.shape, y_true_train.shape)

        if evalf.is_single_value:
            evaluation_result = evalf(y_true=y_true_train,
                                      y_pred=y_pred_train,
                                      y_pred_binary=y_pred_train_binary)
            classification_results_train[0, evaluation_nr] = evaluation_result
        else:
            special_results_train[(model.name, evalf.name())] = evalf(
                y_true=y_true_train, y_pred=y_pred_train, y_pred_binary=y_pred_train_binary)

        # Test evaluation
        assert y_pred.shape == y_true.shape, \
            "y_pred ({}) and y_true ({}) do not have same shape".format(
                y_pred.shape, y_true.shape)

        if evalf.is_single_value:
            evaluation_result = evalf(y_true=y_true,
                                      y_pred=y_pred,
                                      y_pred_binary=y_pred_binary)
            classification_results_test[0, evaluation_nr] = evaluation_result
            evaluation_nr += 1
        else:
            special_results_test[(model.name, evalf.name())] = evalf(
                y_true=y_true, y_pred=y_pred, y_pred_binary=y_pred_binary)

    # Save model
    print("\tSaving model")
    model.save_model()

    # Return list
    returns = [classification_results_train, classification_results_test,
               special_results_train, special_results_test,
               model.summary_to_string()]

    # Additional returns
    if return_predictions:
        returns.extend([train_predictions, test_predictions])

    ############################################
    # Print, plot and store!
    # Make summary
    model_summary = model.summary_to_string()

    # Print mean results
    results_train = classification_results_train.to_dataset_split("Model").to_dataframe()
    results_test = classification_results_test.to_dataset_split("Model").to_dataframe()

    with Path(results_path, "results.txt").open("w") as file:
        file.write(model_summary + "\n\n")
        print("Training\n")
        file.write(str(results_train) + "\n\n")
        print("Test\n")
        file.write(str(results_test) + "\n\n")

    # Store results
    pickle.dump(results_train, Path(results_path, "results_train.p").open("wb"))
    pickle.dump(results_test, Path(results_path, "results_test.p").open("wb"))

    # Basic settings
    settings = dict()
    if not split_is_keys:
        settings["test_programs"] = test_split
        settings["training_programs"] = training_split
    else:
        settings["test_programs"] = "specific keys"
        settings["training_programs"] = "specific keys"
    pickle.dump(settings, Path(results_path, "settings.p").open("wb"))

    # Print results for each data-set
    print("\nSingle training Results - TRAINING \n" + "-" * 75)
    print(results_train)
    print("\nSingle training Results - TEST \n" + "-" * 75)
    print(results_test)
    print("\nModel Summary \n" + "-" * 75)
    print(model_summary)

    # Plot ROC of training
    roc_key = (model.name, "ROC")
    if roc_key in special_results_train:
        positive_rate, negative_rate = special_results_train[roc_key]
        plot_roc(tp_rate=positive_rate,
                 fp_rate=negative_rate,
                 title="{} ROC Training".format(model.name))
        save_fig(Path(results_path, "ROC_Train"))

    # Plot ROC of test
    if roc_key in special_results_test:
        positive_rate, negative_rate = special_results_test[roc_key]
        plot_roc(tp_rate=positive_rate,
                 fp_rate=negative_rate,
                 title="{} ROC Test".format(model.name))
        save_fig(Path(results_path, "ROC_Test"))

    # Print ending
    print("Script ended at: {}".format(datetime.now().strftime("%d-%m-%Y %H:%M:%S")))
    close_stdout_file()

    # Write a file called done.txt to mark that the script is done
    with Path(results_path, "done.txt").open("w") as file:
        file.write("The deed is done. ")

    return tuple(returns)
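# Usage sketch (added for illustration; not part of the original source).  The comparison
# scripts in this repository obtain their program splits as shown in the final-model-comparison
# snippet above and then call single_training roughly like this; the concrete model instance is
# left as a placeholder because its constructor is model-specific:
#
#     the_tensor_provider = TensorProvider(verbose=True)
#     training_programs = the_tensor_provider.accessible_annotated_program_ids
#     all_programs = the_tensor_provider.annotated_program_ids(access_restricted_data=True)
#     test_programs = set(all_programs).difference(set(training_programs))
#     single_training(tensor_provider=the_tensor_provider,
#                     model=a_detektor_model,                 # a DetektorModel subclass instance
#                     test_split=list(test_programs),
#                     training_split=list(training_programs),
#                     base_path=Path(ProjectPaths.results, "single_train_run"))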