Example #1
    def save_model(self):
        if self.results_path is not None:
            if self.save_type == "tf":
                print("Saving model. ")

                # Use model's graph
                with self._tf_graph.as_default():
                    # Complete path
                    checkpoint_path = Path(self.results_path, "Checkpoint",
                                           'model.checkpoint')

                    # Create folder if needed
                    ensure_folder(checkpoint_path)

                    # Save session to path
                    tf.train.Saver(tf.trainable_variables()).save(
                        self._sess, str(checkpoint_path))

            if self.save_type == "sk":
                print("Saving model. ")

                pickle.dump(self.model,
                            Path(self.results_path, "model.p").open("wb"))
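
The examples on this page all call ensure_folder from util.utilities without showing it. Below is a minimal sketch of what such a helper could look like, assuming only what the calls here imply (create a missing directory, or the parent directory when the path looks like a file such as model.checkpoint or results.db); this is not the project's actual implementation:

from pathlib import Path


def ensure_folder(*path_parts):
    # Hypothetical sketch: make sure the directory for the given path exists.
    # A path with a file suffix is treated as a file, so its parent is created.
    path = Path(*path_parts)
    folder = path.parent if path.suffix else path
    folder.mkdir(parents=True, exist_ok=True)
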
Example #2
    def __init__(self,
                 results_path,
                 save_type=None,
                 summary_ignore=set(),
                 name_formatter="{}"):
        # Make graph and session
        self._tf_graph = tf.Graph()
        self._sess = tf.Session(graph=self._tf_graph)
        self.save_type = save_type
        self.model = None
        self._auto_summary_keys = None
        name_formatter = name_formatter if name_formatter is not None else "{}"
        self._name = name_formatter.format(self._class_name())

        # Create automatic summary dictionary
        self._create_autosummary_dict(summary_ignore)

        # Set path
        if results_path is not None:
            self.results_path = self.create_model_path(
                results_path=results_path)
            ensure_folder(self.results_path)
        else:
            self.results_path = None
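
The constructor above sets up the attributes (results_path, save_type, _tf_graph, _sess, model) that save_model() in Example #1 relies on. A hedged sketch of a matching load_model, which is not shown on this page and is only a plausible counterpart, could look like:

    def load_model(self):
        # Hypothetical counterpart to save_model() in Example #1; assumes the
        # same checkpoint/pickle layout that save_model() writes.
        if self.results_path is None:
            return
        if self.save_type == "tf":
            with self._tf_graph.as_default():
                checkpoint_path = Path(self.results_path, "Checkpoint",
                                       "model.checkpoint")
                tf.train.Saver(tf.trainable_variables()).restore(
                    self._sess, str(checkpoint_path))
        elif self.save_type == "sk":
            with Path(self.results_path, "model.p").open("rb") as file:
                self.model = pickle.load(file)
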
print("\nSingle training Results -- TRAINING --\n" + "-" * 75)
print(
    classification_results_training._to_dataset_split("Model").to_dataframe())

print("\nSingle training Results -- TEST --\n" + "-" * 75)
print(classification_results_test._to_dataset_split("Model").to_dataframe())

print("\nModel Summary --\n" + "-" * 75)
print(model.summary_to_string())

###################################
# Storage

# Make path for database
database_path = Path(overfit_like_crazy_directory, "results.db")
ensure_folder(database_path)

# Data for results-database
headers = [
    "name", "n_train_programs", "n_test_programs",
    *["{}_train".format(val.name()) for val in eval_functions],
    *["{}_test".format(val.name()) for val in eval_functions], "model_str"
]
results_data = [
    model.name, n_train_programs, n_test_programs,
    *classification_results_training.data.tolist()[0],
    *classification_results_test.data.tolist()[0],
    model.autosummary_str()
]

# Append results
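
The snippet stops right after the "# Append results" comment. A hedged sketch of how the row could be appended with the standard sqlite3 module, using the headers and results_data defined above (the project may instead use its own helper for this, so this is illustrative only):

import sqlite3

# Illustrative sketch: append one results row to the results database.
connection = sqlite3.connect(str(database_path))
column_spec = ", ".join('"{}"'.format(header) for header in headers)
placeholders = ", ".join("?" for _ in headers)
connection.execute("CREATE TABLE IF NOT EXISTS results ({})".format(column_spec))
connection.execute("INSERT INTO results VALUES ({})".format(placeholders),
                   [str(val) for val in results_data])
connection.commit()
connection.close()
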
def leave_one_program_out_cv(tensor_provider,
                             model_list,
                             path,
                             eval_functions=None,
                             limit=None,
                             return_predictions=False,
                             save_ranked_sentences=True,
                             save_full_predictions=True,
                             save_model_weights=True):
    """
    :param TensorProvider tensor_provider: Class providing all data to models.
    :param list[DetektorModel] model_list: List of model-classes for testing.
    :param list[Evaluation] eval_functions: List of evaluation functions used to test models.
    :param bool return_predictions: If True, the method stores all model test-predictions and returns them as well.
                                    Can be used to determine whether errors are the same across models.
    :param int | None limit: Only perform analysis on some programs (for testing)
                             If None - run on all programs.
    :param Path path: Path for storing results.
    :param bool save_ranked_sentences: If True, write each test-program's sentences ranked by predicted claim-probability to a file.
    :param bool save_full_predictions: If True, write all test-set predictions for each program to a file.
    :param bool save_model_weights: If True, store model weights (currently only a placeholder).
    :return:
    """
    ensure_folder(path)

    # TODO: Consider also looping over loss-functions: classic ones and weighted ones
    n_models = len(model_list)

    # Default evaluation score
    if eval_functions is None:
        eval_functions = [
            Accuracy(),
            F1(),
            TruePositives(),
            TrueNegatives(),
            FalsePositives(),
            FalseNegatives(),
            Samples(),
            AreaUnderROC(),
            ROC()
        ]

    # Elements keys
    keys = list(sorted(tensor_provider.accessible_annotated_keys))

    # Get program ids and number of programs
    program_ids = np.array(list(zip(*keys))[0])
    unique_programs = np.array(sorted(set(program_ids)))
    n_programs = len(unique_programs)
    program_names = ["P{:02d}".format(val + 1) for val in range(n_programs)]

    # Dictionary for holding actual predictions (they vary in length which discourages an array)
    test_predictions = dict()

    # Initialize array for holding results
    special_results = dict()
    evaluation_names = [
        val.name() for val in eval_functions if val.is_single_value
    ]
    classification_results = np.full(
        (n_programs, n_models, len(evaluation_names)), np.nan)
    classification_results = xr.DataArray(
        classification_results,
        name="Loo Results",
        dims=["Program", "Model", "Evaluation"],
        coords=dict(Program=program_names,
                    Model=[model.name for model in model_list],
                    Evaluation=evaluation_names))

    # Initialize file for storing ranked sentences
    if save_ranked_sentences:
        rank_file = Path(path, "ranked_sentences.txt").open("w")

    # Loop over programs
    loo = LeaveOneOut()
    limit = len(unique_programs) if limit is None else limit
    print("\n\nRunning Leave-One-Out Tests.\n" + "-" * 75)
    for program_nr, (train, test) in enumerate(
            list(loo.split(unique_programs))[:limit]):
        program_name = program_names[program_nr]

        # Get split indices
        train_idx = np.where(program_ids != unique_programs[test])[0]
        test_idx = np.where(program_ids == unique_programs[test])[0]

        # Convert to keys
        train_idx = [keys[val] for val in train_idx]
        test_idx = [keys[val] for val in test_idx]

        # Report
        print("Program {}, using {} training samples and {} test samples.".
              format(program_nr + 1, len(train_idx), len(test_idx)))

        # Make and set BoW-vocabulary
        bow_vocabulary = tensor_provider.extract_programs_vocabulary(train_idx)
        tensor_provider.set_bow_vocabulary(bow_vocabulary)

        # Get truth of test-set
        y_true = tensor_provider.load_labels(data_keys_or_idx=test_idx)

        # Go through models
        for model_nr, model in enumerate(model_list):
            model_name = model.name

            # Initialize model
            model.initialize_model(tensor_provider=tensor_provider)

            # Fit model
            model.fit(tensor_provider=tensor_provider,
                      train_idx=train_idx,
                      verbose=2)

            # Predict on test-data for performance
            y_pred, y_pred_binary = model.predict(
                tensor_provider=tensor_provider, predict_idx=test_idx)
            y_pred = np.squeeze(y_pred)
            y_pred_binary = np.squeeze(y_pred_binary)

            # Store predictions
            if return_predictions:
                test_predictions.setdefault(model_name,
                                            dict())[program_name] = y_pred

            # Save the best ranked sentences (in terms of claim)
            if save_ranked_sentences:
                rank_file.write("Test program: %s \n" %
                                program_names[program_nr])
                rank_file.write(model.summary_to_string())
                ranked_sentences, rank_score, rank_indices \
                    = tensor_provider.get_ranked_predictions(y_pred, test_idx)
                rank_file.write("Sentence, Proability of claim, Truth \n")
                ranked_labels = tensor_provider.load_labels(rank_indices)
                for r in range(len(ranked_sentences)):
                    rank_file.write(
                        "%s , %.5f, %i \n" %
                        (ranked_sentences[r], rank_score[r], ranked_labels[r]))
                rank_file.write("\n")

            # Save predictions on full test set
            if save_full_predictions:
                with Path(path, "%s_predictions.txt" %
                          program_names[program_nr]).open("w") as file:
                    all_sentences = tensor_provider.load_original_sentences(
                        test_idx)
                    for r in range(len(all_sentences)):
                        file.write("%i;%.5f;%s\n" %
                                   (y_true[r], y_pred[r], all_sentences[r]))

            # Save model weights in case of logistic regression
            if save_model_weights and model_name == "LogisticRegressionSKLEARN":
                # TODO: Save most important weights in classification
                print(' ')

            # Evaluate with eval_functions
            evaluation_nr = 0
            for evalf in eval_functions:
                assert y_pred.shape == y_true.shape, "y_pred ({}) and y_true ({}) " \
                                                     "do not have same shape".format(y_pred.shape, y_true.shape)

                if evalf.is_single_value:
                    evaluation_result = evalf(y_true=y_true,
                                              y_pred=y_pred,
                                              y_pred_binary=y_pred_binary)
                    classification_results[program_nr, model_nr,
                                           evaluation_nr] = evaluation_result
                    evaluation_nr += 1
                else:
                    special_results[(model.name, evalf.name(),
                                     program_nr)] = evalf(
                                         y_true=y_true,
                                         y_pred=y_pred,
                                         y_pred_binary=y_pred_binary)
    ###
    # Plot ROC curves if wanted

    # Go through models
    models_mean_rocs = []
    for model in model_list:
        rocs = []
        labels = []

        # Go through programs
        for program_nr in range(len(unique_programs)):
            key = (model.name, "ROC", program_nr)
            if key in special_results:
                rocs.append(special_results[key])
                labels.append("Program {}".format(program_nr))

        # Plot ROCs for each program for this model
        plot_multiple_rocs(rocs=rocs, labels=labels, center_line=False)
        mean = mean_rocs(rocs)
        models_mean_rocs.append(mean)
        plot_roc(*mean,
                 title=model.name,
                 label="Mean",
                 color="black",
                 linestyle="--")
        plt.legend()

        # Store figure
        file_name = "ROC_{}".format(model.name)
        save_fig(Path(path, file_name))
        plt.close()

    # Plot mean-ROCs for models
    names = [model.name for model in model_list]
    plot_multiple_rocs(rocs=models_mean_rocs,
                       labels=names,
                       center_line=True,
                       title="Models Mean-ROC")
    plt.legend()
    save_fig(Path(path, "Models_ROC"))
    plt.close()

    if save_ranked_sentences:
        rank_file.close()

    if return_predictions:
        return classification_results, special_results, test_predictions
    return classification_results, special_results
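
A hedged usage sketch of leave_one_program_out_cv (the model class and output directory are placeholders; only the keyword names come from the signature above):

# Illustrative call; model construction details are hypothetical.
the_tensor_provider = TensorProvider(verbose=True)
model_list = [PULogisticRegressionSK()]  # hypothetical: any list of DetektorModel instances
loo_results, loo_special_results = leave_one_program_out_cv(
    tensor_provider=the_tensor_provider,
    model_list=model_list,
    path=Path(ProjectPaths.results, "loo_cv_example"),  # hypothetical output directory
    limit=2,  # only two programs, for a quick test
    return_predictions=False)
print(loo_results)
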
Example #5
    if not return_pad_mask:
        return square
    else:
        mask_square = np.zeros(a_matrix.shape).flatten()
        mask_square = pad_method(mask_square, 1, pad_elements)
        mask_square = mask_square.reshape((sides, sides))  # type: np.ndarray
        return square, mask_square


if __name__ == "__main__":
    sparse = {"bow"}
    plt.close("all")

    # Save tests
    results_dir = Path(ProjectPaths.results, "tensor_provider_tests")
    ensure_folder(results_dir)
    redirect_stdout_to_file(Path(results_dir, "log.txt"))

    # Initialize
    the_tensor_provider = TensorProvider(verbose=True)

    # Get accessible keys
    all_keys = list(sorted(the_tensor_provider.accessible_annotated_keys))

    print("\nTesting tensor provider.")
    the_test_keys = random.sample(all_keys, 20)
    test = the_tensor_provider.load_data_tensors(
        the_test_keys,
        word_counts=True,
        char_counts=True,
        word_embedding=True,
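
Example #5's opening fragment is the tail of a pad-to-square helper. A self-contained sketch of the same idea, with hypothetical names inferred from the fragment (not the project's implementation):

import numpy as np


def pad_to_square(a_matrix, padding_value=0, return_pad_mask=False):
    # Pad a 2D array on the right/bottom so both sides equal the longer one.
    rows, cols = a_matrix.shape
    sides = max(rows, cols)
    square = np.pad(a_matrix, ((0, sides - rows), (0, sides - cols)),
                    mode="constant", constant_values=padding_value)
    if not return_pad_mask:
        return square
    # Mask is 0 for original elements and 1 for padded ones.
    mask_square = np.pad(np.zeros_like(a_matrix, dtype=float),
                         ((0, sides - rows), (0, sides - cols)),
                         mode="constant", constant_values=1.0)
    return square, mask_square
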
Example #6
import json
import sqlite3
from io import TextIOWrapper
from pathlib import Path
from tempfile import NamedTemporaryFile

import fastText

from project_paths import ProjectPaths
from util.utilities import ensure_folder

# Paths
database_path = Path(ProjectPaths.nlp_data_dir, "nlp_data.db")
storage_folder = ProjectPaths.fast_text_dir
out_path = Path(storage_folder, "vectors.db")
ensure_folder(storage_folder)

print("CWD: {}".format(Path.cwd()))

print("Creating temporary file.")
temp_file = NamedTemporaryFile(mode="w+", delete=False)  # type: TextIOWrapper
print("Temporary file at: {}".format(temp_file.name))

print("Loading all programs' tokens")
connection = sqlite3.connect(str(database_path))
cursor = connection.cursor()
rows = cursor.execute("SELECT tokens FROM tagger").fetchall()

print("Loading programs into temporary file")
for sentence in rows:
    sentence = json.loads(sentence[0])
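
Example #6 is cut off inside the loop. A hedged sketch of how it might continue: write one token line per sentence, train embeddings with fastText's train_unsupervised, and store the vectors; the storage scheme for vectors.db is purely an assumption about the original intent:

# Illustrative continuation (assumed, not the original code).
for sentence in rows:
    tokens = json.loads(sentence[0])
    temp_file.write(" ".join(tokens) + "\n")
temp_file.flush()

print("Training fastText embeddings on temporary file")
ft_model = fastText.train_unsupervised(input=temp_file.name, model="skipgram")

print("Storing word vectors at: {}".format(out_path))
vector_connection = sqlite3.connect(str(out_path))
vector_connection.execute("CREATE TABLE IF NOT EXISTS vectors (word TEXT, vector TEXT)")
for word in ft_model.get_words():
    vector = ft_model.get_word_vector(word).tolist()
    vector_connection.execute("INSERT INTO vectors VALUES (?, ?)",
                              (word, json.dumps(vector)))
vector_connection.commit()
vector_connection.close()
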
from models.PositiveLearningElkan.pu_learning import PULogisticRegressionSK
from project_paths import ProjectPaths
from run_files.single_train import single_training
from util.learning_rate_utilities import linear_geometric_curve
from util.tensor_provider import TensorProvider
from util.utilities import ensure_folder

if __name__ == "__main__":

    # Initialize tensor-provider (data-source)
    the_tensor_provider = TensorProvider(verbose=True)

    # Results path
    base_path = Path(ProjectPaths.results, "final_model_comparison")
    shutil.rmtree(str(base_path), ignore_errors=True)
    ensure_folder(base_path)

    # Get program IDs
    all_program_ids = the_tensor_provider.annotated_program_ids(
        access_restricted_data=True)
    training_programs = the_tensor_provider.accessible_annotated_program_ids
    test_programs = set(all_program_ids).difference(set(training_programs))

    # Settings
    n_runs = 1

    ################
    # MULTIPLE RUNS

    # Paths for all models
    model_paths = []
print("")

# Run model on unlabelled data
print("Running trained model on unlabeled data")
predictions, binary_predictions = model.predict(
    tensor_provider=the_tensor_provider, predict_idx=unlabelled)
predictions = predictions.tolist()
binary_predictions = [int(val) for val in binary_predictions]

# Get unlabelled sentences
unlabelled_sentences = the_tensor_provider.load_original_sentences(unlabelled)

# Make directory for results
results_path = Path(ProjectPaths.results,
                    "active_learning_{}".format(model.name))
ensure_folder(results_path)

# Make data for sql-database
sql_data = [
    *zip(*unlabelled), predictions, binary_predictions, unlabelled_sentences
]
assert all([len(val) == len(sql_data[0]) for val in sql_data])

# Make database
database_path = Path(results_path, "results.db")
rows2sql_table(data=sql_data,
               database_path=database_path,
               table_name="predictions",
               column_headers=[
                   "program_id", "sentence_id", "predictions",
                   "binary_predictions", "sentence"
Example #9
def random_sequence_batch(character_list, min_length, max_length, batch_size):
    character_list = [val for val in character_list if val != end_of_word_char]
    lengths = np.random.randint(low=min_length, high=max_length + 1, size=batch_size).tolist()

    sequences = []
    for length in lengths:
        sequence = "".join(np.random.choice(list(character_list), size=length, replace=True))
        sequences.append(sequence)

    return sequences


# Output directory
output_dir = Path("data", "spelling_model")
ensure_folder(output_dir)

# Model settings
cells = 100


#####

print("Getting data.")

# Get data
texts = []
matrix_path = Path("data", "DeepFactData", "annotated", "data_matrix_sample_programs.csv")
with matrix_path.open("r") as file:
    csv_reader = csv.reader(file, delimiter=",")
    for row in csv_reader:
def single_training(tensor_provider,
                    model,
                    test_split,
                    training_split,
                    base_path,
                    eval_functions=None,
                    return_predictions=False,
                    split_is_keys=False,
                    access_restricted_data=False):
    """
    :param TensorProvider tensor_provider: Class providing all data to models.
    :param DetektorModel model: Model-class to train and test.
    :param list | np.ndarray test_split: List of program IDs or sentence-keys used for testing
                                            (depending on split_is_keys).
    :param list | np.ndarray training_split: List of program IDs or sentence-keys used for training
                                                (depending on split_is_keys).
    :param Path base_path: Path of directory where we can put results (in a subdirectory with the model's name).
    :param list[Evaluation] eval_functions: List of evaluation functions used to test models.
    :param bool return_predictions: If True, the method stores all model test-predictions and returns them as well.
                                    Can be used to determine whether errors are the same across models.
    :param bool split_is_keys:
        False: test_split and training_split are program numbers.
        True: test_split and training_split are sentence KEYS (list of (program_id, sentence_id)-tuples).
    :param bool access_restricted_data: If True, sentence keys are drawn from all annotated programs,
                                        including access-restricted ones; otherwise only accessible programs are used.
    """
    # Create model-specific path and ensure directory
    results_path = model.results_path
    if results_path is None:
        results_path = model.create_model_path(results_path=base_path)
    ensure_folder(results_path)

    # Write name
    with Path(results_path, "name.txt").open("w") as file:
        file.write(model.generate_settings_name())

    # Redirect prints to a file and denote script start-time
    redirect_stdout_to_file(Path(results_path, "log.txt"))
    print("Script starting at: {}".format(
        datetime.now().strftime("%d-%m-%Y %H:%M:%S")))

    # Default evaluation score
    if eval_functions is None:
        eval_functions = [
            Accuracy(),
            F1(),
            TruePositives(),
            TrueNegatives(),
            FalsePositives(),
            FalseNegatives(),
            Samples(),
            AreaUnderROC(),
            ROC()
        ]

    # Initialize array for holding results
    special_results_train = dict()
    evaluation_names = [
        val.name() for val in eval_functions if val.is_single_value
    ]
    classification_results_train = np.full((1, len(evaluation_names)), np.nan)
    classification_results_train = SDataArray(classification_results_train,
                                              name="Training Results",
                                              dims=["Model", "Evaluation"],
                                              coords=dict(
                                                  Evaluation=evaluation_names,
                                                  Model=[model.name]))
    special_results_test = dict()
    classification_results_test = np.full((1, len(evaluation_names)), np.nan)
    classification_results_test = SDataArray(classification_results_test,
                                             name="Test Results",
                                             dims=["Model", "Evaluation"],
                                             coords=dict(
                                                 Evaluation=evaluation_names,
                                                 Model=[model.name]))

    # Check if split is in keys and not programs
    if split_is_keys:
        train_idx = training_split
        test_idx = test_split

    # Otherwise use program-indices to get keys for training and test (the correct and default way)
    else:
        # Sentences keys
        if not access_restricted_data:
            keys = list(sorted(tensor_provider.accessible_annotated_keys))
        else:
            keys = list(
                sorted(
                    tensor_provider.annotated_keys(
                        access_restricted_data=True)))

        # Get program ids and number of programs
        program_ids = np.array(list(zip(*keys))[0])

        # Get test-indices
        test_idx = np.sum([program_ids == val for val in test_split], axis=0)
        test_idx = np.where(test_idx > 0.5)[0]

        # Get train-indices
        train_idx = np.sum([program_ids == val for val in training_split],
                           axis=0)
        train_idx = np.where(train_idx > 0.5)[0]

        # Convert to keys
        train_idx = [keys[val] for val in train_idx]
        test_idx = [keys[val] for val in test_idx]

    # Sanity check
    assert not set(test_idx).intersection(
        set(train_idx)), "Overlap between training and test set."

    # Report
    if not split_is_keys:
        print(
            "Test programs {}, using {} training samples and {} test samples.".
            format(test_split, len(train_idx), len(test_idx)))
    else:
        print(
            "Training and testing with specifically selected keys. {} training and {} test."
            .format(len(train_idx), len(test_idx)))

    # Make and set BoW-vocabulary
    bow_vocabulary = tensor_provider.extract_programs_vocabulary(train_idx)
    tensor_provider.set_bow_vocabulary(bow_vocabulary)

    # Get truth of train-set
    y_true_train = tensor_provider.load_labels(data_keys_or_idx=train_idx)

    # Get truth of test-set
    y_true = tensor_provider.load_labels(data_keys_or_idx=test_idx)

    # Initialize model
    model.initialize_model(tensor_provider=tensor_provider)

    # Number of parameters
    if model.save_type == "tf":
        with model._tf_graph.as_default():
            print("Number of trainable parameters: {}".format(
                tf_number_of_trainable_parameters()))

    # Fit model
    model.fit(tensor_provider=tensor_provider, train_idx=train_idx, verbose=2)

    # Predict on training-data
    print("\tPredicting on training data")
    y_pred_train, y_pred_train_binary = model.predict(
        tensor_provider=tensor_provider, predict_idx=train_idx)
    y_pred_train = np.squeeze(y_pred_train)
    y_pred_train_binary = np.squeeze(y_pred_train_binary)

    train_predictions = y_pred_train

    # Predict on test-data for performance
    print("\tPredicting on test data")
    y_pred, y_pred_binary = model.predict(tensor_provider=tensor_provider,
                                          predict_idx=test_idx)
    y_pred = np.squeeze(y_pred)
    y_pred_binary = np.squeeze(y_pred_binary)

    # Store predictions
    test_predictions = y_pred

    # Evaluate with eval_functions
    print("\tRunning evaluation functions")
    evaluation_nr = 0
    for evalf in eval_functions:
        # Training evaluation
        assert y_pred_train.shape == y_true_train.shape, "y_pred ({}) and y_true ({}) " \
                                                         "do not have same shape".format(y_pred_train.shape,
                                                                                         y_true_train.shape)

        if evalf.is_single_value:
            evaluation_result = evalf(y_true=y_true_train,
                                      y_pred=y_pred_train,
                                      y_pred_binary=y_pred_train_binary)
            classification_results_train[0, evaluation_nr] = evaluation_result
        else:
            special_results_train[(model.name, evalf.name())] = evalf(
                y_true=y_true_train,
                y_pred=y_pred_train,
                y_pred_binary=y_pred_train_binary)

        # Test evaluation
        assert y_pred.shape == y_true.shape, "y_pred ({}) and y_true ({}) " \
                                             "do not have same shape".format(y_pred.shape, y_true.shape)

        if evalf.is_single_value:
            evaluation_result = evalf(y_true=y_true,
                                      y_pred=y_pred,
                                      y_pred_binary=y_pred_binary)
            classification_results_test[0, evaluation_nr] = evaluation_result
            evaluation_nr += 1
        else:
            special_results_test[(model.name, evalf.name())] = evalf(
                y_true=y_true, y_pred=y_pred, y_pred_binary=y_pred_binary)

    # Save model
    print("\tSaving model")
    model.save_model()

    # Return list
    returns = [
        classification_results_train, classification_results_test,
        special_results_train, special_results_test,
        model.summary_to_string()
    ]

    # Additional returns
    if return_predictions:
        returns.extend([train_predictions, test_predictions])

    ############################################
    # Print, plot and store!

    # Make summary
    model_summary = model.summary_to_string()

    # Write the results summary to results.txt
    results_train = classification_results_train.to_dataset_split(
        "Model").to_dataframe()
    results_test = classification_results_test.to_dataset_split(
        "Model").to_dataframe()
    with Path(results_path, "results.txt").open("w") as file:
        file.write(model_summary + "\n\n")
        file.write("Training\n")
        file.write(str(results_train) + "\n\n")
        file.write("Test\n")
        file.write(str(results_test) + "\n\n")

    # Store results
    pickle.dump(results_train,
                Path(results_path, "results_train.p").open("wb"))
    pickle.dump(results_test, Path(results_path, "results_test.p").open("wb"))

    # Basic settings
    settings = dict()
    if not split_is_keys:
        settings["test_programs"] = test_split
        settings["training_programs"] = training_split
    else:
        settings["test_programs"] = "specific keys"
        settings["training_programs"] = "specific keys"
    pickle.dump(settings, Path(results_path, "settings.p").open("wb"))

    # Print results for each data-set
    print("\nSingle training Results - TRAINING \n" + "-" * 75)
    print(results_train)
    print("\nSingle training Results - TEST \n" + "-" * 75)
    print(results_test)
    print("\nModel Summary \n" + "-" * 75)
    print(model_summary)

    # Plot ROC of training
    roc_key = (model.name, "ROC")
    if roc_key in special_results_train:
        positive_rate, negative_rate = special_results_train[roc_key]
        plot_roc(tp_rate=positive_rate,
                 fp_rate=negative_rate,
                 title="{} ROC Training".format(model.name))
        save_fig(Path(results_path, "ROC_Train"))

    # Plot ROC of test
    if roc_key in special_results_test:
        positive_rate, negative_rate = special_results_test[roc_key]
        plot_roc(tp_rate=positive_rate,
                 fp_rate=negative_rate,
                 title="{} ROC Test".format(model.name))
        save_fig(Path(results_path, "ROC_Test"))

    # Print ending
    print("Script ended at: {}".format(
        datetime.now().strftime("%d-%m-%Y %H:%M:%S")))
    close_stdout_file()

    # Write a file called done.txt to mark that the script is done
    with Path(results_path, "done.txt").open("w") as file:
        file.write("The deed is done. ")

    return tuple(returns)
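
A hedged usage sketch of single_training (the program ID splits and the model instance are placeholders; only the keyword arguments come from the signature above):

# Illustrative call; the model construction and program IDs are hypothetical.
the_tensor_provider = TensorProvider(verbose=True)
the_model = PULogisticRegressionSK()  # hypothetical: any DetektorModel instance
results = single_training(
    tensor_provider=the_tensor_provider,
    model=the_model,
    test_split=[1, 2],          # hypothetical held-out program IDs
    training_split=[3, 4, 5],   # hypothetical training program IDs
    base_path=Path(ProjectPaths.results, "single_training_example"),
    return_predictions=False)
classification_results_train, classification_results_test = results[0], results[1]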