Example #1
def test(netFile, dataSet):
    trees = tr.loadTrees(dataSet)
    assert netFile is not None, "Must give model to test"
    with open(netFile, 'r') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords,
                       opts.minibatch)
        rnn.initParams()
        rnn.fromFile(fid)
    print "Testing..."
    cost, correct, total = rnn.costAndGrad(trees, test=True)
    print "Cost %f, Correct %d/%d, Acc %f" % (cost, correct, total,
                                              correct / float(total))
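A minimal invocation sketch for the function above; both arguments are placeholders ("models/test.bin" is the default outFile used in Example #5, and "dev" stands for whatever split names tr.loadTrees accepts). It assumes a checkpoint pickled in the order opts, training cost, then weights, as Example #5 writes it:

# hypothetical call; the path and split name are placeholder values
test("models/test.bin", "dev")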
Example #2
def test(model_dir, dataSet):
    trees = tr.loadTrees(dataSet)
    total_df = pd.DataFrame()
    assert model_dir is not None, "Must give model to test"
    with open(model_dir + "/checkpoint.bin", 'r') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords, opts.optimizer_settings['minibatch'])
        rnn.initParams()
        rnn.fromFile(fid)
    log.info("Testing...")
    cost, correct, total, df = rnn.costAndGrad(trees, test = True)
    total_df = total_df.append(df, ignore_index = True)
    total_df.to_csv(model_dir + "/test_preds.csv", header = True, index = False)
    test_performance = pd.DataFrame()
    row = {"Cost": cost, "Correct": correct, "Total": total, "Accuracy": correct / float(total)}
    test_performance = test_performance.append(row, ignore_index = True)
    test_performance.to_csv(model_dir + "/test_performance.csv", header = True, index = False)
    log.info("Cost %f, Correct %d/%d, Acc %f" % (cost, correct, total, correct / float(total)))
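The two CSV files written above can be read back with pandas for later inspection; a small sketch, where the model_dir value is a placeholder for the directory passed to test():

import pandas as pd

model_dir = "./results/some_run"  # placeholder; use the directory passed to test()
preds = pd.read_csv(model_dir + "/test_preds.csv")
perf = pd.read_csv(model_dir + "/test_performance.csv")
print(perf[["Cost", "Correct", "Total", "Accuracy"]])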
Example #3
def perform_evaluation(iterations=1000):
    df_unknown = pd.DataFrame()
    df_preds = pd.DataFrame()
    RESULTS_DIR = './results/new_results/results_account3/pos_tags/2020_07_23---15_44_766248'

    with open(RESULTS_DIR + "/checkpoint.bin", 'r') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords,
                       opts.optimizer_settings['minibatch'])
        rnn.initParams()
        rnn.fromFile(fid)
    rnn.L, rnn.V, rnn.W, rnn.b, rnn.Ws, rnn.bs = rnn.stack

    texts = [
        "If the content is not available in the detected system language, English is selected by default."
    ]

    for text in texts:
        vocab_index, tokenized_sentence_original, df_unknown = load_vocab_index(
            text, df_unknown)
        word_embeddings = np.zeros((opts.wvecDim, len(vocab_index)))
        for i, index in enumerate(vocab_index):
            word_embeddings[:, i] = rnn.L[:, index]

        probabilities = np.zeros((2, word_embeddings.shape[1]))

        for index in range(word_embeddings.shape[1]):
            # Softmax
            word_embedding = word_embeddings[:, index]
            probs = np.dot(rnn.Ws, word_embedding) + rnn.bs
            probs -= np.max(probs)
            probs = np.exp(probs)
            probs = probs / np.sum(probs)
            probabilities[:,
                          index] = [probs[np.argmax(probs)],
                                    np.argmax(probs)]

        start = np.concatenate([word_embeddings, probabilities])
        token_index = np.arange(0, len(tokenized_sentence_original))
        token_index = token_index[:, None].T
        start = np.concatenate([start, token_index])

        # add two rows to store the indices of the two child nodes
        cache = np.zeros((start.shape[0] + 2, start.shape[1]))
        # initialize the tensor used to accumulate the certainties
        working_tensor_empty = np.repeat(start[:, :, np.newaxis],
                                         start.shape[1] - 1,
                                         axis=2)

        num_runs = iterations
        tokenized_sentences_array = []
        certainties_runs = []
        total_certainties = np.zeros_like(working_tensor_empty)
        total_certainties = np.repeat(total_certainties[:, :, :, np.newaxis],
                                      num_runs,
                                      axis=3)
        for h in range(0, num_runs):
            certainty = 0.0
            tokenized_sentence = list(tokenized_sentence_original)
            # binary tree has maximum height of n-1 (start.shape[1]-1)
            working_tensor = np.repeat(start[:, :, np.newaxis],
                                       start.shape[1] - 1,
                                       axis=2)
            # i is the counter for the height of the tree, going through the third dimension of the tensor
            for i in range(0, start.shape[1] - 1):
                # j is the counter for the predictions done in one step
                cache = np.zeros_like(cache)
                for j in range(0, start.shape[1] - 1):
                    k = j + 1
                    # affine with j and j+1
                    if np.isnan(working_tensor[0, j, i]):
                        continue

                    # advance k to the next column that has not yet been merged away (non-NaN)
                    while np.isnan(working_tensor[0, k, i]) and k < start.shape[1] - 1:
                        k = k + 1

                    lr = np.hstack(
                        [working_tensor[:-3, j, i], working_tensor[:-3, k, i]])
                    cache[:-5, k] = np.dot(rnn.W, lr) + rnn.b
                    cache[:-5, k] += np.tensordot(rnn.V,
                                                  np.outer(lr, lr),
                                                  axes=([1, 2], [0, 1]))
                    # Tanh
                    cache[:-5, k] = np.tanh(cache[:-5, k])

                    # Compute softmax
                    probs = np.dot(rnn.Ws, cache[:-5, k]) + rnn.bs
                    probs -= np.max(probs)
                    probs = np.exp(probs)
                    probs = probs / np.sum(probs)
                    # add probs and label to the vector
                    cache[-5:-3,
                          k] = [probs[np.argmax(probs)],
                                np.argmax(probs)]
                    cache[-3, k], tokenized_sentence = create_new_token(
                        working_tensor[-1, j, i].astype(int),
                        working_tensor[-1, k, i].astype(int),
                        tokenized_sentence_in_method=tokenized_sentence)
                    # we store here the indices from the working tensor, to know which two cache items are merged
                    cache[-2, k] = j
                    cache[-1, k] = k

                # rank the candidate merges by their softmax probability (top five)
                best_five_preds = cache[-5, :].argsort()[-5:]
                # keep only candidates whose probability exceeds the threshold
                filtered_best_preds = []
                for index in best_five_preds:
                    prediction = cache[-5, index]
                    if index == cache.shape[1] - 1 and np.count_nonzero(
                            cache[-5, :]) > 1:
                        continue
                    if prediction > 0.01:
                        filtered_best_preds.append(index)
                if filtered_best_preds:
                    # pick one of the remaining candidates at random
                    random_index = np.random.choice(len(filtered_best_preds),
                                                    1,
                                                    replace=False)
                    best_cache_index = filtered_best_preds[random_index[0]]

                best_cache = cache[:, best_cache_index]
                certainty += best_cache[-5]
                # we push the best merge to the next tensor slice
                second_child_of_cache = best_cache[-1].astype(int)
                # repeat the merged values into every later slice along the third axis
                best_cache_all_dims = np.repeat(best_cache[:-2, np.newaxis],
                                                working_tensor.shape[2] - i -
                                                1,
                                                axis=1)

                working_tensor[:, second_child_of_cache,
                               i + 1:] = best_cache_all_dims
                # since the merged result always takes the second child's position, mark the
                # first child's position as NaN in all upcoming slices; NaNs in between make a
                # positional lookup unreliable, so the first child's index is taken from the
                # two extra rows added to the cache above
                first_child_of_cache = best_cache[-2].astype(int)
                working_tensor[:, first_child_of_cache, i + 1:] = np.nan

            tokenized_sentences_array.append(tokenized_sentence)
            total_certainties[:, :, :, h] = working_tensor
            certainties_runs.append(certainty)

        best_run = np.argmax(certainties_runs)

        best_working_tensor = total_certainties[:, :, :, best_run]

        tensor_slice = best_working_tensor[-2:, :, :]

        # rows are [label, token index] for every node in every slice of the best run
        label_token_pairs = np.reshape(tensor_slice, (2, -1))

        prediction_records = []

        for col in label_token_pairs.T:
            label = col[0]
            if not np.isnan(col[1]):
                token = tokenized_sentences_array[best_run][col[1].astype(int)]
                record = [token, label_names[label.astype(int)], text]
                if record not in prediction_records:
                    prediction_records.append(record)

        prediction_records.sort(key=lambda x: len(x[0]), reverse=False)
        df = pd.DataFrame.from_records(prediction_records,
                                       columns=['token', 'label', 'sentence'])
        df_preds = df_preds.append(df)

    df_preds.to_csv("predictions.csv", header=True, index=False)
    df_unknown.to_csv("unknown_words.csv", header=True, index=False)
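The numerically stable softmax classification of a single node is written out twice in the example above (once for the word embeddings, once for each candidate merge); a helper along these lines could factor it out. This is a sketch only, assuming Ws has shape (numClasses, wvecDim) and bs shape (numClasses,), as their usage above implies:

import numpy as np

def classify_node(Ws, bs, hidden):
    # numerically stable softmax over the class scores of one hidden vector
    scores = np.dot(Ws, hidden) + bs
    scores -= np.max(scores)  # subtract the max before exponentiating
    probs = np.exp(scores)
    probs /= np.sum(probs)
    # return (confidence of the predicted class, predicted class index),
    # the same pair the example stores as [probs[np.argmax(probs)], np.argmax(probs)]
    return probs[np.argmax(probs)], np.argmax(probs)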
Example #4
def run(args=None):
    usage = "usage : %prog [options]"
    parser = optparse.OptionParser(usage = usage)

    parser.add_option("--test", action = "store_true", dest = "test", default = False)

    # The params file contains the hyperparameters for training
    parser.add_option('--params_file', dest = "params_file", default = './params/exp_params.json',
                      help = "Path to the file containing the training settings")
    parser.add_option('--data_dir', dest = "data_dir", default = './trees',
                      help = "Directory containing the trees")

    # Directory containing the model to test
    parser.add_option("--model_directory", dest = "test_dir", type = "string")
    parser.add_option("--data", dest = "data", type = "string", default = "train")

    (opts, args) = parser.parse_args(args)

    results_dir = "./results"
    if not opts.test:
        results_dir_current_job = os.path.join(results_dir, utils.now_as_str_f())
        while os.path.isdir(results_dir_current_job):  # generate a new timestamp if the current one already exists
            results_dir_current_job = os.path.join(results_dir, utils.now_as_str_f())
        os.makedirs(results_dir_current_job)

    # Load training settings (e.g. hyperparameters)
    params = utils.Params(opts.params_file)

    if not opts.test:
        # Copy the settings file into the results directory
        copyfile(opts.params_file, os.path.join(results_dir_current_job, os.path.basename(opts.params_file)))

    # Get the logger
    if opts.test:
        log_path = os.path.join(opts.test_dir, 'testing.log')
    else:
        log_path = os.path.join(results_dir_current_job, 'training.log')
    log_level = params.log_level if hasattr(params, 'log_level') else logging.DEBUG
    log = utils.get_logger(log_path, log_level)

    if opts.test:
        log.info("Testing directory: " + opts.test_dir)
        log.info("Dataset used for testing: " + opts.data)
    else:
        log.info("Results directory: " + results_dir_current_job)
        log.info("Minibatch: " + str(params.optimizer_settings['minibatch']))
        log.info("Optimizer: " + params.optimizer)
        log.info("Epsilon: " + str(params.optimizer_settings['epsilon']))
        log.info("Alpha: " + str(params.optimizer_settings['alpha']))
        log.info("Number of samples used: " + str(params.sample_size))

    # Testing
    if opts.test:
        test(opts.test_dir, opts.data)
        return

    log.info("Loading data...")
    # load training data
    trees = tr.loadTrees(sample_size = params.sample_size)
    params.numWords = len(tr.loadWordMap())
    overall_performance = pd.DataFrame()

    rnn = nnet.RNN(params.wvecDim, params.outputDim, params.numWords, params.optimizer_settings['minibatch'])
    rnn.initParams()

    sgd = optimizer.SGD(rnn, alpha = params.optimizer_settings['alpha'],
                        minibatch = params.optimizer_settings['minibatch'],
                        optimizer = params.optimizer, epsilon = params.optimizer_settings['epsilon'])

    best_val_cost = float('inf')
    best_epoch = 0

    for e in range(params.num_epochs):
        start = time.time()
        log.info("Running epoch %d" % e)
        df, updated_model, train_cost, train_acc = sgd.run(trees)
        end = time.time()
        log.info("Time per epoch : %f" % (end - start))
        log.info("Training accuracy : %f" % train_acc)
        # VALIDATION
        val_df, val_cost, val_acc = validate(updated_model, results_dir_current_job)

        if val_cost < best_val_cost:
            # best validation cost we have seen so far
            log.info("Validation score improved, saving model")
            best_val_cost = val_cost
            best_epoch = e
            best_epoch_row = {"epoch": e, "train_cost": train_cost, "val_cost": val_cost, "train_acc": train_acc,
                              "val_acc": val_acc}
            with open(results_dir_current_job + "/checkpoint.bin", 'w') as fid:
                pickle.dump(params, fid)
                pickle.dump(sgd.costt, fid)
                rnn.toFile(fid)

        val_df.to_csv(results_dir_current_job + "/validation_preds_epoch_" + str(e) + ".csv", header = True, index = False)
        df.to_csv(results_dir_current_job + "/training_preds_epoch_" + str(e) + ".csv", header = True, index = False)

        row = {"epoch": e, "train_cost": train_cost, "val_cost": val_cost, "train_acc": train_acc, "val_acc": val_acc}
        overall_performance = overall_performance.append(row, ignore_index = True)

        # break if no val loss improvement in the last epochs
        if (e - best_epoch) >= params.num_epochs_early_stop:
            log.info("No improvement in the last {num_epochs_early_stop} epochs, stop training.".format(num_epochs_early_stop=params.num_epochs_early_stop))
            break

    overall_performance = overall_performance.append(best_epoch_row, ignore_index = True)
    overall_performance.to_csv(results_dir_current_job + "/train_val_costs.csv", header = True, index = False)
    log.info("Experiment end")
Example #5
def run(args=None):
    usage = "usage : %prog [options]"
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("--test",
                      action="store_true",
                      dest="test",
                      default=False)

    # Optimizer
    parser.add_option("--minibatch", dest="minibatch", type="int", default=30)
    parser.add_option("--optimizer",
                      dest="optimizer",
                      type="string",
                      default="adagrad")
    parser.add_option("--epochs", dest="epochs", type="int", default=50)
    parser.add_option("--step", dest="step", type="float", default=1e-2)

    parser.add_option("--outputDim", dest="outputDim", type="int", default=5)
    parser.add_option("--wvecDim", dest="wvecDim", type="int", default=30)
    parser.add_option("--outFile",
                      dest="outFile",
                      type="string",
                      default="models/test.bin")
    parser.add_option("--inFile",
                      dest="inFile",
                      type="string",
                      default="models/test.bin")
    parser.add_option("--data", dest="data", type="string", default="train")

    (opts, args) = parser.parse_args(args)

    # Testing
    if opts.test:
        test(opts.inFile, opts.data)
        return

    print "Loading data..."
    # load training data
    trees = tr.loadTrees()
    opts.numWords = len(tr.loadWordMap())

    rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords, opts.minibatch)
    rnn.initParams()

    sgd = optimizer.SGD(rnn,
                        alpha=opts.step,
                        minibatch=opts.minibatch,
                        optimizer=opts.optimizer)

    for e in range(opts.epochs):
        start = time.time()
        print "Running epoch %d" % e
        sgd.run(trees)
        end = time.time()
        print "Time per epoch : %f" % (end - start)

        with open(opts.outFile, 'w') as fid:
            pickle.dump(opts, fid)
            pickle.dump(sgd.costt, fid)
            rnn.toFile(fid)
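Because run(args=None) forwards args straight to parser.parse_args, the entry point can be driven programmatically as well as from the shell; a sketch, where the module name runNNet is an assumption (the snippets do not name the file):

import runNNet  # hypothetical module name

# train and write the model to the default outFile
runNNet.run(["--epochs", "3", "--outFile", "models/test.bin"])
# evaluate the saved model on the training split
runNNet.run(["--test", "--inFile", "models/test.bin", "--data", "train"])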