Example #1
def trainAndTest(dataset, enable_data_augmentation=False, percentage_similarity_loss=0, LSTM=False, EPOCHS=500, enable_same_noise=False, save_output=True, NlogN=True):
    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(dataset, variables_as_channels=True)

    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape, X_test.shape))
    print(np.shape(y_train))

    if enable_data_augmentation or len(X_train) >= 1000:
        # LSTM greatly extends the training time, so disable it when the training set
        # is large (or is about to become large after data augmentation)
        LSTM = False

    title = "{}-DA:{}-CoefSimilar:{}-LSTM:{}".format(dataset, enable_data_augmentation, percentage_similarity_loss, LSTM)

    ##### Preprocess Data ####
    num_train = len(X_train)
    if num_train < 1000 and enable_data_augmentation:
        X_train = augment_data(X_train, enable_same_noise=enable_same_noise)
        num_train = len(X_train)

    # Randomly generate pairs of training series.
    # With NlogN enabled we draw N * log2(N) pairs, which grows quickly:
    # for N = 1000 that is already about 10,000 pairs.
    if NlogN:
        num_of_pairs = num_train * int(math.log2(num_train))
    else:
        num_of_pairs = num_train
    X, Y = generateRandomPairs(num_of_pairs, X_train)

    normalized_X, normalized_Y, distance = calculatePreSBD(X, Y)

    ###### Training Stage #####
    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        "filters": [32, 64, 128],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }

    ae = AutoEncoder(**kwargs)

    # Training
    loss_history = []
    t1 = time.time()
    for epoch in range(EPOCHS):
        # Print progress every 100 epochs (at epoch 50, 150, 250, ...)
        if epoch % 100 == 50:
            print("Epoch {}/{}".format(epoch, EPOCHS))
        total_loss = train_step(normalized_X, normalized_Y, distance, ae, alpha=percentage_similarity_loss, LSTM=LSTM)
        loss_history.append(total_loss)
        # print("Epoch {}: {}".format(epoch, total_loss), end="\r")
        
    print("The training time for dataset {} is: {:.2f} minutes".format(dataset, (time.time() - t1) / 60))


    #%%
    plt.clf()
    plt.xlabel("epoch (starting from epoch 5)")
    plt.ylabel("loss")
    plt.title("Loss vs epoch")
    plt.plot(loss_history[5:])
    # plt.show()
    if save_output:
        if not os.path.isdir(ouput_dir_name + dataset):
            os.mkdir(ouput_dir_name + dataset)
            with open(ouput_dir_name + dataset + "/record.txt", "a") as f:
                f.write("Dataset, Data Augmentation, Coefficient of Similarity Loss, LSTM, EPOCHS, Distance Measure, L2 Distance, 10-nn score, NlogN\n")
        
        plt.savefig(ouput_dir_name + dataset + "/" + title + "-loss.png")

    #%%
    X_test = normalize(X_test)
    code_test = ae.encode(X_test, LSTM = LSTM)
    decoded_test = ae.decode(code_test)
    plt.clf()
    plt.plot(X_test[0], label="Original TS")
    plt.plot(decoded_test[0], label="Reconstructed TS")
    if save_output:
        plt.savefig(ouput_dir_name + dataset + "/" + title + "-reconstruction.png")
    # plt.show()

    losses = []
    for ground, predict in zip(X_test, decoded_test):
        losses.append(np.linalg.norm(ground - predict))

    L2_distance = np.array(losses).mean()
    print("Mean L2 distance: {}".format(L2_distance))


    #%%
    from sklearn.neighbors import NearestNeighbors

    nn_x_test = np.squeeze(X_test)
    baseline_nn = NearestNeighbors(n_neighbors=10, metric=SBD).fit(nn_x_test)
    code_nn = NearestNeighbors(n_neighbors=10).fit(code_test)  # the default metric is Euclidean distance

    # For each item in the test data, find its 11 nearest neighbors within the test set (the first neighbor is the item itself)
    baseline_11nn = baseline_nn.kneighbors(nn_x_test, 11, return_distance=False)
    code_11nn     = code_nn.kneighbors(code_test, 11, return_distance=False)

    # On average, how many items do the two 10-NN sets have in common?
    result = []
    for b, c in zip(baseline_11nn, code_11nn):
        # remove the first nn (itself)
        b = set(b[1:])
        c = set(c[1:])
        result.append(len(b.intersection(c)))

    ten_nn_score = np.array(result).mean()
    print("10-nn score is:", ten_nn_score)
    if save_output:
        with open(ouput_dir_name + dataset + "/record.txt", "a") as f:
            f.write(",".join([dataset, str(enable_data_augmentation), str(percentage_similarity_loss), str(LSTM), str(EPOCHS), distance_measure, str(round(L2_distance,2)), str(round(ten_nn_score,2)), str(NlogN)]) + "\n")
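
A minimal sketch of how trainAndTest might be driven, assuming UCR-style dataset names and the module-level globals ouput_dir_name and distance_measure that the function references; none of the lines below are part of the original code.

import os

# Hypothetical driver (dataset names and globals are assumptions)
ouput_dir_name = "results/"   # assumed global used by trainAndTest for its output paths
distance_measure = "SBD"      # assumed global written into record.txt
os.makedirs(ouput_dir_name, exist_ok=True)

for name in ["GunPoint", "Coffee", "ECG200"]:
    trainAndTest(name, enable_data_augmentation=True, percentage_similarity_loss=0.5, EPOCHS=500)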
Example #2
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("-a",
                        "--auto",
                        action="store_true",
                        help="autoencoder")
    parser.add_argument("-e",
                        "--encauto",
                        action="store_true",
                        help="encoder + autoencoder")
    parser.add_argument("-s",
                        "--seqencauto",
                        action="store_true",
                        help="Encoder(sim) + autoencoder(rec)")
    parser.add_argument("l", help="lambda coefficient")
    parser.add_argument("filter1", help="number of filters in the first conv layer")
    parser.add_argument("filter2", help="number of filters in the second conv layer")
    parser.add_argument("filter3", help="number of filters in the third conv layer")
    parser.add_argument("epoch", help="number of training epochs")
    parser.add_argument("batch", help="batch size")
    args = parser.parse_args()

    m_type = None
    if args.auto:
        m_type = "autoencoder"
    elif args.encauto:
        m_type = "encoder_autoencoder"
    elif args.seqencauto:
        m_type = "Encoder_sim_autoencoder_rec"
    else:
        raise Exception("model type flag not set")

    model_type_log = "{m_type} lambda={l} filter=[{filter1}, {filter2}, {filter3}] epoch={epoch} batch={batch}".format(
        m_type=m_type,
        l=args.l,
        filter1=args.filter1,
        filter2=args.filter2,
        filter3=args.filter3,
        epoch=args.epoch,
        batch=args.batch)

    filters = [int(args.filter1), int(args.filter2), int(args.filter3)]
    BATCH = int(args.batch)
    EPOCHS = int(args.epoch)
    lam = float(args.l)

    hyperparams["model_type"] = model_type_log
    hyperparams["epochs"] = EPOCHS
    hyperparams["batch_size"] = BATCH

    experiment = Experiment(log_code=False)
    experiment.log_parameters(LAMBDA)
    experiment.log_parameters(hyperparams)

    dataset_name = "GunPoint"

    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(
        dataset_name, variables_as_channels=True)
    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape,
                                                      X_test.shape))

    print(X_train.shape, y_train.shape)
    X_train, y_train = augmentation(X_train, y_train)
    # X_test, y_test = augmentation(X_test, y_test)
    print(X_train.shape, y_train.shape)
    # fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    # axs[0].plot(X_train[200])
    X_train = min_max(X_train, feature_range=(-1, 1))
    # axs[1].plot(X_train[200])
    X_test = min_max(X_test, feature_range=(-1, 1))
    # plt.show()

    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        # "filters": [32, 64, 128],
        # "filters": [128, 64, 32],
        "filters": filters,
        # "filters": [32, 32, 32],
        # "filters": [32, 32, 16],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }

    # lambda_to_test = [0.9, ]
    # for l in range(1, 10):
    #     lam = l / 10

    # lam = 0.99
    ae = AutoEncoder(**kwargs)

    input_shape = kwargs["input_shape"]
    code_size = kwargs["code_size"]
    filters = kwargs["filters"]
    kernel_sizes = kwargs["kernel_sizes"]
    encoder = Encoder(input_shape, code_size, filters, kernel_sizes)
    # training

    SHUFFLE_BUFFER = 100
    K = len(set(y_train))

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH)

    suffix = "lam={lam}".format(lam=lam)
    train(ae, encoder, EPOCHS, train_dataset, suffix, experiment, lam, args)

    code_test = recon_eval(ae, X_test, suffix, experiment)
    sim_eval(X_test, code_test, suffix, experiment)

    cwd = os.path.abspath(os.getcwd())
    metadata = "lambda_{l}_filter_{filter1}{filter2}{filter3}_epoch_{epoch}_batch_{batch}".format(
        l=args.l,
        filter1=args.filter1,
        filter2=args.filter2,
        filter3=args.filter3,
        epoch=args.epoch,
        batch=args.batch)
    encoder_path = os.path.join(cwd, m_type, dataset_name, metadata, "encoder")
    ae_encoder_path = os.path.join(cwd, m_type, dataset_name, metadata,
                                   "auto_encoder")
    ae_decoder_path = os.path.join(cwd, m_type, dataset_name, metadata,
                                   "decoder")

    if not args.auto:
        encoder.save(encoder_path)
    ae.encode.save(ae_encoder_path)
    ae.decode.save(ae_decoder_path)
    sample_evaluation(ae.encode,
                      ae.encode,
                      ae.decode,
                      experiment,
                      suffix,
                      DATA=dataset_name)
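
A hypothetical command-line invocation (the file name train.py is an assumption); the flag selects the model variant and the six positionals are lambda, the three filter counts, the epoch count, and the batch size.

if __name__ == "__main__":
    # Hypothetical usage:
    #   python train.py --encauto 0.5 32 64 128 500 16
    main()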
Example #3
                    help="dataset to run")
PARSER.add_argument('-m',
                    '--models',
                    default="sample_model",
                    required=False,
                    help="path to the directory containing the saved models")
ARGS = PARSER.parse_args()

DATA = ARGS.dataset
MODELS_PATH = ARGS.models

ENCODER = tf.keras.models.load_model(os.path.join(MODELS_PATH, DATA,
                                                  "encoder"))
DECODER = tf.keras.models.load_model(os.path.join(MODELS_PATH, DATA,
                                                  "decoder"))
X_TRAIN, Y_TRAIN, X_TEST, Y_TEST, _ = py_ts_data.load_data(
    DATA, variables_as_channels=True)
# all are read in with 3 dims, last is num of variables in the TS
assert len(X_TRAIN.shape) == 3
# we care only about univariate TS
assert X_TRAIN.shape[2] == 1
X_TRAIN = np.squeeze(X_TRAIN, axis=2)
X_TEST = np.squeeze(X_TEST, axis=2)

N_NEIGHBORS = 10
N_CLUSTERS = len(set(Y_TRAIN))
CLUSTERING = KMeans(N_CLUSTERS).fit(X_TRAIN)
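
A small hypothetical follow-up, not in the original snippet, assuming the fitted k-means baseline is later used to assign test series to clusters:

# Hypothetical: label each test series with its nearest train-set centroid
TEST_CLUSTERS = CLUSTERING.predict(X_TEST)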


def encoder(x):
    assert len(x.shape) == 2
    x = x[..., np.newaxis]
Example #4
def main():

    experiment = Experiment(log_code=False)
    experiment.log_parameters(LAMBDA)
    experiment.log_parameters(hyperparams)

    dataset_name = "GunPoint"

    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(
        dataset_name, variables_as_channels=True)
    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape,
                                                      X_test.shape))

    print(X_train.shape, y_train.shape)
    X_train, y_train = augmentation(X_train, y_train)
    # X_test, y_test = augmentation(X_test, y_test)
    print(X_train.shape, y_train.shape)
    # fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    # axs[0].plot(X_train[200])
    X_train = min_max(X_train, feature_range=(-1, 1))
    # axs[1].plot(X_train[200])
    X_test = min_max(X_test, feature_range=(-1, 1))
    # plt.show()

    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        # "filters": [32, 64, 128],
        # "filters": [128, 64, 32],
        "filters": [64, 32, 16],
        # "filters": [32, 32, 32],
        # "filters": [32, 32, 16],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }

    # lambda_to_test = [0.9, ]
    # for l in range(1, 10):
    #     lam = l / 10

    lam = 0.99
    ae = AutoEncoder(**kwargs)

    input_shape = kwargs["input_shape"]
    code_size = kwargs["code_size"]
    filters = kwargs["filters"]
    kernel_sizes = kwargs["kernel_sizes"]
    encoder = Encoder(input_shape, code_size, filters, kernel_sizes)
    # training

    SHUFFLE_BUFFER = 100
    K = len(set(y_train))

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH)

    suffix = "lam={lam}".format(lam=lam)
    train(ae, encoder, EPOCHS, train_dataset, suffix, experiment, lam)

    code_test = recon_eval(ae, X_test, suffix, experiment)
    sim_eval(X_test, code_test, suffix, experiment)

    encoder.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\encoder"
    )
    ae.encode.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\auto_encoder"
    )
    ae.decode.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\decoder"
    )
    sample_evaluation(ae.encode,
                      ae.encode,
                      ae.decode,
                      experiment,
                      suffix,
                      DATA=dataset_name)
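
Models written with save() this way can be read back with tf.keras.models.load_model, as Example #3 does; a minimal sketch reusing the hard-coded encoder path from above (to be run after main() has saved the models):

import tensorflow as tf

# Hypothetical reload of the encoder saved above
reloaded_encoder = tf.keras.models.load_model(
    r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\encoder"
)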