예제 #1
0
def main():
    """Load MNIST, fit a fresh ``Model`` with one ``train`` call, and print test accuracy.

    Afterwards ten consecutive test images, starting at a randomly drawn
    offset, are handed to ``visualize_results`` together with the model's
    outputs for them and their labels.
    """
    train_count = 60000
    test_count = 10000
    # Drawn up front; the upper bound leaves room for a full ten-image window.
    window_start = randint(0, test_count - 10)

    train_inputs, train_labels = get_data(
        'MNIST_data/train-images-idx3-ubyte.gz',
        'MNIST_data/train-labels-idx1-ubyte.gz',
        train_count,
    )
    test_inputs, test_labels = get_data(
        'MNIST_data/t10k-images-idx3-ubyte.gz',
        'MNIST_data/t10k-labels-idx1-ubyte.gz',
        test_count,
    )

    # Build the model and train it.
    model = Model()
    train(model, train_inputs, train_labels)

    # Evaluate on the held-out split.
    print('accuracy =', test(model, test_inputs, test_labels))

    # Visualize the ten-image window chosen above.
    window = slice(window_start, window_start + 10)
    sample_images = test_inputs[window, :]
    sample_labels = test_labels[window]
    visualize_results(sample_images, model.call(sample_images), sample_labels)
예제 #2
0
def main():
    """Create a YOLO model and train or test it according to ``args.mode``.

    The latest checkpoint is restored first when ``args.restore_checkpoint``
    is set or the mode is ``'test'``. In ``'train'`` mode every epoch is
    followed by a checkpoint save and a pass over the test split.
    """
    model = YOLO()

    # Checkpoint plumbing shared by saving and restoring; keeps 3 checkpoints.
    checkpoint = tf.train.Checkpoint(model=model)
    manager = tf.train.CheckpointManager(
        checkpoint, './checkpoints', max_to_keep=3)

    def evaluate():
        # Load the test split and run the model over it.
        test_boxes, test_size = get_data('test')
        test(model, test_boxes, test_size)

    if args.restore_checkpoint or args.mode == 'test':
        # Pick up the most recent checkpoint known to the manager.
        checkpoint.restore(manager.latest_checkpoint).expect_partial()

    if args.mode == 'test':
        evaluate()
    elif args.mode == 'train':
        train_boxes, train_size = get_data('train')
        for epoch in range(model.num_epochs):
            print(
                '========================== EPOCH %d  =========================='
                % epoch)
            train(model, train_boxes, train_size)
            print("**** SAVING CHECKPOINT AT END OF EPOCH ****")
            manager.save()
            evaluate()
예제 #3
0
def main():
    """Train a model on the CIFAR data for ten epochs, then time and report testing.

    Prints the wall-clock duration of the training loop, the duration of the
    single ``test`` call, and the accuracy that call returned.

    :return: None
    """
    # The trailing 3 and 5 are forwarded to get_data unchanged
    # (presumably the two class ids to keep -- confirm against get_data).
    train_inputs, train_labels = get_data('CIFAR_data_compressed/train', 3, 5)
    test_inputs, test_labels = get_data('CIFAR_data_compressed/test', 3, 5)

    model = Model()

    training_started = time.time()
    for epoch in range(10):
        print("epoch", epoch)
        train(model, train_inputs, train_labels)
    training_finished = time.time()
    print("training took", training_finished - training_started)

    acc = test(model, test_inputs, test_labels)
    testing_finished = time.time()
    print("testing took", testing_finished - training_finished)

    print("accuracy", acc)

    # Optional visualization, left disabled:
    # visualize_results(test_inputs[0:10], model.call(test_inputs[0:10]), test_labels[0:10], "cat", "dog")
예제 #4
0
def main():
    """Load the train/test CSV data, then compile and fit a DenseNet.

    Paths, warp size, growth rate, dropout rate, batch size and epoch count
    all come from the module-level ``args``. The test split is passed to
    ``fit`` as validation data.
    """
    train_split = get_data(args.train_csv_path, args.warp_size)
    test_split = get_data(args.test_csv_path, args.warp_size)

    # get_data hands back (labels, images); both are turned into tensors.
    train_labels, train_images = map(tf.convert_to_tensor, train_split)
    test_labels, test_images = map(tf.convert_to_tensor, test_split)

    # Append a single channel dimension (black and white images).
    input_shape = args.warp_size + (1, )

    # TODO: the final positional argument is hard-coded to True; per the
    # original note it is the training flag and should be switchable later.
    model = DenseNet(input_shape, args.growth_k, args.drop_rate, True)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(
        train_images,
        train_labels,
        batch_size=args.batch_size,
        epochs=args.num_epochs,
        validation_data=(test_images, test_labels),
    )
def main():
    """Train and test a model on MNIST, then visualize ten random test images.

    Loads 60000 training and 10000 test examples, calls ``train`` once on
    the full training set, prints the accuracy returned by ``test``, and
    passes ten randomly selected test images, the model's outputs for them
    and their labels to ``visualize_results``.

    :return: None
    """
    num_train = 60000
    num_test = 10000

    train_images, train_labels = get_data(
        'MNIST_data/train-images-idx3-ubyte.gz',
        'MNIST_data/train-labels-idx1-ubyte.gz', num_train)
    test_images, test_labels = get_data(
        'MNIST_data/t10k-images-idx3-ubyte.gz',
        'MNIST_data/t10k-labels-idx1-ubyte.gz', num_test)

    # Create the model and train it with a single call over all the data.
    model = Model()
    train(model, train_images, train_labels)

    # Evaluate after training.
    accuracy = test(model, test_images, test_labels)
    print(accuracy)

    # Pick ten random test indices. ``high`` is exclusive, so it must be
    # ``num_test`` (not ``num_test - 1``) for the last image to be eligible.
    selection = np.random.randint(low=0, high=num_test, size=10)
    vis_images = test_images[selection, :]
    vis_labels = test_labels[selection]
    prob = model.call(vis_images)
    visualize_results(vis_images, prob, vis_labels)
예제 #6
0
def main():
    """Train and evaluate a model on MNIST, then visualize the first ten test images.

    ``train`` is called exactly once over the full training set, the accuracy
    returned by ``test`` is printed, and the first ten test images are passed
    to ``visualize_results`` with the model's outputs and their labels.

    :return: None
    """
    train_count, test_count = 60000, 10000

    # Load both splits.
    train_inputs, train_labels = get_data(
        'MNIST_data/train-images-idx3-ubyte.gz',
        'MNIST_data/train-labels-idx1-ubyte.gz',
        train_count,
    )
    test_inputs, test_labels = get_data(
        'MNIST_data/t10k-images-idx3-ubyte.gz',
        'MNIST_data/t10k-labels-idx1-ubyte.gz',
        test_count,
    )

    # Create and train the model.
    model = Model()
    train(model, train_inputs, train_labels)

    # Report accuracy on the test split.
    print('accuracy =', test(model, test_inputs, test_labels))

    # Visualize the leading ten test examples.
    first_ten_images = test_inputs[0:10, :]
    first_ten_labels = test_labels[0:10]
    visualize_results(first_ten_images, model.call(first_ten_images),
                      first_ten_labels)
예제 #7
0
def main():
    """Load the CIFAR train/test splits and run 25 training passes."""
    # Preprocess the CIFAR dataset; 3 and 5 are forwarded to get_data as-is.
    train_inputs, train_labels = get_data("CIFAR_data_compressed/train", 3, 5)
    test_inputs, test_labels = get_data("CIFAR_data_compressed/test", 3, 5)

    model = Model()
    for _ in range(25):
        # Timestamp taken at the start of each pass; nothing in the visible
        # code reads it afterwards.
        pass_started = time.time()
        train(model, train_inputs, train_labels)
예제 #8
0
def main():
    """Train a model for ``Model().epochs`` passes and print its test accuracy.

    The data is read from the ``data/train`` and ``data/test`` folders.
    """
    train_inp, train_lab = get_data("data/train", 3, 5)
    test_inp, test_lab = get_data("data/test", 3, 5)

    mod = Model()
    for _ in range(mod.epochs):
        train(mod, train_inp, train_lab)

    print(test(mod, test_inp, test_lab))
예제 #9
0
def main():
    """Build next-step inputs/labels from the training sequence and train the model.

    Inputs are the training sequence without its last element and labels are
    the same sequence without its first element, so each label is the
    element that follows its input.
    """
    print("Main starts")
    train_sequence, test_data, notes_dict = get_data()

    # Shift by one: predict element t+1 from element t.
    train_inputs = np.array(train_sequence[:-1])
    train_labels = np.array(train_sequence[1:])

    # Hyperparameters, in the positional order Model receives them after the
    # vocabulary size: window_size, embedding_size, batch_size, rnn_size,
    # hidden_layer.
    hyperparameters = (20, 256, 128, 512, 512)
    model = Model(len(notes_dict), *hyperparameters)

    # Run training; the returned loss history and layer weights are unpacked
    # but not used further in the visible code.
    (loss_history, embedding_w, lstm_w, dense1_w, dense2_w) = train(
        model, train_inputs, train_labels)
예제 #10
0
def main():
    """Fit a LightGBM ranker with early stopping and save its booster to disk.

    Hyperparameters, the data path and the output file name are read from
    the dict returned by ``get_train_args``.
    """
    args = get_train_args()

    (X_train, X_test, X_val, y_train, y_test, y_val, group_vali,
     group_train) = get_data(args["data_path"])

    # Long training run with the already-selected hyperparameters; early
    # stopping on the validation set bounds the actual number of rounds.
    ranker_params = {
        "n_estimators": 10000,
        "num_leaves": args["num_leaves"],
        "learning_rate": args["learning_rate"],
        "reg_lambda": args["reg_lambda"],
    }
    gbm = lgb.LGBMRanker(**ranker_params)

    gbm.fit(
        X_train,
        y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_vali],
        early_stopping_rounds=150,
    )

    gbm.booster_.save_model(
        args["output_file_name"],
        num_iteration=gbm.best_iteration,
    )
예제 #11
0
def main():
    """Train a windowed language model, print its test perplexity, and generate a sentence.

    Each token stream is cut into non-overlapping windows of
    ``model.window_size`` tokens. The label window is the input window
    shifted one token to the right, so every position predicts the next word.
    """
    train_token, test_token, vocab_dict = get_data('data/train.txt', 'data/test.txt')

    vocab_size = 7342
    model = Model(vocab_size)
    window = model.window_size

    def windowed_pairs(tokens):
        # One token is reserved for the final label, hence the "- 1".
        count = (tokens.shape[0] - 1) // window
        inputs = np.zeros((count, window), dtype=np.int32)
        labels = np.zeros((count, window), dtype=np.int32)
        for row in range(count):
            begin = row * window
            inputs[row] = tokens[begin:begin + window]
            labels[row] = tokens[begin + 1:begin + window + 1]
        return inputs, labels

    train_inputs, train_labels = windowed_pairs(train_token)
    test_inputs, test_labels = windowed_pairs(test_token)

    # Training step.
    train(model, train_inputs, train_labels)

    # Testing step.
    Perplexity = test(model, test_inputs, test_labels)
    print('Perplexity = {}'.format(Perplexity))

    generate_sentence('I', 10, vocab_dict, model)
예제 #12
0
def train(model, img_dir, train_img_names, img_to_encodings, manager):
    """Run one pass over the training images in shuffled batches.

    ``train_img_names`` is shuffled in place. Image contents are loaded one
    batch at a time through ``get_data``; a trailing partial batch is
    skipped. Metrics are printed every ``args.log_every`` steps and a
    checkpoint is saved through ``manager`` every ``args.save_every`` steps.

    :param model: model exposing ``batch_size``, ``dice_loss``, ``accuracy``,
        ``optimizer`` and ``trainable_variables``
    :param img_dir: directory forwarded to ``get_data``
    :param train_img_names: list of image names; reordered in place
    :param img_to_encodings: mapping forwarded to ``get_data``
    :param manager: checkpoint manager whose ``save`` is called periodically
    """
    batch_size = model.batch_size
    steps = int(len(train_img_names) / batch_size)

    random.shuffle(train_img_names)

    for step in range(steps):
        # Only the names are kept around; the (large) pixel data is loaded
        # for the current batch alone.
        batch_names = train_img_names[step * batch_size:(step + 1) * batch_size]
        inputs, labels = get_data(img_dir, batch_names, img_to_encodings)

        with tf.GradientTape() as tape:
            probs = model(inputs)
            loss = model.dice_loss(probs, labels)

            variables = model.trainable_variables
            gradients = tape.gradient(loss, variables)
            model.optimizer.apply_gradients(
                zip(gradients, model.trainable_variables))

        if step % args.log_every == 0:
            train_acc, train_iou = model.accuracy(probs, labels)
            r = tf.reduce_mean(recall(probs, labels)).numpy()
            p = tf.reduce_mean(precision(probs, labels)).numpy()
            print(
                "========>Step %2d, accuracy = %3.4f, loss = %3.4f, IoU = %3.4f, recall = %3.4f, precision = %3.4f"
                % (step, train_acc, loss, train_iou, r, p))

        if step % args.save_every == 0:
            manager.save()
예제 #13
0
 def test_preprocess_1000(self):
     """Sample 1000 images with a fixed seed and check the labelled/unlabelled counts."""
     sample_dir = os.path.join('data', 'unit_test')
     sample_images.sample_images(sample_dir, seed=123, sample_num=1000)

     labelled, unlabelled = preprocess.get_data(
         'data/unit_test/hirise-map-proj-v3_2')

     # Expected split for seed 123: 198 labelled and 802 unlabelled items.
     labelled_count = len(list(labelled))
     self.assertTrue(198 == labelled_count)
     unlabelled_count = len(list(unlabelled))
     self.assertTrue(802 == unlabelled_count)
예제 #14
0
def initialize():
    """Load the data and create the starting parameters.

    :return: tuple ``(X, T, W, b)`` -- the inputs, the targets cast to float,
        a random normal weight column with one row per column of ``X``, and
        a bias of 0
    """
    print("initialize")
    features, targets = get_data()
    targets = targets.astype(float)
    weights = np.random.randn(features.shape[1], 1)
    bias = 0
    return features, targets, weights, bias
예제 #15
0
def main():
    """Train the language model once, checkpoint it, and print train/test perplexity.

    Inputs are the token ids without the last element and labels are the
    ids without the first, so every label is the token following its input.
    """
    train_ids, test_ids, vocab = get_data("../../data/train.txt",
                                          "../../data/test.txt")
    model = Model(len(vocab))
    print("model initialized")

    # Shift-by-one input/label pairs.
    train_inputs, train_labels = train_ids[:-1], train_ids[1:]
    test_inputs, test_labels = test_ids[:-1], test_ids[1:]

    # Single training epoch.
    train(model, train_inputs, train_labels)
    print("training complete")

    # Persist the optimizer and every layer under ./ckpts.
    tracked_objects = {
        "optimizer": model.optimizer,
        "embedding": model.embedding,
        "lstm": model.lstm,
        "dense_1": model.dense_1,
        "dense_2": model.dense_2,
    }
    checkpoint = tf.train.Checkpoint(**tracked_objects)
    manager = tf.train.CheckpointManager(checkpoint, './ckpts', max_to_keep=10)
    manager.save()

    train_perplexity = test(model, train_inputs, train_labels)
    print("Train perplexity: " + str(train_perplexity))
    test_perplexity = test(model, test_inputs, test_labels)
    print("Test perplexity: " + str(test_perplexity))
예제 #16
0
def main():
    """Train and evaluate a trigram language model and print its test perplexity.

    Each corpus of ``n`` token ids is turned into ``n - 2`` examples: the
    input row holds two consecutive tokens and the label is the token that
    follows them.
    """
    train_corpus, test_corpus, dictionary = get_data('data/train.txt',
                                                     'data/test.txt')

    def to_trigrams(corpus):
        # ``np.int`` was only an alias for the builtin ``int`` and has been
        # removed from NumPy (1.24+); ``int`` selects the same dtype.
        inputs = np.zeros((len(corpus) - 2, 2), dtype=int)
        inputs[:, 0] = corpus[0:len(corpus) - 2]
        inputs[:, 1] = corpus[1:len(corpus) - 1]
        labels = corpus[2:len(corpus)]
        return inputs, labels

    # Separate train and test data into inputs and labels.
    train_input, train_labels = to_trigrams(train_corpus)
    test_input, test_labels = to_trigrams(test_corpus)

    # Initialize the model with the vocabulary size.
    model = Model(len(dictionary))

    # Training step.
    train(model, train_input, train_labels)

    # Testing step.
    perp = test(model, test_input, test_labels)

    print('perplexity', perp)
예제 #17
0
def main():
    """Load a saved LightGBM model and print its nDCG score on the test split.

    The data path and model path are read from ``parse.get_test_args()``.
    """
    args = parse.get_test_args()

    # Only the test features and relevance labels are needed here.
    (X_train, X_test, X_val, y_train, y_test, y_val, group_vali,
     group_train) = get_data(args["data_path"])

    gbm = lgb.Booster(model_file=args["model_path"])

    # Model scores for the test rows, in the same row order as y_test.
    test_pred = gbm.predict(X_test)

    # ndcg_score expects the true relevance of each item and the predicted
    # score of the same item; it does the ranking and the ideal-order
    # normalization itself. Passing two pre-sorted relevance lists (as was
    # done before) scores the inverse of the model's ranking instead.
    print(
        "nDCG score: ",
        ndcg_score([y_test.to_numpy()], [test_pred]),
    )
예제 #18
0
 def __init__(self, data_num=None):
     """Create an empty sequential model and load the raw data for ``self.data_name``.

     :param data_num: forwarded unchanged to ``pp.get_data``; defaults to None
     """
     self.model = tk.Sequential()
     self.data = dict(train={}, test={})
     self.tokens = self.history = None
     # ``data_name`` is read from the instance but not assigned here.
     csv_name = self.data_name + '.csv'
     self.raw_data = pp.get_data(csv_name, lib.data[self.data_name], data_num)
예제 #19
0
def bad_visualization():
    """Print the shape of one daily series and plot the first four on one figure.

    Uses index 1 along the second axis of the third array returned by
    ``get_data``.
    """
    _, _, daily_data = get_data()
    print(daily_data[0, 1, :].shape)
    for series_index in range(4):
        plt.plot(daily_data[series_index, 1, :])
    plt.show()
예제 #20
0
def main():
    """Preprocess the track metadata, train for two epochs, test, and plot the losses."""
    print("Preprocessing...")
    train_inputs, train_labels, test_inputs, test_labels = preprocess.get_data(
        "../data/fma_metadata/tracks.csv")

    # Character-level encodings; only the training dictionaries are kept.
    char2id, char_inputs = preprocess.make_char_dict(train_inputs)
    _, test_char_inputs = preprocess.make_char_dict(test_inputs)

    # NOTE(review): these calls repeat the two above with identical
    # arguments -- confirm whether a different field was intended.
    actor_char2id, actor_char_inputs = preprocess.make_char_dict(train_inputs)
    _, actor_test_char_inputs = preprocess.make_char_dict(test_inputs)

    model = Model(len(char2id), len(actor_char2id))

    numerical_train, numerical_test = preprocess.make_numerical_lists(
        train_inputs, test_inputs)
    feature_train, feature_test = preprocess.make_feature_lists(
        train_inputs, test_inputs)

    print("Training...")
    losses = []
    for _ in range(2):
        epoch_losses = train(model, numerical_train, feature_train,
                             char_inputs, actor_char_inputs, train_labels)
        losses.extend(epoch_losses)

    print("Testing...")
    test(model, numerical_test, feature_test, test_char_inputs,
         actor_test_char_inputs, test_labels)

    visualize_loss(losses)
예제 #21
0
def main():
    """Send 20 randomly sampled rows to a TensorFlow Serving model over gRPC.

    Each sampled row becomes its own ``PredictRequest`` with a ``[1, 29]``
    input tensor, and each response is printed.
    """
    # Host and port come from the command line.
    host, port = parse_args()

    # Open an insecure gRPC channel to the server and build a stub on it.
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    # Data file to draw the requests from.
    file = '../data/raw/creditcard_dataset_fraud.csv'
    data = get_data(data_type="test", data_dir=file)

    for _, row in data.sample(20).iterrows():
        # A batch holding just this one row.
        input_data = [list(row)]

        request = predict_pb2.PredictRequest()

        # Name of the model running on the tensorflow_model_server.
        request.model_spec.name = 'anamoly_detection'
        # Prediction signature defined in the SavedModel on the server.
        request.model_spec.signature_name = (
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)

        request.inputs['inputs'].CopyFrom(
            make_tensor_proto(input_data, shape=[1, 29]))

        # Blocking call with a 20 second timeout.
        result = stub.Predict(request, 20.0)
        print(result)
예제 #22
0
def main():
    """Train and test a trigram language model, then generate a sample sentence.

    Every example consists of two consecutive tokens as input and the token
    that follows them as the label, so a corpus of ``n`` tokens yields
    exactly ``n - 2`` examples.
    """
    # Pre-process and vectorize the data.
    train_token, test_token, vocab_dict = get_data('data/train.txt',
                                                   'data/test.txt')

    def trigram_examples(tokens):
        # Size the arrays to the number of real examples. Allocating one row
        # per token (as before) left the last two rows as all-zero
        # input/label pairs that were trained and evaluated on as if real.
        count = max(tokens.shape[0] - 2, 0)
        inputs = np.zeros((count, 2), dtype=np.int32)
        labels = np.zeros((count, 1), dtype=np.int32)
        inputs[:, 0] = tokens[:count]
        inputs[:, 1] = tokens[1:count + 1]
        labels[:, 0] = tokens[2:count + 2]
        return inputs, labels

    # Separate train and test data into inputs and labels.
    train_inputs, train_labels = trigram_examples(train_token)
    test_inputs, test_labels = trigram_examples(test_token)

    # Initialize the model.
    vocab_size = 7342
    model = Model(vocab_size)

    # Training step.
    train(model, train_inputs, train_labels)

    # Testing step; print out the perplexity.
    perplexity = test(model, test_inputs, test_labels)
    print('Perplexity = {}'.format(perplexity))

    word1 = 'I'
    word2 = 'like'
    length = 10
    generate_sentence(word1, word2, length, vocab_dict, model)
def main():
    """Train a ResNet16 for ``model.epochs`` epochs, testing after each one.

    Afterwards the recorded losses and per-epoch test accuracies are plotted,
    and the first ten test images are visualized with the model's outputs.
    """
    model = ResNet16()
    train_images, train_labels, test_images, test_labels = get_data()
    accuracy_list = []

    # Select the non-interactive Agg backend for matplotlib.
    matplotlib.use('Agg')

    class_names = [
        "airplane", "automobile", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck",
    ]
    # Keys are the class indices as strings: "0" through "9".
    label_names = {str(index): name for index, name in enumerate(class_names)}

    for epoch in range(model.epochs):
        print(epoch)
        train(model, train_images, train_labels)
        test_accuracy = test(model, test_images, test_labels)
        accuracy_list.append(test_accuracy)
        print(test_accuracy)

    visualize_loss_accuracy(model.loss_list, accuracy_list)

    first_ten = slice(0, 10)
    visualize_results(test_images[first_ten], model(test_images[first_ten]),
                      test_labels[first_ten], label_names)
예제 #24
0
def main():
    """Train the model on the CSV data, print accuracy and ROC score, and plot embeddings."""
    (train_inputs, train_labels, test_inputs, test_labels,
     vocab_dict) = get_data('data_set/train.csv', 'data_set/test.csv',
                            'data_set/test_labels.csv')

    # The model is sized from the vocabulary and the second input dimension.
    model = Model(len(vocab_dict), train_inputs.shape[1])
    train(model, train_inputs, train_labels)

    accuracy, roc_score = test(model, test_inputs, test_labels)
    print("Accuracy: ", accuracy)
    print("roc_score: ", roc_score)

    visualize_embeddings(model, vocab_dict)
예제 #25
0
def train():
    """Train the classification network, logging progress and checkpointing periodically.

    If a checkpoint exists at ``config.model_load_path``, the model state,
    optimizer state and loss criterion are restored from it; otherwise
    training starts with a fresh ``CrossEntropyLoss``. Every 200 batches the
    running loss and a running training accuracy are logged and a checkpoint
    is written to ``config.model_save_path``.
    """
    logging.info("Start training.")
    # Data loader yielding (inputs, labels) batches.
    trainloader = get_data(config.data_path)
    # Network and optimizer.
    net = cls_net()
    optimizer = optim.Adam(net.parameters(), lr=config.lr)

    # Restore from a checkpoint when one exists.
    if not os.path.exists(config.model_load_path):
        criterion = nn.CrossEntropyLoss()
    else:
        # Load from the same path whose existence was just checked; the
        # previous code referenced an undefined name here.
        checkpoint = torch.load(config.model_load_path)
        net.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        criterion = checkpoint['loss']
        # NOTE: checkpoint['epoch'] is stored but epoch numbering still
        # starts at 0, as it did before.

    # Accuracy counters accumulate over the sampled batches of all epochs.
    total = correct = 0
    for epoch in range(config.epoch):
        timestart = time.time()

        running_loss = 0.0
        for i, data in enumerate(trainloader):
            inputs, labels = data
            optimizer.zero_grad()
            # Forward, backward and optimizer step.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 200 == 199:
                logging.info('[%d, %5d] loss: %.3f',
                             epoch + 1, i + 1, running_loss / 200)
                running_loss = 0.0

                # Training accuracy, sampled from the current batch only.
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                logging.info(
                    'Accuracy of the network on the %d train images: %.3f %%',
                    total, 100.0 * correct / total)

                # Save the model.
                torch.save({'epoch': epoch,
                            'model_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': criterion
                            }, config.model_save_path)

        logging.info('epoch %d cost %3f sec', epoch, time.time() - timestart)

    logging.info('Finished Training')
예제 #26
0
 def __init__(self, fold=5, rotate=False, crop=False):
     """Load the dataset and prepare a shuffled index array for k-fold splitting.

     :param fold: number of folds; also used to derive ``blockSize``
     :param rotate: forwarded to ``get_data`` as ``rotate_extend``
     :param crop: forwarded to ``get_data`` as ``crop``
     """
     self.fold = fold
     self.X, self.Y = get_data(reverse=True, crop=crop, rotate_extend=rotate)
     self.sample_num = self.X.shape[0]
     # Samples per fold, rounded to the nearest integer.
     self.blockSize = round(self.sample_num / self.fold)
     self.I = np.arange(self.sample_num)
     # The index array is shuffled in place three times in a row.
     for _ in range(3):
         random.shuffle(self.I)
def main():
    """Create (or load), train, optionally save, and test the RNN model.

    return: nothing
    """
    should_save = True  # save the model after training
    resume = False  # start from a previously saved model
    use_saved_data = True  # read preprocessed arrays from disk
    model_path = "rnn_model"

    # Full ticker list, kept for reference:
    # tickers = ["AAPL", "AMZN", "GOOGL", "MSFT", "CVS", "DIS", "FDX", "JPM", "REGN", "WMT", "JNJ", "HON"]
    tickers = ["AAPL", "AMZN"]
    # Data of shape (num_stocks, num_days, datum_size), per the original note.
    train_data, test_data, _ = get_data(tickers)
    past_num = 50  # days of history the model sees per prediction

    model = load_model(model_path) if resume else rnn_create()

    print("========== beginning to pre-process data ==========")
    train_x_path = "data/new_rnn_train_x.npy"
    train_y_path = "data/new_rnn_train_y.npy"
    test_x_path = "data/new_rnn_test_x.npy"
    test_y_path = "data/new_rnn_test_y.npy"
    if not use_saved_data:
        # The array paths are handed to process_data along with the data.
        train_x, train_y = process_data(train_data, past_num, train_x_path,
                                        train_y_path)
        test_x, test_y = process_data(test_data, past_num, test_x_path,
                                      test_y_path)
    else:
        print("loading saved data")
        train_x = np.load(train_x_path)
        train_y = np.load(train_y_path)
        test_x = np.load(test_x_path)
        test_y = np.load(test_y_path)
    print("========== finished pre-process data ==========")

    num_epoch = 300
    print("========== begin training ==========")
    train_history = rnn_train(model, train_x, train_y, num_epoch)
    print("========== finish training ==========")
    if should_save:
        save_model(model, model_path)
        print("====== saved model =======")

    print("========== begin testing ==========")
    result = rnn_test(model, test_x, test_y)
    print("printing results")
    print(model.metrics_names)
    print(result)
    print("========== finished testing ==========")
예제 #28
0
def main_predict():
    """Train a Keras network on the sentiment data and write test predictions to CSV.

    Train and test texts are vectorized together; the first ``train_size``
    rows of the result are used for fitting and the rest for prediction.
    Predictions are written to ``keras_adam.csv`` under a single ``Id,y``
    header.
    """
    # Forwarded to get_data as drop_lst.
    drop_indices = [
        1768, 1555, 894, 674, 65, 54, 137, 327, 354, 492, 553, 634, 844, 928,
        1054, 1118, 1228, 1449, 1474, 1483, 1504, 1529, 1559, 1733, 1881, 1917
    ]
    data = get_data("products_sentiment_train.tsv",
                    "products_sentiment_test_copy.tsv",
                    balance=True,
                    drop_lst=drop_indices)
    train_text = np.array(data[0]["text"])
    test_text = np.array(data[1]["text"])
    train_size = train_text.shape[0]

    # Vectorize train and test text in one go.
    X_full = vectorizer(np.append(train_text, test_text),
                        tokenizer=tokenize,
                        ngram_range=(1, 4),
                        max_df=0.85,
                        min_df=1,
                        max_features=None)
    Y_full = np.array(data[0]["label"])

    network_class = KerasNeuralNetwork(
        hidden_layer_sizes=(400, 200, 10),
        nonlin_functions=("tanh", "tanh", "tanh"),
        dropout_coef=(0.7, 0.7, 0.7))

    X_train, Y_train, Y_flat_train = network_class.text_process_data(
        X_full[:train_size, :], Y=Y_full)

    adam = keras.optimizers.Adam(lr=0.0001,
                                 beta_1=0.9,
                                 beta_2=0.99,
                                 epsilon=1e-08,
                                 decay=0.0)
    neural_network = network_class.init_model(D=X_train.shape[1],
                                              K=Y_train.shape[1],
                                              loss='categorical_crossentropy',
                                              optimizer=adam,
                                              metrics=['accuracy'])

    network_class.fit_network(X_train,
                              Y_train,
                              neural_network,
                              epochs=30,
                              batch=130,
                              show=False)

    X_predict = network_class.text_process_data(X_full[train_size:, :])
    result = network_class.make_prediction(X_predict,
                                           neural_network,
                                           batch=130)

    with open("keras_adam.csv", 'w') as f_out:
        # Each row is "<id>,<prediction>" for ids 0..499.
        row_ids = pd.Series([str(row_id) for row_id in range(0, 500)])
        rows = row_ids.str.cat([str(pred) for pred in result], sep=',')
        submission = pd.DataFrame(rows, columns=["Id,y"])
        f_out.write(submission.to_csv(sep=" ", index=False))
예제 #29
0
def main():
    """Train the translator, periodically reporting loss, showing samples and checkpointing.

    Command-line options: ``--epochs`` (epochs to run), ``--checkpoint_folder``
    (where checkpoints are read from and written to) and ``--print_every``
    (batches between reports). Training continues from the epoch returned by
    ``load_model``.
    """
    parser = ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10000, help="number of epochs to run the model")
    parser.add_argument("--checkpoint_folder", type=str, default="./checkpoints",
                        help="folder to save the model checkpoints to")
    parser.add_argument("--print_every", type=int, default=1000, help="how many batches between each report")
    opt = parser.parse_args()

    iter_train, iter_validate, iter_test = get_data(BATCH_SIZE)
    input_dim = len(DE_FIELD.vocab)
    output_dim = len(EN_FIELD.vocab)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sos_token = DE_FIELD.vocab.stoi["<SOS>"]
    translator = Translator(ENCODER_HIDDEN, DECODER_HIDDEN, input_dim, output_dim, EMBEDDING_DIM_ENCODER,
                            EMBEDDING_DIM_DECODER, sos_token=sos_token, device=device).to(device)
    # Padding positions do not contribute to the loss.
    cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=EN_FIELD.vocab.stoi["<PAD>"])
    optimizer = torch.optim.Adam(translator.parameters(), lr=0.001)
    epoch = load_model(translator, opt.checkpoint_folder)
    total_iter = 0
    total_loss = 0
    print("running...")
    for epoch_ in range(epoch, epoch + opt.epochs):
        for i, batch in enumerate(iter_train):
            # Cap each epoch at NUMBER_OF_EXAMPLES examples.
            if i * BATCH_SIZE >= NUMBER_OF_EXAMPLES:
                break
            try:
                optimizer.zero_grad()
                source = batch.src.to(device)
                target = batch.trg.to(device)[1:, :]  # without <SOS> token
                output = translator(source, target.shape[0])
                # Bind ``loss`` before the guarded section so the ``del`` in
                # ``finally`` cannot raise UnboundLocalError (and hide the
                # real error) when the loss computation itself fails.
                loss = None
                try:
                    loss = cross_entropy(output.reshape(-1, output_dim), target.view(-1))
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                    total_iter += 1

                    if total_iter % opt.print_every == 0:
                        print(f"iter #{total_iter} loss: {total_loss / opt.print_every}")
                        total_loss = 0
                        # Cycle through the first 50 validation batches, one
                        # per report. total_iter is a multiple of print_every
                        # here, so the integer division is exact.
                        wanted = (total_iter // opt.print_every) % 50
                        test_batch_ = None
                        for j, batch_validate in enumerate(iter_validate):
                            test_batch_ = batch_validate
                            if j == wanted:
                                break
                        test_batch(DE_FIELD, EN_FIELD, translator, test_batch_, device=device, max_examples=5)
                        del test_batch_
                        save_model(translator, opt.checkpoint_folder, epoch_)
                finally:
                    # Drop tensor references as soon as the step is done.
                    del source, target, output, loss
            except Exception as e:
                # Deliberate best-effort: report the error, pause, and carry
                # on with the next batch.
                print(e)
                sleep(120)
예제 #30
0
def unit_test_visualization():
    """Smoke-test the plotting helpers on real data and a small hand-made portfolio."""
    train_data, _test_data, _ = get_data()
    print(f'shape: {train_data.shape}')
    print(train_data[:, :10, :])

    # Line graphs of feature 0 for the first two entries along axis 0.
    for stock_index in (0, 1):
        visualize_linegraph(train_data[stock_index, :, 0])

    # One row of portfolio values per ticker.
    tickers = ['AAPL', 'AMZN', 'MSFT', 'INTC', 'REGN', 'CASH']
    portfolio = [
        [1, 2, 3, 2, 1],
        [2, 3, 4, 3, 1],
        [3, 2, 1, 4, 2],
        [5, 9, 2, 1, 8],
        [1, 3, 2, 2, 3],
        [4, 3, 1, 1, 4],
    ]
    visualize_portfolio(portfolio, tickers)
예제 #31
0
def main():
	"""Train a linear SVM with gradient descent and print its accuracy.

	Loads the dataset through ``preprocess.get_data``, keeps the first three
	feature columns, builds a TensorFlow 1.x graph for a linear model
	``x . wgt + bias`` and minimizes ``0.5 * sum(wgt^2)`` plus ``svmC`` times
	the summed hinge loss. The fraction of samples whose predicted sign equals
	the label is printed once before and once after training.

	Relies on ``svmC``, ``BATCH_SIZE``, ``REAL_SIZE`` and ``EPOCHS`` being
	defined outside this function. Written so that it runs on both Python 2
	and Python 3 (``print(...)`` with one argument, ``range`` and ``//``).
	"""
	# get dataset
	data, label, ip = preprocess.get_data("../data/")
	data = data[:, :3]
	datasize, numfeat = data.shape

	# set variables
	x = tf.placeholder("float", shape=[None, numfeat])
	y = tf.placeholder("float", shape=[None, 1])
	wgt  = tf.Variable(tf.zeros([numfeat, 1]))
	bias = tf.Variable(tf.zeros([1]))
	yraw = tf.matmul(x, wgt) + bias

	# initialize variables
	init = tf.initialize_all_variables()

	# set optimization function
	reg_loss = 0.5 * tf.reduce_sum(tf.square(wgt))
	# The zero tensor is sized from REAL_SIZE (fed per batch below) so that it
	# matches a final batch that is shorter than BATCH_SIZE.
	hng_loss = svmC * tf.reduce_sum(tf.maximum(tf.zeros([tf.minimum(BATCH_SIZE, REAL_SIZE), 1]), 1 - y*yraw))
	svm_loss = reg_loss + hng_loss
	min_loss = tf.train.GradientDescentOptimizer(0.01).minimize(svm_loss)

	# set prediction function
	# NOTE(review): comparing y with sign(yraw) presumably expects labels in {-1, +1} - confirm.
	pred = tf.sign(yraw)
	corr = tf.equal(y, pred)
	pred_type = tf.reduce_mean(tf.cast(corr, "float"))

	# run everything
	with tf.Session() as sess:
		sess.run(init)

		# before training
		print(sess.run(pred_type, feed_dict={x:data, y:label}))

		# training
		# Floor division keeps the step count an integer on Python 3 as well.
		for step in range(EPOCHS * datasize // BATCH_SIZE):
			offset = (step * BATCH_SIZE) % datasize
			batch_data  = data[offset:offset+BATCH_SIZE]
			batch_label = label[offset:offset+BATCH_SIZE]
			# The slice may be shorter than BATCH_SIZE at the end of the data.
			realsize = batch_label.shape[0]
			sess.run(min_loss, feed_dict={x:batch_data, y:batch_label, REAL_SIZE:realsize})

		# after training
		print(sess.run(pred_type, feed_dict={x:data, y:label}))
예제 #32
0
파일: prepare.py 프로젝트: pdsujnow/BioSum
 def prep(self,
          docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
          json_data_path='../data/v1-2a.json'):
     """Build a per-topic training set from the annotated citation data.

     For every topic id and every citation under it, the reference offsets
     of all annotations are collected and merged with ``union``. The
     citation text and reference article name of the last annotation
     iterated are used for the query and the document type. The result of
     ``self._prep_data`` is appended as one entry per citation.

     Args:
         docs_path (str): forwarded to ``get_data``.
         json_data_path (str): forwarded to ``get_data``.

     Returns:
         dict: topic id -> list with one ``_prep_data`` result per citation.
     """
     data = get_data(docs_path, json_data_path)
     train_set = {}
     for tid in data:
         per_topic = []
         train_set[tid] = per_topic
         citations = data[tid]
         # one entry per citation number
         for cit in citations:
             collected = []
             ref_art = ''
             for ann in citations[cit].values():
                 collected.extend(ann['ref_offset'])
                 query = ann['cit_text']
                 ref_art = ann['ref_art']
             # union of the reference offsets of all annotators
             merged = union(collected)
             # drops the last four characters of the article name
             # (presumably a file extension - confirm)
             doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
             per_topic.append(
                 self._prep_data(clean(query), doc_type, merged))
     return train_set
예제 #33
0
    def prep(self,
             docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        '''
        Turn the raw annotated data into a flat list of training
            items for each topic.

        Args:
            docs_path (str), json_data_path (str): forwarded to get_data

        Returns:
            dict
                keys: topic_ids - UPPER CASE: e.g. D1410_TRAIN
                value: (list of tuples) - a list of training tuples,
                    extended with the output of _prep_data for every
                    citation; for the format see _prep_data

        '''
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            topic_items = train_set[tid] = []
            # walk over the citation numbers of this topic
            for cit in data[tid]:
                annotations = data[tid][cit].values()
                ref_art = ''
                all_offsets = []
                for ann in annotations:
                    all_offsets += ann['ref_offset']
                    # the values of the last annotation iterated win
                    query, ref_art = ann['cit_text'], ann['ref_art']
                # merge the reference offsets of all annotators
                all_offsets = union(all_offsets)
                article_stem = ref_art.lower()[:-4]
                doc_type = tid.lower() + '_' + article_stem
                topic_items.extend(
                    self._prep_data(clean(query), doc_type, all_offsets))
        return train_set
예제 #34
0
        rest = res[1:]
        for max_sequence_length, loss, acc in rest:
            print('%d\t%f\t%f' % (max_sequence_length, loss, acc), end='\t')
        print()
    print()

# Script entry point: index the word vectors, load the train/test data
# (optionally reduced via args.frac) and preprocess the training text.
if __name__ == '__main__':

    print('Indexing word vectors.')
    embeddings_index = get_embeddings_index()
    print('Collected %s word vectors.' % len(embeddings_index))
    print()

    print('Loading data')

    # NOTE(review): the meaning of the three boolean flags is not visible
    # here - check get_data's signature.
    categories, data_train, data_test = get_data(True, False, True)
    # Raw (unprocessed) inputs and their targets for both splits.
    all_unproc_X_train, all_unproc_X_test = data_train.data, data_test.data
    all_y_train, y_test = data_train.target, data_test.target

    if args.frac:
        # Reduce both splits with get_frac. The training subset gets new
        # names, while the test subset overwrites all_unproc_X_test / y_test.
        unproc_X_train, y_train = get_frac(args.frac, all_unproc_X_train, all_y_train)
        all_unproc_X_test, y_test = get_frac(args.frac, all_unproc_X_test, y_test)
    else:
        # No fraction requested: use the full training data unchanged.
        unproc_X_train = all_unproc_X_train
        y_train = all_y_train

    #texts, labels, labels_index = get_data_locally()

    print('Preprocessing training text')
    # preproc returns the processed texts plus two length values.
    texts_train, avg_length, max_length = preproc(unproc_X_train)
def main():
	"""Load the dataset and keep only its first three feature columns."""
	# fetch the dataset from the data directory
	samples, label, ip = preprocess.get_data("../data/")
	# restrict to the first three columns, then read off the dimensions
	data = samples[:, :3]
	datasize, numfeat = data.shape
예제 #36
0
import som
import numpy as np
import preprocess

from matplotlib import pyplot as plt
 
#Training inputs (the header of this script calls them RGB colors; the data
#itself is loaded through preprocess.get_data)
data, label, ip = preprocess.get_data("../data/")
# Labels used further down when annotating the plot.
color_names = ['black', 'blue', 'darkblue', 'skyblue',
     'greyblue', 'lilac', 'green', 'red', 'cyan', 'violet', 'yellow', 'white', 'darkgrey', 'mediumgrey', 'lightgrey'
]

datasize, numfeat = data.shape
 
#Train a SOM built with the arguments (30, 30, numfeat, 400)
# NOTE(review): this comment used to say "20x30", but both leading arguments
# are 30; presumably they are the grid size and 400 the iteration count -
# confirm against som.SOM.
# NOTE(review): this assignment rebinds the imported module name `som` to the
# instance, so the module is no longer reachable under that name afterwards.
som = som.SOM(30, 30, numfeat, 400)
som.train(data)
# NOTE(review): exit() stops the script here, so nothing below is executed.
exit()
 
#Get output grid
image_grid = som.get_centroids()
 
#Map colours to their closest neurons
# NOTE(review): `colors` is not defined anywhere in the visible script.
mapped = som.map_vects(colors)
 
#Plot
plt.imshow(image_grid)
plt.title('Color SOM')
for i, m in enumerate(mapped):
    plt.text(m[1], m[0], color_names[i], ha='center', va='center',