def format_dataset(formatted_dataset_dir=DEFAULT_FORMATTED_DATATSET_DIR,
                   log_file=io.StringIO()):

    dataset, labels, label_map = load_dataset()
    print("randomizing the dataset...", file=log_file)

    print("train_test_split the dataset...", file=log_file)
    train_data, test_data, train_labels, test_labels = train_test_split(dataset, labels)

    print("reformating the dataset...", file=log_file)
    train_data, train_labels = _format_dataset(train_data, train_labels, IMAGE_SIZE, len(label_map))
    test_data, test_labels = _format_dataset(test_data, test_labels, IMAGE_SIZE, len(label_map))
    print("train_data:", train_data.shape, file=log_file)
    print("train_labels:", train_labels.shape, file=log_file)
    print("test_data:", test_data.shape, file=log_file)
    print("test_labels:", test_labels.shape, file=log_file)

    print("pickling the dataset...", file=log_file)

    formatted_train_dataset_path = os.path.join(formatted_dataset_dir, 'train_dataset.pickle')
    train_dataset = DataSet(train_data, train_labels, label_map)
    with open(formatted_train_dataset_path, 'wb') as f:
        pickle.dump(train_dataset, f, protocol=2)  # protocol 2 for compatibility with Python 2.7

    formatted_test_dataset_path = os.path.join(formatted_dataset_dir, 'test_dataset.pickle')
    test_dataset = DataSet(test_data, test_labels, label_map)
    with open(formatted_test_dataset_path, 'wb') as f:
        pickle.dump(test_dataset, f, protocol=2)

    label_map_path = os.path.join(formatted_dataset_dir, 'label_map.pickle')
    with open(label_map_path, 'wb') as f2:
        pickle.dump(label_map, f2, protocol=2)

    print("dataset has saved at %s" % formatted_dataset_dir, file=log_file)
    print("load_model has finished", file=log_file)
Example #2
def Train_test(all_data, test_percentage):
    # This function will divide the test and train data sets
    # Note that the test data set is used as validation and predict data used as the pure test
    N_rows = all_data.labels.shape[0]
    N_test = int(N_rows * test_percentage)
    N_months = ceil(N_rows / 30)  # Number of periods
    step = int(N_test / N_months)  # number of test samples from each period
    Train_inx = []
    Test_inx = []
    for i in range(0, N_rows - 30, 30):
        Train_inx.extend(range(i, i + 30 - step))
        Test_inx.extend(range(i + 30 - step, i + 30))
    # The last period is shorter than a full month, so take a test chunk equal to
    # the step size and leave the rest for training
    if N_rows - (i + 30) > step:
        Train_inx.extend(range(i + 30, N_rows - step))
        Test_inx.extend(range(N_rows - step, N_rows))
    else:
        Test_inx.extend(range(i + 30, N_rows))

    Train = DataSet(all_data.features[Train_inx], all_data.labels[Train_inx],
                    all_data.date[Train_inx])
    Train.label_max = all_data.label_max
    Train.label_min = all_data.label_min
    Test = DataSet(all_data.features[Test_inx], all_data.labels[Test_inx],
                   all_data.date[Test_inx])

    return Train, Test
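
A toy walk-through of the index pattern Train_test builds, using plain lists only (assuming 70 rows and a 20% test share, so step = 4 test rows per 30-row period):

from math import ceil

N_rows, test_percentage = 70, 0.2
N_test = int(N_rows * test_percentage)      # 14
step = int(N_test / ceil(N_rows / 30))      # 4 test rows per 30-row period

train_idx, test_idx = [], []
for i in range(0, N_rows - 30, 30):
    train_idx.extend(range(i, i + 30 - step))      # first 26 rows of each full period
    test_idx.extend(range(i + 30 - step, i + 30))  # last 4 rows of each full period
if N_rows - (i + 30) > step:
    train_idx.extend(range(i + 30, N_rows - step))
    test_idx.extend(range(N_rows - step, N_rows))
else:
    test_idx.extend(range(i + 30, N_rows))

print(len(train_idx), len(test_idx))  # 58 12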
Example #3
def do_all_tests(theIndexes, searchRatio):
    dataSets = [
        DataSet('DATASETS/DATASET1.TXT'),
        DataSet('DATASETS/DATASET2.TXT'),
        DataSet('DATASETS/DATASET3.TXT')
    ]
    allStats = []
    theTester = Tester()
    theModel = ModelWrapper()

    print('[[[[ STARTING THE MOTHER OF ALL TESTS ]]]]')
    for useCNN in [False, True]:
        print('[[[ ONLY CNN LAYERS ' + str(useCNN).upper() + ' ]]]')
        for curIndex in theIndexes:
            print('[[ TESTING MODEL ' + curIndex[0] + ' WITH TEST SET ' +
                  str(curIndex[1] + 1) + ' ]]')
            theModel.load(curIndex[0])
            theTester.set_params(theModel, dataSets[curIndex[1]])
            curStats = theTester.compute_fullstats(useCNN=useCNN,
                                                   searchRatio=searchRatio)
            allStats.append(curStats)
            print('[[ MODEL TESTED ]]')
            with open('ALLSTATS_PCT' + str(int(searchRatio * 100)) + '.pkl',
                      'wb') as outFile:
                dump(allStats, outFile)
        print('[[[ FINISHED ONLY CNN LAYERS ' + str(useCNN).upper() + ' ]]]')
    print('[[[[ FINISHED THE MOTHER OF ALL TESTS ]]]]')
Example #4
def init_model(fold, train_data, train_label, val_data, val_label, test_data, test_label):
    train_source = DataSet(train_data, train_label)
    val_source = DataSet(val_data, val_label)
    test_source = DataSet(test_data, test_label)
    print('train_len:', len(train_source))
    print('test_len:', len(test_source))
    _lr = 1e-4
    print('Initialize lr as %f' % _lr)
    model_config = {
        'dout': True,
        'lr': _lr,
        'num_classes': 2,
        'num_workers': 8,
        'batch_size': 64,
        'restore_iter': 0,
        'total_iter': 5000,
        'model_name': 'MGH-dw-all-' + fold,
        'pretrain_point': None,
        'train_source': train_source,
        'val_source': val_source,
        'test_source': test_source
    }
    model_config['save_name'] = '_'.join([
        '{}'.format(model_config['model_name']),
        '{}'.format(model_config['dout']),
        '{}'.format(model_config['lr']),
        '{}'.format(model_config['batch_size']),
    ])

    os.makedirs(osp.join('model', model_config['model_name']), exist_ok=True)

    return Model(**model_config)
Example #5
    def dwi_philips(dataset):
        tag_bval = Tag(0x2001, 0x1003)
        tag_bvec = Tag(0x2001, 0x1004)
        tag_bvec_rl = Tag(0x2005, 0x10b0)
        tag_bvec_ap = Tag(0x2005, 0x10b1)
        tag_bvec_fh = Tag(0x2005, 0x10b2)

        if not all(x in dataset for x in (tag_bval, tag_bvec)):
            return None

        dwi_dataset = DataSet()

        if isinstance(dataset[tag_bval].value,
                      (list, tuple)) and dataset[tag_bval].value:
            dwi_dataset.diffusion_bvalue = FD(dataset[tag_bval].value[0])
        else:
            dwi_dataset.diffusion_bvalue = FD(dataset[tag_bval].value)

        gradient_dataset = DataSet()
        if not isinstance(dataset[tag_bvec], CS):
            gradient_dataset.diffusion_gradient_orientation = FD(
                [float(x) for x in dataset[tag_bvec].value])
        else:
            gradient_dataset.diffusion_gradient_orientation = FD([
                float(dataset[x].value)
                for x in (tag_bvec_rl, tag_bvec_ap, tag_bvec_fh)
            ])

        dwi_dataset.diffusion_gradient_direction_sequence = SQ(
            [gradient_dataset])
        dwi_dataset.diffusion_directionality = CS("DIRECTIONAL")

        return dwi_dataset
Example #6
def read_train_sets(train_path, image_size, classes, validation_size):
    data_set = DataSet()

    images, labels, img_names, class_array = load_train_data(
        train_path, image_size, classes)
    images, labels, img_names, class_array = shuffle(images, labels, img_names,
                                                     class_array)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_img_names = img_names[:validation_size]
    validation_cls = class_array[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_img_names = img_names[validation_size:]
    train_cls = class_array[validation_size:]

    data_set.train = DataSet(train_images, train_labels, train_img_names,
                             train_cls)
    data_set.valid = DataSet(validation_images, validation_labels,
                             validation_img_names, validation_cls)

    return data_set
Example #7
    def __init__(self, sess, epoch, batch_size, checkpoint_dir, log_dir, learning_rate = 0.00001, beta1=0.5):
        self.sess = sess
        self.keep_prob = 1.0
        #self.dataset_name = dataset_name
        #self.result_dir = result_dir
        self.log_dir = log_dir
        self.checkpoint_dir = checkpoint_dir
        self.epoch = epoch
        self.batch_size = batch_size
        self.beta1 = beta1
        self.label_dim = 50
        self.train_set = DataSet("../data/train_augment", self.batch_size, self.label_dim)
        self.test_set = DataSet("../data/test_augment", self.batch_size, self.label_dim)
        # parameters
        self.input_height = 227
        self.input_width = 227
        #self.output_height = 224
        #self.output_width = 224
        self.c_dim = 3

        # train
        self.init_learning_rate = learning_rate
        
        # get number of batches for a single epoch
        self.num_batches = self.train_set.total_batches
        self.test_num_batches = self.test_set.total_batches
Example #8
def train():
    # training data
    data_train, label_train = DataSet.data_from_text("./Hnd/trainyny.txt",1450)
    train = DataSet(data_train, label_train, dtype=dtypes.float32)
    data_test, label_test = DataSet.data_from_text("./Hnd/testyny.txt",145)
    test = DataSet(data_test, label_test, dtype=dtypes.float32)
    Datasetsx = collections.namedtuple('Datasetsx', ['train', 'test'])
    Data = Datasetsx(train=train, test=test)

    # training loop
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver({'s_w': W, 's_b': b})
        for i in range(50000):                                  # training phase: up to 50000 iterations
            batch_xs, batch_ys = Data.train.next_batch(50)      # train in batches of 50 rows
            sess.run(train_step, feed_dict={x: batch_xs, y_actual: batch_ys})  # run one training step
            accu = 0
            if (i % 50 == 0):                                   # evaluate every 50 iterations
                accu = sess.run(accuracy, feed_dict={ x: Data.test.images, 
                                                      y_actual: Data.test.labels})
                print ("accuracy:", accu)
            if(accu>Target_Accuracy):
                break
        saver.save(sess, "./model/softmax.ckpt")
Example #9
def choose_dataset():
    """Lets the user select a data set by keyboard input."""

    val = input(
        'Type "r" for restaurant data set, "p" for plants data set, '
        '"b" for books data set, or anything else for business data set: ')
    if val == 'r':
        size = int(input('How many examples do you want to use? '))
        return SyntheticRestaurant(size)
    elif val == 'p':
        dataset = DataSet(
            attr_names=
            'Habitat Colour TypeOfLeaf LeafWidth LeafLength Height EdibleOrPoisonous',
            name='plants',
            source=
            'http://mldata.org/repository/data/viewslug/plant-classification')
    elif val == 'b':
        dataset = DataSet(
            attr_names=
            'Genre MenBuyers WomenBuyers Price CriticismRate ? LikedByAudience',
            name='books',
            source=
            'http://mldata.org/repository/data/viewslug/book-evaluation-complete'
        )
    else:
        dataset = DataSet(
            attr_names='X1 X2 X3 X4 X5 Successful',
            name='business',
            source=
            'http://mldata.org/repository/data/viewslug/successful-business')
    return choose_size(dataset)
Example #10
def dataset_reshaped(data_sets):
  train_images=data_sets.train.x
  train_images=train_images.reshape(train_images.shape[0],28,28,1)

  train_labels=data_sets.train.labels
  n_values = np.max(train_labels) + 1
  train_labels=np.eye(n_values)[train_labels]

  validation_images=data_sets.validation.x
  validation_images=validation_images.reshape(validation_images.shape[0],28,28,1)
  validation_labels=data_sets.validation.labels
  n_values = np.max(validation_labels) + 1
  validation_labels=np.eye(n_values)[validation_labels]

  test_images=data_sets.test.x
  test_images=test_images.reshape(test_images.shape[0],28,28,1)
  test_labels=data_sets.test.labels
  n_values = np.max(test_labels) + 1
  test_labels=np.eye(n_values)[test_labels]

  train = DataSet(train_images, train_labels,size_change=True)
  validation = DataSet(validation_images, validation_labels,size_change=True)
  test = DataSet(test_images, test_labels,size_change=True)

  return base.Datasets(train=train, validation=validation, test=test)
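
The np.eye(n)[labels] indexing used above is the usual one-hot trick; a minimal standalone check, assuming integer class labels starting at 0:

import numpy as np

labels = np.array([0, 2, 1])
one_hot = np.eye(np.max(labels) + 1)[labels]  # each label indexes a row of the identity matrix
print(one_hot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]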
def read_data_sets(train_dir, seed=0):

    one_hot = False

    class DataSets(object):
        pass

    data_sets = DataSets()
    TRAIN_IMAGES = "train-images-idx3-ubyte.gz"
    TRAIN_LABELS = "train-labels-idx1-ubyte.gz"
    TEST_IMAGES = "t10k-images-idx3-ubyte.gz"
    TEST_LABELS = "t10k-labels-idx1-ubyte.gz"

    local_file = maybe_download(TRAIN_IMAGES, train_dir)
    train_images = extract_images(local_file)

    local_file = maybe_download(TRAIN_LABELS, train_dir)
    train_labels = extract_labels(local_file, one_hot=one_hot)

    local_file = maybe_download(TEST_IMAGES, train_dir)
    test_images = extract_images(local_file)

    local_file = maybe_download(TEST_LABELS, train_dir)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    print('Train', train_images.shape)
    print('Test', test_images.shape)
    data_sets.train = DataSet(train_images, train_labels, seed=seed)
    data_sets.test = DataSet(test_images, test_labels, seed=seed)

    return data_sets
Example #12
    def generate_folds(self, data_set, num_fold):
        """
		Generate smaller, non-overlapping data sets from the master data set.
		The folds should be returned in a list of tuples.  Each tuple corresponds to a fold.
		t[0] is the testing fold of the data 
		, t[1] is the inverse of the fold (training fold: all the rest of the data).
		"""
        items = data_set.get_items()
        folds = []
        if (items is not None):

            for i in range(num_fold):
                # Each fold is a tuple of datasets
                # First one is the test fold,
                # The inverse fold is used for training
                folds.append((DataSet(), DataSet()))
            current_fold = 0

            for item in items:

                folds[current_fold][0].add_item(item,
                                                data_set.get_features(item),
                                                data_set.get_label(item))
                for j in range(num_fold):
                    if j != current_fold:
                        folds[j][1].add_item(item, data_set.get_features(item),
                                             data_set.get_label(item))

                current_fold += 1
                if (current_fold == num_fold):
                    current_fold = 0
        return folds
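
The round-robin assignment described in the docstring, illustrated with plain lists instead of DataSet objects (round_robin_folds is an illustrative helper, not part of the class above):

def round_robin_folds(items, num_fold):
    # Item i goes to the test half of fold i % num_fold and to the
    # training half of every other fold, mirroring generate_folds.
    folds = [([], []) for _ in range(num_fold)]
    for i, item in enumerate(items):
        fold = i % num_fold
        folds[fold][0].append(item)
        for j in range(num_fold):
            if j != fold:
                folds[j][1].append(item)
    return folds

print(round_robin_folds(list(range(6)), 3))
# [([0, 3], [1, 2, 4, 5]), ([1, 4], [0, 2, 3, 5]), ([2, 5], [0, 1, 3, 4])]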
Example #13
def main(args):

    # _DATASET_SIZE could also be exposed as a command-line argument
    steps_per_epoch = _DATASET_SIZE // args.batch_size  # integer number of steps per epoch

    mfcc_data = DataSet(args.mfcc_dir)
    label_data = DataSet(args.label_dir)

    _, coefficient_vector_size, num_of_window = mfcc_data.shape()
    _, degree_of_latent_factor = label_data.shape()

    use_channel = False
    if args.model == 'conv1d':
        model = conv1d(num_of_window, coefficient_vector_size,
                       degree_of_latent_factor)
    elif args.model == 'conv2d':
        use_channel = True
        model = conv2d(num_of_window, coefficient_vector_size,
                       degree_of_latent_factor)
    elif args.model == 'conv1d_lstm':
        model = conv_lstm(num_of_window, coefficient_vector_size,
                          degree_of_latent_factor)
    else:
        model = feed_forward(num_of_window, coefficient_vector_size,
                             degree_of_latent_factor)

    trained_model, history = train(model, mfcc_data, label_data, use_channel,
                                   args.test_ratio, args.batch_size,
                                   steps_per_epoch, args.epochs)
    export(args.result, trained_model, history)
Example #14
def main():
    algorithms = [
        factory.get_algorithm("id3"),
        factory.get_algorithm("knn"),
        factory.get_algorithm("bayes")
    ]

    training = DataSet()
    target = training.load_from_file("train.txt")
    validation = DataSet()
    validation.load_from_file("test.txt")
    output_file = "output.txt"

    # run the algorithms
    acc = []
    for i in range(len(algorithms)):
        accuracy = tests.validate(algorithms[i], training, validation, target)
        # print specifically the tree algorithm
        if i == 0:
            algorithms[i].print_tree(output_file)
        acc.append(str(accuracy))

    # write the accuracy into the files
    accuracy_string = '\n' + '\t'.join(acc)
    with open(output_file, 'a') as acc_file:
        acc_file.write(accuracy_string)
Example #15
def create_pointer_examples():
    results = []
    result_names = []
    # pure BPEmb
    vs = 100000
    d = 200
    bp_man = BPEmbeddings(bp_vocab_size=vs, dim=d, case_sensitive=False)
    ds = DataSet("blah")
    ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
    bp_man.build_vocabulary([ds])
    manager = PointerManager(bp_man,
                             "basic",
                             learning_rate=START_LR,
                             lr_factor=LR_DECAY,
                             lr_patience=LR_PATIENCE,
                             cuda_device=CUDA_DEVICE)
    manager.load_model("pointer/models/19_05_11b/bpemb_{}_{}.pt".format(vs, d))
    results.append(test_example(manager))
    result_names.append("bpemb_{}_{}".format(vs, d))

    # pure glove
    for d in [50, 300]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(d)
        g_man = GloveEmbeddings(path=path, dim=d)
        manager = PointerManager(g_man,
                                 "basic",
                                 learning_rate=START_LR,
                                 lr_factor=LR_DECAY,
                                 lr_patience=LR_PATIENCE,
                                 cuda_device=CUDA_DEVICE)
        manager.load_model("pointer/models/19_05_11b/glove_{}.pt".format(d))
        results.append(test_example(manager))
        result_names.append("glove_{}".format(d))

    # glove + bpemb
    for g_d, b_d in [(200, 50), (300, 25)]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(g_d)
        g_man = GloveEmbeddings(path=path, dim=g_d)
        b_man = BPEmbeddings(dim=b_d, bp_vocab_size=100000)
        c_man = CombinedEmbeddings([g_man, b_man])
        ds = DataSet("blah")
        ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
        c_man.build_vocabulary([ds])

        manager = PointerManager(c_man,  # use the combined GloVe+BPEmb embeddings built above
                                 "basic",
                                 learning_rate=START_LR,
                                 lr_factor=LR_DECAY,
                                 lr_patience=LR_PATIENCE,
                                 cuda_device=CUDA_DEVICE)
        manager.load_model(
            "pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(g_d, b_d))
        results.append(test_example(manager))
        result_names.append("glove_d{}_bp_d{}_vs100000".format(g_d, b_d))

    write_results("results/19_05_11b/pointer_examples.txt",
                  results=results,
                  names=result_names)
Example #16
    def __init__(self,
                 sess,
                 epoch,
                 batch_size,
                 dataset_name,
                 checkpoint_dir,
                 result_dir,
                 log_dir,
                 learning_rate=0.00001,
                 beta1=0.5):
        self.sess = sess
        self.dataset_name = dataset_name
        self.result_dir = result_dir
        self.log_dir = log_dir
        self.epoch = epoch
        self.batch_size = batch_size
        self.beta1 = beta1
        if dataset_name == 'BLSD':
            self.label_dim = 8
            self.train_set = DataSet("../dataset/BLSD/img", self.batch_size,
                                     self.label_dim)
            self.log_dir = log_dir + "/BLSD"
            self.checkpoint_dir = checkpoint_dir + "/BLSD"
            self.predict_set = DataSet("../predictset/BLSD", 1, self.label_dim)
            self.label_name = [
                "amusement", "anger", "awe", "contentment", "disgust",
                "excitement", "fear", "sadness"
            ]
            #self.pred_set = DataSet("../BLSD_predset/img", self.batch_size)
        elif dataset_name == 'kaggle':
            self.label_dim = 7
            self.train_set = DataSet("../dataset/kaggle/training",
                                     self.batch_size, self.label_dim)
            self.test_set = DataSet("../dataset/kaggle/test", 1,
                                    self.label_dim)
            self.log_dir = log_dir + "/kaggle"
            self.checkpoint_dir = checkpoint_dir + "/kaggle"
            self.predict_set = DataSet("../predictset/kaggle", 1,
                                       self.label_dim)
            self.label_name = [
                "anger", "disgust", "fear", "happy", "sad", "surprise",
                "neutral"
            ]

        # parameters
        self.input_height = 224
        self.input_width = 224
        self.output_height = 224
        self.output_width = 224
        self.c_dim = 3

        # train
        self.learning_rate = learning_rate

        # get number of batches for a single epoch
        self.num_batches = self.train_set.total_batches
        self.test_num_batches = self.test_set.total_batches
        self.predict_num_batches = self.predict_set.total_batches
def train_it():

    SAVE_PATH = '/mnt/md1/Experiments/SSAD_Test9'

    config = Config()
    ssad = SSAD(config).to(device)
    # optim = torch.optim.SGD(ssad.parameters(),lr=0.5,momentum=0.9,weight_decay=0.0001)
    optim = torch.optim.Adam(ssad.parameters(),lr=config.learning_rates[0],weight_decay=0.0001)

    # dataset_train = DataSet('training',True,'HQZ_DPN107_RGB_FULL')
    # dataset_val   = DataSet('validation',False,'HQZ_DPN107_RGB_FULL')

    dataset_train = DataSet('training',False,'MIX_RES200_DPN107')
    dataset_val   = DataSet('validation',False,'MIX_RES200_DPN107')

    TRAIN_ITER =  len(dataset_train.vids)//config.batch_size+1
    VAL_ITER = len(dataset_val.vids)//config.batch_size+1

    for epoch in range(config.training_epochs):

        ssad.train()

        dataset_train.pemutate_vids()
        dataset_val.pemutate_vids()

        for idx in range(TRAIN_ITER):

            gF,gL,gB,gI = dataset_train.nextbatch(config.batch_size)
            gF = np.transpose(gF,(0,2,1))
            gF = torch.from_numpy(gF).to(device).float()
            gL = torch.from_numpy(gL).to(device).long()
            gB = torch.from_numpy(gB).to(device).float()

            ssad.zero_grad()
            train_loss,_,_ = SSAD_Train(ssad,gF,gL,gB,gI,config)
            train_loss.backward()
            optim.step()

            print('Train: {} {}/{} train_loss: {}'.format(epoch,idx,TRAIN_ITER,train_loss.item()),flush=True)

        if epoch%2==0:
            ssad.eval()
            for idx in range(VAL_ITER):
                with torch.no_grad():
                    gF,gL,gB,gI = dataset_val.nextbatch(config.batch_size)
                    gF = np.transpose(gF,(0,2,1))
                    gF = torch.from_numpy(gF).to(device).float()
                    gL = torch.from_numpy(gL).to(device).long()
                    gB = torch.from_numpy(gB).to(device).float()

                    val_loss,_,_ = SSAD_Train(ssad,gF,gL,gB,gI,config)
                    print('Test: {} {}/{} test_loss: {}'.format(epoch,idx,VAL_ITER,val_loss.item()),flush=True)

            # save model
            save_modle(ssad,SAVE_PATH+'/ssad_resnet200_2048_{:03d}.pth'.format(epoch))
            # change learning rate
            change_optim_lr(optim,config.learning_rates[epoch])
Example #18
    def moving_extract(self,
                       window=30,
                       date=None,
                       open_prices=None,
                       close_prices=None,
                       high_prices=None,
                       low_prices=None,
                       volumes=None,
                       N_predict=1,
                       flatten=True):

        self.extract(open_prices=open_prices,
                     close_prices=close_prices,
                     high_prices=high_prices,
                     low_prices=low_prices,
                     volumes=volumes)

        feature_arr = numpy.asarray(self.feature)
        p = 0
        rows = feature_arr.shape[0]
        print("feature dimension: %s" % rows)
        all_data = DataSet([], [], [])
        predict = DataSet([], [], [])

        while p + window <= feature_arr.shape[1]:
            # The last self.prospective days can not produce complete labels
            if feature_arr.shape[1] - (p + window) >= N_predict:
                x = feature_arr[:, p:p + window]
                # Label: closing prices over the next self.prospective days
                y = make_label(close_prices, p + window, self.prospective)
                d = list(date[p + window:p + window + self.prospective])

                if flatten:
                    x = x.flatten("F")
                all_data.features.append(numpy.nan_to_num(x))
                all_data.labels.append(y)
                all_data.date.append(d)

            else:
                x = feature_arr[:, p:p + window]
                if flatten:
                    x = x.flatten("F")
                predict.features.append(numpy.nan_to_num(x))
                predict.date.append(date[p + window - 1])
                predict.closing_price.append(close_prices[p + window - 1])
                predict.last_label.append(close_prices[p + window - 2])
            p += 1

        all_data._features = numpy.asarray(all_data.features)
        all_data._labels = numpy.asarray(all_data.labels)
        all_data._date = numpy.asarray(all_data.date)
        predict._features = numpy.asarray(predict.features)
        predict._date = numpy.asarray(predict.date)
        predict._last_label = numpy.asarray(predict.last_label)
        predict._closing_price = numpy.asarray(predict.closing_price)

        return all_data, predict
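
The window bookkeeping in moving_extract can be summarized with a small NumPy sketch (sliding_windows is an illustrative helper; windows whose label horizon would run past the end of the series become prediction-only samples):

import numpy as np

def sliding_windows(feature_arr, window, horizon):
    # Split window start positions into (trainable, predict-only) lists
    trainable, predict_only = [], []
    p = 0
    while p + window <= feature_arr.shape[1]:
        if feature_arr.shape[1] - (p + window) >= horizon:
            trainable.append(p)       # enough future columns left to build a label
        else:
            predict_only.append(p)    # no complete label available
        p += 1
    return trainable, predict_only

# 5 features over 40 time steps, 30-step window, 1-step label horizon
print(sliding_windows(np.zeros((5, 40)), window=30, horizon=1))
# ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10])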
Example #19
def import_mnist():
    """
    This imports MNIST and wraps the data in our DataSet class
    :return:
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'MNIST_data'


    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(open(local_file, 'rb'))

    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(open(local_file, 'rb'), one_hot=ONE_HOT)

    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(open(local_file, 'rb'))

    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(open(local_file, 'rb'), one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    train_mean, train_std = get_data_info(train_images)
#    train_images = standardize_data(train_images, train_mean, train_std)
#    validation_images = standardize_data(validation_images, train_mean, train_std)
#    test_images = standardize_data(test_images, train_mean, train_std)

    # data = DataSet(train_images, train_labels)
    # test = DataSet(test_images, test_labels)
    # val = DataSet(validation_images, validation_labels)

    data = DataSet(train_images, train_images)
    test = DataSet(test_images, test_images)
    val = DataSet(validation_images, validation_images)


    return data, test, val
Example #20
def test():
	with open('unrelated_vs_all.pkl', 'rb') as input:
		unrelated_vs_all = pickle.load(input)

	with open('disagree_vs_all.pkl', 'rb') as input:
		disagree_vs_all = pickle.load(input)

	with open('agree_vs_all.pkl', 'rb') as input:
		agree_vs_all = pickle.load(input)
	# create the test set with lemmatized bodies
	test_set = DataSet("csv/test_stances_csc483583.csv", "csv/lemmatized_bodies.csv")
	# create an original set that has original bodies
	orig_set = DataSet("csv/test_stances_csc483583.csv", "csv/train_bodies.csv")
	stances = test_set.stances
	articles = test_set.articles
	orig_articles = orig_set.articles
	gold = []
	count = 0

	for stance in stances:
		stance_result = ""
		headline = stance['Headline']
		bodyID = stance['Body ID']
		#get lemmatized body from DataSet created with lemmatized_bodies.csv
		body_lemmas = articles[bodyID]
		#get the original body from DataSet created with train_bodies.csv
		orig_body = orig_articles[bodyID]
		count += 1
		print("classifying article id: " + str(bodyID))
		print("article count: " + str(count))
		similarity_score, similar_sentences, max_similarity, negation_average = similarity_feature(headline, body_lemmas, orig_body)
		neg = max_similarity.get('Negates')
		if(neg == None):
			neg = 0
		max_score = max_similarity.get('Score')
		if(max_score == None):
			max_score = 0.0
		# predict stance_result using SVM
		unrelated_vs_all_result = unrelated_vs_all.predict([[similarity_score, max_score]])
		disagree_vs_all_result = disagree_vs_all.predict([[negation_average]])
		agree_vs_all_result = agree_vs_all.predict([[similarity_score, max_score]])
		if(unrelated_vs_all_result == 1):
			stance_result = 'unrelated'
		elif(disagree_vs_all_result == 1):
			stance_result = 'disagree'
		elif(agree_vs_all_result == 1):
			stance_result = 'agree'
		else:
			stance_result = 'discuss'

		gold.append({'Headline': headline, 'Body ID': bodyID, 'Stance': stance_result})

	keys = gold[0].keys()
	with open('csv/gold.csv', 'wb') as output_file:
		dict_writer = csv.DictWriter(output_file, keys)
		dict_writer.writeheader()
		dict_writer.writerows(gold)
Example #21
    def init_model(self):
        print("initilizing network\n")

        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            self.model = createDeepLabv3()
            self.model = nn.DataParallel(self.model,
                                         device_ids=self.device_ids).to(
                                             self.device)
        else:
            self.model = createDeepLabv3().to(self.device)

        # self.optim = torch.optim.Adam(self.model.parameters(), lr=self.lr, betas=(self.beta_1, self.beta_2))
        self.optim = torch.optim.SGD(self.model.parameters(), lr=self.lr)

        self.criterian = torch.nn.MSELoss(reduction='mean')

        self.transform = transforms.Compose([
            # transforms.RandomResizedCrop(128, scale=(0.08, 1.0), ratio=(0.75, 1.3333333333333333), interpolation=2),
            # transforms.RandomRotation((-90,90)),
            # transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=0),
            # transforms.RandomHorizontalFlip(p=0.8),
            # transforms.RandomVerticalFlip(p=0.8),
            # transforms.RandomAffine((-5, 5)),
            # # transforms.GaussianBlur(kernel_size, sigma=(0.1, 2.0)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])

        # Build the training and validation datasets
        # Per-channel (RGB) mean and standard deviation
        color_mean = (0.485, 0.456, 0.406)
        color_std = (0.229, 0.224, 0.225)

        self.train_ = DataSet(img_dir=self.img_dir,
                              mask_dir=self.mask_dir,
                              size=self.im_size,
                              data_type="train")
        #                                transform=DataTransform(input_size=1024, color_mean=color_mean, color_std=color_std))
        self.valid_ = DataSet(img_dir=self.img_dir,
                              mask_dir=self.mask_dir,
                              size=self.im_size,
                              data_type="validation")
        #                                transform=DataTransform(input_size=1024, color_mean=color_mean, color_std=color_std))

        self.dataloader_train = DataLoader(self.train_,
                                           batch_size=self.batch_size,
                                           num_workers=4,
                                           shuffle=True)
        self.dataloader_valid = DataLoader(self.valid_,
                                           batch_size=self.batch_size,
                                           num_workers=4,
                                           shuffle=False)

        print("initilization done\n")
def import_dataset(dataset, fold):

    train_X = np.loadtxt('FOLDS/' + dataset + '_ARD_Xtrain__FOLD_' + fold, delimiter=' ')
    train_Y = np.loadtxt('FOLDS/' + dataset + '_ARD_ytrain__FOLD_' + fold, delimiter=' ')
    test_X = np.loadtxt('FOLDS/' + dataset + '_ARD_Xtest__FOLD_' + fold, delimiter=' ')
    test_Y = np.loadtxt('FOLDS/' + dataset + '_ARD_ytest__FOLD_' + fold, delimiter=' ')

    data = DataSet(train_X, train_Y)
    test = DataSet(test_X, test_Y)

    return data, test
Example #23
def basic_eg1k_checkup():
    dss = []

    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_rnd_std'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_rnd_kcv'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_chr_frm'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_chr_prc'))

    for ds in dss:
        check_trn_tst_disjoint(ds)
        check_trn_symmetric_and_connected(ds)
Example #24
def read_feature(path, input_shape, prefix):
    ultimate_features = numpy.loadtxt("%s/%s_feature.%s" % (path, prefix, str(input_shape[0])))
    ultimate_features = numpy.reshape(ultimate_features, [-1, input_shape[0], input_shape[1]])
    ultimate_labels = numpy.loadtxt("%s/%s_label.%s" % (path, prefix, str(input_shape[0])))
    # ultimate_labels = numpy.reshape(ultimate_labels, [-1, 1])
    train_set = DataSet(ultimate_features, ultimate_labels)
    test_features = numpy.loadtxt("%s/%s_feature.test.%s" % (path, prefix, str(input_shape[0])))
    test_features = numpy.reshape(test_features, [-1, input_shape[0], input_shape[1]])
    test_labels = numpy.loadtxt("%s/%s_label.test.%s" % (path, prefix, str(input_shape[0])))
    # test_labels = numpy.reshape(test_labels, [-1, 1])
    test_set = DataSet(test_features, test_labels)
    return train_set, test_set
Example #25
def train():
    # training data
    data_train, label_train = DataSet.data_from_text("./Hnd/trainyny.txt",
                                                     1450)
    train = DataSet(data_train, label_train, dtype=dtypes.float32)
    data_test, label_test = DataSet.data_from_text("./Hnd/testyny.txt", 145)
    test = DataSet(data_test, label_test, dtype=dtypes.float32)
    DataSetsx = collections.namedtuple('DataSetsx', ['train', 'test'])
    Data = DataSetsx(train=train, test=test)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver({
            'cnn_w1': W_conv1,
            'cnn_w2': W_conv2,
            'cnn_w3': W_fc1,
            'cnn_w4': W_fc2,
            'cnn_b1': b_conv1,
            'cnn_b2': b_conv2,
            'cnn_b3': b_fc1,
            'cnn_b4': b_fc2
        })
        for i in range(50000):  # training phase: up to 50000 iterations
            batch_X, batch_Y = Data.train.next_batch(100)
            sess.run(train_step,
                     feed_dict={
                         xs: batch_X,
                         ys: batch_Y,
                         keep_prob: 0.5
                     })
            accu = 0
            if (i % 100 == 0):  # evaluate every 100 iterations
                v_xs = Data.test.images
                v_ys = Data.test.labels
                y_pre = sess.run(prediction,
                                 feed_dict={
                                     xs: v_xs,
                                     keep_prob: 1
                                 })
                correct_prediction = tf.equal(tf.argmax(y_pre, 1),
                                              tf.argmax(v_ys, 1))
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))
                accu = sess.run(accuracy,
                                feed_dict={
                                    xs: v_xs,
                                    ys: v_ys,
                                    keep_prob: 1
                                })
                print("accuracy:", accu)
            if (accu > Target_Accuracy):
                break
        saver.save(sess, "./model/cnn.ckpt")
Example #26
    def build(self):
        cfg = utils.load_config()

        if os.path.exists(f"{cfg['user']}.csv"):
            print('Existing csv found, loading the file')
            dataset = DataSet(cfg, create_csv=False)
        else:
            print('No csv found, creating one using segmented data')
            extract.extract_all(data_path=cfg['data_path'],
                                segments_path=cfg['segments_path'])
            dataset = DataSet(cfg, create_csv=True)
        return VideoWidget(dataset, cfg)
Example #27
def read_ultimate(path, input_shape):
    ultimate_features = numpy.loadtxt(path + "ultimate_feature." + str(input_shape[0]))
    ultimate_features = numpy.reshape(ultimate_features, [-1, input_shape[0], input_shape[1]])
    ultimate_labels = numpy.loadtxt(path + "ultimate_label." + str(input_shape[0]))
    # ultimate_labels = numpy.reshape(ultimate_labels, [-1, 1])
    train_set = DataSet(ultimate_features, ultimate_labels)
    test_features = numpy.loadtxt(path + "ultimate_feature.test." + str(input_shape[0]))
    test_features = numpy.reshape(test_features, [-1, input_shape[0], input_shape[1]])
    test_labels = numpy.loadtxt(path + "ultimate_label.test." + str(input_shape[0]))
    # test_labels = numpy.reshape(test_labels, [-1, 1])
    test_set = DataSet(test_features, test_labels)
    return train_set, test_set
Example #28
def import_dataset(dataset, k_fold):

    path_train_1 = os.path.join(path_hdf5, 'fold_0.hdf')
    hf_0 = h5py.File(path_train_1, 'r')

    train_X = loading_data(hf_0)
    test_X = train_X
    hf_0.close()

    data = DataSet(train_X, train_X)
    test = DataSet(test_X, test_X)

    return data, test
Example #29
def read_dataset(folder_name, debug=False):
    f = gzip.open(folder_name, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    n_samples = train_set[0].shape[0]

    if debug:
        n_samples = 10000

    datasets_template = collections.namedtuple('Datasets_template', ['train', 'validation', 'test'])
    Datasets = datasets_template(train=DataSet(train_set[0][:n_samples, :], train_set[1]),
                                 validation=DataSet(valid_set[0], valid_set[1]), test=DataSet(test_set[0], test_set[1]))
    return Datasets
    def test_constructor_reads_from_file_and_concats(self):
        one_result: pd.DataFrame = DataSet(["./fixtures/nine_records.csv"],
                                           PreprocessorSpy(),
                                           TrainerStub())._df

        assert_that(one_result).is_not_none()
        assert_that(len(one_result)).is_equal_to(9)

        two_results: pd.DataFrame = \
            DataSet(["./fixtures/nine_records.csv", "./fixtures/four_records.csv"], PreprocessorSpy(), TrainerStub()) \
                ._df

        assert_that(two_results).is_not_none()
        assert_that(len(two_results)).is_equal_to(13)