Example #1
# Imports and TestCase wrapper assumed so this excerpt can run standalone.
import unittest

import exceptions
import load_data


class TestData(unittest.TestCase):
    def test_load_data(self):
        # warning: data has been tampered with for testing purposes
        self.assertRaises(
            exceptions.DataMismatchException,
            lambda: load_data.Data('unittests/data_unit_mismatch.json',
                                   'unittests/geneea_unit_mismatch.json'))

        # warning: data has been tampered with for testing purposes
        data = load_data.Data('unittests/data_unit.json',
                              'unittests/geneea_unit.json')
        # generate_sample returns the size of the training set
        self.assertEqual(
            data.generate_sample(10, load_data.LikeTypeEnum.USEFUL), 20)

        # with no feature sets requested, the first sample's feature dict is empty
        self.assertEqual(
            data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())[0][0],
            {})

        # regenerate the sample and request only the review-length feature set
        self.assertEqual(
            data.generate_sample(10, load_data.LikeTypeEnum.USEFUL), 20)
        self.assertEqual(
            len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
            18)
        self.assertTrue('review_length' in data.get_feature_dict(
            load_data.SampleTypeEnum.TRAIN,
            {load_data.FeatureSetEnum.REVIEWLEN})[0][0])

        # number of training instances, before and after limiting the training set
        self.assertEqual(
            len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
            18)
        data.limit_train_size(10)
        self.assertEqual(
            len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
            10)

        # test insufficient data exception
        self.assertRaises(IndexError, lambda: data.limit_train_size(1000))

        # test add n-grams
        data.used_ngrams = {'a', 'b'}
        fs = {'c': 2}
        data.add_ngram(fs, ['b', 'b', 'c', 'a'], 2)
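        # only the ('b', 'b') bigram is added: (b, c) and (c, a) contain 'c', which is not in used_ngrams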
        self.assertEqual(fs, {
            'c': 2,
            'contains(b&b&)': 'Yes',
        })
Example #2
# Imports assumed for this excerpt; the plotting helpers come from the surrounding project.
import load_data
import matplotlib.pyplot as plt


def Analysis():
    data = load_data.Data('/home/gustaf/Downloads/data/final/')
    # bar_diagram_video_rating(data, 1)
    # box_pulse_video(data, 2)
    # box_pulse_score(data, 3)
    # scatter_pulse_score(data, 4)
    # scatter_genre_scores(data, 5)
    # box_joy_video(data, 6)
    # hist_emotion_facial_expressions(data, 7)
    # box_video_facial_expression(data, 8)
    # scatter_smile_contempt(data, 9)
    # bar_diagram_video_rating_total_binarized(data, 10)

    # hist_emotion_facial_expressions(data, 1)
    # bar_chart_video_watch_similar(data, 1)
    # bar_chart_video_watch_similar_selective(data, 1,['Despicable','Dog'])
    # bar_chart_video_watch_similar_selective(data, 1, ['Shining'], 1, 1)
    # histogram_score(data,1)
    # bar_diagram_video_rating_total(data, 1)
    # bar_chart_video_watch_similar_total(data,2)

    # prepare_face_files()
    # calculate_percentage_of_face()
    plot_one_pulse_file('/home/gustaf/Downloads/data/1/pulse_files/pulse_Amish_1.txt', 11)
    plt.show()
Example #3
            #     print file.create_file(url_object)

            # file = FileIO('search_index/' + data.get_index_file())
            # print file.create_file(url_object)
            # f = open(os.path.dirname(__file__) + '/../data.yml')
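            # update the existing index file for this url_object, or create it on first use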
            if os.path.isfile('../search_index/' + data.index_file):
                file = FileIO('../search_index/' + data.index_file)
                file.update_file(url_object)
            else:
                file = FileIO('../search_index/' + data.index_file)
                file.create_file(url_object)

            # url_object[]


if __name__ == '__main__':
    # data = load_data.Data('test.json')
    # if not data.is_file():
    #     data_array = data.segmentation()
    #     # print len(data_array)
    #     for segment in data_array:
    #         ProcessThread(segment, data).start()
    for file in os.listdir(DATA_SOURCE):
        if file.endswith(".json") or file.endswith(".txt"):
            data = load_data.Data(file)
            if not data.is_file():
                data_array = data.segmentation()
                for segment in data_array:
                    ProcessThread(segment, data).start()

Example #4
import load_data
import math
import numpy as np
import theano
import theano.tensor as T
import pdb
from datetime import *
import csv

data = load_data.Data('data/r1.train', 'data/r1.test')
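# the rating data is loaded once at module level and shared by the Train class below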


class Train(object):
    """docstring for ClassName"""
    def __init__(self, user_d, item_d, relation_d, margin_in, rate_in,
                 reg_param_in):
        self.n = user_d
        self.m = item_d
        self.k = relation_d
        self.margin = margin_in
        self.rate = rate_in
        self.reg_param = reg_param_in
        self.train_num = data.train_matrix.shape[0]
        self.test_num = data.test_matrix.shape[0]
        self.user_num = len(data.userid2seq)
        self.item_num = len(data.itemid2seq)
        self.relation_num = len(data.relation2seq) // 2  # integer division keeps the count an int
        self.user_vec = theano.shared(np.asarray(np.random.uniform(
            -6 / math.sqrt(self.n), 6 / math.sqrt(self.n),
            (self.user_num, self.n)),
                                                 dtype='float32'),
Example #5
import utils as ut
# from params import *
import params
import load_data
import multiprocessing
import numpy as np
cores = 4
# cores = multiprocessing.cpu_count()
# print("cores:"+str(cores))

print(params.BATCH_SIZE)
# train_user_file_name = 'testTrain.dat'
data_generator_1 = load_data.Data(
    train_file=params.DIR + params.trainUserFileName_1,
    test_file=params.DIR + params.testUserFileName_1,
    batch_size=params.BATCH_SIZE)
data_generator_2 = load_data.Data(
    train_file=params.DIR + params.trainUserFileName_2,
    test_file=params.DIR + params.testUserFileName_2,
    batch_size=params.BATCH_SIZE)
data_generator_all = [data_generator_1, data_generator_2]
# print(params.trainUserFileName_1, params.testUserFileName_1)
# print(params.trainUserFileName_2, params.testUserFileName_2)
USER_NUM_1, ITEM_NUM_1 = data_generator_1.get_num_users_items()
USER_NUM_2, ITEM_NUM_2 = data_generator_2.get_num_users_items()


def test_one_user(x):
    # user u's ratings
    rating = x[0]
    # uid

Example #6
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_channels = 1
    n_classes = 27
    # input_size, input_channels, n_classes, train_data = utils.get_data(
    #     config.dataset, config.data_path, cutout_length=0, validation=False)

    net_crit = nn.CrossEntropyLoss().to(device)
    model = SearchCNNController(input_channels,
                                config.init_channels,
                                n_classes,
                                config.layers,
                                net_crit,
                                device_ids=config.gpus)
    model = model.to(device)

    # weights optimizer
    w_optim = torch.optim.SGD(model.weights(),
                              config.w_lr,
                              momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)
    # alphas optimizer
    alpha_optim = torch.optim.Adam(model.alphas(),
                                   config.alpha_lr,
                                   betas=(0.5, 0.999),
                                   weight_decay=config.alpha_weight_decay)

    # split data to train/validation
    # n_train = len(train_data[0])
    # split = n_train // 2
    # indices = list(range(n_train))
    # train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
    # valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
    # train_loader = torch.utils.data.DataLoader(train_data,
    #                                            batch_size=config.batch_size,
    #                                            sampler=train_sampler,
    #                                            num_workers=config.workers,
    #                                            pin_memory=True)
    # valid_loader = torch.utils.data.DataLoader(train_data,
    #                                            batch_size=config.batch_size,
    #                                            sampler=valid_sampler,
    #                                            num_workers=config.workers,
    #                                            pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model, config.w_momentum, config.w_weight_decay)
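    # Architect handles the architecture (alpha) gradient step used together with alpha_optim during training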

    # training loop
    best_top1 = 0.
    for epoch in range(config.epochs):
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]

        model.print_alphas(logger)

        train_data = load_data.Data(data_path=train_data_path,
                                    label_path=train_label_path)
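        # the training set is rebuilt every epoch; train_data_path / train_label_path are defined outside this excerpt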
        # training
        train(train_data, model, architect, w_optim, alpha_optim, lr, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_data)
        test_data = load_data.Data(data_path=test_data_path,
                                   label_path=test_label_path)
        top1 = validate(test_data, model, epoch, cur_step)

        # log the current genotype
        genotype = model.genotype()
        logger.info("genotype = {}".format(genotype))

        # genotype as an image
        plot_path = os.path.join(config.plot_path,
                                 "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot(genotype.normal, plot_path + "-normal", caption)
        plot(genotype.reduce, plot_path + "-reduce", caption)

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))