def test_load_data(self):
    # warning: this data has been tampered with for testing purposes
    self.assertRaises(
        exceptions.DataMismatchException,
        lambda: load_data.Data('unittests/data_unit_mismatch.json',
                               'unittests/geneea_unit_mismatch.json'))

    # warning: this data has been tampered with for testing purposes
    data = load_data.Data('unittests/data_unit.json',
                          'unittests/geneea_unit.json')

    # generate_sample returns the size of the training set
    self.assertEqual(
        data.generate_sample(10, load_data.LikeTypeEnum.USEFUL), 20)

    # test returned samples
    self.assertEqual(
        data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())[0][0],
        {})

    # only review-length features
    self.assertEqual(
        data.generate_sample(10, load_data.LikeTypeEnum.USEFUL), 20)
    self.assertEqual(
        len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
        18)
    self.assertTrue('review_length' in data.get_feature_dict(
        load_data.SampleTypeEnum.TRAIN,
        {load_data.FeatureSetEnum.REVIEWLEN})[0][0])

    # number of instances
    self.assertEqual(
        len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
        18)
    data.limit_train_size(10)
    self.assertEqual(
        len(data.get_feature_dict(load_data.SampleTypeEnum.TRAIN, set())),
        10)

    # limiting beyond the available data raises IndexError
    self.assertRaises(IndexError, lambda: data.limit_train_size(1000))

    # test adding n-grams
    data.used_ngrams = {'a', 'b'}
    fs = {'c': 2}
    data.add_ngram(fs, ['b', 'b', 'c', 'a'], 2)
    self.assertEqual(fs, {
        'c': 2,
        'contains(b&b&)': 'Yes',
    })
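# The assertions above pin down the contract of Data.add_ngram: it scans the
# token list for n-grams whose tokens all appear in self.used_ngrams and adds
# a 'contains(...)' flag for each hit. A minimal sketch consistent with the
# expected key 'contains(b&b&)' (the real implementation in load_data may
# differ):
def add_ngram(self, fs, tokens, n):
    for i in range(len(tokens) - n + 1):
        ngram = tokens[i:i + n]
        if all(t in self.used_ngrams for t in ngram):
            key = 'contains(' + ''.join(t + '&' for t in ngram) + ')'
            fs[key] = 'Yes'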
def Analysis():
    data = load_data.Data('/home/gustaf/Downloads/data/final/')
    # bar_diagram_video_rating(data, 1)
    # box_pulse_video(data, 2)
    # box_pulse_score(data, 3)
    # scatter_pulse_score(data, 4)
    # scatter_genre_scores(data, 5)
    # box_joy_video(data, 6)
    # hist_emotion_facial_expressions(data, 7)
    # box_video_facial_expression(data, 8)
    # scatter_smile_contempt(data, 9)
    # bar_diagram_video_rating_total_binarized(data, 10)
    # hist_emotion_facial_expressions(data, 1)
    # bar_chart_video_watch_similar(data, 1)
    # bar_chart_video_watch_similar_selective(data, 1, ['Despicable', 'Dog'])
    # bar_chart_video_watch_similar_selective(data, 1, ['Shining'], 1, 1)
    # histogram_score(data, 1)
    # bar_diagram_video_rating_total(data, 1)
    # bar_chart_video_watch_similar_total(data, 2)
    # prepare_face_files()
    # calculate_percentage_of_face()
    plot_one_pulse_file('/home/gustaf/Downloads/data/1/pulse_files/pulse_Amish_1.txt', 11)
    plt.show()
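# plot_one_pulse_file is the only plotting helper left enabled above. A
# minimal sketch of a function of that shape, assuming each pulse file holds
# one numeric reading per line (the real parser may also handle timestamps):
def plot_one_pulse_file(path, fig_num):
    with open(path) as f:
        pulse = [float(line) for line in f if line.strip()]
    plt.figure(fig_num)
    plt.plot(pulse)
    plt.xlabel('sample')
    plt.ylabel('pulse')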
# print file.create_file(url_object)
# file = FileIO('search_index/' + data.get_index_file())
# print file.create_file(url_object)
# f = open(os.path.dirname(__file__) + '/../data.yml')
if os.path.isfile('../search_index/' + data.index_file):
    file = FileIO('../search_index/' + data.index_file)
    file.update_file(url_object)
else:
    file = FileIO('../search_index/' + data.index_file)
    file.create_file(url_object)
# url_object[]


if __name__ == '__main__':
    # data = load_data.Data('test.json')
    # if not data.is_file():
    #     data_array = data.segmentation()
    #     # print len(data_array)
    #     for segment in data_array:
    #         ProcessThread(segment, data).start()
    for file in os.listdir(DATA_SOURCE):
        if file.endswith(".json") or file.endswith(".txt"):
            data = load_data.Data(file)
            if not data.is_file():
                data_array = data.segmentation()
                for segment in data_array:
                    ProcessThread(segment, data).start()
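# ProcessThread is started once per segment returned by Data.segmentation().
# A minimal sketch of a worker of that shape; the process() method is a
# hypothetical name for whatever indexing work the real class performs:
import threading

class ProcessThread(threading.Thread):
    def __init__(self, segment, data):
        threading.Thread.__init__(self)
        self.segment = segment
        self.data = data

    def run(self):
        # index this segment on behalf of the shared Data object
        self.data.process(self.segment)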
import load_data
import math
import numpy as np
import theano
import theano.tensor as T
import pdb
from datetime import *
import csv

data = load_data.Data('data/r1.train', 'data/r1.test')


class Train(object):
    """Holds training hyperparameters and the shared embedding matrices."""

    def __init__(self, user_d, item_d, relation_d, margin_in, rate_in,
                 reg_param_in):
        self.n = user_d          # user embedding dimension
        self.m = item_d          # item embedding dimension
        self.k = relation_d      # relation embedding dimension
        self.margin = margin_in
        self.rate = rate_in
        self.reg_param = reg_param_in
        self.train_num = data.train_matrix.shape[0]
        self.test_num = data.test_matrix.shape[0]
        self.user_num = len(data.userid2seq)
        self.item_num = len(data.itemid2seq)
        # integer division: relation2seq holds two entries per relation
        self.relation_num = len(data.relation2seq) // 2
        self.user_vec = theano.shared(np.asarray(np.random.uniform(
            -6 / math.sqrt(self.n), 6 / math.sqrt(self.n),
            (self.user_num, self.n)), dtype='float32'))
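# The +-6/sqrt(dim) bound above is the Xavier-style uniform initialization
# used by TransE-like models. A small helper capturing the pattern, assuming
# the item and relation embeddings follow the same convention (init_embedding
# is hypothetical, not part of the original file):
def init_embedding(rows, dim):
    bound = 6 / math.sqrt(dim)
    return theano.shared(np.asarray(
        np.random.uniform(-bound, bound, (rows, dim)), dtype='float32'))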
import utils as ut
# from params import *
import params
import load_data
import multiprocessing
import numpy as np

cores = 4
# cores = multiprocessing.cpu_count()
# print("cores:" + str(cores))
print(params.BATCH_SIZE)

# train_user_file_name = 'testTrain.dat'
data_generator_1 = load_data.Data(
    train_file=params.DIR + params.trainUserFileName_1,
    test_file=params.DIR + params.testUserFileName_1,
    batch_size=params.BATCH_SIZE)
data_generator_2 = load_data.Data(
    train_file=params.DIR + params.trainUserFileName_2,
    test_file=params.DIR + params.testUserFileName_2,
    batch_size=params.BATCH_SIZE)
data_generator_all = [data_generator_1, data_generator_2]
# print(params.trainUserFileName_1, params.testUserFileName_1)
# print(params.trainUserFileName_2, params.testUserFileName_2)

USER_NUM_1, ITEM_NUM_1 = data_generator_1.get_num_users_items()
USER_NUM_2, ITEM_NUM_2 = data_generator_2.get_num_users_items()


def test_one_user(x):
    # user u's ratings
    rating = x[0]
    # uid
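# test_one_user is typically fed (predicted_ratings, uid) pairs through a
# multiprocessing pool and scores the ranking against that user's held-out
# items. A minimal sketch under that assumption; train_items and test_set
# are hypothetical attributes of the Data generators above:
def test_one_user_sketch(x):
    rating, u = x[0], x[1]
    seen = set(data_generator_1.train_items[u])   # exclude training items
    truth = set(data_generator_1.test_set[u])     # ground-truth test items
    ranked = [i for i in np.argsort(-np.asarray(rating)) if i not in seen]
    hits = sum(1 for i in ranked[:10] if i in truth)
    return {'recall@10': hits / max(len(truth), 1)}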
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_channels = 1
    n_classes = 27
    # input_size, input_channels, n_classes, train_data = utils.get_data(
    #     config.dataset, config.data_path, cutout_length=0, validation=False)

    net_crit = nn.CrossEntropyLoss().to(device)
    model = SearchCNNController(input_channels, config.init_channels,
                                n_classes, config.layers, net_crit,
                                device_ids=config.gpus)
    model = model.to(device)

    # weights optimizer
    w_optim = torch.optim.SGD(model.weights(), config.w_lr,
                              momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)
    # alphas optimizer
    alpha_optim = torch.optim.Adam(model.alphas(), config.alpha_lr,
                                   betas=(0.5, 0.999),
                                   weight_decay=config.alpha_weight_decay)

    # split data to train/validation
    # n_train = len(train_data[0])
    # split = n_train // 2
    # indices = list(range(n_train))
    # train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
    # valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
    # train_loader = torch.utils.data.DataLoader(train_data,
    #                                            batch_size=config.batch_size,
    #                                            sampler=train_sampler,
    #                                            num_workers=config.workers,
    #                                            pin_memory=True)
    # valid_loader = torch.utils.data.DataLoader(train_data,
    #                                            batch_size=config.batch_size,
    #                                            sampler=valid_sampler,
    #                                            num_workers=config.workers,
    #                                            pin_memory=True)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model, config.w_momentum, config.w_weight_decay)

    # training loop
    best_top1 = 0.
    for epoch in range(config.epochs):
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]

        model.print_alphas(logger)

        train_data = load_data.Data(data_path=train_data_path,
                                    label_path=train_label_path)
        # training
        train(train_data, model, architect, w_optim, alpha_optim, lr, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_data)
        test_data = load_data.Data(data_path=test_data_path,
                                   label_path=test_label_path)
        top1 = validate(test_data, model, epoch, cur_step)

        # log
        # genotype
        genotype = model.genotype()
        logger.info("genotype = {}".format(genotype))

        # genotype as an image
        plot_path = os.path.join(config.plot_path, "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot(genotype.normal, plot_path + "-normal", caption)
        plot(genotype.reduce, plot_path + "-reduce", caption)

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
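# utils.save_checkpoint above is expected to persist the latest weights and
# keep a copy of the best-scoring ones. A minimal sketch of a helper with
# that behavior (the project's own utils may store optimizer state as well):
import os
import shutil

def save_checkpoint_sketch(model, ckpt_dir, is_best):
    path = os.path.join(ckpt_dir, 'checkpoint.pth.tar')
    torch.save(model.state_dict(), path)
    if is_best:
        shutil.copyfile(path, os.path.join(ckpt_dir, 'best.pth.tar'))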