def __init__(self, log_dir=None):
    """Build 90/10 train/validation loaders from the full dataset, record
    dataset statistics, and instantiate the configured evolutionary algorithm.

    :param log_dir: optional directory handed to Stats for logging output
    """
    dataset = self.create_dataset()
    n_train = int(0.9 * len(dataset))
    n_validation = len(dataset) - n_train
    train_split, validation_split = torch.utils.data.random_split(
        dataset, [n_train, n_validation])
    logger.info("train size: %d, validation size: %d" %
                (len(train_split), len(validation_split)))

    def _make_loader(split):
        # Both loaders share identical batching configuration.
        return torch.utils.data.DataLoader(
            split,
            batch_size=config.gan.batch_size,
            num_workers=config.gan.data_loader_workers,
            drop_last=True,
            shuffle=True)

    self.train_loader = _make_loader(train_split)
    self.validation_loader = _make_loader(validation_split)
    # Per-sample shape: peek at one batch and drop the leading batch dim.
    self.input_shape = next(iter(self.train_loader))[0].size()[1:]
    self.stats = Stats(log_dir=log_dir,
                       input_shape=self.input_shape,
                       train_loader=self.train_loader,
                       validation_loader=self.validation_loader)
    evaluator = Evaluator(self.train_loader, self.validation_loader)
    # Dispatch on the configured algorithm name; KeyError for unknown names.
    algorithms = {"NEAT": NEAT, "NSGA2": NSGA2}
    self.evolutionary_algorithm = algorithms[config.evolution.algorithm](evaluator)
def __init__(self, config):
    """Initializes the primary azurlane-auto instance with the passed in
    Config instance; creates the Stats instance, registers the enabled
    modules, and resets scheduled sleep timers.

    Args:
        config (Config): azurlane-auto Config instance
    """
    self.config = config
    self.oil_limit = 0
    self.stats = Stats(config)
    # Bug fix: self.modules was written to below without ever being created
    # in this initializer. Start from an empty registry so only enabled
    # modules are present. NOTE(review): if the enclosing class declares a
    # `modules` class attribute, this also prevents instances from sharing
    # one mutable dict — confirm against the class definition.
    self.modules = {}
    if self.config.updates['enabled']:
        self.modules['updates'] = UpdateUtil(self.config)
    if self.config.combat['enabled']:
        self.modules['combat'] = CombatModule(self.config, self.stats)
        # Oil cap only applies when combat is enabled.
        self.oil_limit = self.config.combat['oil_limit']
    if self.config.commissions['enabled']:
        self.modules['commissions'] = CommissionModule(self.config, self.stats)
    if self.config.enhancement['enabled']:
        self.modules['enhancement'] = EnhancementModule(self.config, self.stats)
    if self.config.missions['enabled']:
        self.modules['missions'] = MissionModule(self.config, self.stats)
    if self.config.retirement['enabled']:
        self.modules['retirement'] = RetirementModule(self.config, self.stats)
    # A single headquarters module serves both the dorm and the academy.
    if self.config.dorm['enabled'] or self.config.academy['enabled']:
        self.modules['headquarters'] = HeadquartersModule(self.config, self.stats)
    if self.config.events['enabled']:
        self.modules['event'] = EventModule(self.config, self.stats)
    self.print_stats_check = True
    # First combat is allowed immediately.
    self.next_combat = datetime.now()
def __init__(self):
    """Build train/validation data loaders over a single dataset using
    index-based samplers, and record the per-sample input shape."""
    self.stats = Stats()
    self.train_dataset = self.create_dataset()
    # Random permutation of all indexes, cut at 90% into train/validation.
    train_indexes, validation_indexes = np.split(
        np.random.permutation(np.arange(len(self.train_dataset))),
        [int(0.9 * len(self.train_dataset))])
    logger.info("train size: %d, validation size: %d" %
                (len(train_indexes), len(validation_indexes)))
    # train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_indexes)
    # NOTE(review): SequentialSampler iterates the ENTIRE dataset, so the
    # train loader also visits the held-out validation indexes; the
    # commented-out SubsetRandomSampler restricted it to train_indexes.
    # Confirm this overlap is intentional (e.g. a debugging run).
    train_sampler = torch.utils.data.sampler.SequentialSampler(
        self.train_dataset)
    self.train_loader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=config.gan.batch_size,
        sampler=train_sampler,
        num_workers=0)
    validation_sampler = torch.utils.data.sampler.SubsetRandomSampler(
        validation_indexes)
    self.validation_loader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=config.gan.batch_size,
        sampler=validation_sampler)
    # Per-sample shape: first batch, drop the leading batch dimension.
    self.input_shape = next(iter(self.train_loader))[0].size()[1:]
def user_based_split(self, folds_num=5):
    """Splits the rating matrix user-by-user into `folds_num` folds.

    For each user with at least `folds_num` ratings, the rated item ids are
    randomly partitioned into `folds_num` groups; each fold takes one group
    as test and the rest as training. Users with fewer ratings go entirely
    into training for every fold. For each fold, writes
    train-fold_<k>-users.dat / train-fold_<k>-items.dat and
    test-fold_<k>-users.dat / test-fold_<k>-items.dat (line i holds a
    delimiter-separated list of ids rated by user/item i), plus split
    statistics to stats.txt.

    :param folds_num: the number of folds, default 5
    :return: None
    """
    # train[fold][user] / test[fold][user] accumulate item ids per user.
    train = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]
    test = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]
    for user in range(self.num_users):
        if user % 1000 == 0:  # progress report
            print("user_{}".format(user))
        items_ids = np.array(self.users_ratings[user])
        n = len(items_ids)
        if n >= folds_num:
            # Enough ratings: distribute them over all folds.
            idx = list(range(n))
            item_ids_folds = random_divide(idx, folds_num)
            for fold in range(folds_num):
                test_idx = item_ids_folds[fold]
                # All indexes not in the test group form the training set.
                train_idx = [id for id in idx if id not in test_idx]
                train[fold][user].extend(items_ids[train_idx].tolist())
                test[fold][user].extend(items_ids[test_idx].tolist())
        else:
            # Too few ratings: training only, empty test for every fold.
            for fold in range(folds_num):
                train[fold][user].extend(items_ids.tolist())
                test[fold][user].extend([])
    stats = Stats(self.generate_validation)
    for fold in range(folds_num):
        users_train = train[fold]
        # Transpose user->items lists into item->users lists.
        items_train = self.items_mat_from_users_ratings(users_train)
        for u in users_train:
            if len(u) == 0:
                # A user with no training ratings would break downstream CF.
                print("some users contains 0 training items, split again again!")
                raise Exception("Split_Error!")
        write_ratings(users_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "train-fold_{}-users.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(items_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "train-fold_{}-items.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        users_test = test[fold]
        items_test = self.items_mat_from_users_ratings(users_test)
        # Storing the fold test items for all users
        write_ratings(users_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "test-fold_{}-users.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(items_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "test-fold_{}-items.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        # Calculate statistics:
        # TODO: Calculate Validation sets: currently always empty here.
        users_validation = []
        items_validation = []
        if self.generate_validation:
            stats.add_fold_statistics(fold + 1, users_train, users_test,
                                      items_train, items_test,
                                      users_validation, items_validation)
        else:
            stats.add_fold_statistics(fold + 1, users_train, users_test,
                                      items_train, items_test)
        # calculate_split_stats(users_train, users_test, items_train, items_test, fold)
    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
def __init__(self, config):
    """Initializes the primary azurlane-auto instance with the passed in
    Config instance; creates the Stats instance, registers the enabled
    modules, and resets scheduled sleep timers.

    Args:
        config (Config): azurlane-auto Config instance
    """
    self.config = config
    self.stats = Stats(config)
    # Bug fix: self.modules was written to below without ever being created
    # in this initializer. Start from an empty registry so only enabled
    # modules are present. NOTE(review): if the enclosing class declares a
    # `modules` class attribute, this also prevents instances from sharing
    # one mutable dict — confirm against the class definition.
    self.modules = {}
    if self.config.commissions['enabled']:
        self.modules['commissions'] = CommissionModule(
            self.config, self.stats)
    if self.config.combat['enabled']:
        self.modules['combat'] = CombatModule(self.config, self.stats)
    if self.config.missions['enabled']:
        self.modules['missions'] = MissionModule(self.config, self.stats)
    self.print_stats_check = True
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.

    Builds a sum-of-embeddings premise/hypothesis classifier in Lasagne,
    compiles Theano train/cost/accuracy functions, and trains with RMSProp.

    :param exp_name: experiment name passed to Stats
    :param embed_data: path/data for the embedding table
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data: unused in this function body
    :param test_data_stats: unused in this function body
    :param log_path: unused in this function body
    :param batch_size:
    :param num_epochs:
    :param unroll_steps: sequence length for premise/hypothesis inputs
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after
        concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies
        if num_dense > 1
    :param penalty: Penalty to use for regularization ("l1" or "l2")
    :param reg_coeff: Regularization coeff to use for each layer of network;
        may want to support different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    num_ex_to_train = 30
    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings
    train_prem, train_hyp = generate_data(train_data, train_data_stats,
                                          "left", "right", table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data, val_data_stats,
                                      "left", "right", table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)
    # To test for overfitting capabilities of model: truncate the val set.
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]
    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")
    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)
    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)
    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')
    # Sentence embedding = sum of word embeddings.
    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)
    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])
    l_in = l_concat
    l_output = l_concat
    # Add 'num_dense' dense layers with tanh; top layer is softmax.
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in, num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                # NOTE(review): l_in is reassigned, so layers chain linearly.
                l_in = DenseLayer(l_in, num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)
    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h], network_output,
                                     on_unused_input='warn')
    # Compute cost
    # NOTE(review): if penalty is neither "l2" nor "l1", p_metric is never
    # bound and the regularization call below raises NameError.
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    # NOTE(review): reg_coeff appears both as the per-layer weight and as a
    # global multiplier, i.e. the penalty scales with reg_coeff**2 — confirm
    # this is intended.
    reg_cost = reg_coeff * regularize_layer_params_weighted(
        layer_dict, p_metric)
    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output,
                                        target_values).mean()) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)
    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)
    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)
    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)
    # TODO: Augment embedding layer to allow for masking inputs
    stats = Stats(exp_name)
    acc_num = 10  # accuracy is logged every acc_num minibatches
    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        # NOTE(review): xrange is Python 2 only.
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))
                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]
                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]
                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)
                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")
    except KeyboardInterrupt:
        # Allow manual interruption of training without losing the process.
        pass
def train(self, numEpochs=1, batchSize=5, learnRateVal=0.1,
          numExamplesToTrain=-1, gradMax=3., L2regularization=0.0,
          dropoutRate=0.0, sentenceAttention=False, wordwiseAttention=False):
    """
    Takes care of training model, including propagation of errors and
    updating of parameters.

    NOTE(review): the minibatches below are built from the VALIDATION
    matrices (valPremiseIdxMat/valHypothesisIdxMat/valGoldLabel), not the
    training ones — presumably a deliberate overfitting sanity check (the
    commented-out code points the same way); confirm before production use.

    :param numEpochs: number of passes over the (selected) data
    :param batchSize: minibatch size
    :param learnRateVal: learning-rate value fed to the update functions
    :param numExamplesToTrain: if > 0, train on only this many examples
    :param gradMax: gradient clipping bound passed to trainFunc
    :param L2regularization: L2 coefficient passed to trainFunc
    :param dropoutRate: dropout rate for training; dropoutMode toggles it
    :param sentenceAttention: enable sentence-level attention
    :param wordwiseAttention: enable word-level attention
    """
    expName = "Epochs_{0}_LRate_{1}_L2Reg_{2}_dropout_{3}_sentAttn_{4}_" \
              "wordAttn_{5}".format(str(numEpochs), str(learnRateVal),
                                    str(L2regularization), str(dropoutRate),
                                    str(sentenceAttention),
                                    str(wordwiseAttention))
    # Record all hyperparameters (locals() captures the arguments above).
    self.configs.update(locals())
    trainPremiseIdxMat, trainHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
        self.trainData, self.trainDataStats)
    trainGoldLabel = convertLabelsToMat(self.trainData)
    valPremiseIdxMat, valHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
        self.valData, self.valDataStats)
    valGoldLabel = convertLabelsToMat(self.valData)
    # If you want to train on less than full dataset
    if numExamplesToTrain > 0:
        valPremiseIdxMat = valPremiseIdxMat[:, range(numExamplesToTrain), :]
        valHypothesisIdxMat = valHypothesisIdxMat[:, range(numExamplesToTrain
                                                           ), :]
        valGoldLabel = valGoldLabel[range(numExamplesToTrain)]
    #Whether zero-padded on left or right
    pad = "right"
    # Get full premise/hypothesis tensors
    # batchPremiseTensor, batchHypothesisTensor, batchLabels = \
    #     convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
    #                                self.numTimestepsHypothesis, "right", self.embeddingTable,
    #                                valGoldLabel, range(len(valGoldLabel)))
    #sharedValPremise = theano.shared(batchPremiseTensor)
    #sharedValHypothesis = theano.shared(batchHypothesisTensor)
    #sharedValLabels = theano.shared(batchLabels)
    inputPremise = T.ftensor3(name="inputPremise")
    inputHypothesis = T.ftensor3(name="inputHypothesis")
    yTarget = T.fmatrix(name="yTarget")
    learnRate = T.scalar(name="learnRate", dtype='float32')
    # Compile gradient/update/cost functions for both sub-networks.
    fGradSharedHypothesis, fGradSharedPremise, fUpdatePremise, \
        fUpdateHypothesis, costFn, _, _ = self.trainFunc(
            inputPremise, inputHypothesis, yTarget, learnRate, gradMax,
            L2regularization, dropoutRate, sentenceAttention,
            wordwiseAttention, batchSize)
    totalExamples = 0
    stats = Stats(self.logger, expName)
    # Training
    self.logger.Log("Model configs: {0}".format(self.configs))
    self.logger.Log(
        "Starting training with {0} epochs, {1} batchSize,"
        " {2} learning rate, {3} L2regularization coefficient, and {4} dropout rate"
        .format(numEpochs, batchSize, learnRateVal, L2regularization,
                dropoutRate))
    predictFunc = self.predictFunc(inputPremise, inputHypothesis, dropoutRate)
    for epoch in xrange(numEpochs):  # NOTE(review): xrange is Python 2 only.
        self.logger.Log("Epoch number: %d" % (epoch))
        if numExamplesToTrain > 0:
            minibatches = getMinibatchesIdx(numExamplesToTrain, batchSize)
        else:
            minibatches = getMinibatchesIdx(len(trainGoldLabel), batchSize)
        numExamples = 0
        for _, minibatch in minibatches:
            # Dropout active (1.0) during the forward/backward pass.
            self.dropoutMode.set_value(1.0)
            numExamples += len(minibatch)
            totalExamples += len(minibatch)
            self.logger.Log(
                "Processed {0} examples in current epoch".format(
                    str(numExamples)))
            batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                convertDataToTrainingBatch(valPremiseIdxMat,
                                           self.numTimestepsPremise,
                                           valHypothesisIdxMat,
                                           self.numTimestepsHypothesis, pad,
                                           self.embeddingTable, valGoldLabel,
                                           minibatch)
            # Accumulate shared gradients, then apply the updates.
            gradHypothesisOut = fGradSharedHypothesis(
                batchPremiseTensor, batchHypothesisTensor, batchLabels)
            gradPremiseOut = fGradSharedPremise(batchPremiseTensor,
                                                batchHypothesisTensor,
                                                batchLabels)
            fUpdatePremise(learnRateVal)
            fUpdateHypothesis(learnRateVal)
            predictLabels = self.predict(batchPremiseTensor,
                                         batchHypothesisTensor, predictFunc)
            #self.logger.Log("Labels in epoch {0}: {1}".format(epoch, str(predictLabels)))
            cost = costFn(batchPremiseTensor, batchHypothesisTensor,
                          batchLabels)
            stats.recordCost(totalExamples, cost)
            # Note: Big time sink happens here
            if totalExamples % (100) == 0:
                # TODO: Don't compute accuracy of dev set
                # Dropout off (0.0) for evaluation.
                self.dropoutMode.set_value(0.0)
                devAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                                   valHypothesisIdxMat,
                                                   valGoldLabel, predictFunc)
                stats.recordAcc(totalExamples, devAccuracy, "dev")
    stats.recordFinalTrainingTime(totalExamples)
    # Save model to disk
    self.logger.Log("Saving model...")
    self.extractParams()
    configString = "batch={0},epoch={1},learnRate={2},dimHidden={3},dimInput={4}".format(
        str(batchSize), str(numEpochs), str(learnRateVal),
        str(self.dimHidden), str(self.dimInput))
    self.saveModel(currDir + "/savedmodels/basicLSTM_" + configString +
                   ".npz")
    self.logger.Log("Model saved!")
    # Set dropout to 0. again for testing
    self.dropoutMode.set_value(0.0)
    #Train Accuracy
    # trainAccuracy = self.computeAccuracy(trainPremiseIdxMat,
    #                     trainHypothesisIdxMat, trainGoldLabel, predictFunc)
    # self.logger.Log("Final training accuracy: {0}".format(trainAccuracy))
    # Val Accuracy
    valAccuracy = self.computeAccuracy(valPremiseIdxMat, valHypothesisIdxMat,
                                       valGoldLabel, predictFunc)
    # TODO: change -1 for training acc to actual value when I enable train computation
    stats.recordFinalStats(totalExamples, -1, valAccuracy)
def split(self):
    """Time-based split of citeulike ratings into consecutive folds.

    Walks the rating timeline in windows of `self.split_duration` months,
    starting 2005-03-31: for each fold, everything strictly before d1 is
    training and [d1, d2) is test (restricted to known users, and — for the
    in-matrix variant — known papers). Writes per-fold rating files, the new
    mult file, and user/paper id mapping files, then plots split curves and
    saves split statistics.
    """
    # Get the mapping as a list of user_hash where the key is the corresponding index:
    userhash_userid_map_list = list(self.users_dict.items())
    userhash_userid_map_list.sort(key=lambda x: x[1])
    user_id_userhash_map_list = np.array(
        [i for (i, _) in userhash_userid_map_list])
    # Get the mapping as a list of doc_ids where the key is the corresponding index:
    docid_paperid_map_list = list(self.papers_dict.items())
    docid_paperid_map_list.sort(key=lambda x: x[1])
    paper_id_docid_map_list = np.array(
        [i for (i, _) in docid_paperid_map_list])
    # Get the ratings list integrated with time stamps:
    # NOTE(review): "integrate_raings_timestamp" looks like a typo for
    # "ratings" — it must match the method name defined elsewhere.
    ratings_list = self.integrate_raings_timestamp(self.users_dict,
                                                   self.papers_dict)
    fr = pd.DataFrame(data=ratings_list, columns=['user', 'paper', 'date'])
    print("Ratings: {}, users: {}, papers: {}.".format(
        len(fr), fr.user.nunique(), fr.paper.nunique()))
    # First split date:
    d1 = datetime.strptime('2005-03-31', "%Y-%m-%d").date()
    # Last date:
    last_date = fr.date.max()
    # NOTE(review): month count ignores d1's own month offset — verify the
    # intended number of folds for data not starting in January.
    ratings_period = (last_date.year - d1.year) * 12 + last_date.month
    # These lists are used for plotting:
    tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates = [], [], [], [], [], [], [], []
    folds_num = ratings_period // self.split_duration
    # For split stats:
    stats_header = ['{:4}'.format('Fold'), '{:20}'.format('#Usrs(Tot,R,S)'),'{:23}'.format('#Itms(Tot,R,S)'),'{:23}'.format('#Rtng(Tot,R,S)'),\
        '{:23}'.format('PRU(min/max/avg/std)'), '{:22}'.format('PSU(min/max/avg/std)'), '{:20}'.format('PRI(min/max/avg/std)'), '{:20}'.format('PSI(min/max/avg/std)')]
    self.stat_list.append(stats_header)
    stats = Stats()
    for fold in range(folds_num):
        d2 = d1 + relativedelta(months=self.split_duration)
        # Training ratings: everything strictly before the window start.
        f1 = fr[fr['date'] < d1]
        # Test ratings: window [d1, d2), users already seen in training;
        # in-matrix additionally requires papers already seen in training.
        if self.out_of_matrix:
            f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2)
                    & fr['user'].isin(f1['user'])]
        else:
            f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2)
                    & fr['user'].isin(f1['user'])
                    & (fr['paper'].isin(f1['paper']))]
        print("{}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"\
            .format(d1, d2, len(f1), f1.user.nunique(), f1.paper.nunique(),
                    len(f2), f2.user.nunique(), f2.paper.nunique(),
                    len(f2) / len(f1) * 100))
        # Generate data for the folds:
        train_l_users, train_l_users_age, train_l_items, test_l_users, test_l_items, useridx_user_id_map_list, paperidx_paper_id_map_list, n_users, n_papers = self.generate_fold(
            d1, f1, f2)
        stats.add_fold_statistics(fold + 1, train_l_users, test_l_users,
                                  train_l_items, test_l_items)
        """
        tru = [len(i) for i in train_l_users]
        tsu = [len(i) for i in test_l_users]
        tri = [len(i) for i in train_l_items]
        tsi = [len(i) for i in test_l_items]
        self.stat_list.append(['{:4}'.format(fold + 1), '{:5d} / {:5d} / {:4d}'.format(n_users, f1.user.nunique(), f2.user.nunique()), '{:6d} / {:6d} / {:5d}'.format(n_papers, f1.paper.nunique(), f2.paper.nunique()),\
            '{:6d} / {:6d} / {:5d}'.format(f1.shape[0]+ f2.shape[0], f1.shape[0], f2.shape[0]), \
            '{:1d} / {:4d} / {:4.1f} / {:5.1f}'.format(np.min(tru), np.max(tru), np.mean(tru), np.std(tru)),\
            '{:1d} / {:4d} / {:4.1f} / {:4.1f}'.format(np.min(tsu), np.max(tsu), np.mean(tsu), np.std(tsu)),\
            '{:1d} / {:3d} / {:4.1f} / {:3.1f}'.format(np.min(tri), np.max(tri), np.mean(tri), np.std(tri)),\
            '{:1d} / {:3d} / {:4.1f} / {:3.1f}'.format(np.min(tsi), np.max(tsi), np.mean(tsi), np.std(tsi))])
        """
        # Write to file:
        fold_folder = os.path.join(
            self.base_dir,
            'time-based_split_out-of-matrix'
            if self.out_of_matrix else 'time-based_split_in-matrix',
            'fold-{}'.format(fold + 1))
        if not os.path.exists(fold_folder):
            os.makedirs(fold_folder)
        write_ratings(train_l_users,
                      os.path.join(fold_folder, 'train-users.dat'))
        write_ratings(train_l_users_age,
                      os.path.join(fold_folder, 'train-users-ages.dat'))
        write_ratings(test_l_users,
                      os.path.join(fold_folder, 'test-users.dat'))
        write_ratings(train_l_items,
                      os.path.join(fold_folder, 'train-items.dat'))
        write_ratings(test_l_items,
                      os.path.join(fold_folder, 'test-items.dat'))
        print("Generating the new mult file...")
        self.generate_docs_terms(self.docs_vocabs,
                                 paperidx_paper_id_map_list, self.terms,
                                 fold_folder)
        # Write users and papers mappings to files:
        useridx_userhash = user_id_userhash_map_list[
            useridx_user_id_map_list]
        write_list_to_file(
            [(j, i) for (i, j) in enumerate(useridx_userhash)],
            os.path.join(fold_folder, 'citeulikeUserHash_userId_map.dat'),
            header=['citeulikeUserHash', 'user_id'])
        paperidx_docid = paper_id_docid_map_list[
            paperidx_paper_id_map_list]
        write_list_to_file([(j, i)
                            for (i, j) in enumerate(paperidx_docid)],
                           os.path.join(fold_folder,
                                        'citeulikeId_docId_map.dat'),
                           header=['citeulikeId', 'paper_id'])
        # For plotting:
        dates.append(d2)
        tr_rs.append(len(f1))
        tr_us.append(f1.user.nunique())
        tr_ps.append(f1.paper.nunique())
        ts_rs.append(len(f2))
        ts_us.append(f2.user.nunique())
        ts_ps.append(f2.paper.nunique())
        rat.append(len(f2) / len(f1) * 100)
        # Advance the window: next fold trains on everything up to d2.
        d1 = d2
    self.plot_split_lines(tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat,
                          dates)
    # Write split statistics to file:
    stats.save_stats_to_file(
        os.path.join(
            self.base_dir,
            'time-based_split_out-of-matrix'
            if self.out_of_matrix else 'time-based_split_in-matrix',
            'stats.txt'))
def out_of_matrix_split(self, folds_num=5):
    """
    Splits the rating matrix following the out-of-matrix method defined in
    CTR: item ids (not ratings) are partitioned into `folds_num` groups, so
    test items never appear in training. For each fold writes
    train/test(/validation)-fold_<k>-users.dat and -items.dat (line i holds a
    delimiter-separated list of ids rated by user/item i), plus the held-out
    item-id list, and saves split statistics to stats.txt.

    When self.generate_validation is set, the next fold's item group is used
    as validation.

    :param folds_num: the number of folds, default = 5
    :return: None
    """
    # 1- Split items ids in folds:
    items_ids = list(range(self.num_items))
    item_ids_folds = random_divide(items_ids, folds_num)
    # 2- Generate the training and test sets for each fold:
    stats = Stats(self.generate_validation)
    for test_fold in range(folds_num):
        # Get the test, validation and training items:
        items_test_ids = set(item_ids_folds[test_fold])
        items_validation_ids = set()
        if self.generate_validation:
            # Add items of the next fold as validation
            validation_fold = (test_fold + 1) % folds_num
            items_validation_ids = set(item_ids_folds[validation_fold])
        # Add the rest as training:
        items_train_ids = set(items_ids) - items_test_ids - items_validation_ids
        # Generate users ratings for training, test and validation:
        users_train = []
        users_test = []
        users_validation = []
        for user_ratings in self.users_ratings:
            tr_ratings = list(items_train_ids.intersection(user_ratings))
            if len(tr_ratings) == 0:
                # A user with no training items would break downstream CF.
                print("some users contains 0 training items, split again again!")
                raise Exception("Split_Error!")
            tes_ratings = list(items_test_ids.intersection(user_ratings))
            val_ratings = list(items_validation_ids.intersection(user_ratings))
            tr_ratings.sort()
            tes_ratings.sort()
            val_ratings.sort()
            users_train.append(tr_ratings)
            users_test.append(tes_ratings)
            users_validation.append(val_ratings)
        write_ratings(users_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "train-fold_{}-users.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(users_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "test-fold_{}-users.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(users_validation,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "validation-fold_{}-users.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        # Transpose user->items lists into item->users lists and persist.
        items_train = self.items_mat_from_users_ratings(users_train)
        write_ratings(items_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "train-fold_{}-items.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        items_test = self.items_mat_from_users_ratings(users_test)
        write_ratings(items_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "test-fold_{}-items.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        items_validation = self.items_mat_from_users_ratings(users_validation)
        write_ratings(items_validation,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "validation-fold_{}-items.dat".format(test_fold + 1)),
                      delimiter=self.delimiter)
        # Saving left out items ids:
        # NOTE(review): items_test is the item->users matrix, so this sorts
        # and writes its rows rather than the raw held-out id list
        # (items_test_ids) — confirm which was intended.
        items_test_lst = list(items_test)
        items_test_lst.sort()
        write_ratings(items_test_lst,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(test_fold + 1),
                                            "heldout-set-fold_{}-items.dat".format(test_fold + 1)),
                      delimiter=self.delimiter,
                      print_line_length=False)
        # Calculate statistics:
        if self.generate_validation:
            stats.add_fold_statistics(test_fold + 1, users_train, users_test,
                                      items_train, items_test,
                                      users_validation, items_validation)
        else:
            stats.add_fold_statistics(test_fold + 1, users_train, users_test,
                                      items_train, items_test)
        # calculate_split_stats(users_train, users_test, items_train, items_test, fold)
    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
def cf_split(self, folds_num=5):
    """
    Splits the rating matrix following the in-matrix method defined in CTR:
    each item's ratings (user ids) are distributed across the folds, so every
    item with enough ratings appears in every fold's training set and in one
    fold's test set. Writes per-fold train/test(/validation) users/items
    files (line i holds a delimiter-separated list of ids rated by user/item
    i) and saves split statistics to stats.txt.

    When self.generate_validation is set, the fold after the test fold
    provides the validation ratings.

    :param folds_num: the number of folds, default 5
    :return: None
    """
    items_mat = self.items_mat_from_users_ratings(self.users_ratings)
    # train/test/validation[fold][item] accumulate user ids per item.
    train = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
    test = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
    validation = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
    print("Number of items: {}".format(self.num_items))
    folds_list = list(range(folds_num))
    print("Splitting items ratings, progress:")
    # 1- Split items ratings into the folds. This guarantees that all items
    # appear at least once in the test set.
    # If generating validation set is required:
    if self.generate_validation:
        for item in range(self.num_items):
            # Reporting progress:
            if item % 5000 == 0:
                print("doc_{}".format(item))
            user_ids = np.array(items_mat[item])
            n = len(user_ids)
            # If the number of ratings associated to this item are greater
            # than the number of folds then, this item's ratings can
            # participate in both the training and in the test sets.
            if n >= folds_num:
                idx = list(range(n))
                user_ids_folds = random_divide(idx, folds_num)
                for test_fold in folds_list:
                    # Add users of the current fold as test
                    test_idx = user_ids_folds[test_fold]
                    # Add users of the next fold as validation
                    validation_fold = (test_fold + 1) % folds_num
                    validation_idx = user_ids_folds[validation_fold]
                    # Add the rest as training:
                    train_idx = []
                    for i in folds_list:
                        if i != test_fold and i != validation_fold:
                            train_idx.extend(user_ids_folds[i])
                    train[test_fold][item].extend(user_ids[train_idx].tolist())
                    test[test_fold][item].extend(user_ids[test_idx].tolist())
                    validation[test_fold][item].extend(user_ids[validation_idx].tolist())
            # If the number of ratings associated to this item are less than
            # the number of folds then, this item's ratings can appear in the
            # training set only.
            else:
                for fold in folds_list:
                    train[fold][item].extend(user_ids.tolist())
                    test[fold][item].extend([])
                    validation[fold][item].extend([])
    # If generating validation set is not required, generate Test and
    # Training sets only:
    else:
        for item in range(self.num_items):
            if item % 5000 == 0:
                print("doc_{}".format(item))
            user_ids = np.array(items_mat[item])
            n = len(user_ids)
            if n >= folds_num:
                idx = list(range(n))
                user_ids_folds = random_divide(idx, folds_num)
                for test_fold in folds_list:
                    # Add users of the current fold as test
                    test_idx = user_ids_folds[test_fold]
                    # Add the rest as training:
                    train_idx = [id for id in idx if id not in test_idx]
                    train[test_fold][item].extend(user_ids[train_idx].tolist())
                    test[test_fold][item].extend(user_ids[test_idx].tolist())
            else:
                for fold in folds_list:
                    train[fold][item].extend(user_ids.tolist())
                    test[fold][item].extend([])
    # 2- Generate the user ratings from the splits generated on step 1.
    stats = Stats(self.generate_validation)
    for fold in folds_list:
        items_train = train[fold]
        # Transpose item->users lists back into user->items lists.
        users_train = self.users_mat_from_items(items_train)
        for u_id, u in enumerate(users_train):
            if len(u) == 0:
                # A user with no training ratings would break downstream CF.
                print("User {} contains 0 training items, split again!".format(u_id))
                raise Exception("Split_Error!")
        write_ratings(users_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "train-fold_{}-users.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(items_train,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "train-fold_{}-items.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        items_test = test[fold]
        users_test = self.users_mat_from_items(items_test)
        write_ratings(users_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "test-fold_{}-users.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        write_ratings(items_test,
                      filename=os.path.join(self.out_folder,
                                            "fold-{}".format(fold + 1),
                                            "test-fold_{}-items.dat".format(fold + 1)),
                      delimiter=self.delimiter)
        if self.generate_validation:
            items_validation = validation[fold]
            users_validation = self.users_mat_from_items(items_validation)
            # Storing the fold validation items for all users
            write_ratings(users_validation,
                          filename=os.path.join(self.out_folder,
                                                "fold-{}".format(fold + 1),
                                                "validation-fold_{}-users.dat".format(fold + 1)),
                          delimiter=self.delimiter)
            write_ratings(items_validation,
                          filename=os.path.join(self.out_folder,
                                                "fold-{}".format(fold + 1),
                                                "validation-fold_{}-items.dat".format(fold + 1)),
                          delimiter=self.delimiter)
        # Calculate statistics:
        if self.generate_validation:
            stats.add_fold_statistics(fold + 1, users_train, users_test,
                                      items_train, items_test,
                                      users_validation, items_validation)
        else:
            stats.add_fold_statistics(fold + 1, users_train, users_test,
                                      items_train, items_test)
        #calculate_split_stats(users_train, users_test, items_train, items_test, fold)
    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
from dataset.data_processing import DataProcessing
from dataset.dataset import Dataset
from util.visualize_dataset import VisualizeDataset
from util.stats import Stats
import tensorflow as tf
from train.darknet.darknet import TDarknet
from train.resnet34.resnet34 import TResNet34
from train.resnet50.resnet50 import TResNet50
from train.inception_v4.inception_v4 import TInception_v4
from test.test_model import TestModel
import numpy as np

print(tf.__version__)

vs = VisualizeDataset()
stats = Stats()
# Bug fix: `train = Train()` referenced a name `Train` that is never imported
# or defined in this module and raised NameError at import time. Removed —
# the concrete trainers instantiated below are the ones used.
td = TDarknet()
tr34 = TResNet34()
tr50 = TResNet50()
ti = TInception_v4()
dp = DataProcessing()
# dp.process_and_save_data()
ds = Dataset()
# ds.save_trainset_as_npy()
# images, labels = ds.load_testset()
# vs.show_images(images, labels, cols=4, rows=2)