示例#1
0
 def __init__(self, log_dir=None):
     """Build the train/validation loaders, stats tracking and the EA.

     The dataset is split 90/10 into train and validation subsets; both
     loaders share identical batching settings.

     Args:
         log_dir: directory handed to Stats for logging (optional).
     """
     dataset = self.create_dataset()
     n_train = int(0.9 * len(dataset))
     train_subset, val_subset = torch.utils.data.random_split(
         dataset, [n_train, len(dataset) - n_train])
     logger.info("train size: %d, validation size: %d" %
                 (len(train_subset), len(val_subset)))
     # Both loaders use the same batching configuration.
     loader_kwargs = dict(
         batch_size=config.gan.batch_size,
         num_workers=config.gan.data_loader_workers,
         drop_last=True,
         shuffle=True)
     self.train_loader = torch.utils.data.DataLoader(
         train_subset, **loader_kwargs)
     self.validation_loader = torch.utils.data.DataLoader(
         val_subset, **loader_kwargs)
     # Shape of a single sample (batch dimension stripped).
     self.input_shape = next(iter(self.train_loader))[0].size()[1:]
     self.stats = Stats(log_dir=log_dir,
                        input_shape=self.input_shape,
                        train_loader=self.train_loader,
                        validation_loader=self.validation_loader)
     evaluator = Evaluator(self.train_loader, self.validation_loader)
     # Select the evolutionary-algorithm implementation by config name.
     algorithms = {"NEAT": NEAT, "NSGA2": NSGA2}
     self.evolutionary_algorithm = algorithms[
         config.evolution.algorithm](evaluator)
示例#2
0
    def __init__(self, config):
        """Set up the primary azurlane-auto instance.

        Stores the passed-in Config, creates the Stats tracker, registers
        a module for every feature enabled in the config, and resets the
        scheduled combat timer.

        Args:
            config (Config): azurlane-auto Config instance
        """
        self.config = config
        self.oil_limit = 0
        self.stats = Stats(config)
        # NOTE(review): self.modules is never created here — presumably it
        # exists as a class attribute; confirm before relying on it.
        if config.updates['enabled']:
            self.modules['updates'] = UpdateUtil(config)
        if config.combat['enabled']:
            self.modules['combat'] = CombatModule(config, self.stats)
            # Combat is the only module that imposes an oil budget.
            self.oil_limit = config.combat['oil_limit']
        if config.commissions['enabled']:
            self.modules['commissions'] = CommissionModule(config, self.stats)
        if config.enhancement['enabled']:
            self.modules['enhancement'] = EnhancementModule(config, self.stats)
        if config.missions['enabled']:
            self.modules['missions'] = MissionModule(config, self.stats)
        if config.retirement['enabled']:
            self.modules['retirement'] = RetirementModule(config, self.stats)
        # Dorm and academy are both served by the headquarters module.
        if config.dorm['enabled'] or config.academy['enabled']:
            self.modules['headquarters'] = HeadquartersModule(config, self.stats)
        if config.events['enabled']:
            self.modules['event'] = EventModule(config, self.stats)
        self.print_stats_check = True
        self.next_combat = datetime.now()
示例#3
0
    def __init__(self):
        """Build disjoint train/validation data loaders and the stats tracker.

        The dataset indices are shuffled once and split 90/10; each loader
        then samples only from its own index subset.
        """
        self.stats = Stats()
        self.train_dataset = self.create_dataset()

        train_indexes, validation_indexes = np.split(
            np.random.permutation(np.arange(len(self.train_dataset))),
            [int(0.9 * len(self.train_dataset))])
        logger.info("train size: %d, validation size: %d" %
                    (len(train_indexes), len(validation_indexes)))
        # Fix: the previous SequentialSampler iterated the *entire* dataset,
        # so validation examples leaked into the training loader. Restrict
        # training to its own (randomly ordered) index subset.
        train_sampler = torch.utils.data.sampler.SubsetRandomSampler(
            train_indexes)
        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=config.gan.batch_size,
            sampler=train_sampler,
            num_workers=0)
        validation_sampler = torch.utils.data.sampler.SubsetRandomSampler(
            validation_indexes)
        self.validation_loader = torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=config.gan.batch_size,
            sampler=validation_sampler)

        # Shape of one sample (without the batch dimension).
        self.input_shape = next(iter(self.train_loader))[0].size()[1:]
示例#4
0
    def user_based_split(self, folds_num=5):
        """
        Splits the rating matrix following the user-based method, the result after invoking this method is:
        two files for each fold (cf-train-fold_id-users.dat and cf-train-fold_id-items.dat), both files have the same format, as following:
        line i has delimiter-separated list of item ids rated by user i
        :param folds_num: the number of folds, default 5
        :return: None
        """
        # train[fold][user] / test[fold][user] collect the item ids assigned
        # to that user's training/test portion for the given fold.
        train = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]
        test = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]
        for user in range(self.num_users):
            if user % 1000 == 0:
                print("user_{}".format(user))
            items_ids = np.array(self.users_ratings[user])
            n = len(items_ids)
            if n >= folds_num:
                # Enough ratings: each fold holds out a disjoint share as test.
                idx = list(range(n))
                item_ids_folds = random_divide(idx, folds_num)
                for fold in range(folds_num):
                    test_idx = item_ids_folds[fold]
                    # Set membership is O(1); the previous list scan made this
                    # loop quadratic (and the old name shadowed builtin `id`).
                    held_out = set(test_idx)
                    train_idx = [i for i in idx if i not in held_out]
                    train[fold][user].extend(items_ids[train_idx].tolist())
                    test[fold][user].extend(items_ids[test_idx].tolist())
            else:
                # Too few ratings to split: keep everything in training; the
                # user's test list for every fold stays empty.
                for fold in range(folds_num):
                    train[fold][user].extend(items_ids.tolist())

        stats = Stats(self.generate_validation)
        for fold in range(folds_num):
            users_train = train[fold]
            items_train = self.items_mat_from_users_ratings(users_train)
            # A user with no training items would break downstream training.
            for u in users_train:
                if len(u) == 0:
                    print("some users contains 0 training items, split again again!")
                    raise Exception("Split_Error!")
            write_ratings(users_train, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "train-fold_{}-users.dat".format(fold + 1)), delimiter=self.delimiter)
            write_ratings(items_train, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "train-fold_{}-items.dat".format(fold + 1)), delimiter=self.delimiter)

            users_test = test[fold]
            items_test = self.items_mat_from_users_ratings(users_test)

            # Storing the fold test items for all users
            write_ratings(users_test, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "test-fold_{}-users.dat".format(fold + 1)), delimiter=self.delimiter)
            write_ratings(items_test, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "test-fold_{}-items.dat".format(fold + 1)), delimiter=self.delimiter)

            # Calculate statistics:
            # TODO: Calculate Validation sets:
            users_validation = []
            items_validation = []
            if self.generate_validation:
                stats.add_fold_statistics(fold + 1, users_train, users_test, items_train, items_test, users_validation, items_validation)
            else:
                stats.add_fold_statistics(fold + 1, users_train, users_test, items_train, items_test)

        # Write split statistics:
        stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
示例#5
0
    def __init__(self, config):
        """Initialize the azurlane-auto instance.

        Keeps a reference to the passed-in Config, builds the Stats
        tracker, and registers each module whose feature is enabled in
        the config.

        Args:
            config (Config): azurlane-auto Config instance
        """
        self.config = config
        self.stats = Stats(config)
        # NOTE(review): assumes self.modules already exists (e.g. as a
        # class attribute) -- it is not created here.
        module_classes = (
            ('commissions', CommissionModule),
            ('combat', CombatModule),
            ('missions', MissionModule),
        )
        for name, module_cls in module_classes:
            if getattr(self.config, name)['enabled']:
                self.modules[name] = module_cls(self.config, self.stats)
        self.print_stats_check = True
示例#6
0
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model: a sum-of-embeddings
    premise/hypothesis classifier built with Lasagne/Theano, trained with
    RMSProp, logging cost and accuracy through Stats.
    :param exp_name: experiment name passed to Stats for logging
    :param embed_data: source for the pretrained EmbeddingTable
    :param train_data: training examples (premise/hypothesis pairs)
    :param train_data_stats: precomputed stats for the training data
    :param val_data: validation examples
    :param val_data_stats: precomputed stats for the validation data
    :param test_data: test examples (unused in this function)
    :param test_data_stats: stats for the test data (unused in this function)
    :param log_path: log output path (unused in this function)
    :param batch_size: minibatch size
    :param num_epochs: number of training epochs
    :param unroll_steps: sequence length each sentence is padded/truncated to
    :param learn_rate: RMSProp learning rate
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization ("l1" or "l2")
    :param reg_coeff: Regularization coeff to use for each layer of network; may
                      want to support different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    # NOTE(review): hard-coded truncation -- only the first 30 validation
    # examples are kept below; presumably a temporary overfitting check.
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    # Convert raw examples into index matrices of length `unroll_steps`.
    train_prem, train_hyp = generate_data(train_data,
                                          train_data_stats,
                                          "left",
                                          "right",
                                          table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data,
                                      val_data_stats,
                                      "left",
                                      "right",
                                      table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test for overfitting capabilities of model
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem,
                                  input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp,
                                 input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    # Each sentence is represented by the sum of its word embeddings.
    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Add 'num_dense' dense layers with tanh
    # top layer is softmax
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in,
                    num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in,
                                  num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        # num_dense <= 1: single softmax layer straight from the concat.
        l_output = DenseLayer(l_in,
                              num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    # Debug helper exposing the raw network output; not used below.
    f_dense_output = theano.function([x_p, x_h],
                                     network_output,
                                     on_unused_input='warn')

    # Compute cost
    # NOTE(review): if penalty is neither "l2" nor "l1", p_metric is never
    # bound and the regularization call below raises NameError.
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    # NOTE(review): reg_coeff is applied twice -- as the per-layer weight in
    # layer_dict and again as the outer multiplier -- confirm this is intended.
    reg_cost = reg_coeff * regularize_layer_params_weighted(
        layer_dict, p_metric)
    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output,
                                        target_values).mean()) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    # Hard label prediction; compiled but not used in the loop below.
    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    # Accuracy is logged every acc_num minibatches (see modulus check below).
    acc_num = 10

    # NOTE(review): the minibatch index partition is computed once and
    # reused for every epoch, so batches are not reshuffled between epochs.
    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                # (over the FULL train set -- this is the expensive step).
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        # Allow manual early stopping without losing the recorded stats.
        pass
示例#7
0
    def train(self,
              numEpochs=1,
              batchSize=5,
              learnRateVal=0.1,
              numExamplesToTrain=-1,
              gradMax=3.,
              L2regularization=0.0,
              dropoutRate=0.0,
              sentenceAttention=False,
              wordwiseAttention=False):
        """
        Takes care of training model, including propagation of errors and updating of
        parameters.

        :param numEpochs: number of passes over the training minibatches
        :param batchSize: minibatch size
        :param learnRateVal: learning-rate value fed to the update functions
        :param numExamplesToTrain: if > 0, restrict training/evaluation to this many examples
        :param gradMax: gradient-clipping threshold passed to trainFunc
        :param L2regularization: L2 regularization coefficient
        :param dropoutRate: dropout rate; dropout mode is switched off for evaluation
        :param sentenceAttention: whether to use sentence-level attention
        :param wordwiseAttention: whether to use word-wise attention
        """
        expName = "Epochs_{0}_LRate_{1}_L2Reg_{2}_dropout_{3}_sentAttn_{4}_" \
                       "wordAttn_{5}".format(str(numEpochs), str(learnRateVal),
                                             str(L2regularization), str(dropoutRate),
                                             str(sentenceAttention), str(wordwiseAttention))
        self.configs.update(locals())
        # Convert raw data into padded index matrices plus gold-label matrices.
        trainPremiseIdxMat, trainHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
            self.trainData, self.trainDataStats)
        trainGoldLabel = convertLabelsToMat(self.trainData)

        valPremiseIdxMat, valHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
            self.valData, self.valDataStats)
        valGoldLabel = convertLabelsToMat(self.valData)

        # If you want to train on less than full dataset
        if numExamplesToTrain > 0:
            valPremiseIdxMat = valPremiseIdxMat[:,
                                                range(numExamplesToTrain), :]
            valHypothesisIdxMat = valHypothesisIdxMat[:,
                                                      range(numExamplesToTrain
                                                            ), :]
            valGoldLabel = valGoldLabel[range(numExamplesToTrain)]

        # Whether zero-padded on left or right
        pad = "right"

        # Symbolic inputs for the compiled Theano functions.
        inputPremise = T.ftensor3(name="inputPremise")
        inputHypothesis = T.ftensor3(name="inputHypothesis")
        yTarget = T.fmatrix(name="yTarget")
        learnRate = T.scalar(name="learnRate", dtype='float32')

        fGradSharedHypothesis, fGradSharedPremise, fUpdatePremise, \
            fUpdateHypothesis, costFn, _, _ = self.trainFunc(inputPremise,
                                            inputHypothesis, yTarget, learnRate, gradMax,
                                            L2regularization, dropoutRate, sentenceAttention,
                                            wordwiseAttention, batchSize)

        totalExamples = 0
        stats = Stats(self.logger, expName)

        # Training
        self.logger.Log("Model configs: {0}".format(self.configs))
        self.logger.Log(
            "Starting training with {0} epochs, {1} batchSize,"
            " {2} learning rate, {3} L2regularization coefficient, and {4} dropout rate"
            .format(numEpochs, batchSize, learnRateVal, L2regularization,
                    dropoutRate))

        predictFunc = self.predictFunc(inputPremise, inputHypothesis,
                                       dropoutRate)

        for epoch in xrange(numEpochs):
            self.logger.Log("Epoch number: %d" % (epoch))

            if numExamplesToTrain > 0:
                minibatches = getMinibatchesIdx(numExamplesToTrain, batchSize)
            else:
                minibatches = getMinibatchesIdx(len(trainGoldLabel), batchSize)

            numExamples = 0
            for _, minibatch in minibatches:
                # Dropout on while training.
                self.dropoutMode.set_value(1.0)
                numExamples += len(minibatch)
                totalExamples += len(minibatch)

                self.logger.Log(
                    "Processed {0} examples in current epoch".format(
                        str(numExamples)))

                # Bug fix: minibatch indices are sized from the *training*
                # set (len(trainGoldLabel) above), but batches were being
                # drawn from the validation matrices -- which both trained
                # the model on validation data and could index out of range.
                # Draw batches from the training matrices instead.
                batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                    convertDataToTrainingBatch(trainPremiseIdxMat, self.numTimestepsPremise, trainHypothesisIdxMat,
                                               self.numTimestepsHypothesis, pad, self.embeddingTable,
                                               trainGoldLabel, minibatch)

                # Accumulate gradients into shared storage, then apply the
                # parameter updates at the given learning rate.
                gradHypothesisOut = fGradSharedHypothesis(
                    batchPremiseTensor, batchHypothesisTensor, batchLabels)
                gradPremiseOut = fGradSharedPremise(batchPremiseTensor,
                                                    batchHypothesisTensor,
                                                    batchLabels)
                fUpdatePremise(learnRateVal)
                fUpdateHypothesis(learnRateVal)

                predictLabels = self.predict(batchPremiseTensor,
                                             batchHypothesisTensor,
                                             predictFunc)

                cost = costFn(batchPremiseTensor, batchHypothesisTensor,
                              batchLabels)
                stats.recordCost(totalExamples, cost)

                # Note: Big time sink happens here
                if totalExamples % (100) == 0:
                    # Dropout off while evaluating dev accuracy.
                    self.dropoutMode.set_value(0.0)
                    devAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                                       valHypothesisIdxMat,
                                                       valGoldLabel,
                                                       predictFunc)
                    stats.recordAcc(totalExamples, devAccuracy, "dev")

        stats.recordFinalTrainingTime(totalExamples)

        # Save model to disk
        self.logger.Log("Saving model...")
        self.extractParams()
        configString = "batch={0},epoch={1},learnRate={2},dimHidden={3},dimInput={4}".format(
            str(batchSize), str(numEpochs), str(learnRateVal),
            str(self.dimHidden), str(self.dimInput))
        self.saveModel(currDir + "/savedmodels/basicLSTM_" + configString +
                       ".npz")
        self.logger.Log("Model saved!")

        # Set dropout to 0. again for testing
        self.dropoutMode.set_value(0.0)

        # Val Accuracy
        valAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                           valHypothesisIdxMat, valGoldLabel,
                                           predictFunc)
        # TODO: change -1 for training acc to actual value when I enable train computation
        stats.recordFinalStats(totalExamples, -1, valAccuracy)
示例#8
0
    def split(self):
        """Time-based split of the ratings into sequential train/test folds.

        Starting from a fixed first split date, each fold trains on all
        ratings strictly before d1 and tests on the following
        `split_duration` months [d1, d2). Per-fold ratings, user/paper id
        mappings, and the regenerated mult file are written under a fold
        folder; summary statistics and a plot are produced at the end.
        """
        # Get the mapping as a list of user_hash where the key is the corresponding index:
        userhash_userid_map_list = list(self.users_dict.items())
        userhash_userid_map_list.sort(key=lambda x: x[1])
        user_id_userhash_map_list = np.array(
            [i for (i, _) in userhash_userid_map_list])

        # Get the mapping as a list of doc_ids where the key is the corresponding index:
        docid_paperid_map_list = list(self.papers_dict.items())
        docid_paperid_map_list.sort(key=lambda x: x[1])
        paper_id_docid_map_list = np.array(
            [i for (i, _) in docid_paperid_map_list])

        # Get the ratings list integrated with time stamps:
        # NOTE(review): 'integrate_raings_timestamp' looks like a typo for
        # 'integrate_ratings_timestamp'; renaming would touch its definition
        # elsewhere in the project.
        ratings_list = self.integrate_raings_timestamp(self.users_dict,
                                                       self.papers_dict)

        fr = pd.DataFrame(data=ratings_list, columns=['user', 'paper', 'date'])
        print("Ratings: {}, users: {}, papers: {}.".format(
            len(fr), fr.user.nunique(), fr.paper.nunique()))

        # First split date:
        d1 = datetime.strptime('2005-03-31', "%Y-%m-%d").date()

        # Last date:
        last_date = fr.date.max()
        # Number of months covered from d1 to the last rating.
        ratings_period = (last_date.year - d1.year) * 12 + last_date.month

        # These lists are used for plotting:
        tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates = [], [], [], [], [], [], [], []

        folds_num = ratings_period // self.split_duration

        # For split stats:
        stats_header = ['{:4}'.format('Fold'), '{:20}'.format('#Usrs(Tot,R,S)'),'{:23}'.format('#Itms(Tot,R,S)'),'{:23}'.format('#Rtng(Tot,R,S)'),\
                        '{:23}'.format('PRU(min/max/avg/std)'), '{:22}'.format('PSU(min/max/avg/std)'), '{:20}'.format('PRI(min/max/avg/std)'), '{:20}'.format('PSI(min/max/avg/std)')]
        self.stat_list.append(stats_header)
        stats = Stats()
        for fold in range(folds_num):
            # Each fold's test window is [d1, d2), one split_duration long.
            d2 = d1 + relativedelta(months=self.split_duration)

            # Training ratings:
            f1 = fr[fr['date'] < d1]

            # Test ratings:
            # Test users must already appear in training; in-matrix mode also
            # requires the test papers to appear in training.
            if self.out_of_matrix:
                f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2)
                        & fr['user'].isin(f1['user'])]
            else:
                f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2)
                        & fr['user'].isin(f1['user']) &
                        (fr['paper'].isin(f1['paper']))]
            print("{}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"\
                  .format(d1, d2, len(f1), f1.user.nunique(), f1.paper.nunique(), len(f2), f2.user.nunique(), f2.paper.nunique(), len(f2) / len(f1) * 100))

            # Generate data for the folds:
            train_l_users, train_l_users_age, train_l_items, test_l_users, test_l_items, useridx_user_id_map_list, paperidx_paper_id_map_list, n_users, n_papers = self.generate_fold(
                d1, f1, f2)
            stats.add_fold_statistics(fold + 1, train_l_users, test_l_users,
                                      train_l_items, test_l_items)
            """
            tru = [len(i) for i in train_l_users]
            tsu = [len(i) for i in test_l_users]
            tri = [len(i) for i in train_l_items]
            tsi = [len(i) for i in test_l_items]
            self.stat_list.append(['{:4}'.format(fold + 1), '{:5d} / {:5d} / {:4d}'.format(n_users, f1.user.nunique(), f2.user.nunique()),
                                   '{:6d} / {:6d} / {:5d}'.format(n_papers, f1.paper.nunique(), f2.paper.nunique()),\
                                   '{:6d} / {:6d} / {:5d}'.format(f1.shape[0]+ f2.shape[0], f1.shape[0], f2.shape[0]), \
                                   '{:1d} / {:4d} / {:4.1f} / {:5.1f}'.format(np.min(tru), np.max(tru), np.mean(tru), np.std(tru)),\
                                   '{:1d} / {:4d} / {:4.1f} / {:4.1f}'.format(np.min(tsu), np.max(tsu), np.mean(tsu), np.std(tsu)),\
                                   '{:1d} / {:3d} / {:4.1f} / {:3.1f}'.format(np.min(tri), np.max(tri), np.mean(tri), np.std(tri)),\
                                   '{:1d} / {:3d} / {:4.1f} / {:3.1f}'.format(np.min(tsi), np.max(tsi), np.mean(tsi), np.std(tsi))])
            """

            # Write to file:
            fold_folder = os.path.join(
                self.base_dir, 'time-based_split_out-of-matrix'
                if self.out_of_matrix else 'time-based_split_in-matrix',
                'fold-{}'.format(fold + 1))
            if not os.path.exists(fold_folder):
                os.makedirs(fold_folder)

            write_ratings(train_l_users,
                          os.path.join(fold_folder, 'train-users.dat'))
            write_ratings(train_l_users_age,
                          os.path.join(fold_folder, 'train-users-ages.dat'))
            write_ratings(test_l_users,
                          os.path.join(fold_folder, 'test-users.dat'))
            write_ratings(train_l_items,
                          os.path.join(fold_folder, 'train-items.dat'))
            write_ratings(test_l_items,
                          os.path.join(fold_folder, 'test-items.dat'))

            print("Generating the new mult file...")
            self.generate_docs_terms(self.docs_vocabs,
                                     paperidx_paper_id_map_list, self.terms,
                                     fold_folder)

            # Write users and papers mappings to files:
            # Compose the two index maps: fold user index -> original user
            # id -> citeulike user hash.
            useridx_userhash = user_id_userhash_map_list[
                useridx_user_id_map_list]
            write_list_to_file(
                [(j, i) for (i, j) in enumerate(useridx_userhash)],
                os.path.join(fold_folder, 'citeulikeUserHash_userId_map.dat'),
                header=['citeulikeUserHash', 'user_id'])

            # Same composition for papers: fold paper index -> doc id.
            paperidx_docid = paper_id_docid_map_list[
                paperidx_paper_id_map_list]
            write_list_to_file([(j, i)
                                for (i, j) in enumerate(paperidx_docid)],
                               os.path.join(fold_folder,
                                            'citeulikeId_docId_map.dat'),
                               header=['citeulikeId', 'paper_id'])

            # For plotting:
            dates.append(d2)
            tr_rs.append(len(f1))
            tr_us.append(f1.user.nunique())
            tr_ps.append(f1.paper.nunique())
            ts_rs.append(len(f2))
            ts_us.append(f2.user.nunique())
            ts_ps.append(f2.paper.nunique())
            rat.append(len(f2) / len(f1) * 100)
            # Advance the split boundary to the next window.
            d1 = d2
        self.plot_split_lines(tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat,
                              dates)

        # Write split statistics to file:
        stats.save_stats_to_file(
            os.path.join(
                self.base_dir, 'time-based_split_out-of-matrix'
                if self.out_of_matrix else 'time-based_split_in-matrix',
                'stats.txt'))
示例#9
0
    def out_of_matrix_split(self, folds_num=5):
        """
        Splits the rating matrix following the out-of-matrix method defined in CTR, the result after invoking this method is:
        two files for each fold (out_of-train-fold_id-users.dat and out_of-train-fold_id-items.dat), both files have the same following format:
        line i has delimiter-separated list of item ids rated by user i
        :param folds_num: the number of folds, default = 5
        :return: None
        """
        # 1- Split items ids in folds:
        items_ids = list(range(self.num_items))
        item_ids_folds = random_divide(items_ids, folds_num)

        # 2- Generate the training and test sets for each fold:
        stats = Stats(self.generate_validation)
        for test_fold in range(folds_num):

            # Get the test, validation and training items:
            items_test_ids = set(item_ids_folds[test_fold])
            items_validation_ids = set()
            if self.generate_validation:
                # Add items of the next fold as validation
                validation_fold = (test_fold + 1) % folds_num
                items_validation_ids = set(item_ids_folds[validation_fold])
            # Add the rest as training:
            items_train_ids = set(items_ids) - items_test_ids - items_validation_ids

            # Generate users ratings for training, test and validation:
            # Project each user's ratings onto the three item partitions.
            users_train = []
            users_test = []
            users_validation = []

            for user_ratings in self.users_ratings:
                tr_ratings = list(items_train_ids.intersection(user_ratings))
                # A user with no training items would make the fold unusable.
                if len(tr_ratings) == 0:
                    print("some users contains 0 training items, split again again!")
                    raise Exception("Split_Error!")
                tes_ratings = list(items_test_ids.intersection(user_ratings))
                val_ratings = list(items_validation_ids.intersection(user_ratings))

                # Sort for deterministic, readable output files.
                tr_ratings.sort()
                tes_ratings.sort()
                val_ratings.sort()

                users_train.append(tr_ratings)
                users_test.append(tes_ratings)
                users_validation.append(val_ratings)

            write_ratings(users_train, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "train-fold_{}-users.dat".format(test_fold + 1)), delimiter=self.delimiter)
            write_ratings(users_test, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "test-fold_{}-users.dat".format(test_fold + 1)), delimiter=self.delimiter)
            write_ratings(users_validation, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "validation-fold_{}-users.dat".format(test_fold + 1)), delimiter=self.delimiter)

            # Write the item-major views (item id -> rating users) as well.
            items_train = self.items_mat_from_users_ratings(users_train)
            write_ratings(items_train, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "train-fold_{}-items.dat".format(test_fold + 1)), delimiter=self.delimiter)

            items_test = self.items_mat_from_users_ratings(users_test)
            write_ratings(items_test, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "test-fold_{}-items.dat".format(test_fold + 1)), delimiter=self.delimiter)

            items_validation = self.items_mat_from_users_ratings(users_validation)
            write_ratings(items_validation, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "validation-fold_{}-items.dat".format(test_fold + 1)), delimiter=self.delimiter)

            # Saving left out items ids:
            items_test_lst = list(items_test)
            items_test_lst.sort()
            write_ratings(items_test_lst, filename=os.path.join(self.out_folder, "fold-{}".format(test_fold + 1), "heldout-set-fold_{}-items.dat".format(test_fold + 1)), delimiter=self.delimiter, print_line_length=False)

            # Calculate statistics:
            if self.generate_validation:
                stats.add_fold_statistics(test_fold + 1, users_train, users_test, items_train, items_test, users_validation, items_validation)
            else:
                stats.add_fold_statistics(test_fold + 1, users_train, users_test, items_train, items_test)
            # calculate_split_stats(users_train, users_test, items_train, items_test, fold)

        # Write split statistics:
        stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
示例#10
0
    def cf_split(self, folds_num=5):
        """
        Splits the rating matrix following the in-matrix method defined in CTR.
        For each fold this writes train-fold_id-users.dat and
        test-fold_id-users.dat (plus the matching *-items.dat files, and
        validation-* files when self.generate_validation is True) under
        self.out_folder/fold-<id>/. In the *-users.dat files, line i holds a
        delimiter-separated list of item ids rated by user i.
        :param folds_num: the number of folds, default 5
        :return: None
        :raises Exception: "Split_Error!" if any user ends up with zero training items.
        """
        items_mat = self.items_mat_from_users_ratings(self.users_ratings)
        # train/test/validation[fold][item] -> list of user ids whose rating of
        # `item` belongs to that split in that fold.
        train = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
        test = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
        validation = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
        print("Number of items: {}".format(self.num_items))
        folds_list = list(range(folds_num))
        print("Splitting items ratings, progress:")

        # 1- Split each item's ratings across the folds. This guarantees that
        # every item with at least folds_num ratings appears in every test set.
        if self.generate_validation:
            for item in range(self.num_items):
                # Reporting progress:
                if item % 5000 == 0:
                    print("doc_{}".format(item))

                user_ids = np.array(items_mat[item])
                n = len(user_ids)

                # With at least folds_num ratings, this item's ratings can
                # participate in training, test and validation in every fold.
                if n >= folds_num:
                    idx = list(range(n))
                    user_ids_folds = random_divide(idx, folds_num)
                    for test_fold in folds_list:
                        # Users of the current fold form the test set
                        test_idx = user_ids_folds[test_fold]

                        # Users of the next fold (cyclically) form validation
                        validation_fold = (test_fold + 1) % folds_num
                        validation_idx = user_ids_folds[validation_fold]

                        # All remaining folds form the training set:
                        train_idx = []
                        for other_fold in folds_list:
                            if other_fold != test_fold and other_fold != validation_fold:
                                train_idx.extend(user_ids_folds[other_fold])

                        train[test_fold][item].extend(user_ids[train_idx].tolist())
                        test[test_fold][item].extend(user_ids[test_idx].tolist())
                        validation[test_fold][item].extend(user_ids[validation_idx].tolist())
                # With fewer ratings than folds, this item's ratings can appear
                # in the training set only.
                else:
                    for fold in folds_list:
                        train[fold][item].extend(user_ids.tolist())

        # If generating a validation set is not required, generate test and
        # training sets only:
        else:
            for item in range(self.num_items):
                # Reporting progress:
                if item % 5000 == 0:
                    print("doc_{}".format(item))
                user_ids = np.array(items_mat[item])
                n = len(user_ids)

                if n >= folds_num:
                    idx = list(range(n))
                    user_ids_folds = random_divide(idx, folds_num)
                    for test_fold in folds_list:
                        # Users of the current fold form the test set
                        test_idx = user_ids_folds[test_fold]

                        # The rest form the training set. Membership is tested
                        # against a set to keep this O(n) instead of O(n^2).
                        test_idx_set = set(test_idx)
                        train_idx = [i for i in idx if i not in test_idx_set]
                        train[test_fold][item].extend(user_ids[train_idx].tolist())
                        test[test_fold][item].extend(user_ids[test_idx].tolist())
                else:
                    for fold in folds_list:
                        train[fold][item].extend(user_ids.tolist())

        # 2- Generate the user ratings from the splits produced in step 1 and
        # write everything to disk, one directory per fold.
        stats = Stats(self.generate_validation)
        for fold in folds_list:
            items_train = train[fold]
            users_train = self.users_mat_from_items(items_train)

            # Every user must keep at least one training rating, otherwise the
            # split is unusable and must be regenerated.
            for u_id, u in enumerate(users_train):
                if len(u) == 0:
                    print("User {} contains 0 training items, split again!".format(u_id))
                    raise Exception("Split_Error!")
            write_ratings(users_train, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "train-fold_{}-users.dat".format(fold + 1)), delimiter=self.delimiter)
            write_ratings(items_train, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "train-fold_{}-items.dat".format(fold + 1)), delimiter=self.delimiter)

            items_test = test[fold]
            users_test = self.users_mat_from_items(items_test)
            write_ratings(users_test, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "test-fold_{}-users.dat".format(fold + 1)), delimiter=self.delimiter)
            write_ratings(items_test, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "test-fold_{}-items.dat".format(fold + 1)), delimiter=self.delimiter)

            if self.generate_validation:
                items_validation = validation[fold]
                users_validation = self.users_mat_from_items(items_validation)
                # Storing the fold validation items for all users
                write_ratings(users_validation, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "validation-fold_{}-users.dat".format(fold + 1)), delimiter=self.delimiter)
                write_ratings(items_validation, filename=os.path.join(self.out_folder, "fold-{}".format(fold + 1), "validation-fold_{}-items.dat".format(fold + 1)), delimiter=self.delimiter)

            # Calculate statistics:
            if self.generate_validation:
                stats.add_fold_statistics(fold + 1, users_train, users_test, items_train, items_test, users_validation, items_validation)
            else:
                stats.add_fold_statistics(fold + 1, users_train, users_test, items_train, items_test)

        # Write split statistics:
        stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
示例#11
0
from dataset.data_processing import DataProcessing
from dataset.dataset import Dataset
from util.visualize_dataset import VisualizeDataset
from util.stats import Stats
import tensorflow as tf
from train.darknet.darknet import TDarknet
from train.resnet34.resnet34 import TResNet34
from train.resnet50.resnet50 import TResNet50
from train.inception_v4.inception_v4 import TInception_v4
from test.test_model import TestModel
import numpy as np

print(tf.__version__)

# Instantiate the visualization and statistics helpers.
vs = VisualizeDataset()
stats = Stats()
# NOTE(review): `Train` is not imported anywhere in this module, so
# `train = Train()` raised a NameError at import time; the binding would also
# shadow the `train` package imported above. Disabled until the correct
# import is identified — TODO confirm the intended class and restore.
# train = Train()

# One trainer per supported architecture.
td = TDarknet()
tr34 = TResNet34()
tr50 = TResNet50()
ti = TInception_v4()

dp = DataProcessing()
# dp.process_and_save_data()

ds = Dataset()
# ds.save_trainset_as_npy()

# images, labels = ds.load_testset()
# vs.show_images(images, labels, cols=4, rows=2)