예제 #1
0
    def test_(self, testset, epoch=0):
        assert (testset.num_features == self.num_features)
        auc_ = 0.5
        if self.evaluate_op:
            teX, teY = testset.full_batch()
            predicts, auc_, cost_ = self.session.run(
                [self.predict_op, self.evaluate_op, self.cost],
                feed_dict=melt.gen_feed_dict(self.trainer,
                                             self.algo,
                                             teX,
                                             teY,
                                             test_mode=True))
        else:
            # teX, teY = testset.full_batch()
            # predicts, cost_ = self.session.run([self.predict_op, self.cost], feed_dict = melt.gen_feed_dict(self.trainer, self.algo, teX, teY, test_mode = True))
            num_test_instances = testset.num_instances()
            predicts = []
            cost_ = 0
            while True:
                teX, teY = testset.next_batch(batch_size)
                if teX is None:
                    break
                predicts_, now_cost = self.session.run(
                    [self.predict_op, self.cost],
                    feed_dict=melt.gen_feed_dict(self.trainer,
                                                 self.algo,
                                                 teX,
                                                 teY,
                                                 test_mode=True))
                predicts.extend(predicts_)
                cost_ += now_cost
            teY = testset.labels

            predicts = np.array(predicts)

            #print np.array(zip(teY, predicts))
            #print len(teY), len(predicts)
            auc_ = auc(teY, predicts)

        print epoch, ' auc:', auc_, 'cost:', cost_ / len(teY)

        need_stop = False
        if FLAGS.auto_stop and (auc_ - self.auc) < FLAGS.min_improve:
            need_stop = True

        self.auc = auc_

        return need_stop
예제 #2
0
    def test_(self, testset, epoch=0):
        assert (testset.num_features == self.num_features)
        auc_ = 0.0
        if self.evaluate_op:
            teX, teY = testset.full_batch()
            predicts, auc_, cost_ = self.session.run(
                [self.predict_op, self.evaluate_op, self.cost],
                feed_dict=melt.gen_feed_dict(self.trainer,
                                             self.algo,
                                             teX,
                                             teY,
                                             test_mode=True))
        else:
            # teX, teY = testset.full_batch()
            # predicts, cost_ = self.session.run([self.predict_op, self.cost], feed_dict = melt.gen_feed_dict(self.trainer, self.algo, teX, teY, test_mode = True))
            num_test_instances = testset.num_instances()
            predicts = []
            cost_ = 0
            while True:
                teX, teY = testset.next_batch(batch_size)
                if teX is None:
                    break
                predicts_, now_cost = self.session.run(
                    [self.predict_op, self.cost],
                    feed_dict=melt.gen_feed_dict(self.trainer,
                                                 self.algo,
                                                 teX,
                                                 teY,
                                                 test_mode=True))
                predicts.extend(predicts_)
                cost_ += now_cost
            teY = testset.labels

            predicts = np.array(predicts)

            #print teY
            #print predicts
            print np.array(zip(teY, predicts))
            #print len(teY), len(predicts)
            auc_ = auc(teY, predicts)
            #auc_ = 0.5
        #predicts, auc_, cost_, weight = self.session.run([predict_op, evaluate_op, cost, self.weight], feed_dict = trainer.gen_feed_dict(teX, teY))
        print epoch, ' auc:', auc_, 'cost:', cost_ / len(teY)
예제 #3
0
 def calibrate_(self, dataset):
     """Feed model predictions for every instance of ``dataset`` to the calibrator.

     The dataset is walked in slices of ``batch_size`` via
     ``dataset.mini_batch(start, end)``; each slice is scored with
     ``self.predict_op`` and passed, together with its labels, to
     ``self.calibrate``. ``self.calibrator.FinishTraining()`` is called once
     all slices have been processed.

     Args:
         dataset: dataset exposing ``num_instances()`` and
             ``mini_batch(start, end)``.
     """
     num_instances = dataset.num_instances()
     # Iterate over batch starts and clamp the end to the dataset size so the
     # final (possibly shorter) batch is included. Pairing two ranges with
     # zip() silently dropped the last batch.
     for start in range(0, num_instances, batch_size):
         end = min(start + batch_size, num_instances)
         trX, trY = dataset.mini_batch(start, end)
         predicts = self.session.run(self.predict_op,
                                     feed_dict=melt.gen_feed_dict(
                                         self.trainer, self.algo, trX, trY))
         self.calibrate(trY, predicts)
     self.calibrator.FinishTraining()
예제 #4
0
 def predict(self, feature_vecs):
     """Run ``self.predict_op`` on one feature vector or a list of them.

     Args:
         feature_vecs: a list of feature vectors, or a single vector; anything
             whose type is not exactly ``list`` is wrapped in a
             one-element list.

     Returns:
         The result of ``session.run`` for the fetch list
         ``[self.predict_op]``.
     """
     batch = [feature_vecs] if type(feature_vecs) != list else feature_vecs
     sparse_features = melt.sparse_vectors2sparse_features(batch)
     feed = melt.gen_feed_dict(self.trainer,
                               self.algo,
                               sparse_features,
                               test_mode=True)
     return self.session.run([self.predict_op], feed_dict=feed)
예제 #5
0
 def train_(self, trainset):
     """Run ``self.train_op`` once for every batch of ``trainset``.

     Batches are pulled with ``trainset.next_batch(batch_size)`` until the
     returned features are ``None``.

     Args:
         trainset: dataset exposing ``num_instances()`` and ``next_batch()``.
     """
     # Result is not used below; the call is kept so the sequence of dataset
     # calls stays exactly as before.
     trainset.num_instances()
     features, labels = trainset.next_batch(batch_size)
     while features is not None:
         feed = melt.gen_feed_dict(self.trainer, self.algo, features, labels)
         self.session.run(self.train_op, feed_dict=feed)
         features, labels = trainset.next_batch(batch_size)
예제 #6
0
    def predict(self, feature_vecs):
        """Run ``self.predict_op`` on one feature vector or a list of them.

        The input is converted with ``melt.sparse_vectors2sparse_features``
        when ``self.trainer.type`` is ``'sparse'`` and with
        ``melt.dense_vectors2features`` otherwise.

        Args:
            feature_vecs: a list of feature vectors, or a single vector;
                anything whose type is not exactly ``list`` is wrapped in a
                one-element list.

        Returns:
            The result of ``session.run`` for the fetch list
            ``[self.predict_op]``.
        """
        batch = [feature_vecs] if type(feature_vecs) != list else feature_vecs

        if self.trainer.type != 'sparse':
            # dense input
            inputs = melt.dense_vectors2features(batch,
                                                 self.trainer.index_only)
        else:
            inputs = melt.sparse_vectors2sparse_features(batch)

        feed = melt.gen_feed_dict(self.trainer,
                                  self.algo,
                                  inputs,
                                  test_mode=True)
        return self.session.run([self.predict_op], feed_dict=feed)
예제 #7
0
 def train_(self, trainset, testset=None, epoch=0):
     num_train_instances = trainset.num_instances()
     round = 0
     start_time = time.time()
     while True:
         trX, trY = trainset.next_batch(batch_size)
         if trX is None:
             break
         feed_dict = melt.gen_feed_dict(self.trainer, self.algo, trX, trY)
         _, cost_, accuracy_ = self.session.run(
             [self.train_op, self.cost, self.accuracy], feed_dict=feed_dict)
         #py_x = self.session.run(self.py_x, feed_dict = feed_dict)
         #print py_x
         if round % 100 == 0:
             end_time = time.time()
             duration = end_time - start_time
             start_time = end_time
             print 'epoch:', epoch, 'round:', round, 'train precision@1:', accuracy_, 'cost:', cost_, 'duration:', duration
         if round % 1000 == 0:
             self.test_(testset, epoch, round)
         round += 1
예제 #8
0
    def test_(self, testset, epoch=0, round=0):
        #assert(testset.num_features == self.num_features)
        #num_test_instances = testset.num_instances()
        predicts = []
        cost_ = 0.
        accuracy_ = 0.
        ground = round
        round = 0
        while True:
            teX, teY = testset.next_batch(batch_size * 10)
            if teX is None:
                break
            feed_dict = melt.gen_feed_dict(self.trainer,
                                           self.algo,
                                           teX,
                                           teY,
                                           test_mode=True)
            now_cost, now_accuracy = self.session.run(
                [self.cost, self.accuracy], feed_dict=feed_dict)
            cost_ += now_cost
            accuracy_ += now_accuracy
            round += 1
        cost_ / round
        accuracy_ /= round

        print 'epoch:', epoch, 'round:', ground, 'test  precision@1:', accuracy_, 'cost:', cost_

        need_stop = False
        #if FLAGS.auto_stop and (accuracy_ - self.avg_accuracy) < FLAGS.min_improve:
        #    need_stop = True

        self.avg_accuracy = accuracy_

        # if FLAGS.use_summary:
        #     teX, teY = testset.full_batch()
        #     summary_str = self.session.run(merged_summary_op, feed_dict = melt.gen_feed_dict(self.trainer, self.algo, teX, teY, test_mode = True))
        #     summary_writer.add_summary(summary_str, epoch)

        return need_stop
예제 #9
0
    def train(self, trainset_file, testset_file, method, num_epochs,
              learning_rate, model_path):
        print 'batch_size:', batch_size, ' learning_rate:', learning_rate, ' num_epochs:', num_epochs
        print 'method:', method

        trainset = melt.load_dataset(trainset_file)
        print "finish loading train set ", trainset_file
        self.num_features = trainset.num_features
        print 'num_features: ', self.num_features
        print 'trainSet size: ', trainset.num_instances()
        testset = melt.load_dataset(testset_file)
        print "finish loading test set ", testset_file
        assert (trainset.num_features == testset.num_features)
        print 'testSet size: ', testset.num_instances()

        algo = self.gen_algo(method)
        trainer = melt.gen_binary_classification_trainer(trainset)
        self.algo = algo
        self.trainer = trainer
        print 'trainer_type:', trainer.type
        print 'trainer_index_only:', trainer.index_only

        cost, train_op, predict_op, evaluate_op = self.foward(
            algo, trainer, learning_rate)
        #self.foward(algo, trainer, learning_rate)

        config = None
        if not FLAGS.show_device:
            config = tf.ConfigProto()
        else:
            config = tf.ConfigProto(log_device_placement=True)
        config.gpu_options.allocator_type = 'BFC'

        self.session = tf.Session(config=config)
        init = tf.initialize_all_variables()
        self.session.run(init)

        summary_writer = None
        if FLAGS.use_summary:
            tf.scalar_summary("cross_entropy", self.cost)
            if FLAGS.use_auc_op:
                tf.scalar_summary("auc", evaluate_op)
            merged_summary_op = tf.merge_all_summaries()
            summary_writer = tf.train.SummaryWriter(FLAGS.summary_path,
                                                    self.session.graph_def)

        #os.system('rm -rf ' + FLAGS.model)
        os.system('mkdir -p ' + FLAGS.model)

        self.save_info(model_path)

        for epoch in range(num_epochs):
            if epoch > 0 and FLAGS.shuffle:
                trainset = melt.load_dataset(trainset_file)

            self.train_(trainset)
            need_stop = self.test_(testset, epoch=epoch)

            if need_stop:
                print 'need stop as improve is smaller then %f' % FLAGS.min_improve
                break

            #print weight
            #@FIXME
            if epoch % FLAGS.save_epochs == 0 and not trainer.index_only:
                self.save_model(model_path, epoch)

        self.save_model(model_path)
        if FLAGS.calibrate:
            dataset = trainset
            if not FLAGS.calibrate_trainset:
                dataset = testset
            self.calibrate_(dataset)  #@TODO may be test set is right?
            CalibratorFactory.Save(self.calibrator,
                                   model_path + '/calibrator.bin')
            #self.calibrator.Save(model_path + '/calibrator.bin')
            self.calibrator.SaveText(model_path + '/calibrator.txt')

        if FLAGS.use_summary:
            teX, teY = testset.full_batch()
            summary_str = self.session.run(merged_summary_op,
                                           feed_dict=melt.gen_feed_dict(
                                               self.trainer,
                                               self.algo,
                                               teX,
                                               teY,
                                               test_mode=True))
            summary_writer.add_summary(summary_str, epoch)