def rescale(config, online_preds_fp):
    """
    Rescale a saved vector of online predictions and write the result.

    Every prediction is passed through ``PostProcessor.adj`` with a
    ``(te, tr)`` pair of constants.  The pair is chosen from the first column
    of two test feature matrices, ``graph_edge_max_clique_size`` and
    ``graph_edge_cc_size``.  The adjusted vector is saved to
    ``online_preds_fp + '.rescale'``.

    NOTE(review): ``te`` / ``tr`` look like per-bucket test / train positive
    rates -- confirm against ``PostProcessor.adj``.

    :param config: configuration object; ``feature_pt`` is read from its
        ``DEFAULT`` section
    :param online_preds_fp: path of the prediction vector to rescale
    :return: none
    """
    def load_test_feature(name):
        # Dense copy of the ``<feature_pt>/<name>.test.smat`` matrix.
        path = '%s/%s.test.smat' % (config.get('DEFAULT', 'feature_pt'), name)
        return Feature.load(path).toarray()

    online_preds = DataUtil.load_vector(online_preds_fp, 'float')
    test_features_mc = load_test_feature('graph_edge_max_clique_size')
    test_features_cc = load_test_feature('graph_edge_cc_size')

    for row, score in enumerate(online_preds):
        # Pick the constants for this row; the cc feature is only consulted
        # when the max-clique feature is neither equal to nor above 3.
        if test_features_mc[row][0] == 3.:
            te, tr = 0.40883512, 0.623191
        elif test_features_mc[row][0] > 3.:
            te, tr = 0.96503024, 0.972554
        elif test_features_cc[row][0] < 3.:
            te, tr = 0.05739666, 0.233473
        else:
            te, tr = 0.04503431, 0.149471
        online_preds[row] = PostProcessor.adj(score, te=te, tr=tr)

    DataUtil.save_vector(online_preds_fp + '.rescale', online_preds)
 def run_online(self):
     """
     Predict on the online data set with the saved single-execution model.

     Loads the selected features, loads the model from
     ``<model_pt>/se.<model_name>.model``, predicts, and writes the
     predictions to ``<pred_pt>/se_online.<online_test_rawset_name>.pred``.

     NOTE(review): features are loaded for ``online_rawset_name`` while the
     output file is named after ``online_test_rawset_name`` -- confirm that
     the two config keys are meant to differ.

     :return: none
     """
     cfg = self.config

     # load feature matrix
     feature_pt = cfg.get('DIRECTORY', 'feature_pt')
     feature_names = cfg.get('FEATURE', 'feature_selected').split()
     rawset_name = cfg.get('MODEL', 'online_rawset_name')
     will_save = cfg.get('FEATURE', 'will_save')
     online_features = Feature.load_all(feature_pt, feature_names, rawset_name, will_save)

     # restore the model
     model = Model.new(cfg.get('MODEL', 'model_name'), cfg)
     model_pt = cfg.get('DIRECTORY', 'model_pt')
     model_file = '/se.%s.model' % cfg.get('MODEL', 'model_name')
     model.load(model_pt + model_file)

     # predict and save
     online_preds = model.predict(online_features)
     pred_pt = cfg.get('DIRECTORY', 'pred_pt')
     out_name = cfg.get('MODEL', 'online_test_rawset_name')
     online_preds_fp = '%s/se_online.%s.pred' % (pred_pt, out_name)
     DataUtil.save_vector(online_preds_fp, online_preds, 'w')
示例#3
0
    def __generate_index(self, row_num):
        """
        Randomly partition row ids into ``self.cv_num`` parts and write the
        train / valid / test index files of every cross-validation fold.

        For fold ``f`` the train file holds the ``cv_num - 2`` consecutive
        parts starting at ``f`` (modulo ``cv_num``), the valid file holds part
        ``f + cv_num - 2`` and the test file holds part ``f + cv_num - 1``.

        NOTE(review): the output directory is ``index_pt`` from the ``DEFAULT``
        config section -- confirm it is the directory the index files are
        later read from.

        :param row_num: number of rows; ids ``0 .. row_num - 1`` are split
        :return: none
        """
        # Build one independent list per part.  ``[list()] * n`` would repeat
        # a reference to a single list, putting every id into every part.
        index_all = [list() for _ in range(self.cv_num)]
        for i in range(row_num):
            index_all[int(random.random() * self.cv_num)].append(i)
        for i in range(self.cv_num):
            LogUtil.log(
                'INFO',
                'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))

        index_pt = self.config.get('DEFAULT', 'index_pt')
        rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        for fold_id in range(self.cv_num):
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
            # start from an empty file, the parts below are appended to it
            DataUtil.save_vector(fp, list(), 'w')
            for j in range(self.cv_num - 2):
                part_id = (fold_id + j) % self.cv_num
                DataUtil.save_vector(fp, index_all[part_id], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
            part_id = (fold_id + self.cv_num - 2) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
            part_id = (fold_id + self.cv_num - 1) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
示例#4
0
    def generate_index_with_swap(self):
        """
        Generate the index file of `train_with_swap.csv`.

        Reads ``train_311.train.index`` from ``feature_index_pt``, converts
        the entries to ``int`` and writes them followed by the same entries
        shifted by 404290 to ``train_311.train_with_swap.index``.

        NOTE(review): 404290 is presumably the row count of the original
        training set, so shifted ids address the swapped copies -- confirm.

        :return: none
        """
        source_fp = '%s/train_311.train.index' % self.config.get('DEFAULT', 'feature_index_pt')
        target_fp = '%s/train_311.train_with_swap.index' % self.config.get('DEFAULT', 'feature_index_pt')

        base_index = list(map(int, DataUtil.load_vector(source_fp, False)))

        shift = 404290
        shifted_index = []
        for row_id in base_index:
            shifted_index.append(row_id + shift)

        DataUtil.save_vector(target_fp, base_index + shifted_index, 'w')
示例#5
0
 def run_online(self):
     """
     Predict on the online data set with the saved single-execution model.

     The model is loaded from ``<model_pt>/se.<model_name>.model`` and the
     predictions are written to
     ``<pred_pt>/se_online.<online_test_rawset_name>.pred``.

     NOTE(review): the features come from ``online_rawset_name`` but the
     prediction file is named after ``online_test_rawset_name`` -- confirm
     that this difference is intended.

     :return: none
     """
     get = self.config.get

     # load feature matrix
     load_args = (get('DIRECTORY', 'feature_pt'),
                  get('FEATURE', 'feature_selected').split(),
                  get('MODEL', 'online_rawset_name'),
                  get('FEATURE', 'will_save'))
     online_features = Feature.load_all(*load_args)

     # build the model object and restore its saved state
     model = Model.new(get('MODEL', 'model_name'), self.config)
     model_dir = get('DIRECTORY', 'model_pt')
     model.load(model_dir + '/se.%s.model' % get('MODEL', 'model_name'))

     # predict, then save next to the other prediction files
     online_preds = model.predict(online_features)
     name_parts = (get('DIRECTORY', 'pred_pt'),
                   get('MODEL', 'online_test_rawset_name'))
     DataUtil.save_vector('%s/se_online.%s.pred' % name_parts, online_preds, 'w')
示例#6
0
 def __generate_index(self, row_num):
     """
     Randomly assign every row id in ``range(row_num)`` to the train, valid
     or test split and save one index file per split.

     Each row draws ``random.random() * self.se_num``: draws below
     ``se_num - 2`` go to train, draws below ``se_num - 1`` go to valid and
     all others go to test.

     NOTE(review): the files are written to ``index_pt`` of the ``DEFAULT``
     config section while ``run_offline`` reads ``index_pt`` from
     ``DIRECTORY`` -- confirm both resolve to the same directory.

     :param row_num: number of rows to split
     :return: none
     """
     train_rows, valid_rows, test_rows = [], [], []
     for row_id in range(row_num):
         draw = random.random() * self.se_num
         if draw < self.se_num - 2:
             bucket = train_rows
         elif draw < self.se_num - 1:
             bucket = valid_rows
         else:
             bucket = test_rows
         bucket.append(row_id)

     index_pt = self.config.get('DEFAULT', 'index_pt')
     outputs = (
         ('%s/se_tag%s_train.%s.index', train_rows),
         ('%s/se_tag%s_valid.%s.index', valid_rows),
         ('%s/se_tag%s_test.%s.index', test_rows),
     )
     for template, rows in outputs:
         fp = template % (index_pt, self.se_tag,
                          self.config.get('MODEL', 'offline_rawset_name'))
         DataUtil.save_vector(fp, rows, 'w')
 def __generate_index(self, row_num):
     """
     Split the row ids ``0 .. row_num - 1`` at random into train, valid and
     test index lists and write each list to its own index file.

     A row whose draw ``random.random() * self.se_num`` is below
     ``se_num - 2`` becomes a train row, below ``se_num - 1`` a valid row,
     otherwise a test row.

     NOTE(review): ``index_pt`` is taken from the ``DEFAULT`` config section
     here; ``run_offline`` takes it from ``DIRECTORY`` -- confirm both point
     to the same directory.

     :param row_num: number of rows to split
     :return: none
     """
     splits = {'train': [], 'valid': [], 'test': []}
     for row_id in range(row_num):
         draw = random.random() * self.se_num
         if draw < self.se_num - 2:
             splits['train'].append(row_id)
         elif draw < self.se_num - 1:
             splits['valid'].append(row_id)
         else:
             splits['test'].append(row_id)

     index_pt = self.config.get('DEFAULT', 'index_pt')

     def save(template, rows):
         # Fill in directory, tag and raw set name, then overwrite the file.
         fp = template % (index_pt,
                          self.se_tag,
                          self.config.get('MODEL', 'offline_rawset_name'))
         DataUtil.save_vector(fp, rows, 'w')

     save('%s/se_tag%s_train.%s.index', splits['train'])
     save('%s/se_tag%s_valid.%s.index', splits['valid'])
     save('%s/se_tag%s_test.%s.index', splits['test'])
    def generate_cv_subset_index(cf, argv):
        """
        Generate index used for 5-fold cross validation

        Row ids ``0 .. 404289`` are split at random into 5 parts.  For every
        fold the train file receives 3 consecutive parts followed by the same
        parts shifted by the training set size; the valid and test files each
        receive one unshifted part.

        NOTE(review): the shifted ids presumably address the swapped copies
        appended in ``train_with_swap`` -- confirm.

        :param cf: configuration object; ``feature_index_pt`` is read from
            its ``DEFAULT`` section
        :param argv: parameter list; ``argv[0]`` is the tag used in file names
        :return: none
        """
        tag = argv[0]
        cv_num = 5
        cv_rawset_name = 'train_with_swap'
        train_data_size = 404290

        index_all = [[] for _ in range(cv_num)]
        for i in range(train_data_size):
            index_all[int(random.random() * cv_num)].append(i)

        for i in range(cv_num):
            LogUtil.log('INFO', 'size(part%d)=%d' % (i, len(index_all[i])))

        index_fp = cf.get('DEFAULT', 'feature_index_pt')
        for fold_id in range(cv_num):
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            # Start from an empty file: everything below is appended, so a
            # file left over from an earlier run with the same tag would
            # otherwise keep its old entries.
            DataUtil.save_vector(fp, list(), 'w')
            train_parts = [index_all[(fold_id + j) % cv_num]
                           for j in range(cv_num - 2)]
            # original ids first ...
            for part in train_parts:
                DataUtil.save_vector(fp, part, 'a')
            # ... then the same ids shifted by the training set size
            for part in train_parts:
                DataUtil.save_vector(
                    fp, [index + train_data_size for index in part], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            part_id = (fold_id + cv_num - 2) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            part_id = (fold_id + cv_num - 1) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
    def __generate_index(self, row_num):
        """
        Randomly partition row ids into ``self.cv_num`` parts and write the
        train / valid / test index files of every cross-validation fold.

        For fold ``f`` the train file holds the ``cv_num - 2`` consecutive
        parts starting at ``f`` (modulo ``cv_num``), the valid file holds part
        ``f + cv_num - 2`` and the test file holds part ``f + cv_num - 1``.

        NOTE(review): the output directory is ``index_pt`` from the ``DEFAULT``
        config section -- confirm it is the directory the index files are
        later read from.

        :param row_num: number of rows; ids ``0 .. row_num - 1`` are split
        :return: none
        """
        # One separate list per part.  Multiplying ``[list()]`` would only
        # copy the reference, so all parts would share the same content.
        index_all = [list() for _ in range(self.cv_num)]
        for i in range(row_num):
            index_all[int(random.random() * self.cv_num)].append(i)
        for i, part in enumerate(index_all):
            LogUtil.log('INFO', 'generate cv index, size(part%d)=%d' % (i, len(part)))

        index_pt = self.config.get('DEFAULT', 'index_pt')
        rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        for fold_id in range(self.cv_num):
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (index_pt,
                                                         self.cv_tag,
                                                         self.cv_num,
                                                         fold_id,
                                                         rawset_name)
            # truncate first, the training parts are appended one by one
            DataUtil.save_vector(fp, list(), 'w')
            for j in range(self.cv_num - 2):
                part_id = (fold_id + j) % self.cv_num
                DataUtil.save_vector(fp, index_all[part_id], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (index_pt,
                                                         self.cv_tag,
                                                         self.cv_num,
                                                         fold_id,
                                                         rawset_name)
            part_id = (fold_id + self.cv_num - 2) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (index_pt,
                                                        self.cv_tag,
                                                        self.cv_num,
                                                        fold_id,
                                                        rawset_name)
            part_id = (fold_id + self.cv_num - 1) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
示例#10
0
    def run_offline(self):
        """
        Run offline cross validation over ``self.cv_num`` folds.

        Loads the selected offline features and the labels, generates the
        fold index files when ``self.cv_tag`` is empty, then for every fold
        builds the train / valid / test sets, fits a model and appends the
        fold scores to ``<score_pt>/cv.score``.  Valid and test predictions of
        all folds are merged into two full-length vectors (initialised with
        ``0.``), saved under ``pred_pt`` and scored against all labels.

        :return: none
        """
        LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
        rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        # load feature matrix
        offline_features = Feature.load_all(
            self.config.get('DIRECTORY', 'feature_pt'),
            self.config.get('FEATURE', 'feature_selected').split(),
            rawset_name,
            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                             rawset_name),
            True)
        # generate index file
        if '' == self.cv_tag:
            self.cv_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])

        # configuration that does not change between folds, read once
        index_pt = self.config.get('DIRECTORY', 'index_pt')
        model_pt = self.config.get('DIRECTORY', 'model_pt')
        model_name = self.config.get('MODEL', 'model_name')
        evaluator_name = self.config.get('MODEL', 'evaluator_name')
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'),
                                    'cv')
        parts = ('train', 'valid', 'test')
        pos_rates = {
            'train': float(self.config.get('MODEL', 'train_pos_rate')),
            'valid': float(self.config.get('MODEL', 'valid_pos_rate')),
            'test': float(self.config.get('MODEL', 'test_pos_rate')),
        }

        # cross validation
        offline_valid_preds_all = [0.] * offline_features.shape[0]
        offline_test_preds_all = [0.] * offline_features.shape[0]
        for fold_id in range(self.cv_num):
            LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

            # generate the training, validation and test data sets
            features, labels, balanced_indexs = dict(), dict(), dict()
            for part in parts:
                indexs_fp = '%s/cv_tag%s_n%d_f%d_%s.%s.index' % (
                    index_pt, self.cv_tag, self.cv_num, fold_id, part,
                    rawset_name)
                indexs = DataUtil.load_vector(indexs_fp, 'int')
                features[part], labels[part], balanced_indexs[part] = \
                    CrossValidation.__generate_data(indexs,
                                                    offline_labels,
                                                    offline_features,
                                                    pos_rates[part])
                LogUtil.log('INFO', 'offline %s data generation done' % part)

            model = Model.new(model_name, self.config)
            model_fp = model_pt + '/cv_n%d_f%d.%s.model' % (
                self.cv_num, fold_id, model_name)
            # NOTE(review): save() is called before fit() -- confirm that
            # this is the intended order for the Model implementations.
            model.save(model_fp)
            offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
                features['train'], labels['train'],
                features['valid'], labels['valid'],
                features['test'], labels['test'])
            offline_train_score = Evaluator.evaluate(
                evaluator_name, labels['train'], offline_train_preds)
            offline_valid_score = Evaluator.evaluate(
                evaluator_name, labels['valid'], offline_valid_preds)
            offline_test_score = Evaluator.evaluate(
                evaluator_name, labels['test'], offline_test_preds)
            # ``with`` closes the score file even when the write raises
            with open(score_fp, 'a') as score_file:
                score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' %
                                 (fold_id, offline_train_score,
                                  offline_valid_score, offline_test_score))
            # merge prediction results back to their original row positions
            for pos, row_id in enumerate(balanced_indexs['valid']):
                offline_valid_preds_all[row_id] = offline_valid_preds[pos]
            for pos, row_id in enumerate(balanced_indexs['test']):
                offline_test_preds_all[row_id] = offline_test_preds[pos]
            LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)
        # save prediction results
        pred_pt = self.config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (
            pred_pt, self.cv_num, rawset_name)
        DataUtil.save_vector(offline_valid_preds_all_fp,
                             offline_valid_preds_all, 'w')
        offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (
            pred_pt, self.cv_num, rawset_name)
        DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all,
                             'w')
        # evaluate
        offline_valid_score = Evaluator.evaluate(
            evaluator_name, offline_labels, offline_valid_preds_all)
        offline_test_score = Evaluator.evaluate(
            evaluator_name, offline_labels, offline_test_preds_all)
        with open(score_fp, 'a') as score_file:
            score_file.write('cross_validation\tvalid:%s\ttest:%s\n' %
                             (offline_valid_score, offline_test_score))
示例#11
0
    def run_offline(self):
        """
        Run one offline train / valid / test execution.

        Loads the selected offline features and the labels, generates the
        split index files when ``self.se_tag`` is empty, builds the three
        data sets, fits a model, appends a ``single_exec`` line with the three
        scores to ``<score_pt>/cv.score`` and saves the valid and test
        predictions under ``pred_pt``.

        :return: none
        """
        rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        # load feature matrix
        offline_features = Feature.load_all(
            self.config.get('DIRECTORY', 'feature_pt'),
            self.config.get('FEATURE', 'feature_selected').split(),
            rawset_name,
            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                             rawset_name),
            True)
        # generate index file
        if '' == self.se_tag:
            self.se_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        index_pt = self.config.get('DIRECTORY', 'index_pt')

        # generate the training, validation and test data sets
        features, labels, balanced_indexs = dict(), dict(), dict()
        for part in ('train', 'valid', 'test'):
            pos_rate = float(self.config.get('MODEL', '%s_pos_rate' % part))
            indexs_fp = '%s/se_tag%s_%s.%s.index' % (
                index_pt, self.se_tag, part, rawset_name)
            indexs = DataUtil.load_vector(indexs_fp, 'int')
            features[part], labels[part], balanced_indexs[part] = \
                SingleExec.__generate_data(indexs,
                                           offline_labels,
                                           offline_features,
                                           pos_rate)
            LogUtil.log('INFO', 'offline %s data generation done' % part)

        model_name = self.config.get('MODEL', 'model_name')
        model = Model.new(model_name, self.config)
        model_fp = self.config.get(
            'DIRECTORY', 'model_pt') + '/se.%s.model' % model_name
        # NOTE(review): save() is called before fit() -- confirm that this
        # is the intended order for the Model implementations.
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
            features['train'], labels['train'],
            features['valid'], labels['valid'],
            features['test'], labels['test'])

        evaluator_name = self.config.get('MODEL', 'evaluator_name')
        offline_train_score = Evaluator.evaluate(
            evaluator_name, labels['train'], offline_train_preds)
        offline_valid_score = Evaluator.evaluate(
            evaluator_name, labels['valid'], offline_valid_preds)
        offline_test_score = Evaluator.evaluate(
            evaluator_name, labels['test'], offline_test_preds)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY',
                                                    'score_pt'), 'cv')
        # ``with`` closes the score file even when the write raises
        with open(score_fp, 'a') as score_file:
            score_file.write(
                'single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' %
                (offline_train_score, offline_valid_score, offline_test_score))
        # save prediction results
        pred_pt = self.config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_fp = '%s/se_valid.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
        offline_test_preds_fp = '%s/se_test.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')
    def run_offline(self):
        """
        Run offline cross validation over ``self.cv_num`` folds.

        Loads the selected offline features and the labels, generates the
        fold index files when ``self.cv_tag`` is empty, then for every fold
        builds the train / valid / test sets, fits a model and appends the
        fold scores to ``<score_pt>/cv.score``.  Valid and test predictions of
        all folds are merged into two full-length vectors (initialised with
        ``0.``), saved under ``pred_pt`` and scored against all labels.

        :return: none
        """
        LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
        rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        # load feature matrix
        offline_features = Feature.load_all(self.config.get('DIRECTORY', 'feature_pt'),
                                            self.config.get('FEATURE', 'feature_selected').split(),
                                            rawset_name,
                                            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector('%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                                                               rawset_name),
                                              True)
        # generate index file
        if '' == self.cv_tag:
            self.cv_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])

        # configuration that does not change between folds, read once
        index_pt = self.config.get('DIRECTORY', 'index_pt')
        model_pt = self.config.get('DIRECTORY', 'model_pt')
        model_name = self.config.get('MODEL', 'model_name')
        evaluator_name = self.config.get('MODEL', 'evaluator_name')
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
        parts = ('train', 'valid', 'test')
        pos_rates = {
            'train': float(self.config.get('MODEL', 'train_pos_rate')),
            'valid': float(self.config.get('MODEL', 'valid_pos_rate')),
            'test': float(self.config.get('MODEL', 'test_pos_rate')),
        }

        # cross validation
        offline_valid_preds_all = [0.] * offline_features.shape[0]
        offline_test_preds_all = [0.] * offline_features.shape[0]
        for fold_id in range(self.cv_num):
            LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

            # generate the training, validation and test data sets
            features, labels, balanced_indexs = dict(), dict(), dict()
            for part in parts:
                indexs_fp = '%s/cv_tag%s_n%d_f%d_%s.%s.index' % (index_pt,
                                                                 self.cv_tag,
                                                                 self.cv_num,
                                                                 fold_id,
                                                                 part,
                                                                 rawset_name)
                indexs = DataUtil.load_vector(indexs_fp, 'int')
                features[part], labels[part], balanced_indexs[part] = \
                    CrossValidation.__generate_data(indexs,
                                                    offline_labels,
                                                    offline_features,
                                                    pos_rates[part])
                LogUtil.log('INFO', 'offline %s data generation done' % part)

            model = Model.new(model_name, self.config)
            model_fp = model_pt + '/cv_n%d_f%d.%s.model' % (self.cv_num,
                                                            fold_id,
                                                            model_name)
            # NOTE(review): save() is called before fit() -- confirm that
            # this is the intended order for the Model implementations.
            model.save(model_fp)
            offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(features['train'],
                                                                                     labels['train'],
                                                                                     features['valid'],
                                                                                     labels['valid'],
                                                                                     features['test'],
                                                                                     labels['test'])
            offline_train_score = Evaluator.evaluate(evaluator_name,
                                                     labels['train'],
                                                     offline_train_preds)
            offline_valid_score = Evaluator.evaluate(evaluator_name,
                                                     labels['valid'],
                                                     offline_valid_preds)
            offline_test_score = Evaluator.evaluate(evaluator_name,
                                                    labels['test'],
                                                    offline_test_preds)
            # ``with`` closes the score file even when the write raises
            with open(score_fp, 'a') as score_file:
                score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' % (fold_id,
                                                                             offline_train_score,
                                                                             offline_valid_score,
                                                                             offline_test_score))
            # merge prediction results back to their original row positions
            for pos, row_id in enumerate(balanced_indexs['valid']):
                offline_valid_preds_all[row_id] = offline_valid_preds[pos]
            for pos, row_id in enumerate(balanced_indexs['test']):
                offline_test_preds_all[row_id] = offline_test_preds[pos]
            LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)
        # save prediction results
        pred_pt = self.config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (pred_pt,
                                                                  self.cv_num,
                                                                  rawset_name)
        DataUtil.save_vector(offline_valid_preds_all_fp, offline_valid_preds_all, 'w')
        offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (pred_pt,
                                                                self.cv_num,
                                                                rawset_name)
        DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all, 'w')
        # evaluate
        offline_valid_score = Evaluator.evaluate(evaluator_name,
                                                 offline_labels,
                                                 offline_valid_preds_all)
        offline_test_score = Evaluator.evaluate(evaluator_name,
                                                offline_labels,
                                                offline_test_preds_all)
        with open(score_fp, 'a') as score_file:
            score_file.write('cross_validation\tvalid:%s\ttest:%s\n' % (offline_valid_score, offline_test_score))
    def run_offline(self):
        """Fit and evaluate one model on the offline train/valid/test split.

        Steps, in order:
          1. Load the offline feature matrix and the label vector.
          2. If ``self.se_tag`` is empty, set it to ``self.out_tag`` and call
             ``__generate_index`` with the number of feature rows
             (presumably this writes the index files read below -- confirm).
          3. For each of train/valid/test, load the index file tagged with
             ``self.se_tag`` and build features/labels via ``__generate_data``
             using the split's configured positive rate.
          4. Fit the configured model and evaluate the three prediction sets.
          5. Append the scores to ``<score_pt>/cv.score`` and write the
             validation and test predictions under ``pred_pt``.
        """
        config = self.config
        rawset_name = config.get('MODEL', 'offline_rawset_name')

        # load feature matrix
        offline_features = Feature.load_all(config.get('DIRECTORY', 'feature_pt'),
                                            config.get('FEATURE', 'feature_selected').split(),
                                            rawset_name,
                                            config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector('%s/%s.label' % (config.get('DIRECTORY', 'label_pt'),
                                                               rawset_name),
                                              True)
        # generate index file
        if '' == self.se_tag:
            self.se_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        index_pt = config.get('DIRECTORY', 'index_pt')

        def build_split(split_name):
            # Build (features, labels) for one of 'train' / 'valid' / 'test'.
            pos_rate = float(config.get('MODEL', '%s_pos_rate' % split_name))
            indexs_fp = '%s/se_tag%s_%s.%s.index' % (index_pt,
                                                     self.se_tag,
                                                     split_name,
                                                     rawset_name)
            indexs = DataUtil.load_vector(indexs_fp, 'int')
            # The third return value (balanced indexes) is not needed here.
            features, labels, _ = SingleExec.__generate_data(indexs,
                                                             offline_labels,
                                                             offline_features,
                                                             pos_rate)
            LogUtil.log('INFO', 'offline %s data generation done' % split_name)
            return features, labels

        # generate training, validation and test data sets (in that order)
        offline_train_features, offline_train_labels = build_split('train')
        offline_valid_features, offline_valid_labels = build_split('valid')
        offline_test_features, offline_test_labels = build_split('test')

        model_name = config.get('MODEL', 'model_name')
        model = Model.new(model_name, config)
        model_fp = config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % model_name
        # NOTE(review): save() is called before fit(); kept as-is because the
        # semantics of Model.save are not visible here -- confirm this does not
        # persist an untrained model.
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(offline_train_features,
                                                                                 offline_train_labels,
                                                                                 offline_valid_features,
                                                                                 offline_valid_labels,
                                                                                 offline_test_features,
                                                                                 offline_test_labels)

        evaluator_name = config.get('MODEL', 'evaluator_name')
        offline_train_score = Evaluator.evaluate(evaluator_name, offline_train_labels, offline_train_preds)
        offline_valid_score = Evaluator.evaluate(evaluator_name, offline_valid_labels, offline_valid_preds)
        offline_test_score = Evaluator.evaluate(evaluator_name, offline_test_labels, offline_test_preds)

        # Append to the shared score file; the context manager guarantees the
        # handle is closed even if the write fails.
        score_fp = '%s/%s.score' % (config.get('DIRECTORY', 'score_pt'), 'cv')
        with open(score_fp, 'a') as score_file:
            score_file.write('single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' % (offline_train_score,
                                                                             offline_valid_score,
                                                                             offline_test_score))

        # save prediction results
        pred_pt = config.get('DIRECTORY', 'pred_pt')
        offline_valid_preds_fp = '%s/se_valid.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
        offline_test_preds_fp = '%s/se_test.%s.pred' % (pred_pt, rawset_name)
        DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')