def rescale(config, online_preds_fp):
    """
    Adjust every online prediction according to two graph features and save the result.

    The prediction vector is read from `online_preds_fp`; the adjusted vector is written
    to `online_preds_fp + '.rescale'`. For each row the `te`/`tr` pair handed to
    `PostProcessor.adj` is selected from the first column of the
    `graph_edge_max_clique_size` and `graph_edge_cc_size` test features.
    :param config: configuration object providing `DEFAULT/feature_pt`
    :param online_preds_fp: path of the prediction vector to adjust
    :return: none
    """
    preds = DataUtil.load_vector(online_preds_fp, 'float')

    def load_test_feature(name):
        # every feature is stored as `<feature_pt>/<name>.test.smat`
        root = config.get('DEFAULT', 'feature_pt')
        return Feature.load('%s/%s.test.smat' % (root, name)).toarray()

    clique_sizes = load_test_feature('graph_edge_max_clique_size')
    cc_sizes = load_test_feature('graph_edge_cc_size')

    for row, raw_score in enumerate(preds):
        clique = clique_sizes[row][0]
        # NOTE(review): te/tr are presumably test/train positive rates of each
        # bucket -- confirm against PostProcessor.adj
        if clique == 3.:
            te, tr = 0.40883512, 0.623191
        elif clique > 3.:
            te, tr = 0.96503024, 0.972554
        elif cc_sizes[row][0] < 3.:
            te, tr = 0.05739666, 0.233473
        else:
            te, tr = 0.04503431, 0.149471
        preds[row] = PostProcessor.adj(raw_score, te=te, tr=tr)

    DataUtil.save_vector(online_preds_fp + '.rescale', preds)
def run_online(self):
    """
    Predict the online data set with the stored `se.<model_name>.model` and save the result.

    :return: none
    """
    cfg = self.config

    # feature matrix of the online data set
    features = Feature.load_all(
        cfg.get('DIRECTORY', 'feature_pt'),
        cfg.get('FEATURE', 'feature_selected').split(),
        cfg.get('MODEL', 'online_rawset_name'),
        cfg.get('FEATURE', 'will_save'))

    # restore the model from the model directory
    model = Model.new(cfg.get('MODEL', 'model_name'), cfg)
    model_path = cfg.get('DIRECTORY', 'model_pt') + '/se.%s.model' % cfg.get('MODEL', 'model_name')
    model.load(model_path)

    preds = model.predict(features)

    # NOTE(review): features are loaded for `online_rawset_name` while the output file is
    # named after `online_test_rawset_name` -- confirm this is intended
    preds_path = '%s/se_online.%s.pred' % (
        cfg.get('DIRECTORY', 'pred_pt'),
        cfg.get('MODEL', 'online_test_rawset_name'))
    DataUtil.save_vector(preds_path, preds, 'w')
def __generate_index(self, row_num):
    """
    Randomly split `row_num` rows into `self.cv_num` parts and write the fold index files.

    For fold `f` the train index holds parts `f .. f + cv_num - 3`, the valid index holds
    part `f + cv_num - 2` and the test index holds part `f + cv_num - 1` (all modulo
    `cv_num`). The files are written below `DEFAULT/index_pt`.
    :param row_num: number of rows to distribute over the parts
    :return: none
    """
    # One independent list per part. `[list()] * n` repeats a reference to a single list,
    # which would put every row into every part and make train/valid/test identical.
    index_all = [list() for _ in range(self.cv_num)]
    for i in range(row_num):
        index_all[int(random.random() * self.cv_num)].append(i)
    for i in range(self.cv_num):
        LogUtil.log('INFO', 'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))

    index_pt = self.config.get('DEFAULT', 'index_pt')
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    for fold_id in range(self.cv_num):
        # train: truncate the file first, then append the training parts one by one
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        DataUtil.save_vector(fp, list(), 'w')
        for j in range(self.cv_num - 2):
            part_id = (fold_id + j) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        part_id = (fold_id + self.cv_num - 2) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        part_id = (fold_id + self.cv_num - 1) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
def generate_index_with_swap(self):
    """
    Write the index file of `train_with_swap.csv`.

    The `train_311.train.index` vector is loaded, converted to integers and followed by a
    copy of itself in which every index is shifted by 404290.
    :return: none
    """
    index_dir = self.config.get('DEFAULT', 'feature_index_pt')
    source_fp = '%s/train_311.train.index' % index_dir
    target_fp = '%s/train_311.train_with_swap.index' % index_dir

    original_rows = [int(value) for value in DataUtil.load_vector(source_fp, False)]

    # NOTE(review): 404290 is presumably the row count of the un-swapped training set,
    # so shifted indexes address the swapped copies -- confirm
    shift = 404290
    swapped_rows = [row + shift for row in original_rows]

    DataUtil.save_vector(target_fp, original_rows + swapped_rows, 'w')
def run_online(self):
    """
    Load the online features, apply the saved `se.<model_name>.model` and store the predictions.

    :return: none
    """
    option = self.config.get

    # load feature matrix
    feature_matrix = Feature.load_all(
        option('DIRECTORY', 'feature_pt'),
        option('FEATURE', 'feature_selected').split(),
        option('MODEL', 'online_rawset_name'),
        option('FEATURE', 'will_save'))

    # load the trained model
    predictor = Model.new(option('MODEL', 'model_name'), self.config)
    predictor.load(option('DIRECTORY', 'model_pt') + '/se.%s.model' % option('MODEL', 'model_name'))

    # predict and persist
    predictions = predictor.predict(feature_matrix)
    # NOTE(review): the output name uses `online_test_rawset_name`, the features use
    # `online_rawset_name` -- confirm both keys are meant to differ
    target_fp = '%s/se_online.%s.pred' % (
        option('DIRECTORY', 'pred_pt'), option('MODEL', 'online_test_rawset_name'))
    DataUtil.save_vector(target_fp, predictions, 'w')
def __generate_index(self, row_num):
    """
    Randomly assign each of `row_num` rows to the train, valid or test index and save them.

    A row draws a uniform value in `[0, se_num)`: the last unit of that range goes to
    test, the unit before it to valid and everything below to train.
    :param row_num: number of rows to distribute
    :return: none
    """
    train_rows, valid_rows, test_rows = [], [], []
    for row in range(row_num):
        draw = random.random() * self.se_num
        if draw >= self.se_num - 1:
            test_rows.append(row)
        elif draw >= self.se_num - 2:
            valid_rows.append(row)
        else:
            train_rows.append(row)

    index_pt = self.config.get('DEFAULT', 'index_pt')
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')

    DataUtil.save_vector(
        '%s/se_tag%s_train.%s.index' % (index_pt, self.se_tag, rawset_name), train_rows, 'w')
    DataUtil.save_vector(
        '%s/se_tag%s_valid.%s.index' % (index_pt, self.se_tag, rawset_name), valid_rows, 'w')
    DataUtil.save_vector(
        '%s/se_tag%s_test.%s.index' % (index_pt, self.se_tag, rawset_name), test_rows, 'w')
def __generate_index(self, row_num):
    """
    Split the rows `0 .. row_num - 1` at random into train/valid/test index files.

    Each row draws `random.random() * self.se_num`; values below `se_num - 2` go to train,
    values below `se_num - 1` go to valid, the rest goes to test.
    :param row_num: number of rows to distribute
    :return: none
    """
    train_rows = list()
    valid_rows = list()
    test_rows = list()
    for row in range(row_num):
        draw = random.random() * self.se_num
        if draw < self.se_num - 2:
            bucket = train_rows
        elif draw < self.se_num - 1:
            bucket = valid_rows
        else:
            bucket = test_rows
        bucket.append(row)

    index_pt = self.config.get('DEFAULT', 'index_pt')
    # written in the order train, valid, test
    outputs = (
        ('%s/se_tag%s_train.%s.index', train_rows),
        ('%s/se_tag%s_valid.%s.index', valid_rows),
        ('%s/se_tag%s_test.%s.index', test_rows),
    )
    for pattern, rows in outputs:
        fp = pattern % (index_pt, self.se_tag, self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(fp, rows, 'w')
def generate_cv_subset_index(cf, argv):
    """
    Generate the index files used for 5-fold cross validation on `train_with_swap`.

    The first 404290 row ids are split at random into 5 parts. For fold `f` the train
    index holds parts `f .. f + 2` followed by the same parts shifted by 404290, the
    valid index holds part `f + 3` and the test index holds part `f + 4` (all modulo 5).
    Valid and test indexes are written without shifted copies.
    :param cf: configuration object providing `DEFAULT/feature_index_pt`
    :param argv: parameter list; `argv[0]` is the tag used in the file names
    :return: none
    """
    tag = argv[0]
    cv_num = 5
    cv_rawset_name = 'train_with_swap'
    train_data_size = 404290

    # one independent list per part
    index_all = [[] for _ in range(cv_num)]
    for i in range(train_data_size):
        index_all[int(random.random() * cv_num)].append(i)
    for i in range(cv_num):
        LogUtil.log('INFO', 'size(part%d)=%d' % (i, len(index_all[i])))

    index_fp = cf.get('DEFAULT', 'feature_index_pt')
    for fold_id in range(cv_num):
        # train
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
            index_fp, tag, cv_num, fold_id, cv_rawset_name)
        # Truncate first: the parts below are appended, so without this a rerun with the
        # same tag would pile new indexes on top of the previous file content.
        DataUtil.save_vector(fp, list(), 'w')
        train_parts = [(fold_id + j) % cv_num for j in range(cv_num - 2)]
        for part_id in train_parts:
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        # same parts again, shifted by `train_data_size`
        for part_id in train_parts:
            DataUtil.save_vector(
                fp, [index + train_data_size for index in index_all[part_id]], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
            index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 2) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
            index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 1) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
def __generate_index(self, row_num):
    """
    Distribute `row_num` rows randomly over `self.cv_num` parts and save the fold indexes.

    Fold `f` trains on parts `f .. f + cv_num - 3`, validates on part `f + cv_num - 2` and
    tests on part `f + cv_num - 1` (all modulo `cv_num`). The index files are written
    below `DEFAULT/index_pt`.
    :param row_num: number of rows to distribute over the parts
    :return: none
    """
    # Build a separate list for every part: `[list()] * n` would alias one single list
    # `n` times, so each part would receive all rows and the splits would fully overlap.
    index_all = [list() for _ in range(self.cv_num)]
    for i in range(row_num):
        index_all[int(random.random() * self.cv_num)].append(i)
    for i in range(self.cv_num):
        LogUtil.log('INFO', 'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))

    index_pt = self.config.get('DEFAULT', 'index_pt')
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    for fold_id in range(self.cv_num):
        # train: start from an empty file, then append each training part
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        DataUtil.save_vector(fp, list(), 'w')
        for j in range(self.cv_num - 2):
            part_id = (fold_id + j) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        part_id = (fold_id + self.cv_num - 2) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name)
        part_id = (fold_id + self.cv_num - 1) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
def run_offline(self):
    """
    Run cross validation over the offline data set.

    Loads the selected features and the labels, generates the fold index files when
    `self.cv_tag` is empty and then, for every fold, builds the train/valid/test sets,
    creates and fits a model, appends the fold scores to `<score_pt>/cv.score` and merges
    the valid/test predictions into full-length vectors. The merged predictions are
    finally saved below `pred_pt` and scored against all labels.
    :return: none
    """
    LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)

    # configuration values that do not change during the run
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    model_name = self.config.get('MODEL', 'model_name')
    evaluator_name = self.config.get('MODEL', 'evaluator_name')
    index_pt = self.config.get('DIRECTORY', 'index_pt')
    model_pt = self.config.get('DIRECTORY', 'model_pt')
    pred_pt = self.config.get('DIRECTORY', 'pred_pt')
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
    valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
    test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))

    # load feature matrix
    offline_features = Feature.load_all(
        self.config.get('DIRECTORY', 'feature_pt'),
        self.config.get('FEATURE', 'feature_selected').split(),
        rawset_name,
        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector(
        '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'), rawset_name), True)

    # generate index file (only when no existing tag was supplied)
    if '' == self.cv_tag:
        self.cv_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])

    def build_split(indexs_fp, pos_rate):
        # load one index file and derive (features, labels, balanced indexs) from it
        indexs = DataUtil.load_vector(indexs_fp, 'int')
        return CrossValidation.__generate_data(indexs, offline_labels, offline_features, pos_rate)

    # cross validation; rows that never show up in a balanced index keep the score 0.
    offline_valid_preds_all = [0.] * offline_features.shape[0]
    offline_test_preds_all = [0.] * offline_features.shape[0]
    for fold_id in range(self.cv_num):
        LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

        # generate training data set
        train_features, train_labels, _ = build_split(
            '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            train_pos_rate)
        LogUtil.log('INFO', 'offline train data generation done')

        # generate validation data set
        valid_features, valid_labels, valid_balanced_indexs = build_split(
            '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            valid_pos_rate)
        LogUtil.log('INFO', 'offline valid data generation done')

        # generate test data set
        test_features, test_labels, test_balanced_indexs = build_split(
            '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            test_pos_rate)
        LogUtil.log('INFO', 'offline test data generation done')

        model = Model.new(model_name, self.config)
        model_fp = model_pt + '/cv_n%d_f%d.%s.model' % (self.cv_num, fold_id, model_name)
        # NOTE(review): save() runs before fit(); presumably it only registers the
        # output path -- confirm against Model before reordering
        model.save(model_fp)
        train_preds, valid_preds, test_preds = model.fit(
            train_features, train_labels,
            valid_features, valid_labels,
            test_features, test_labels)

        train_score = Evaluator.evaluate(evaluator_name, train_labels, train_preds)
        valid_score = Evaluator.evaluate(evaluator_name, valid_labels, valid_preds)
        test_score = Evaluator.evaluate(evaluator_name, test_labels, test_preds)
        # `with` closes the score file even when the write fails
        with open(score_fp, 'a') as score_file:
            score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' % (
                fold_id, train_score, valid_score, test_score))

        # merge prediction results back to their original row positions
        for pos, row in enumerate(valid_balanced_indexs):
            offline_valid_preds_all[row] = valid_preds[pos]
        for pos, row in enumerate(test_balanced_indexs):
            offline_test_preds_all[row] = test_preds[pos]
        LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)

    # save prediction results
    offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (pred_pt, self.cv_num, rawset_name)
    DataUtil.save_vector(offline_valid_preds_all_fp, offline_valid_preds_all, 'w')
    offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (pred_pt, self.cv_num, rawset_name)
    DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all, 'w')

    # evaluate the merged predictions against all labels
    offline_valid_score = Evaluator.evaluate(evaluator_name, offline_labels, offline_valid_preds_all)
    offline_test_score = Evaluator.evaluate(evaluator_name, offline_labels, offline_test_preds_all)
    with open(score_fp, 'a') as score_file:
        score_file.write('cross_validation\tvalid:%s\ttest:%s\n' % (
            offline_valid_score, offline_test_score))
def run_offline(self):
    """
    Run a single train/valid/test execution on the offline data set.

    Loads the selected features and the labels, generates the index files when
    `self.se_tag` is empty, builds the three data sets, creates and fits a model, appends
    a `single_exec` line to `<score_pt>/cv.score` and saves the valid/test predictions
    below `pred_pt`.
    :return: none
    """
    # configuration values that do not change during the run
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    model_name = self.config.get('MODEL', 'model_name')
    evaluator_name = self.config.get('MODEL', 'evaluator_name')
    pred_pt = self.config.get('DIRECTORY', 'pred_pt')

    # load feature matrix
    offline_features = Feature.load_all(
        self.config.get('DIRECTORY', 'feature_pt'),
        self.config.get('FEATURE', 'feature_selected').split(),
        rawset_name,
        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector(
        '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'), rawset_name), True)

    # generate index file (only when no existing tag was supplied)
    if '' == self.se_tag:
        self.se_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])

    index_pt = self.config.get('DIRECTORY', 'index_pt')

    def build_split(indexs_fp, pos_rate):
        # load one index file and derive (features, labels, balanced indexs) from it
        indexs = DataUtil.load_vector(indexs_fp, 'int')
        return SingleExec.__generate_data(indexs, offline_labels, offline_features, pos_rate)

    # generate training data set
    train_features, train_labels, _ = build_split(
        '%s/se_tag%s_train.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'train_pos_rate')))
    LogUtil.log('INFO', 'offline train data generation done')

    # generate validation data set
    valid_features, valid_labels, _ = build_split(
        '%s/se_tag%s_valid.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'valid_pos_rate')))
    LogUtil.log('INFO', 'offline valid data generation done')

    # generate test data set
    test_features, test_labels, _ = build_split(
        '%s/se_tag%s_test.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'test_pos_rate')))
    LogUtil.log('INFO', 'offline test data generation done')

    model = Model.new(model_name, self.config)
    model_fp = self.config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % model_name
    # NOTE(review): save() runs before fit(); presumably it only registers the output
    # path -- confirm against Model before reordering
    model.save(model_fp)
    train_preds, valid_preds, test_preds = model.fit(
        train_features, train_labels,
        valid_features, valid_labels,
        test_features, test_labels)

    train_score = Evaluator.evaluate(evaluator_name, train_labels, train_preds)
    valid_score = Evaluator.evaluate(evaluator_name, valid_labels, valid_preds)
    test_score = Evaluator.evaluate(evaluator_name, test_labels, test_preds)
    # NOTE(review): single-exec scores are appended to the `cv` score file -- confirm
    # this shared file is intended
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    # `with` closes the score file even when the write fails
    with open(score_fp, 'a') as score_file:
        score_file.write('single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' % (
            train_score, valid_score, test_score))

    # save prediction results
    offline_valid_preds_fp = '%s/se_valid.%s.pred' % (pred_pt, rawset_name)
    DataUtil.save_vector(offline_valid_preds_fp, valid_preds, 'w')
    offline_test_preds_fp = '%s/se_test.%s.pred' % (pred_pt, rawset_name)
    DataUtil.save_vector(offline_test_preds_fp, test_preds, 'w')
def run_offline(self):
    """
    Execute the offline cross validation.

    Features and labels are loaded once; when `self.cv_tag` is empty new fold index files
    are generated under `self.out_tag`. Each fold then builds its train/valid/test sets,
    creates and fits a model, appends one score line to `<score_pt>/cv.score` and copies
    its valid/test predictions into full-length vectors, which are saved below `pred_pt`
    and scored against all labels at the end.
    :return: none
    """
    LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)

    # configuration values that do not change during the run
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    model_name = self.config.get('MODEL', 'model_name')
    evaluator_name = self.config.get('MODEL', 'evaluator_name')
    index_pt = self.config.get('DIRECTORY', 'index_pt')
    model_pt = self.config.get('DIRECTORY', 'model_pt')
    pred_pt = self.config.get('DIRECTORY', 'pred_pt')
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
    valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
    test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))

    # load feature matrix
    offline_features = Feature.load_all(
        self.config.get('DIRECTORY', 'feature_pt'),
        self.config.get('FEATURE', 'feature_selected').split(),
        rawset_name,
        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector(
        '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'), rawset_name), True)

    # generate index file (only when no existing tag was supplied)
    if '' == self.cv_tag:
        self.cv_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])

    def build_split(indexs_fp, pos_rate):
        # load one index file and derive (features, labels, balanced indexs) from it
        indexs = DataUtil.load_vector(indexs_fp, 'int')
        return CrossValidation.__generate_data(indexs, offline_labels, offline_features, pos_rate)

    # cross validation; rows that never show up in a balanced index keep the score 0.
    row_count = offline_features.shape[0]
    offline_valid_preds_all = [0.] * row_count
    offline_test_preds_all = [0.] * row_count
    for fold_id in range(self.cv_num):
        LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

        # generate training data set
        train_features, train_labels, _ = build_split(
            '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            train_pos_rate)
        LogUtil.log('INFO', 'offline train data generation done')

        # generate validation data set
        valid_features, valid_labels, valid_balanced_indexs = build_split(
            '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            valid_pos_rate)
        LogUtil.log('INFO', 'offline valid data generation done')

        # generate test data set
        test_features, test_labels, test_balanced_indexs = build_split(
            '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id, rawset_name),
            test_pos_rate)
        LogUtil.log('INFO', 'offline test data generation done')

        model = Model.new(model_name, self.config)
        model_fp = model_pt + '/cv_n%d_f%d.%s.model' % (self.cv_num, fold_id, model_name)
        # NOTE(review): save() runs before fit(); presumably it only registers the
        # output path -- confirm against Model before reordering
        model.save(model_fp)
        train_preds, valid_preds, test_preds = model.fit(
            train_features, train_labels,
            valid_features, valid_labels,
            test_features, test_labels)

        train_score = Evaluator.evaluate(evaluator_name, train_labels, train_preds)
        valid_score = Evaluator.evaluate(evaluator_name, valid_labels, valid_preds)
        test_score = Evaluator.evaluate(evaluator_name, test_labels, test_preds)
        # the context manager releases the file handle even if the write raises
        with open(score_fp, 'a') as score_file:
            score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' % (
                fold_id, train_score, valid_score, test_score))

        # merge prediction results back to their original row positions
        for pos, row in enumerate(valid_balanced_indexs):
            offline_valid_preds_all[row] = valid_preds[pos]
        for pos, row in enumerate(test_balanced_indexs):
            offline_test_preds_all[row] = test_preds[pos]
        LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)

    # save prediction results
    offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (pred_pt, self.cv_num, rawset_name)
    DataUtil.save_vector(offline_valid_preds_all_fp, offline_valid_preds_all, 'w')
    offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (pred_pt, self.cv_num, rawset_name)
    DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all, 'w')

    # evaluate the merged predictions against all labels
    offline_valid_score = Evaluator.evaluate(evaluator_name, offline_labels, offline_valid_preds_all)
    offline_test_score = Evaluator.evaluate(evaluator_name, offline_labels, offline_test_preds_all)
    with open(score_fp, 'a') as score_file:
        score_file.write('cross_validation\tvalid:%s\ttest:%s\n' % (
            offline_valid_score, offline_test_score))
def run_offline(self):
    """
    Execute one offline train/valid/test run.

    Features and labels are loaded once; when `self.se_tag` is empty new index files are
    generated under `self.out_tag`. The three data sets are then built, a model is created
    and fitted, one `single_exec` line is appended to `<score_pt>/cv.score` and the
    valid/test predictions are saved below `pred_pt`.
    :return: none
    """
    # configuration values that do not change during the run
    rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    model_name = self.config.get('MODEL', 'model_name')
    evaluator_name = self.config.get('MODEL', 'evaluator_name')
    pred_pt = self.config.get('DIRECTORY', 'pred_pt')

    # load feature matrix
    offline_features = Feature.load_all(
        self.config.get('DIRECTORY', 'feature_pt'),
        self.config.get('FEATURE', 'feature_selected').split(),
        rawset_name,
        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector(
        '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'), rawset_name), True)

    # generate index file (only when no existing tag was supplied)
    if '' == self.se_tag:
        self.se_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])

    index_pt = self.config.get('DIRECTORY', 'index_pt')

    def build_split(indexs_fp, pos_rate):
        # load one index file and derive (features, labels, balanced indexs) from it
        indexs = DataUtil.load_vector(indexs_fp, 'int')
        return SingleExec.__generate_data(indexs, offline_labels, offline_features, pos_rate)

    # generate training data set
    train_features, train_labels, _ = build_split(
        '%s/se_tag%s_train.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'train_pos_rate')))
    LogUtil.log('INFO', 'offline train data generation done')

    # generate validation data set
    valid_features, valid_labels, _ = build_split(
        '%s/se_tag%s_valid.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'valid_pos_rate')))
    LogUtil.log('INFO', 'offline valid data generation done')

    # generate test data set
    test_features, test_labels, _ = build_split(
        '%s/se_tag%s_test.%s.index' % (index_pt, self.se_tag, rawset_name),
        float(self.config.get('MODEL', 'test_pos_rate')))
    LogUtil.log('INFO', 'offline test data generation done')

    model = Model.new(model_name, self.config)
    model_fp = self.config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % model_name
    # NOTE(review): save() runs before fit(); presumably it only registers the output
    # path -- confirm against Model before reordering
    model.save(model_fp)
    train_preds, valid_preds, test_preds = model.fit(
        train_features, train_labels,
        valid_features, valid_labels,
        test_features, test_labels)

    train_score = Evaluator.evaluate(evaluator_name, train_labels, train_preds)
    valid_score = Evaluator.evaluate(evaluator_name, valid_labels, valid_preds)
    test_score = Evaluator.evaluate(evaluator_name, test_labels, test_preds)
    # NOTE(review): single-exec scores are appended to the `cv` score file -- confirm
    # this shared file is intended
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    # the context manager releases the file handle even if the write raises
    with open(score_fp, 'a') as score_file:
        score_file.write('single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' % (
            train_score, valid_score, test_score))

    # save prediction results
    offline_valid_preds_fp = '%s/se_valid.%s.pred' % (pred_pt, rawset_name)
    DataUtil.save_vector(offline_valid_preds_fp, valid_preds, 'w')
    offline_test_preds_fp = '%s/se_test.%s.pred' % (pred_pt, rawset_name)
    DataUtil.save_vector(offline_test_preds_fp, test_preds, 'w')