def cotraining(model_one, model_two, n_iter=100):
    """Co-train two classifiers on a labeled pool plus an unlabeled pool.

    Loads and splits the data via ``datahandler.clean_data()``, reduces
    dimensionality with a GBDT-threshold selector, then alternately lets each
    model label unlabeled examples for the *other* model's training set
    (standard co-training). Per step it logs validation AUC for each model and
    their average, checkpoints both models, and writes test predictions.

    Parameters
    ----------
    model_one, model_two : estimators with ``fit`` / ``predict_proba``
        The two views/models being co-trained; mutated in place by ``fit``.
    n_iter : int, optional
        Number of outer co-training iterations (default 100).

    Side effects: writes ``.pkl`` models and ``.csv`` predictions under
    ``ROOT/result/``; logs progress via ``logging``.
    """
    data, train_number, val_number, test_number, unlabel_number, label, uid = \
        datahandler.clean_data()
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]

    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(
        train, label, validation, test, unlabel)

    # Each model keeps its own independent (growing) copy of the labeled pool,
    # so deep-copy rather than alias.
    train_one = copy.deepcopy(train)
    label_one = copy.deepcopy(label)
    train_two = copy.deepcopy(train)
    label_two = copy.deepcopy(label)

    model_one.fit(train_one, label_one)
    model_two.fit(train_two, label_two)

    # 'iteration' instead of 'iter' to avoid shadowing the builtin.
    for iteration in xrange(1, n_iter + 1):
        logging.info('#%d iter for co-training :' % iteration)
        unlabel_label = [-1] * unlabel_number
        # Py2: range() yields a mutable list of remaining unlabeled indices,
        # which training() shrinks as examples get pseudo-labeled.
        unlabel_index = range(0, unlabel_number)
        step = 0
        while len(unlabel_index) > 0:
            step += 1
            logging.info('co-training step #%d , remaining unlabel: %d'
                         % (step, len(unlabel_index)))
            # Model one pseudo-labels examples that are appended to model
            # two's pool, and vice versa (note the swapped argument order).
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = \
                training(model_one, model_two, unlabel, unlabel_label,
                         unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = \
                training(model_two, model_one, unlabel, unlabel_label,
                         unlabel_index, train_one, label_one)
            # NOTE(review): original indentation was lost; per-step evaluation
            # and checkpointing (filenames embed both iteration and step) is the
            # reading consistent with the %d_%d name pattern — confirm placement.
            evaluate.get_auc(model_one.predict_proba(validation)[:, 1])
            evaluate.get_auc(model_two.predict_proba(validation)[:, 1])
            evaluate.get_auc((model_one.predict_proba(validation)[:, 1]
                              + model_two.predict_proba(validation)[:, 1]) / 2.0)
            joblib.dump(model_one,
                        ROOT + '/result/model/model_one_%d_%d.pkl' % (iteration, step))
            joblib.dump(model_two,
                        ROOT + '/result/model/model_two_%d_%d.pkl' % (iteration, step))
            evaluate.output(uid,
                            (model_one.predict_proba(test)[:, 1]
                             + model_two.predict_proba(test)[:, 1]) / 2.0,
                            ROOT + '/result/predict/cotraining_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_one.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_one_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_two.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_two_%d_%d.csv' % (iteration, step))
sys.path.insert(0, '../..') import feature.splitvalue as split import model.evaluate as evaluate if __name__ == '__main__' : data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data () train = data[:train_number,:] validation = data[train_number:train_number+val_number,:] test = data[train_number+val_number:-unlabel_number,:] unlabel = data[-unlabel_number:,:] val_label = pd.read_csv ('../../data/val_cv_y.csv').y.values io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard') train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold (train, label, validation, test, unlabel) io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard_decompose') # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel) train_data, train_label, validation_data, validation_label, test, unlabel = io.grab ('../../data/data_standard') print 'training set:' , train_data.shape print 'validation set: ' , validation_data.shape print 'testing set', test.shape print 'unlabel set', unlabel.shape assert train_data.shape[0] == len (train_label) assert validation_data.shape[0] == len (validation_label) """