def main(data_filename, stat_filename, max_iter, sample_rate, learn_rate,
         max_depth, split_points):
    """Train a model on a random two-thirds split of a data set.

    The data set is loaded from ``data_filename``, the configuration is
    echoed to stdout, and a tab-separated statistics file is created at
    ``stat_filename`` (truncating any existing file).  The open statistics
    file and the held-out ids are both handed to ``Model.train``.

    Args:
        data_filename: Path passed to ``DataSet`` to load the instances.
        stat_filename: Path of the statistics file to (over)write.
        max_iter: Forwarded to ``Model``; formatted with ``%d``.
        sample_rate: Forwarded to ``Model``; formatted with ``%f``.
        learn_rate: Forwarded to ``Model``; formatted with ``%f``.
        max_depth: Forwarded to ``Model``; formatted with ``%d``.
        split_points: Forwarded to ``Model``; formatted with ``%d``.
    """
    dataset = DataSet(data_filename)
    # Single-argument parenthesised form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(
        "Model parameters configuration:[data_file=%s,stat_file=%s,max_iter=%d,sample_rate=%f,learn_rate=%f,max_depth=%d,split_points=%d]"
        % (data_filename, stat_filename, max_iter, sample_rate, learn_rate,
           max_depth, split_points)
    )
    dataset.describe()

    # The context manager guarantees the statistics file is flushed and
    # closed even when training raises; previously the handle could leak.
    with open(stat_filename, "w") as stat_file:
        stat_file.write(
            "iteration\taverage loss in train data\tprediction accuracy on test data\taverage loss in test data\n"
        )
        model = Model(max_iter, sample_rate, learn_rate, max_depth, split_points)

        # Two thirds of the instance ids (rounded down) are drawn for
        # training; every remaining id becomes the held-out test set.
        train_data = sample(dataset.get_instances_idset(),
                            int(dataset.size() * 2.0 / 3.0))
        test_data = set(dataset.get_instances_idset()) - set(train_data)

        # The rows below the header are presumably written by train() itself
        # -- confirm against Model.train.
        model.train(dataset, train_data, stat_file, test_data)
        # A standalone evaluation call was already commented out in the
        # original; test_data is handed to train() instead:
        # model.test(dataset, test_data)
iter4 : train loss=0.123063 iter5 : train loss=0.087872 iter6 : train loss=0.065684 iter7 : train loss=0.049936 iter8 : train loss=0.041866 iter9 : train loss=0.035695 iter10 : train loss=0.030581 iter11 : train loss=0.027034 iter12 : train loss=0.024570 iter13 : train loss=0.019227 iter14 : train loss=0.015794 iter15 : train loss=0.013484 iter16 : train loss=0.010941 iter17 : train loss=0.009879 iter18 : train loss=0.008619 iter19 : train loss=0.007306 iter20 : train loss=0.005610 """ from data import DataSet from model import GBDT if __name__ == '__main__': data_file = './credit.data.csv' dateset = DataSet(data_file) gbdt = GBDT(max_iter=20, sample_rate=0.9, learn_rate=0.5, max_depth=7, loss_type='binary-classification') gbdt.fit(dateset, dateset.get_instances_idset())