Example #1
						# random.shuffle(tmplist)

						# TODO: for each feature group, make one plot of all 
						#		the AUCs compared to each other
						# need: for each featurelist:
						#				- a name for it
						#				- the AUC + confidence interval

						for featurelist in itertools.combinations(featuregroup,pr):
						# for featurelist in tmplist[:10]:

							print "Features: \n\t%s"%'\n\t'.join(featurelist)

							m = getattr(am,modelname)(**thispdict)
							e = Experiment(model=m, feature_list=featurelist, 
										   dloader=dload, id=None,nan_handling=cfg['nan_handling'],
										   logFolder=cfg['logFolder'], looplog=cfg['looplog'],
										   summary_only=cfg['summary_only'])

							e.run_experiment()

							# tmp[tuple(featurelist)] = (e.auc, e.auc_train)
							# tmp.loc[rowIdx] = [e.auc,e.auc_train,featurelist]
							# if e.auc>e.auc_train:
							# 	print "THIS NO GOOD"
							# 	print ', '.join(featurelist)
							# 	print "++++++++++++++++++++"
							# rowIdx+=1
							
							# print "Best so far: "
							# bla = max(tmp.items(),key=lambda x: x[1][0])
							# print '\tModel: ', bla[0][0]
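
The TODO in this example asks for one plot per feature group comparing the AUCs of all feature-list combinations, each with a confidence interval. Below is a minimal sketch of such a plot, assuming the loop collects its results as (name, auc, ci_low, ci_high) tuples; the function name and the result format are illustrative assumptions, not part of the original code.

import matplotlib.pyplot as plt

def plot_auc_comparison(results, group_name):
    """Plot the AUC of every feature list in one group, with error bars.

    `results` is assumed to be a list of
    (featurelist_name, auc, ci_low, ci_high) tuples collected in the loop above.
    """
    names = [r[0] for r in results]
    aucs = [r[1] for r in results]
    # asymmetric error bars: distance from the AUC down to ci_low and up to ci_high
    lower_err = [r[1] - r[2] for r in results]
    upper_err = [r[3] - r[1] for r in results]

    fig, ax = plt.subplots(figsize=(8, 0.4 * len(results) + 1))
    ax.errorbar(aucs, range(len(results)), xerr=[lower_err, upper_err], fmt='o')
    ax.set_yticks(range(len(results)))
    ax.set_yticklabels(names)
    ax.set_xlabel('AUC')
    ax.set_title('AUC per feature list: %s' % group_name)
    fig.tight_layout()
    return fig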
Example #2
                        #		the AUCs compared to each other
                        # need: for each featurelist:
                        #				- a name for it
                        #				- the AUC + confidence interval

                        for featurelist in itertools.combinations(
                                featuregroup, pr):
                            # for featurelist in tmplist[:10]:

                            print "Features: \n\t%s" % '\n\t'.join(featurelist)

                            m = getattr(am, modelname)(**thispdict)
                            e = Experiment(model=m,
                                           feature_list=featurelist,
                                           dloader=dload,
                                           id=None,
                                           nan_handling=cfg['nan_handling'],
                                           logFolder=cfg['logFolder'],
                                           looplog=cfg['looplog'],
                                           summary_only=cfg['summary_only'])

                            e.run_experiment()

                            # tmp[tuple(featurelist)] = (e.auc, e.auc_train)
                            # tmp.loc[rowIdx] = [e.auc,e.auc_train,featurelist]
                            # if e.auc>e.auc_train:
                            # 	print "THIS NO GOOD"
                            # 	print ', '.join(featurelist)
                            # 	print "++++++++++++++++++++"
                            # rowIdx+=1

                            # print "Best so far: "
	# go over feature groups
	res = {}
	for featuregroup in cfg['features']:

		model = cfg['model']
		if (len(model.keys()) > 1) or (len(model.values()) > 1):
			raise IOError("A model is not specified correctly.")

		modelname = model.keys()[0]
		paramdict = model.values()[0]

		m = getattr(am,modelname)(**paramdict)

		e = Experiment(model=m, feature_list=featuregroup.values()[0], 
					   dloader=dload, id=None,nan_handling=cfg['nan_handling'],
					   logFolder=cfg['logFolder'], looplog=cfg['looplog'])

		e.apply_postprocessors()
		e.handle_NAs()

		# keep only the columns present in both train and test, i.e. their intersection
		test_cols = set(e.test_rows.columns.values)
		train_cols = set(e.train_rows.columns.values)
		predictor_cols = list(test_cols & train_cols)
		predictor_cols.remove(e.target_col)

		# take out the dataframe we'll be working with
		df = e.train_rows[predictor_cols + [e.target_col]]
		randIdxs = np.random.randint(df.shape[0],size=(df.shape[0],cfg['n_boot']))
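
`randIdxs` holds `cfg['n_boot']` columns of row indices drawn with replacement, the usual setup for a bootstrap. Below is a minimal sketch of how such an index matrix could be turned into a bootstrap confidence interval for the AUC mentioned in the TODO above, assuming per-row scores are available; `bootstrap_auc_ci` and the use of sklearn's `roc_auc_score` are illustrative assumptions, not part of the original code.

import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_auc_ci(y_true, y_score, rand_idxs, alpha=0.05):
    """Bootstrap confidence interval for the AUC.

    `rand_idxs` is assumed to have shape (n_samples, n_boot): one column of
    row indices, drawn with replacement, per bootstrap replicate.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    aucs = []
    for b in range(rand_idxs.shape[1]):
        idx = rand_idxs[:, b]
        # skip degenerate resamples that contain only one class
        if len(np.unique(y_true[idx])) < 2:
            continue
        aucs.append(roc_auc_score(y_true[idx], y_score[idx]))
    lo, hi = np.percentile(aucs, [100 * alpha / 2.0, 100 * (1 - alpha / 2.0)])
    return lo, hi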