def prep_data_for_sklearn(**kwargs):
    """Prepare BDT input data for scikit-learn training.

    Selects the feature functors for either the combinatorial-background
    or the random-slow-pion BDT (depending on ``comb_data``), forces the
    ``sklearn`` flag on, fetches the data and splits it into train/test
    partitions with a fixed random seed for reproducibility.

    :param kwargs: forwarded to ``get_bdt_data``; ``comb_data=True``
        switches to the combinatorial-background variable set.
    :returns: ``((train, test, train_labels, test_labels), features,
        spectators)`` where the label columns are boolean.
    """
    if kwargs.get('comb_data', False):
        features = [f.functor(f.particle) for f in gcm().comb_bkg_bdt_vars]
    else:
        features = [f.functor(f.particle) for f in gcm().rand_spi_bdt_vars]
    spectators = [f.functor(f.particle) for f in gcm().spectator_vars]
    kwargs.update({'sklearn': True})
    data = get_bdt_data(**kwargs)
    # Fixed seed keeps the split identical across calls (matches
    # just_the_labels, which uses the same seed).
    train, test = train_test_split(data, random_state=43)
    # BUGFIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the equivalent astype target.
    return (train, test,
            train['labels'].astype(bool),
            test['labels'].astype(bool)), features, spectators
def test_splitting():
    """Verify that train_test_split keeps rows aligned with their labels.

    Builds a frame of all-ones (label 1) stacked on a frame of all-zeros
    (label 0); after splitting, every row's cell values must equal its
    label in both partitions.
    """
    ones_frame = pandas.DataFrame(numpy.ones([10, 10]))
    zeros_frame = pandas.DataFrame(numpy.zeros([10, 10]))
    common_X = pandas.concat([ones_frame, zeros_frame], ignore_index=True)
    common_y = numpy.concatenate(
        [numpy.ones(len(ones_frame)), numpy.zeros(len(zeros_frame))])
    trainX, testX, trainY, testY = commonutils.train_test_split(
        common_X, common_y)
    # Check both partitions with the same row-vs-label consistency test.
    for part_X, part_y in ((trainX, trainY), (testX, testY)):
        for (_, row), label in zip(part_X.iterrows(), part_y):
            assert numpy.all(row == label), 'wrong data partition'
def test_splitting():
    """Check that splitting preserves the row/label correspondence.

    Signal rows are all ones and background rows all zeros, so a row is
    correctly partnered with its label exactly when every cell equals it.
    """
    n = 10
    signal_df = pandas.DataFrame(numpy.ones([n, n]))
    bg_df = pandas.DataFrame(numpy.zeros([n, n]))
    common_X = pandas.concat([signal_df, bg_df], ignore_index=True)
    labels = [numpy.ones(len(signal_df)), numpy.zeros(len(bg_df))]
    common_y = numpy.concatenate(labels)
    split = commonutils.train_test_split(common_X, common_y)
    trainX, testX, trainY, testY = split
    for (index, row), label in zip(trainX.iterrows(), trainY):
        assert numpy.all(row == label), 'wrong data partition'
    for (index, row), label in zip(testX.iterrows(), testY):
        assert numpy.all(row == label), 'wrong data partition'
def test_feature_splitter(size=2000):
    """Smoke-test FeatureSplitter against several baseline classifiers.

    Generates a two-class sample, discretizes ``column0`` into the range
    [-2, 2], and prints the test-set scores of FeatureSplitter, a plain
    random forest, DumbSplitter, ChainClassifiers and LDA for manual
    comparison.

    :param size: number of generated samples.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # BUGFIX: numpy.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the equivalent astype target.
    X['column0'] = numpy.clip(numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)
    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0', base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)
    print(splitter.score(testX, testY))
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    print(DumbSplitter('column0', base_estimator=RandomForestClassifier())
          .fit(trainX, trainY).score(testX, testY))
    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
def test_splitting(n_rows=10, n_columns=8):
    """Verify row/label alignment, column names and size after splitting.

    :param n_rows: rows per class in the synthetic sample.
    :param n_columns: feature columns in the synthetic sample.
    """
    column_names = ['col' + str(i) for i in range(n_columns)]
    ones_block = numpy.ones([n_rows, n_columns])
    zeros_block = numpy.zeros([n_rows, n_columns])
    signal_df = pandas.DataFrame(ones_block, columns=column_names)
    bg_df = pandas.DataFrame(zeros_block, columns=column_names)
    common_X = pandas.concat([signal_df, bg_df], ignore_index=True)
    common_y = numpy.concatenate(
        [numpy.ones(len(signal_df)), numpy.zeros(len(bg_df))])
    trainX, testX, trainY, testY = commonutils.train_test_split(
        common_X, common_y)
    # Every cell of a row equals its label, so this checks the pairing.
    for part_X, part_y in ((trainX, trainY), (testX, testY)):
        for (_, row), label in zip(part_X.iterrows(), part_y):
            assert numpy.all(row == label), 'wrong data partition'
    assert (trainX.columns == column_names).all(), 'new column names!'
    assert (testX.columns == column_names).all(), 'new column names!'
    assert len(trainX) + len(testX) == len(common_X), 'new size is strange'
def test_feature_splitter(size=2000):
    """Smoke-test FeatureSplitter and print baseline classifier scores.

    The generated ``column0`` is clipped to integer values in [-2, 2] so
    the splitter has a small discrete feature to branch on.

    :param size: number of generated samples.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # BUGFIX: numpy.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the equivalent astype target.
    X['column0'] = numpy.clip(
        numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)
    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0', base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)
    print(splitter.score(testX, testY))
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    print(
        DumbSplitter('column0', base_estimator=RandomForestClassifier()).fit(
            trainX, trainY).score(testX, testY))
    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
def test_splitting(n_rows=10, n_columns=8):
    """Check partition correctness, preserved columns and total size.

    :param n_rows: rows per class in the synthetic sample.
    :param n_columns: feature columns in the synthetic sample.
    """
    column_names = ['col' + str(i) for i in range(n_columns)]
    shape = [n_rows, n_columns]
    signal_df = pandas.DataFrame(numpy.ones(shape), columns=column_names)
    bg_df = pandas.DataFrame(numpy.zeros(shape), columns=column_names)
    common_X = pandas.concat([signal_df, bg_df], ignore_index=True)
    common_y = numpy.concatenate(
        [numpy.ones(len(signal_df)), numpy.zeros(len(bg_df))])
    split = commonutils.train_test_split(common_X, common_y)
    trainX, testX, trainY, testY = split
    for (index, row), label in zip(trainX.iterrows(), trainY):
        assert numpy.all(row == label), 'wrong data partition'
    for (index, row), label in zip(testX.iterrows(), testY):
        assert numpy.all(row == label), 'wrong data partition'
    # The split must neither rename columns nor drop/duplicate rows.
    assert (trainX.columns == column_names).all(), 'new column names!'
    assert (testX.columns == column_names).all(), 'new column names!'
    assert len(trainX) + len(testX) == len(common_X), 'new size is strange'
def just_the_labels(sw=False, comb_data=False):
    """Return only the label and weight columns of the train/test split.

    Uses the same fixed random seed as prep_data_for_sklearn so the
    partitions line up row-for-row with the full feature split.

    :param sw: forwarded to ``get_bdt_data``.
    :param comb_data: forwarded to ``get_bdt_data``.
    :returns: ``(train_labels_weights, test_labels_weights)`` DataFrames.
    """
    full_data = get_bdt_data(sw=sw, sklearn=True, comb_data=comb_data)
    train_part, test_part = train_test_split(full_data, random_state=43)
    wanted = ['labels', 'weights']
    return train_part[wanted], test_part[wanted]