Example No. 1
    def init_master_params(self, from_file=False, filepath=None):
        """
		Computes initial random gaussian values for master weights and biases
		Returns:
			(float array): Random gaussian values for neural network weights and biases
		"""
        if from_file:
            master_params = utils.load_array(filepath)
            if len(master_params) != self.params_size:
                err_msg = "Params from (" + filepath + ") does not match the network shape."
                raise ValueError(err_msg)
            return utils.load_array(filepath)
        master_params = []
        weights = np.random.normal(
            0, 1, self.config['input_size'] * self.config['n_nodes_per_layer'])
        master_params += list(weights)
        biases = np.random.normal(0, 1, self.config['n_nodes_per_layer'])
        master_params += list(biases)

        for i in range(self.config['n_hidden_layers']):
            weights = np.random.normal(
                0, 1, self.config['n_nodes_per_layer'] *
                self.config['n_nodes_per_layer'])
            master_params += list(weights)
            biases = np.random.normal(0, 1, self.config['n_nodes_per_layer'])
            master_params += list(biases)

        weights = np.random.normal(
            0, 1,
            self.config['n_nodes_per_layer'] * self.config['output_size'])
        master_params += list(weights)
        biases = np.random.normal(0, 1, self.config['output_size'])
        master_params += list(biases)
        return master_params
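The length check above only works if self.params_size equals the number of weights and biases produced by this layout. The original computation of params_size is not shown on this page; a minimal sketch of what it would have to be, assuming the same config keys used above:

def count_params(config):
    # Total number of weights and biases implied by the layout above:
    # input layer, n_hidden_layers hidden layers, and the output layer.
    n_in = config['input_size']
    n_hid = config['n_nodes_per_layer']
    n_out = config['output_size']
    total = n_in * n_hid + n_hid
    total += config['n_hidden_layers'] * (n_hid * n_hid + n_hid)
    total += n_hid * n_out + n_out
    return total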
Example No. 2
    def load_data(self,
                  serialized_data_folder='./',
                  path_to_vehicle_folder='vehicles/',
                  path_to_non_vehicle_folder='non-vehicles/'):

        path_to_X_dat_file = '{0}/{1}'.format(serialized_data_folder, 'X.dat')
        path_to_y_dat_file = '{0}/{1}'.format(serialized_data_folder, 'y.dat')
        X_dat_file_exists = os.path.exists(path_to_X_dat_file)
        y_dat_file_exists = os.path.exists(path_to_y_dat_file)

        if X_dat_file_exists and y_dat_file_exists:
            print('Loading from serialized ...')
            self.X = load_array(path_to_X_dat_file)
            self.y = load_array(path_to_y_dat_file)
            print('Done reading serialized arrays')
        else:
            print('Creating data from image folders')
            non_vehicle_class, vehicle_class = 0, 1
            non_vehicle_X, non_vehicle_y = self._get_X_y(
                path_to_non_vehicle_folder, non_vehicle_class)
            vehicle_X, vehicle_y = self._get_X_y(path_to_vehicle_folder,
                                                 vehicle_class)

            self.X = np.concatenate((non_vehicle_X, vehicle_X))
            self.y = np.concatenate((non_vehicle_y, vehicle_y))
            print(
                'Data created successfully, creating serialized numpy arrays')
            save_array(path_to_X_dat_file, self.X)
            save_array(path_to_y_dat_file, self.y)
            print('Done saving arrays')
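Throughout these examples, load_array and save_array are handed directories (the '*.bc' folders) as well as flat files. The utils module itself is not included on this page; a reasonable assumption, matching the bcolz-backed helpers popularized by the fast.ai notebooks, is sketched below.

import bcolz


def save_array(fname, arr):
    # Persist the array as a chunked, compressed bcolz carray rooted at fname.
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()


def load_array(fname):
    # Read the on-disk carray back into memory as a plain numpy array.
    return bcolz.open(fname)[:]

Note that several of the later examples instead call utils.load_array(f, key) with an open h5py.File, which is a different, HDF5-backed signature rather than the bcolz one sketched here.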
Example No. 3
def get_data(dataset_version):
    feature_names_comps = utils.load_array("%s/feature_names_comps" % dataset_version)
    feature_names_ab = utils.load_array("%s/feature_names_ab" % dataset_version)
    feature_names_ti = utils.load_array("%s/feature_names_ti" % dataset_version)
    X_comps = utils.load_array("%s/X_comps" % dataset_version)
    X_ab = utils.load_csr("%s/X_ab" % dataset_version)
    X_ti = utils.load_csr("%s/X_ti" % dataset_version)
    y = utils.load_array("%s/y" % dataset_version)
    return feature_names_comps, feature_names_ab, feature_names_ti, X_comps, X_ab, X_ti, y
Example No. 4
def get_data(dataset_version):
    feature_names_comps = utils.load_array("%s/feature_names_comps" %
                                           dataset_version)
    feature_names_ab = utils.load_array("%s/feature_names_ab" %
                                        dataset_version)
    feature_names_ti = utils.load_array("%s/feature_names_ti" %
                                        dataset_version)
    X_comps = utils.load_array("%s/X_comps" % dataset_version)
    X_ab = utils.load_csr("%s/X_ab" % dataset_version)
    X_ti = utils.load_csr("%s/X_ti" % dataset_version)
    y = utils.load_array("%s/y" % dataset_version)
    return feature_names_comps, feature_names_ab, feature_names_ti, X_comps, X_ab, X_ti, y
Example No. 5
def submit(filename):
    df_test = pd.read_csv(TEST_LIST_FILE)
    preds = load_array(PRED_FILE)
    #preds = load_array(PRED_WEATHER)
    threshold = load_array(THRESHOLD_FILE_ENS).tolist()
    #threshold = 0.18
    print(threshold)

    for i, pred in enumerate(preds):
        tags = get_multi_classes(pred, classes, threshold)
        df_test.loc[i, 'tags'] = tags

    df_test.to_csv(RESULT_DIR + '/' + filename, index=False)
    print(df_test.head())
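get_multi_classes is not defined in this snippet. Given how it is called above (per-image scores, the class list, and per-class thresholds) and the space-separated tag format written to the CSV, a hypothetical sketch would be:

def get_multi_classes(pred, classes, threshold):
    # Hypothetical helper: keep every class whose score clears its threshold
    # and join the names into the space-separated tag string used above.
    if not isinstance(threshold, (list, tuple)):
        threshold = [threshold] * len(classes)
    picked = [c for c, p, t in zip(classes, pred, threshold) if p > t]
    return ' '.join(picked)

If threshold is the per-class list produced by find_best_threshold below, each class gets its own cut; a single float applies the same cut everywhere.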
Example No. 6
def find_best_weather():
    thr = load_array(THRESHOLD_FILE_ENS)
    labels = load_array(VAL_LABELS)
    preds = load_array(PRED_VAL)

    print(labels.shape)
    weather = preds[:, 0:4]
    y = labels[0, :, 0:4]
    print(y.shape)
    print(weather.shape)
    thr = thr[0:4]

    def mf(p):
        p2 = np.zeros_like(p)
        for i in range(4):
            p2[:, i] = (p[:, i] > thr[i]).astype(int)
        score1 = fbeta_score(y, p2, beta=2, average='samples')
        return score1

    base_score = mf(
        weather)  #fbeta_score(y, weather, beta=2, average='samples')
    print('base score:{}'.format(base_score))
    max_score = base_score
    d = 0.5
    best_d = 0.5
    best_w = weather
    while d < 1:
        w = get_one_weather(weather, thr, d)
        score = mf(w)  #fbeta_score(y, w, beta=2, average='samples')
        print('score{}, d:{}'.format(score, d))
        if score > max_score:
            max_score = score
            best_d = d
            best_w = w
        d += 0.1

    print('best d:{}'.format(best_d))
    w1 = force_one_weather(weather, thr)
    score1 = mf(w1)
    print('force one weather score:{}'.format(score1))

    if max_score > base_score + 0.00001:
        test_preds = load_array(PRED_FILE)
        test_w = test_preds[:, 0:4]
        w = get_one_weather(test_w, thr, best_d)
        test_preds[:, 0:4] = w
        #preds[:, 0:4] = best_w

        save_array(PRED_WEATHER, test_preds)
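get_one_weather and force_one_weather are not shown on this page. From their use above (exactly one of the first four "weather" classes should survive thresholding), a hypothetical sketch of force_one_weather would be:

import numpy as np

def force_one_weather(weather, thr):
    # Hypothetical helper: for each row keep only the top-scoring weather
    # class, lifting it above its threshold so exactly one label survives
    # the (p > thr) binarization in mf() above.
    forced = np.zeros_like(weather)
    best = np.argmax(weather, axis=1)
    for row, col in enumerate(best):
        forced[row, col] = max(weather[row, col], thr[col] + 1e-6)
    return forced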
Example No. 7
def find_best_threshold():
    preds = load_array(PRED_VAL)
    labels = load_array(VAL_LABELS)
    print(np.array(labels).shape)
    for i in range(1, len(labels)):
        for j in range(len(labels[i])):
            for k in range(len(labels[i][j])):
                if labels[i][j][k] != labels[i - 1][j][k]:
                    print('error, check labels failed')
                    exit()

    x = optimise_f2_thresholds(labels[0], preds)
    print('best threshold:')
    print(x)
    save_array(THRESHOLD_FILE_ENS, x)
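optimise_f2_thresholds is not included in this snippet; the greedy per-class threshold search shared widely during the Planet Kaggle competition is the likely shape of it, sketched here under that assumption:

import numpy as np
from sklearn.metrics import fbeta_score


def optimise_f2_thresholds(y, p, resolution=100, verbose=True):
    # Greedy coordinate search: sweep one class threshold at a time and keep
    # the value that maximizes the sample-averaged F2 score.
    n_classes = y.shape[1]

    def mf(x):
        p2 = np.zeros_like(p)
        for i in range(n_classes):
            p2[:, i] = (p[:, i] > x[i]).astype(int)
        return fbeta_score(y, p2, beta=2, average='samples')

    x = [0.2] * n_classes
    for i in range(n_classes):
        best_t, best_score = 0.2, 0.0
        for step in range(resolution):
            t = step / float(resolution)
            x[i] = t
            score = mf(x)
            if score > best_score:
                best_t, best_score = t, score
        x[i] = best_t
        if verbose:
            print(i, best_t, best_score)
    return x

Example No. 8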
    def __init__(self, w):
        '''
        Generates random messages.

        # Parameters
        -------------
        w: np.array
            Weights for the weighted random draw over message versions.

        # Returns
        -------------
        sample: integer
            A single random message each time the message function is called.
        '''

        self.w = w
        self.N = len(w)
        # We assign each message version a unique ID from 0 to N-1
        self.setN = np.arange(self.N)
        self.M = sum(w)
        # w / self.M can give an array of 0s for integer weights (floor
        # division under Python 2), so use numpy's true division
        self.norm_w = np.true_divide(w, self.M)
        # We draw all M messages from the N versions once, up front
        if self.M < 6:
            self.shuffle = np.random.choice(self.setN, self.M, p=self.norm_w)
        else:
            # Use bcolz to store arrays larger than 1 MB on disk to speed up computation
            utils.save_array("shuffle.bc", np.random.choice(self.setN, self.M, p=self.norm_w))
            self.shuffle = utils.load_array("shuffle.bc")
        # Every time we call the message function that takes a random message,
        # we increase current_id to point at the next message
        self.current_id = 0
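Only __init__ is shown for this class. The comments refer to a message function that consumes the pre-shuffled IDs one at a time; the following is purely a hypothetical sketch of such a method, to illustrate how shuffle and current_id fit together:

    def message(self):
        # Hypothetical: return the next pre-shuffled message ID and advance
        # the cursor, wrapping around once all M messages have been used.
        sample = self.shuffle[self.current_id]
        self.current_id = (self.current_id + 1) % self.M
        return sample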
Example No. 9
def submit(filename, clip, use_weight=False):
    #filenames = [f.split('/')[-1] for f, i in dsets.imgs]
    #filenames = get_stage1_test_loader('res50').filenames
    filenames = load_array(TEST_FILE_NAMES)
    if use_weight:
        preds = load_array(PRED_FILE_WEIGHTED)
    else:
        preds = load_array(PRED_FILE)
    if clip > 0.9999:
        subm = np.array(preds)
    else:
        subm = do_clip(preds, clip)
    subm_name = RESULT_DIR + '/' + filename
    submission = pd.DataFrame(subm, columns=dset_classes)
    submission.insert(0, 'image_name', filenames)
    print(submission.head())
    submission.to_csv(subm_name, index=False)
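do_clip is not defined here. In the fast.ai-style submission helpers this resembles, it clips predicted probabilities away from 0 and 1 so that a single over-confident mistake cannot dominate the log loss; a sketch under that assumption:

import numpy as np

def do_clip(arr, mx):
    # Clip every probability into [(1 - mx) / (n_classes - 1), mx]; rows
    # stay close to summing to 1 while extreme confidence is capped.
    arr = np.asarray(arr)
    return np.clip(arr, (1.0 - mx) / float(arr.shape[1] - 1), mx)

With clip = 0.93 and 10 classes, for example, every probability is squeezed into roughly [0.0078, 0.93].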
Example No. 10
def submit(filename):
    #filenames = [f.split('/')[-1] for f, i in dsets.imgs]
    #filenames = get_stage1_test_loader('res50').filenames
    preds = load_array(PRED_FILE)
    print(preds[:100])
    subm_name = RESULT_DIR + '/' + filename
    df = pd.read_csv(data_dir + '/sample_submission.csv')
    df['invasive'] = preds
    print(df.head())
    df.to_csv(subm_name, index=False)
Example No. 11
    def load_precomputed_conv_models(self):
        '''
        Loads precomputed conv_model outputs and initializes
        self.train_precomputed and self.val_precomputed.

        :return: self
        '''
        print("loading precomputed conv. outputs...")

        fName1 = "precomputed_trn_features." + self.runID + ".h5"
        fName2 = "precomputed_val_features." + self.runID + ".h5"

        self.train_precomputed = load_array(fName1)
        self.val_precomputed = load_array(fName2)

        # since we're loading precomputed outputs from the conv_model,
        # set this flag to true.
        self.use_precomputed_conv_output = True

        print("done...")
        return self
Example No. 12
def submit(filename, clip):
    filenames = [f.split('/')[-1] for f, i in dsets.imgs]

    preds = load_array(PRED_FILE)
    subm = do_clip(preds, clip)
    #classes = dsets['train'].classes
    subm_name = RESULT_DIR + '/' + filename
    submission = pd.DataFrame(subm, columns=dset_classes)
    submission.insert(0, 'image_name', filenames)
    print(submission.head())
    submission.to_csv(subm_name, index=False)
Example No. 13
def load_data(self):
    self.file = h5py.File(self.input, "r")
    print("[MVA_HELPER] INPUT FILE: %s" % self.input)
    print("[MVA_HELPER] INPUT FILE KEYS: ", self.file.keys())

    self.data = {}
    self.data["feature_names"] = utils.load_array(self.file, "feature_names")
    for split in ["train", "test", "data"]:
        self.data[split] = {}
        for aux in default_branches + self.additional_branches:
            self.data[split][aux] = utils.load_array(self.file,
                                                     "%s_%s" % (aux, split))
        self.data[split]["label"] = utils.load_array(self.file,
                                                     "%s_%s" % ("label", split))
        self.data[split]["global"] = utils.load_array(
            self.file, "%s_%s" % ("global", split))
        self.data[split]["objects"] = utils.load_array(
            self.file, "%s_%s" % ("objects", split))

        self.data[split]["n_events_raw"] = len(self.data[split]["label"])
        for check in default_branches + ["global"] + self.additional_branches:
            if len(self.data[split][check]) != self.data[split]["n_events_raw"]:
                print(
                    "[MVA_HELPER] WARNING -- %s set: entry %s does not have the same number of events as the label (%d vs. %d)"
                    % (split, check, len(self.data[split][check]),
                       self.data[split]["n_events_raw"]))
        self.data[split]["n_events"] = numpy.sum(self.data[split]["evt_weight_"])

        print(
            "[MVA_HELPER] LOADING DATA -- %s set: %d raw events with a normalization of %.3f"
            % (split, self.data[split]["n_events_raw"],
               self.data[split]["n_events"]))

    self.data["train"]["train_id"] = numpy.zeros(
        len(self.data["train"]["label"]))
    self.data["test"]["train_id"] = numpy.ones(len(self.data["test"]["label"]))
    self.data["data"]["train_id"] = numpy.ones(len(self.data["data"]["label"]))
Example No. 14
def submit(filename, clip):
    filenames = [f.split('/')[-1] for f, i in dsets.imgs]

    preds = load_array(PRED_FILE)
    if clip > 0.9999:
        subm = np.array(preds)
    else:
        subm = do_clip(preds, clip)
        #up_clip(subm, clip, 0.8)
        #print(subm[:10])
    subm_name = RESULT_DIR + '/' + filename
    submission = pd.DataFrame(subm, columns=dset_classes)
    submission.insert(0, 'image_name', filenames)
    print(submission.head())
    submission.to_csv(subm_name, index=False)
Example No. 15
def submit(filename):
    preds_raw = []
    pred_files = glob.glob(settings.PREDICT_DIR + os.sep + '*.dat')
    for pred_file in pred_files:
        print("loading predictions: % s" % pred_file)
        preds_raw.append(load_array(pred_file))
    preds = np.mean(preds_raw, axis=0)

    print(preds[:100])
    subm_name = settings.RESULT_DIR + os.sep + filename
    df = pd.read_csv(settings.DATA_DIR + os.sep + 'sample_submission.csv')
    df['invasive'] = preds
    print(df.head())
    df.to_csv(subm_name, index=False)

    preds2 = (preds > 0.5).astype(int)
    df2 = pd.read_csv(settings.DATA_DIR + os.sep + 'sample_submission.csv')
    df2['invasive'] = preds2
    df2.to_csv(subm_name + '01', index=False)
Example No. 16
use_ti = True
test_percent = .25
scoring = 'precision'

(feature_names_comps, feature_names_ab, feature_names_ti, X_comps, X_ab, X_ti,
 y) = get_data(dataset_version)

# <codecell>

############################################################################
# extract features via chi2
# assemble X_train, X_test, and feature_names
print("assembling features")
t0 = time()

fnab = utils.load_array('data/feature_names_ab')
fnti = utils.load_array('data/feature_names_ti')
fn = np.append(fnab, fnti)
X_train = utils.load_coo('data/X_train').todense()
y_train = utils.load_array('data/y_train')
X_test = utils.load_coo('data/X_test').todense()
y_test = utils.load_array('data/y_test')
print("done assembling features in %fs" % (time() - t0))

# <codecell>

fitted = []


def grid_search(estimator):
    # try:
Example No. 17
    with open(HELIOS, 'r') as f:
        config = json.load(f)

    logging.basicConfig(filename='../log/forest_gater_par.txt',
                        level=logging.INFO)
    start_time = time.time()
    logging.info('##### NEW EXPERIMENT_' + str(start_time) + '_#####')
    logging.info(HELIOS)

    logging.info(json.dumps(config, indent=4))
    TRAIN = PATH + 'train/100/'
    TEST = PATH + 'test/100/'
    VAL = PATH + 'val/100/'

    X_train = load_array(TRAIN + 'X_train.bc/')
    y_train = load_array(TRAIN + 'y_train.bc/')
    X_test = load_array(TEST + 'X_test.bc/')
    y_test = load_array(TEST + 'y_test.bc/')
    X_val = load_array(VAL + 'X_val.bc/')
    y_val = load_array(VAL + 'y_val.bc/')

    y_train[y_train == 0] = -1
    y_test[y_test == 0] = -1
    y_val[y_val == 0] = -1

    logging.info('Training Size ' + str(X_train.shape[0]))
    logging.info('Testing Size ' + str(X_test.shape[0]))
    logging.info('Val Size ' + str(X_val.shape[0]))

    moe = MOE()
Example No. 18
def train_bdt(config):
    # Trains BDT with given hyperparams and returns max Z_A (as calculated on bkg MC), requiring at least 4 signal events
    if config["invert_test_and_train"]:
        config["input_file"] = config["input_file_2"]
    else:
        config["input_file"] = config["input_file_1"]
    f = h5py.File(config["input_file"], "r")

    feature_names = utils.load_array(f, 'feature_names')
    training_feature_names = utils.load_array(f, 'training_feature_names')

    print(("Training with the following features: ", training_feature_names))

    #if config["invert_test_and_train"]:
    #print "Inverting test and train splits"
    #if config["sideband"]:
    #  print "Not yet implemented how to handle inverting the test/train set when training on data sidebands, exiting"
    #  return -1

    #global_features = utils.load_array(f, 'global_validation')
    #label = utils.load_array(f, 'label_validation')
    #multi_label = utils.load_array(f, 'multi_label_validation')
    #weights = utils.load_array(f, 'weights_validation')
    #mass = utils.load_array(f, 'mass_validation')

    #global_features_validation = utils.load_array(f, 'global')
    #label_validation = utils.load_array(f, 'label')
    #multi_label_validation = utils.load_array(f, 'multi_label')
    #weights_validation = utils.load_array(f, 'weights')
    #mass_validation = utils.load_array(f, 'mass')

    #else:
    global_features = utils.load_array(f, 'global')
    label = utils.load_array(f, 'label')
    multi_label = utils.load_array(f, 'multi_label')
    weights = utils.load_array(f, 'weights')
    mass = utils.load_array(f, 'mass')

    global_features_validation = utils.load_array(f, 'global_validation')
    label_validation = utils.load_array(f, 'label_validation')
    multi_label_validation = utils.load_array(f, 'multi_label_validation')
    weights_validation = utils.load_array(f, 'weights_validation')
    mass_validation = utils.load_array(f, 'mass_validation')

    if config["sideband"]:
        global_features = utils.load_array(f, 'global_data_sideband')
        label = utils.load_array(f, 'label_data_sideband')
        multi_label = utils.load_array(f, 'multi_label_data_sideband')
        weights = utils.load_array(f, 'weights_data_sideband')
        mass = utils.load_array(f, 'mass_data_sideband')

    global_features_data = utils.load_array(f, 'global_data')
    label_data = utils.load_array(f, 'label_data')
    multi_label_data = utils.load_array(f, 'multi_label_data')
    weights_data = utils.load_array(f, 'weights_data')
    mass_data = utils.load_array(f, 'mass_data')

    print((global_features.shape))
    print((label.shape))
    print((weights.shape))

    print((global_features_validation.shape))
    print((label_validation.shape))
    print((weights_validation.shape))

    print((global_features_data.shape))
    print((label_data.shape))
    print((weights_data.shape))

    x_train, y_train, y_train_multi, weights_train = global_features, label, multi_label, weights
    x_test, y_test, y_test_multi, weights_test = global_features_validation, label_validation, multi_label_validation, weights_validation

    X_train = pandas.DataFrame(data=x_train, columns=training_feature_names)
    X_test = pandas.DataFrame(data=x_test, columns=training_feature_names)
    X_data = pandas.DataFrame(data=global_features_data,
                              columns=training_feature_names)

    if config["multiclassifier"]:
        Y_train = y_train_multi
        Y_test = y_test_multi
    else:
        Y_train = y_train
        Y_test = y_test

    sum_neg_weights = utils.sum_of_weights_v2(weights_train, label, 0)
    sum_pos_weights = utils.sum_of_weights_v2(weights_train, label, 1)

    print((sum_pos_weights, sum_neg_weights))

    d_train = xgboost.DMatrix(X_train, label=Y_train, weight=weights_train)
    d_test = xgboost.DMatrix(X_test, label=Y_test)
    d_data = xgboost.DMatrix(X_data)

    param = {
        'max_depth': config["max_depth"],
        'eta': config["eta"],
        'subsample': config["subsample"],
        'colsample_bytree': config["colsample_bytree"],
        'min_child_weight': config["min_child_weight"],
        'gamma': config["gamma"],
        'reg_alpha': config["reg_alpha"],
        'reg_lambda': config["reg_lambda"],
        'scale_pos_weight': sum_neg_weights / sum_pos_weights,
        'objective': 'binary:logistic',
        'nthread': 16,
    }

    if config["multiclassifier"]:
        param["num_class"] = config["n_class"]
        param["objective"] = "multi:softprob"
        param["scale_pos_weight"] = 1

    evallist = [(d_train, 'train'), (d_test, 'test')]
    progress = {}

    n_round = config["n_round"]
    print((param, n_round))

    # train
    bdt = xgboost.train(param,
                        d_train,
                        n_round,
                        evallist,
                        evals_result=progress)

    bdt.save_model(config["tag"] + "_bdt.xgb")
    model = bdt.get_dump()

    input_variables = []
    for name in feature_names:
        input_variables.append((name, 'F'))
    #tmva_utils.convert_model(model, input_variables = input_variables, output_xml = config["tag"] + '_bdt.xml')

    # predict
    pred_train = bdt.predict(d_train, output_margin=config["multiclassifier"])
    pred_test = bdt.predict(d_test, output_margin=config["multiclassifier"])
    pred_data = bdt.predict(d_data, output_margin=config["multiclassifier"])

    fpr_train, tpr_train, thresh_train = metrics.roc_curve(
        y_train, pred_train, pos_label=1, sample_weight=weights_train)
    fpr_test, tpr_test, thresh_test = metrics.roc_curve(
        y_test, pred_test, pos_label=1, sample_weight=weights_test)

    auc_train, auc_train_unc = utils.auc_and_unc(y_train, pred_train,
                                                 weights_train, 100)
    auc_test, auc_test_unc = utils.auc_and_unc(y_test, pred_test, weights_test,
                                               100)

    #auc_train = metrics.auc(fpr_train, tpr_train, reorder = True)
    #auc_test  = metrics.auc(fpr_test , tpr_test , reorder = True)

    print(("Training AUC: %.3f" % auc_train))
    print(("Testing  AUC: %.3f" % auc_test))

    # estimate z_a w/at least 4 signal events
    n_quantiles = 25
    signal_mva_scores = {
        "bdt_score": ks_test.logical_vector(pred_test, y_test, 1)
    }
    bkg_mva_scores = {
        "bdt_score": ks_test.logical_vector(pred_test, y_test, 0)
    }
    data_mva_scores = {"bdt_score": pred_data}

    signal_mass = ks_test.logical_vector(mass_validation, y_test, 1)
    bkg_mass = ks_test.logical_vector(mass_validation, y_test, 0)

    signal_weights = ks_test.logical_vector(weights_validation, y_test, 1)
    bkg_weights = ks_test.logical_vector(weights_validation, y_test, 0)

    optimization_vars = config["optimization_vars"].split(
        ",") if config["optimization_vars"] else []
    for var in optimization_vars:
        signal_mva_scores[var] = ks_test.logical_vector(
            utils.load_array(f, var + '_validation'), y_test, 1)
        bkg_mva_scores[var] = ks_test.logical_vector(
            utils.load_array(f, var + '_validation'), y_test, 0)
        data_mva_scores[var] = utils.load_array(f, var + '_data')

    signal_events = {
        "mass": signal_mass,
        "weights": signal_weights,
        "mva_score": signal_mva_scores
    }
    bkg_events = {
        "mass": bkg_mass,
        "weights": bkg_weights,
        "mva_score": bkg_mva_scores
    }
    data_events = {
        "mass": mass_data,
        "weights": weights_data,
        "mva_score": data_mva_scores
    }

    za, za_unc, s, b, sigma_eff = significance_utils.za_scores(
        n_quantiles, signal_events, bkg_events, False)
    za_data, za_unc_data, s_data, b_data, sigma_eff_data = significance_utils.za_scores(
        n_quantiles, signal_events, data_events, True)
    za = numpy.asarray(za)
    za_data = numpy.asarray(za_data)

    if numpy.all(za == 0) or numpy.all(za_data == 0):
        return 0.0, 0.0, 0.0, 0.0

    max_za_mc = numpy.max(za[numpy.where(numpy.asarray(s) >= 4.)])
    max_za_data = numpy.max(za_data[numpy.where(numpy.asarray(s_data) >= 4.)])

    max_za_mc, max_za_mc_idx = utils.find_nearest(za, max_za_mc)
    max_za_data, max_za_data_idx = utils.find_nearest(za_data, max_za_data)

    max_za_mc_unc = za_unc[max_za_mc_idx]
    max_za_data_unc = za_unc_data[max_za_data_idx]

    print(("Max Z_A on MC:   %.4f +/- %.4f" % (max_za_mc, max_za_mc_unc)))
    print(("Max Z_A on data: %.4f +/- %.4f" % (max_za_data, max_za_data_unc)))

    return max_za_mc, max_za_mc_unc, max_za_data, max_za_data_unc, auc_train, auc_train_unc, auc_test, auc_test_unc
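significance_utils.za_scores is not part of this snippet. The Z_A quoted in the comment and the printouts is conventionally the Asimov estimate of the median discovery significance for s expected signal events on top of b expected background events; a minimal sketch of that figure of merit:

import math

def z_a(s, b):
    # Asimov significance: Z_A = sqrt(2 * ((s + b) * ln(1 + s / b) - s)).
    # Assumed here as the metric behind za_scores; returns 0 for empty bins.
    if s <= 0 or b <= 0:
        return 0.0
    return math.sqrt(2.0 * ((s + b) * math.log(1.0 + s / b) - s))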
Example No. 19
    for i in range(0, len(labels)):

        if labels[i]:
            used.add(i)
            used_in_new.add(counter)
            data_new[counter] = data[i]
            counter += 1

    #reorder the array with labels
    for i in range(0, counter):
        labels_new[i] = True

    #reorder the array with images
    filler_new = 0

    for i in range(counter, len(data)):

        if i not in used_in_new:

            data_new[i] = data[filler_new]
            filler_new += 1

    #np.save("data/PV/balanced/PV_split.npy")
    #np.save("data/PV/balanced/labels.npy", labels_new)


data = utils.load_array("data/PV/X_cells_only.bc")

labels = np.load("data/PV/labels.npy")

create_dataset(labels, data)
Example No. 20
# In[157]:

utils.save_array(data_path + 'train/train.bc', train.values)

# In[158]:

utils.save_array(data_path + 'train/meta_train.bc', meta.values)

# ## Further Feature Engineering

# After converting the 'csv_to_hdf5.py' functionality to pandas, I saved that array and then simply constructed the rest of the features as specified in the paper using pandas. I didn't bother looking at how the author did it, as it was extremely obtuse and involved the fuel module.

# In[424]:

train = pd.DataFrame(utils.load_array(data_path + 'train/train.bc'),
                     columns=[
                         'TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND',
                         'TAXI_ID', 'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA',
                         'POLYLINE', 'LATITUDE', 'LONGITUDE'
                     ])

# In[425]:

train.head()

# The paper discusses how many categorical variables there are per category. The following all check out.

# In[426]:

train['ORIGIN_CALL'].max()
Example No. 21
    num2 = int(two[0].split('/')[-1].split('.')[0])
    return num1 - num2


dsets = datasets.ImageFolder(test_dir, data_transforms['test'])
#dsets.imgs = sorted(dsets.imgs, key=cmp_to_key(mycmp))
dsets.imgs = sorted(dsets.imgs)

dsetsv3 = datasets.ImageFolder(test_dir, data_transforms['testv3'])
#dsetsv3.imgs = sorted(dsetsv3.imgs, key=cmp_to_key(mycmp))
dsetsv3.imgs = sorted(dsetsv3.imgs)

print(dsets.imgs[:5])
print(dsetsv3.imgs[:5])

dset_classes = load_array(CLASSES_FILE)
print(dset_classes)

test_loader = torch.utils.data.DataLoader(dsets,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=4)

test_v3_loader = torch.utils.data.DataLoader(dsetsv3,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=4)

use_gpu = torch.cuda.is_available()

Example No. 22
#!/usr/bin/env python
import numpy as np
from utils import load_array

data_arry = [
    'part-time-job', 'full-time-job', 'hourly-wage', 'salary',
    'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed',
    "licence-needed", '1-year-experience-needed',
    '2-4-years-experience-needed', '5-plus-years-experience-needed',
    'supervising-job'
]

if __name__ == '__main__':
    sub_data = load_array('../save_data/save_array.bc')
    labels = [['tags']]
    for k1, v1 in enumerate(sub_data):
        each_row = []
        for k2, v2 in enumerate(v1):
            if v2 == 1:
                each_row.append(data_arry[k2])

        labels.append(each_row)

    with open("../submissions/submit.tsv", 'w') as res_file:
        for r in labels:
            res_file.write(' '.join(r) + "\n")
Example No. 23
    def getPretrainDataPerFolder(self, data):
        x = utils.load_array(self.baseFolder + data.pretrain[0])
        y = utils.load_array(self.baseFolder + data.pretrain[1])
        f = utils.load_array(self.baseFolder + data.pretrain[2])
        return [x, y, f]
Example No. 24
def load_pred_data():
    preds = load_array(RESULTS_DIR + '/test_preds')
    filenames = load_array(RESULTS_DIR + '/filenames')
    return filenames, preds
Example No. 25
test_percent = .25
scoring = 'precision'

(feature_names_comps, feature_names_ab,
     feature_names_ti, X_comps, X_ab, X_ti, y) = get_data(dataset_version)


# <codecell>

############################################################################
# extract features via chi2
# assemble X_train, X_test, and feature_names
print("assembling features")
t0 = time()

fnab = utils.load_array('data/feature_names_ab')
fnti = utils.load_array('data/feature_names_ti')
fn = np.append(fnab, fnti)
X_train = utils.load_coo('data/X_train').todense()
y_train = utils.load_array('data/y_train')
X_test = utils.load_coo('data/X_test').todense()
y_test = utils.load_array('data/y_test')
print("done assembling features in %fs" % (time() - t0))

# <codecell>


fitted = []

def grid_search(estimator):
    # try:
Example No. 26
from vgg16 import Vgg16
vgg = Vgg16()
model = vgg.model

# Use batch size of 1 since we're just doing preprocessing on the CPU
val_batches = get_batches(path+'valid', shuffle=False, batch_size=1)
batches = get_batches(path+'train', shuffle=False, batch_size=1)

val_data = get_data(path+'valid')

trn_data = get_data(path+'train')

save_array(model_path+'train_data.bc', trn_data)
save_array(model_path+'valid_data.bc', val_data)

trn_data = load_array(model_path+'train_data.bc')
val_data = load_array(model_path+'valid_data.bc')

val_classes = val_batches.classes
trn_classes = batches.classes
val_labels = onehot(val_classes)
trn_labels = onehot(trn_classes)

# trn_features = model.predict(trn_data, batch_size=batch_size)
# val_features = model.predict(val_data, batch_size=batch_size)
#
# save_array(model_path+'train_lastlayer_features.bc', trn_features)
# save_array(model_path+'valid_lastlayer_features.bc', val_features)
#
# trn_features = load_array(model_path+'train_lastlayer_features.bc')
# val_features = load_array(model_path+'valid_lastlayer_features.bc')
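onehot is another helper from the same utility module as save_array and is not reproduced here. A minimal sketch consistent with how trn_labels and val_labels are built from the integer class arrays above:

import numpy as np

def onehot(x):
    # Turn a vector of integer class indices into a dense one-hot matrix.
    x = np.asarray(x).ravel().astype(int)
    return np.eye(int(x.max()) + 1)[x]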
Example No. 27
def train_bdt(config, invert=False):
    results = {}

    args = config["args"]
    ### Read features ###
    if not invert:
        f = h5py.File(args.input.replace(".hdf5", "") + ".hdf5", "r")
    else:
        f = h5py.File(args.input_invert.replace(".hdf5", "") + ".hdf5", "r")

    feature_names = utils.load_array(f, 'feature_names')
    training_feature_names = utils.load_array(f, 'training_feature_names')

    print(training_feature_names)

    global_features = utils.load_array(f, 'global')
    global_dnn_features = utils.load_array(f, 'global_dnn')
    label = utils.load_array(f, 'label')
    multi_label = utils.load_array(f, 'multi_label')
    weights = utils.load_array(f, 'weights')
    mass = utils.load_array(f, 'mass')
    njets = utils.load_array(f, 'njets')
    lead_sigmaEtoE = utils.load_array(f, 'lead_sigmaEtoE')
    sublead_sigmaEtoE = utils.load_array(f, 'sublead_sigmaEtoE')
    signal_mass_label = utils.load_array(f, 'signal_mass_label')
    signal_mass_category = utils.load_array(f, 'signal_mass_category')
    tth_2017_reference_mva = utils.load_array(f, 'tth_2017_reference_mva')
    evt = utils.load_array(f, 'evt')
    run = utils.load_array(f, 'run')
    lumi = utils.load_array(f, 'lumi')
    process_id = utils.load_array(f, 'process_id')
    year = utils.load_array(f, 'year')
    #objects = utils.load_array(f, 'objects')
    tth_runII_mva = utils.load_array(f, 'tth_runII_mva')

    if args.sideband:
        global_features = utils.load_array(f, 'global_data_sideband')
        label = utils.load_array(f, 'label_data_sideband')
        multi_label = utils.load_array(f, 'multi_label_data_sideband')
        weights = utils.load_array(f, 'weights_data_sideband')
        mass = utils.load_array(f, 'mass_data_sideband')
        #lead_sigmaEtoE = utils.load_array(f, 'lead_sigmaEtoE_data_sideband')
        #sublead_sigmaEtoE = utils.load_array(f, 'sublead_sigmaEtoE_data_sideband')

    global_features_validation = utils.load_array(f, 'global_validation')
    global_dnn_features_validation = utils.load_array(f,
                                                      'global_dnn_validation')
    label_validation = utils.load_array(f, 'label_validation')
    multi_label_validation = utils.load_array(f, 'multi_label_validation')
    weights_validation = utils.load_array(f, 'weights_validation')
    mass_validation = utils.load_array(f, 'mass_validation')
    njets_validation = utils.load_array(f, 'njets_validation')
    signal_mass_label_validation = utils.load_array(
        f, 'signal_mass_label_validation')
    signal_mass_category_validation = utils.load_array(
        f, 'signal_mass_category_validation')
    tth_2017_reference_mva_validation = utils.load_array(
        f, 'tth_2017_reference_mva_validation')
    evt_validation = utils.load_array(f, 'evt_validation')
    run_validation = utils.load_array(f, 'run_validation')
    lumi_validation = utils.load_array(f, 'lumi_validation')
    process_id_validation = utils.load_array(f, 'process_id_validation')
    year_validation = utils.load_array(f, 'year_validation')
    #objects_validation = utils.load_array(f, 'objects_validation')
    tth_runII_mva_validation = utils.load_array(f, 'tth_runII_mva_validation')

    global_features_data = utils.load_array(f, 'global_data')
    global_dnn_features_data = utils.load_array(f, 'global_dnn_data')
    label_data = utils.load_array(f, 'label_data')
    multi_label_data = utils.load_array(f, 'multi_label_data')
    weights_data = utils.load_array(f, 'weights_data')
    mass_data = utils.load_array(f, 'mass_data')
    njets_data = utils.load_array(f, 'njets_data')
    signal_mass_label_data = utils.load_array(f, 'signal_mass_label_data')
    signal_mass_category_data = utils.load_array(f,
                                                 'signal_mass_category_data')
    tth_2017_reference_mva_data = utils.load_array(
        f, 'tth_2017_reference_mva_data')
    evt_data = utils.load_array(f, 'evt_data')
    run_data = utils.load_array(f, 'run_data')
    lumi_data = utils.load_array(f, 'lumi_data')
    process_id_data = utils.load_array(f, 'process_id_data')
    year_data = utils.load_array(f, 'year_data')
    #objects_data = utils.load_array(f, 'objects_data')
    tth_runII_mva_data = utils.load_array(f, 'tth_runII_mva_data')

    global_features_final_fit = utils.load_array(f, 'global_final_fit')
    global_dnn_features_final_fit = utils.load_array(f, 'global_dnn_final_fit')
    label_final_fit = utils.load_array(f, 'label_final_fit')
    multi_label_final_fit = utils.load_array(f, 'multi_label_final_fit')
    weights_final_fit = utils.load_array(f, 'weights_final_fit')
    mass_final_fit = utils.load_array(f, 'mass_final_fit')
    njets_final_fit = utils.load_array(f, 'njets_final_fit')
    signal_mass_label_final_fit = utils.load_array(
        f, 'signal_mass_label_final_fit')
    signal_mass_category_final_fit = utils.load_array(
        f, 'signal_mass_category_final_fit')
    tth_2017_reference_mva_final_fit = utils.load_array(
        f, 'tth_2017_reference_mva_final_fit')
    evt_final_fit = utils.load_array(f, 'evt_final_fit')
    run_final_fit = utils.load_array(f, 'run_final_fit')
    lumi_final_fit = utils.load_array(f, 'lumi_final_fit')
    process_id_final_fit = utils.load_array(f, 'process_id_final_fit')
    year_final_fit = utils.load_array(f, 'year_final_fit')
    #objects_final_fit = utils.load_array(f, 'objects_final_fit')
    tth_runII_mva_final_fit = utils.load_array(f, 'tth_runII_mva_final_fit')

    print(global_dnn_features.shape, global_dnn_features_validation.shape,
          global_dnn_features_data.shape, global_dnn_features_final_fit.shape)

    num_multi_class = 3  #len(numpy.unique(multi_label, return_index = True))

    train_frac = 1.0  # use this fraction of data for training, use 1-train_frac for testing
    nTrain = int(len(label) * train_frac)

    print((global_features.shape))
    print((label.shape))
    print((weights.shape))

    print((global_features_validation.shape))
    print((label_validation.shape))
    print((weights_validation.shape))

    print((global_features_data.shape))
    print((label_data.shape))
    print((weights_data.shape))

    x_train, y_train, y_train_multi, weights_train = global_features, label, multi_label, weights
    x_test, y_test, y_test_multi, weights_test = global_features_validation, label_validation, multi_label_validation, weights_validation

    X_train = pandas.DataFrame(data=x_train, columns=training_feature_names)
    X_test = pandas.DataFrame(data=x_test, columns=training_feature_names)
    X_data = pandas.DataFrame(data=global_features_data,
                              columns=training_feature_names)
    X_final_fit = pandas.DataFrame(data=global_features_final_fit,
                                   columns=training_feature_names)

    if args.multi:
        Y_train = y_train_multi
        Y_test = y_test_multi
    else:
        Y_train = y_train
        Y_test = y_test

    #unique, count =  numpy.unique(multi_label,return_counts=True)
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 0, 1/(count[0]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 1, 1/(count[1]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 2, 1/(count[2]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 3, 1/(count[3]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 4, 1/(count[4]/float(sum(count))), 1) )

    sum_neg_weights = utils.sum_of_weights(weights_train, label, 0)
    sum_pos_weights = utils.sum_of_weights(weights_train, label, 1)

    print((sum_pos_weights, sum_neg_weights))

    scale_tth = False
    if scale_tth:
        for i in range(len(weights_train)):
            if multi_label[i] == 1:
                weights_train[i] *= 6.
    weights_train_modified = weights_train

    equal_weights = args.equal_weights
    if args.multi:
        if not equal_weights:
            for j in range(len(weights_train_modified)):
                if multi_label[j] == 0:
                    weights_train_modified[
                        j] *= sum_neg_weights / sum_pos_weights
        else:
            for i in range(num_multi_class):
                sum_class_weights = utils.sum_of_weights(
                    weights_train_modified, multi_label, i)
                print(("Normalizing class %d by %.6f" %
                       (i, sum_class_weights)))
                for j in range(len(weights_train_modified)):
                    if multi_label[j] == i:
                        weights_train_modified[j] *= 1. / sum_class_weights

    if args.res:
        for i in range(len(weights_train_modified)):
            if label[i] == 1:
                print((
                    weights_train_modified[i], 1 /
                    math.sqrt(lead_sigmaEtoE[i]**2 + sublead_sigmaEtoE[i]**2)))
                weights_train_modified[i] *= 1 / math.sqrt(
                    lead_sigmaEtoE[i]**2 + sublead_sigmaEtoE[i]**2)
                print((weights_train_modified[i]))

    sum_neg_weights = utils.sum_of_weights(weights_train_modified, label, 0)
    sum_pos_weights = utils.sum_of_weights(weights_train_modified, label, 1)

    print((sum_pos_weights, sum_neg_weights))

    d_train = xgboost.DMatrix(X_train,
                              label=Y_train,
                              weight=weights_train_modified)
    d_test = xgboost.DMatrix(X_test, label=Y_test)
    d_data = xgboost.DMatrix(X_data)
    d_final_fit = xgboost.DMatrix(X_final_fit)

    # Define BDT parameters
    if "kparam" not in list(config.keys()):
        param = {
            'max_depth': 4,
            'eta': 0.2,
            'objective': 'binary:logistic',
            'scale_pos_weight': sum_neg_weights / sum_pos_weights,
            'subsample': 1.0,
            'colsample_bytree': 1.0,
            'nthread': 12,
            'min_child_weight': min(
                1, (sum_neg_weights) / 100000.
            ),  # min_child_weight depends on the absolute value of the weights
        }
    else:
        param = config["kparam"]

    if args.multi:
        param["num_class"] = num_multi_class
        param["objective"] = "multi:softprob"
        param["scale_pos_weight"] = 1
        param["min_child_weight"] = 0.000001

    print(param)

    if "n_round" not in list(config.keys()):
        n_round = 300 if args.channel == "Hadronic" else 150
        #n_round = 10
        if "FCNC" in args.input:
            n_round = 150
            if args.multi:
                n_round = 150
            if "SMHiggs" in args.input and args.channel == "Hadronic":
                n_round = 500
    else:
        n_round = config["n_round"]
    evallist = [(d_train, 'train'), (d_test, 'test')]
    progress = {}

    print((param, n_round))

    # train
    bdt = xgboost.train(param,
                        d_train,
                        n_round,
                        evallist,
                        evals_result=progress)

    bdt.save_model(args.channel + "_" + args.tag + "_" + args.ext + "_bdt.xgb")
    model = bdt.get_dump()

    input_variables = []
    for name in feature_names:
        input_variables.append((name, 'F'))
    tmva_utils.convert_model(model,
                             input_variables=input_variables,
                             output_xml=args.channel + "_" + args.tag + "_" +
                             args.ext + '_bdt.xml')

    # predict
    pred_train = bdt.predict(d_train)
    pred_test = bdt.predict(d_test)
    pred_data = bdt.predict(d_data)
    pred_final_fit = bdt.predict(d_final_fit)

    if args.reference_mva != "none":
        if ".xgb" in args.reference_mva:
            ref_mva = xgboost.Booster()
            ref_mva.load_model(args.reference_mva)
            pred_ref_train = ref_mva.predict(d_train, output_margin=args.multi)
            pred_ref_test = ref_mva.predict(d_test, output_margin=args.multi)
            pred_ref_data = ref_mva.predict(d_data, output_margin=args.multi)
            pred_ref_final_fit = ref_mva.predict(d_final_fit,
                                                 output_margin=args.multi)
        elif ".json" in args.reference_mva:
            import dnn_helper
            dnn_features_train = dnn_helper.DNN_Features(
                name='train',
                global_features=global_dnn_features,
                objects=objects)
            dnn_features_validation = dnn_helper.DNN_Features(
                name='validation',
                global_features=global_dnn_features_validation,
                objects=objects_validation)
            dnn_features_data = dnn_helper.DNN_Features(
                name='data',
                global_features=global_dnn_features_data,
                objects=objects_data)
            dnn_features_final_fit = dnn_helper.DNN_Features(
                name='final_fit',
                global_features=global_dnn_features_final_fit,
                objects=objects_final_fit)
            with open(args.reference_mva, "r") as f_in:
                metadata = json.load(f_in)
            dnn = dnn_helper.DNN_Helper(
                features_validation=dnn_features_validation,
                features_train=dnn_features_train,
                features_data=dnn_features_data,
                features_final_fit=dnn_features_final_fit,
                metadata=metadata,
                weights_file="dnn_weights/" + metadata["weights"],
                train_mode=False)
            dnn.predict(debug=True)
            pred_ref_train = dnn.predictions["train"]
            pred_ref_test = dnn.predictions["validation"]
            pred_ref_data = dnn.predictions["data"]
            pred_ref_final_fit = dnn.predictions["final_fit"]

    print((pred_test.shape))

    #if args.multi:
    #  pred_train = pred_train[:,0]
    #  pred_test = pred_test[:,0]
    #  pred_data = pred_data[:,0]
    #  pred_final_fit = pred_final_fit[:,0]

    print((pred_test.shape))

    # analysis
    # ks test

    if args.multi:
        prediction_train = pred_train[:, 0]
        prediction_test = pred_test[:, 0]
    else:
        prediction_train = pred_train
        prediction_test = pred_test

    d_sig, p_value_sig, d_bkg, p_value_bkg = ks_test.ks_test(
        prediction_train, prediction_test, y_train, y_test)
    print(
        ("Results of ks-test (d-score) for signal: %.10f and background: %.10f"
         % (d_sig, d_bkg)))
    print(
        ("Results of ks-test (p-value) for signal: %.10f and background: %.10f"
         % (p_value_sig, p_value_bkg)))

    # roc curves
    fpr_train, tpr_train, thresh_train = metrics.roc_curve(
        y_train, prediction_train, pos_label=1, sample_weight=weights_train)
    fpr_test, tpr_test, thresh_test = metrics.roc_curve(
        y_test, prediction_test, pos_label=1, sample_weight=weights_test)

    y_train_2016 = ks_test.logical_vector(y_train, year, 2016)
    y_test_2016 = ks_test.logical_vector(y_test, year_validation, 2016)
    prediction_train_2016 = ks_test.logical_vector(prediction_train, year,
                                                   2016)
    prediction_test_2016 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2016)
    weights_train_2016 = ks_test.logical_vector(weights_train, year, 2016)
    weights_test_2016 = ks_test.logical_vector(weights_test, year_validation,
                                               2016)

    y_train_2017 = ks_test.logical_vector(y_train, year, 2017)
    y_test_2017 = ks_test.logical_vector(y_test, year_validation, 2017)
    prediction_train_2017 = ks_test.logical_vector(prediction_train, year,
                                                   2017)
    prediction_test_2017 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2017)
    weights_train_2017 = ks_test.logical_vector(weights_train, year, 2017)
    weights_test_2017 = ks_test.logical_vector(weights_test, year_validation,
                                               2017)

    y_train_2018 = ks_test.logical_vector(y_train, year, 2018)
    y_test_2018 = ks_test.logical_vector(y_test, year_validation, 2018)
    prediction_train_2018 = ks_test.logical_vector(prediction_train, year,
                                                   2018)
    prediction_test_2018 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2018)
    weights_train_2018 = ks_test.logical_vector(weights_train, year, 2018)
    weights_test_2018 = ks_test.logical_vector(weights_test, year_validation,
                                               2018)

    if len(y_train_2016) > 0:
        fpr_train_2016, tpr_train_2016, thresh_train_2016 = metrics.roc_curve(
            y_train_2016,
            prediction_train_2016,
            pos_label=1,
            sample_weight=weights_train_2016)
        fpr_test_2016, tpr_test_2016, thresh_test_2016 = metrics.roc_curve(
            y_test_2016,
            prediction_test_2016,
            pos_label=1,
            sample_weight=weights_test_2016)
        auc_2016, unc_2016, blah, blah, blah = utils.auc_and_unc(
            y_test_2016, prediction_test_2016, weights_test_2016, 25)
        print(("Testing  AUC (2016): %.3f +/- %.4f" % (auc_2016, unc_2016)))
        numpy.savez("bdt_roc_2016_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2016,
                    y_test=y_test_2016,
                    prediction_train=prediction_train_2016,
                    prediction_test=prediction_test_2016,
                    fpr_train=fpr_train_2016,
                    fpr_test=fpr_test_2016,
                    tpr_train=tpr_train_2016,
                    tpr_test=tpr_test_2016)

    if len(y_train_2017) > 0:
        fpr_train_2017, tpr_train_2017, thresh_train_2017 = metrics.roc_curve(
            y_train_2017,
            prediction_train_2017,
            pos_label=1,
            sample_weight=weights_train_2017)
        fpr_test_2017, tpr_test_2017, thresh_test_2017 = metrics.roc_curve(
            y_test_2017,
            prediction_test_2017,
            pos_label=1,
            sample_weight=weights_test_2017)
        auc_2017, unc_2017, blah, blah, blah = utils.auc_and_unc(
            y_test_2017, prediction_test_2017, weights_test_2017, 25)
        print(("Testing  AUC (2017): %.3f +/- %.4f" % (auc_2017, unc_2017)))
        numpy.savez("bdt_roc_2017_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2017,
                    y_test=y_test_2017,
                    prediction_train=prediction_train_2017,
                    prediction_test=prediction_test_2017,
                    fpr_train=fpr_train_2017,
                    fpr_test=fpr_test_2017,
                    tpr_train=tpr_train_2017,
                    tpr_test=tpr_test_2017)

    if len(y_train_2018) > 0:
        fpr_train_2018, tpr_train_2018, thresh_train_2018 = metrics.roc_curve(
            y_train_2018,
            prediction_train_2018,
            pos_label=1,
            sample_weight=weights_train_2018)
        fpr_test_2018, tpr_test_2018, thresh_test_2018 = metrics.roc_curve(
            y_test_2018,
            prediction_test_2018,
            pos_label=1,
            sample_weight=weights_test_2018)
        auc_2018, unc_2018, blah, blah, blah = utils.auc_and_unc(
            y_test_2018, prediction_test_2018, weights_test_2018, 25)
        print(("Testing  AUC (2018): %.3f +/- %.4f" % (auc_2018, unc_2018)))
        numpy.savez("bdt_roc_2018_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2018,
                    y_test=y_test_2018,
                    prediction_train=prediction_train_2018,
                    prediction_test=prediction_test_2018,
                    fpr_train=fpr_train_2018,
                    fpr_test=fpr_test_2018,
                    tpr_train=tpr_train_2018,
                    tpr_test=tpr_test_2018)

    auc_train = metrics.auc(fpr_train, tpr_train, reorder=True)
    auc_test = metrics.auc(fpr_test, tpr_test, reorder=True)

    auc, unc, blah, blah, blah = utils.auc_and_unc(y_test, prediction_test,
                                                   weights_test, 25)

    results["auc_train"] = auc_train
    results["auc_test"] = auc_test
    results["auc_test_unc"] = unc
    if "skip_tree" in list(config.keys()):
        return results

    print(("Training AUC: %.3f" % auc_train))
    print(("Testing  AUC: %.3f" % auc_test))

    print(("Testing  AUC: %.3f +/- %.4f" % (auc, unc)))

    numpy.savez("bdt_roc_%s.npz" % (args.channel + "_" + args.tag),
                y_train=y_train,
                y_test=y_test,
                prediction_train=prediction_train,
                prediction_test=prediction_test,
                fpr_train=fpr_train,
                fpr_test=fpr_test,
                tpr_train=tpr_train,
                tpr_test=tpr_test)

    # Write output to TTree
    tree_train_id = numpy.concatenate(
        (numpy.zeros(len(pred_train)), numpy.ones(len(pred_test)),
         numpy.ones(len(pred_data)), numpy.ones(len(pred_final_fit))))
    tree_sample_id = numpy.concatenate(
        (label, label_validation, label_data, numpy.ones(len(pred_final_fit))))
    tree_mass = numpy.concatenate(
        (mass, mass_validation, mass_data, mass_final_fit))
    tree_weight = numpy.concatenate(
        (weights, weights_validation, weights_data, weights_final_fit))
    tree_signal_mass_label = numpy.concatenate(
        (signal_mass_label, signal_mass_label_validation,
         signal_mass_label_data, numpy.zeros(len(pred_final_fit))))
    tree_signal_mass_category = numpy.concatenate(
        (signal_mass_category, signal_mass_category_validation,
         signal_mass_category_data, numpy.zeros(len(pred_final_fit))))
    tree_tth_2017_reference_mva = numpy.concatenate(
        (tth_2017_reference_mva, tth_2017_reference_mva_validation,
         tth_2017_reference_mva_data, tth_2017_reference_mva_final_fit))
    tree_evt = numpy.concatenate(
        (evt, evt_validation, evt_data, evt_final_fit))
    tree_tth_runII_mva = numpy.concatenate(
        (tth_runII_mva, tth_runII_mva_validation, tth_runII_mva_data,
         tth_runII_mva_final_fit))
    tree_run = numpy.concatenate(
        (run, run_validation, run_data, run_final_fit))
    tree_lumi = numpy.concatenate(
        (lumi, lumi_validation, lumi_data, lumi_final_fit))
    tree_process_id = numpy.concatenate(
        (process_id, process_id_validation, process_id_data,
         process_id_final_fit))
    tree_year = numpy.concatenate(
        (year, year_validation, year_data, year_final_fit))
    tree_global_features = numpy.concatenate(
        (global_features, global_features_validation, global_features_data,
         global_features_final_fit))

    if ".json" in args.reference_mva:
        tree_dnn_features = numpy.concatenate(
            (global_dnn_features, global_dnn_features_validation,
             global_dnn_features_data, global_dnn_features_final_fit))

    training_feature_names = [
        training_feature_names for i in range(len(label))
    ]
    training_feature_names_validation = [
        training_feature_names for i in range(len(label_validation))
    ]
    training_feature_names_data = [
        training_feature_names for i in range(len(label_data))
    ]
    training_feature_names_final_fit = [
        training_feature_names for i in range(len(label_final_fit))
    ]
    #tree_training_feature_names = numpy.concatenate((training_feature_names, training_feature_names_validation, training_feature_names_data, training_feature_names_final_fit))

    tree_train_id = tree_train_id.astype(numpy.int64)
    tree_sample_id = tree_sample_id.astype(numpy.int64)
    tree_mass = tree_mass.astype(numpy.float64)
    tree_weight = tree_weight.astype(numpy.float64)
    tree_signal_mass_label = tree_signal_mass_label.astype(numpy.int64)
    tree_signal_mass_category = tree_signal_mass_category.astype(numpy.int64)
    tree_tth_2017_reference_mva = tree_tth_2017_reference_mva.astype(
        numpy.float64)
    tree_evt = tree_evt.astype(numpy.uint64)
    tree_tth_runII_mva = tree_tth_runII_mva.astype(numpy.float64)
    tree_run = tree_run.astype(numpy.uint64)
    tree_lumi = tree_lumi.astype(numpy.uint64)
    tree_process_id = tree_process_id.astype(numpy.int64)
    tree_year = tree_year.astype(numpy.int64)
    tree_global_features = tree_global_features.astype(numpy.float64)
    if ".json" in args.reference_mva:
        tree_dnn_features = tree_dnn_features.astype(numpy.float64)


#tree_training_feature_names = tree_training_feature_names.astype(numpy.string_)

    tree_dict = {
        "train_id": tree_train_id,
        "sample_id": tree_sample_id,
        "mass": tree_mass,
        "weight": tree_weight,
        "signal_mass_label": tree_signal_mass_label,
        "signal_mass_category": tree_signal_mass_category,
        "tth_2017_reference_mva": tree_tth_2017_reference_mva,
        "process_id": tree_process_id,
        "year": tree_year,
        "event": tree_evt,
        "lumi": tree_lumi,
        "run": tree_run,
        "global_features": tree_global_features,
        "tth_runII_mva": tree_tth_runII_mva
    }  #, "training_feature_names" : tree_training_feature_names}

    if ".json" in args.reference_mva:
        dict["dnn_global_features"] = tree_dnn_features

    if args.multi:
        tree_bdt_score = []
        for i in range(num_multi_class):
            tree_bdt_score.append(
                numpy.concatenate(
                    (pred_train[:, i], pred_test[:, i], pred_data[:, i],
                     numpy.ones(len(pred_final_fit)))))
            tree_bdt_score[i] = tree_bdt_score[i].astype(numpy.float64)
            dict["mva_score_%d" % i] = tree_bdt_score[i]

    else:
        tree_bdt_score = numpy.concatenate(
            (pred_train, pred_test, pred_data, pred_final_fit))
        tree_bdt_score = tree_bdt_score.astype(numpy.float64)
        dict["mva_score"] = tree_bdt_score

    if args.reference_mva != "none":
        tree_ref_mva_score = numpy.concatenate(
            (pred_ref_train, pred_ref_test, pred_ref_data, pred_ref_final_fit))
        tree_ref_mva_score = tree_ref_mva_score.astype(numpy.float64)
        tree_dict[args.reference_mva_name] = tree_ref_mva_score

    tree_utils.numpy_to_tree(
        dict, "ttH%s_%s_FinalFitTree.root" % (args.channel, args.tag))

    ### Make diagnostic plots ###
    import matplotlib.pyplot as plt

    # variable importance #
    fig = plt.figure()
    xgboost.plot_importance(bdt)
    plt.tight_layout()
    plt.savefig('feature_importance_' + args.channel + '.pdf')

    # make ROC curve #
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.yaxis.set_ticks_position('both')
    ax.grid(True)
    plt.grid(color='black', linestyle='--', linewidth=0.1, which='both')

    plt.plot(fpr_train, tpr_train, color='red', label='Training Set', lw=3)
    plt.plot(fpr_test, tpr_test, color='green', label='Testing Set', lw=3)

    plt.xscale('log')

    plt.xlim([0.005, 1.0])
    plt.ylim([0.3, 1.05])
    plt.xlabel('False Positive Rate (background efficiency)')
    plt.ylabel('True Positive Rate (signal efficiency)')
    plt.legend(loc='lower right')
    plt.savefig('roc' + args.channel + '.pdf', bbox_inches='tight')
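
    # Optional cross-check (added here as a sketch): summarize the ROC curves plotted
    # above by their approximate area under the curve via the trapezoidal rule
    # (use numpy.trapezoid instead of numpy.trapz on NumPy >= 2.0).
    print("Approximate train AUC: %.4f" % numpy.trapz(tpr_train, fpr_train))
    print("Approximate test AUC:  %.4f" % numpy.trapz(tpr_test, fpr_test))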

    estimate_za = True
    use_tth_runII_mva = False
    use_tth_2017_mva = False
    if estimate_za:
        n_quantiles = 30
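
        # ks_test.logical_vector is a project helper (not shown here); throughout this
        # block it is assumed to select the entries of its first array whose label in
        # y_test equals the last argument (1 = signal, 0 = background).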

        if args.multi:
            signal_mva_scores = {}
            bkg_mva_scores = {}
            data_mva_scores = {}
            # Optimize with each of the background probabilities (the signal
            # probability is redundant, since the probabilities sum to 1).
            for i in range(num_multi_class - 1):
                # Factor of -1 so that we cut *below* certain values, as these are
                # background probabilities, not signal probabilities.
                reverse = 1 if i == 0 else -1
                signal_mva_scores["bdt_score_%d" % i] = reverse * ks_test.logical_vector(
                    pred_test[:, i], y_test, 1)
                bkg_mva_scores["bdt_score_%d" % i] = reverse * ks_test.logical_vector(
                    pred_test[:, i], y_test, 0)
                data_mva_scores["bdt_score_%d" % i] = reverse * pred_data[:, i]
        elif use_tth_runII_mva:
            print "Using RunII MVA from flashgg"
            signal_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_runII_mva_validation, y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_runII_mva_validation, y_test, 0)
            }
            data_mva_scores = {"bdt_score": tth_runII_mva_data}
        elif use_tth_2017_mva:
            print "Using 2017 ttH PAS MVA"
            signal_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_2017_reference_mva_validation,
                                       y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_2017_reference_mva_validation,
                                       y_test, 0)
            }
            data_mva_scores = {"bdt_score": tth_2017_reference_mva_data}
        else:
            print "Using the MVA we just trained"
            signal_mva_scores = {
                "bdt_score": ks_test.logical_vector(pred_test, y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score": ks_test.logical_vector(pred_test, y_test, 0)
            }
            data_mva_scores = {"bdt_score": pred_data}

        signal_mass = ks_test.logical_vector(mass_validation, y_test, 1)
        bkg_mass = ks_test.logical_vector(mass_validation, y_test, 0)

        signal_njets = ks_test.logical_vector(njets_validation, y_test, 1)
        bkg_njets = ks_test.logical_vector(njets_validation, y_test, 0)

        signal_weights = ks_test.logical_vector(weights_validation, y_test, 1)
        #if args.channel == "Leptonic" and "FCNC" in args.input:
        #  signal_weights *= 1./1.53 # to account for bug in MC sample where W->lv decays don't include taus
        bkg_weights = ks_test.logical_vector(weights_validation, y_test, 0)

        bkg_process_id = ks_test.logical_vector(process_id_validation, y_test,
                                                0)

        optimization_vars = args.optimization_vars.split(",") if args.optimization_vars else []
        for var in optimization_vars:
            signal_mva_scores[var] = ks_test.logical_vector(
                utils.load_array(f, var + '_validation'), y_test, 1)
            bkg_mva_scores[var] = ks_test.logical_vector(
                utils.load_array(f, var + '_validation'), y_test, 0)
            data_mva_scores[var] = utils.load_array(f, var + '_data')

        signal_events = {
            "mass": signal_mass,
            "weights": signal_weights,
            "mva_score": signal_mva_scores
        }
        bkg_events = {
            "mass": bkg_mass,
            "weights": bkg_weights,
            "mva_score": bkg_mva_scores,
            "process_id": bkg_process_id
        }
        data_events = {
            "mass": mass_data,
            "weights": weights_data,
            "mva_score": data_mva_scores,
            "process_id": numpy.ones_like(mass_data)
        }

        # Trim these dictionaries down
        #for evts_dict in [signal_events, bkg_events, data_events]:
        #  good_indices = [index for index, value in enumerate(evts_dict["mass"]) if value < 180.]
        #  print(float(len(good_indices))/float(len(evts_dict["mass"])))
        #  for key in evts_dict.keys():
        #    full_array = evts_dict[key]
        #    trimmed_array = [full_array[i] for i in good_indices]
        #    evts_dict[key] = trimmed_array

        # If we're using FCNC as signal, all Higgs mass points should be 125.
        # But if we're using ttH as signal, we use the M127 sample for testing,
        # so we need to shift for a proper comparison with the other M125 samples.
        mass_shift = "FCNC" not in args.input
        za, za_unc, s, b, sigma_eff = significance_utils.za_scores(
            n_quantiles, signal_events, bkg_events, False, {}, mass_shift)
        za_data, za_unc_data, s_data, b_data, sigma_eff_data = significance_utils.za_scores(
            n_quantiles, signal_events, data_events, True, bkg_events,
            mass_shift)
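        # significance_utils.za_scores (project helper, not shown here) is assumed to
        # scan n_quantiles cuts on the MVA score and return, per cut, the Asimov
        # significance Z_A = sqrt(2 * ((s + b) * ln(1 + s / b) - s)) along with its
        # uncertainty and the surviving signal (s) and background (b) yields.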
        za = numpy.asarray(za)

        max_za = numpy.max(za)
        max_za_unc = za_unc[numpy.argmax(za)]
        print("Max Z_A: %.3f +/- %.3f" % (max_za, max_za_unc))

        numpy.savez("za_%s.npz" %
                    (args.channel + "_" + args.ext + "_" + args.tag),
                    za=za,
                    za_unc=za_unc,
                    signal=s,
                    bkg=b,
                    sigma_eff=sigma_eff,
                    za_data=za_data,
                    za_unc_data=za_unc_data,
                    signal_data=s_data,
                    bkg_data=b_data,
                    sigma_eff_data=sigma_eff_data)
        numpy.savez("sigma_eff.npz", sigma_eff=sigma_eff, n_sig=s)

        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.plot(s, za, label='MC', color='red')
        ax1.fill_between(s,
                         numpy.asarray(za) - numpy.asarray(za_unc),
                         numpy.asarray(za) + numpy.asarray(za_unc),
                         color='red',
                         alpha=0.25)
        ax1.plot(s_data, za_data, label='Data', color='black')
        ax1.fill_between(s_data,
                         numpy.asarray(za_data) - numpy.asarray(za_unc_data),
                         numpy.asarray(za_data) + numpy.asarray(za_unc_data),
                         color='black',
                         alpha=0.25)

        plt.xlabel('# Signal Events')
        ax1.set_ylabel('Significance (Z_A)')

        plt.ylim([0.0, 3.0])
        _, right = plt.xlim()
        plt.xlim([1.0, right])

        ax1.legend(loc='upper right')
        plt.savefig('za_curve.pdf')

    return results