Example #1
def load_experiment(experiment):
    dataset, exp = experiment.split('/')
    exp_path = get_path_to_dataset(dataset)+sep+exp+sep+exp+'.exp'
    with open(exp_path, 'rb') as f:
        exp = Experiment()
        exp.ParseFromString(f.read())
    return exp
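
A minimal usage sketch for context (the <dataset>/<experiment> naming follows the other examples on this page; "my_run" is a hypothetical experiment name, while OSUTTBAR is the sample dataset mentioned in the archive docstrings):

# Hypothetical call: expects my_run/my_run.exp inside the OSUTTBAR dataset directory.
exp = load_experiment("OSUTTBAR/my_run")
print(exp.description)      # fields come from the Experiment protobuf
print(len(exp.results))     # one entry per recorded training epoch
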
Example #2
def _save_by_jet_num(dataset, num_jets):
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)
    if num_jets.endswith("+"):
        val = lambda x: x >= int(num_jets[:-1])
    elif num_jets.endswith("-"):
        val = lambda x: x <= int(num_jets[:-1])
    else:
        val = lambda x: x == int(num_jets)
    all_x = np.concatenate((x_train, x_test), axis=0)
    all_y = np.concatenate((y_train, y_test), axis=0)
    nulls = np.zeros((all_x.shape[0], all_x.shape[1]/4), dtype=np.bool)
    for y in xrange(all_x.shape[1]/4):
        for ix, row in enumerate((all_x > 0)[:, y*4:(y+1)*4]):  # scan the concatenated array, not just the training split
            nulls[ix, y] = all(row == 0)
    events_with_x_jets = val(np.array([row[:-2].sum() for row in ~nulls]))

    all_x, all_y = all_x[events_with_x_jets], all_y[events_with_x_jets]
    tr.shuffle_in_unison(all_x, all_y)

    cutoff = int(all_x.shape[0] * 0.8)  # 80% training 20% testing
    train_x = all_x[:cutoff]
    train_y = all_y[:cutoff]
    test_x = all_x[cutoff:]
    test_y = all_y[cutoff:]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}jets_{}.npz".format(num_jets, format))
    np.savez(output_path, x_train=train_x, x_test=test_x, y_train=train_y, y_test=test_y)
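
The num_jets argument is a string selector: a trailing "+" keeps events with at least that many jets, a trailing "-" keeps events with at most that many, and a bare number keeps exact matches. A standalone sketch of that dispatch (the helper name is made up for illustration):

def jet_count_predicate(num_jets):
    # "3+" -> n >= 3, "2-" -> n <= 2, "4" -> n == 4
    if num_jets.endswith("+"):
        return lambda n: n >= int(num_jets[:-1])
    elif num_jets.endswith("-"):
        return lambda n: n <= int(num_jets[:-1])
    return lambda n: n == int(num_jets)

select = jet_count_predicate("3+")
print(select(2), select(3), select(5))   # False True True
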
Example #3
def load_model(exp_name):
    data, name = exp_name.split('/')
    exp_dir = ds.get_path_to_dataset(data) + os.sep + name + os.sep
    with open(exp_dir+"cfg.json") as json:
        model = model_from_json(json.read())
    model.set_weights(np.load(exp_dir+"weights.npy"))
    return model
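
A short, hypothetical usage sketch (the experiment name is made up; cfg.json and weights.npy are assumed to have been written by an earlier training run):

import numpy as np

model = load_model("OSUTTBAR/my_run")
# Assumes a single-input model; one all-zero event, just to exercise the call.
dummy = np.zeros((1, model.input_shape[1]))
print(model.predict(dummy))
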
Example #4
def augment(dataset, format, shift_size):
    """

    Parameters
    ----------
    dataset :
    format :
    shift_size :

    Returns
    -------

    """
    shift_size *= (math.pi/180.0)
    num_shifts = int(2*math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0]*num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0]*num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix+1)%1000 == 0: print ix+1
        for s in xrange(num_shifts):
            shift = s * shift_size
            augmented_x[ix*num_shifts+s] = [verify_angle(val+shift) if index%4==2 else val for index,val in enumerate(line)]
            augmented_y[ix*num_shifts+s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)

    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
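
The features are grouped four per object, and index % 4 == 2 presumably selects the angular (phi-like) column of each object; every event is copied num_shifts times with those columns advanced by multiples of shift_size. A standalone sketch of the wrap-around that verify_angle presumably performs (the exact convention is an assumption):

import math

def wrap_angle(phi):
    # Map an angle back into [-pi, pi); assumed to mirror verify_angle.
    return (phi + math.pi) % (2 * math.pi) - math.pi

print(wrap_angle(3.0 + math.pi))   # ~ -0.14 instead of ~ 6.14
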
Example #5
def read_config_file(dataset_name, format):
    """ Reads a json file containing the locations of train/test data
    Each dataset should contain a file (DATASET_NAME.json).  This file
    should contain a description of the different formats
    that the data is available in.  Each of these coordinate
    systems will most likely be split into two files: one file with the
    training data and another file with the testing data.  The names of
    both of these files should be included in the json file.
    
    Parameters
    ----------
    dataset_name : name of valid dataset in deep_learning/data

    format : name of the format to get the paths for

    Returns
    -------
    paths : dict of paths whose keys depend on the config file entry:
    train_path/test_path, background_path/signal_path, or both
    """
    dataset_path = ds.get_path_to_dataset(dataset_name)
    json_path = os.path.join(dataset_path, ("%s.json" % dataset_name))
    with open(json_path, "r") as json_file:
        json_data = json.load(json_file)

    file_dict = json_data[format]

    if all([x in file_dict for x in ["train_file", "test_file"]]):
        train_file_name = file_dict["train_file"]
        test_file_name = file_dict["test_file"]
        train_path = os.path.join(dataset_path, train_file_name)
        test_path = os.path.join(dataset_path, test_file_name)
        return dict(train_path=train_path,
                    test_path=test_path)

    elif all([x in file_dict for x in ["background", "signal"]]):
        background = file_dict["background"]
        signal = file_dict["signal"]
        background_path = os.path.join(dataset_path, background)
        signal_path = os.path.join(dataset_path, signal)
        return dict(background_path=background_path,
                    signal_path=signal_path)

    elif "both" in file_dict:
        both_path = os.path.join(dataset_path, file_dict["both"])
        return dict(both=both_path)
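
An illustrative DATASET_NAME.json for the branches above. The "PtEtaPhi" entry reuses the file names from the test suite on this page; the other format names and file names are placeholders:

# TEST.json (illustrative):
# {
#   "PtEtaPhi":   {"train_file": "train_all_ptEtaPhi.txt",
#                  "test_file":  "test_all_ptEtaPhi.txt"},
#   "Raw":        {"background": "bkg_events.txt", "signal": "sig_events.txt"},
#   "Everything": {"both": "all_events.txt"}
# }
paths = read_config_file("TEST", "PtEtaPhi")
print(paths["train_path"], paths["test_path"])
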
Example #6
def augment(dataset, format, shift_size):
    shift_size *= (math.pi/180.0)
    num_shifts = int(2*math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0]*num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0]*num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix+1)%1000 == 0: print ix+1
        for s in xrange(num_shifts):
            shift = s * shift_size
            augmented_x[ix*num_shifts+s] = [verify_angle(val+shift) if index%4==2 else val for index,val in enumerate(line)]
            augmented_y[ix*num_shifts+s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)

    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
Example #7
    def test_read_config_file(self):
        """ Tests the read_config_file function """
        # nominal case
        train, test = ar.read_config_file("TEST", "PtEtaPhi")
        test_dataset_path = ds.get_path_to_dataset("TEST")
        self.assertEqual(
            train, os.path.join(test_dataset_path, "train_all_ptEtaPhi.txt"))
        self.assertEqual(
            test, os.path.join(test_dataset_path, "test_all_ptEtaPhi.txt"))

        # if the dataset doesn't exist but the format does
        with self.assertRaises(IOError):
            ar.read_config_file("no way does this dataset exist", "PtEtaPhi")

        # if the format doesn't exist in a valid dataset
        with self.assertRaises(KeyError):
            ar.read_config_file("TEST", "Spherical")
Example #8
def permutate_individual_sorted(dataset):
    """ Only use this for sorted data! Also, this takes up a significant amount of RAM """
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)

    # Generate permutations, transforms, and alter the dataset
    perms = list(gen_permutations(2, 7, 2))
    num_perms = len(perms)

    aperms = np.array(perms)
    labels = np.zeros(aperms.shape)
    r = np.arange(11)
    for i,p in enumerate(aperms):
        labels[i] = (p == r).astype('int32')

    transforms = np.zeros((44, 44 * num_perms))
    for i, p in enumerate(perms):
        transforms[:, i * 44:(i + 1) * 44] = E(p)

    # For the training data
    sorted_train_x = np.zeros((x_train.shape[0] * num_perms, x_train.shape[1]))
    sorted_train_y = np.zeros((sorted_train_x.shape[0], 2))

    for i, batch in enumerate(x_train):
        event = np.dot(batch, transforms).reshape((num_perms, x_train.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_train_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_train_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    # For the testing data
    sorted_test_x = np.zeros((x_test.shape[0] * num_perms, x_test.shape[1]))
    sorted_test_y = np.zeros((sorted_test_x.shape[0], 2))

    for i, batch in enumerate(x_test):
        event = np.dot(batch, transforms).reshape((num_perms, x_test.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_test_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_test_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}_{}.npz".format(format, "Permuted"))
    np.savez(output_path, x_train=sorted_train_x, x_test=sorted_test_x, y_train=sorted_train_y, y_test=sorted_test_y)
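
The 44-column events appear to hold 11 objects of 4 features each, and E(p) presumably builds the 44 x 44 matrix that reorders those 4-feature blocks according to the permutation p. A standalone sketch of such a block-permutation matrix (the helper name and block size are assumptions):

import numpy as np

def block_permutation_matrix(p, block=4):
    # Right-multiplying a row vector by this matrix moves the features of
    # object i into the slot of object p[i], `block` columns at a time.
    n = len(p) * block
    m = np.zeros((n, n))
    for i, j in enumerate(p):
        m[i*block:(i+1)*block, j*block:(j+1)*block] = np.eye(block)
    return m

event = np.arange(8.0)   # two objects: features 0-3 and 4-7
print(event.dot(block_permutation_matrix([1, 0])))   # [4. 5. 6. 7. 0. 1. 2. 3.]
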
Example #9
def load_model(exp_name):
    """
    Loads a model from an experiment and returns the model.

    Parameters
    ----------
    exp_name <string> : a forward-slash separated string for the experiment name i.e. <dataset>/<experiment>

    Returns
    -------
    model <Keras.engine.topology.Model> : a Keras Model instance as defined in the cfg.json of
                                         the experiment that is being loaded.

    """
    data, name = exp_name.split('/')
    exp_dir = ds.get_path_to_dataset(data) + os.sep + name + os.sep
    with open(exp_dir + "cfg.json") as json:
        model = model_from_json(json.read())
    model.set_weights(np.load(exp_dir + "weights.npy"))
    return model
Example #10
    def test_get_path_to_dataset(self):
        """ Test the get_path_to_dataset function """
        # nominal case : just dataset_name
        data_dir_path = ds.get_data_dir_path()
        test_dataset_path = os.path.join(data_dir_path, "TEST")
        self.assertEqual(ds.get_path_to_dataset("TEST"), test_dataset_path)

        # nominal case: dataset_name and format
        test_format_path = os.path.join(test_dataset_path, "PtEtaPhi.npz")
        self.assertEqual(ds.get_path_to_dataset("TEST", "PtEtaPhi"),
                         test_format_path)

        # if the dataset doesn't exist
        with self.assertRaises(IOError):
            ds.get_path_to_dataset("some random folder")

        # if the format doesn't exist in a valid dataset
        with self.assertRaises(IOError):
            ds.get_path_to_dataset("TEST", "Spherical")
Example #11
def run(model, exp, terms, save_freq=5, data=None):

    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from the HDF5 archive created by invoking
    # deep_learning/utils/archive.py
    ##

    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(
            pb.Experiment.Dataset.Name(exp.dataset),
            exp.coordinates + '/transformed')
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    # Start training

    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))

    valid = Validator(exp, terms)

    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs + 1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs + 1))
        bETA = 0
        bTimes = np.array([])
        #print("\t Training: ")
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            x_batch = x_train[b * exp.batch_size:b * exp.batch_size +
                              exp.batch_size, :]
            y_batch = y_train[b * exp.batch_size:b * exp.batch_size +
                              exp.batch_size, :]
            model.train_on_batch(x_batch, y_batch)
            bTimes = np.append(bTimes, clock() - bt)
            bETA = np.median(bTimes) * (num_batches - b - 1)
        # Finish progress bar
        progress(num_batches,
                 num_batches,
                 exp.batch_size,
                 0,
                 end='\n',
                 time=clock() - t)
        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()
        timer = clock()
        print("Evaluating Train")
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(
            ((x_train[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_train[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(num_batches)),
            num_batches,
            max_q_size=min((num_batches // 2, 10)))
        #print("Finished {:.2f}s".format(clock()-timer))
        timer = clock()
        print("Evaluating Test")
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(
            ((x_test[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_test[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(int(ceil(x_test.shape[0] / exp.batch_size)))),
            int(ceil(x_test.shape[0] / exp.batch_size)),
            max_q_size=min(
                (int(ceil(x_test.shape[0] / exp.batch_size)) // 2, 10)))
        #print("Finished {:.2f}s".format(clock() - timer))
        timer = clock()
        print("Calculating Sig")
        epoch.s_b = st.significance(model, data)
        #print("Finished {:.2f}".format(clock() - timer))
        #timer = clock()
        #print("Calculating AUC {:.2f}".format(clock()))
        #epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        #print("Finished {:.2f}".format(clock() - timer))
        timer = clock()
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        print("Making CFM")
        matrix = st.confusion_matrix(model, data, offset='\t ')
        #print("Finished {:.2f}".format(clock() - timer))
        epoch.num_seconds = clock() - t
        timer = clock()
        print("Getting output")
        output = st.get_output_distro(model, data)
        epoch.output.background.extend(output["background"])
        epoch.output.signal.extend(output["signal"])
        #print("Finished {:.2f}".format(clock() - timer))
        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(
            epoch.train_accuracy, epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(
                valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(
                convert_seconds(
                    np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        # Saves the model
        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t ", end='')
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time

    print("\n" + valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))

    save(model, exp, save_dir, exp_file_name)
    print("\t ", end='')
    h_file.close()
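
Both evaluate_generator calls above feed Keras a generator of (x, y) batch slices built with the same slicing arithmetic as the training loop. A minimal standalone version of that pattern (a sketch; the real code inlines it as a generator expression):

def batch_slices(x, y, batch_size):
    # Yield successive (x, y) slices of at most batch_size rows each.
    for start in xrange(0, x.shape[0], batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]

# e.g. model.evaluate_generator(batch_slices(x_train, y_train, exp.batch_size), num_batches)
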
Example #12
def create_archive(dataset_name, format, buffer=1000, train_fraction=0.8):
    """ converts a series of text files into a single .npz archive
    create_archive takes the name of a dataset and the
    format that the data is in, loads the config file
    from the dataset's directory, loads the right text files, and saves
    the training and testing data as numpy arrays in a single .npz
    file. The data is normalized and its variance is set to unity
    before saving, but the data will be randomized when it is loaded
    because otherwise we would be using the same "random" ordering of
    the data each time we train a network on the dataset

    Parameters
    ----------
    dataset_name : name of the dataset (directory) to look for a
    configuration file in

    format : name of the format that you want to
    build an HDF5 archive for

    Notes
    -----
    The locations of the text files to load should be described in
    the config file.  See the example in the OSUTTBAR dataset
    directory.
    """
    path_dict = read_config_file(dataset_name, format)
    output_path = os.path.join(ds.get_path_to_dataset(dataset_name), "{}.hdf5".format(dataset_name))

    if "train_path" in path_dict:
        train_len, train_cols = get_file_len_and_shape(path_dict["train_path"])
        test_len, test_cols = get_file_len_and_shape(path_dict["test_path"])
        assert train_cols == test_cols  # Train and test files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path, format, zip([train_len]*2+[test_len]*2, train_cols+test_cols))
        with open(path_dict["train_path"]) as train_f:
            for l in train_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[0].append(event[1:][None])
                h_data[1].append(make_one_hot(event[0], 0, train_cols[1]-1)[0])
        with open(path_dict["test_path"]) as test_f:
            for l in test_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[2].append(event[1:][None])
                h_data[3].append(make_one_hot(event[0], 0, test_cols[1]-1)[0])

    elif "background_path" in path_dict:
        bkg_len, bkg_cols = get_file_len_and_shape(path_dict["background_path"])
        sig_len, sig_cols = get_file_len_and_shape(path_dict["signal_path"])
        n_labels = 2

        total_len = bkg_len+sig_len
        bkg_read_amt = int(bkg_len*buffer/total_len)
        sig_read_amt = int(sig_len*buffer/total_len)
        assert bkg_cols == sig_cols # Bkg and sig files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path,
                                        format,
                                        zip([round(total_len*train_fraction)] * 2 + [round(total_len*(1-train_fraction))] * 2,
                                            [bkg_cols[0], n_labels]*2))
        # Read in a buffer of 1000 lines at a time (allows fraction accuracy to 0.xxx)
        with open(path_dict["background_path"]) as bkg_f:
            with open(path_dict["signal_path"]) as sig_f:
                i = 0
                while i < total_len - 1:
                    x_buffer_array = np.zeros((buffer, bkg_cols[0]))
                    y_buffer_array = np.zeros((buffer, n_labels))
                    ix = 0
                    for j in xrange(bkg_read_amt):
                        line = bkg_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    for k in xrange(sig_read_amt):
                        line = sig_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    indices = np.any(~(x_buffer_array==0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0]*train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    i += ix
    elif "both" in path_dict:

        total_len, total_cols = get_file_len_and_shape(path_dict["both"])
        train_len = round(train_fraction*total_len)
        test_len = round((1-train_fraction)*total_len)
        h_file, h_data = add_group_hdf5(output_path, format,
                                        zip([train_len] * 2 + [test_len] * 2, total_cols*2))
        with open(path_dict["both"]) as data_f:
            x_buffer_array = np.zeros((buffer, total_cols[0]))
            y_buffer_array = np.zeros((buffer, total_cols[1]))
            for i, l in enumerate(data_f):
                event = np.fromstring(l, sep=',', dtype="float32")
                x_buffer_array[i%buffer] = event[1:]
                y_buffer_array[i%buffer] = make_one_hot(event[0], 0, total_cols[1] - 1)[0]
                if i%buffer == buffer - 1:
                    indices = np.any(~(x_buffer_array == 0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0] * train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    x_buffer_array = np.zeros((buffer, total_cols[0]))
                    y_buffer_array = np.zeros((buffer, total_cols[1]))

    h_file.flush()
    h_file.close()
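
Every text line is parsed as a label followed by the feature values, and make_one_hot turns the leading label into a one-row one-hot vector. A sketch of that behaviour, assuming make_one_hot(label, min_label, max_label)[0] yields a single row with a 1 in the label's column (the signature is only inferred from the calls above):

import numpy as np

def one_hot_row(label, min_label, max_label):
    # Assumed equivalent of make_one_hot(label, min_label, max_label)[0].
    row = np.zeros((1, int(max_label - min_label) + 1), dtype="float32")
    row[0, int(label - min_label)] = 1.0
    return row

print(one_hot_row(1, 0, 1))   # [[0. 1.]] -> "signal" for a background/signal label pair
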
Example #13
def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a dataset into subsets with given ratios of background to signal. For a ratio list of
    length n, the counting index i for the background starts at index 0 and the counting index j for
    the signal starts at n-1; the ratio for each iteration is then i to j (i/j). A temporary file is
    generated to accomplish this.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated)
    ratios <list> : a list of 'background:signal' ratio strings, e.g. ['2:1', '10:1'].
    buffer <int> : an integer defining the number of data points to load into memory at a time.

    """
    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data = dataset.split('/')[0]
    format = '/'.join(dataset.split('/')[1:])
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.h5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this in archive so that you write to file every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".h5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices, test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices, test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.h5")
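
Each requested ratio string is parsed into a float pair, and every buffer of events keeps roughly rat[0]*buffer/sum(rat) background rows and rat[1]*buffer/sum(rat) signal rows. A small standalone sketch of that arithmetic:

buffer = 1000
for ratio_string in ["1:1", "2:1", "10:1"]:
    rat = map(float, ratio_string.split(':'))
    n_bkg = int(rat[0] * buffer / sum(rat))   # background events kept per buffer
    n_sig = int(rat[1] * buffer / sum(rat))   # signal events kept per buffer
    print("{} -> {} bkg / {} sig".format(ratio_string, n_bkg, n_sig))
    # 1:1 -> 500/500, 2:1 -> 666/333, 10:1 -> 909/90
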
Example #14
def save_ratios(dataset, ratios, buffer=1000):

    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data, format = dataset.split('/')
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.hdf5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this in archive so that you write to file every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".hdf5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices, test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices, test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.hdf5")
Example #15
def create_archive(dataset_name, format, buffer=1000, train_fraction=0.8):
    """ converts a series of text files into a single hdf5 archive
    create_archive takes the name of a dataset and the
    format that the data is in, loads the config file
    from the dataset's directory, loads the right text files, and saves
    the training and testing data as numpy arrays in a single hdf5
    file.

    Parameters
    ----------
    dataset_name <string> : name of the dataset (directory) to look for a
    configuration file in

    format <string> : name of the format that you want to
    build an HDF5 archive for

    Notes
    -----
    The locations of the text files to load should be described in
    the config file.  See the example in the OSUTTBAR dataset
    directory.
    """
    path_dict = read_config_file(dataset_name, format)
    output_path = os.path.join(ds.get_path_to_dataset(dataset_name), "{}.h5".format(dataset_name))

    if "train_path" in path_dict:
        train_len, train_cols = get_file_len_and_shape(path_dict["train_path"])
        test_len, test_cols = get_file_len_and_shape(path_dict["test_path"])
        assert train_cols == test_cols  # Train and test files should have the same data shape
        h_file, h_data = add_group_hdf5(output_path, format, zip([train_len]*2+[test_len]*2, train_cols+test_cols))
        with open(path_dict["train_path"]) as train_f:
            for l in train_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[0].append(event[1:][None])
                h_data[1].append(make_one_hot(event[0], 0, train_cols[1]-1)[0])
        with open(path_dict["test_path"]) as test_f:
            for l in test_f:
                event = np.fromstring(l, sep=',', dtype="float32")
                h_data[2].append(event[1:][None])
                h_data[3].append(make_one_hot(event[0], 0, test_cols[1]-1)[0])

    elif "background_path" in path_dict:
        bkg_len, bkg_cols = get_file_len_and_shape(path_dict["background_path"])
        sig_len, sig_cols = get_file_len_and_shape(path_dict["signal_path"])
        assert sig_cols[0] == bkg_cols[0] # the background and signal should have the same shape
        n_labels = 1 + bkg_cols[1]

        total_len = bkg_len+sig_len
        bkg_read_amt = int(bkg_len*buffer/total_len)
        sig_read_amt = int(sig_len*buffer/total_len)
        h_file, h_data = add_group_hdf5(output_path,
                                        format,
                                        zip([round(total_len*train_fraction)] * 2 + [round(total_len*(1-train_fraction))] * 2,
                                            [bkg_cols[0], n_labels]*2))
        # Read in a buffer of 1000 lines at a time (allows fraction accuracy to 0.xxx)
        with open(path_dict["background_path"]) as bkg_f:
            with open(path_dict["signal_path"]) as sig_f:
                i = 0
                while i < total_len - 1:
                    x_buffer_array = np.zeros((buffer, bkg_cols[0]))
                    y_buffer_array = np.zeros((buffer, n_labels))
                    ix = 0
                    for j in xrange(bkg_read_amt):
                        line = bkg_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    for k in xrange(sig_read_amt):
                        line = sig_f.readline()
                        if line:
                            event = np.fromstring(line, sep=',', dtype="float32")
                            x_buffer_array[ix] = event[1:]
                            y_buffer_array[ix] = make_one_hot(event[0], 0, n_labels - 1)[0]
                            ix += 1
                        else:
                            break
                    indices = np.any(~(x_buffer_array==0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0]*train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    i += ix
    elif "both" in path_dict:

        total_len, total_cols = get_file_len_and_shape(path_dict["both"])
        train_len = round(train_fraction*total_len)
        test_len = round((1-train_fraction)*total_len)
        h_file, h_data = add_group_hdf5(output_path, format,
                                        zip([train_len] * 2 + [test_len] * 2, total_cols*2))
        with open(path_dict["both"]) as data_f:
            x_buffer_array = np.zeros((buffer, total_cols[0]))
            y_buffer_array = np.zeros((buffer, total_cols[1]))
            for i, l in enumerate(data_f):
                event = np.fromstring(l, sep=',', dtype="float32")
                x_buffer_array[i%buffer] = event[1:]
                y_buffer_array[i%buffer] = make_one_hot(event[0], 0, total_cols[1] - 1)[0]
                if i%buffer == buffer - 1:
                    indices = np.any(~(x_buffer_array == 0), axis=1)
                    x_buffer_array = x_buffer_array[indices]
                    y_buffer_array = y_buffer_array[indices]
                    tr.shuffle_in_unison(x_buffer_array, y_buffer_array)
                    cutoff = int(x_buffer_array.shape[0] * train_fraction)
                    for r in x_buffer_array[:cutoff]:
                        h_data[0].append(r[None])
                    for r in y_buffer_array[:cutoff]:
                        h_data[1].append(r[None])
                    for r in x_buffer_array[cutoff:]:
                        h_data[2].append(r[None])
                    for r in y_buffer_array[cutoff:]:
                        h_data[3].append(r[None])
                    x_buffer_array = np.zeros((buffer, total_cols[0]))
                    y_buffer_array = np.zeros((buffer, total_cols[1]))

    h_file.flush()
    h_file.close()
Example #16
def set_configurations():
    print("What would you like to name this experiment setup?")
    print("*Note: this is used for naming the result files")
    name = raw_input(">> ")

    print("Which dataset would you like to run on?")
    ## Collect the names of the directories here
    # Selects dataset
    data_dir = get_data_dir_path()
    print(' '.join(filter(lambda x: path.isdir(path.join(data_dir, x)), os.listdir(data_dir))))
    dataset = raw_input(">> ")
    print()

    data_dir = get_path_to_dataset(dataset)
    with open(path.join(data_dir, dataset+".json")) as fp:
        j = json.load(fp)
    print("Which coordinates would you like to use?")
    print(' '.join(j.keys()))
    coords = raw_input(">> ")
    print()

    # Collect data files from that dataset and repeat question

    batch_size = raw_input("What will be your batch size?\n>> ")
    print()
    learning_rate = raw_input("What will be your learning rate?\n>> ")
    print()

    layers = raw_input("How many layers do you want?\n>> ")
    print()

    nodes = raw_input("How many nodes per layer do you want?\n>> ")
    print()

    print("How often do you want to save the model?")
    print("(Number of epochs)")
    sf = raw_input(">> ")
    print()

    print("How would you like to determine the end of the program:")
    print("1. After a certain number of epochs")
    print("2. After a certain amount of time")
    print("3. When the accuracy plateaus")
    print("*Note: You may select multiple, just separate them by spaces.")
    ending = filter(None, raw_input("[1/2/3] >> ").split())
    print()

    terms = {"epochs": None, "timeout": None, "plateau": {"m": None, "w": None}}
    if '1' in ending:
        terms['epochs'] = int(raw_input("How many epochs do you want to run?\n>> "))
        print()
    if '2' in ending:
        print("After how long do you want the program to be killed?")
        print("(Put units after the number, i.e. s/m/h/d/w/y)")
        time = raw_input(">> ")
        print()
        unit = time[-1]
        time = int(time[:-1])
        if unit == 'm':
            time *= 60
        elif unit == 'h':
            time *= 3600
        elif unit == 'd':
            time *= 3600*24
        elif unit == 'w':
            time *= 3600*24*7
        elif unit == 'y':
            time *= 3600*24*7*52
        terms['timeout'] = time
    if '3' in ending:
        print("For determining plateau:")
        x = raw_input("Over what interval would you like to measure the accuracy change?\n>> ")
        print()
        y = raw_input("What is the minimal increase in percentile you can accept over this interval?\n>> ")
        print()
        terms['plateau'] = dict(x=float(x), y=int(y))

    d = dict(save_name=name,
             dataset=dataset,
             coords=coords,
             batch_size=int(batch_size),
             learning_rate=float(learning_rate),
             layers=int(layers),
             nodes=int(nodes),
             save_freq=int(sf),
             terms=terms)
    return d
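
For reference, the dictionary this function returns for one hypothetical session (all values below are made-up answers typed at the prompts, not defaults from the code):

config = dict(save_name="demo_run",
              dataset="OSUTTBAR",
              coords="PtEtaPhi",
              batch_size=64,
              learning_rate=0.001,
              layers=5,
              nodes=50,
              save_freq=5,
              terms={"epochs": 100, "timeout": None,
                     "plateau": {"m": None, "w": None}})
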
Example #17
def run(model, exp, terms, save_freq=5, data=None):

    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from the HDF5 archive created by invoking
    # deep_learning/utils/archive.py
    ##

    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(pb.Experiment.Dataset.Name(exp.dataset), exp.coordinates)
        x_train, x_test = tr.transform(x_train, x_test)
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))

    valid = Validator(exp, terms)

    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs+1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs+1))
        bETA = 0
        bTimes = np.array([])
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            model.train_on_batch(x_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :],
                                 y_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :])
            bTimes = np.append(bTimes, clock()-bt)
            bETA = np.median(bTimes)*(num_batches-b-1)
        # Finish progress bar
        progress(num_batches, num_batches, exp.batch_size, 0, end='\n')
        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()
        # Evaluate the training set over its own batch count and the test set over the test batch count.
        test_batches = int(ceil(x_test.shape[0]/exp.batch_size))
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(((x_train[i*exp.batch_size:(i+1)*exp.batch_size],
                                                                            y_train[i*exp.batch_size:(i+1)*exp.batch_size]) for i in xrange(num_batches)),
                                                                          num_batches)
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(((x_test[i*exp.batch_size:(i+1)*exp.batch_size],
                                                                          y_test[i*exp.batch_size:(i+1)*exp.batch_size]) for i in xrange(test_batches)),
                                                                        test_batches)
        epoch.s_b = st.significance(model, data)
        epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        matrix = st.confusion_matrix(model, data, offset='\t ')
        epoch.num_seconds = clock() - t
        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(epoch.train_accuracy, epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(convert_seconds(np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t Saved the model\n")
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time

    print("\n"+valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))

    save(model, exp, save_dir, exp_file_name, graph=True)
    h_file.close()