Example #1
def get_training_data(DatabaseConfig, PreprocessingConfig, FileConfig):

    tmp_file = FileConfig['temp_file']
    erase_above = PreprocessingConfig['erase_above']
    remove_negatives = PreprocessingConfig['remove_negative_inputs']
    flip = PreprocessingConfig['flip']
    random_input = PreprocessingConfig['random_input']
    test_data = DatabaseConfig['test_data']

    bin_type = PreprocessingConfig['binning_type']
    bin_precision = PreprocessingConfig['binning_precision']

    single_input = PreprocessingConfig['single_input']
    single_output = PreprocessingConfig['single_output']

    input_normaliser = PreprocessingConfig['normalise_input']
    output_normaliser = PreprocessingConfig['normalise_output']

    if check_temp(tmp_file, DatabaseConfig): # Check the temp file

        LOG.info('Correct temp file exists at {0}, loading from temp'.format(tmp_file))
        all_in, all_out, redshifts, galaxy_ids = load_from_file(tmp_file)
        LOG.info('Done.')
    else:

        LOG.info('No temp file, reading from database.')
        all_in, all_out, redshifts, galaxy_ids = get_train_test_data(DatabaseConfig)

        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, DatabaseConfig, all_in, all_out, redshifts, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    # Convert the arrays to np arrays
    all_in = np.array(all_in)
    all_out = np.array(all_out)
    redshifts = np.array(redshifts)
    galaxy_ids = np.array(galaxy_ids)

    # Flip input and output
    if flip:
        LOG.info('****Flipping input and output!****')
        all_in, all_out = all_out, all_in

    # Use random values for input
    if random_input:
        LOG.info('****Using randomly generated input values!****')
        all_in = random_in_shape(all_in)

    # Do we want only a single output/input?
    if single_output is not None:
        LOG.info('****Using single output {0}****'.format(single_output))
        all_out = all_out[:, single_output]

    if single_input is not None:
        LOG.info('****Using single input {0}****'.format(single_input))
        all_in = all_in[:, single_input]

    # Remove negative values in the input (BEFORE normalisation!)
    if remove_negatives:
        LOG.info('****Removing negative input values****')
        all_in, all_out, redshifts, galaxy_ids = remove_negative_values(all_in, all_out, redshifts, galaxy_ids)

    # Remove input values above the given percentile
    if erase_above is not None:
        LOG.info('****Removing all input values above percentile {0}****'.format(erase_above))
        all_in, all_out, redshifts, galaxy_ids = remove_above_percentile(all_in, all_out, redshifts,
                                                                         galaxy_ids, erase_above)

    # Bin the values into percentile groups
    if bin_type is not None:
        LOG.info('****Binning all input and output values at precision {0}****'.format(bin_precision))
        bin_func = get_binning_function(bin_type)
        all_in = bin_func(all_in, bin_precision)
        all_in = all_in.astype('float32')

        all_out = bin_func(all_out, bin_precision)
        all_out = all_out.astype('float32')
    """
    print 'In shape'
    print np.shape(all_in)
    print 'Out shape'
    print np.shape(all_out)
    print 'Gal id'
    print np.shape(galaxy_ids)
    print 'Redshift'
    print np.shape(redshifts)

    print '\n\n\nFirst 5'
    for i in range(0, 5):
        print galaxy_ids[i]
        print redshifts[i]
        print all_in[i]
        print all_out[i]
        print
    """

    in_normaliser = None
    out_normaliser = None

    if input_normaliser is not None:
        # We should normalise the input
        LOG.info('Normalising input with {0}'.format(input_normaliser))

        # Get the normaliser class specified by the user
        in_normaliser = normaliser_from_user(input_normaliser)

        all_in = in_normaliser.normalise(all_in)

        LOG.info('Normalising input done.')

    if output_normaliser is not None:
        # We should normalise the output
        LOG.info('Normalising output with {0}'.format(output_normaliser))

        # Get the normaliser class specified by the user
        out_normaliser = normaliser_from_user(output_normaliser)

        all_out = out_normaliser.normalise(all_out)

        LOG.info('Normalising output done.')

    """
    print '\n\n\nFirst 5'
    for i in range(0, 5):
        print galaxy_ids[i]
        print redshifts[i]
        print all_in[i]
        print all_out[i]
        print
    """

    # Grab some statistics of everything.
    mean_in = np.mean(all_in, axis=0)
    mean_out = np.mean(all_out, axis=0)
    std_in = np.std(all_in, axis=0)
    std_out = np.std(all_out, axis=0)

    min_in = np.min(all_in, axis=0)
    max_in = np.max(all_in, axis=0)

    min_out = np.min(all_out, axis=0)
    max_out = np.max(all_out, axis=0)

    # Shuffle all arrays in the same way
    all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(all_in, all_out, redshifts, galaxy_ids)

    # Split the data up into training and test sets.
    split_point = test_data
    test_in, train_in = split_data(all_in, split_point)
    redshift_test, redshift_train = split_data(redshifts, split_point)
    galaxy_ids_test, galaxy_ids_train = split_data(galaxy_ids, split_point)
    test_out, train_out = split_data(all_out, split_point)

    # A lot of data to return
    return {'train_in': train_in, 'train_out': train_out, 'test_in': test_in, 'test_out': test_out,
            'galaxy_ids_test': galaxy_ids_test, 'galaxy_ids_train': galaxy_ids_train,
            'redshifts_test': redshift_test, 'redshifts_train': redshift_train,
            'in_normaliser': in_normaliser, 'out_normaliser': out_normaliser,
            'mean_in': mean_in, 'mean_out': mean_out,
            'stddev_in': std_in, 'stddev_out': std_out,
            'min_in': min_in, 'max_in': max_in,
            'min_out': min_out, 'max_out': max_out}
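
A minimal sketch of how get_training_data might be invoked. DatabaseConfig is assumed to have the shape shown in Examples #2 and #3; the PreprocessingConfig keys are exactly those read at the top of the function, with pass-through values chosen here purely for illustration, and the temp-file name is hypothetical.

# Hypothetical invocation; all concrete values are illustrative only.
PreprocessingConfig = {'erase_above': None, 'remove_negative_inputs': False,
                       'flip': False, 'random_input': False,
                       'binning_type': None, 'binning_precision': None,
                       'single_input': None, 'single_output': None,
                       'normalise_input': None, 'normalise_output': None}
FileConfig = {'temp_file': 'nn_last_tmp_input.tmp'}

data = get_training_data(DatabaseConfig, PreprocessingConfig, FileConfig)
train_in, train_out = data['train_in'], data['train_out']
test_in, test_out = data['test_in'], data['test_out']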
Example #2
DatabaseConfig = {
    'database_connection_string': 'sqlite:///Database_run06.db',
    'train_data': 200000,
    'test_data': 1000,
    'run_id': '06',
    'output_type': 'median',  # median, best_fit, best_fit_model, best_fit_inputs
    'input_type': 'normal',  # normal, Jy
    'include_sigma': False,  # True, False
    'unknown_input_handler': None,
    'input_filter_types': None
}

if check_temp('nn_last_tmp_input3.tmp', {1: 1}):
    all_in, all_out, redshifts, galaxy_ids = load_from_file(
        'nn_last_tmp_input3.tmp')
else:
    all_in, all_out, redshifts, galaxy_ids = get_train_test_data(
        DatabaseConfig)
    write_file('nn_last_tmp_input3.tmp', {1: 1}, all_in, all_out, redshifts,
               galaxy_ids)

all_in = np.array(all_in)
all_out = np.array(all_out)
redshifts = np.array(redshifts)
galaxy_ids = np.array(galaxy_ids)

all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(
    all_in, all_out, redshifts, galaxy_ids)

std = get_normaliser('standardise')
standardised = std.normalise(all_in)
norm = get_normaliser('normalise')
normalised = norm.normalise(all_in)
Example #3
DatabaseConfig = {'database_connection_string': 'sqlite:///Database_run06.db',
                  'train_data': 200000,
                  'test_data': 1000,
                  'run_id': '06',
                  'output_type': 'median',  # median, best_fit, best_fit_model, best_fit_inputs
                  'input_type': 'normal',  # normal, Jy
                  'include_sigma': False,  # True, False
                  'unknown_input_handler': None,
                  'input_filter_types': None
                  }

if check_temp('nn_last_tmp_input3.tmp', {1: 1}):
    all_in, all_out, redshifts, galaxy_ids = load_from_file('nn_last_tmp_input3.tmp')
else:
    all_in, all_out, redshifts, galaxy_ids = get_train_test_data(DatabaseConfig)
    write_file('nn_last_tmp_input3.tmp', {1: 1}, all_in, all_out, redshifts, galaxy_ids)


all_in = np.array(all_in)
all_out = np.array(all_out)
redshifts = np.array(redshifts)
galaxy_ids = np.array(galaxy_ids)

all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(all_in, all_out, redshifts, galaxy_ids)

std = get_normaliser('standardise')
standardised = std.normalise(all_in)
norm = get_normaliser('normalise')
normalised = norm.normalise(all_in)
logged = np.log(all_in)
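
get_normaliser is not defined in these snippets. As a rough sketch of what the two names conventionally denote, 'standardise' is usually a per-column z-score and 'normalise' a per-column min-max rescale; the class shape below is an assumption, not the project's actual API.

import numpy as np


class StandardiseNormaliser(object):
    """Hypothetical z-score normaliser: (x - mean) / std per column."""
    def normalise(self, values):
        values = np.asarray(values, dtype=float)
        self.mean = values.mean(axis=0)
        self.std = values.std(axis=0)
        return (values - self.mean) / self.std


class MinMaxNormaliser(object):
    """Hypothetical min-max normaliser: rescale each column to [0, 1]."""
    def normalise(self, values):
        values = np.asarray(values, dtype=float)
        self.min = values.min(axis=0)
        self.max = values.max(axis=0)
        return (values - self.min) / (self.max - self.min)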
Example #4
def run_network(connections,
                layers,
                single_value=None,
                input_filter_types=None):

    nn_config_dict = {
        'test': test_data,
        'train': train_data,
        'run': run_id,
        'input_type': input_type,
        'output_type': output_type,
        'repeat_redshift': repeat_redshift,
        'value': single_value,
        'input_filter_types': input_filter_types
    }

    if check_temp(tmp_file, nn_config_dict):
        LOG.info('Correct temp file exists at {0}, loading from temp'.format(
            tmp_file))
        test_in, test_out, train_in, train_out, galaxy_ids = load_from_file(
            tmp_file)
        LOG.info('Done.')
    else:
        LOG.info('No temp file, reading from database.')
        test_in, test_out, train_in, train_out, galaxy_ids = get_train_test_data(
            test_data,
            train_data,
            input_type=input_type,
            output_type=output_type,
            repeat_redshift=repeat_redshift,
            input_filter_types=input_filter_types)

        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, nn_config_dict, test_in, test_out, train_in,
                   train_out, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    LOG.info('\nNormalising...')
    train_in_min, train_in_max, train_in = normalise_2Darray(train_in)
    #train_out_min, train_out_max, train_out = normalise_2Darray(train_out)

    test_in_min, test_in_max, test_in = normalise_2Darray(test_in)
    #test_out_min, test_out_max, test_out = normalise_2Darray(test_out)

    LOG.info('Normalising done.')

    print(np.shape(train_in))
    print(np.shape(train_out))
    print(np.shape(test_in))
    print(np.shape(test_out))

    input_dim = 0

    # input_filter_types defaults to None; fall back to an empty list so the
    # membership checks below don't raise a TypeError.
    if input_filter_types is None:
        input_filter_types = []

    if 'optical' in input_filter_types:
        input_dim += 10

    if 'ir' in input_filter_types:
        input_dim += 9

    if 'uv' in input_filter_types:
        input_dim += 2

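    # The doubling below presumably accounts for each filter contributing both
    # a value and its uncertainty (cf. the include_sigma config option); this
    # is an assumption, not confirmed by the surrounding code.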
    input_dim *= 2

    data_set = SupervisedDataSet(input_dim + repeat_redshift, 15)

    for i in range(0, len(train_in)):
        data_set.addSample(train_in[i], train_out[i])

    LOG.info('Compiling neural network model')

    network = FeedForwardNetwork()

    input_layer = TanhLayer(input_dim + repeat_redshift, 'Input')
    network.addInputModule(input_layer)

    prev_layer = TanhLayer(connections, 'hidden0')
    network.addModule(prev_layer)
    network.addConnection(FullConnection(input_layer, prev_layer))

    # Chain any additional hidden layers, each fed by the previous layer.
    # Names start at hidden1 to avoid colliding with hidden0 above, and
    # connections run from the earlier layer to the later one.
    for i in range(0, layers):
        new_layer = TanhLayer(connections, 'hidden{0}'.format(i + 1))
        network.addModule(new_layer)
        network.addConnection(FullConnection(prev_layer, new_layer))
        prev_layer = new_layer

    output_layer = LinearLayer(15, 'output')
    network.addOutputModule(output_layer)
    network.addConnection(FullConnection(prev_layer, output_layer))

    network.sortModules()

    trainer = BackpropTrainer(network, data_set, verbose=True)

    LOG.info("Compiled.")

    epochs = 0
    do_test = 10
    trained = False
    while not trained:
        error = trainer.train()
        epochs += 1
        do_test -= 1

        LOG.info('Error rate at epoch {0}: {1}'.format(epochs, error))

        if error < 0.001 or epochs == 500:
            trained = True

        if do_test == 0:
            with open(
                    'pybrain_inputs_{0}_outputs_{1}.txt'.format(
                        input_filter_types, output_names[single_value]),
                    'w') as f:
                for i in range(0, 20):

                    test_to_use = rand.randint(0, test_data - 1)
                    ans = network.activate(np.array(test_in[test_to_use]))

                    #f.write('Test {0} for epoch {1}\n'.format(i, total_epoch))
                    f.write('\nGalaxy number = {0}\n'.format(
                        galaxy_ids[test_to_use]))
                    f.write('Inputs: ')
                    for item in test_in[test_to_use]:
                        f.write(str(item))
                    f.write('\nOutput   Correct\n')
                    for a in range(0, len(test_out[test_to_use])):
                        #f.write('{0}: {1}  =   {2}\n'.format(output_names[a], denormalise_value(ans[a], train_out_min[a], train_out_max[a]), denormalise_value(test_out[test_to_use][a], test_out_min[a], test_out_max[a])))
                        f.write('{0}: {1}  =   {2}\n'.format(
                            output_names[a], ans[a], test_out[test_to_use][a]))
                    f.write('\n\n')

            do_test = 10
Example #5
def run_network(connections, layers, single_value=None, input_filter_types=None):

    nn_config_dict = {'test': test_data, 'train': train_data, 'run': run_id,
                      'input_type': input_type, 'output_type': output_type,
                      'repeat_redshift': repeat_redshift, 'value': single_value,
                      'input_filter_types': input_filter_types}

    if check_temp(tmp_file, nn_config_dict):
        LOG.info('Correct temp file exists at {0}, loading from temp'.format(tmp_file))
        test_in, test_out, train_in, train_out, galaxy_ids = load_from_file(tmp_file)
        LOG.info('Done.')
    else:
        LOG.info('No temp file, reading from database.')
        test_in, test_out, train_in, train_out, galaxy_ids = get_train_test_data(test_data, train_data, input_type=input_type,
                                                                                 output_type=output_type,
                                                                                 repeat_redshift=repeat_redshift,
                                                                                 input_filter_types=input_filter_types)

        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, nn_config_dict, test_in, test_out, train_in, train_out, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    LOG.info('\nNormalising...')
    train_in_min, train_in_max, train_in = normalise_2Darray(train_in)
    #train_out_min, train_out_max, train_out = normalise_2Darray(train_out)

    test_in_min, test_in_max, test_in = normalise_2Darray(test_in)
    #test_out_min, test_out_max, test_out = normalise_2Darray(test_out)

    LOG.info('Normalising done.')

    print(np.shape(train_in))
    print(np.shape(train_out))
    print(np.shape(test_in))
    print(np.shape(test_out))

    input_dim = 0

    # input_filter_types defaults to None; fall back to an empty list so the
    # membership checks below don't raise a TypeError.
    if input_filter_types is None:
        input_filter_types = []

    if 'optical' in input_filter_types:
        input_dim += 10

    if 'ir' in input_filter_types:
        input_dim += 9

    if 'uv' in input_filter_types:
        input_dim += 2

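    # The doubling below presumably accounts for each filter contributing both
    # a value and its uncertainty (cf. the include_sigma config option); this
    # is an assumption, not confirmed by the surrounding code.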
    input_dim *= 2

    data_set = SupervisedDataSet(input_dim + repeat_redshift, 15)

    for i in range(0, len(train_in)):
        data_set.addSample(train_in[i], train_out[i])

    LOG.info('Compiling neural network model')

    network = FeedForwardNetwork()

    input_layer = TanhLayer(input_dim + repeat_redshift, 'Input')
    network.addInputModule(input_layer)

    prev_layer = TanhLayer(connections, 'hidden0')
    network.addModule(prev_layer)
    network.addConnection(FullConnection(input_layer, prev_layer))

    # Chain any additional hidden layers, each fed by the previous layer.
    # Names start at hidden1 to avoid colliding with hidden0 above, and
    # connections run from the earlier layer to the later one.
    for i in range(0, layers):
        new_layer = TanhLayer(connections, 'hidden{0}'.format(i + 1))
        network.addModule(new_layer)
        network.addConnection(FullConnection(prev_layer, new_layer))
        prev_layer = new_layer

    output_layer = LinearLayer(15, 'output')
    network.addOutputModule(output_layer)
    network.addConnection(FullConnection(prev_layer, output_layer))

    network.sortModules()

    trainer = BackpropTrainer(network, data_set, verbose=True)

    LOG.info("Compiled.")

    epochs = 0
    do_test = 10
    trained = False
    while not trained:
        error = trainer.train()
        epochs += 1
        do_test -= 1

        LOG.info('Error rate at epoch {0}: {1}'.format(epochs, error))

        if error < 0.001 or epochs == 500:
            trained = True

        if do_test == 0:
            with open('pybrain_inputs_{0}_outputs_{1}.txt'.format(input_filter_types, output_names[single_value]), 'w') as f:
                for i in range(0, 20):

                    test_to_use = rand.randint(0, test_data - 1)
                    ans = network.activate(np.array(test_in[test_to_use]))

                    #f.write('Test {0} for epoch {1}\n'.format(i, total_epoch))
                    f.write('\nGalaxy number = {0}\n'.format(galaxy_ids[test_to_use]))
                    f.write('Inputs: ')
                    for item in test_in[test_to_use]:
                        f.write(str(item))
                    f.write('\nOutput   Correct\n')
                    for a in range(0, len(test_out[test_to_use])):
                        #f.write('{0}: {1}  =   {2}\n'.format(output_names[a], denormalise_value(ans[a], train_out_min[a], train_out_max[a]), denormalise_value(test_out[test_to_use][a], test_out_min[a], test_out_max[a])))
                        f.write('{0}: {1}  =   {2}\n'.format(output_names[a], ans[a], test_out[test_to_use][a]))
                    f.write('\n\n')

            do_test = 10
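
A hypothetical call of the run_network shown in Examples #4 and #5, for orientation: connections is the width of each hidden Tanh layer, layers the number of additional hidden layers chained after 'hidden0', and single_value an index into output_names used only for the report filename. The concrete values are illustrative.

# Hypothetical invocation; values are illustrative only.
run_network(connections=40, layers=2, single_value=0,
            input_filter_types=['optical', 'ir'])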