def get_training_data(DatabaseConfig, PreprocessingConfig, FileConfig):
    """
    Load, preprocess and split the training/test data set.

    Reads the raw input/output arrays either from a temp cache file (when
    one matching DatabaseConfig exists) or from the database, applies the
    preprocessing steps requested in PreprocessingConfig (flip, random
    input, single column selection, negative removal, percentile clipping,
    binning, normalisation), gathers per-feature statistics, then shuffles
    and splits into train/test sets.

    :param DatabaseConfig: dict of database settings; 'test_data' is the
        number of samples reserved for the test set.
    :param PreprocessingConfig: dict of preprocessing switches (keys read
        below).
    :param FileConfig: dict; 'temp_file' is the cache file path.
    :return: dict with train/test arrays, galaxy ids, redshifts, the
        fitted normalisers (or None) and per-feature statistics.
    """
    tmp_file = FileConfig['temp_file']
    erase_above = PreprocessingConfig['erase_above']
    remove_negatives = PreprocessingConfig['remove_negative_inputs']
    flip = PreprocessingConfig['flip']
    random_input = PreprocessingConfig['random_input']
    test_data = DatabaseConfig['test_data']
    bin_type = PreprocessingConfig['binning_type']
    bin_precision = PreprocessingConfig['binning_precision']
    single_input = PreprocessingConfig['single_input']
    single_output = PreprocessingConfig['single_output']
    input_normaliser = PreprocessingConfig['normalise_input']
    output_normaliser = PreprocessingConfig['normalise_output']

    if check_temp(tmp_file, DatabaseConfig):  # Check the temp file
        LOG.info('Correct temp file exists at {0}, loading from temp'.format(tmp_file))
        all_in, all_out, redshifts, galaxy_ids = load_from_file(tmp_file)
        LOG.info('Done.')
    else:
        LOG.info('No temp file, reading from database.')
        all_in, all_out, redshifts, galaxy_ids = get_train_test_data(DatabaseConfig)
        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, DatabaseConfig, all_in, all_out, redshifts, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    # Convert the arrays to np arrays
    all_in = np.array(all_in)
    all_out = np.array(all_out)
    redshifts = np.array(redshifts)
    galaxy_ids = np.array(galaxy_ids)

    # Flip input and output (tuple swap replaces the old tmp variable)
    if flip:
        LOG.info('****Flipping input and output!****')
        all_in, all_out = all_out, all_in

    # Use random values for input
    if random_input:
        LOG.info('****Using randomly generated input values!****')
        all_in = random_in_shape(all_in)

    # Do we want only a single output/input?
    # ('is not None' so column index 0 is still honoured)
    if single_output is not None:
        LOG.info('****Using single output {0}****'.format(single_output))
        all_out = all_out[:, single_output]

    if single_input is not None:
        LOG.info('****Using single input {0}****'.format(single_input))
        all_in = all_in[:, single_input]

    # Remove negative values in the input (BEFORE normalisation!)
    if remove_negatives:
        # Fixed: dropped a stray '.format(single_input)' that was applied to
        # a message containing no placeholder.
        LOG.info('****Removing negative input values****')
        all_in, all_out, redshifts, galaxy_ids = remove_negative_values(all_in, all_out, redshifts, galaxy_ids)

    # Remove input values above the requested percentile
    if erase_above is not None:
        LOG.info('****Removing all input values above percentile {0}****'.format(erase_above))
        all_in, all_out, redshifts, galaxy_ids = remove_above_percentile(all_in, all_out, redshifts, galaxy_ids, erase_above)

    # Bin the values in to percentile groups
    if bin_type is not None:
        LOG.info('****Binning all input and output values at precision {0}****'.format(bin_precision))
        bin_func = get_binning_function(bin_type)
        all_in = bin_func(all_in, bin_precision).astype('float32')
        all_out = bin_func(all_out, bin_precision).astype('float32')

    in_normaliser = None
    out_normaliser = None

    if input_normaliser is not None:
        # We should normalise the input
        LOG.info('Normalising input with {0}'.format(input_normaliser))
        # Get the normaliser class specified by the user
        in_normaliser = normaliser_from_user(input_normaliser)
        all_in = in_normaliser.normalise(all_in)
        LOG.info('Normalising input done.')

    if output_normaliser is not None:
        # We should normalise the output
        LOG.info('Normalising output with {0}'.format(output_normaliser))
        # Get the normaliser class specified by the user
        out_normaliser = normaliser_from_user(output_normaliser)
        all_out = out_normaliser.normalise(all_out)
        LOG.info('Normalising output done.')

    # Grab some statistics of everything (after normalisation, before the
    # shuffle -- the shuffle does not change per-feature statistics).
    mean_in = np.mean(all_in, axis=0)
    mean_out = np.mean(all_out, axis=0)
    std_in = np.std(all_in, axis=0)
    std_out = np.std(all_out, axis=0)
    min_in = np.min(all_in, axis=0)
    max_in = np.max(all_in, axis=0)
    min_out = np.min(all_out, axis=0)
    max_out = np.max(all_out, axis=0)

    # Shuffle all arrays in the same way
    all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(all_in, all_out, redshifts, galaxy_ids)

    # Split the data up in to training and test sets.
    split_point = test_data
    test_in, train_in = split_data(all_in, split_point)
    redshift_test, redshift_train = split_data(redshifts, split_point)
    galaxy_ids_test, galaxy_ids_train = split_data(galaxy_ids, split_point)
    test_out, train_out = split_data(all_out, split_point)

    # A lot of data to return
    return {'train_in': train_in, 'train_out': train_out,
            'test_in': test_in, 'test_out': test_out,
            'galaxy_ids_test': galaxy_ids_test,
            'galaxy_ids_train': galaxy_ids_train,
            'redshifts_test': redshift_test,
            'redshifts_train': redshift_train,
            'in_normaliser': in_normaliser,
            'out_normaliser': out_normaliser,
            'mean_in': mean_in, 'mean_out': mean_out,
            'stddev_in': std_in, 'stddev_out': std_out,
            'min_in': min_in, 'max_in': max_in,
            'min_out': min_out, 'max_out': max_out}
# NOTE(review): this fragment begins mid dict-literal -- the opening
# "DatabaseConfig = {" (and the connection-string entry) lies outside the
# visible chunk of the file.  Tokens kept exactly as found.
               'train_data': 200000,
               'test_data': 1000,
               'run_id': '06',
               'output_type': 'median',  # median, best_fit, best_fit_model, best_fit_inputs
               'input_type': 'normal',  # normal, Jy
               'include_sigma': False,  # True, False
               'unknown_input_handler': None,
               'input_filter_types': None
               }

# Load cached arrays when a matching temp file exists, otherwise read
# from the database and write the cache for next time.
# NOTE(review): {1: 1} is passed where a config dict is expected --
# presumably a placeholder; confirm check_temp/write_file semantics.
if check_temp('nn_last_tmp_input3.tmp', {1: 1}):
    all_in, all_out, redshifts, galaxy_ids = load_from_file(
        'nn_last_tmp_input3.tmp')
else:
    all_in, all_out, redshifts, galaxy_ids = get_train_test_data(
        DatabaseConfig)

    write_file('nn_last_tmp_input3.tmp', {1: 1}, all_in, all_out,
               redshifts, galaxy_ids)

# Convert to numpy arrays for the transforms below.
all_in = np.array(all_in)
all_out = np.array(all_out)
redshifts = np.array(redshifts)
galaxy_ids = np.array(galaxy_ids)

# Shuffle all four arrays with the same permutation so rows stay aligned.
all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(
    all_in, all_out, redshifts, galaxy_ids)

# Build two views of the inputs for comparison: standardised and
# min/max normalised.
std = get_normaliser('standardise')
standardised = std.normalise(all_in)

norm = get_normaliser('normalise')
normalised = norm.normalise(all_in)
# Configuration for training run 06: where the SED-fit results database
# lives and how large the train/test sets are.
DatabaseConfig = {'database_connection_string': 'sqlite:///Database_run06.db',
                  'train_data': 200000,
                  'test_data': 1000,
                  'run_id': '06',
                  'output_type': 'median',  # median, best_fit, best_fit_model, best_fit_inputs
                  'input_type': 'normal',  # normal, Jy
                  'include_sigma': False,  # True, False
                  'unknown_input_handler': None,
                  'input_filter_types': None
                  }

# Load cached arrays when a matching temp file exists, otherwise read
# from the database and write the cache for next time.
# NOTE(review): {1:1} is passed where a config dict is expected --
# presumably a placeholder; confirm check_temp/write_file semantics.
if check_temp('nn_last_tmp_input3.tmp', {1:1}):
    all_in, all_out, redshifts, galaxy_ids = load_from_file('nn_last_tmp_input3.tmp')
else:
    all_in, all_out, redshifts, galaxy_ids = get_train_test_data(DatabaseConfig)
    write_file('nn_last_tmp_input3.tmp', {1:1}, all_in, all_out, redshifts, galaxy_ids)

# Convert to numpy arrays for the transforms below.
all_in = np.array(all_in)
all_out = np.array(all_out)
redshifts = np.array(redshifts)
galaxy_ids = np.array(galaxy_ids)

# Shuffle all four arrays with the same permutation so rows stay aligned.
all_in, all_out, redshifts, galaxy_ids = shuffle_arrays(all_in, all_out, redshifts, galaxy_ids)

# Build three views of the inputs for comparison: standardised,
# min/max normalised, and log-scaled.
std = get_normaliser('standardise')
standardised = std.normalise(all_in)

norm = get_normaliser('normalise')
normalised = norm.normalise(all_in)

# NOTE(review): np.log yields NaN/-inf (with runtime warnings) for
# inputs <= 0 -- confirm all_in is strictly positive at this point.
logged = np.log(all_in)
def run_network(connections, layers, single_value=None, input_filter_types=None):
    """
    Build, train and periodically evaluate a pybrain feed-forward network.

    :param connections: number of units in each hidden layer.
    :param layers: number of extra hidden layers added after 'hidden0'.
    :param single_value: index into output_names used in the results file
        name.  NOTE(review): must not be None when a test dump is written,
        or output_names[None] raises -- confirm callers always pass it.
    :param input_filter_types: iterable of filter groups ('optical', 'ir',
        'uv') determining the input dimensionality; may be None.

    Uses module-level configuration: test_data, train_data, run_id,
    input_type, output_type, repeat_redshift, tmp_file, output_names.
    """
    nn_config_dict = {'test': test_data,
                      'train': train_data,
                      'run': run_id,
                      'input_type': input_type,
                      'output_type': output_type,
                      'repeat_redshift': repeat_redshift,
                      'value': single_value,
                      'input_filter_types': input_filter_types}

    if check_temp(tmp_file, nn_config_dict):
        LOG.info('Correct temp file exists at {0}, loading from temp'.format(tmp_file))
        test_in, test_out, train_in, train_out, galaxy_ids = load_from_file(tmp_file)
        LOG.info('Done.')
    else:
        LOG.info('No temp file, reading from database.')
        test_in, test_out, train_in, train_out, galaxy_ids = get_train_test_data(
            test_data, train_data,
            input_type=input_type,
            output_type=output_type,
            repeat_redshift=repeat_redshift,
            input_filter_types=input_filter_types)
        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, nn_config_dict, test_in, test_out, train_in, train_out, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    # Only the inputs are normalised; output normalisation was deliberately
    # disabled in the original code.
    LOG.info('\nNormalising...')
    train_in_min, train_in_max, train_in = normalise_2Darray(train_in)
    test_in_min, test_in_max, test_in = normalise_2Darray(test_in)
    LOG.info('Normalising done.')

    print(np.shape(train_in))
    print(np.shape(train_out))
    print(np.shape(test_in))
    print(np.shape(test_out))

    # Fixed: the default input_filter_types=None made these membership
    # tests raise TypeError; treat None as "no filters".
    filters = input_filter_types or ()
    input_dim = 0
    if 'optical' in filters:
        input_dim += 10
    if 'ir' in filters:
        input_dim += 9
    if 'uv' in filters:
        input_dim += 2
    input_dim *= 2

    data_set = SupervisedDataSet(input_dim + repeat_redshift, 15)
    for i in range(0, len(train_in)):
        data_set.addSample(train_in[i], train_out[i])

    LOG.info('Compiling neural network model')
    network = FeedForwardNetwork()

    input_layer = TanhLayer(input_dim + repeat_redshift, 'Input')
    network.addInputModule(input_layer)

    prev_layer = TanhLayer(connections, 'hidden0')
    network.addModule(prev_layer)
    network.addConnection(FullConnection(input_layer, prev_layer))

    for i in range(0, layers):
        # Fixed: names restarted at 'hidden0', colliding with the layer
        # added above; extra layers now start at 'hidden1'.
        new_layer = TanhLayer(connections, 'hidden{0}'.format(i + 1))
        network.addModule(new_layer)
        # Fixed: the connection was built backwards (new_layer -> prev_layer)
        # so the forward pass never flowed through the hidden chain.
        network.addConnection(FullConnection(prev_layer, new_layer))
        prev_layer = new_layer

    output_layer = LinearLayer(15, 'output')
    network.addOutputModule(output_layer)
    # Fixed: connecting from new_layer raised NameError when layers == 0;
    # prev_layer is always the last hidden layer.
    network.addConnection(FullConnection(prev_layer, output_layer))
    network.sortModules()

    trainer = BackpropTrainer(network, data_set, verbose=True)
    LOG.info("Compiled.")

    epochs = 0
    do_test = 10  # dump sample predictions every 10 epochs
    trained = False
    while not trained:
        error = trainer.train()
        epochs += 1
        do_test -= 1
        LOG.info('Error rate at epoch {0}: {1}'.format(epochs, error))

        # Stop on convergence or after 500 epochs.
        if error < 0.001 or epochs == 500:
            trained = True

        if do_test == 0:
            with open('pybrain_inputs_{0}_outputs_{1}.txt'.format(input_filter_types, output_names[single_value]), 'w') as f:
                for i in range(0, 20):
                    test_to_use = rand.randint(0, test_data - 1)
                    ans = network.activate(np.array(test_in[test_to_use]))
                    f.write('\nGalaxy number = {0}\n'.format(galaxy_ids[test_to_use]))
                    f.write('Inputs: ')
                    for item in test_in[test_to_use]:
                        f.write(str(item))
                    f.write('\nOutput Correct\n')
                    for a in range(0, len(test_out[test_to_use])):
                        f.write('{0}: {1} = {2}\n'.format(output_names[a], ans[a], test_out[test_to_use][a]))
                    f.write('\n\n')
            do_test = 10
def run_network(connections, layers, single_value=None, input_filter_types=None):
    """
    Build, train and periodically evaluate a pybrain feed-forward network.

    Duplicate definition of run_network found later in the file; fixed
    identically so both copies agree.

    :param connections: number of units in each hidden layer.
    :param layers: number of extra hidden layers added after 'hidden0'.
    :param single_value: index into output_names used in the results file
        name.  NOTE(review): must not be None when a test dump is written,
        or output_names[None] raises -- confirm callers always pass it.
    :param input_filter_types: iterable of filter groups ('optical', 'ir',
        'uv') determining the input dimensionality; may be None.

    Uses module-level configuration: test_data, train_data, run_id,
    input_type, output_type, repeat_redshift, tmp_file, output_names.
    """
    nn_config_dict = {'test': test_data,
                      'train': train_data,
                      'run': run_id,
                      'input_type': input_type,
                      'output_type': output_type,
                      'repeat_redshift': repeat_redshift,
                      'value': single_value,
                      'input_filter_types': input_filter_types}

    if check_temp(tmp_file, nn_config_dict):
        LOG.info('Correct temp file exists at {0}, loading from temp'.format(tmp_file))
        test_in, test_out, train_in, train_out, galaxy_ids = load_from_file(tmp_file)
        LOG.info('Done.')
    else:
        LOG.info('No temp file, reading from database.')
        test_in, test_out, train_in, train_out, galaxy_ids = get_train_test_data(
            test_data, train_data,
            input_type=input_type,
            output_type=output_type,
            repeat_redshift=repeat_redshift,
            input_filter_types=input_filter_types)
        LOG.info('Done. Writing temp file for next time.')
        write_file(tmp_file, nn_config_dict, test_in, test_out, train_in, train_out, galaxy_ids)
        LOG.info('Done. Temp file written to {0}'.format(tmp_file))

    # Only the inputs are normalised; output normalisation was deliberately
    # disabled in the original code.
    LOG.info('\nNormalising...')
    train_in_min, train_in_max, train_in = normalise_2Darray(train_in)
    test_in_min, test_in_max, test_in = normalise_2Darray(test_in)
    LOG.info('Normalising done.')

    print(np.shape(train_in))
    print(np.shape(train_out))
    print(np.shape(test_in))
    print(np.shape(test_out))

    # Fixed: the default input_filter_types=None made these membership
    # tests raise TypeError; treat None as "no filters".
    filters = input_filter_types or ()
    input_dim = 0
    if 'optical' in filters:
        input_dim += 10
    if 'ir' in filters:
        input_dim += 9
    if 'uv' in filters:
        input_dim += 2
    input_dim *= 2

    data_set = SupervisedDataSet(input_dim + repeat_redshift, 15)
    for i in range(0, len(train_in)):
        data_set.addSample(train_in[i], train_out[i])

    LOG.info('Compiling neural network model')
    network = FeedForwardNetwork()

    input_layer = TanhLayer(input_dim + repeat_redshift, 'Input')
    network.addInputModule(input_layer)

    prev_layer = TanhLayer(connections, 'hidden0')
    network.addModule(prev_layer)
    network.addConnection(FullConnection(input_layer, prev_layer))

    for i in range(0, layers):
        # Fixed: names restarted at 'hidden0', colliding with the layer
        # added above; extra layers now start at 'hidden1'.
        new_layer = TanhLayer(connections, 'hidden{0}'.format(i + 1))
        network.addModule(new_layer)
        # Fixed: the connection was built backwards (new_layer -> prev_layer)
        # so the forward pass never flowed through the hidden chain.
        network.addConnection(FullConnection(prev_layer, new_layer))
        prev_layer = new_layer

    output_layer = LinearLayer(15, 'output')
    network.addOutputModule(output_layer)
    # Fixed: connecting from new_layer raised NameError when layers == 0;
    # prev_layer is always the last hidden layer.
    network.addConnection(FullConnection(prev_layer, output_layer))
    network.sortModules()

    trainer = BackpropTrainer(network, data_set, verbose=True)
    LOG.info("Compiled.")

    epochs = 0
    do_test = 10  # dump sample predictions every 10 epochs
    trained = False
    while not trained:
        error = trainer.train()
        epochs += 1
        do_test -= 1
        LOG.info('Error rate at epoch {0}: {1}'.format(epochs, error))

        # Stop on convergence or after 500 epochs.
        if error < 0.001 or epochs == 500:
            trained = True

        if do_test == 0:
            with open('pybrain_inputs_{0}_outputs_{1}.txt'.format(input_filter_types, output_names[single_value]), 'w') as f:
                for i in range(0, 20):
                    test_to_use = rand.randint(0, test_data - 1)
                    ans = network.activate(np.array(test_in[test_to_use]))
                    f.write('\nGalaxy number = {0}\n'.format(galaxy_ids[test_to_use]))
                    f.write('Inputs: ')
                    for item in test_in[test_to_use]:
                        f.write(str(item))
                    f.write('\nOutput Correct\n')
                    for a in range(0, len(test_out[test_to_use])):
                        f.write('{0}: {1} = {2}\n'.format(output_names[a], ans[a], test_out[test_to_use][a]))
                    f.write('\n\n')
            do_test = 10