import numpy as np
import scipy.io

# Progress and pairs_to_image are helpers defined elsewhere in this repo.
def main(image_size=10):
    print('Loading pairs data and converting to images')
    with open('../../data/training-flipped/CEdata_train_pairs.csv', 'r') as pairs_data_file:
        pairs_header = pairs_data_file.readline()
        pairs_body = pairs_data_file.readlines()
    Inps = np.zeros([len(pairs_body), image_size ** 2])
    prog = Progress(len(pairs_body))
    for (i, line) in enumerate(pairs_body):
        # Columns are: id, space-separated values of A, space-separated values of B
        A = np.array([float(a) for a in line.strip().split(',')[1].strip().split(' ')])
        B = np.array([float(b) for b in line.strip().split(',')[2].strip().split(' ')])
        Inps[i, :] = pairs_to_image(A, B, image_size)
        prog.tick()
    prog.done()

    print('Loading validation data and converting to images')
    with open('../../data/validation/CEfinal_valid_pairs.csv', 'r') as valid_data_file:
        valid_header = valid_data_file.readline()
        valid_body = valid_data_file.readlines()
    validInps = np.zeros([len(valid_body), image_size ** 2])
    prog = Progress(len(valid_body))
    for (i, line) in enumerate(valid_body):
        A = np.array([float(a) for a in line.strip().split(',')[1].strip().split(' ')])
        B = np.array([float(b) for b in line.strip().split(',')[2].strip().split(' ')])
        validInps[i, :] = pairs_to_image(A, B, image_size)
        prog.tick()
    prog.done()

    print('Saving data to MATLAB format')
    scipy.io.savemat('images_10_pit.mat', {'train_images': Inps, 'valid_images': validInps})
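# pairs_to_image is defined elsewhere in this repo; its implementation is not
# shown here. A minimal sketch of one plausible version, assuming it bins the
# joint empirical distribution of (A, B) into an image_size x image_size
# histogram and flattens it to match the rows of Inps (assumption, not the
# repo's actual helper):
def pairs_to_image_sketch(A, B, image_size):
    H, _, _ = np.histogram2d(A, B, bins=image_size)
    H = H / H.sum()      # normalise so pixel intensities sum to 1
    return H.ravel()     # length image_size ** 2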
def fineTune(self, minibatchStream, epochs, mbPerEpoch, loss=None, progressBar=True, useDropout=False):
    for ep in range(epochs):
        totalCases = 0
        sumErr = 0
        sumLoss = 0
        # Nesterov momentum uses a different update rule, so pick the step function once per epoch
        step = self.stepNesterov if self.nesterov else self.step
        prog = Progress(mbPerEpoch) if progressBar else DummyProgBar()
        for i in range(mbPerEpoch):
            inpMB, targMB = minibatchStream.next()
            # Only switch on max-norm weight constraints and noise after epoch 6
            usemaxNorm = ep > 6
            usenoises = ep > 6
            err, outMB = step(inpMB, targMB, self.learnRates, self.momentum, self.L2Costs,
                              useDropout, usemaxNorm, usenoises)
            sumErr += err
            if loss is not None:
                sumLoss += loss(targMB, outMB)
            totalCases += inpMB.shape[0]
            prog.tick()
        prog.done()
        yield sumErr / float(totalCases), sumLoss / float(totalCases)
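# Minimal usage sketch: fineTune (in each of its variants below) is a
# generator that yields once per epoch, so training happens as you iterate
# over it. 'net' and 'stream' are hypothetical names for an instance of this
# class and a minibatch iterator.
for ep, (mean_err, mean_loss) in enumerate(net.fineTune(stream, epochs=10,
                                                        mbPerEpoch=100, useDropout=True)):
    print 'Epoch %d: error %f' % (ep + 1, mean_err)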
def read_valid_pairs():
    valid_path = get_paths()["valid_pairs_path"]
    with open(valid_path, 'r') as pairs_data_file:
        pairs_header = pairs_data_file.readline()
        pairs_body = pairs_data_file.readlines()
    pairs = {}
    prog = Progress(len(pairs_body))
    for line in pairs_body:
        A = np.array([float(a) for a in line.strip().split(',')[1].strip().split(' ')])
        B = np.array([float(b) for b in line.strip().split(',')[2].strip().split(' ')])
        pairs[line.split(',')[0]] = (A, B)
        prog.tick()
    prog.done()
    return pairs
def apply_features(data, features):
    #### TODO - can I be made more efficient?
    prog = Progress(len(data))
    output = {key: np.zeros(len(features)) for key in data.keys()}
    for (key, (A, B)) in data.iteritems():
        for (j, (name, variable, f)) in enumerate(features):
            if variable == 'A':
                value = f(A)
            elif variable == 'B':
                value = f(B)
            elif variable == 'derived':
                # 'derived' features are expression strings evaluated in this scope
                value = eval(f)
            else:
                # Anything else is treated as a function of both series
                value = f(A, B)
            output[key][j] = value
        prog.tick()
    prog.done()
    return output
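# Minimal usage sketch: feature tuples are (name, variable, function), where
# variable 'A' or 'B' applies the function to one series and anything else
# (here 'AB') hits the two-argument branch. These features are hypothetical,
# for illustration only.
features = [
    ('mean-A', 'A', np.mean),
    ('std-B', 'B', np.std),
    ('abs-corr', 'AB', lambda A, B: abs(np.corrcoef(A, B)[0, 1])),
]
feature_values = apply_features(read_valid_pairs(), features)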
def fineTune(self, minibatchStream, epochs, mbPerEpoch, loss=None, progressBar=True, useDropout=False):
    for ep in range(epochs):
        totalCases = 0
        sumErr = 0
        sumLoss = 0
        step = self.stepNesterov if self.nesterov else self.step
        prog = Progress(mbPerEpoch) if progressBar else DummyProgBar()
        for i in range(mbPerEpoch):
            if isinstance(self.outputActFunct, LinearMasked):
                # Masked linear outputs need the target mask passed through to the step function
                inpMB, targMB, targMaskMB = minibatchStream.next()
                err, outMB = step(inpMB, targMB, self.learnRates, self.momentum,
                                  self.L2Costs, useDropout, targMaskMB)
            else:
                inpMB, targMB = minibatchStream.next()
                err, outMB = step(inpMB, targMB, self.learnRates, self.momentum,
                                  self.L2Costs, useDropout)
            sumErr += err
            if loss is not None:
                sumLoss += loss(targMB, outMB)
            totalCases += inpMB.shape[0]
            prog.tick()
        prog.done()
        yield sumErr / float(totalCases), sumLoss / float(totalCases)
def fineTune(self, minibatchStream, epochs, mbPerEpoch, loss=None, progressBar=True, useDropout=False):
    for ep in range(epochs):
        totalCases = 0
        sumErr = 0
        sumLoss = 0
        #### TODO - What is nesterov?
        step = self.stepNesterov if self.nesterov else self.step
        prog = Progress(mbPerEpoch) if progressBar else DummyProgBar()
        for i in range(mbPerEpoch):
            inpMB, targMB = minibatchStream.next()
            #### TODO - different version of step for when using dropout or not
            err, outMB = step(inpMB, targMB, self.learnRates, self.momentum, self.L2Costs, useDropout)
            sumErr += err
            if loss is not None:
                sumLoss += loss(targMB, outMB)
            totalCases += inpMB.shape[0]
            prog.tick()
        prog.done()
        yield sumErr / float(totalCases), sumLoss / float(totalCases)
import os
import numpy as np

def reverse_it(overwrite=False):
    if (not overwrite) and os.path.exists('training-reversed/CEdata_train_pairs.csv'):
        print 'Output already exists - not overwriting'
        return
    # Open info
    with open('training/CEdata_train_publicinfo.csv', 'r') as info_data_file:
        info_header = info_data_file.readline()
        info_body = info_data_file.readlines()
    # Reverse it (no change to the info fields - just three fresh ids per original pair)
    original_length = len(info_body)
    for i in range(original_length):
        for _ in range(3):
            info_body.append(','.join(['train%d' % (len(info_body) + 1)] + info_body[i].split(',')[1:]))
    # Open targets
    with open('training/CEdata_train_target.csv', 'r') as target_data_file:
        target_header = target_data_file.readline()
        target_body = target_data_file.readlines()
    # Reverse it - targets are unchanged too, again with three fresh ids per original pair
    original_length = len(target_body)
    for i in range(original_length):
        for _ in range(3):
            target_body.append(','.join(['train%d' % (len(target_body) + 1)] + target_body[i].split(',')[1:]))
    # Open pairs
    with open('training/CEdata_train_pairs.csv', 'r') as pairs_data_file:
        pairs_header = pairs_data_file.readline()
        # Write reversed lines to temporary file
        with open('temp.csv', 'w') as temp_file:
            temp_file.write(pairs_header)
            prog = Progress(original_length * 5)
            for line in pairs_data_file:
                A = np.array([float(a) for a in line.strip().split(',')[1].strip().split(' ')])
                B = np.array([float(b) for b in line.strip().split(',')[2].strip().split(' ')])
                # Binary series are reversed by flipping the labels;
                # continuous series are reflected about their mean
                if set(A) == set([0, 1]):
                    A_reversed = 1 - A
                else:
                    A_reversed = 2 * np.mean(A) - A
                if set(B) == set([0, 1]):
                    B_reversed = 1 - B
                else:
                    B_reversed = 2 * np.mean(B) - B
                temp_file.write(','.join(['dummy-id', ' '.join(str(a) for a in A_reversed), ' '.join(str(b) for b in B)]) + '\n')
                temp_file.write(','.join(['dummy-id', ' '.join(str(a) for a in A), ' '.join(str(b) for b in B_reversed)]) + '\n')
                temp_file.write(','.join(['dummy-id', ' '.join(str(a) for a in A_reversed), ' '.join(str(b) for b in B_reversed)]) + '\n')
                prog.tick()
    # Concatenate original pairs and temporary file, renumbering the ids
    with open('training-reversed/CEdata_train_pairs.csv', 'w') as pairs_data_file:
        pairs_data_file.write(pairs_header)
        i = 1
        for file_name in ['training/CEdata_train_pairs.csv', 'temp.csv']:
            with open(file_name, 'r') as input_file:
                input_file.readline()  # skip header
                for line in input_file:
                    pairs_data_file.write(','.join(['train%d' % i] + line.split(',')[1:]))
                    prog.tick()
                    i += 1
    prog.done()
    os.remove('temp.csv')
    # Save other files
    with open('training-reversed/CEdata_train_target.csv', 'w') as target_data_file:
        target_data_file.write(target_header + ''.join(target_body))
    with open('training-reversed/CEdata_train_publicinfo.csv', 'w') as info_data_file:
        info_data_file.write(info_header + ''.join(info_body))
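# Worked example of the reflection used above: for a non-binary series, the
# "reversed" series is its mirror image about the sample mean, which keeps
# the mean fixed while negating every deviation from it.
A = np.array([1.0, 2.0, 6.0])   # mean = 3.0
print 2 * np.mean(A) - A        # -> [ 5.  4.  0.]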
def flip_it(overwrite=False):
    if (not overwrite) and os.path.exists('training-flipped/CEdata_train_pairs.csv'):
        print 'Output already exists - not overwriting'
        return
    # Open info
    with open('training/CEdata_train_publicinfo.csv', 'r') as info_data_file:
        info_header = info_data_file.readline()
        info_body = info_data_file.readlines()
    # Flip it - swap the A and B fields, with a fresh id per original pair
    original_length = len(info_body)
    prog = Progress(original_length)
    for i in range(original_length):
        info_body.append(','.join(['train%d' % (len(info_body) + 1)] +
                                  list(reversed(info_body[i].strip().split(',')[1:]))) + '\n')
        prog.tick()
    prog.done()
    # Open targets
    with open('training/CEdata_train_target.csv', 'r') as target_data_file:
        target_header = target_data_file.readline()
        target_body = target_data_file.readlines()
    # Flip it - swapping A and B negates the direction label and swaps the class labels
    original_length = len(target_body)
    prog = Progress(original_length)
    for i in range(original_length):
        targets = target_body[i].split(',')[1:]
        if targets[0] == '1':
            targets[0] = '-1'
        elif targets[0] == '-1':
            targets[0] = '1'
        if targets[1] == '1\n':
            targets[1] = '2\n'
        elif targets[1] == '2\n':
            targets[1] = '1\n'
        target_body.append(','.join(['train%d' % (len(target_body) + 1)] + targets))
        prog.tick()
    prog.done()
    # Open pairs
    with open('training/CEdata_train_pairs.csv', 'r') as pairs_data_file:
        pairs_header = pairs_data_file.readline()
        with open('temp.csv', 'w') as temp_file:
            temp_file.write(pairs_header)
            prog = Progress(original_length * 3)
            # Save flipped lines to temporary file
            for line in pairs_data_file:
                temp_file.write(','.join(['dummy-id'] + list(reversed(line.strip().split(',')[1:]))) + '\n')
                prog.tick()
    # Concatenate original pairs and temporary file, renumbering the ids
    with open('training-flipped/CEdata_train_pairs.csv', 'w') as pairs_data_file:
        pairs_data_file.write(pairs_header)
        i = 1
        for file_name in ['training/CEdata_train_pairs.csv', 'temp.csv']:
            with open(file_name, 'r') as input_file:
                input_file.readline()  # skip header
                for line in input_file:
                    pairs_data_file.write(','.join(['train%d' % i] + line.split(',')[1:]))
                    prog.tick()
                    i += 1
    prog.done()
    # Save other files
    with open('training-flipped/CEdata_train_target.csv', 'w') as target_data_file:
        target_data_file.write(target_header + ''.join(target_body))
    with open('training-flipped/CEdata_train_publicinfo.csv', 'w') as info_data_file:
        info_data_file.write(info_header + ''.join(info_body))
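# Standalone restatement of the label flip above, handy for unit testing.
# flip_targets is a hypothetical helper, not part of the original script:
# flipping (A, B) -> (B, A) turns direction 1 (A causes B) into -1 and class
# 1 into 2, and vice versa; other values pass through unchanged.
def flip_targets(direction, cls):
    direction = {'1': '-1', '-1': '1'}.get(direction, direction)
    cls = {'1': '2', '2': '1'}.get(cls, cls)
    return direction, cls

assert flip_targets('1', '1') == ('-1', '2')
assert flip_targets('0', '2') == ('0', '1')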
import os
import subprocess
import time

import psutil

# Progress, mkstemp_safe and the LOCATION / *_TEMP_PATH / *_MATLAB constants
# are defined elsewhere in this module.

def run_batch_locally(scripts, language='python', paths=[], max_cpu=0.9, max_mem=0.9,
                      submit_sleep=1, job_check_sleep=30, verbose=True,
                      max_files_open=100, max_running_jobs=10, single_thread=True):
    '''
    Receives a list of python scripts to run

    Assumes the code has an output file that will be managed by this function

    Returns a list of local file names where the code has presumably stored output
    '''
    # Define some code constants
    #### Do we need to set paths explicitly?
    #### This will be deprecated in future MATLAB - hopefully the -singleCompThread command is sufficient
    matlab_single_thread = '''
maxNumCompThreads(1);
'''
    python_path_code = '''
import sys
sys.path.append('%s')
'''
    matlab_path_code = '''
addpath(genpath('%s'))
'''
    python_completion_code = '''
print 'Writing completion flag'
with open('%(flag_file)s', 'w') as f:
    f.write('Goodbye, World')
print "Goodbye, World"
quit()
'''
    #### TODO - Is this completely stable
    matlab_completion_code = '''
fprintf('\\nWriting completion flag\\n');
ID = fopen('%(flag_file)s', 'w');
fprintf(ID, 'Goodbye, world');
fclose(ID);
fprintf('\\nGoodbye, World\\n');
quit()
'''
    # Initialise lists of file locations and job ids
    shell_files = [None] * len(scripts)
    script_files = [None] * len(scripts)
    output_files = [None] * len(scripts)
    stdout_files = [None] * len(scripts)
    stdout_file_handles = [None] * len(scripts)
    flag_files = [None] * len(scripts)
    processes = [None] * len(scripts)
    fear_finished = False
    job_finished = [False] * len(scripts)
    files_open = 0
    # Loop through jobs, submitting jobs whenever CPU usage low enough, re-submitting failed jobs
    if not verbose:
        prog = Progress(len(scripts))
    while not fear_finished:
        should_sleep = True
        for (i, code) in enumerate(scripts):
            if (not job_finished[i]) and (processes[i] is None) and (files_open <= max_files_open) \
               and (len([1 for p in processes if p is not None]) < max_running_jobs):
                # This script has not been run - check CPU and potentially run
                #### FIXME - Merge if statements
                if (psutil.cpu_percent() < max_cpu * 100) and (psutil.virtual_memory().percent < max_mem * 100):
                    # Jobs can run
                    should_sleep = False
                    # Get the job ready
                    if LOCATION == 'local':
                        temp_dir = LOCAL_TEMP_PATH
                    else:
                        temp_dir = HOME_TEMP_PATH
                    if language == 'python':
                        script_files[i] = mkstemp_safe(temp_dir, '.py')
                    elif language == 'matlab':
                        script_files[i] = mkstemp_safe(temp_dir, '.m')
                    # Create necessary files in local path
                    shell_files[i] = mkstemp_safe(temp_dir, '.sh')
                    output_files[i] = mkstemp_safe(temp_dir, '.out')
                    stdout_files[i] = mkstemp_safe(temp_dir, '.o')
                    flag_files[i] = mkstemp_safe(temp_dir, '.flg')
                    # Customise code
                    #### TODO - make path and output_transfer optional
                    if language == 'python':
                        code = code + python_completion_code
                        for path in paths:
                            code = (python_path_code % path) + code
                    elif language == 'matlab':
                        code = code + matlab_completion_code
                        for path in paths:
                            code = (matlab_path_code % path) + code
                    code = code % {'output_file': output_files[i], 'flag_file': flag_files[i]}
                    # Write code and shell file
                    with open(script_files[i], 'w') as f:
                        f.write(code)
                    with open(shell_files[i], 'w') as f:
                        #### TODO - is os.path.join always correct - what happens if this program is being run on windows?
                        if language == 'python':
                            f.write('python ' + script_files[i] + '\n')
                        elif language == 'matlab':
                            if LOCATION == 'home':
                                matlab_path = HOME_MATLAB
                            else:
                                matlab_path = LOCAL_MATLAB
                            if single_thread:
                                f.write('cd ' + os.path.split(script_files[i])[0] + ';\n' +
                                        matlab_path + ' -nosplash -nojvm -nodisplay -singleCompThread -r ' +
                                        os.path.split(script_files[i])[-1].split('.')[0] + '\n')
                            else:
                                f.write('cd ' + os.path.split(script_files[i])[0] + ';\n' +
                                        matlab_path + ' -nosplash -nojvm -nodisplay -r ' +
                                        os.path.split(script_files[i])[-1].split('.')[0] + '\n')
                    # Start running the job
                    if verbose:
                        print 'Submitting job %d of %d' % (i + 1, len(scripts))
                    stdout_file_handles[i] = open(stdout_files[i], 'w')
                    files_open = files_open + 1
                    processes[i] = subprocess.Popen(['sh', shell_files[i]], stdout=stdout_file_handles[i])
                    # Sleep for a bit so the process can kick in (prevents 100s of jobs being sent to processor)
                    time.sleep(submit_sleep)
            elif (not job_finished[i]) and (processes[i] is not None):
                # Ask the process how it's doing
                processes[i].poll()
                # Check to see if the process has completed
                if processes[i].returncode is not None:
                    if os.path.isfile(flag_files[i]):
                        job_finished[i] = True
                        if verbose:
                            print 'Job %d of %d has completed' % (i + 1, len(scripts))
                        else:
                            prog.tick()
                    else:
                        if verbose:
                            print 'Job %d has failed - will try again later' % (i + 1)
                        processes[i] = None
                    # Tidy up temp files
                    os.remove(script_files[i])
                    os.remove(shell_files[i])
                    stdout_file_handles[i].close()
                    files_open = files_open - 1
                    os.remove(stdout_files[i])
                    os.remove(flag_files[i])
                    processes[i] = None
                    # Something useful happened
                    should_sleep = False
        if all(job_finished):
            fear_finished = True
            if not verbose:
                prog.done()
        elif should_sleep:
            # Count how many jobs are queued
            n_queued = 0
            # Count how many jobs are running
            n_running = 0
            if verbose:
                # print '%d jobs running' % n_running
                # print '%d jobs queued' % n_queued
                print 'Sleeping for %d seconds' % job_check_sleep
            time.sleep(job_check_sleep)
    #### TODO - return job output and error files as applicable (e.g. there may be multiple error files associated with one script)
    return output_files
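# Minimal usage sketch (hypothetical script body): each submitted script
# should write its results to the %(output_file)s placeholder; the
# %(flag_file)s completion code is appended automatically before the
# placeholders are filled in.
scripts = ["""
result = str(2 + 2)
with open('%(output_file)s', 'w') as f:
    f.write(result)
"""]
out_files = run_batch_locally(scripts, language='python', max_running_jobs=2)
print 'Results written to:', out_files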
def fineTune(self, minibatchStream, trainInps, epochs, mbPerEpoch, loss=None,
             validSet=False, progressBar=True, useDropout=False):
    for ep in xrange(epochs):
        print
        print 'learnRates:', self.learnRates
        totalCases = 0
        sumErr = 0
        sumLoss = 0
        step = self.stepNesterov if self.nesterov else self.step
        prog = Progress(mbPerEpoch) if progressBar else DummyProgBar()
        for i in range(mbPerEpoch):
            # print 'Epoch:', ep, 'minibatch', i
            (inpMB, targMB, mbgraph) = minibatchStream.next()
            if len(targMB.shape) != 3:
                # Convert to a cubic matrix (3d matrix)
                targMB = targMB.reshape(-1, 1, targMB.shape[1])
            # Each dimension of inpMB (3d) refers to a pivot vector. Select the
            # training samples that fall in the neighbourhood of this pivot and
            # store them in the corresponding slice of xsl (x_selected).
            xsl = np.zeros((mbgraph.indx.shape[0], mbgraph.indx.shape[1], trainInps.shape[1]))
            for j in xrange(mbgraph.indx.shape[0]):
                # -1 converts the indices from MATLAB (1-based) to Python (0-based) format
                xsl[j] = trainInps[mbgraph.indx[j] - 1]
            # Distribute graph.vals to 3d - it has already been converted to 3d inside manifold.py
            vals_select = mbgraph.vals
            del mbgraph
            err = step(xsl, vals_select, inpMB, targMB, self.learnRates,
                       self.momentum, self.L2Costs, useDropout)
            # gnp.free_reuse_cache()
            sumErr += err
            # print err, sumErr
            totalCases += inpMB.shape[0]
            prog.tick()
        prog.done()
        # Decay the learning rates after each epoch
        self.learnRates = [y * self.learnRatesMultiplier for y in self.learnRates]
        # If a validation set is given, also report validation error
        if validSet:
            val_outputActs = self.fprop_xf(validSet['trainInps'])
            val_error = self.outputActFunct.error(gnp.garray(validSet['trainTargs']),
                                                  self.state[-1], val_outputActs)
            yield sumErr / float(totalCases), val_error / validSet['trainInps'].shape[0]
        else:
            yield sumErr / float(totalCases)
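# Worked example of the index conversion above: MATLAB-style 1-based row
# indices select rows of a toy training matrix after subtracting 1.
trainInps_toy = np.arange(12).reshape(4, 3)
indx = np.array([1, 4])            # MATLAB rows 1 and 4
print trainInps_toy[indx - 1]      # rows 0 and 3 -> [[ 0  1  2], [ 9 10 11]]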