def __init__(self, image_repository, path_columns, savefile, registrator=None,
             n_processes=mp.cpu_count(), debug=False):
    self.debug = debug
    print('initializing analysis...',
          '\timage repository:\t{}'.format(image_repository),
          '\tpath columns:\t{}'.format(path_columns),
          '\tsavefile:\t{}'.format(savefile),
          '\tprocesses:\t{}'.format(n_processes),
          '\tmeasurements:\t{}'.format(list(MEASUREMENTS.keys())),
          '\tdenoising methods:\t{}'.format(list(METHODS.keys())),
          sep='\n')
    self.methods = METHODS.copy()
    self.measurements = MEASUREMENTS.copy()
    self.savefile = savefile
    self.image_repository = image_repository
    self.path_columns = path_columns
    self.pool = mp.Pool(n_processes)
    self.registrator = registrator if isinstance(registrator, Registrator) \
        else Registrator(verbose=debug, graphic=debug)
    self.denoising = None
    # make save dir if it does not exist
    save_path = os.path.dirname(self.savefile)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    print('done!\n')

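# The Pool created above lives for the lifetime of the analysis object, so it needs an
# explicit teardown at some point. A minimal sketch of one way to do that; the class
# name `PoolOwner` and its `close` method are illustrative assumptions, not part of the
# original class.
import multiprocessing as mp

class PoolOwner:
    def __init__(self, n_processes=mp.cpu_count()):
        self.pool = mp.Pool(n_processes)

    def close(self):
        # Stop accepting new work, then wait for the worker processes to exit.
        self.pool.close()
        self.pool.join()

if __name__ == '__main__':
    owner = PoolOwner(2)
    try:
        print(owner.pool.map(abs, [-1, -2, 3]))  # [1, 2, 3]
    finally:
        owner.close()
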
def run_workers(self, _num_work, _type, _configs, _args):
    """
    Starts the Pool of Workers and executes them. The method blocks until all workers
    have completed. However, it also starts a background update-thread which publishes
    information about progress.

    :param _num_work: Number of workers to initialise
    :param _type:     The worker type to run
    :param _configs:  These are extensions shared across all workers: may be None
    :param _args:     These are per-worker arguments. Must be a list equal in length to
                      _num_work, or None
    :return:          Result of the Aggregator
    """
    # Reset Everything
    self._reset(_num_work)
    _args = _args if _args is not None else [None for _ in range(_num_work)]

    # Prepare the Progress Bar: will automatically handle None
    self.__progress = ProgressBar(100 * _num_work, sink=self.__sink.Obj)

    # Create List of Worker Objects, and initialise thread
    _workers = [_type(_i + 1, self) for _i in range(_num_work)]
    self.__thread.start()

    # Start Pool and aggregate results
    if self.NumProc > 0:
        with mp.Pool(processes=self.NumProc) as pool:
            processes = [pool.apply_async(self.__computer,
                                          args=(_workers[_i], (_configs, _args[_i])))
                         for _i in range(_num_work)]
            aggregated = self._aggregate_results([result.get() for result in processes])
    else:
        r_q = queue.Queue()
        threads = [threading.Thread(target=self.__threader,
                                    args=(_workers[_i], (_configs, _args[_i]), r_q))
                   for _i in range(_num_work)]
        # Note: starting and joining inside the same loop runs the threads one at a time.
        for thr in threads:
            thr.start()
            thr.join()
        results = []
        while not r_q.empty():
            results.append(r_q.get())
            r_q.task_done()
        aggregated = self._aggregate_results(results)

    # Inform and join thread
    self.Queue.put([0, -1])
    self.__thread.join()

    # Return the aggregated information
    return aggregated

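# A self-contained sketch of the process-pool branch above: submit one task per worker
# with apply_async, then block on .get() to aggregate. The worker function `_compute`
# and the aggregation (a plain sum) are placeholders, not the class's real __computer
# or Aggregator.
import multiprocessing as mp

def _compute(worker_id, payload):
    # Placeholder for the per-worker computation.
    return worker_id * payload

if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        async_results = [pool.apply_async(_compute, args=(i, 10)) for i in range(4)]
        # .get() blocks until each task has finished
        aggregated = sum(r.get() for r in async_results)
    print(aggregated)  # 60
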
def iterate_in_parallel(method, nproc=1, iterkeys=None, **params):
    '''
    Evaluate a given method for a given parameter set. params is a dict and some of its
    values are allowed to be iterable. The method is expected to return a dict with the
    SAME KEYS for every parameter combination in one iteration. An exception occurring
    in the method is NOT handled and therefore stops the iteration.
    '''
    # find the parameters to be iterated through
    iterkeys2 = [key for key in params if hasattr(params[key], "__iter__")]
    if iterkeys is None:
        iterkeys = iterkeys2
    elif set(iterkeys) <= set(iterkeys2):
        for key in iterkeys:
            iterkeys2.remove(key)
        iterkeys = iterkeys + iterkeys2
    else:
        print("I'm ignoring your iterkeys.")
        iterkeys = iterkeys2

    # create stamp of the input
    stamp = dict(params)
    stamp["iterkeys"] = iterkeys
    stamp["method"] = method.__name__

    # create list of params instances to be mapped
    iterator = combinations(params, iterkeys)

    # create the function to be mapped with
    def f(params):
        return method(**params)

    # map iterator using mpi4py
    # FIXME: doesn't work if some dolfin functions are used, e.g. Function.extrapolate
    if MPI.COMM_WORLD.Get_size() > 1:
        result = mpimap(f, iterator)
    # map iterator using multiprocessing.Pool
    # FIXME: this approach of distributing across multiple processors is inconvenient,
    # since a single error kills the whole simulation
    # (not necessarily: the error can be caught and displayed by the method).
    # It is also not really appropriate for HPC architectures.
    # NB: a locally defined function such as f is not picklable with the standard
    # pickler, so this branch may raise a PicklingError unless a picklable wrapper is used.
    elif nproc > 1:
        pool = mp.Pool(nproc)
        result = pool.map(f, iterator)
        pool.close()
        pool.join()
    # map in serial
    else:
        result = map(f, iterator)

    return join_dicts(result), stamp

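# A self-contained sketch of the same idea: build the parameter grid up front with
# itertools.product and map a module-level helper over it, so that everything handed to
# the Pool is picklable. The function and parameter names here are illustrative only,
# not taken from the original module.
import itertools
import multiprocessing as mp

def simulate(params):
    voltage, radius = params
    return {"voltage": voltage, "radius": radius, "current": voltage / radius}

if __name__ == '__main__':
    # all (voltage, radius) combinations
    grid = list(itertools.product([0.1, 0.2, 0.4], [1.0, 2.0]))
    with mp.Pool(2) as pool:
        results = pool.map(simulate, grid)
    print(results[0])
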
def main(patientFile, procedureFile, observationFile, conditionFile, coreNum):
    patientRecs, IDs = util.combineDatasets(patientFile, procedureFile,
                                            observationFile, conditionFile)
    # Give every worker the full record set, but only its own slice of the IDs
    patientRecCopy = list(repeat(patientRecs, coreNum))
    splitIDs = np.array_split(list(IDs), coreNum)
    pooler = mp.Pool(coreNum)
    with open('AggregatePatientData.csv', 'a') as fout:
        for result in pooler.starmap(util.AggregateQuantValues, zip(patientRecCopy, splitIDs)):
            result.to_csv(fout, index=False, header=False)
    pooler.close()
    pooler.join()

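# A minimal, self-contained version of the split-and-starmap pattern used above: every
# worker receives the same shared object plus its own chunk of IDs. The worker function
# and data are placeholders.
from itertools import repeat
import multiprocessing as mp
import numpy as np

def count_ids(shared_records, id_chunk):
    return [(int(i), len(shared_records)) for i in id_chunk]

if __name__ == '__main__':
    records = {'a': 1, 'b': 2}
    chunks = np.array_split(np.arange(10), 4)
    with mp.Pool(4) as pool:
        parts = pool.starmap(count_ids, zip(repeat(records), chunks))
    print(sum(parts, []))
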
def gen_programs(program_len, num_programs, args):
    """
    Generates the specified amount of programs of the given length.
    These are the exact steps performed:
    1. Generate <num_programs> programs using gen_program_worker in a process pool
    2. Generate examples for each program by executing gen_examples_worker in a process
       pool. Discard programs for which the required amount of examples could not be
       generated.
    3. Return a dictionary of the form {program: examples}
    """
    progress_counter = multiprocessing.Value('i', 0)
    gen_prog_pool = multiprocessing.Pool(processes=args.num_workers,
                                         initializer=init_gen_prog_worker,
                                         initargs=(progress_counter, num_programs, program_len))

    input_type_combinations = get_input_type_combinations(params.num_inputs)
    programs = gen_prog_pool.map(gen_program_worker, input_type_combinations)
    print('')

    # Flatten
    programs = [item for sublist in programs for item in sublist]
    programs = list(set(programs))

    # Generate examples and filter out null programs
    progress_counter.value = 0
    valid_counter = multiprocessing.Value('i', len(programs))
    gen_examples_pool = multiprocessing.Pool(
        processes=args.num_workers,
        initializer=init_gen_examples_worker,
        initargs=(progress_counter, valid_counter, len(programs),
                  args.num_examples, args.num_example_tries))

    res = gen_examples_pool.map(gen_examples_worker, programs)
    print('')

    examples = dict(zip(programs, res))
    examples = {k: v for k, v in examples.items() if v}
    return examples

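# Self-contained sketch of the initializer/initargs pattern above: a shared
# multiprocessing.Value is handed to every worker when the pool starts, so workers can
# report progress without the counter being passed along with every task. The worker
# function is a placeholder.
import multiprocessing

_counter = None

def _init_worker(counter):
    global _counter
    _counter = counter

def _work(x):
    with _counter.get_lock():
        _counter.value += 1
    return x * x

if __name__ == '__main__':
    counter = multiprocessing.Value('i', 0)
    with multiprocessing.Pool(processes=2, initializer=_init_worker,
                              initargs=(counter,)) as pool:
        squares = pool.map(_work, range(8))
    print(squares, counter.value)  # counter.value == 8
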
def ParallelRxnBuildWriteOutRd2(LabeledList, BuildFunction, RoundNum):
    # No chunks for round 2
    SplitList = numpy.array_split(LabeledList, 16)
    # Fill new list of labeled cpds
    NewCpdList = []
    # Build multiprocessing pool object with 16 processes
    pooler = mp.Pool(16)
    # Set up the new file
    with open('TempPandaDF.csv', 'w') as fp:
        # For each result of the Build function applied to an item from SplitList
        for result in pooler.imap(BuildFunction, SplitList):
            # Build the new cpd list
            NewCpds = NewLabeledCpdList(result)
            NewCpdList.extend(NewCpds)
            # Each result is a Pandas object, so write it to csv
            result.to_csv(fp, index=False, header=False)
    pooler.close()
    pooler.join()
    # Unique items
    NewCpdList = list(set(NewCpdList))
    # Write file, optional
    #CpdFile = 'LabeledCpds_FromRound_{0}.csv'.format(RoundNum)  # Change
    #with open(CpdFile, 'w') as output:
    #    writer = csv.writer(output, lineterminator='\n')
    #    for val in NewCpdList:
    #        writer.writerow([val])
    # Return CpdList for next round
    return NewCpdList

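# Minimal sketch of the streaming pattern above: imap yields results one chunk at a
# time in submission order, so each DataFrame can be appended to the CSV as soon as it
# is ready instead of holding everything in memory. Names and data are illustrative only.
import multiprocessing as mp
import pandas as pd

def build_chunk(n):
    return pd.DataFrame({'id': range(n), 'round': n})

if __name__ == '__main__':
    with mp.Pool(4) as pool, open('chunks.csv', 'w') as fp:
        for df in pool.imap(build_chunk, [2, 3, 4]):
            df.to_csv(fp, index=False, header=False)
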
def BuildPathsCoreParallel(ResultDictionaryNum, metabname, userinputname, StopDictNum):
    # The starting set of reactants that led to the product isotopomer of interest
    StartingSet = BuildOneRoundPath(ResultsList[ResultDictionaryNum], metabname)
    if len(StartingSet) == 0:
        return
    # Starter path matrix
    SeedPath = AddOnPath(StartingSet, ResultsList[ResultDictionaryNum - 1])
    if SeedPath is None:
        return
    OutFile = '{0}_Paths_{1}Rxns.csv'.format(userinputname, StopDictNum)
    pooler = mp.Pool(16)
    try:
        # Go one more path matrix
        SeedPath = AddOnPath(SeedPath, ResultsList[ResultDictionaryNum - 2])
        StopDictNumArg = ResultDictionaryNum - 2
        # Try progressively smaller splits (32, 16, 8, 4, 2). If an attempt fails
        # (some of these paths cannot be connected), retry with the next smaller split;
        # if nothing is written to the csv, no paths could be connected and another
        # isotopologue should be tried.
        for NumSplits in (32, 16, 8, 4, 2):
            try:
                SeedPathSplit = list(numpy.array_split(SeedPath, NumSplits))
                StopDictRepeat = list(repeat(StopDictNumArg, NumSplits))
                # Append mode ('a', originally 'w') so that looping through shorter path
                # lengths keeps adding to the same file
                with open(OutFile, 'a') as fp:
                    for result in pooler.starmap(AddOnPathParallel3,
                                                 zip(SeedPathSplit, StopDictRepeat)):
                        result.to_csv(fp, index=False, header=False)
                break
            except Exception:
                continue
        else:
            # No parallel split worked; fall back to the non-parallel builder
            try:
                Output = BuildPathsCoreNonParallel(StopDictNumArg, SeedPath, metabname)
                with open(OutFile, 'a') as fp:
                    Output.to_csv(fp, index=False, header=False)
            except Exception:
                print('Paths cannot be built')
    except Exception:
        print('Isotopomer Failed')
    finally:
        pooler.close()
        pooler.join()
        gc.collect()
    return

def main():
    """
    Generates programs. These are the basic steps performed:

    D = {}
    for 1 <= i <= max_train_len:
        1. P = Generate programs of length i
        2. E = Generate examples for the generated programs
        3. Discard programs in P that are equivalent to any program in D
        4. D += (P, E)
    for j in test_lengths:
        Sample num_test programs
        Discard all programs of equal length in D which are equivalent.

    Note:
    1. Step 3 of the first loop greatly increases the richness of the dataset. We ensure
       this way that our programs aren't likely to have shorter equivalents.
    2. It is recommended to use --cache to load a dataset cache. The algorithm then
       continues generating for lengths larger than the maximum length of the cache.
       This allows incremental dataset generation and also helps with the generation of
       shorter programs, where generation is slow due to randomness. Furthermore, we can
       (and should!) have virtually all programs of length <=3, to ensure our dataset is
       meaningful.
    3. During test sampling we only compare to programs of the same length, for
       efficiency. This is sufficient because the data generation algorithm already
       ensures there is no equivalence to any longer or shorter program.
    4. Since the pruning is done after program generation, rather than during it, the
       number of programs generated in each iteration is NOT args.num_train. This is
       purely an implementation detail: discarding while generating would require all
       processes to read and write the same dictionary in parallel. Doing so would be a
       good feature for the future, to avoid having to tune num_train by trial and error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_train', type=int, required=True)
    parser.add_argument('--num_test', type=int, required=True)
    parser.add_argument('--train_output_path', type=str, required=True)
    parser.add_argument('--test_output_path', type=str, required=True)
    parser.add_argument('--max_train_len', type=int, required=True)
    parser.add_argument('--min_train_len', default=5, type=int, required=False)  # me
    parser.add_argument('--test_lengths', type=str, required=True,
                        help="List of test lengths to generate")
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--num_examples', type=int, default=params.num_examples)
    parser.add_argument('--num_example_tries', type=int, default=200,
                        help='total number of tries to generate examples for each program')
    parser.add_argument('--cache', type=str, default=None,
                        help="Dataset cache from which to continue generating programs")
    args = parser.parse_args()

    test_lens = set([int(x) for x in args.test_lengths.split()])

    if args.min_train_len != -1:
        examples = {}
        min_len = args.min_train_len - 1  # as following loops start from +1
    else:
        if args.cache:
            examples = load_cache(args.cache)
            min_len = max([len(k) for k in examples])
        else:
            examples = {}
            min_len = 0

    for program_len in range(min_len + 1, args.max_train_len + 1):
        num_programs = args.num_train + args.num_test
        if program_len in KNOWN_TRAIN_SIZES:
            num_programs = min(num_programs, KNOWN_TRAIN_SIZES[program_len])

        print("Generating programs of length %d (current dataset size: %d)" %
              (program_len, len(examples)))
        new_examples = gen_programs(program_len, num_programs, args)

        existing_programs = list(examples.keys())
        counter = multiprocessing.Value('i', 0)
        new_programs = list(new_examples.keys())
        discard_pool = multiprocessing.Pool(
            processes=args.num_workers,
            initializer=init_discard_identical_worker,
            initargs=(existing_programs, counter, len(new_programs)))
        # Stripe the new programs across the workers
        new_program_parts = [new_programs[i::args.num_workers]
                             for i in range(args.num_workers)]
        new_example_parts = [{p: new_examples[p] for p in programs}
                             for programs in new_program_parts]
        res = discard_pool.map(discard_identical_worker, new_example_parts)
        print('')
        for d in res:
            examples.update(d)

    train_programs = list(examples.keys())
    print("Finished generation. Total programs: %d" % len(train_programs))

    # Generate test programs (they're already not equivalent to any shorter program,
    # so only programs of the same length need to be considered)
    for test_len in test_lens:
        test_programs = []
        test_candidates = [x for x in train_programs if len(x.statements) == test_len]
        train_programs = [x for x in train_programs if len(x.statements) != test_len]
        random.shuffle(test_candidates)

        indices_to_discard = set()
        for i, program in enumerate(test_candidates):
            if len(test_programs) >= args.num_test:
                break
            if i in indices_to_discard:
                continue
            print("\rCreating test programs for length %d... %d/%d" %
                  (test_len, len(test_programs), args.num_test), end="")
            test_programs.append(program)
            indices_to_discard.add(i)
            # start=i+1 keeps j aligned with positions in test_candidates, so the
            # discarded indices refer to the right programs
            for j, other in enumerate(test_candidates[i + 1:], start=i + 1):
                if j in indices_to_discard:
                    continue
                if constraint.is_same(program, other, examples[program]):
                    indices_to_discard.add(j)
        print('')
        print("Removed %d programs" % len(indices_to_discard))

        train_programs += [test_candidates[i] for i in range(len(test_candidates))
                           if i not in indices_to_discard]

        output_path = args.test_output_path + '_' + str(test_len)
        print('Writing %d test programs to %s' % (len(test_programs), output_path))
        with open(output_path, 'w') as f:
            write_programs_to_file(f, test_programs, examples)

    print('Writing %d train programs to %s' % (len(train_programs), args.train_output_path))
    with open(args.train_output_path, 'w') as f:
        write_programs_to_file(f, train_programs, examples)

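# Sketch of the design choice above: the (possibly large) list of existing programs is
# shipped to each worker once via initargs, instead of being re-sent with every task.
# The worker function and the data below are placeholders, not the real
# discard_identical_worker.
import multiprocessing

_existing = None

def _init_worker(existing):
    global _existing
    _existing = set(existing)

def _keep_if_new(item):
    # Return the item only if it is not already known to the workers.
    return None if item in _existing else item

if __name__ == '__main__':
    existing = ['p1', 'p2', 'p3']
    candidates = ['p2', 'p4', 'p5']
    with multiprocessing.Pool(processes=2, initializer=_init_worker,
                              initargs=(existing,)) as pool:
        kept = [p for p in pool.map(_keep_if_new, candidates) if p is not None]
    print(kept)  # ['p4', 'p5']
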
# Clean hideous lists
for enz in range(len(PathDF)):
    # .at avoids the chained-assignment pitfall of .iloc[enz]['Circadian Enzymes'] = ...
    PathDF.at[enz, 'Circadian Enzymes'] = list(set(re.split(
        ',', re.sub(r"\\|'|\[|\]|\"| ", '', str(PathDF['Circadian Enzymes'][enz])))))
return PathDF

# Read in the results of the isotopologue file of interest
rxnlength = int(re.findall(r'(\d+)Rxns', 'Serine M+3_Paths_9Rxns.csv')[0])
MetabDF = pd.read_csv('Serine M+3_Paths_9Rxns.csv', header=None, error_bad_lines=False,
                      names=list(range(rxnlength * 2 + 1)))

# Clean up serine
ParaNum = 2
# this could give an error if the number of rows is < ParaNum
PathMatrixSplit = list(np.array_split(MetabDF, ParaNum))
pooler = mp.Pool(ParaNum)
Call = list(np.repeat('Filter', ParaNum))

# In case it exists already
if 'Trimmed_Paths.csv' in os.listdir():
    os.remove('Trimmed_Paths.csv')

# Append mode ('a', originally 'w') so that looping through shorter path lengths keeps
# writing to the same file; results are written out as they are built
with open('Trimmed_Paths.csv', 'a') as fp:
    #for result in pooler.imap(MatchCircECHitsAndGetGibbsFilter, PathMatrixSplit):
    for result in pooler.starmap(MatchCircECHitsAndGetGibbs, zip(PathMatrixSplit, Call)):
        # Each result is a Pandas object, so write it to csv
        result.to_csv(fp, index=False, header=False)
pooler.close()
pooler.join()