def _calc_threads(self):
    # Leave one core free unless the machine only has a single core.
    if mp.cpu_count() == 1:
        n_threads = 1
    else:
        n_threads = mp.cpu_count() - 1
    return n_threads
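# The if/else above is equivalent to the one-liner used by several other
# snippets in this collection (e.g. preprocess_from_files below). A minimal
# standalone sketch, not from any of the original sources:
import multiprocessing as mp

def calc_threads():
    # keep one core free, but never go below one worker
    return max(1, mp.cpu_count() - 1)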
def exposure_analysis(rail=False):
    """
    Get exposure statistics for all road or railway assets in all regions.

    Optional Arguments:
        *rail* : Default is **False**. Set to **True** if you would like to
        intersect the railway assets in a region.
    """
    # set data path
    data_path = load_config()['paths']['data']

    # load shapefile with all regions
    global_regions = geopandas.read_file(
        os.path.join(data_path, 'input_data', 'global_regions_v2.shp'))

    # load csv with income group data and assign income group to regions
    incomegroups = pandas.read_csv(
        os.path.join(data_path, 'input_data', 'incomegroups_2018.csv'),
        index_col=[0])
    income_dict = dict(zip(incomegroups.index, incomegroups.GroupCode))
    global_regions['wbincome'] = global_regions.GID_0.apply(
        lambda x: income_dict[x])

    # only keep regions for which we have data
    global_regions = global_regions.loc[global_regions.GID_2.isin(
        [x.split('.')[0] for x in
         os.listdir(os.path.join(data_path, 'region_osm'))])]

    # create dictionary with information on protection standards
    prot_lookup = dict(zip(global_regions['GID_2'],
                           global_regions['prot_stand']))

    # create lists for the parallelization
    regions = list(global_regions.index)
    prot_lookups = [prot_lookup] * len(regions)
    data_paths = [data_path] * len(regions)

    # run exposure analysis in parallel, leaving one core free
    if not rail:
        with Pool(cpu_count() - 1) as pool:
            collect_output = pool.starmap(
                regional_roads, zip(regions, prot_lookups, data_paths),
                chunksize=1)
        pandas.concat(collect_output).to_csv(
            os.path.join(data_path, 'summarized', 'total_exposure_road.csv'))
    else:
        with Pool(cpu_count() - 1) as pool:
            collect_output = pool.starmap(
                regional_railway, zip(regions, prot_lookups, data_paths),
                chunksize=1)
        pandas.concat(collect_output).to_csv(
            os.path.join(data_path, 'summarized', 'total_exposure_railway.csv'))
def _calc_num_threads(self, df_size: int, query_size: int,
                      max_threads=None) -> int:
    num_queries = df_size * query_size
    # leave one core free on multi-core machines
    if mp.cpu_count() == 1:
        available = 1
    else:
        available = mp.cpu_count() - 1
    calc = int(num_queries / 5000)
    if calc > available:
        r = available
    elif calc <= 1:
        r = 2 if num_queries > 1000 else 1
    else:
        r = calc
    if max_threads is not None and r > max_threads:
        return max_threads
    return r
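# A minimal standalone sketch (not from the original source) of the sizing
# heuristic above: roughly one thread per 5000 queries, clamped between 1 and
# cpu_count() - 1, with an optional hard cap via max_threads.
import multiprocessing as mp

def calc_num_threads(num_queries, max_threads=None):
    available = 1 if mp.cpu_count() == 1 else mp.cpu_count() - 1
    calc = num_queries // 5000
    if calc > available:
        r = available
    elif calc <= 1:
        r = 2 if num_queries > 1000 else 1
    else:
        r = calc
    return min(r, max_threads) if max_threads is not None else r

# e.g. calc_num_threads(2000) -> 2, while calc_num_threads(100000) is
# capped at cpu_count() - 1 on most machines.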
def generate(self, sample_size):
    self.size = sample_size  # number of samples to be collected
    print(mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())
    # container for the collected samples
    self.samples = pool.map(self.method, range(self.size))
    # print(self.samples)  # debug
    pool.close()
    pool.join()
def run_preprocess(self, database_destination, table_destination):
    """
    :return:
    """
    list_of_dfs = []
    for journal, path in self._mydict.iteritems():
        list_of_dfs.append(self._identify_files_and_titles(path, journal))
    dataframe = pd.concat(list_of_dfs)

    if self._n_cores != 1:
        if self._n_cores == mp.cpu_count():
            print "Going to WARP SPEED!!!!"
        else:
            print "Meh, multiprocessing, but not good enough!"
        titles_to_process = dataframe['file_complete_path'].values
        list_features = self._pool.map(
            self._feat_gen.gen_features_from_pdf_file, titles_to_process)
    else:
        print "Why you didn't choose multiprocessing??? WHY???? WHY???????!"
        print ''
        list_features = []
        total_size = len(dataframe['file_complete_path'].values)
        count_ite = 1
        for t in dataframe['file_complete_path'].values:
            try:
                list_features.append(
                    self._feat_gen.gen_features_from_pdf_file(t))
                count_ite += 1
                print 'Done ' + str(t)
                print str(count_ite) + ' of ' + str(total_size)
            except Exception:
                print ' '
                print 'Problems with pdf: ' + str(t)
                print 'Iteration: ' + str(count_ite)
                print ' '
    file_features = pd.concat(list_features)

    print "Sorry, no multiprocessing implemented for this part :/ "
    list_of_features_from_db = []
    total_size = len(dataframe['title'].values)
    count_ite = 1
    for t in dataframe['title'].values:
        list_of_features_from_db.append(
            self._feat_gen.gen_features_from_database(t))
        count_ite += 1
        print 'Done ' + str(t)
        print str(count_ite) + ' of ' + str(total_size)
    db_features = pd.concat(list_of_features_from_db)

    rtn_dataframe = file_features.set_index('title').join(
        db_features.set_index('title'))
    rtn_dataframe = rtn_dataframe.join(dataframe.set_index('title'))
    rtn_dataframe.reset_index(level=0, inplace=True)
    return rtn_dataframe
def preprocess_from_files(self, shot_files, use_shots):
    # all shots, including invalid ones
    all_signals = self.conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)

    # empty
    used_shots = ShotList()

    use_cores = max(1, mp.cpu_count() - 2)
    pool = mp.Pool(use_cores)
    print('running in parallel on {} processes'.format(pool._processes))
    start_time = time.time()

    for (i, shot) in enumerate(
            pool.imap_unordered(self.preprocess_single_file, shot_list_picked)):
        # for (i, shot) in enumerate(map(self.preprocess_single_file, shot_list_picked)):
        sys.stdout.write('\r{}/{}'.format(i, len(shot_list_picked)))
        used_shots.append_if_valid(shot)

    pool.close()
    pool.join()
    print('Finished Preprocessing {} files in {} seconds'.format(
        len(shot_list_picked), time.time() - start_time))
    print('Omitted {} shots of {} total.'.format(
        len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
    print('{}/{} disruptive shots'.format(
        used_shots.num_disruptive(), len(used_shots)))
    if len(used_shots) == 0:
        print("WARNING: All shots were omitted, please ensure raw data is "
              "complete and available at {}.".format(
                  self.conf['paths']['signal_prepath']))
    return used_shots
def correction_factor(p, number_of_runs, method, X, y, n_jobs=None):
    # Set up the parallel job
    if n_jobs == -1:
        n_jobs = cpu_count()
    elif n_jobs is None:
        n_jobs = 1
    pool = Pool(n_jobs, maxtasksperchild=1000)

    def run(_):
        # Artificially falsify y
        y_f = falsify(y, p, random_state=_)
        # Correct labels
        y_corrected = method.fit_transform(X, y_f)
        N = X.shape[0]
        return ((y == y_corrected).sum() - (1 - p) * N) / (p * N)

    factor = np.array(pool.map(run, range(number_of_runs)))

    # Close the pool again (pool.clear() is pathos-specific)
    pool.close()
    pool.join()
    pool.clear()

    return np.mean(factor), np.std(factor)
def bin_func(raster_folder, raster_filename, output_folder,
             binary_functions, binary_args):
    start_time = datetime.now()
    input_raster = os.path.join(raster_folder, raster_filename)
    hdr_file = ""  # input_raster + ".hdr"  # only used for ENVI stacks
    outname = os.path.join(output_folder, raster_filename)
    if outname.find(".tif") != -1:
        outname = outname[0:len(outname) - 4]

    # arr: full size numpy array 3D XxYxZ 200x300x100
    arr = preprocessing.rio_array(input_raster, hdr_file=hdr_file)
    # activate to get list of dates from .hdr file
    # (.hdr file needs to be specified above)
    dates = arr[1]

    for i, func in enumerate(binary_functions):
        # threshold_size = str(statistical_args[i]['threshold_size'])
        # creating results with calling wanted algorithm in
        # parallel_apply_along_axis for quick runtime
        result = apply_along_axis.parallel_apply_along_axis(
            func1d=func, arr=arr[0], axis=0, cores=mp.cpu_count(),
            **binary_args[i])

        # selecting dtype based on result
        dtype = type(result[0][0])

        # extract the bare function name from its repr, e.g.
        # "<function my_func at 0x...>" -> "my_func"
        func_name_end = str(func).find(" at")
        func_name_start = 10
        func_name = str(func)[func_name_start:func_name_end]

        # exporting result to new raster
        export_arr.functions_out_array(
            outname=outname + "_" + func_name + str(17), arr=result,
            input_file=input_raster, dtype=dtype)

    # print time to this point
    statistics_time = datetime.now()
    print("breakpoint-time = ", statistics_time - start_time, "Hr:min:sec")
def _on_done(artifacts, graph, node, done, rem, value, apf=False, **kwargs):
    done.add(node)
    artifacts[node] = value
    frontier = rem | set(graph[node])
    batch = {n for n in frontier if not _pendencies(graph, n, done)}
    rem = frontier - batch
    if not batch:
        return
    # one process per ready node, capped at cpu_count() - 1 (minimum 1)
    pool = mp.Pool(processes=min(max(1, mp.cpu_count() - 1), len(batch)))
    each(
        lambda node: pool.apply_async(
            _resolve,
            args=(node, artifacts),
            callback=partial(
                _on_done, artifacts, graph, node, done, rem, apf=apf, **kwargs),
        ),
        batch,
    )
    pool.close()
    pool.join()
def _build_async(artifacts, apf=False, processes=None):
    graph = u.to_graph(artifacts)
    frontier = u.initial(graph)
    if not frontier:
        return {}
    done, rem = set(), set()
    if processes is None:
        processes = min(max(1, mp.cpu_count() - 1), len(frontier))
    pool = mp.Pool(processes=processes)
    each(
        lambda node: pool.apply_async(
            partial(_resolve, apf=apf),
            args=(node, artifacts),
            callback=partial(_on_done, artifacts, graph, node, done, rem,
                             apf=apf),
        ),
        frontier,
    )
    pool.close()
    pool.join()
    artifacts.update({n: _resolve(n, artifacts, apf=apf) for n in rem})
    return artifacts
def clear_stack(cls):
    cpu = cpu_count()
    pool = Pool(cpu)

    def classify_set(c, pd, pc):
        # use the arguments, not the enclosing loop variables, to avoid
        # late-binding surprises in the async calls
        pd = Directory.classify_product_auto(c, pd)
        # if product.part_id:
        pc.product = pd
        Directory.set_price(pc)
        # else:
        #     manuals.append((config, product, price))

    # manuals = []
    for config, product, price in cls.STACK:
        pool.apply_async(classify_set, args=(config, product, price))
    # for config, product, price in manuals:
    #     product = Directory.classify_product_manual(config, product)
    #     if product.part_id:
    #         price.product = product
    #         Directory.set_price(price)
    #     else:
    #         Directory.set_product(product)
    pool.close()
    pool.join()  # wait for the async classifications before clearing the stack
    cls.STACK = []
def construct_adjacency(fips_data,
                        filename=os.path.join(resourceDir, 'fips_2019_adj.pkl.gz')):
    """
    Creates, and then stores (or loads) the adjacency dictionary of all US counties
    and territorial units. If the storage file, which is by default
    :download:`fips_2019_adj.pkl.gz </_static/gis/fips_2019_adj.pkl.gz>`, does not
    exist, then will create and store this data into the storage file. Will return
    the data in the end.

    :param dict fips_data: the US county :py:class:`dict` produced by, for example,
      :py:meth:`create_and_store_fips_2018 <covid19_stats.engine.gis.create_and_store_fips_2018>`.
    :param str filename: the location of the adjacency dictionary file, which is by
      default :download:`fips_2019_adj.pkl.gz </_static/gis/fips_2019_adj.pkl.gz>`
      located in the ``covid19_stats`` resource directory.
    :returns: a :py:class:`dict` of adjacency. Each key is a `FIPS code`_ of a county,
      and each value is a :py:class:`set` of counties and other territories adjacent
      to it. See :py:meth:`get_fips_adjacency <covid19_stats.engine.gis.get_fips_adjacency>`
      to see an example of this adjacency information for a single county.
    :rtype: dict
    """
    if os.path.isfile(filename):
        return pickle.load(gzip.open(filename, 'rb'))
    # NOTE: the pool is opened but the work below uses the builtin map; the
    # lambdas here are not picklable by a stdlib multiprocessing pool.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        all_adj = dict(map(
            lambda fips: (fips, get_fips_adjacency(fips, fips_data)),
            fips_data))
        # unique, unordered pairs of adjacent FIPS codes
        set_of_adjacents = set(
            chain.from_iterable(
                map(lambda fips: map(
                    lambda fips2: tuple(sorted([fips, fips2])),
                    all_adj[fips]),
                    all_adj)))
        if filename is not None:
            pickle.dump(set_of_adjacents, gzip.open(filename, 'wb'))
        return set_of_adjacents
def map_list_as_chunks(l, f, extra_data, cpus=None, max_chunk_size=None):
    '''
    A wrapper around `pathos.multiprocessing.ProcessPool.uimap` that processes
    a list in chunks. Differs from `map_list_in_chunks` in that this method
    calls `f` once for each chunk.

    uimap already chunks, but if you have extra data to pass in, it will pickle
    it for every item. This function passes the extra data to each chunk
    instead, which saves significantly on pickling.
    https://stackoverflow.com/questions/53604048/iterating-the-results-of-a-multiprocessing-list-is-consuming-large-amounts-of-me

    Parameters
    ----------
    l : list
      the list
    f : function
      the function to process each chunk; takes two parameters: chunk, extra_data
    extra_data : object
      the extra data to pass to each f
    cpus : int
      the number of cores to use to split the chunks across
    max_chunk_size : int
      the maximum size for each chunk
    '''
    cpus = cpu_count() if cpus is None else cpus
    max_chunk_size = float('inf') if max_chunk_size is None else max_chunk_size
    chunk_length = min(max_chunk_size, max(1, ceil(len(l) / cpus)))
    chunks = [l[x:x + chunk_length] for x in range(0, len(l), chunk_length)]
    pool = Pool(nodes=cpus)
    f_dumps = cloudpickle.dumps(f)
    tuples = [(chunk, f_dumps, extra_data) for chunk in chunks]
    return pool.map(_process_whole_chunk, tuples)
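# A minimal usage sketch (not from the original source); `words` and `vocab`
# are hypothetical stand-ins for the list and the shared extra data. Note the
# per-chunk signature: f receives a whole chunk, and the caller combines the
# per-chunk results afterwards.
def count_known(chunk, vocab):
    return sum(1 for w in chunk if w in vocab)

# counts = map_list_as_chunks(words, count_known, vocab, cpus=4,
#                             max_chunk_size=10000)
# total = sum(counts)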
def _movie_casedeaths(msas_or_conus, dirname, time0, type_disp='cases'):
    all_msas = set(map(lambda msa_or_conus: msa_or_conus.lower(), msas_or_conus))
    with ThreadPool(processes=min(8, cpu_count(), len(all_msas))) as pool:
        _ = list(pool.map(
            lambda msa_or_conus: _movie_casedeaths_metro_or_conus(
                msa_or_conus, dirname, time0, type_disp=type_disp),
            all_msas))
    logging.info('at %0.3f seconds to create all %d movies of %s.' % (
        time.time() - time0, len(all_msas), type_disp.upper()))
def parallel_search(self, query):
    self.parallel = 1
    processes = cpu_count()
    data_set_split = []
    query_bytes = objectToBytes(query, self.predinstance.group)
    # split the record range into one (start, end) slice per process
    for j in range(processes):
        start = ceil(j * self.num_records / processes)
        end = ceil((j + 1) * self.num_records / processes)
        if end > self.num_records:
            end = self.num_records
        data_set_split.append((start, end))
    overall_return_list = []
    (matrix_str, generator_bytes) = self.serialize_key()
    with concurrent.futures.ProcessPoolExecutor(processes) as executor:
        future_list = {
            executor.submit(self.augment_search, self.vector_length,
                            self.predicate_scheme, self.group_name,
                            matrix_str, generator_bytes, query_bytes,
                            self.public_parameters, start, end)
            for (start, end) in data_set_split
        }
        for future in concurrent.futures.as_completed(future_list):
            res = future.result()
            if res is not None and len(res) > 0:
                overall_return_list = overall_return_list + res
    return overall_return_list
def _summarize(msas_or_conus, dirname, time0):
    all_msas = set(map(lambda msa_or_conus: msa_or_conus.lower(), msas_or_conus))
    with ThreadPool(processes=min(8, cpu_count(), len(all_msas))) as pool:
        _ = list(pool.map(
            lambda msa_or_conus: _summarize_metro_or_conus(
                msa_or_conus, dirname, time0),
            all_msas))
    logging.info('at %0.3f seconds to create all %d summaries.' % (
        time.time() - time0, len(all_msas)))
def runLGB(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    params = {
        'learning_rate': 0.05,
        'application': 'binary',
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'auc',
        'data_random_seed': 3,
        'bagging_fraction': 1.0,
        'feature_fraction': 0.4,
        # leave one core free, but never use more than 6 threads
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1
    }
    rounds_lookup = {
        'toxic': 1400,
        'severe_toxic': 500,
        'obscene': 550,
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def ProcessFrag(f):
    with file_handle(f) as fi:
        Tmp = [l.rstrip() for l in fi]
    Frags = Tmp[10:]  # remove first 10 lines, which are just file formatting
    del Tmp

    mpi = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    start = time.perf_counter()
    # Tmp = [frag(x) for x in tqdm(Frags, total=len(Frags))]
    Tmp = [x for x in tqdm(mpi.imap(read_in_json, Frags, chunk),
                           total=len(Frags))]
    mpi.close()
    mpi.join()
    end = time.perf_counter()
    print((end - start) * 1000, 'ms')  # perf_counter returns seconds

    Tmp2 = [frag for frag in Tmp if frag is not None]
    Tmp3 = list(itertools.chain.from_iterable(Tmp2))  # flatten nested lists
    FgSele = list(set(tuple(Tmp3)))  # select unique smiles
    del Tmp
    del Tmp2
    del Tmp3
    del Frags
    gc.collect()
    return FgSele
def get_n_cpu():
    try:
        available_cpus = int(getenv('SLURM_NTASKS'))
    except (TypeError, ValueError):
        # not running under SLURM (getenv returned None) or the value was
        # not an integer; fall back to half the local cores
        available_cpus = int(pp.cpu_count() / 2)
    return available_cpus
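# A minimal usage sketch (not from the original source), assuming get_n_cpu
# above and its imports are in scope:
import os

os.environ['SLURM_NTASKS'] = '8'
print(get_n_cpu())   # -> 8
del os.environ['SLURM_NTASKS']
print(get_n_cpu())   # -> half the local cores, e.g. 4 on an 8-core box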
def evaluate_summarizer(self, parsers, **kwargs):
    """
    :param parsers: list
        List stores newssum.parser.StoryParser.
    :param kwargs: See below

    :Keyword Arguments:
        * *w_threshod* (``int``) -- word length threshold
        * *s_threshod* (``int``) -- sentence length threshold
    """
    print("Evaluating summarizer...")
    # pathos ProcessingPool; leave two cores free
    p = multiprocessing.ProcessingPool(multiprocessing.cpu_count() - 2)

    def get_rouges(parser):
        print("Get rouges...")
        print(parser)
        selected_sents = self.get_best_sents(parser, **kwargs)
        rouge = Rouge(selected_sents, parser.highlights).get_rouge()
        return rouge

    rouges = p.map(get_rouges, parsers)
    # p.close()
    # p.join()
    avg_rouge = Rouge.cal_avg_rouge(rouges)
    Rouge.print("InfoFilter", avg_rouge)
def single_show_summary_dataframe(df_sub, showname,
                                  mode_dataformat=DATAFORMAT.IS_LATER):
    df_show = df_sub[df_sub.shows == showname].copy()
    assert len(df_show) != 0
    if mode_dataformat == DATAFORMAT.IS_AVI_OR_MPEG:
        return df_show

    def get_ffprobe_json(filename):
        stdout_val = subprocess.check_output(
            [_ffprobe_exec, '-v', 'quiet', '-show_streams', '-show_format',
             '-print_format', 'json', filename],
            stderr=subprocess.STDOUT)
        file_info = json.loads(stdout_val)
        return file_info

    def is_hevc(filename):
        data = get_ffprobe_json(filename)
        return data['streams'][0]['codec_name'].lower() == 'hevc'

    with Pool(processes=min(cpu_count(), len(df_show))) as pool:
        dict_of_episodes_hevc = dict(pool.map(
            lambda filename: (filename, is_hevc(filename)),
            list(df_show.paths)))
    df_show['is hevc'] = list(map(
        lambda filename: dict_of_episodes_hevc[filename],
        list(df_show.paths)))
    return df_show
def direct(self):
    def browse_each(config):
        log.info(Directory.INFO_MAP[0] % (self.market.name, config.name))
        try:
            map_strs = self.PRODUCT_MAP[config.name]
            for map_str in map_strs:
                results = self.get_products_prices(map_str)
                for product, price in results:
                    product = Directory.check_product(product)
                    if not product.id:
                        Directory.STACK.append((config, product, price))
                    elif product.part_id:
                        price.product = product
                        Directory.set_price(price)
        except KeyError:
            log.error(Directory.ERROR_MAP[1] % config.name)

    cpu = cpu_count()
    pool = _ThreadPool(cpu)
    for c in self.configs:
        pool.apply_async(browse_each, args=(c,))
    pool.close()
    pool.join()
def __init__(self, journal_paths_dict, feature_generation_instance, n_cores=-1):
    """
    Class constructor
    :param journal_paths_dict: dictionary mapping each journal to the system
        path of the folder with the files to be organized
    """
    if isinstance(journal_paths_dict, dict):
        self._mypaths = journal_paths_dict.keys()
        self._myjournals = journal_paths_dict.values()
        self._mydict = journal_paths_dict
    else:
        print 'Input must be a dictionary, with the keys being the journal and ' \
              'the values being the respective paths'
        raise ValueError
    self._feat_gen = feature_generation_instance
    if n_cores == 1:
        self._n_cores = 1
        self._pool = None
    elif n_cores == -1:
        # use every available core with a pathos ProcessingPool
        self._n_cores = mp.cpu_count()
        self._pool = mp.ProcessingPool(self._n_cores)
    else:
        self._n_cores = n_cores
        self._pool = mp.ProcessingPool(self._n_cores)
def DomainDistances(Ref_Coords, PDB_Coords, RefReg2, Reg2, Data, parm, output):
    # Input_Coords = [pdb_name, H_Crds, N_Crds, C_Crds, G_Crds, R_Crds, T_Crds]
    # x_Coords     = [resname, resid, bb_crds, ca_crd, cg_crd, avg_crd, cb_crd]
    print('##################################################################\n')

    # Create distance object for MPI
    Ref = CalculateDist([Ref_Coords, RefReg2])

    if parm['MPICPU'][0] == 1:
        Tmp = [CalculateDist([Tgt, Reg2[idx]])
               for idx, Tgt in enumerate(PDB_Coords)]
    else:
        if parm['MPICPU'][0] == 0:
            mpi_cpu = multiprocessing.cpu_count()
        else:
            mpi_cpu = parm['MPICPU'][0]
        mpi = multiprocessing.Pool(mpi_cpu)
        Tmp = [x for x in tqdm(mpi.imap(CalculateDist,
                                        list(zip(PDB_Coords, Reg2))),
                               total=len(Reg2))]
        mpi.close()
        mpi.join()

    Tgt_List = [x for x in Tmp if x is not None]
    print('\n ## Domain Distances return: {0}\n'.format(len(Tgt_List)))
    CollectDomain(Ref, Tgt_List, Data)
def map_list_in_chunks(l, f, extra_data):
    '''
    A wrapper around `pathos.multiprocessing.ProcessPool.uimap` that processes
    a list in chunks. Differs from `map_list_as_chunks` in that this method
    calls `f` once for each item in `l`.

    uimap already chunks, but if you have extra data to pass in, it will pickle
    it for every item. This function passes the extra data to each chunk
    instead, which saves significantly on pickling.
    https://stackoverflow.com/questions/53604048/iterating-the-results-of-a-multiprocessing-list-is-consuming-large-amounts-of-me

    Parameters
    ----------
    l : list
      the list
    f : function
      the function to process each item; takes two parameters: item, extra_data
    extra_data : object
      the extra data to pass to each f
    '''
    cpus = cpu_count()
    chunk_length = max(1, int(len(l) / cpus))
    chunks = [l[x:x + chunk_length] for x in range(0, len(l), chunk_length)]
    pool = Pool(nodes=cpus)
    f_dumps = cloudpickle.dumps(f)
    tuples = [(chunk, f_dumps, extra_data) for chunk in chunks]
    mapped_chunks = pool.map(_process_chunk, tuples)
    return (item for chunk in mapped_chunks for item in chunk)
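# A minimal usage sketch (not from the original source); `square_plus` and
# `offset` are hypothetical. Note the per-item signature here, versus the
# per-chunk signature of map_list_as_chunks above, and that the results come
# back as a generator.
def square_plus(item, offset):
    return item * item + offset

# results = list(map_list_in_chunks(list(range(100)), square_plus, 3))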
def relative_flammability(firescar_list, output_filename, ncores=None,
                          mask_arr=None, mask_value=None, crs=None):
    '''
    run relative flammability.

    Arguments:
        firescar_list = [list] string paths to all GeoTiff FireScar outputs
            to be processed
        output_filename = [str] path to the output relative flammability file
            to be generated. *only GTiff supported.*
        ncores = [int] number of cores to use; if None,
            multiprocessing.cpu_count() - 1 is used.
        mask_arr = [numpy.ndarray] numpy ndarray with dimensions matching the
            rasters' arrays listed in firescar_list, masked where 1=dontmask
            0=mask (this is opposite numpy mask behavior, but follows common
            GIS patterns). *THIS MAY CHANGE.*
        mask_value = [numeric] single numeric value determining the value of
            the newly masked-out regions. If None, the nodata value from the
            firescar outputs will be used, and if this is an invalid value,
            it will be set to -9999.
        crs = [dict] rasterio-compatible crs dict object, i.e. {'init':'epsg:3338'}

    Returns:
        output_filename, with the side effect of the relative flammability
        raster being written to disk in that location.
    '''
    tmp_rst = rasterio.open(firescar_list[0])

    if ncores is None:
        ncores = multiprocessing.cpu_count() - 1

    out = sum_firescars(firescar_list, ncores=ncores)

    # calculate the relative flammability -- and fill in the mask with -9999
    relative_flammability = (out.astype(np.float32) / len(firescar_list)).filled()

    if mask_value is None:
        mask_value = tmp_rst.nodata
        if mask_value is None or mask_value == '':
            print('setting mask_value to -9999')
            mask_value = -9999

    if mask_arr is not None:
        relative_flammability[mask_arr == 0] = mask_value

    meta = tmp_rst.meta
    # pop out transform to overcome warning
    if 'transform' in meta.keys():
        _ = meta.pop('transform')
    meta.update(compress='lzw', count=1, dtype='float32', nodata=mask_value)
    if crs:
        meta.update(crs=crs)

    try:
        dirname = os.path.dirname(output_filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
    except OSError:
        pass

    with rasterio.open(output_filename, 'w', **meta) as out_rst:
        out_rst.write(np.around(relative_flammability, 4), 1)

    return output_filename
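# A minimal invocation sketch (not from the original source); the paths and
# the EPSG code are hypothetical and depend on local data.
# import glob
# scars = sorted(glob.glob('/data/firescars/*.tif'))
# relative_flammability(scars, '/data/out/rel_flamm.tif',
#                       crs={'init': 'epsg:3338'})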
def __init__(self, directory=os.getcwd(),
             model_file=os.path.join(os.getcwd(), 'model'),
             classifierType='gradientboosting', verbose=False,
             num_threads=mp.cpu_count()):
    self.directory = directory
    self.model_file = model_file
    self.classifierType = classifierType
    self.verbose = verbose
    self.num_threads = num_threads
    try:
        db = MySQLdb.connect(host=host, user=user, passwd=passwd, db=database)
    except _mysql_exceptions.OperationalError, e:
        # error 1049 means the database does not exist yet; create it
        if e[0] != 1049:
            raise
        else:
            with MySQLdb.connect(host=host, user=user, passwd=passwd, db='') as cur:
                cur.execute("CREATE DATABASE %s;" % database)
def bootstrap(data, fun, n_resamples=10000, alpha=0.05):
    """Compute confidence interval for values of function fun

    Parameters
    ==========
    data: list of arguments to fun
    """
    assert isinstance(data, list)
    n_samples = len(data[0])
    idx = np.random.randint(0, n_samples, (n_resamples, n_samples))

    def select(data, sample):
        return [d[sample] for d in data]

    def evaluate(sample):
        return fun(*select(data, sample))

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    values = pool.map(evaluate, idx)
    pool.terminate()

    idx = idx[np.argsort(values, axis=0, kind='mergesort')]
    values = np.sort(values, axis=0, kind='mergesort')

    stat = namedtuple('stat', ['value', 'index'])
    low = stat(value=values[int((alpha / 2.0) * n_resamples)],
               index=idx[int((alpha / 2.0) * n_resamples)])
    high = stat(value=values[int((1 - alpha / 2.0) * n_resamples)],
                index=idx[int((1 - alpha / 2.0) * n_resamples)])
    return low, high
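# A minimal usage sketch (not from the original source): a 95% confidence
# interval for the mean of one array. `fun` receives the resampled arrays
# positionally, one per entry of `data`. Caveat: the local `evaluate` closure
# above is only serializable by a dill-based pool (e.g. pathos); the stdlib
# pickle cannot serialize nested functions, so `multiprocessing` here is
# assumed to be such a drop-in replacement.
import numpy as np

x = np.random.randn(500)
low, high = bootstrap([x], np.mean, n_resamples=1000, alpha=0.05)
print(low.value, high.value)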
def encrypt_dataset_parallel(self, data_set):
    self.parallel = 1
    for data_item in data_set:
        if len(data_item) != self.vector_length:
            raise ValueError("Improper Vector Size")
    self.enc_data = {}
    processes = cpu_count()
    data_set_split = []
    data_set_len = len(data_set)
    self.num_records = data_set_len
    # split the dataset into one (start, end, slice) triple per process
    for j in range(processes):
        start = ceil(j * data_set_len / processes)
        end = ceil((j + 1) * data_set_len / processes)
        if end > data_set_len:
            end = data_set_len
        data_set_split.append((start, end, data_set[start:end]))
    total_data_size = 0
    (matrix_str, generator_bytes) = self.serialize_key()
    with concurrent.futures.ProcessPoolExecutor(processes) as executor:
        future_list = {
            executor.submit(self.augment_encrypt, self.vector_length,
                            self.predicate_scheme, self.group_name,
                            matrix_str, generator_bytes,
                            self.public_parameters, data_set_component,
                            start, end)
            for (start, end, data_set_component) in data_set_split
        }
        for future in concurrent.futures.as_completed(future_list):
            res = future.result()
            if res is not None:
                total_data_size = total_data_size + res
    self.enc_data_size = total_data_size
def createPNGPicObjects(cls, pImgClient):
    """
    :param PlexIMGClient pImgClient: the :py:class:`PlexIMGClient <nprstuff.npremail.email_imgur.NPRStuffIMGClient>`
      used to access and manipulate (add, delete, rename) images in the main Imgur_ album.
    :returns: a :py:class:`list` of :py:class:`PNGPicObject <nprstuff.npremail.PNGPicObject>`
      representing the images in the main Imgur_ album.
    :rtype: list
    """
    pngPICObjects = []

    def _create_object(imgMD5):
        imgName, imgID, imgurlLink, imgDateTime = pImgClient.imghashes[imgMD5]
        try:
            newObj = PNGPicObject({
                'initialization': 'SERVER',
                'imgurlLink': imgurlLink,
                'imgName': imgName,
                'imgMD5': imgMD5,
                'imgDateTime': imgDateTime
            }, pImgClient)
            return newObj
        except Exception:
            return None

    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # ## doesn't work with multiprocessing for some reason...
        pngPICObjects = list(filter(None, map(_create_object,
                                              pImgClient.imghashes)))
        return pngPICObjects
def calculatePearsonCorrelationMatrixMultiprocessing(matrix, axis=0,
                                                     symmetrical=True,
                                                     getpvalmat=False):
    if axis == 1:
        matrix = matrix.T
    nRows = matrix.shape[0]

    # create shared array that can be used from multiple processes
    output_r_arr = Array(ctypes.c_double, matrix.shape[0] * matrix.shape[0])
    # then in each new process create a new numpy array using:
    output_r = np.frombuffer(output_r_arr.get_obj())  # mp_arr and arr share the same memory
    # make it two-dimensional
    output_r = output_r.reshape((matrix.shape[0], matrix.shape[0]))  # b and arr share the same memory
    # output_r = np.zeros((nRows, nRows))  # old version

    output_p_arr = Array(ctypes.c_double, matrix.shape[0] * matrix.shape[0])
    output_p = np.frombuffer(output_p_arr.get_obj())
    output_p = output_p.reshape((matrix.shape[0], matrix.shape[0]))

    print 'Calculating Pearson R for each row, multithreaded'
    print mp.cpu_count(), 'processes in pool'

    pool = None
    try:
        pool = mp.Pool(mp.cpu_count(),
                       initializer=_init_pool,
                       initargs=(matrix, output_r_arr, output_p_arr, nRows,
                                 symmetrical))
        # bar = tqdm(total=nRows*nRows/2)
        # tqdm.write('Calculating Pearson R for each row, multithreaded')
        for result in tqdm(pool.imap_unordered(_f, range(0, nRows)),
                           total=nRows):
            # bar.update(result)
            pass
        # bar.close()
    finally:
        # make sure processes are closed in the end, even if errors happen;
        # guard against Pool creation itself having failed
        if pool is not None:
            pool.close()
            pool.join()

    print output_r
    if getpvalmat:
        return output_r, output_p
    else:
        return output_r
def _check_threads(self, message_end=None):
    """
    Checks number of threads
    :param message_end: closing part of the error message.
    """
    if PATHOS_FOUND:
        threads = AbinsModules.AbinsParameters.threads
        if not (isinstance(threads, six.integer_types)
                and 1 <= threads <= mp.cpu_count()):
            raise RuntimeError("Invalid number of threads for parallelisation "
                               "over atoms" + message_end)
def _calc_num_threads(self, df_size: int, query_size: int) -> int:
    num_queries = df_size * query_size
    # leave one core free on multi-core machines
    if mp.cpu_count() == 1:
        available = 1
    else:
        available = mp.cpu_count() - 1
    calc = int(num_queries / 5000)
    if calc > available:
        r = available
    elif calc <= 1:
        r = 2 if num_queries > 1000 else 1
    else:
        r = calc
    return r
def parallelmap(func, data, nodes=None):
    """
    Return the averaged signal and background (based on blank frames) over
    the given runs.
    """
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, data)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
def parallelmap(func, lst, nodes=None):
    """
    Return the averaged signal and background (based on blank frames) over
    the given runs using multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
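# A minimal usage sketch (not from the original source), assuming pathos is
# installed. Because pathos pools serialize with dill, plain lambdas work:
# results = parallelmap(lambda r: r ** 2, list(range(10)), nodes=2)
# -> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]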
def make_predictions(conf, shot_list, loader):
    loader.set_inference_mode(True)
    use_cores = max(1, mp.cpu_count() - 2)

    if backend == 'tf' or backend == 'tensorflow':
        first_time = "tensorflow" not in sys.modules
        if first_time:
            import tensorflow as tf
            os.environ['KERAS_BACKEND'] = 'tensorflow'
            from keras.backend.tensorflow_backend import set_session
            config = tf.ConfigProto(device_count={"CPU": use_cores})
            set_session(tf.Session(config=config))
    else:
        os.environ['THEANO_FLAGS'] = 'device=cpu'
        import theano

    from plasma.models.builder import ModelBuilder
    specific_builder = ModelBuilder(conf)

    y_prime = []
    y_gold = []
    disruptive = []

    model = specific_builder.build_model(True)
    model.compile(optimizer=optimizer_class(),
                  loss=conf['data']['target'].loss)
    specific_builder.load_model_weights(model)
    model_save_path = specific_builder.get_latest_save_path()

    start_time = time.time()
    pool = mp.Pool(use_cores)
    fn = partial(make_single_prediction, builder=specific_builder,
                 loader=loader, model_save_path=model_save_path)

    print('running in parallel on {} processes'.format(pool._processes))
    for (i, (y_p, y, is_disruptive)) in enumerate(pool.imap(fn, shot_list)):
        print('Shot {}/{}'.format(i, len(shot_list)))
        sys.stdout.flush()
        y_prime.append(y_p)
        y_gold.append(y)
        disruptive.append(is_disruptive)
    pool.close()
    pool.join()
    print('Finished Predictions in {} seconds'.format(time.time() - start_time))
    loader.set_inference_mode(False)
    return y_prime, y_gold, disruptive
def __init__(self, logger, directory, urls, max_cores=0):
    self.logger = logger        # default logger
    self.directory = directory  # working directory
    self.urls = urls            # service URLs
    self.results = []           # parallel execution results
    self.returnVal = {}         # final execution results
    self.task_list = []         # parallel task list
    # self.clearCommonParams()

    # user defined parameters
    self.common_params = None
    self.method_params = None
    self.num_threads = None
    self.num_jobs = None
    # self.callback_url = os.environ['SDK_CALLBACK_URL']

    # default parameters
    self.num_cores = mp.cpu_count()
    self.max_cores = max_cores
def bootstrap(data, fun, n_resamples=10000, alpha=0.05):
    """
    Compute confidence interval for values of function fun on
    data_preprocess_scripts

    Args:
        data : list of numpy arrays
            Each numpy array will be subsampled and then passed to fun.
        fun : Function taking len(data_preprocess_scripts) numpy arrays -> float
        n_resamples : int, 10000 is default
            Number of times to resample data_preprocess_scripts to produce
            intervals.
        alpha : float, 0.05 is default
            Confidence level parameter for confidence intervals.
    """
    assert isinstance(data, list)
    n_samples = len(data[0])
    idx = np.random.randint(0, n_samples, (n_resamples, n_samples))

    def select(data, sample):
        return [d[sample] for d in data]

    def evaluate(sample):
        return fun(*select(data, sample))

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    values = pool.map(evaluate, idx)
    pool.terminate()

    idx = idx[np.argsort(values, axis=0, kind='mergesort')]
    values = np.sort(values, axis=0, kind='mergesort')

    stat = namedtuple('stat', ['value', 'index'])
    low = stat(value=values[int((alpha / 2.0) * n_resamples)],
               index=idx[int((alpha / 2.0) * n_resamples)])
    high = stat(value=values[int((1 - alpha / 2.0) * n_resamples)],
                index=idx[int((1 - alpha / 2.0) * n_resamples)])
    return low, high
def train_on_files(self, shot_files, use_shots, all_machines):
    conf = self.conf
    all_signals = conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)

    previously_saved, machines_saved = self.previously_saved_stats()
    machines_to_compute = all_machines - machines_saved
    recompute = conf['data']['recompute_normalization']
    if recompute:
        machines_to_compute = all_machines
        previously_saved = False

    if not previously_saved or len(machines_to_compute) > 0:
        if previously_saved:
            self.load_stats()
        print('computing normalization for machines {}'.format(
            machines_to_compute))
        use_cores = max(1, mp.cpu_count() - 2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(pool._processes))
        start_time = time.time()

        for (i, stats) in enumerate(
                pool.imap_unordered(self.train_on_single_shot, shot_list_picked)):
            # for (i, stats) in enumerate(map(self.train_on_single_shot, shot_list_picked)):
            if stats.machine in machines_to_compute:
                self.incorporate_stats(stats)
                self.machines.add(stats.machine)
            sys.stdout.write('\r' + '{}/{}'.format(i, len(shot_list_picked)))
        pool.close()
        pool.join()
        print('Finished Training Normalizer on {} files in {} seconds'.format(
            len(shot_list_picked), time.time() - start_time))
        self.save_stats()
    else:
        self.load_stats()
    print(self)
def calculate_centroids(self, p=None):
    """
    Perform integration to find centroid at all turns up to N. Multiprocessing
    pool used to calculate independent turn values.

    Will automatically use `integrate_first_order` or `integrate_second_order`
    if appropriate.

    Args:
        p: Specify number of processes for pool. If not given then
           `cpu_count` is used.

    Returns:
        array of floats
    """
    if p:
        pool_size = p
    else:
        pool_size = cpu_count()
    pool = Pool(pool_size)

    # attempt to speed things up by spreading out difficult integration
    # values at the end of range -- appeared to not work
    # x = []
    # for i in range(cpu_count()):
    #     x += range(N)[i::4]

    if len(self.mu) == 1:
        integration_function = self.integrate_first_order
    elif len(self.mu) == 2:
        integration_function = self.integrate_second_order
    else:
        integration_function = self.integrate_any_order

    x = range(self.N)
    results = pool.map(integration_function, x)
    pool.close()
    return results
def __init__(self, process):
    self.size = mp.cpu_count()
    self.process = process
    self.phase = None
    self.pool = mp.ProcessingPool(mp.cpu_count())
def populate(centres, masses, halomodel=None, profile=None, hodmod=None, edges=None):
    """
    Populate a series of DM halos with galaxies given a HOD model.

    Parameters
    ----------
    centres : (N,3)-array
        The cartesian co-ordinates of the centres of the halos

    masses : array_like
        The masses (in M_sun/h) of the halos

    halomodel : type :class:`halomod.HaloModel`
        A HaloModel object pre-instantiated. One can either use this, or
        *both* `profile` and `hodmod` arguments.

    profile : type :class:`profile.Profile`
        A density profile to use.

    hodmod : object of type :class:`hod.HOD`
        A HOD model to use to populate the dark matter.

    edges : float, len(2) iterable, or (2,3)-array
        Periodic box edges. If float, defines the upper limit of cube, with
        lower limit at zero. If len(2) iterable, defines edges of cube.
        If (2,3)-array, specifies edges of arbitrary rectangular prism.

    Returns
    -------
    pos : array
        (N,3)-array of positions of galaxies.

    halo : array
        (N)-array of associated haloes (by index)

    H : int
        Number of central galaxies. The first H galaxies in pos/halo
        correspond to centrals.
    """
    if halomodel is not None:
        profile = halomodel.profile
        hodmod = halomodel.hod

    masses = np.array(masses)

    # Define which halos have central galaxies.
    cgal = np.random.binomial(1, hodmod.nc(masses))
    cmask = cgal > 0
    central_halos = np.arange(len(masses))[cmask]

    if hodmod._central:
        masses = masses[cmask]
        centres = centres[cmask]

    # Calculate the number of satellite galaxies in halos.
    # Using _ns, rather than ns, gives the correct answer for both central
    # condition and not. Note that other parts of the algorithm also need to
    # be changed if the central condition is not true.
    # if hodmod._central:
    #     sgal = poisson.rvs(hodmod._ns(masses[cmask]))
    # else:
    sgal = poisson.rvs(hodmod._ns(masses))

    # Get an array ready, hopefully speeds things up a bit
    ncen = np.sum(cgal)
    nsat = np.sum(sgal)
    pos = np.empty((ncen + nsat, 3))
    halo = np.empty(ncen + nsat)

    # Assign central galaxy positions
    halo[:ncen] = central_halos
    if hodmod._central:
        pos[:ncen, :] = centres
    else:
        pos[:ncen, :] = centres[cmask]

    smask = sgal > 0
    # if hodmod._central:
    #     sat_halos = central_halos[np.arange(len(masses[cmask]))[smask]]
    # else:
    if hodmod._central:
        sat_halos = central_halos[np.arange(len(masses))[smask]]
    else:
        sat_halos = np.arange(len(masses))[smask]

    sgal = sgal[smask]
    centres = centres[smask]
    masses = masses[smask]

    # Now go through each halo and calculate galaxy positions
    start = time.time()
    halo[ncen:] = np.repeat(sat_halos, sgal)
    indx = np.concatenate(([0], np.cumsum(sgal))) + ncen

    # print "SMASHING THIS NOW"
    def fill_array(i):
        m, n, ctr = masses[i], sgal[i], centres[i]
        pos[indx[i]:indx[i + 1], :] = profile.populate(n, m, ba=1, ca=1, centre=ctr)

    if HAVE_POOL:
        mp.ProcessingPool(mp.cpu_count()).map(fill_array, range(len(masses)))
    else:
        for i in range(len(masses)):
            fill_array(i)

    nhalos_with_gal = len(set(central_halos.tolist() + sat_halos.tolist()))

    print "Took ", time.time() - start, " seconds, or ", (time.time() - start) / nhalos_with_gal, " each halo."
    print "NhalosWithGal: ", nhalos_with_gal, ", Ncentrals: ", ncen, ", NumGal: ", len(halo), ", MeanGal: ", float(len(halo)) / nhalos_with_gal, ", MostGal: ", sgal.max() + 1 if len(sgal) > 0 else 1

    if edges is None:
        pass
    elif np.isscalar(edges):
        edges = np.array([[0, 0, 0], [edges, edges, edges]])
    elif np.array(edges).shape == (2,):
        edges = np.array([[edges[0]] * 3, [edges[1]] * 3])

    if edges is not None:
        # wrap galaxy positions back into the periodic box
        for j in range(3):
            d = pos[:, j] - edges[0][j]
            pos[d < 0, j] = edges[1][j] + d[d < 0]
            d = pos[:, j] - edges[1][j]
            pos[d > 0, j] = edges[0][j] + d[d > 0]

    return pos, halo.astype("int"), ncen
# parsing configuration file to import dir of cross-corr results
from seissuite.ant import (pscrosscorr, psutils)

process_type = "serial"
multiprocess = True

# import multiprocessing as mp
if multiprocess:
    try:
        import pathos.multiprocessing as mp
    except ImportError:
        import multiprocessing as mp
    no_of_cores = int(mp.cpu_count())
    process_type = "parallel"

# import CONFIG class initialised in ./configs/tmp_config.pickle
config_pickle = 'configs/tmp_config.pickle'
f = open(config_pickle, 'rb')
CONFIG = pickle.load(f)
f.close()

# import variables from initialised CONFIG class.
MSEED_DIR = CONFIG.MSEED_DIR
DATABASE_DIR = CONFIG.DATABASE_DIR
DATALESS_DIR = CONFIG.DATALESS_DIR
def __init__(self, source, config, processes=cpu_count()):
    self._pool = Pool(processes)
    self._results = self._pool.imap(_merge_records,
                                    izip(source, repeat(config)))
def genseq(idx):
    first = np.where(np.random.multinomial(1, pvals=pops) == 1)[0][0]
    last = first
    last_ts = datetime.now()
    result = {'artist_idx': [first], 'ts': [last_ts]}
    for i in xrange(seq_length - 1):
        next_listen = draw(last)
        last = next_listen
        gap_bin = 120 * np.where(np.random.multinomial(1, pvals=td) == 1)[0][0]
        gap = np.random.randint(gap_bin, gap_bin + 120)
        result['artist_idx'].append(next_listen)
        new_ts = last_ts + timedelta(0, gap)
        result['ts'].append(new_ts)
        last_ts = new_ts
    df = pd.DataFrame(result)
    df['block'] = ((df['artist_idx'].shift(1) != df['artist_idx'])
                   .astype(int).cumsum()) - 1
    df.to_pickle(str(idx) + '.pkl')
    logging.info('idx {} complete'.format(idx))

pool = Pool(cpu_count())
indices = range(n)
pool.map(genseq, indices)
pool.close()
    # if os.path.isfile(filename):
    #     data = _merge(filename, data)
    if data:
        dill_save_obj(data, filename)


def load_finland_ids():
    with open('finland_ids.csv', 'r') as f:
        lines = f.readlines()
    return map(lambda l: l.split(',')[0].strip(), lines)


if __name__ == '__main__':
    chunk_size = pmp.cpu_count() - 2
    id_list = load_finland_ids()
    start_time = datetime(2013, 11, 1)
    end_time = datetime(2013, 11, 2)
    id_list_group = (lambda l, s=chunk_size:
                     [l[i:i + s] for i in range(0, len(l), s)])(id_list)
    print "chunk size : " + str(chunk_size) + " len(id_list_group) : " + str(len(id_list_group))
    try:
        for id_group in id_list_group:
            p = pmp.Pool(chunk_size)
            p.map(lambda fin_id: read_vtt_data(fin_id, start_time, end_time),
                  id_group)
            p.close()
            p.join()
            p.terminate()
print('done reading simulation cases')
df['sampled_std'] = None
df['mse'] = None
df['error_per'] = None
df['pearson'] = None

if 0:
    '''removing non interesting cases'''
    df = df[(df.quant_size * df.number_of_bins < 15)
            & (df.quant_size * df.number_of_bins > 6)]
    df = df[df.quant_size * df.number_of_bins > 8.9]
    print('dropping non interesting cases - running only %d simulations'
          % df.shape[0])
    df.reset_index(drop=True, inplace=True)

if not u.run_serial and u.number_of_splits == 1:
    '''multi cores'''
    num_of_cpu = mp.cpu_count()
    if 0:
        print('disabling parallel work!!!!!')
        num_of_cpu = 1
    print('number of cpus: %d' % num_of_cpu)
    p = mp.Pool(num_of_cpu)
    args = [dict(df=df.iloc[inx], A_rows=A_rows, samples=u.samples)
            for inx in np.array_split(range(df.shape[0]), num_of_cpu)]
    df[['sampled_std', 'mse', 'error_per', 'pearson']] = pd.concat(
        p.map(
            lambda y: y['df'].apply(
                lambda x: pd.Series(
                    globals()[x.method](samples=y['samples'],
                                        quant_size=x.quant_size,
                                        std_threshold=x.std_threshold,
                                        number_of_bins=x.number_of_bins,
                                        A_rows=y['A_rows']),
                    index=['sampled_std', 'mse', 'error_per', 'pearson']),
                axis=1),
            args))
else:
    for inx in tqdm(np.array_split(range(df.shape[0]), 100)):
        df.loc[inx + df.index.values[0],
               ['sampled_std', 'mse', 'error_per', 'pearson']] = \
            df.iloc[inx].apply(
                lambda x: pd.Series(
                    globals()[x.method](samples=u.samples,
                                        quant_size=x.quant_size,
                                        number_of_bins=x.number_of_bins,
                                        std_threshold=x.std_threshold,
                                        A_rows=A_rows),
                    index=['sampled_std', 'mse', 'error_per', 'pearson']),
                axis=1)

print(df.head())
df.to_csv('resutls_%08d.csv.gz' % u.number_of_split, compression='gzip')
# df.dropna(how='any', inplace=True)
# res = df.pivot_table(values='mse', columns=['number_of_bins', 'method'],
#                      index=['quant_size'], aggfunc=[np.mean, np.median])
# res.to_csv('7.2 resutls.csv')
def __init__(self, source, config, single_process=False, processes=cpu_count()):
    if single_process:
        self.results = imap(transform, izip(source, repeat(config)))
    else:
        self.pool = Pool(processes)
        self.results = self.pool.imap(transform, izip(source, repeat(config)))
print(clock)
'''

# ---------------------------------------
#
# This approach uses multiple cores
#
from pathos.multiprocessing import ProcessingPool, cpu_count

if __name__ == '__main__':  # This is essential if used on Windows!
    clock = time()  # starting time

    # Creates a worker pool from the given command-line parameter. If the
    # given parameter is too large, all detectable CPUs will be utilised.
    # If the given parameter is nonsense, only 1 core will be utilised.
    workers = 1
    if len(sys.argv) >= 2 and sys.argv[1].isdigit() and int(sys.argv[1]) > 0:
        workers = cpu_count()
        if int(sys.argv[1]) <= workers:
            workers = int(sys.argv[1])
    print 'N: ' + str(N)
    print 'PW: ' + str(workers)
    sleep(3)  # just 3 seconds pause to read the input again.

    # All the magic happens here:
    pool = ProcessingPool(workers)
    Ys = pool.map(steadyState, y0)

    clock = time() - clock  # elapsed time
    print 'Seconds: ' + str(clock)  # Not essential but useful.

    # Serialisation of results and stats:
def set_experiment(self, experiment, options, mstats, logbook, pset, toolbox,
                   population, invalid_ind, hof, START_GEN=None,
                   POP_SIZE=None, SEED=42):
    if options:
        self.options = options
    if not self.re_positive_dataset:
        print "Loading datasets"
        self.load_dataset(self.options)
    if experiment:
        self.experiment = experiment
    if hof:
        self.hof = hof
    else:
        self.hof = deap.tools.ParetoFront(self.pareto_similar)
    if START_GEN:
        self.START_GEN = START_GEN
    if POP_SIZE:
        self.POP_SIZE = POP_SIZE
    self.CXPB = options.cxpb
    self.MUTPB = options.mutpb

    if mstats:
        self.mstats = mstats
    else:
        stats = {}

        def lambda_factory(idx):
            return lambda ind: ind.fitness.values[idx]

        for tag in self.FITNESS_TAGS:
            s = deap.tools.Statistics(
                key=lambda_factory(self.FITNESS_TAGS.index(tag)))
            stats[tag] = s
        # stats["Discrimination"] = deap.tools.Statistics(key=lambda ind: ind.fitness.values[0])
        # stats["Support"] = deap.tools.Statistics(key=lambda ind: ind.fitness.values[1])
        # stats_accuracy = deap.tools.Statistics(key=lambda ind: ind.fitness.values[0])
        # stats_coverage = deap.tools.Statistics(key=lambda ind: ind.fitness.values[1])
        self.mstats = deap.tools.MultiStatistics(**stats)
        # accuracy=stats_accuracy, coverage=stats_coverage
        self.mstats.register("avg", numpy.mean, axis=0)
        self.mstats.register("std", numpy.std, axis=0)
        self.mstats.register("min", numpy.min, axis=0)
        self.mstats.register("max", numpy.max, axis=0)

    if logbook:
        self.logbook = logbook
    else:
        self.logbook = deap.tools.Logbook()
        # self.logbook.header = "gen", "evals", "hypervolume", "memoize"
        self.logbook.header = "gen", "evals", "memoize"
        self.logbook.header += tuple(self.FITNESS_TAGS)  # tuple(["accuracy", "coverage"])

    if pset:
        self.pset = pset
    if population:
        self.population = population
    if invalid_ind:
        self.invalid_ind = invalid_ind

    # creator.create("FitnessMax", base.Fitness, weights=self.FITNESS_WEIGHTS)
    # creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax, pset=self.pset)

    if toolbox:
        self.toolbox = toolbox
    else:
        self.toolbox = deap.base.Toolbox()

    if self.options.ncpu and (self.options.ncpu == "auto"
                              or int(self.options.ncpu) > 1):
        # Check if PATHOS was loaded
        global PATHOS
        if PATHOS:
            if self.options.ncpu == "auto":
                n_cpu = cpu_count()
            else:
                n_cpu = int(self.options.ncpu)
            print "Using pathos.multiprocessing with", n_cpu, "cpus."
            pool = Pool(n_cpu)
            self.toolbox.register("map", pool.map)
        else:
            print "WARNING: Failed to load pathos; using a single processor."

    # Basic toolbox functions
    self.toolbox.register("expr", deap.gp.genHalfAndHalf, pset=self.pset,
                          type_=self.pset.ret, min_=1, max_=4)
    self.toolbox.register("individual", deap.tools.initIterate,
                          deap.creator.Individual, self.toolbox.expr)
    self.toolbox.register("population", deap.tools.initRepeat, list,
                          self.toolbox.individual)
    self.toolbox.register("compile", deap.gp.compile, pset=self.pset)

    # Set evaluation
    if self.options.grammar == "pssm":
        self.toolbox.register("evaluate", self.eval_pssm_match)
    else:
        try:
            # self.toolbox.register("evaluate", self.memoize_pfm_match)
            # self.toolbox.register("evaluate", self.eval_regex_match)
            # self.toolbox.register("evaluate", self.memoize_regex_match)
            # self.toolbox.register("evaluate", self.memoize_python_matcher)
            if self.options.matcher == "grep":
                print "Loading grep matcher"
                self.test_grep()
                self.toolbox.register("evaluate", self.memoize_grep_match)
            elif self.options.matcher == "python":
                print "Loading Python matcher"
                self.toolbox.register("evaluate", self.memoize_python_matcher)
            else:
                raise Exception("Unknown matching mode" + str(self.options.matcher))
            print "Loaded."
        except Exception as e:
            print "Exception occurred while setting matcher: ({0}): {1}".format(e.message, str(e.args))
            print "Use --matcher=python as argument."
            exit(-1)
            # print "Falling back to python matcher"
            # self.toolbox.register("evaluate", self.memoize_python_matcher)

    # Add penalties
    for penalty_feasible in self.FITNESS_PENALTIES:
        if penalty_feasible is not None:
            self.toolbox.decorate(
                "evaluate",
                deap.tools.DeltaPenality(penalty_feasible,
                                         (0.0,) * len(self.FITNESS_TAGS)))
    # self.toolbox.register("evaluate", self.eval_pfm_match)
    # self.toolbox.register("evaluate", self.eval_precomputed_pfm_matches)

    # self.toolbox.register("mate", deap.gp.cxOnePoint)
    self.toolbox.register("mate", deap.gp.cxOnePointLeafBiased, termpb=0.1)
    self.toolbox.register("expr_mut", deap.gp.genGrow, min_=1, max_=4,
                          pset=self.pset)
    self.toolbox.register("mutate", deap.gp.mutUniform,
                          expr=self.toolbox.expr_mut, pset=self.pset)
    # self.toolbox.register("mutate", self.multimutate, pset=self.pset)

    # Static limit for GP tree
    MAX_HEIGHT = 17  # Koza
    # MAX_HEIGHT = 8
    # MAX_HEIGHT = 89
    # MAX_HEIGHT = 25
    self.toolbox.decorate("mate", deap.gp.staticLimit(
        operator.attrgetter('height'), MAX_HEIGHT))
    self.toolbox.decorate("mutate", deap.gp.staticLimit(
        operator.attrgetter('height'), MAX_HEIGHT))

    # Seeding
    self.toolbox.register("init_seeds", self.initSeeds, deap.creator.Individual)
    self.toolbox.register("init_seeded_population",
                          self.initSeededPopulation, list,
                          self.toolbox.init_seeds)

    # random pop for algorithms that need an initial pop
    self.population = self.toolbox.population(self.POP_SIZE)
    self.toolbox.register("memoizecount", lambda: self.memoize_count)

    if self.options.moo == "SPEA2":
        self.toolbox.register("select", deap.tools.selSPEA2)
    elif self.options.moo == "NSGA2":
        self.toolbox.register("select", deap.tools.selNSGA2)
    elif self.options.moo == "MOEAD":
        pass
    else:
        if self.options.moo == "ESCMA":
            from deap import cma
            """
            # We start with a centroid in [-4, 4]**D
            sigma = 2 * 10**(-2 * numpy.random.rand())
            strategy = cma.StrategyMultiObjective(
                self.population,
                # centroid=numpy.random.uniform(-4, 4, len(self.population)),
                sigma=sigma,
                lambda_=len(self.population))
            # self.toolbox.register("generate", strategy.generate, deap.creator.Individual)
            self.toolbox.register("update", strategy.update)
            for ind in self.population:
                ind.fitness.values = self.toolbox.evaluate(ind)
            """
            strategy = cma.StrategyMultiObjective(self.population,
                                                  sigma=1.0,
                                                  mu=self.POP_SIZE,
                                                  lambda_=self.POP_SIZE)
            self.toolbox.register("generate", strategy.generate,
                                  deap.creator.Individual)
            self.toolbox.register("update", strategy.update)
        else:
            # DEFAULT
            if self.options.moo != "NSGAR":
                print "Unknown algorithm", self.options.moo, ". Defaulting to NSGAR"
            self.toolbox.register("preselect", nsgafortin.selTournamentFitnessDCD)
            self.toolbox.register("select", nsgafortin.selNSGA2)
            # Evaluate the individuals with an invalid fitness
            self.population = self.toolbox.population(self.POP_SIZE)  # random pop
            invalid_ind = [ind for ind in self.population
                           if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.toolbox.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit
            self.population = self.toolbox.select(self.population, self.POP_SIZE)

    # Set REVCOMP
    self.REVCOMP = options.revcomp

    if not self.options.no_seed:
        seed_options = copy.deepcopy(self.options)
        seed_options.no_seed = True
        seed_options.grammar = "alpha"
        # seed_options.moo = "SPEA2"
        # FIXME: there is currently a bug preventing seeds from being
        # produced by the FORTIN selection algorithm.
        seedengine = Engine(None)
        # seedengine.boot(self.options)
        # print self.population
        # raw_input()
        seedengine.set_experiment("regex", seed_options, None, None, None,
                                  self.toolbox,
                                  self.toolbox.population(self.POP_SIZE),
                                  None, None, 0)
        ENGINE_GEN = 50
        print "Seeded for", ENGINE_GEN, "generations using grammar:", seed_options.grammar
        self_seeds, self.logbook = seedengine.run(ENGINE_GEN)
        self_seeds = [ind for ind in seedengine.hof]
        # print self_seeds
        # raw_input()
        self.population = self_seeds
        # TODO: problem with small seed counts
    else:
        if len(self.initial_seeds) > 0:
            # self.initial_seeds = ["add,C,A", "C", "add,A,add,C,add,T,G"]
            print "Seeded population initiated"
            self.population = self.toolbox.init_seeded_population()
        elif not self.population:
            self.population = self.toolbox.population(self.POP_SIZE)
    print "Initial pop size:", len(self.population)