def transform(self, X, columns=None):
    """
    Given X, create features of fitted studies
    :param X: Dataset with features used to create fitted studies
    :param columns: Optional list of columns restricting which fitted studies are applied
    :return: DataFrame of features produced by the fitted studies
    """
    # Remove trailing identifier in column list if present
    if columns is not None:
        columns = [re.sub(r'_[0-9]+$', '', s) for s in columns]
    X.columns = X.columns.str.lower()  # columns must be lower case
    pool = ProcessPool(nodes=self.n_jobs)  # Number of jobs
    self.result = []

    # Iterate fitted studies and calculate TA with fitted parameter set
    for ind in self.fitted:
        # Apply study if no column filter is given or its output is in the columns list
        if columns is None or ind.res_y.name in columns:
            self.result.append(pool.apipe(ind.transform, X))

    # Blocking wait for asynchronous results
    self.result = [res.get() for res in self.result]

    # Combine results into dataframe to return
    res = pd.concat(self.result, axis=1)
    return res
class InterpreterBot(discord.Client):

    def __init__(self):
        super().__init__()
        self.pool = ProcessPool(nodes=4)

    async def on_ready(self):
        print("logged in as {}".format(self.user))

    async def on_message(self, message):
        command_str = ">>"
        content = message.content
        if content.startswith(command_str):
            # remove the command prefix itself and (maybe) a space
            source = re.sub(r"{} ?".format(command_str), "", content, 1)
            # remove code markers so code boxes work with this "beautiful" regex
            source = re.sub(r"(^`{1,3}(py(thon)?)?|`{1,3}$)", "", source)
            # log output to help debugging on failure
            print("Executed {}".format(repr(source)))
            sent = await message.channel.send("running code...")
            result = self.pool.apipe(interpret, source)
            output = None
            try:
                output = result.get(timeout=10)
            except multiprocess.context.TimeoutError:
                output = "Timeout error - do you have an infinite loop?"
            except Exception as e:
                output = "Runtime error: {}".format(e)
            await sent.edit(content="```{}```".format(output or "(no output to stdout)"))

    def run(self):
        token = os.getenv("TOKEN")
        if token:
            super().run(token)
        else:
            raise EnvironmentError("TOKEN environment variable doesn't exist")
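# The `interpret` worker submitted to apipe above is not shown. The following is a
# hypothetical minimal sketch of such a worker (an assumption, not the original code):
# it executes the submitted source in a throwaway namespace and returns whatever the
# code printed to stdout.
import contextlib
import io as _io

def interpret(source):
    buffer = _io.StringIO()
    with contextlib.redirect_stdout(buffer):
        # runs untrusted code; a real bot would need sandboxing on top of the timeout
        exec(source, {})
    return buffer.getvalue()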
def transform(self, X):
    X.columns = X.columns.str.lower()  # columns must be lower case
    pool = ProcessPool(nodes=self.n_jobs)
    self.result = []

    # Submit one asynchronous transform per fitted indicator
    for ind in self.fitted:
        self.result.append(pool.apipe(ind.transform, X))

    # Blocking wait for asynchronous results
    self.result = [res.get() for res in self.result]

    # Combine results into a single dataframe
    res = pd.concat(self.result, axis=1)
    return res
def fit(self, X, y, trials=5, indicators=indicators, ranges=ranges,
        tune_series=tune_series, tune_params=tune_params, tune_column=tune_column):
    self.fitted = []
    X.columns = X.columns.str.lower()  # columns must be lower case
    pool = ProcessPool(nodes=self.n_jobs)

    for low, high in ranges:
        if low <= 1:
            raise ValueError("Range low must be > 1")
        if high >= len(X):
            raise ValueError(
                f"Range high:{high} must be < length of X:{len(X)}")
        for ind in indicators:
            # Index column to optimize if the indicator returns a dataframe
            idx = 0
            if ":" in ind:
                idx = int(ind.split(":")[1])
                ind = ind.split(":")[0]
            fn = f"{ind}("
            # TA-Lib (tta) indicators expose their parameters only via the docstring
            if ind[0:3] == "tta":
                usage = eval(f"{ind}.__doc__").split(")")[0].split("(")[1]
                params = re.sub(r'[^0-9a-zA-Z_\s]', '', usage).split()
            else:
                sig = inspect.signature(eval(ind))
                params = sig.parameters.values()
            # Build the function call string, mapping known parameter names to inputs
            for param in params:
                param = re.split(':|=', str(param))[0].strip()
                if param == "open_":
                    param = "open"
                if param == "real":
                    fn += f"X.close, "
                elif param == "ohlc":
                    fn += f"X, "
                elif param == "ohlcv":
                    fn += f"X, "
                elif param in tune_series:
                    fn += f"X.{param}, "
                elif param in tune_params:
                    fn += f"{param}=trial.suggest_int('{param}', {low}, {high}), "
            fn += ")"
            self.fitted.append(
                pool.apipe(Optimize(function=fn, n_trials=trials).fit, X, y,
                           idx=idx, verbose=self.verbose))

    self.fitted = [fit.get() for fit in self.fitted]  # Get results of jobs
def do_parallel_with_pbar(closure, args, num_processes=6):
    pool = ProcessPool(nodes=num_processes)
    print('STARTING TASKS')
    results = [pool.apipe(closure, arg) for arg in tqdm(args)]
    print('COMPLETING TASKS')

    # Poll the asynchronous results and advance the progress bar as they finish
    total = len(results)
    with tqdm(total=total) as pbar:
        num_ready = 0
        while num_ready < total:
            ready = [r for r in results if r.ready()]
            new_num_ready = len(ready)
            if new_num_ready > num_ready:
                pbar.update(new_num_ready - num_ready)
                num_ready = new_num_ready
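# Hypothetical usage sketch for do_parallel_with_pbar (the worker function and
# inputs below are made up for illustration and are not part of the original snippet):
import time
from pathos.multiprocessing import ProcessPool
from tqdm import tqdm

def slow_square(x):
    time.sleep(0.5)  # stand-in for an expensive task
    return x * x

do_parallel_with_pbar(slow_square, list(range(20)), num_processes=4)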
def run_pso(self, function, searchspace, target, nparticles, maxiter, precision,
            domain, verbose=True, pool_size=None):
    """
    Performs a PSO for the given function in the searchspace, looking for the
    target, which is in the output space.

    The asynchronous evaluation means the exact definition of iterations may be
    lost. To preserve some sense of this, an iteration is defined to be
    `nparticles` evaluations performed. This means that not every particle is
    updated in the history for every iteration. However, the total number of
    function evaluations (iterations * nparticles) is still preserved under this
    definition.

    function - the function to be optimized. Its domain must include the
        searchspace and its output must be in the space of target.
    searchspace - np.array((ssdim, 2))
    target - Not used by `ImplicitTargetPSO`. `function` should include any
        necessary target data.
    nparticles - number of particles to use in the optimization
    maxiter - maximum number of iterations of the optimization routine
    precision - how close to the target to attempt to get
    domain - absolute boundaries on the trial solutions/particles
    pool_size - (int) set the ProcessPool size explicitly. Defaults to 4 if not set.
    """
    if not pool_size:
        pool_size = 4

    # update attributes
    self.maxiter = maxiter
    self.precision = precision

    # search space dimensionality
    if searchspace.shape[1] != 2:
        print('WARNING! searchspace does not have dimensions (N, 2).')
    ssdim = searchspace.shape[0]

    # init particle positions and velocities
    xpart = np.random.random((nparticles, ssdim))
    for ii in range(ssdim):
        # scale the uniform random dist to the search space bounds
        xpart[:, ii] = (searchspace[ii, 1] - searchspace[ii, 0]) * xpart[:, ii] + searchspace[ii, 0]
    vpart = np.zeros(xpart.shape)

    # init particle best solution
    pbest = 1.0 * xpart
    # NOTE: Best not to assume the form of obj function input
    cpbest = np.array([self.cost(function(*xp), target) for xp in pbest])

    # init global best solutions
    im = np.argmin(cpbest)
    gbest = pbest[im]
    cgbest = cpbest[im]

    # debugging escape hatch (never taken)
    if False:
        return xpart, vpart, pbest, cpbest, gbest, cgbest

    # intermediate arrays
    # multiply by 1.0 to make copies not bind references
    xarr = 1.0 * xpart[:, :, None]
    varr = 1.0 * vpart[:, :, None]
    parr = 1.0 * pbest[:, :, None]
    cparr = 1.0 * cpbest[:, None]
    garr = 1.0 * gbest[:, None]
    cgarr = 1.0 * np.array([cgbest])

    iternum = 0
    evalnum = 0

    # Asynchronous process management
    pool = ProcessPool(pool_size)
    results = []

    # initial submission
    for fi in range(nparticles):
        # update velocity
        vpart[fi] = self.velocity(vpart[fi], xpart[fi], pbest[fi], gbest)
        # update position
        xpart[fi] = xpart[fi] + vpart[fi]
        # keeps particles inside the absolute boundaries given by `domain`
        xpart[fi] = np.maximum(xpart[fi], domain[:, 0])
        xpart[fi] = np.minimum(xpart[fi], domain[:, 1])
        # compute cost of new position
        results.append(pool.apipe(function, xpart[fi]))

    t1 = time.time()
    while (iternum <= maxiter) and (cgbest > precision):
        for i, res in enumerate(results):
            if res.ready():
                # Get result and update
                cpp = res.get()

                # update best position
                if cpp < cpbest[i]:
                    pbest[i] = xpart[i]
                    cpbest[i] = cpp
                    if cpp < cgbest:
                        gbest = xpart[i]
                        cgbest = cpp

                # update velocity
                vpart[i] = self.velocity(vpart[i], xpart[i], pbest[i], gbest)
                # update position
                xpart[i] = xpart[i] + vpart[i]
                # keeps particles inside the absolute boundaries given by `domain`
                xpart[i] = np.maximum(xpart[i], domain[:, 0])
                xpart[i] = np.minimum(xpart[i], domain[:, 1])

                # Resubmit
                results[i] = pool.apipe(function, xpart[i])

                evalnum += 1
                current_iternum = evalnum // nparticles

                # record history once per `nparticles` evaluations, or on convergence
                if (current_iternum > iternum) or (cgbest < precision):
                    xarr = np.concatenate((xarr, xpart[:, :, None]), axis=2)
                    varr = np.concatenate((varr, vpart[:, :, None]), axis=2)
                    parr = np.concatenate((parr, pbest[:, :, None]), axis=2)
                    cparr = np.concatenate((cparr, cpbest[:, None]), axis=1)
                    garr = np.concatenate((garr, gbest[:, None]), axis=1)
                    cgarr = np.append(cgarr, cgbest)
                    iternum = current_iternum

    t2 = time.time()
    if verbose:
        print('optimization took {:5.2f} seconds'.format(t2 - t1))

    return xarr, varr, parr, cparr, garr, cgarr
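# The snippet above relies on `self.cost` and `self.velocity`, which are not shown.
# The functions below are hypothetical stand-ins using the standard PSO update
# (inertia w, cognitive c1, social c2); the original class's methods may differ.
import numpy as np

def cost(output, target):
    # Euclidean distance between the function output and the target
    return np.linalg.norm(np.asarray(output) - np.asarray(target))

def velocity(v, x, pbest, gbest, w=0.7, c1=1.5, c2=1.5):
    # standard PSO velocity update with random cognitive/social weights
    r1, r2 = np.random.random(x.shape), np.random.random(x.shape)
    return w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)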
pool = Pool(5)
reslist = list()

# variables = ['CLOUD', 'CLDICE', 'CLDLIQ', 'RELUM', 'Q', 'O3', 'T', 'U', 'V', 'OMEGA', 'Z3', "FISCCP1_COSP",
#              "CLDTOT_ISCCP", "MEANCLDALB_ISCCP", "MEANPTOP_ISCCP", "CLD_CAL", "CLDTOT_CAL", "CLDLOW_CAL", "CLDMED_CAL", "CLDHGH_CAL"]
variables = ['CLOUD']

inpath = '/p/user_pub/work/E3SM/cmip6_variables/piControl/atm/vrt_remapped/'
outpath = '/p/user_pub/work/E3SM/cmip6_variables/piControl/atm/vrt_remapped_180x360'
nativepath = '/p/user_pub/work/E3SM/cmip6_variables/piControl/atm/vrt_remapped_ne30'

# submit one asynchronous regridding job per variable
for v in variables:
    reslist.append(
        pool.apipe(
            run_rgr,
            inpath,
            outpath,
            nativepath,
            v))

# for idx, res in enumerate(tqdm(reslist)):
for idx, res in enumerate(reslist):
    out, err = res.get(9999999)
    pprint(out)
    pprint(err)
    if out:
        logging.info(out)
    if err:
        logging.error(err)
def regex_sentencize(docs, max_sentence_length=None, min_sentence_length=None,
                     n_threads=1, reg_split=r"((?:\s*\n)+\s*)",
                     reg_token=r"[\w*]+|[^\w\s\n*]", text_col="text",
                     doc_id_col="doc_id", with_tqdm=False, verbose=0):
    """
    Simple splitting of MIMIC docs into sentences:
    - sentence bounds are found where multiple newlines occur
    - sentences that are too long are cut into `max_sentence_length` length
      sentences by splitting each sentence into tokens using a dumb regexp

    Parameters
    ----------
    docs: pd.DataFrame
    max_sentence_length: int
    with_tqdm: bool
    verbose: int
    doc_id_col: str
    text_col: str

    Returns
    -------
    pd.DataFrame
    """
    n_threads = min(n_threads, len(docs))
    if n_threads > 1:
        # split the docs into chunks and sentencize each chunk in its own process
        text_chunks = np.array_split(np.arange(len(docs)), n_threads)
        pool = ProcessPool(nodes=n_threads)
        pool.restart(force=True)
        results = [
            pool.apipe(regex_sentencize, docs.iloc[chunk], max_sentence_length,
                       1, with_tqdm=False)
            for chunk in text_chunks
        ]
        results = [r.get() for r in results]
        pool.close()
        return pd.concat(results, ignore_index=True)

    reg_split = re.compile(reg_split)
    reg_token = re.compile(reg_token)

    doc_ids = []
    sentence_idx_list = []
    begins = []
    ends = []
    sentences = []
    max_size = 0
    min_size = 10000000
    for doc_id, txt in zip(
            docs[doc_id_col],
            (tqdm(docs[text_col], desc="Splitting docs into sentences")
             if with_tqdm else docs[text_col])):
        idx = 0
        queued_spans = []
        sentence_idx = 0
        for i, part in enumerate(reg_split.split(txt)):
            if i % 2 == 0:  # we're in a sentence
                queued_spans.extend([(m.start() + idx, m.end() + idx)
                                     for m in reg_token.finditer(part)])
                if max_sentence_length is None:
                    max_sentence_length_ = len(queued_spans)
                else:
                    max_sentence_length_ = max_sentence_length
                # emit full-length sentences while the queue is too long
                while len(queued_spans) > max_sentence_length_:
                    b = queued_spans[0][0]
                    e = queued_spans[max_sentence_length_ - 1][1]
                    doc_ids.append(doc_id)
                    sentence_idx_list.append(sentence_idx)
                    begins.append(b)
                    ends.append(e)
                    max_size, min_size = max(max_size, max_sentence_length_), min(
                        min_size, max_sentence_length_)
                    queued_spans = queued_spans[max_sentence_length_:]
                    sentences.append(txt[b:e])
                    sentence_idx += 1
            # too-short sentences are carried over and merged with the next one
            if min_sentence_length is not None and len(queued_spans) < min_sentence_length:
                idx += len(part)
                continue
            if len(queued_spans):
                b = queued_spans[0][0]
                e = queued_spans[-1][1]
                doc_ids.append(doc_id)
                sentence_idx_list.append(sentence_idx)
                begins.append(b)
                ends.append(e)
                max_size, min_size = max(max_size, len(queued_spans)), min(
                    min_size, len(queued_spans))
                queued_spans = []
                sentences.append(txt[b:e])
                sentence_idx += 1
            if part is not None:
                idx += len(part)
    if verbose:
        print("Sentence size: max = {}, min = {}".format(max_size, min_size))

    df = pd.DataFrame({
        doc_id_col: doc_ids,
        "sentence_idx": sentence_idx_list,
        "begin": begins,
        "end": ends,
        "text": sentences,
    }).astype({doc_id_col: docs[doc_id_col].dtype})
    df = df.merge(docs[[doc_id_col] + [
        col for col in docs.columns
        if col not in df.columns and col != "text"
    ]])
    df["sentence_id"] = join_cols(df[[doc_id_col, "sentence_idx"]], "/")
    return df
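# `join_cols` is not shown above. A plausible stand-in (an assumption, not the
# original implementation) that joins the given columns row-wise into one string
# with the given separator:
def join_cols(df, sep="/"):
    return df.astype(str).agg(sep.join, axis=1)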
import time
from pathos.multiprocessing import ProcessPool

# instantiate and configure the worker pool
pool = ProcessPool(nodes=3)

print("- Do a blocking (=synchronous) map on the chosen function")
print(pool.map(pow, [1, 2, 3, 4], [5, 6, 7, 8]))

print("- Do a non-blocking (=asynchronous) map, then get the results")
results = pool.amap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
while not results.ready():
    time.sleep(1)
    print(".")
print(results.get())

print("- Do a non-blocking (=asynchronous) map, then extract the results from the iterator")
results = pool.imap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
print("...")
print(list(results))

print("- Do one item at a time, using a pipe")
print(pool.pipe(pow, 1, 5))
print(pool.pipe(pow, 2, 6))

print("- Do one item at a time, using a non-blocking (=asynchronous) pipe")
result1 = pool.apipe(pow, 1, 5)
result2 = pool.apipe(pow, 2, 6)
print(result1.get())
print(result2.get())
def run(self, verbose=None):
    """Runs the model again

    Parameters
    ----------
    verbose : bool, optional
        if True, prints the messages from dynare during calculation. Default is True
    """
    if verbose is None:
        verbose = self.verbose

    # ensure that the logfile exists and that it is empty
    blank = open(self.logfile, 'w')
    blank.close()

    if verbose:
        if self.isnotebook:
            old_stdout = sys.stdout
            sys.stdout = tempfile.TemporaryFile()
        else:
            from pathos.multiprocessing import ProcessPool
            pool = ProcessPool(nodes=2)
            pipe0 = pool.apipe(print_progress, self.logfile)

    if self.engine_type == 'matlab':
        with PipeOutput(self.logfile, sys.stdout):
            self.eng.eval(self.prefix + self.modname, nargout=0)
        self.workspace = self.eng.workspace
        self.oo_ = self.eng.workspace['oo_']
    else:
        # need to dump the original octave plots somewhere
        pltdir = os.path.join(tempfile.gettempdir(), 'plt')
        if not os.path.isdir(pltdir):
            os.mkdir(pltdir)
        with PipeOutput(self.logfile, sys.stdout):
            self.eng.feval(self.prefix + self.modname, plot_dir=pltdir)
        oct_ws_list = self.eng.eval('who', nout=1)
        self.workspace = {var: self.eng.pull(var) for var in oct_ws_list}
        self.oo_ = self.workspace['oo_']

    if self.plot:
        if self.isnotebook:
            if self.engine_type == 'octave':
                self.imgs = self.eng.extract_figures(pltdir)
                for img in self.imgs:
                    display(img)
            else:
                print("'plot=True' not supported with matlab engine in Jupyter notebook.")
        else:
            epsfiles = [f for f in os.listdir(self.dirpath) if '.eps' in f]
            for figname in epsfiles:
                figpath = os.path.join(self.dirpath, figname)
                figtitle = figname.replace(self.modname + '_', '').replace('.eps', '')
                plot_eps(figpath, figtitle)

    if self.engine_type == 'octave':
        shutil.rmtree(pltdir)

    if verbose:
        if self.isnotebook:
            sys.stdout = old_stdout
            if self.engine_type == 'matlab':
                lf = open(self.logfile, 'r')
                sys.stdout.write(lf.read())
        else:
            pipe0.get()  # block until the progress printer has finished
def run(input_directory: str, output_directory: str, filename: str, include_vanilla_emotes: bool):
    # set thread count to allow one parallel thread per logical cpu core in later emote counting step
    max_threads = ceil(cpu_count() / 2)

    # predefine counting functions for multi-threading
    def count_text_uses(emotes_partition: list, index: int):
        indent = index * (len(emotes) // max_threads)
        for emote_position in range(0, len(emotes_partition)):
            # unicode emotes are never surrounded by colons in discord-chat-exporter's .txt format
            if include_vanilla_emotes and len(emotes_partition[emote_position]) == 1:
                in_text = re.compile(rf"{re.escape(emotes_partition[emote_position])}")
            else:
                in_text = re.compile(rf":{re.escape(emotes_partition[emote_position])}:")
            for t in range(0, len(text_channels)):
                text_uses[emote_position + indent] += len(re.findall(in_text, text_channels[t]))
        return text_uses

    def count_react_uses(emotes_partition: list, index: int):
        indent = index * (len(emotes) // max_threads)
        for emote_position in range(0, len(emotes_partition)):
            in_react = re.compile(
                rf"{{Reactions}}\n.*{re.escape(emotes_partition[emote_position])} ")
            for t in range(0, len(text_channels)):
                per_channel_react_uses = len(re.findall(in_react, text_channels[t]))
                react_uses[emote_position + indent] += per_channel_react_uses
                # text and react uses for unicode emotes are indistinguishable, remove duplicate counts
                if include_vanilla_emotes and len(emotes_partition[emote_position]) == 1:
                    text_uses[emote_position + indent] -= per_channel_react_uses
        return react_uses

    # collect desired emotes
    emotes = []
    if include_vanilla_emotes:
        emotes += io.read_emotes(vanilla_emote_filepath)
    emotes += io.read_emotes(custom_emote_filepath)

    # ensure emote list is evenly divisible by the thread count
    original_emote_count = len(emotes)
    for j in range(0, max_threads - (len(emotes) % max_threads)):
        emotes += ["DUMMY_NOT_AN_EMOTE"]  # nobody should use this as an emote ever

    # import discord channel text files
    text_channels = io.read_input_files(input_directory)
    temp_text_channels = [""]
    for text_channel in text_channels:
        temp_text_channels[0] += text_channel
    text_channels = temp_text_channels

    # count emote uses
    text_uses = [0] * len(emotes)
    react_uses = [0] * len(emotes)
    pool = ProcessPool(nodes=max_threads * 2)
    emotes_partitioned = numpy.array_split(emotes, max_threads)
    text_results = []
    react_results = []
    for i in range(0, max_threads):
        text_results += [pool.apipe(count_text_uses, emotes_partitioned[i], i)]
        react_results += [pool.apipe(count_react_uses, emotes_partitioned[i], i)]
    for i in range(0, max_threads):
        text_uses = list(map(add, text_uses, text_results[i].get()))
        react_uses = list(map(add, react_uses, react_results[i].get()))

    # print csv (usage count)
    with open(output_directory + filename, 'w', newline='', encoding="utf8") as emote_usage_file:
        emote_usage_writer = csv.writer(emote_usage_file, delimiter=';')
        emote_usage_writer.writerow(["emote"] + ["text_uses"] + ["react_uses"] + ["total_uses"])
        for e in range(0, original_emote_count):
            emote_usage_writer.writerow([emotes[e]] + [text_uses[e]] + [react_uses[e]]
                                        + [text_uses[e] + react_uses[e]])
def fit(self, X, y, trials=5, indicators=['tta'], ranges=[(3, 180)],
        spearman=True, weights=None, early_stop=99999, split=None):
    """
    Optimize indicator parameters to maximize correlation
    :param X: Historical dataset
    :param y: Target used to measure correlation.  Can be a subset index of X
    :param trials: Number of optimization trials per indicator set
    :param indicators: List of indicators to optimize
    :param ranges: Parameter search space
    :param spearman: Perform spearman vs pearson correlation
    :param weights: Optional weights sharing the same index as y
    :param early_stop: Max number of optimization trials before stopping
    :param split: Index cut points defining time periods
    """
    self.fitted = []  # List containing each indicator completed study
    X.columns = X.columns.str.lower()  # columns must be lower case
    pool = ProcessPool(nodes=self.n_jobs)  # Set parallel cores

    # Package level optimization
    if 'tta' in indicators:
        indicators = indicators + talib_indicators
        indicators.remove('tta')
    if 'pta' in indicators:
        indicators = indicators + pandas_ta_indicators
        indicators.remove('pta')
    if 'fta' in indicators:
        indicators = indicators + finta_indicatrs
        indicators.remove('fta')
    if 'all' in indicators:
        indicators = talib_indicators + pandas_ta_indicators + finta_indicatrs
    indicators = list(OrderedDict.fromkeys(indicators))

    # Create textual representation of function in Optuna format
    # Example: 'tta.RSI(X.close, length=trial.suggest_int(\'timeperiod1\', 2, 1500))'
    # Utilizes the signature of the indicator (ie user parameters) if available
    # TTA uses help docstrings as signature is not available in C bindings
    # Parameters contained in config.py are tuned

    # Iterate user defined search space ranges
    for low, high in ranges:
        if low <= 1:
            raise ValueError("Range low must be > 1")
        if high >= len(X):
            raise ValueError(f"Range high:{high} must be < length of X:{len(X)}")

        # Iterate indicators per range
        for ind in indicators:

            # Index column to optimize if indicator returns dataframe
            idx = 0
            if ":" in ind:
                idx = int(ind.split(":")[1])
                ind = ind.split(":")[0]
            fn = f"{ind}("

            # If TTA indicator, use doc strings for lack of better way to
            # get indicator arguments (C binding)
            if ind[0:3] == "tta":
                usage = eval(f"{ind}.__doc__").split(")")[0].split("(")[1]
                params = re.sub(r'[^0-9a-zA-Z_\s]', '', usage).split()

            # Pandas-TA and FinTA both can be inspected for parameters
            else:
                sig = inspect.signature(eval(ind))
                params = sig.parameters.values()

            # Format function string
            suggest = False
            for param in params:
                param = re.split(':|=', str(param))[0].strip()
                if param == "open_":
                    param = "open"
                if param == "real":
                    fn += f"X.close, "
                elif param == "ohlc":
                    fn += f"X, "
                elif param == "ohlcv":
                    fn += f"X, "
                elif param in tune_series:
                    fn += f"X.{param}, "
                elif param in tune_params:
                    suggest = True
                    fn += f"{param}=trial.suggest_int('{param}', {low}, {high}), "
            fn += ")"

            # Only optimize indicators that contain tunable parameters
            if suggest:
                self.fitted.append(
                    pool.apipe(Optimize(function=fn, n_trials=trials, spearman=spearman).fit,
                               X, y, idx=idx, verbose=self.verbose, weights=weights,
                               early_stop=early_stop, split=split))
            else:
                self.fitted.append(
                    pool.apipe(Optimize(function=fn, n_trials=1, spearman=spearman).fit,
                               X, y, idx=idx, verbose=self.verbose, weights=weights,
                               early_stop=early_stop, split=split))

    # Blocking wait to retrieve results
    self.fitted = [fit.get() for fit in self.fitted]
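# Hypothetical usage sketch for the fit/transform methods above. The class name
# `TuneTA`, its constructor arguments, the indicator names, and the synthetic data
# are assumptions for illustration only, not taken from the snippet.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
close = pd.Series(100 + rng.normal(0, 1, 500).cumsum())
X = pd.DataFrame({
    "open": close.shift(1).fillna(close.iloc[0]),
    "high": close + rng.random(500),
    "low": close - rng.random(500),
    "close": close,
    "volume": rng.integers(1_000, 10_000, 500).astype(float),
})
y = close.pct_change().shift(-1).fillna(0)  # forward one-period return as target

tt = TuneTA(n_jobs=2, verbose=True)                                   # assumed constructor
tt.fit(X, y, indicators=["tta.RSI", "pta.ema"], ranges=[(4, 30)], trials=10)
features = tt.transform(X)                                            # features from fitted studies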