def test_pmap(self):
    """
    Tests that pmap is parallel but also returns a seq where each element
    has not moved in the list, i.e. it behaves just like map.
    """
    sequence = [1, 2, 3, 4, 5, 6]
    new_seq = pmap(plus1, sequence, thread_count=1)
    # Materialize with list(): on Python 3, map() returns a lazy iterator
    # that never compares equal to a list, so the original comparison
    # could only ever pass on Python 2.
    regular_map = list(map(plus1, sequence))
    # assertEqual -- assertEquals is a deprecated alias.
    self.assertEqual(new_seq, regular_map)
def run_stoke_jobs(jsonl_file, jobs, NUM_WORKERS=2):
    """Run all `jobs` through run_trial in parallel, appending each
    non-None result to `jsonl_file` as one JSON object per line.

    :param jsonl_file: path to the JSONL output file (overwritten).
    :param jobs: iterable of job specs accepted by run_trial.
    :param NUM_WORKERS: number of pmap worker threads.
    :return: None
    """
    lock = threading.Lock()
    # `with` guarantees the file is closed even if a trial raises
    # (the original leaked the handle on any exception).
    with open(jsonl_file, "w") as f:

        def run_and_save(job):
            r = run_trial(job)
            if r is not None:
                # pmap calls run_and_save concurrently, so serialize
                # writes to keep JSONL lines from interleaving.
                with lock:
                    f.write(json.dumps(r, separators=(',', ':'),
                                       ensure_ascii=True) + "\n")
                    f.flush()

        pmap.pmap(run_and_save, jobs, NUM_WORKERS)
def get_data_frame(
    cls,
    name: Optional[str] = None,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    show_progress: bool = False,
):
    """Execute a request for projects

    ## Parameters

    Query: `phc.easy.projects.ProjectListOptions`

    Execution: `phc.easy.query.Query.execute_paging_api`
    """
    if page_size is None:
        # Projects do not have much data so use a higher page size
        page_size = 100

    get_data_frame = super().get_data_frame
    auth = Auth(auth_args)

    # NOTE(review): _get_current_args inspects this frame's locals(), so
    # the parameter names above are part of the contract -- renaming a
    # parameter silently changes the captured query arguments.
    get_data_frame_args = without_keys(
        cls._get_current_args(inspect.currentframe(), locals()),
        ["auth_args", "account", "show_progress"],
    )

    def get_projects_for_account(account: dict):
        # Run one paged query per account, scoping auth to that account.
        # `account` is presumably a dict with at least an "id" key --
        # confirm against Auth.accounts().
        df = get_data_frame(
            ignore_cache=True,
            all_results=max_pages is None,
            auth_args=auth.customized({"account": account["id"]}),
            show_progress=show_progress,
            **get_data_frame_args,
        )
        # Tag each row with the account it came from before concatenation.
        df["account"] = account["id"]
        return df

    # Fetch all accounts in parallel and stack the per-account frames.
    frame = pd.concat(list(pmap(get_projects_for_account, auth.accounts())))

    return frame.reset_index(drop=True)
# Gate-summed dynamic spectra: igate holds 1-based [start, stop] phase-bin
# ranges for two on-pulse gates, hence the -1 on the lower bounds.
# Guarded on igate (the later plotting code already guards; the original
# crashed here with a TypeError when igate was None).
if igate is not None:
    dynspect = foldspec2[:,igate[0]-1:igate[1],:].sum(axis=1)
    dynspect2 = foldspec2[:,igate[2]-1:igate[3],:].sum(axis=1)
    # `with` closes the dump even if a write fails; tobytes() replaces the
    # deprecated ndarray.tostring() alias (removed in NumPy 2.0).
    with open('dynspect'+psr+'.bin', 'wb') as f:
        f.write(dynspect.T.tobytes())
        f.write(dynspect2.T.tobytes())
with open('flux.dat', 'w') as f:
    for i, flux in enumerate(fluxes):
        f.write('{0:12d} {1:12.9g}\n'.format(i+1, flux))
plots = True
if plots:
    if do_waterfall:
        w = waterfall.copy()
        # Zero the first channel so it does not dominate the grey scale.
        w[0] = 0.
        pmap('waterfall.pgm', w, 1, verbose=True)
    if do_foldspec:
        pmap('folded'+psr+'.pgm', foldspec1, 0, verbose)
        pmap('foldedbin'+psr+'.pgm',
             f2.transpose(0,2,1).reshape(nchan,-1), 1, verbose)
        pmap('folded3'+psr+'.pgm', foldspec3, 0, verbose)
    # open(10,file='dynspect'//psr//'.bin',form='unformatted')
    # write(10) dynspect
    # write(10) dynspect2
    if igate is not None:
        dall = dynspect+dynspect2
        dall_sum0 = dall.sum(axis=0)
        # Avoid division by zero for empty columns.
        dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
        dall = dall/(dall_sum0/nchan)
        dall[0,:] = 0
        pmap('dynspect'+psr+'.pgm', dall, 0, verbose)
# Collapse the folded spectra: total flux per phase bin (over channels)
# and the channel-summed fold.
fluxes = foldspec1.sum(axis=0)
foldspec3 = foldspec2.sum(axis=0)
# Gate-summed dynamic spectra: igate holds 1-based [start, stop] phase-bin
# ranges for two on-pulse gates, hence the -1 on the lower bounds.
dynspect = foldspec2[:,igate[0]-1:igate[1],:].sum(axis=1)
dynspect2 = foldspec2[:,igate[2]-1:igate[3],:].sum(axis=1)
# `with` closes the dumps even on error; tobytes() replaces the deprecated
# ndarray.tostring() alias (removed in NumPy 2.0).
with open('dynspect'+psr+'.bin', 'wb') as f:
    f.write(dynspect.T.tobytes())
    f.write(dynspect2.T.tobytes())
with open('flux.dat', 'w') as f:
    for i, flux in enumerate(fluxes):
        f.write('{0:12d} {1:12.9g}\n'.format(i+1, flux))
plots = True
if plots:
    # pmap('waterfall.pgm', waterfall, 1, verbose=True)
    pmap('folded'+psr+'.pgm', foldspec1, 0, verbose)
    pmap('foldedbin'+psr+'.pgm', foldspec2.reshape(nblock,-1), 1, verbose)
    pmap('folded3'+psr+'.pgm', foldspec3, 0, verbose)
    # open(10,file='dynspect'//psr//'.bin',form='unformatted')
    # write(10) dynspect
    # write(10) dynspect2
    dall = dynspect+dynspect2
    dall_sum0 = dall.sum(axis=0)
    # Avoid division by zero for empty columns.
    dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
    dall = dall/(dall_sum0/nblock)
    dall[0,:] = 0
    pmap('dynspect'+psr+'.pgm', dall, 0, verbose)
    # Normalize each gate separately and plot the difference.
    t1 = dynspect/(dynspect.sum(axis=0)/nblock)
    t2 = dynspect2/(dynspect2.sum(axis=0)/nblock)
    dsub = t1-t2
    dsub[0,:] = 0
# Gate-summed dynamic spectra: igate holds 1-based [start, stop] phase-bin
# ranges for two on-pulse gates, hence the -1 on the lower bounds.
# Guarded on igate (the later plotting code already guards; the original
# crashed here with a TypeError when igate was None).
if igate is not None:
    dynspect = foldspec2[:, igate[0] - 1 : igate[1], :].sum(axis=1)
    dynspect2 = foldspec2[:, igate[2] - 1 : igate[3], :].sum(axis=1)
    # `with` closes the dump even if a write fails; tobytes() replaces
    # the deprecated ndarray.tostring() alias (removed in NumPy 2.0).
    with open("dynspect" + psr + ".bin", "wb") as f:
        f.write(dynspect.T.tobytes())
        f.write(dynspect2.T.tobytes())
with open("flux.dat", "w") as f:
    for i, flux in enumerate(fluxes):
        f.write("{0:12d} {1:12.9g}\n".format(i + 1, flux))
plots = True
if plots:
    if do_waterfall:
        w = waterfall.copy()
        # Zero the first channel so it does not dominate the grey scale.
        w[0] = 0.0
        pmap("waterfall.pgm", w, 1, verbose=True)
    if do_foldspec:
        pmap("folded" + psr + ".pgm", foldspec1, 0, verbose)
        pmap(
            "foldedbin" + psr + ".pgm",
            f2.transpose(0, 2, 1).reshape(nchan, -1),
            1,
            verbose,
        )
        pmap("folded3" + psr + ".pgm", foldspec3, 0, verbose)
    # open(10,file='dynspect'//psr//'.bin',form='unformatted')
    # write(10) dynspect
    # write(10) dynspect2
    if igate is not None:
        dall = dynspect + dynspect2
        dall_sum0 = dall.sum(axis=0)
        # Avoid division by zero for empty columns.
        dall_sum0 = np.where(dall_sum0, dall_sum0, 1.0)
        dall = dall / (dall_sum0 / nchan)
        dall[0, :] = 0
        pmap("dynspect" + psr + ".pgm", dall, 0, verbose)
        t1 = dynspect / (dynspect.sum(axis=0) / nchan)
# Gate-summed dynamic spectra: igate holds 1-based [start, stop] phase-bin
# ranges for two on-pulse gates, hence the -1 on the lower bounds.
if igate is not None:
    dynspect = foldspec2[:, igate[0] - 1:igate[1], :].sum(axis=1)
    dynspect2 = foldspec2[:, igate[2] - 1:igate[3], :].sum(axis=1)
    # `with` closes the dump even if a write fails; tobytes() replaces
    # the deprecated ndarray.tostring() alias (removed in NumPy 2.0).
    with open('dynspect' + psr + '.bin', 'wb') as f:
        f.write(dynspect.T.tobytes())
        f.write(dynspect2.T.tobytes())
with open('flux.dat', 'w') as f:
    for i, flux in enumerate(fluxes):
        f.write('{0:12d} {1:12.9g}\n'.format(i + 1, flux))
plots = True
if plots:
    if do_waterfall:
        w = waterfall.copy()
        pmap('waterfall.pgm', w, 1, verbose=True)
    pmap('folded' + psr + '.pgm', foldspec1, 0, verbose)
    pmap('foldedbin' + psr + '.pgm',
         f2.transpose(0, 2, 1).reshape(nchan, -1), 1, verbose)
    pmap('folded3' + psr + '.pgm', foldspec3, 0, verbose)
    # open(10,file='dynspect'//psr//'.bin',form='unformatted')
    # write(10) dynspect
    # write(10) dynspect2
    if igate is not None:
        dall = dynspect + dynspect2
        dall_sum0 = dall.sum(axis=0)
        # Avoid division by zero for empty columns.
        dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
        dall = dall / (dall_sum0 / nchan)
        dall[0, :] = 0
        pmap('dynspect' + psr + '.pgm', dall, 0, verbose)
        t1 = dynspect / (dynspect.sum(axis=0) / nchan)
# Channel-summed fold for plotting.
foldspec3 = foldspec2.sum(axis=0)
# Gate-summed dynamic spectra: igate holds 1-based [start, stop] phase-bin
# ranges for two on-pulse gates, hence the -1 on the lower bounds.
if igate is not None:
    dynspect = foldspec2[:, igate[0] - 1 : igate[1], :].sum(axis=1)
    dynspect2 = foldspec2[:, igate[2] - 1 : igate[3], :].sum(axis=1)
    # `with` closes the dump even if a write fails; tobytes() replaces
    # the deprecated ndarray.tostring() alias (removed in NumPy 2.0).
    with open("dynspect" + psr + ".bin", "wb") as f:
        f.write(dynspect.T.tobytes())
        f.write(dynspect2.T.tobytes())
with open("flux.dat", "w") as f:
    for i, flux in enumerate(fluxes):
        f.write("{0:12d} {1:12.9g}\n".format(i + 1, flux))
plots = True
if plots:
    # pmap('waterfall.pgm', waterfall, 1, verbose=True)
    pmap("folded" + psr + ".pgm", foldspec1, 0, verbose)
    pmap("foldedbin" + psr + ".pgm", foldspec2.reshape(nblock, -1), 1, verbose)
    pmap("folded3" + psr + ".pgm", foldspec3, 0, verbose)
    if igate is not None:
        dall = dynspect + dynspect2
        dall_sum0 = dall.sum(axis=0)
        # Avoid division by zero for empty columns.
        dall_sum0 = np.where(dall_sum0, dall_sum0, 1.0)
        dall = dall / (dall_sum0 / nblock)
        dall[0, :] = 0
        pmap("dynspect" + psr + ".pgm", dall, 0, verbose)
        # Normalize each gate separately and plot the difference.
        t1 = dynspect / (dynspect.sum(axis=0) / nblock)
        t2 = dynspect2 / (dynspect2.sum(axis=0) / nblock)
        dsub = t1 - t2
        dsub[0, :] = 0
        pmap("dynspectdiff" + psr + ".pgm", dsub, 0, verbose)
def main(targetDirs):
    """Check the URLs under each target directory in parallel, printing
    whatever error messages come back."""
    for message in pmap.pmap(checkUrls, targetDirs):
        if message:
            print(message)
def process_files(
        self,
        input_dir,
        output_dir,
        process_document_content,
        file_type="xml",
        multithreaded=False,
):
    """
    Handles the most common case of iteration where processing can be handled
    with a function that is applied to the contents of each file. (Not every
    module can accommodate this, e.g. modules that need to write to multiple
    directories.) This allows the processing function to ignore file I/O.

    :param input_dir: From `run`
    :param output_dir: From `run`
    :param process_document_content: A method that accepts a single argument,
           the contents of an input file.
    :param file_type: The AMALGUM file type folder that should be used under
           input_dir and output_dir.
    :param multithreaded: Use python-pmap to apply the
           process_document_content in parallel. Only use if the function is
           CPU-intensive (and not, e.g., I/O intensive) and there are no race
           conditions that would be introduced by having multiple threads use
           the function. (If in doubt, don't set this to True!)
    :return: None
    """
    os.makedirs(os.path.join(output_dir, file_type), exist_ok=True)
    sorted_filepaths = sorted(glob(os.path.join(input_dir, file_type, "*")))
    progress = tqdm(total=len(sorted_filepaths))

    def process_file(filepath, report_progress=False):
        # Read one document, transform it, and write the result under
        # output_dir/file_type with the same filename.
        nonlocal progress
        filename = filepath.split(os.sep)[-1]
        with io.open(filepath, "r", encoding="utf8") as f:
            s = f.read()
        try:
            s = process_document_content(s)
        except Exception as e:
            logging.error(
                f"Encountered an error while processing file {filepath}!")
            raise e
        with io.open(
                os.path.join(output_dir, file_type, filename),
                "w",
                encoding="utf8",
                newline="\n",
        ) as f:
            f.write(s)
        # This could lead to race conditions, but it doesn't really matter if
        # the count gets messed up, so we let it be
        if report_progress:
            progress.update(1)

    if multithreaded:
        list(
            pmap.pmap(partial(process_file, report_progress=True),
                      sorted_filepaths))
        progress.close()
    else:
        # Reuse the shared bar here too: the original created a second
        # tqdm over the same list and leaked the unused `progress` bar.
        for filepath in sorted_filepaths:
            process_file(filepath, report_progress=True)
        progress.close()
def process_files_multiformat(self,
                              input_dir,
                              output_dir,
                              process_document_content_dict,
                              multithreaded=False):
    """
    Like process_files, with one difference: the supplied function
    `process_document_content_dict` now
    (1) receives a dict of dir -> file contents, e.g.
        {'xml': '<text ...>...</text>', 'rst': '...', ...}, which contains
        every version of the document that is currently in the pipeline
    (2) expects a dict with the same structure to be returned, e.g.
        {'tsv': '...'}.
    Every pair in the returned dict will be written to the appropriate file,
    e.g. 'tsv/doc_name.tsv', 'rst/doc_name.rs3'. File extension is determined
    from NLPModule.FILE_EXT_MAP, or if it is not present there, is assumed to
    be the same as the name of the subdirectory.

    :param input_dir: From `run`
    :param output_dir: From `run`
    :param process_document_content_dict: A method that accepts a single
           argument, a dict with the key being the subdirectory that the file
           is in, and the value being the contents of that file as a string
    :param multithreaded: Use python-pmap to apply the
           process_document_content_dict in parallel. Only use if the
           function is CPU-intensive (and not, e.g., I/O intensive) and there
           are no race conditions that would be introduced by having multiple
           threads use the function. (If in doubt, don't set this to True!)
    :return: None
    """
    existing_input_dirs = [
        os.path.join(input_dir, subdir)
        for subdir in os.listdir(input_dir)
        if os.path.isdir(os.path.join(input_dir, subdir))
    ]
    if len(existing_input_dirs) == 0:
        raise Exception("No input directories found!")

    # Use the first dir to derive filenames without filetypes
    base_dir = sorted(existing_input_dirs)[0]
    filenames = sorted(
        [filename.split(".")[0] for filename in os.listdir(base_dir)])

    progress = tqdm(total=len(filenames))

    def process_filename(filename, report_progress=False):
        nonlocal progress
        # Refuse to proceed if every other directory doesn't also have a file
        # with the same name
        if not all(
                any(
                    fname.startswith(filename)
                    for fname in os.listdir(subdir))
                for subdir in existing_input_dirs):
            raise Exception(
                f"File {filename} does not exist in all of these directories: {existing_input_dirs}"
            )

        # construct the content dict
        content_dict = {}
        content_dict["filename"] = filename
        for subdir in existing_input_dirs:
            matching_files = [
                f for f in os.listdir(subdir) if f.split(".")[0] == filename
            ]
            assert (len(matching_files) >
                    0), f"Couldn't find {filename} in directory {subdir}"
            assert (
                len(matching_files) < 2
            ), f"More than one file starting with {filename} in directory {subdir}"
            filepath = os.path.join(subdir, matching_files[0])
            with io.open(filepath, "r", encoding="utf8") as f:
                content_dict[subdir.split(os.sep)[-1]] = f.read()

        # run the processing function
        try:
            output_dict = process_document_content_dict(content_dict)
        except Exception as e:
            logging.error(
                f"Encountered an error while processing file {filepath}!")
            raise e

        # write out all the output documents
        for subdir, content in output_dict.items():
            subdir_path = os.path.join(output_dir, subdir)
            if not os.path.exists(subdir_path):
                os.makedirs(subdir_path)
            # Fall back to the subdir name when no extension is mapped.
            file_ext = (NLPModule.FILE_EXT_MAP[subdir]
                        if subdir in NLPModule.FILE_EXT_MAP else subdir)
            filepath = os.path.join(output_dir, subdir,
                                    filename + "." + file_ext)
            with io.open(filepath, "w", encoding="utf8", newline="\n") as f:
                f.write(content)

        # This could lead to race conditions, but it doesn't really matter if
        # the count gets messed up, so we let it be
        if report_progress:
            progress.update(1)

    if multithreaded:
        list(
            pmap.pmap(partial(process_filename, report_progress=True),
                      filenames))
        progress.close()
    else:
        for filename in tqdm(filenames):
            process_filename(filename)