Example #1
 def test_pmap(self):
     """
     Tests that pmap is parallel but also returns a seq where each element has not moved in the list
     ie: it behaves just like map
     """
     sequence = [1, 2, 3, 4, 5, 6]
     new_seq = pmap(plus1, sequence, thread_count=1)
     regular_map = map(plus1, sequence)
     # map() returns a lazy iterator on Python 3, so materialize it for comparison
     self.assertEqual(new_seq, list(regular_map))
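The pmap implementation itself is not shown on this page; a rough sketch of the order-preserving behavior this test asserts, using only the standard library rather than the actual pmap internals:

from concurrent.futures import ThreadPoolExecutor

def pmap_sketch(fn, seq, thread_count=4):
    # Executor.map yields results in input order, no matter which worker
    # finishes first, which is exactly the property test_pmap checks.
    with ThreadPoolExecutor(max_workers=thread_count) as pool:
        return list(pool.map(fn, seq))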
Example #2
def run_stoke_jobs(jsonl_file, jobs, NUM_WORKERS=2):
    lock = threading.Lock()
    with open(jsonl_file, "w") as f:
        def run_and_save(job):
            r = run_trial(job)
            if r is not None:
                # serialize writes so parallel workers cannot interleave lines
                with lock:
                    f.write(json.dumps(r, separators=(',', ':'), ensure_ascii=True) + "\n")
                    f.flush()
        pmap.pmap(run_and_save, jobs, NUM_WORKERS)
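run_trial and the job format are not shown above; a hypothetical stub (assuming threading, json, and pmap are imported at module level) makes the calling convention concrete:

def run_trial(job):
    # stand-in for the real trial runner, which is not shown on this page
    return {"job": job, "ok": True}

run_stoke_jobs("results.jsonl", jobs=[1, 2, 3], NUM_WORKERS=2)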
Example #3
    def get_data_frame(
        cls,
        name: Optional[str] = None,
        auth_args: Auth = Auth.shared(),
        max_pages: Optional[int] = None,
        page_size: Optional[int] = None,
        log: bool = False,
        show_progress: bool = False,
    ):
        """Execute a request for projects

        ## Parameters

        Query: `phc.easy.projects.ProjectListOptions`

        Execution: `phc.easy.query.Query.execute_paging_api`
        """

        if page_size is None:
            # Projects do not have much data so use a higher page size
            page_size = 100

        get_data_frame = super().get_data_frame

        auth = Auth(auth_args)

        get_data_frame_args = without_keys(
            cls._get_current_args(inspect.currentframe(), locals()),
            ["auth_args", "account", "show_progress"],
        )

        def get_projects_for_account(account: dict):
            df = get_data_frame(
                ignore_cache=True,
                all_results=max_pages is None,
                auth_args=auth.customized({"account": account["id"]}),
                show_progress=show_progress,
                **get_data_frame_args,
            )
            df["account"] = account["id"]
            return df

        frame = pd.concat(list(pmap(get_projects_for_account, auth.accounts())))

        return frame.reset_index(drop=True)
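The fan-out pattern used here, reduced to its core with a hypothetical fetch_one and the standard library standing in for phc's pmap:

import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def fetch_one(account_id):
    # hypothetical per-account fetch, standing in for get_projects_for_account
    return pd.DataFrame({"account": [account_id], "value": [1]})

def fetch_all(account_ids, workers=4):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        frames = list(pool.map(fetch_one, account_ids))
    # same concat + reset_index step as in get_data_frame above
    return pd.concat(frames).reset_index(drop=True)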
Example #4
            dynspect = foldspec2[:,igate[0]-1:igate[1],:].sum(axis=1)
            dynspect2 = foldspec2[:,igate[2]-1:igate[3],:].sum(axis=1)
            with open('dynspect'+psr+'.bin', 'wb') as f:
                # tostring() is deprecated in NumPy; tobytes() is the same operation
                f.write(dynspect.T.tobytes())
                f.write(dynspect2.T.tobytes())
        with open('flux.dat', 'w') as f:
            for i, flux in enumerate(fluxes):
                f.write('{0:12d} {1:12.9g}\n'.format(i+1, flux))

    plots = True
    if plots:
        if do_waterfall:
            w = waterfall.copy()
            w[0] = 0.
            pmap('waterfall.pgm', w, 1, verbose=True)
        if do_foldspec:
            pmap('folded'+psr+'.pgm', foldspec1, 0, verbose)
            pmap('foldedbin'+psr+'.pgm',
                 f2.transpose(0,2,1).reshape(nchan,-1), 1, verbose)
            pmap('folded3'+psr+'.pgm', foldspec3, 0, verbose)
            # open(10,file='dynspect'//psr//'.bin',form='unformatted')
            # write(10) dynspect
            # write(10) dynspect2
            if igate is not None:
                dall = dynspect+dynspect2
                dall_sum0 = dall.sum(axis=0)
                dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
                dall = dall/(dall_sum0/nchan)
                dall[0,:] = 0
                pmap('dynspect'+psr+'.pgm', dall, 0, verbose)
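The zero-safe normalization near the end of this example is worth isolating: column sums of zero are replaced with 1 before dividing, so empty channels stay zero instead of producing NaN or inf. A self-contained sketch with made-up data:

import numpy as np

dall = np.array([[0., 2.], [0., 4.]])
n_rows = dall.shape[0]                        # plays the role of nchan/nblock above
col_sums = dall.sum(axis=0)
col_sums = np.where(col_sums, col_sums, 1.)   # zeros -> 1, everything else kept
normalized = dall / (col_sums / n_rows)
print(normalized)   # the all-zero column stays zero; the other is scaled to mean 1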
Example #5
 fluxes = foldspec1.sum(axis=0)
 foldspec3 = foldspec2.sum(axis=0)
 dynspect = foldspec2[:,igate[0]-1:igate[1],:].sum(axis=1)
 dynspect2 = foldspec2[:,igate[2]-1:igate[3],:].sum(axis=1)
 with open('dynspect'+psr+'.bin', 'wb') as f:
     f.write(dynspect.T.tobytes())
     f.write(dynspect2.T.tobytes())
 with open('flux.dat', 'w') as f:
     for i, flux in enumerate(fluxes):
         f.write('{0:12d} {1:12.9g}\n'.format(i+1, flux))
 plots = True
 if plots:
     # pmap('waterfall.pgm', waterfall, 1, verbose=True)
     pmap('folded'+psr+'.pgm', foldspec1, 0, verbose)
     pmap('foldedbin'+psr+'.pgm', foldspec2.reshape(nblock,-1), 1, verbose)
     pmap('folded3'+psr+'.pgm', foldspec3, 0, verbose)
     # open(10,file='dynspect'//psr//'.bin',form='unformatted')
     # write(10) dynspect
     # write(10) dynspect2
     dall = dynspect+dynspect2
     dall_sum0 = dall.sum(axis=0)
     dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
     dall = dall/(dall_sum0/nblock)
     dall[0,:] = 0
     pmap('dynspect'+psr+'.pgm', dall, 0, verbose)
     t1 = dynspect/(dynspect.sum(axis=0)/nblock)
     t2 = dynspect2/(dynspect2.sum(axis=0)/nblock)
     dsub = t1-t2
     dsub[0,:] = 0
Example #6
            dynspect = foldspec2[:, igate[0] - 1 : igate[1], :].sum(axis=1)
            dynspect2 = foldspec2[:, igate[2] - 1 : igate[3], :].sum(axis=1)
            f = open("dynspect" + psr + ".bin", "wb")
            f.write(dynspect.T.tostring())
            f.write(dynspect2.T.tostring())
            f.close()
        with open("flux.dat", "w") as f:
            for i, flux in enumerate(fluxes):
                f.write("{0:12d} {1:12.9g}\n".format(i + 1, flux))

    plots = True
    if plots:
        if do_waterfall:
            w = waterfall.copy()
            w[0] = 0.0
            pmap("waterfall.pgm", w, 1, verbose=True)
        if do_foldspec:
            pmap("folded" + psr + ".pgm", foldspec1, 0, verbose)
            pmap("foldedbin" + psr + ".pgm", f2.transpose(0, 2, 1).reshape(nchan, -1), 1, verbose)
            pmap("folded3" + psr + ".pgm", foldspec3, 0, verbose)
            # open(10,file='dynspect'//psr//'.bin',form='unformatted')
            # write(10) dynspect
            # write(10) dynspect2
            if igate is not None:
                dall = dynspect + dynspect2
                dall_sum0 = dall.sum(axis=0)
                dall_sum0 = np.where(dall_sum0, dall_sum0, 1.0)
                dall = dall / (dall_sum0 / nchan)
                dall[0, :] = 0
                pmap("dynspect" + psr + ".pgm", dall, 0, verbose)
                t1 = dynspect / (dynspect.sum(axis=0) / nchan)
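The gate summation at the top of these examples collapses a window of phase bins (axis 1) of a cube into a per-time, per-channel dynamic spectrum. A sketch with made-up shapes, assuming 1-based gate indices as the igate arithmetic above suggests:

import numpy as np

foldspec2 = np.random.rand(8, 32, 16)   # assumed (time, phase, channel) layout
igate = (3, 10, 20, 27)                 # assumed 1-based [on_start, on_end, off_start, off_end]
dynspect = foldspec2[:, igate[0] - 1:igate[1], :].sum(axis=1)
print(dynspect.shape)                   # (8, 16): one value per time block and channel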
Example #7
 if igate is not None:
     dynspect = foldspec2[:, igate[0] - 1:igate[1], :].sum(axis=1)
     dynspect2 = foldspec2[:, igate[2] - 1:igate[3], :].sum(axis=1)
     with open('dynspect' + psr + '.bin', 'wb') as f:
         f.write(dynspect.T.tobytes())
         f.write(dynspect2.T.tobytes())
 with open('flux.dat', 'w') as f:
     for i, flux in enumerate(fluxes):
         f.write('{0:12d} {1:12.9g}\n'.format(i + 1, flux))
 plots = True
 if plots:
     if do_waterfall:
         w = waterfall.copy()
         pmap('waterfall.pgm', w, 1, verbose=True)
     pmap('folded' + psr + '.pgm', foldspec1, 0, verbose)
     pmap('foldedbin' + psr + '.pgm',
          f2.transpose(0, 2, 1).reshape(nchan, -1), 1, verbose)
     pmap('folded3' + psr + '.pgm', foldspec3, 0, verbose)
     # open(10,file='dynspect'//psr//'.bin',form='unformatted')
     # write(10) dynspect
     # write(10) dynspect2
     if igate is not None:
         dall = dynspect + dynspect2
         dall_sum0 = dall.sum(axis=0)
         dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
         dall = dall / (dall_sum0 / nchan)
         dall[0, :] = 0
         pmap('dynspect' + psr + '.pgm', dall, 0, verbose)
         t1 = dynspect / (dynspect.sum(axis=0) / nchan)
Example #8
 fluxes = foldspec1.sum(axis=0)
 foldspec3 = foldspec2.sum(axis=0)
 dynspect = foldspec2[:,igate[0]-1:igate[1],:].sum(axis=1)
 dynspect2 = foldspec2[:,igate[2]-1:igate[3],:].sum(axis=1)
 with open('dynspect'+psr+'.bin', 'wb') as f:
     f.write(dynspect.T.tobytes())
     f.write(dynspect2.T.tobytes())
 with open('flux.dat', 'w') as f:
     for i, flux in enumerate(fluxes):
         f.write('{0:12d} {1:12.9g}\n'.format(i+1, flux))
 plots = True
 if plots:
     # pmap('waterfall.pgm', waterfall, 1, verbose=True)
     pmap('folded'+psr+'.pgm', foldspec1, 0, verbose)
     pmap('foldedbin'+psr+'.pgm', foldspec2.reshape(nblock,-1), 1, verbose)
     pmap('folded3'+psr+'.pgm', foldspec3, 0, verbose)
     # open(10,file='dynspect'//psr//'.bin',form='unformatted')
     # write(10) dynspect
     # write(10) dynspect2
     dall = dynspect+dynspect2
     dall_sum0 = dall.sum(axis=0)
     dall_sum0 = np.where(dall_sum0, dall_sum0, 1.)
     dall = dall/(dall_sum0/nblock)
     dall[0,:] = 0
     pmap('dynspect'+psr+'.pgm', dall, 0, verbose)
     t1 = dynspect/(dynspect.sum(axis=0)/nblock)
     t2 = dynspect2/(dynspect2.sum(axis=0)/nblock)
     dsub = t1-t2
     dsub[0,:] = 0
Example #9
    foldspec3 = foldspec2.sum(axis=0)
    if igate is not None:
        dynspect = foldspec2[:, igate[0] - 1 : igate[1], :].sum(axis=1)
        dynspect2 = foldspec2[:, igate[2] - 1 : igate[3], :].sum(axis=1)
        f = open("dynspect" + psr + ".bin", "wb")
        f.write(dynspect.T.tostring())
        f.write(dynspect2.T.tostring())
        f.close()
    f = open("flux.dat", "w")
    for i, flux in enumerate(fluxes):
        f.write("{0:12d} {1:12.9g}\n".format(i + 1, flux))
    f.close()
    plots = True
    if plots:
        # pmap('waterfall.pgm', waterfall, 1, verbose=True)
        pmap("folded" + psr + ".pgm", foldspec1, 0, verbose)
        pmap("foldedbin" + psr + ".pgm", foldspec2.reshape(nblock, -1), 1, verbose)
        pmap("folded3" + psr + ".pgm", foldspec3, 0, verbose)
        if igate is not None:
            dall = dynspect + dynspect2
            dall_sum0 = dall.sum(axis=0)
            dall_sum0 = np.where(dall_sum0, dall_sum0, 1.0)
            dall = dall / (dall_sum0 / nblock)
            dall[0, :] = 0
            pmap("dynspect" + psr + ".pgm", dall, 0, verbose)
            t1 = dynspect / (dynspect.sum(axis=0) / nblock)
            t2 = dynspect2 / (dynspect2.sum(axis=0) / nblock)
            dsub = t1 - t2
            dsub[0, :] = 0
            pmap("dynspectdiff" + psr + ".pgm", dsub, 0, verbose)
Example #10
def main(targetDirs):
    errorMsgs = pmap.pmap(checkUrls, targetDirs)
    for msg in errorMsgs:
        if msg:
            print(msg)
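checkUrls is not shown on this page; a hypothetical checker that returns an error string (or None when a directory is clean) illustrates the expected contract:

def checkUrls(target_dir):
    # hypothetical: scan target_dir and report broken links
    return None if target_dir.startswith("ok") else "bad links under " + target_dir

main(["ok_docs", "broken_docs"])   # prints: bad links under broken_docs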
Example #11
File: base.py  Project: gucorpling/amalgum
    def process_files(
        self,
        input_dir,
        output_dir,
        process_document_content,
        file_type="xml",
        multithreaded=False,
    ):
        """
        Handles the most common case of iteration, where processing can be done by a function
        applied to the contents of each file. (Not every module can accommodate this, e.g. modules
        that need to write to multiple directories.) This lets the processing function ignore file I/O.
        :param input_dir: From `run`
        :param output_dir: From `run`
        :param process_document_content: A method that accepts a single argument, the contents of an input file.
        :param file_type: The AMALGUM file type folder that should be used under input_dir and output_dir.
        :param multithreaded: Use python-pmap to apply process_document_content in parallel. Only use if
                              the function is CPU-intensive (and not, e.g., I/O intensive) and there are no race
                              conditions that would be introduced by having multiple threads use the function.
                              (If in doubt, don't set this to True!)
        :return: None
        """
        os.makedirs(os.path.join(output_dir, file_type), exist_ok=True)
        sorted_filepaths = sorted(glob(os.path.join(input_dir, file_type,
                                                    "*")))

        progress = tqdm(total=len(sorted_filepaths))

        def process_file(filepath, report_progress=False):
            nonlocal progress

            filename = filepath.split(os.sep)[-1]
            with io.open(filepath, "r", encoding="utf8") as f:
                s = f.read()
            try:
                s = process_document_content(s)
            except Exception as e:
                logging.error(
                    f"Encountered an error while processing file {filepath}!")
                raise e
            with io.open(
                    os.path.join(output_dir, file_type, filename),
                    "w",
                    encoding="utf8",
                    newline="\n",
            ) as f:
                f.write(s)

            # This could lead to race conditions, but it doesn't really matter if
            # the count gets messed up, so we let it be
            if report_progress:
                progress.update(1)

        if multithreaded:
            list(
                pmap.pmap(partial(process_file, report_progress=True),
                          sorted_filepaths))
            progress.close()
        else:
            for filepath in tqdm(sorted_filepaths):
                process_file(filepath)
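A hypothetical call, assuming module is an instance of the NLPModule subclass that defines process_files:

def uppercase_content(text):
    return text.upper()   # any str -> str transform fits the contract

module.process_files(
    input_dir="amalgum_in",        # made-up paths
    output_dir="amalgum_out",
    process_document_content=uppercase_content,
    file_type="xml",
    multithreaded=False,           # see the docstring caveats before enabling
)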
Example #12
File: base.py  Project: gucorpling/amalgum
    def process_files_multiformat(self,
                                  input_dir,
                                  output_dir,
                                  process_document_content_dict,
                                  multithreaded=False):
        """
        Like process_files, with one difference: the supplied function `process_document_content_dict` now
        (1) receives a dict of dir -> file contents, e.g. {'xml': '<text ...>...</text>', 'rst': '...', ...},
            which contains every version of the document that is currently in the pipeline
        (2) expects a dict with the same structure to be returned, e.g. {'tsv': '...'}. Every pair in the
            returned dict will be written to the appropriate file, e.g. 'tsv/doc_name.tsv', 'rst/doc_name.rs3'.
            File extension is determined from NLPModule.FILE_EXT_MAP, or if it is not present there, is assumed
            to be the same as the name of the subdirectory.
        :param input_dir: From `run`
        :param output_dir: From `run`
        :param process_document_content_dict: A method that accepts a single argument, a dict with
                                              the key being the subdirectory that the file is in, and
                                              the value being the contents of that file as a string
        :param multithreaded: Use python-pmap to apply process_document_content_dict in parallel. Only use if
                              the function is CPU-intensive (and not, e.g., I/O intensive) and there are no race
                              conditions that would be introduced by having multiple threads use the function.
                              (If in doubt, don't set this to True!)
        :return: None
        """
        existing_input_dirs = [
            os.path.join(input_dir, subdir) for subdir in os.listdir(input_dir)
            if os.path.isdir(os.path.join(input_dir, subdir))
        ]
        if len(existing_input_dirs) == 0:
            raise Exception("No input directories found!")

        # Use the first dir to derive filenames without filetypes
        base_dir = sorted(existing_input_dirs)[0]
        filenames = sorted(
            [filename.split(".")[0] for filename in os.listdir(base_dir)])

        progress = tqdm(total=len(filenames))

        def process_filename(filename, report_progress=False):
            nonlocal progress
            # Refuse to proceed if every other directory doesn't also have a file with the same name
            if not all(
                    any(
                        fname.startswith(filename)
                        for fname in os.listdir(subdir))
                    for subdir in existing_input_dirs):
                raise Exception(
                    f"File {filename} does not exist in all of these directories: {existing_input_dirs}"
                )

            # construct the content dict
            content_dict = {}
            content_dict["filename"] = filename
            for subdir in existing_input_dirs:
                matching_files = [
                    f for f in os.listdir(subdir)
                    if f.split(".")[0] == filename
                ]
                assert (len(matching_files) >
                        0), f"Couldn't find {filename} in directory {subdir}"
                assert (
                    len(matching_files) < 2
                ), f"More than one file starting with {filename} in directory {subdir}"

                filepath = os.path.join(subdir, matching_files[0])
                with io.open(filepath, "r", encoding="utf8") as f:
                    content_dict[subdir.split(os.sep)[-1]] = f.read()

            # run the processing function
            try:
                output_dict = process_document_content_dict(content_dict)
            except Exception as e:
                logging.error(
                    f"Encountered an error while processing document {filename}!")
                raise e

            # write out all the output documents
            for subdir, content in output_dict.items():
                subdir_path = os.path.join(output_dir, subdir)
                if not os.path.exists(subdir_path):
                    os.makedirs(subdir_path)

                file_ext = (NLPModule.FILE_EXT_MAP[subdir]
                            if subdir in NLPModule.FILE_EXT_MAP else subdir)
                filepath = os.path.join(output_dir, subdir,
                                        filename + "." + file_ext)
                with io.open(filepath, "w", encoding="utf8",
                             newline="\n") as f:
                    f.write(content)

            # This could lead to race conditions, but it doesn't really matter if
            # the count gets messed up, so we let it be
            if report_progress:
                progress.update(1)

        if multithreaded:
            list(
                pmap.pmap(partial(process_filename, report_progress=True),
                          filenames))
            progress.close()
        else:
            for filename in tqdm(filenames):
                process_filename(filename)
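A hypothetical content-dict function for this method, reading the 'xml' version of a document and emitting a 'tsv' version (module again stands for an instance of the subclass):

def xml_to_tsv(content_dict):
    lines = content_dict["xml"].splitlines()
    rows = "\n".join("{}\t{}".format(i, line) for i, line in enumerate(lines, 1))
    return {"tsv": rows}   # written out as tsv/<doc_name>.tsv per the docstring

module.process_files_multiformat("amalgum_in", "amalgum_out", xml_to_tsv)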