Example No. 1
def dump_docs(dmp):
    """Документы

    dmp: Dumper object
    """
    folder = os.path.join('dump', 'docs')
    os.makedirs(folder, exist_ok=True)

    print('[fetching the document list]')

    docs = dmp._vk.docs.get()

    print('Saving documents:')

    if docs['count'] == 0:
        print('    0/0 (total: {})'.format(len(next(os.walk(folder))[2])))
    else:
        objs = []
        for d in docs['items']:
            objs.append({
                'url': d['url'],
                'name': d['title'] + '_' + str(d['id']),
                'ext': d['ext']
            })

        print('  .../{}'.format(docs['count']), end='\r')
        with Pool(dmp._settings['POOL_PROCESSES']) as pool:
            res = pool.starmap(copy_func(dmp._download),
                               zip(itertools.repeat(dmp.__class__), objs, itertools.repeat(folder)))

        print('\x1b[2K    {}/{} (total: {})'.format(sum(filter(None, res)),
                                                    len(objs),
                                                    len(next(os.walk(folder))[2])))
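A minimal, self-contained sketch of the starmap + itertools.repeat pattern used above, with a hypothetical stand-in for the Dumper download method (not part of the original code):

import itertools
from multiprocessing import Pool

def download(cls, obj, folder):
    # stand-in worker: pretend to fetch obj['url'] into folder and report success
    return bool(obj.get('url'))

if __name__ == '__main__':
    objs = [{'url': 'http://example.com/a', 'name': 'a_1', 'ext': 'txt'},
            {'url': 'http://example.com/b', 'name': 'b_2', 'ext': 'pdf'}]
    with Pool(4) as pool:
        res = pool.starmap(download, zip(itertools.repeat(object), objs,
                                         itertools.repeat('dump/docs')))
    print('{}/{} downloaded'.format(sum(filter(None, res)), len(objs)))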
Example No. 2
    def run_mcts(self, env, runs_per_round):
        """
        Runs all batched MCTS instances concurrently on the STOVE model
        :param env: (STOVE) a STOVE instance representing the env
        :param runs_per_round: (int) the number of MCTS expansions to perform
        :return: an array of next actions
        """
        pool = Pool(self.num_mcts)
        for i in range(runs_per_round):
            start = time.time()
            result = pool.imap(select, self.trees)
            all_states = []
            all_zs = []
            for state, z in result:
                all_states.append(state)
                all_zs.append(z)
            # expand all mcts by applying all next actions batched on all mcts zs
            expansion_actions = multi_one_hot(range(self.actions),
                                              self.actions)
            expansion_actions = expansion_actions.view(self.actions, 1,
                                                       self.actions)
            expansion_actions = expansion_actions.repeat(self.num_mcts, 1,
                                                         1).to('cuda')
            new_zs, r = env.rollout(tile(torch.cat(all_zs, 0), 0,
                                         self.actions).to('cuda'),
                                    num=1,
                                    actions=expansion_actions,
                                    appearance=tile(self.obj_app, 0,
                                                    self.actions).to('cuda'))

            # rollout all new expanded nodes in parallel
            random_rollout_actions = np.random.randint(
                self.actions,
                size=(self.actions * self.num_mcts * self.max_rollout * 2, ))
            random_rollout_actions = multi_one_hot(random_rollout_actions,
                                                   self.actions)
            random_rollout_actions = random_rollout_actions.view(
                self.num_mcts * self.actions, self.max_rollout * 2,
                self.actions)
            _, r_rollout = env.rollout(
                new_zs[:, -1].to('cuda'),
                num=self.max_rollout * 2,
                actions=random_rollout_actions,
                appearance=tile(self.obj_app, 0, self.actions).to('cuda'))

            for j, mcts in enumerate(self.trees):
                low = j * self.actions
                high = (j + 1) * self.actions
                mcts.backpropagate(new_zs[low:high], r[low:high],
                                   r_rollout[low:high], all_states[j])

        pool.close()
        actions = []
        for i in range(self.num_mcts):
            counts = [
                self.trees[i].Nsa['r' + str(a)] for a in range(self.actions)
            ]
            actions.append(np.argmax(counts))
        return actions
Example No. 3
def format_to_nnsum(args, split_ratio=[0.8, 0.1, 0.1]):
    '''Convert data into the format that nnsum (https://github.com/kedz/nnsum)
        expects for training SummaRunner and other baseline models.
    label_file: {id}.json
            {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
            "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
            # the nnsum paper uses tokenized words joined by spaces as each sentence,
            # without case normalization (both upper and lower case appear)
    input_file: {id}.json
            {"input": [sent_1, sent_2, ..., sent_n], "id": story_id}
            sent_i: {"text": original text, "tokens": word list, "pos": POS tags, "ne": NER,
                    "word_count": word count of sent_i, "sentence_id": i}
            # sentence_id starts at 1
            # the only fields actually used by the model are "tokens" and "text"
    '''
    output_dir = os.path.dirname(args.save_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(args.raw_path)
    file_list.sort(
        key=lambda f: (datetime.strptime(f.rsplit("_", 1)[0], '%Y_%m_%d'),
                       int(f.rsplit("_", 1)[1].split(".")[0])))
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]
    #print(file_list)
    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)

    train_files = file_list[:train_count]
    valid_files = file_list[train_count:train_count + valid_count]
    test_files = file_list[train_count + valid_count:]

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)

        num_stories = len(a_lst)
        # imap_unordered hands the entries of a_lst to the worker processes and yields results as they complete
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                   num_stories),
                  end="\r" if idx < num_stories else "\n",
                  flush=True)

        pool.close()
        pool.join()
Example No. 4
def format_to_nnsum(args):
    '''Convert data into the format that nnsum (https://github.com/kedz/nnsum)
        expects for training SummaRunner and other baseline models.
    label_file: {id}.json
            {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
            "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
            # the nnsum paper uses tokenized words joined by spaces as each sentence,
            # without case normalization (both upper and lower case appear)
    input_file: {id}.json
            {"input": [sent_1, sent_2, ..., sent_n], "id": story_id}
            sent_i: {"text": original text, "tokens": word list, "pos": POS tags, "ne": NER,
                    "word_count": word count of sent_i, "sentence_id": i}
            # sentence_id starts at 1
            # the only fields actually used by the model are "tokens" and "text"
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)

        num_stories = len(a_lst)
        # imap_unordered hands the entries of a_lst to the worker processes and yields results as they complete
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                   num_stories),
                  end="\r" if idx < num_stories else "\n",
                  flush=True)

        pool.close()
        pool.join()
Example No. 5
    def run(self):
        """
        This functions reads the feature extraction filelist and creates a pool of processes to extract features
        from distinct files in parallel. It outputs one pymir3 FeatureTrack file per input file. Output is buffered
        to save memory and defer disk access.

        .. note::
            These keys are expected to be set in the experiment file:
                * ['general']['feature_extraction_filelist']
                * ['general']['scratch_directory']
                * ['feature_extraction']['output_buffer_size']
                * ['feature_extraction']['worker_extractors']

        """

        print("Running feature extraction behavior: %s" % self.name)

        # todo: use metadata file to add labels to track metadata (if available)
        # should guarantee the label in the metadata to make life easier, instead of using the file name (probably not needed)

        with open(self.params['general']['feature_extraction_filelist']) as f:
            files = f.read().splitlines()

        # todo: use a multiprocessing.Manager to share the output buffer (instead of processing in chunks, as below)

        metas = copy.copy(files)
        files = []
        for i in metas:
            files.append(i.split("\t")[0])
        metas = []

        num_files = len(files)
        output_buffer_size = self.params['feature_extraction']['output_buffer_size']

        pool = Pool(processes=self.params['feature_extraction']['worker_extractors'])
        for i in range(0, num_files, output_buffer_size):
            print "processing files %d through %d of %d" % (i + 1, min(i + output_buffer_size, num_files), num_files)
            result = pool.map(self.extract, files[i:min(i + output_buffer_size, num_files)])

            T0 = time.time()
            for track in result:
                filename = acf_utils.extract_filename(track.metadata.filename, "wav") + ".features"
                filename = self.params['general']['scratch_directory'] + "/" + filename

                print "writing features to file %s..." % (filename)
                feature_file = open(filename, "w")
                track.save(feature_file)
                feature_file.close()
                del track
            T1 = time.time()
            print "writing feature files to disk took %f seconds" % (T1 - T0)

            del result
            gc.collect()

        pool.close()
        pool.join()

        print ('Feature extraction done!')
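The buffered-chunk idea above (map one slice of the file list, flush its results to disk, then free memory before the next slice) can be reduced to a generic sketch; the extractor and file names below are made up:

import gc
from multiprocessing import Pool

def extract(path):
    # stand-in feature extractor: return the path and a fake feature value
    return path, len(path)

if __name__ == '__main__':
    files = ['clip_%03d.wav' % i for i in range(1000)]
    buffer_size = 100
    with Pool(processes=4) as pool:
        for i in range(0, len(files), buffer_size):
            chunk = files[i:i + buffer_size]
            for name, value in pool.map(extract, chunk):
                with open(name + '.features', 'w') as out:  # one output file per input
                    out.write(str(value))
            gc.collect()  # drop the processed chunk before starting the next one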
Example No. 6
def test(f: Callable, inp: list, outp: list, name: str) -> None:
    try:
        with Pool() as p:
            out = p.map(f, inp)
    except Exception:
        out = None
    status = 'OK' if out == outp else 'FAILED'
    print(f'{name}: {status}')
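A hedged usage sketch for the test helper above; square and the expected lists are illustrative only, and test itself is assumed to be in scope:

def square(x):
    return x * x

if __name__ == '__main__':
    test(square, [1, 2, 3], [1, 4, 9], 'square')    # prints: square: OK
    test(square, [1, 2, 3], [1, 4, 10], 'square2')  # prints: square2: FAILED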
Example No. 7
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores)  # create pool
    #    print("Using",cores,"cores")
    out = mainpool.map(fun, args)  # return list
    mainpool.terminate()
    del mainpool  # delete pool
    return out
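A possible call site for pcall_mp, assuming a picklable top-level function (a sketch, not from the original project):

def f(x):
    return x ** 2

if __name__ == '__main__':
    print(pcall_mp(f, range(8), cores=4))  # [0, 1, 4, 9, 16, 25, 36, 49]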
Example No. 8
def ospf_check():
    clear_log()
    devices = [x.split(',')[0] for x in open(devicesFile)]
    pool = Pool(processor)
    lock = Manager().Lock()
    list(pool.map(partial(_inf_ospf_check, lock), devices))
    pool.close()
    pool.join()
Example No. 9
def parse_rows(rows):
    with Pool(processes=32, maxtasksperchild=1000) as pool:
        iterator = pool.imap(parse_row, rows, chunksize=100)
        iterator_tracked = tqdm(iterator, desc='parsing rows', total=len(rows))
        parsed_rows = list(iterator_tracked)

    features, labels, surfaces = list(map(list, zip(*parsed_rows)))
    return features, labels, surfaces
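The imap + chunksize + tqdm pattern above can be shown in isolation; parse_row here is a hypothetical stand-in that splits a CSV-like string into (feature, label, surface):

from multiprocessing import Pool
from tqdm import tqdm

def parse_row(row):
    feature, label, surface = row.split(',')
    return feature, int(label), surface

if __name__ == '__main__':
    rows = ['f%d,%d,s%d' % (i, i % 2, i) for i in range(1000)]
    with Pool(processes=4, maxtasksperchild=1000) as pool:
        iterator = pool.imap(parse_row, rows, chunksize=100)
        parsed_rows = list(tqdm(iterator, desc='parsing rows', total=len(rows)))
    features, labels, surfaces = list(map(list, zip(*parsed_rows)))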
Example No. 10
    def get_issues(self, start_date=None, end_date=None):
        def wrapper(path):
            return self.__create_entries(path, start_date, end_date)

        with Pool() as pool:
            entries = reduce(lambda a, b: a + b, pool.map(wrapper, self.__paths))

        return pandas.DataFrame(entries, columns=ISSUE_FIELDS)
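The reduce-over-pool.map pattern above, sketched with a toy worker and made-up repository paths (ISSUE_FIELDS and the real __create_entries are not reproduced here):

from functools import reduce
from multiprocessing import Pool
import pandas

def load_entries(path):
    # stand-in: each repository path yields a small list of rows
    return [(path, i) for i in range(3)]

if __name__ == '__main__':
    paths = ['repo_a', 'repo_b']
    with Pool() as pool:
        entries = reduce(lambda a, b: a + b, pool.map(load_entries, paths))
    print(pandas.DataFrame(entries, columns=['repository', 'issue_id']))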
Example No. 11
    def get_log(self, start_date=None, end_date=None):
        def wrapper(path):
            return self.__create_log_entries(path, start_date, end_date)

        with Pool() as pool:
            entries = reduce(lambda a, b: a + b, pool.map(wrapper, self.__paths))

        return pandas.DataFrame(entries, columns=GIT_COMMIT_FIELDS + ["repository"])
Example No. 12
    def process(self):
        images_relative_dirpath = os.path.join("raw", self.fold, "images")

        image_info_list = []
        coco = self.get_coco()
        for image_id in self.image_id_list:
            filename = coco.loadImgs(image_id)[0]["file_name"]
            annotation_ids = coco.getAnnIds(imgIds=image_id)
            annotation_list = coco.loadAnns(annotation_ids)
            image_info = {
                "image_id":
                image_id,
                "image_filepath":
                os.path.join(self.root, images_relative_dirpath, filename),
                "image_relative_filepath":
                os.path.join(images_relative_dirpath, filename),
                "annotation_list":
                annotation_list
            }
            image_info_list.append(image_info)

        partial_preprocess_one = partial(preprocess_one,
                                         pre_filter=self.pre_filter,
                                         pre_transform=self.pre_transform,
                                         processed_dir=self.processed_dir)
        with Pool(self.pool_size) as p:
            sample_stats_list = list(
                tqdm(p.imap(partial_preprocess_one, image_info_list),
                     total=len(image_info_list)))

        # Aggregate sample_stats_list
        image_s0_list, image_s1_list, image_s2_list, class_freq_list = zip(
            *sample_stats_list)
        image_s0_array = np.stack(image_s0_list, axis=0)
        image_s1_array = np.stack(image_s1_list, axis=0)
        image_s2_array = np.stack(image_s2_list, axis=0)
        class_freq_array = np.stack(class_freq_list, axis=0)

        image_s0_total = np.sum(image_s0_array, axis=0)
        image_s1_total = np.sum(image_s1_array, axis=0)
        image_s2_total = np.sum(image_s2_array, axis=0)

        image_mean = image_s1_total / image_s0_total
        image_std = np.sqrt(image_s2_total / image_s0_total -
                            np.power(image_mean, 2))
        class_freq = np.sum(class_freq_array * image_s0_array[:, None],
                            axis=0) / image_s0_total

        # Save aggregated stats
        self.stats = {
            "image_mean": image_mean,
            "image_std": image_std,
            "class_freq": class_freq,
        }
        torch.save(self.stats, self.stats_filepath)

        # Indicates that processing has been performed:
        pathlib.Path(self.processed_flag_filepath).touch()
Example No. 13
def format_to_lines(args):
    # load mapping files
    print('| Loading mapping files ...')
    corpus_mapping = {"train": [], "valid": [], "test": []}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        mapping_fp = os.path.join(args.map_path,
                                  "mapping_{}.txt".format(corpus_type))
        if not os.path.exists(mapping_fp):
            print(
                "Mapping file '{}' doesn't exist. Skip the type of mapping files."
                .format(mapping_fp))
            continue
        for line in open(mapping_fp):
            temp.append(hashhex(line.strip()))
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    # load corresponding tokenized json files
    print('| Loading tokenized json files ...')
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(os.path.join(args.raw_path, '*.json')):
        real_name = os.path.splitext(os.path.basename(f))[0]  # splitext returns (root, ext); keep the root
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    # convert to target lines json file
    print('| Converting to line-based json files ...')
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = os.path.join(args.save_path,
                                       "{}.{}.json".format(corpus_type, p_ct))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = os.path.join(args.save_path,
                                   "{}.{}.json".format(corpus_type, p_ct))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

    print('| Finished formatting to line-based json files!')
Example No. 14
    def build_clusters(self):
        set_list = [(k, n) for k in self.k_list for n in range(self.num_iter)]

        p = Pool()
        p.starmap(self.prepare_directory, tqdm(set_list))
        p.close()

        self.write_dirlist()
        self.print_face()
Example No. 15
def main():
    base_filename = "../plots/survival/{}.pdf"

    survival_functions = [(sv.FractionOldNew, 'FractionNew'),
                          (sv.OldNewSurvival, 'OldNewMix'),
                          (sv.OldWaning, 'OldWaning')]

    p = Pool()
    p.map(run_survival_function, survival_functions)
Example No. 16
def format_to_lines(args):
    if not os.path.isdir(args.map_path):
        os.makedirs(args.map_path)
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    data_splitter = SplitRawFiles(args.raw_path, args.map_path)
    data_splitter.get_and_split_filenames()
    data_splitter.save_fnames_to_corresponding_files()
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            # temp.append(hashhex(line.strip()))
            temp.append(line)
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    i=0
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # real_name = hashhex(f.split('/')[-1].split('.')[0])
        real_name = f.split('/')[-1]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        i+=1
        # if i > 100:
        #     break
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    # import ipdb; ipdb.set_trace()
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                # import ipdb; ipdb.set_trace()
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example No. 17
 def createZips(self):
     t1 = time()
     if __name__ == '__main__':
         self.get_list_of_id()  # get set of string id
         p = Pool()
         p.map(self.createZip, range(self.count_zips))
         p.close()
         p.join()
     print('Create .zip files time = ' + str(time() - t1) + 's')
Example No. 18
def sleeping(arg):
    time.sleep(0.1)

ncores = 2
pool = Pool(ncores)
# sequential run (%timeit is an IPython magic, so this snippet is notebook code)
%timeit list(map(sleeping, range(24)))
# parallel run
%timeit pool.map(sleeping, range(24))
pool.close()
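Outside IPython, the same sequential-versus-parallel comparison can be sketched with time.perf_counter (rough expectation: about 2.4 s sequentially versus about 1.2 s on two workers, plus pool overhead):

import time
from multiprocessing import Pool

def sleeping(arg):
    time.sleep(0.1)

if __name__ == '__main__':
    t0 = time.perf_counter()
    list(map(sleeping, range(24)))       # sequential
    t1 = time.perf_counter()
    with Pool(2) as pool:
        pool.map(sleeping, range(24))    # parallel on 2 workers
    t2 = time.perf_counter()
    print('sequential %.2fs, parallel %.2fs' % (t1 - t0, t2 - t1))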
Example No. 19
    def convertpool(self):

        if len(self.todo) > 0:

            if self.type in [".h264", ".mp4", ".avi"]:

                pool = Pool(min(self.pools, len(self.todo)))
                try:
                    pool.map(self.conv_single, self.todo)
                    pool.close()
                    lineprint("Done converting all videofiles!",
                              label="pirecorder")
                except KeyboardInterrupt:
                    lineprint("User terminated converting pool..",
                              label="pirecorder")
                    self.terminated = True
                    pool.terminate()
                    return
                except Exception as e:
                    excep = "Got exception: %r, terminating pool" % (e, )
                    lineprint(excep, label="pirecorder")
                    pool.terminate()
                finally:
                    pool.join()

                if self.delete:
                    for filein in self.todo:
                        os.remove(filein)
                    lineprint("Deleted all original videofiles..",
                              label="pirecorder")

            elif self.type in [".jpg", ".jpeg", ".png"]:

                vidname = commonpref(self.todo)
                lineprint("Start converting " + str(len(self.todo)) +
                          " images",
                          label="pirecorder")

                frame_array = []
                for filename in self.todo:
                    frame = cv2.imread(filename)
                    frame_array.append(frame)
                    #os.rename(filename, self.outdir+"/"+filename)
                h, w, _ = frame_array[0].shape
                if self.outdir != "":
                    vidname = self.outdir + "/" + os.path.basename(vidname)
                vidout = videowriter(vidname, w, h, self.imgfps,
                                     self.resizeval)
                for i in range(len(frame_array)):
                    vidout.write(frame_array[i])
                vidout.release()
                lineprint("Finished converting " + os.path.basename(vidname),
                          label="pirecorder")

            else:
                lineprint("No video or image files found..",
                          label="pirecorder")
Example No. 20
def multimap(function,
             inputs,
             chunked=False,
             processes=32,
             maxtasksperchild=1,
             chunksize=1,
             n_calcs=None):
    '''
    This function is a wrapper to parallelize a function.

    Args:
        function            The function you want to execute
        inputs              An iterable that yields proper arguments to the
                            function
        chunked             A Boolean indicating whether your function expects
                            single arguments or "chunked" iterables, e.g.,
                            lists.
        processes           The number of threads/processes you want to be using
        maxtasksperchild    The maximum number of tasks that a child process
                            may do before terminating (and therefore clearing
                            its memory cache to avoid memory overload).
        chunksize           How many calculations you want each single
                            processor to do per task. Smaller chunks mean more
                            memory shuffling; bigger chunks mean higher RAM
                            requirements.
        n_calcs             How many calculations you have. Only necessary for
                            adding a percentage timer to the progress bar.
    Returns:
        outputs     A list of the inputs mapped through the function
    '''
    # Collect garbage before we begin multiprocessing to make sure we don't
    # pass things we don't need to
    gc.collect()

    # If we have one thread, there's no use multiprocessing
    if processes == 1:
        output = [function(input_) for input_ in tqdm(inputs, total=n_calcs)]
        return output

    with Pool(processes=processes, maxtasksperchild=maxtasksperchild) as pool:
        # Use multiprocessing to perform the calculations. We use imap instead
        # of map so that we get an iterator, which we need for tqdm (the
        # progress bar) to work. imap also requires less disk memory, which
        # can be an issue for some of our large systems.
        if not chunked:
            iterator = pool.imap(function, inputs, chunksize=chunksize)
            total = n_calcs
            outputs = list(tqdm(iterator, total=total))

        # If our function expects chunks, then we have to unpack our inputs
        # appropriately
        else:
            iterator = pool.imap(function, _chunk(inputs, n=chunksize))
            total = n_calcs / chunksize
            outputs = list(np.concatenate(list(tqdm(iterator, total=total))))

    return outputs
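One way multimap might be called, with illustrative worker functions for the unchunked and chunked cases (these are not from the original codebase):

def cube(x):
    return x ** 3

def cube_chunk(chunk):
    return [x ** 3 for x in chunk]

if __name__ == '__main__':
    outputs = multimap(cube, range(100), processes=4, n_calcs=100)
    chunked_outputs = multimap(cube_chunk, range(100), chunked=True,
                               processes=4, chunksize=10, n_calcs=100)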
Example No. 21
def Multiprocessed_OCRPDF(
        source="",
        targetPath=None,
        processes=4,
        nice=5,
        verbose=False,
        tesseract_config='--oem 1 -l best/eng -c preserve_interword_spaces=1 textonly_pdf=1',
        logger=None):

    if isinstance(source, str):
        if verbose:
            (
                logger.info if logger else print
            )("You passed a string in as source. Trying this as source pdf file path."
              )
        page_count = PyPDF2.PdfFileReader(source).getNumPages()
    else:
        if verbose:
            (logger.info if logger else
             print)("OCRUSREX - Try extracting Images from bytes object")
        page_count = PyPDF2.PdfFileReader(io.BytesIO(source)).getNumPages()

    output = PyPDF2.PdfFileWriter()

    # set up a multiprocessing pool with the specified number of processes, then call the single-threaded OCRPDF method
    # on each page
    p = Pool(processes)
    for ocred_page in p.map(
            lambda p: OCRPDF(source=source,
                             verbose=verbose,
                             nice=nice,
                             page=p + 1,
                             tesseract_config=tesseract_config,
                             logger=logger), range(0, page_count)):
        output.addPage(PyPDF2.PdfFileReader(io.BytesIO(ocred_page)).getPage(0))

    if verbose:
        (logger.info if logger else print)("Multithreaded Execution Complete!")

    # If targetPath was provided, assume that it's a string and valid path. Try to write.
    if targetPath:
        outputStream = open(targetPath, "wb")
        output.write(outputStream)
        outputStream.close()
        # upon success, return truthy values (in this case, True)
        return True

    # otherwise, return results as bytes obj
    else:
        output_file_obj = io.BytesIO()
        output.write(output_file_obj)
        return output_file_obj.getvalue()

    if verbose:
        (logger.info if logger else print)(
            "Complete! Elapsed time: {0}".format(end - start))
Example No. 22
 def main_change(self, lock):
     p = Pool()
     list_of_process = []
     for i in range(2):
         list_of_process.append(
             p.Process(target=self.change, args=(i, lock)))
     for i in range(2):
         list_of_process[i].start()
     for i in range(2):
         list_of_process[i].join()
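Pool objects are normally driven through map or apply_async; the conventional way to launch individual workers that share a lock is multiprocessing.Process, roughly as below (change here is a generic stand-in for self.change):

from multiprocessing import Process, Lock

def change(i, lock):
    with lock:
        print('worker', i, 'holds the lock')

if __name__ == '__main__':
    lock = Lock()
    procs = [Process(target=change, args=(i, lock)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()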
Example No. 23
 def __init__(self, funct, data, threads='all'):
     raise Exception("Not functionnal yet !")
     self.funct = funct
     if threads == 'all':
         threads = cpu_count()
     self.pool = Pool(processes=threads)
     self.data = data
     self.PG = None
     self.initializer = None
     self.finalizer = None
Example No. 24
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        if args.map_on and args.map_path != 'empty':
            for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
                temp.append(hashhex(line.strip()))
            corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
        else:
            tr, va, te = manual_corp_assign(args)
            corpus_mapping['valid'] = va
            corpus_mapping['test'] = te
            corpus_mapping['train'] = tr
    train_files, valid_files, test_files = [], [], []
    # path = glob.glob(pjoin(args.raw_path, '*.json')) # sh added
    # if len(path) < 1:
     #   path = glob.glob(pjoin(os.getcwd() + '\\' + args.raw_path, '*.json'))
     #   print(os.getcwd() + '\\' + args.raw_path)
    for f in glob.glob(pjoin(args.raw_path, '*.json')): # sh changed
        if args.map_on and args.map_path != 'empty':
            real_name = f.split('\\')[-1].split('.')[0]  # SH changed; was: real_name = f.split('/')[-1].split('.')[0]
        else:
            real_name = f
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example No. 25
    def ensure(self, what, affected_services):
        '''ensures that affected_services are started or stopped with 5 attempts.
        what: string 'off' or 'on'
        affected_services: list { service_name, service_type }

        returns the services that still did not do what was requested'''
        tries = 4
        wait = [8, 5, 3, 2, 1]

        if None in [self.username, self.password, self.server]:
            log.warn(
                'Required environmental variables for connecting to ArcGIS Server do not exist. '
                +
                'No services will be stopped or started. See README.md for more details.'
            )
            return (True, None)

        def act_on_service(service_info):
            #: logs within this context do not show up in the console or log file
            service_name, service_type = service_info
            if what == 'off':
                status, message = self.turn_off(service_name, service_type)
            else:
                status, message = self.turn_on(service_name, service_type)

            if not status:
                return (service_name, service_type)
            return None

        def get_service_names(services):
            return ', '.join(
                [name + '.' + service for name, service in services])

        while len(affected_services) > 0 and tries >= 0:
            sleep(wait[tries])
            tries -= 1

            num_processes = environ.get('FORKLIFT_POOL_PROCESSES')
            swimmers = int(num_processes) if num_processes else config.default_num_processes
            if swimmers > len(affected_services):
                swimmers = len(affected_services)
            with Pool(swimmers) as pool:
                log.debug('affected services: %s',
                          get_service_names(affected_services))
                affected_services = [
                    service
                    for service in pool.map(act_on_service, affected_services)
                    if service is not None
                ]

            if len(affected_services) > 0:
                log.debug('retrying %s', get_service_names(affected_services))

        return (len(affected_services) == 0,
                get_service_names(affected_services))
Example No. 26
def format_to_lines(args):
    # corpus_mapping = {}
    # for corpus_type in ['valid', 'test', 'train']:
    #     temp = []
    #     for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
    #         temp.append(hashhex(line.strip()))
    #     corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []

    # randomly split the dataset, train:valid:test = 8:1:1
    import random
    random.seed(1)
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # if (real_name in corpus_mapping['valid']):
        #     valid_files.append(f)
        # elif (real_name in corpus_mapping['test']):
        #     test_files.append(f)
        # elif (real_name in corpus_mapping['train']):
        #     train_files.append(f)
        n = random.random()
        if n <= 0.1:
            valid_files.append(f)
        elif n <= 0.2:
            test_files.append(f)
        else:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example No. 27
def get_gameplays():

    PlayTypeDict = {}

    PlayTypeStrings = {
        'Pass': ['pass incomplete', 'pass complete', 'sacked'],
        'Admin': ['spiked the ball', 'Timeout', 'Penalty', 'aborted'],
        'Kneel': ['knee', 'knelt'],
        'Punt': ['Punts'],
        'Field Goal': ['field goal', 'no good'],
        'Special Teams':
        ['kicks off', 'kicks onside', 'extra point', 'two point'],
        'Run': [
            'left end', 'right end', ' for ', 'up the middle', 'middle for',
            'left tackle', 'left guard', 'right guard', 'right tackle'
        ],
    }

    YearStart = 1998
    YearsToGo = 20
    for Year in range(YearStart, YearStart + YearsToGo):

        PlayTypeCounts = {
            'Pass': 0,
            'Run': 0,
            'Punt': 0,
            'Field Goal': 0,
            'Admin': 0,
            'Kneel': 0,
            'Special Teams': 0
        }
        for GameNumber in range(1, 17):
            print('Game', GameNumber, 'in', Year, 'Time: ', datetime.now())

            PlayTypeDict = {}
            PathList = []
            for Team in TeamLookup:
                for GameLocation in ['H', 'A']:
                    path = 'https://widgets.sports-reference.com/wg.fcgi?css=1&site=pfr&url=%2Fplay-index%2Fplay_finder.cgi%3Frequest%3D1%26match%3Dall%26year_min%3D{YEAR}%26year_max%3D{YEAR}%26game_type%3DR%26game_num_min%3D{GameNumber}%26game_num_max%3D{GameNumber}%26week_num_min%3D0%26week_num_max%3D99%26game_location%3D{GameLocation}%26minutes_max%3D15%26seconds_max%3D0%26minutes_min%3D0%26seconds_min%3D0%26team_id%3D{TEAM}%26field_pos_min_field%3Dteam%26field_pos_max_field%3Dteam%26end_field_pos_min_field%3Dteam%26end_field_pos_max_field%3Dteam%26type%255B%255D%3DPASS%26type%255B%255D%3DRUSH%26type%255B%255D%3DPUNT%26type%255B%255D%3DKOFF%26type%255B%255D%3DONSD%26type%255B%255D%3DFG%26type%255B%255D%3DXP%26type%255B%255D%3D2PC%26no_play%3DN%26turnover_type%255B%255D%3Dinterception%26turnover_type%255B%255D%3Dfumble%26score_type%255B%255D%3Dtouchdown%26score_type%255B%255D%3Dfield_goal%26score_type%255B%255D%3Dsafety%26order_by%3Dyds_to_go&div=div_all_plays&del_col=1,11,12,13,14'.format(
                        YEAR=Year,
                        GameNumber=GameNumber,
                        TEAM=Team,
                        GameLocation=GameLocation)

                    PathList.append(path)
                    #req = get(path)
            p = Pool(8)  # pool size controls how many requests run at a time
            records = p.map(GetAndParsePath, PathList)
            p.terminate()
            p.join()

            with open(
                    'output/PlayTypeCounts-Year-' + str(Year) + '-Game-' +
                    str(GameNumber) + '.json', 'w') as outfile:
                json.dump(PlayTypeDict, outfile)
Example No. 28
def format_to_lines_tfds(args):
    """ Formats source text and target text as pt file. """

    tokenized_sub_dirs = os.listdir(args.raw_path)
    dataset_name = os.path.dirname(args.save_path).split('/')[-1]

    # Make directory
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    # Create file list for each split directory
    corpora = {}
    for tokenized_sub_dir in tokenized_sub_dirs:
        path = pjoin(args.raw_path, tokenized_sub_dir)
        files = []
        for f in glob.glob(pjoin(path, '*.json')):
            files.append(f)
        corpora[tokenized_sub_dir] = files
        files = []

    for corpus_type in tokenized_sub_dirs:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # NOTE: save files according to shard_size
            if (len(dataset) >= args.shard_size):
                if (corpus_type == 'validation'):
                    type_name = 'valid'
                else:
                    type_name = corpus_type
                pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name,
                                                       p_ct)
                with open(pjoin(args.save_path, pt_file), 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []
        pool.close()
        pool.join()

        # For the last few data (< shard size)
        if (len(dataset) > 0):
            if (corpus_type == 'validation'):
                type_name = 'valid'
            else:
                type_name = corpus_type
            pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name,
                                                   p_ct)
            with open(pjoin(args.save_path, pt_file), 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example No. 29
def convert_to_shard_data(post_dir, shard_dir, args):
    shard_count = 0

    corpora = sorted([
        os.path.join(post_dir, f) for f in os.listdir(post_dir)
        if not f.startswith('.') and not f.endswith('.abs.txt.json')
    ])
    args_list = []
    for f_main in corpora:
        f_abs_name = '{}.abs.txt.json'.format(
            os.path.basename(f_main).split('.')[0])
        f_abs = os.path.join(post_dir, f_abs_name)
        args_list.append((f_main, f_abs, args))

    start = time.time()
    print('... (4) Packing tokenized data into shards...')
    print('Converting files count: {}'.format(len(corpora)))

    shard_count = 0
    dataset = []
    t_len = math.ceil(len(corpora) / args.shard_size)
    # imap runs the workers in parallel but yields results in input order
    # use a list plus shard_size to preserve that ordered flow of data
    with Pool(args.n_cpus) as pool:
        with tqdm(total=t_len) as pbar:
            with tqdm(total=args.shard_size) as spbar:
                for i, data in enumerate(pool.imap(format_to_lines,
                                                   args_list)):
                    dataset.append(data)
                    spbar.update()
                    if i != 0 and i % args.shard_size == 0:
                        fpath = os.path.join(
                            shard_dir, 'shard.{}.json'.format(shard_count))
                        with open(fpath, 'w') as f:
                            f.write(json.dumps(dataset))
                        dataset = []
                        shard_count += 1
                        pbar.update()
                        spbar.reset()
                        # gc.collect()
                spbar.close()
            pbar.close()

        if len(dataset) > 0:
            fpath = os.path.join(shard_dir,
                                 'shard.{}.json'.format(shard_count))
            print('last shard {} saved'.format(shard_count))
            with open(fpath, 'w') as f:
                f.write(json.dumps(dataset))
            dataset = []
            shard_count += 1

    end = time.time()
    print('... Ending (4), time elapsed {}'.format(end - start))
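The core shard-writing loop above reduces to a short sketch; format_item, the shard directory, and the sizes are illustrative:

import json
import os
from multiprocessing import Pool

def format_item(i):
    return {'id': i, 'value': i * i}

if __name__ == '__main__':
    shard_size, shard_count, dataset = 100, 0, []
    os.makedirs('shards', exist_ok=True)
    with Pool(4) as pool:
        for i, data in enumerate(pool.imap(format_item, range(1000))):
            dataset.append(data)
            if i != 0 and i % shard_size == 0:
                with open(os.path.join('shards', 'shard.%d.json' % shard_count), 'w') as f:
                    f.write(json.dumps(dataset))
                dataset, shard_count = [], shard_count + 1
    if dataset:
        with open(os.path.join('shards', 'shard.%d.json' % shard_count), 'w') as f:
            f.write(json.dumps(dataset))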
Example No. 30
 def calculate(self, data):
     """ run graph calculations """
     # make sure data is valid when using schema
     if self._schema:
         try:
             import jsonschema
         except ImportError:
             msg = 'jsonschema package is needed for validating data'
             raise ImportError(msg)
         jsonschema.validate(instance=data, schema=self._schema)
     t1 = dt.datetime.utcnow()
     LOGGER.info('Starting calculation...')
     self._data = Data(data)
     self._data.check_inputs(self.sim_inputs, self.sim_outputs)
     if not self._sorted_dep:
         self._topological_sort()
     for items in self._sorted_dep:
         # loading node with inputs
         for item in items:
             node = self._get_node(item)
             inputs = [i for i in node.inputs_without_constants]
             for inp in inputs:
                 node.set_value_to_input(inp.name, self._data[inp.map])
         # running nodes
         if self._parallel:
             try:
                 from multiprocess import Pool
             except ImportError:
                  msg = 'multiprocess package is needed for parallelism'
                 raise ImportError(msg)
             pool = Pool(self._pool_size)
             results = pool.map(Graph.run_node,
                                [self._get_node(i) for i in items])
             pool.close()
             pool.join()
             results = {k: v for k, v in results}
         else:
             results = {}
             for item in items:
                 node = self._get_node(item)
                 res = node.run_with_loaded_inputs()
                 results[node.id] = res
         # save results
         for item in items:
             node = self._get_node(item)
             res = results[node.id]
             if len(node.outputs) == 1:
                 self._data[node.outputs[0].map] = res
             else:
                 for i, out in enumerate(node.outputs):
                     self._data[out.map] = res[i]
     t2 = dt.datetime.utcnow()
     LOGGER.info('Calculation finished in {}'.format(t2 - t1))
     return res