Example No. 1
 def calculate(self):
     pool = Pool(processes=min(cpu_count(), 8))
     results = [pool.apply(self.square, (i, )) for i in self.getNumbers()]
     pool.close()
     pool.join()
     for result in results:
         print(result)
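Note that pool.apply blocks until each call returns, so the list comprehension above effectively runs the tasks one at a time. A minimal self-contained sketch of the same computation with pool.map (the square helper and the numbers input are hypothetical stand-ins for self.square and self.getNumbers()):

from multiprocessing import Pool, cpu_count

def square(x):                       # hypothetical stand-in for self.square
    return x * x

if __name__ == "__main__":
    numbers = range(10)              # stand-in for self.getNumbers()
    with Pool(processes=min(cpu_count(), 8)) as pool:
        # map hands the whole batch to the workers instead of blocking on each call
        results = pool.map(square, numbers)
    for result in results:
        print(result)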
Example No. 2
def _files_dict_to_json(args, train_files, valid_files, test_files):
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
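The imap_unordered-plus-sharding pattern above is reusable on its own; a minimal sketch under toy assumptions (_double stands in for _format_to_lines, and the file prefix is illustrative):

from multiprocessing import Pool
import json

def _double(x):                              # toy stand-in for _format_to_lines
    return x * 2

def shard_to_json(items, shard_size, prefix, n_cpus=4):
    """Minimal sketch of the imap_unordered + sharding pattern used above."""
    with Pool(n_cpus) as pool:
        shard, shard_id = [], 0
        for d in pool.imap_unordered(_double, items):
            shard.append(d)
            if len(shard) >= shard_size:
                with open("{:s}.{:d}.json".format(prefix, shard_id), "w") as save:
                    save.write(json.dumps(shard))
                shard_id += 1
                shard = []
        # flush whatever is left, mirroring the len(dataset) > 0 branch above
        if shard:
            with open("{:s}.{:d}.json".format(prefix, shard_id), "w") as save:
                save.write(json.dumps(shard))

if __name__ == "__main__":
    shard_to_json(list(range(100)), shard_size=32, prefix="toy.train")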
def compute_jaccard_pairwise(indices,
                             square_form=True,
                             parallel=True,
                             return_poses=False):
    n = len(indices)

    if parallel:
        pool = Pool(16)
        scores_poses_tuples = pool.map(
            lambda x: compute_jaccard_i_vs_list(x[0], x[1]),
            [(indices[i], indices[i + 1:]) for i in range(n)])
        pool.close()
        pool.join()
    else:
        scores_poses_tuples = [
            compute_jaccard_i_vs_list(indices[i], indices[i + 1:])
            for i in range(n)
        ]

    pairwise_scores = np.array(
        [scores for scores, poses in scores_poses_tuples])

    if square_form:
        pairwise_scores = squareform(np.concatenate(pairwise_scores))

    if return_poses:
        poses = np.array([poses for scores, poses in scores_poses_tuples])
        return pairwise_scores, poses
    else:
        return pairwise_scores
def compute_jaccard_list_vs_all(seed_indices):
    pool = Pool(14)
    affinities_to_seeds = np.array(
        pool.map(lambda i: compute_jaccard_i_vs_all(i), seed_indices))
    pool.close()
    pool.join()
    return affinities_to_seeds
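Both snippets above hand lambdas to pool.map, which the standard multiprocessing.Pool cannot pickle, so they presumably rely on a thread-based or dill-backed pool. A hedged sketch of the same fan-out with a picklable module-level worker (jaccard and jaccard_vs_all are toy stand-ins for compute_jaccard_i_vs_all):

from functools import partial
from multiprocessing import Pool

def jaccard(a, b):
    """Toy Jaccard similarity between two index sets."""
    a, b = set(a), set(b)
    return len(a & b) / float(len(a | b))

def jaccard_vs_all(seed, all_items):         # module-level, so it pickles
    return [jaccard(seed, other) for other in all_items]

if __name__ == "__main__":
    items = [list(range(i, i + 5)) for i in range(8)]
    with Pool(4) as pool:
        # partial binds the shared argument; the worker itself stays picklable
        affinities = pool.map(partial(jaccard_vs_all, all_items=items), items)
    print(affinities)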
Example No. 5
def calculate_expected(uri, chroms, maxdis=2000000, balance=True, nproc=1):

    # B: Block Bias, constant for each copy number pair
    hic_pool = cooler.Cooler(uri)
    res = hic_pool.binsize
    maxdis = maxdis // res
    args = []
    for c in chroms:
        args.append((hic_pool, c, maxdis, balance))

    # Allocate processes
    if nproc == 1:
        results = list(map(_expected_core, args))
    else:
        pool = Pool(nproc)
        results = pool.map(_expected_core, args)
        pool.close()
        pool.join()

    expected = {}
    for i in range(1, maxdis + 1):
        nume = 0
        denom = 0
        for extract in results:
            if i in extract:
                nume += extract[i][0]
                denom += extract[i][1]
        if nume > 0:
            expected[i] = nume / denom

    return expected
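The serial-versus-parallel dispatch in Example No. 5 (a plain map when nproc == 1, a Pool otherwise) recurs in several later examples; a minimal sketch with a toy worker:

from multiprocessing import Pool

def _work(x):                    # toy stand-in for _expected_core
    return x * x

def run(args, nproc=1):
    if nproc == 1:
        return list(map(_work, args))
    with Pool(nproc) as pool:    # close/join handled by the context manager
        return pool.map(_work, args)

if __name__ == "__main__":
    print(run(range(8), nproc=2))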
Example No. 6
class MultiThreading(object):
    def __init__(self, funct, data, threads='all'):
        raise Exception("Not functional yet!")
        self.funct = funct
        if threads == 'all':
            threads = cpu_count()
        self.pool = Pool(processes=threads)
        self.data = data
        self.PG = None
        self.initializer = None
        self.finalizer = None

    def add_progress_counter(self, init_mess="Beginning", end_mess="Done",
                             name_things='things', perc_interv=5):
        self.PG = ProgressCounter(init_mess=init_mess, end_mess=end_mess,
                                  nmb_max=len(self.data),
                                  name_things=name_things,
                                  perc_interv=perc_interv)
        self.manager = Manager()
        self.manager.register("PG", self.PG)

    def run(self):
        res = self.pool.map_async(self.PG_func_wrapper, self.data)
        self.pool.close()
        self.pool.join()
        return res
Example No. 7
def parallel_apply(df, func, n_cores, n_jobs):
    df_split = np.array_split(df, n_jobs)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return (df)
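A hedged usage sketch of the split/concat idea behind parallel_apply, with a self-contained DataFrame and a module-level worker (add_total and the column names are illustrative):

import numpy as np
import pandas as pd
from multiprocessing import Pool

def add_total(chunk):                        # workers need a picklable, module-level function
    chunk = chunk.copy()
    chunk["total"] = chunk["a"] + chunk["b"]
    return chunk

if __name__ == "__main__":
    df = pd.DataFrame({"a": np.arange(100), "b": np.arange(100) * 2})
    chunks = np.array_split(df, 4)           # n_jobs pieces
    with Pool(2) as pool:                    # n_cores workers
        out = pd.concat(pool.map(add_total, chunks))
    print(out.head())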
Example No. 8
def pairix(bins, pairs_path, cool_path, metadata, assembly, nproc, max_split):
    """
    Bin a pairix-indexed contact list file.

    {}

    See also: 'cooler csort' to sort and index a contact list file

    Pairix on GitHub: <https://github.com/4dn-dcic/pairix>.

    """
    chromsizes, bins = _parse_bins(bins)

    if metadata is not None:
        with open(metadata, 'r') as f:
            metadata = json.load(f)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            logger.info("Using {} cores".format(nproc))
            map = pool.imap
        else:
            map = six.moves.map
        iterator = PairixAggregator(pairs_path,
                                    chromsizes,
                                    bins,
                                    map=map,
                                    n_chunks=max_split)
        create(cool_path, bins, iterator, metadata, assembly)
    finally:
        if nproc > 1:
            pool.close()
Example No. 9
 def msolve(A, Y, init=None):
     if use_cuda:
         Z = np.asarray(np.hstack(list(
             map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=True), np.split(Y, n_jobs, axis=1)))))
     else:
         if n_jobs <= 1:
             Z = np.asarray(np.hstack([scipy.sparse.linalg.cg(A, Y[:, i],
                                                              x0=init[:, i] if init is not None else None,
                                                              tol=tol, atol=atol)[0][:, np.newaxis] for i in
                                       range(Y.shape[1])]))
         else:
             p = Pool(n_jobs)
             try:
                 if init is None:
                     Z = np.asarray(np.hstack(list(
                         p.map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=False),
                               np.split(Y, Y.shape[1], axis=1)))))
                 else:
                     Z = np.asarray(np.hstack(list(
                         p.map(lambda y, init_: cg(A, y, x0=init_, tol=tol, atol=atol, use_cuda=False),
                               zip(np.split(Y, Y.shape[1], axis=1), np.split(init, init.shape[1], axis=1))))))
                 p.close()
                 p.join()
             except KeyboardInterrupt:
                 print("Caught KeyboardInterrupt, terminating workers")
                 p.terminate()
                 p.join()
                 raise
     return Z
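The core idea in msolve is to split Y column-wise and solve each right-hand side independently; a stripped-down sketch with SciPy's cg on a toy SPD system (the matrix, shapes, and tolerances are illustrative):

import numpy as np
import scipy.sparse
import scipy.sparse.linalg
from multiprocessing import Pool

A = scipy.sparse.identity(50, format="csr") * 2.0   # toy SPD matrix, visible to workers

def solve_column(y):                                 # stand-in for the cg wrapper above
    x, info = scipy.sparse.linalg.cg(A, y.ravel(), atol=1e-10)
    return x[:, np.newaxis]

if __name__ == "__main__":
    Y = np.random.rand(50, 8)
    with Pool(4) as pool:
        # one right-hand side per task, stitched back together column-wise
        Z = np.hstack(pool.map(solve_column, np.split(Y, Y.shape[1], axis=1)))
    print(np.allclose(A @ Z, Y, atol=1e-6))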
Example No. 10
def format_to_bert(args):
    lda_model_tfidf = models.ldamodel.LdaModel.load(
        '/home1/bqw/sum/sum_topic_10.model')
    lda_dict = corpora.Dictionary.load('/home1/bqw/sum/topic.dict')
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']

    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(
                pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'bert.pt')),
                          lda_model_tfidf, lda_dict))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
def main():
    folder = "test_folder"
    if os.path.exists(folder):
        os.rmdir(folder)
    p = Pool(4)
    p.map(return_folder_name, [None for i in range(4)])
    p.close()
Example No. 12
def format_to_lines(args):
    corpus_mapping = {}
    train_files = []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        train_files.append(f)

    corpora = {'train': train_files}
    for corpus_type in ['train']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example No. 13
def format_to_bert(args):

    if (args.dataset != ''):
        datasets = [args.dataset]
        print('dataset')
    else:
        datasets = ['train']

    for corpus_type in datasets:
        a_lst = []
        print('.' + corpus_type + '.*.json')
        for json_f in glob.glob(
                pjoin(args.raw_path, 'news.' + corpus_type + '.*.json')):
            print(json_f)
            real_name = json_f.split('/')[-1]
            print(real_name)
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
def execute_parallel_uploads(plant_files_list, animal_files_list,
                             human_files_list):
    num_processes = get_process_number(plant_files_list) + get_process_number(
        animal_files_list) + get_process_number(human_files_list)
    print(num_processes)
    pool = Pool(processes=num_processes)

    plants = get_parallel_process_list(pool,
                                       get_process_number(plant_files_list),
                                       plant_files_list)

    animals = get_parallel_process_list(pool,
                                        get_process_number(animal_files_list),
                                        animal_files_list)

    humans = get_parallel_process_list(pool,
                                       get_process_number(human_files_list),
                                       human_files_list)

    pool.close()
    pool.join()

    finalize_processes(plants)
    finalize_processes(animals)
    finalize_processes(humans)
def load_scoremaps_multiple_sections_parallel(sections, stack, structure, downscale, detector_id):
    pool = Pool(12)
    scoremaps = pool.map(lambda sec: load_scoremap_worker(stack, sec, structure, downscale, detector_id=detector_id),
                                     sections)
    pool.close()
    pool.join()
    return {sec: sm for sec, sm in zip(sections, scoremaps) if sm is not None}
Example No. 16
def find_day_series(df,
                    day,
                    tol,
                    min_occurrence,
                    is_departure: bool,
                    num_procs=1):

    df_day = df[df["week day"] == day].copy()
    date_num = dict(
        zip(np.sort(df_day.day.unique()), range(len(df_day.day.unique()))))
    df_day["day_num"] = df_day.day.apply(lambda d: date_num[d])
    series = df_day.series.unique()
    len_tot = series.shape[0]
    len_slice = len_tot // num_procs
    split_series = [i * len_slice for i in range(num_procs)] + [len_tot]
    split_flights = tuple([(series[split_series[i]:split_series[i + 1]],
                            df_day[df_day.series.isin(
                                series[split_series[i]:split_series[i + 1]])],
                            tol, min_occurrence, is_departure)
                           for i in range(num_procs)])

    pool = Pool(num_procs)
    result = pool.map(compute_series, split_flights)
    final_df = pd.concat(result, ignore_index=True)
    pool.close()
    pool.join()

    return final_df
Example No. 17
def run_func(target_func, func_args, split_args, core_nums=2):
    """多线程运算

    Parameters
    -----------
    target_func: func
        待运行的函数。需要分配到不同进程的参数必须放在该函数参数列表的最前面,即:
        target_func(split_args, func_args)
    func_args: dict
        被传入到运行函数中
    split_args: two-dimensioned array N*K
        参数列表会平均分配到不同的进程中去。N代表参数个数,K代表每个参数下元素数量。
    core_nums: int
        创建进程的数量
    """
    s_args = np.array_split(split_args, core_nums, axis=1)
    p = Pool(core_nums)
    for i in range(core_nums):
        print("create process %s" % i)
        p.apply_async(target_func, args=tuple(s_args[i]), kwds=func_args,
                      callback=lambda x: print(x), error_callback=lambda x: print(x))
    p.close()
    p.join()
    print("calculation has finished!")
    
def format_to_bert(args):
    if args.dataset != "":
        datasets = [args.dataset]
    else:
        datasets = ["train", "valid", "test"]
    for corpus_type in datasets:
        a_lst = []
        for json_f in [
                "../json_data/train.0.json",
                "../json_data/train.1.json",
                "../json_data/train.2.json",
                "../json_data/train.3.json",
                "../json_data/train.4.json",
                "../json_data/val.5.json",
        ]:
            print(json_f)
            real_name = json_f.split("/")[-1]
            a_lst.append((
                json_f,
                args,
                pjoin(args.save_path, real_name.replace("json", "bert.pt")),
            ))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
Example No. 19
    def run_parallel_fep(self, mutant_params, system_idx, mutant_idx, n_steps,
                         n_iterations, windows):
        logger.debug('Computing FEP for {}...'.format(self.name))
        if not self.opt:
            mutant_systems = mutant_params.build_fep_systems(
                system_idx, mutant_idx, windows)
        else:
            mutant_systems = mutant_params

        nstates = len(mutant_systems)
        chunk = math.ceil(nstates / self.num_gpu)
        groups = grouper(range(nstates), chunk)
        pool = Pool(processes=self.num_gpu)

        system = copy.deepcopy(self.wt_system)
        box_vectors = self.input_pdb.topology.getPeriodicBoxVectors()
        system.setDefaultPeriodicBoxVectors(*box_vectors)
        system.addForce(
            mm.MonteCarloBarostat(1 * unit.atmospheres,
                                  self.temperature * unit.kelvin, 25))  ###

        fep = partial(run_fep,
                      sim=self,
                      system=system,
                      pdb=self.extended_pdb,
                      n_steps=n_steps,
                      n_iterations=n_iterations,
                      all_mutants=mutant_systems)
        u_kln = pool.map(fep, groups)
        pool.close()
        pool.join()
        pool.terminate()
        ddg = FSim.gather_dg(self, u_kln, nstates)

        return ddg
Example No. 20
def format_to_lines(args, raw_path, save_path):
    # file names: '4fd2a00e8eb7c8105d883bd7.json'
    name_list = os.listdir(raw_path)
    a_lst = [(pjoin(raw_path, f), args) for f in name_list]
    pool = Pool(args.n_cpus)
    dataset = []
    p_ct = 0
    for d in pool.imap_unordered(_format_to_lines, a_lst):
        dataset.append(d)
        if (len(dataset) > args.shard_size):
            pt_file = "{:s}.{:d}.json".format(save_path, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

    pool.close()
    pool.join()
    if (len(dataset) > 0):
        pt_file = os.path.join(save_path, 'test.json')
        with open(pt_file, 'w') as save:
            save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
Example No. 21
    def run():
        def init_pool():
            logging.info('Init pool')
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        pool = Pool(args.process, init_pool)
        try:
            task_inputs = gen_inputs()

            if args.process == 1:
                results = map(task, task_inputs)
            else:
                results = pool.imap(task, task_inputs, 1)
                #results = imap_wrap(pool, task, task_inputs, 100)

            data_header = ['chrom', 'start', 'end', 'region_id', 'nsamples']
            param_header = [
                'status', 'ia', 'ib', 'ic', 'a', 'b', 'c', 'top_cn',
                'top_theta', 'theta', 'll', 'step'
            ]
            out_header = data_header + param_header
            print(*out_header, sep='\t')
            for data, params in results:
                row = [data[c] for c in data_header] \
                    + [params[c] for c in param_header]
                print(*row, sep='\t')
            pool.close()
        except Exception as e:
            pool.terminate()
            raise e
        finally:
            pool.join()
Example No. 22
    def _train_batch_parallelize(self, trees, n_incorrect_answers):
        """Parallelizes training for a list of trees.
        Uses the number of threads given by multiprocessing.cpu_count()

        Updates model parameters directly, and returns batch error.
        """
        # Defaults to using cpu_count() threads
        pool = Pool()
        
        def get_subbatch_deltas(_trees):
            return self._train_batch(_trees, n_incorrect_answers, 
                                     apply_learning=False)

        subbatches = utils.split(trees, n_slices=cpu_count())

        # result will be a list of tuples (error, deltas)
        result = pool.map(get_subbatch_deltas, subbatches)

        # no more processes accepted by this pool
        pool.close()   
        # Wait until mapping is completed
        pool.join()

        error = sum([r[0] for r in result])
        deltas = [r[1] for r in result]
        for (delta_Wv, delta_b, delta_We, delta_Wr) in deltas:
            self.Wv -= delta_Wv
            self.b -=  delta_b
            self.We -= delta_We
            self.Wr -= delta_Wr

        return error
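One caveat: get_subbatch_deltas is defined inside the method, and a process-based multiprocessing.Pool cannot pickle such closures, so this snippet presumably depends on a fork-based or thread-based pool. A hedged sketch of the same split / map / aggregate-deltas idea with a picklable module-level worker (train_subbatch and the toy numbers are illustrative):

from multiprocessing import Pool, cpu_count

def train_subbatch(trees):
    """Toy stand-in: returns (error, delta) for one slice of the batch."""
    error = sum(trees)               # pretend each 'tree' contributes its value as error
    delta = [t * 0.1 for t in trees]
    return error, delta

def split(seq, n_slices):
    k = max(1, len(seq) // n_slices)
    return [seq[i:i + k] for i in range(0, len(seq), k)]

if __name__ == "__main__":
    trees = list(range(20))
    with Pool() as pool:             # defaults to cpu_count() workers
        result = pool.map(train_subbatch, split(trees, cpu_count()))
    batch_error = sum(err for err, _ in result)
    deltas = [d for _, d in result]
    print(batch_error, len(deltas))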
Example No. 23
def str_to_bert(args):
    assert args.pretrained_model_type in pretrained_model_types
    logger.info("Run with pretrained_model_type: " +
                args.pretrained_model_type)
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for origin_src in glob.glob(args.raw_path + '.' + corpus_type +
                                    '.src.*.txt'):
            number_shard = origin_src.split('.')[-2]
            origin_tgt = args.raw_path + '.' + corpus_type + '.tgt.' + number_shard + '.txt'
            save_name = args.save_path + '.' + corpus_type + '.' + number_shard + '.bert.pt'
            assert os.path.exists(
                origin_tgt), "Invalid origin_tgt: " + origin_tgt
            a_lst.append(
                (corpus_type, origin_src, args, save_name, origin_tgt))
        print(a_lst)

        pool = Pool(args.n_cpus)
        for d in pool.imap(_str_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
 def multi_align_tr(self, imstack, TrM, nsz, shx, shy, stfolder, sfn,
                    nCORES, fnames, ext):
     if not sfn in os.listdir(stfolder):
         os.makedirs(os.path.join(stfolder, sfn))
         print('directory created')
     pool = Pool(nCORES)
     print('applying transformations with', nCORES,
           'processes in parallel ')
     results = []
     for i in range(len(imstack)):
         #            results.append(transform(imstack[i],TrM,nsz,shx,shy,stfolder,sfn,fnames,i,ext,))
         results.append(
             pool.apply_async(transform, (
                 imstack[i],
                 TrM,
                 nsz,
                 shx,
                 shy,
                 stfolder,
                 sfn,
                 fnames,
                 i,
                 ext,
             )))
         self.loading.progress2['value'] += 1
         self.update()
     pool.close()
     pool.join()
     print('successfully transformed all the images in the stack')
     return results
Example No. 25
    def get_panorama(self, fname, pano_id, zoom_level=3):
        server_url = 'http://cbk%d.google.com/' % randint(0,3)
        pano_url = server_url + 'cbk?output=tile&panoid=%s&zoom=%d&x=%d&y=%d'
        zoom_sizes = {3:(7,4), 4:(13,7), 5:(26,13)}
        max_x, max_y = zoom_sizes[zoom_level]

        jobs = []
        for y in xrange(max_y):
            for x in xrange(max_x):
                tile_url = pano_url % (pano_id, zoom_level, x, y)
                jobs.append(tile_url)

        p = Pool(len(jobs))
        tiles = p.map(self.get_tile, jobs)
        p.close()

        if all(x.size for x in tiles):
            tiles = np.array(tiles)
            strips = []
            for y in xrange(max_y):
                strips.append(np.hstack(tiles[y*max_x:(y+1)*max_x,:,:,:]))
            pano = np.vstack(strips)
            pano = pano[0:1664, 0:3328]
        else:
            pano = np.array([])
        return pano
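Fetching tiles is I/O-bound, and the snippet above spawns one worker per tile; a thread pool is usually the lighter choice for this kind of fan-out. A hedged sketch using the standard-library multiprocessing.dummy thread pool (fetch_tile is a hypothetical stand-in for self.get_tile):

from multiprocessing.dummy import Pool as ThreadPool   # threads, not processes
from urllib.request import urlopen

def fetch_tile(url):                 # hypothetical stand-in for self.get_tile
    with urlopen(url, timeout=10) as resp:
        return resp.read()

def fetch_all(tile_urls, max_workers=8):
    pool = ThreadPool(min(max_workers, len(tile_urls)))  # bound the pool size
    try:
        return pool.map(fetch_tile, tile_urls)
    finally:
        pool.close()
        pool.join()

# usage (with real tile URLs): raw_tiles = fetch_all(jobs)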
Example No. 26
def format_to_bert(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    #print("Reached here Wohoo : {}".format(datasets))
    #base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    for corpus_type in datasets:
        #if corpus_type == 'train':
        #    raw_pth = os.path.join(base_dir, 'merged_stories_tokenized')
        #elif corpus_type == 'valid':
        #    raw_pth = os.path.join(base_dir, 'merged_stories_tokenized_val')
        #elif corpus_type == 'test':
        #    raw_pth = os.path.join(base_dir, 'merged_stories_tokenized_test')
        #else:
        #    print("Not in in dataset")
        #    sys.exit()
        a_lst = []
        #json_fs = os.listdir(raw_pth)
        #for json_f in glob.glob(pjoin(args.raw_path, '.*.json')):
        #for json_f in json_fs:
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            #if json_f.endswith('.json'):
            real_name = json_f.split('/')[-1]
            a_lst.append((corpus_type, json_f, args, pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
def format_to_lines(args):
    print("in format lines")
    train_files = glob.glob(pjoin(args.train_path, './*.json'))
    valid_files = glob.glob(pjoin(args.valid_path, './*.json'))
    test_files = glob.glob(pjoin(args.test_path, './*.json'))
    print("test_files are ", test_files)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # if (len(dataset) > args.shard_size):
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                print(
                    "saved to ",
                    "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                 p_ct))
        pool.close()
        pool.join()
Example No. 28
def format_to_nnsum(args):
    ''' convert data to what nnsum(https://github.com/kedz/nnsum) can use
        for training SummaRunner and other baseline models.
    label_file: {id}.json
            {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
            "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
            # nnsum paper uses tokenized words joined by space as each sentence,
            but uncased (both upper and lower case included)
    input_file: {id}.json
            {"input": [sent_1, sent_2, ..., sent_n], "id":story_id}
            sent_i: {"text":original text, "tokens":word list, "pos":postag, "ne":NER,
                    "word_count":word count of sent_i, "sentence_id":i}
            #sentence_id is from 1
            #The fields really used in the model are:
                "tokens", "text"
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)

        num_stories = len(a_lst)
        #randomly assigned the entries in a_lst to different processors in the pool
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                   num_stories),
                  end="\r" if idx < num_stories else "\n",
                  flush=True)

        pool.close()
        pool.join()
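A hedged sketch of the three per-story files the docstring above describes, written for a single toy document. Field names follow that description (the pos/ne fields are omitted since the docstring notes only tokens and text are actually used); paths and values are illustrative:

import json
import pathlib

def write_nnsum_example(save_path, story_id, sents, labels, abstract):
    """Write one toy story in the nnsum layout described above."""
    root = pathlib.Path(save_path)
    for sub in ("nnsum_inputs", "nnsum_labels", "human-abstracts"):
        (root / sub).mkdir(exist_ok=True, parents=True)
    inputs = [{"text": s, "tokens": s.split(), "word_count": len(s.split()),
               "sentence_id": i + 1} for i, s in enumerate(sents)]   # sentence_id from 1
    (root / "nnsum_inputs" / (story_id + ".json")).write_text(
        json.dumps({"id": story_id, "input": inputs}))
    (root / "nnsum_labels" / (story_id + ".json")).write_text(
        json.dumps({"id": story_id, "labels": labels}))
    (root / "human-abstracts" / (story_id + ".spl")).write_text(abstract)

if __name__ == "__main__":
    write_nnsum_example("toy_nnsum", "story0",
                        ["A first sentence .", "A second sentence ."],
                        [1, 0], "a tokenized abstract .")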
Example No. 29
    def run(self):
        """
        This function reads the feature extraction file list and creates a pool of processes to extract features
        from distinct files in parallel. It outputs one pymir3 FeatureTrack file per input file. Output is buffered
        to save memory and defer disk access.

        .. note::
            These keys are expected to be set in the experiment file:
                * ['general']['feature_extraction_filelist']
                * ['general']['scratch_directory']
                * ['feature_extraction']['output_buffer_size']
                * ['feature_extraction']['worker_extractors']

        """

        print("Running feature extraction behavior: %s" % self.name)

        # todo: use metadata file to add labels to track metadata (if available)
        # should guarantee the label in the metadata to make life easier, instead of using the file name (I don't think it's needed)

        with open(self.params['general']['feature_extraction_filelist']) as f:
            files = f.read().splitlines()

        # todo: use a multiprocessing.Manager to share the buffer (instead of doing it in chunks, as below)

        metas = copy.copy(files)
        files = []
        for i in metas:
            files.append(i.split("\t")[0])
        metas = []

        num_files = len(files)
        output_buffer_size = self.params['feature_extraction']['output_buffer_size']

        pool = Pool(processes=self.params['feature_extraction']['worker_extractors'])
        for i in range(0, num_files, output_buffer_size):
            print "processing files %d through %d of %d" % (i + 1, min(i + output_buffer_size, num_files), num_files)
            result = pool.map(self.extract, files[i:min(i + output_buffer_size, num_files)])

            T0 = time.time()
            for track in result:
                filename = acf_utils.extract_filename(track.metadata.filename, "wav") + ".features"
                filename = self.params['general']['scratch_directory'] + "/" + filename

                print "writing features to file %s..." % (filename)
                feature_file = open(filename, "w")
                track.save(feature_file)
                feature_file.close()
                del track
            T1 = time.time()
            print "writing feature files to disk took %f seconds" % (T1 - T0)

            del result
            gc.collect()

        pool.close()
        pool.join()

        print ('Feature extraction done!')
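The buffering idea above (run pool.map on output_buffer_size files at a time, write the results, then free them before the next chunk) reduces to a small sketch; the extract worker and the fake "write" step are toy stand-ins:

import gc
from multiprocessing import Pool

def extract(path):                      # toy stand-in for self.extract
    return path, len(path)

def run_buffered(files, buffer_size=4, workers=2):
    pool = Pool(processes=workers)
    for i in range(0, len(files), buffer_size):
        chunk = files[i:i + buffer_size]
        result = pool.map(extract, chunk)
        for name, value in result:      # "write" each result, then drop it
            print("%s -> %d" % (name, value))
        del result
        gc.collect()                    # defer memory growth, as in the original
    pool.close()
    pool.join()

if __name__ == "__main__":
    run_buffered(["a.wav", "bb.wav", "ccc.wav", "dddd.wav", "eeeee.wav"])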
Example No. 30
    def run_mcts(self, env, runs_per_round):
        """
        Runs all batched MCTS instances concurrently on the STOVE model
        :param env: (STOVE) a STOVE instance representing the env
        :param runs_per_round: (int) the number of MCTS expansions to perform
        :return: an array of next actions
        """
        pool = Pool(self.num_mcts)
        for i in range(runs_per_round):
            start = time.time()
            result = pool.imap(select, self.trees)
            all_states = []
            all_zs = []
            for state, z in result:
                all_states.append(state)
                all_zs.append(z)
            # expand all mcts by applying all next actions batched on all mcts zs
            expansion_actions = multi_one_hot(range(self.actions),
                                              self.actions)
            expansion_actions = expansion_actions.view(self.actions, 1,
                                                       self.actions)
            expansion_actions = expansion_actions.repeat(self.num_mcts, 1,
                                                         1).to('cuda')
            new_zs, r = env.rollout(tile(torch.cat(all_zs, 0), 0,
                                         self.actions).to('cuda'),
                                    num=1,
                                    actions=expansion_actions,
                                    appearance=tile(self.obj_app, 0,
                                                    self.actions).to('cuda'))

            # rollout all new expanded nodes in parallel
            random_rollout_actions = np.random.randint(
                self.actions,
                size=(self.actions * self.num_mcts * self.max_rollout * 2, ))
            random_rollout_actions = multi_one_hot(random_rollout_actions,
                                                   self.actions)
            random_rollout_actions = random_rollout_actions.view(
                self.num_mcts * self.actions, self.max_rollout * 2,
                self.actions)
            _, r_rollout = env.rollout(
                new_zs[:, -1].to('cuda'),
                num=self.max_rollout * 2,
                actions=random_rollout_actions,
                appearance=tile(self.obj_app, 0, self.actions).to('cuda'))

            for j, mcts in enumerate(self.trees):
                low = j * self.actions
                high = (j + 1) * self.actions
                mcts.backpropagate(new_zs[low:high], r[low:high],
                                   r_rollout[low:high], all_states[j])

        pool.close()
        actions = []
        for i in range(self.num_mcts):
            counts = [
                self.trees[i].Nsa['r' + str(a)] for a in range(self.actions)
            ]
            actions.append(np.argmax(counts))
        return actions
Example No. 31
File: olt.py Project: sjava/olt
def zte_gpon_svlan_check():
    clear_log()
    nodes = graph.cypher.execute(
        "match(n:Olt)--(c:Card) where c.name='GTGO' return n.ip,collect(c.slot)")
    olts = ((x[0], x[1]) for x in nodes)
    lzte_gpon_svlan = lambda x: zte_gpon_svlan(ip=x[0], slots=x[1])
    pool = Pool(8)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, lzte_gpon_svlan), olts))
    pool.close()
    pool.join()
Example No. 32
    def prime_calculate(self):
        break_points = []  # List that will have start and stopping points
        for i in range(cores):  # Creates start and stopping points based on length of range_finish
            break_points.append(
                {"start": int(math.ceil(((self.maximum_prime + 1) + 0.0) / cores * i)),
                 "stop": int(math.ceil(((self.maximum_prime + 1) + 0.0) / cores * (i + 1)))})

        p = Pool(cores)  # Number of processes to create.
        for i in break_points:  # Cycles though the breakpoints list created above.
            a = p.apply_async(self.prime_calculator, kwds=i, args=tuple(),
                              callback=self.update_num)  # This will start the separate processes.
        p.close()  # Prevents any more processes being started
        p.join()  # Waits for worker process to end
Example No. 33
File: switch.py Project: sjava/olt
def interface_check_m():
    clear_log()
    #  cmd = "match(s: Switch) where s.model in ['S8505','S8508'] return s.ip, s.model"
    cmd = "match(s: Switch)  return s.ip, s.model"
    #  cmd = "match(s:Switch) where s.model='S9306' or s.model='s9303' return s.ip,s.model limit 2"
    nodes = graph.cypher.execute(cmd)
    switchs = [(x[0], x[1]) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    out_inf = partial(output_interface_m, lock)
    list(pool.map(compose(out_inf, get_interface), switchs))
    pool.close()
    pool.join()
Example No. 34
File: olt.py Project: sjava/olt
def svlan_check():
    clear_log()
    #  nodes = graph.find('Olt', property_key='ip', property_value='9.192.96.246')
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='company', property_value='zte')
    olts = [(x['ip'], x['company'], x['area']) for x in nodes]
    #  list(map(compose(card_entry, get_card), olts))
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, get_svlan), olts))
    pool.close()
    pool.join()
Example No. 35
def main(args):

    filedate = args.filedate
    database = args.database

    slablist = ['alu','cal','cam','car','cas','cot','hal','hel','him','hin','izu','jap','ker','kur','mak','man','mue','pam','png','phi','puy','ryu','sam','sco','sol','sul','sum','van']

    indices = range(len(slablist))
    pool1 = Pool(args.nCores)
    partial_loop1 = partial(calls2d, database, filedate, slablist)

    pts = pool1.map(partial_loop1, indices)
    pool1.close()
    pool1.join()
Example No. 36
File: olt.py Project: sjava/weihu
def add_infs():
    funcs = {'zte': Zte.get_infs, 'hw': Huawei.get_infs}
    get_infs = partial(_company, funcs)

    clear_log()
    nodes = graph.cypher.execute(
        'match (n:Olt) return n.ip as ip,n.company as company')
    olts = [dict(ip=x['ip'], company=x['company']) for x in nodes]
    pool = Pool(128)
    lock = Manager().Lock()
    _add_infs_p = partial(_add_infs, lock)
    list(pool.map(compose(_add_infs_p, get_infs), olts))
    pool.close()
    pool.join()
Example No. 37
File: olt.py Project: sjava/olt
def hostname_check():
    clear_log()
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='ip', property_value='172.18.0.46')
    olts = [(x['ip'], x['company']) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(hostname_entry, lock)
    list(pool.map(compose(func, get_hostname), olts))
    pool.close()
    pool.join()
    ip_hostname = (x.split(',') for x in open(result_file))
    cmd = "match (n:Olt) where n.ip={ip} set n.hostname={hostname}"
    list(map(lambda x: graph.cypher.execute(
        cmd, ip=x[0], hostname=x[1]), ip_hostname))
Example No. 38
def get_vlan_usersP(bras):
    def _get_vlan_users(bas):
        funcs = {'m6k': M6k.get_vlan_users,
                 'me60': ME60.get_vlan_users}
        _gvu = partial(_model, funcs)
        return _gvu(bas)

    bras = [dict(ip=x[0], model=x[1], inf=x[2])
            for x in bras]
    pool = Pool(len(bras))
    temp = pool.map(_get_vlan_users, bras)
    pool.close()
    pool.join()
    temp = [x[1] for x in temp if x[1]]
    rslt = reduce(lambda x, y: merge_with(sum, x, y), temp)
    return rslt
Example No. 39
 def calculate(self, data):
     t1 = dt.datetime.utcnow()
     LOGGER.info('Starting calculation...')
     self._data = deepcopy(data)
     self._check_inputs(data)
     dep = self._dependencies()
     sorted_dep = topological_sort(dep)
     for items in sorted_dep:
         # loading node with inputs
         for item in items:
             node = self._get_node(item)
             args = [i_name for i_name in node.input_names if i_name not in node.kwargs]
             data_to_pass = []
             for arg in args:
                 data_to_pass.append(self._data[arg])
             kwargs_to_pass = {}
             for kwarg in node.kwargs:
                 kwargs_to_pass[kwarg] = self._data[kwarg]
             node.load_inputs(data_to_pass, kwargs_to_pass)
         # running nodes
         if self._parallel:
             pool = Pool(self._pool_size)
             results = pool.map(
                 Graph.run_node,
                 [self._get_node(i) for i in items]
             )
             pool.close()
             pool.join()
             results = {k: v for k, v in results}
         else:
             results = {}
             for item in items:
                 node = self._get_node(item)
                 res = node.run_with_loaded_inputs()
                 results[node.id] = res
         # save results
         for item in items:
             node = self._get_node(item)
             res = results[node.id]
             if len(node.output_names) == 1:
                 self._data[node.output_names[0]] = res
             else:
                 for i, out in enumerate(node.output_names):
                     self._data[out] = res[i]
     t2 = dt.datetime.utcnow()
     LOGGER.info('Calculation finished in {}'.format(t2-t1))
     return res
Example No. 40
File: olt.py Project: sjava/olt
def zhongji_check():
    clear_log()
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='ip', property_value='172.18.0.46')
    olts = [(x['ip'], x['company']) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(zhongji_entry, lock)
    list(pool.map(compose(func, get_zhongji), olts))
    pool.close()
    pool.join()
    ports = (x.split(',') for x in open(result_file))
    cmd = """match(n: Olt) where n.ip = {ip} 
    merge(n) - [:HAS]->(m: Etrunk{name: {sm}}) 
    merge(m) - [:Include]->(p: Port{name: {interface}})"""
    list(map(lambda x: graph.cypher.execute(
        cmd, ip=x[0], sm=x[1], interface=x[2]), ports))
Example No. 41
def parallel_cdist(data1, data2, n_rows_per_job=100):

    from scipy.spatial.distance import cdist

    data1 = np.array(data1)
    data2 = np.array(data2)

    pool = Pool(12)

    start_indices = np.arange(0, data1.shape[0], n_rows_per_job)
    end_indices = start_indices + n_rows_per_job - 1

    partial_distance_matrices = pool.map(lambda se: cdist(data1[se[0]:se[1]+1].copy(), data2), zip(start_indices, end_indices))
    pool.close()
    pool.join()

    distance_matrix = np.concatenate(partial_distance_matrices)
    return distance_matrix
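A plain multiprocessing.Pool cannot ship the lambda above to worker processes, so the original presumably runs on a thread-based or dill-backed pool. A hedged, picklable sketch of the same chunked cdist idea (function and parameter names are illustrative):

import numpy as np
from functools import partial
from multiprocessing import Pool
from scipy.spatial.distance import cdist

def _cdist_chunk(bounds, data1, data2):
    si, ei = bounds
    return cdist(data1[si:ei + 1], data2)

def parallel_cdist_picklable(data1, data2, n_rows_per_job=100, workers=4):
    data1 = np.asarray(data1)
    data2 = np.asarray(data2)
    starts = np.arange(0, data1.shape[0], n_rows_per_job)
    bounds = [(si, si + n_rows_per_job - 1) for si in starts]
    with Pool(workers) as pool:
        parts = pool.map(partial(_cdist_chunk, data1=data1, data2=data2), bounds)
    return np.concatenate(parts)

if __name__ == "__main__":
    a = np.random.rand(250, 3)
    b = np.random.rand(40, 3)
    print(parallel_cdist_picklable(a, b).shape)   # (250, 40)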
Example No. 42
def add_power_info():
    funcs = {'S8508': S85.get_power_info,
             'S8505': S85.get_power_info,
             'T64G': T64.get_power_info,
             'S8905': S89.get_power_info,
             'S8905E': S8905E.get_power_info,
             'S9306': S93.get_power_info,
             'S9303': S93.get_power_info}
    get_power_info = partial(_model, funcs)
    #  clear_log()
    nodes = graph.cypher.execute(
        "match (s:Switch) where s.snmpState='normal' return s.ip as ip,s.model as model")
    switches = [dict(ip=x['ip'], model=x['model']) for x in nodes]
    pool = Pool(processor)
    lock = Manager().Lock()
    _ff = partial(_add_power_info, lock)
    list(pool.map(compose(_ff, get_power_info), switches))
    pool.close()
    pool.join()
Example No. 43
def add_traffics():
    funcs = {'S8508': S85.get_traffics,
             'S8505': S85.get_traffics,
             'T64G': T64.get_traffics,
             'S8905': S89.get_traffics,
             'S8905E': S8905E.get_traffics,
             'S9306': S93.get_traffics,
             'S9303': S93.get_traffics}
    get_traffics = partial(_model, funcs)
    #  clear_log()
    nodes = graph.cypher.execute(
        "match (s:Switch)--(i:Inf) where s.snmpState='normal' return s.ip as ip,collect(i.name) as infs,s.model as model")
    switchs = [dict(ip=x['ip'], infs=x['infs'], model=x['model'])
               for x in nodes]
    pool = Pool(processor)
    lock = Manager().Lock()
    _ff = partial(_add_traffics, lock)
    list(pool.map(compose(_ff, get_traffics), switchs))
    pool.close()
    pool.join()
Example No. 44
def compute_jaccard_pairwise(indices, square_form=True, parallel=True, return_poses=False):
    n = len(indices)

    if parallel:
        pool = Pool(16)
        scores_poses_tuples = pool.map(lambda x: compute_jaccard_i_vs_list(x[0],x[1]),
                                   [(indices[i], indices[i+1:]) for i in range(n)])
        pool.close()
        pool.join()
    else:
        scores_poses_tuples = [compute_jaccard_i_vs_list(indices[i], indices[i+1:]) for i in range(n)]

    pairwise_scores = np.array([scores for scores, poses in scores_poses_tuples])

    if square_form:
        pairwise_scores = squareform(np.concatenate(pairwise_scores))

    if return_poses:
        poses = np.array([poses for scores, poses in scores_poses_tuples])
        return pairwise_scores, poses
    else:
        return pairwise_scores
Example No. 45
    def aggregate(self, feature_files):
        """
        This aggregator is a front-end to the pymir3 stats module. The statistics that must be computed
        are found in the simple_aggregation key in the experiment file.

        :param feature_files: a list of FeatureTrack filenames
        :type feature_files: list[str]
        :return:
        :rtype: None

        .. note::
            These keys are expected to be set in the experiment file:
                * ['simple_aggregation']['mean']
                * ['simple_aggregation']['delta']
                * ['simple_aggregation']['variance']
                * ['simple_aggregation']['acceleration']
                * ['simple_aggregation']['slope']
                * ['simple_aggregation']['limits']
                * ['simple_aggregation']['csv']
                * ['simple_aggregation']['normalize']
                * ['general']['scratch_directory']
                * ['feature_aggregation']['aggregated_output']

        """

        features = load_feature_files(feature_files)

        if self.params['simple_aggregation']['texture_windows']:

            #for i in range(len(feature_files)):
            #    feature_files[i] = feature_files[i] + "_tw"

            jobs = []
            out_idx = 0
            for f in features:
                jobs.append((f, self.params['simple_aggregation']['texture_window_length'], feature_files[out_idx] ))
                out_idx+=1

            num_files = len(jobs)
            output_buffer_size = self.params['simple_aggregation']['tw_buffer_size']

            pool = Pool(processes=self.params['simple_aggregation']['tw_workers'])

            pool.map(calc_textures, jobs)

            # out_idx = 0

            # for i in range(0, num_files, output_buffer_size):
            #     print "Calculating texture windows %d through %d of %d" % (i + 1, min(i + output_buffer_size, num_files), num_files)
                
            #     result = pool.map(calc_textures, jobs[i:min(i + output_buffer_size, num_files)])
                
            #     for track in result:
            #         filename = feature_files[out_idx]
            #         print "writing features to file %s..." % (filename)
            #         feature_file = open(filename, "w")
            #         track.save(feature_file)
            #         feature_file.close()
            #         del track
            #         out_idx+=1

            #     del result
            #     gc.collect()

            pool.close()
            pool.join()
            features = None

        if features is None:
            features = load_feature_files(feature_files)

        stats = feat_stats.Stats()
        m = stats.stats(features,
                        mean=self.params['simple_aggregation']['mean'],
                        delta=self.params['simple_aggregation']['delta'],
                        variance=self.params['simple_aggregation']['variance'],
                        acceleration=self.params['simple_aggregation']['acceleration'],
                        slope=self.params['simple_aggregation']['slope'],
                        limits=self.params['simple_aggregation']['limits'],
                        csv=self.params['simple_aggregation']['csv'],
                        normalize=self.params['simple_aggregation']['normalize'])

        out = open(self.params['general']['scratch_directory'] +
                   "/" + self.params['feature_aggregation']['aggregated_output'], "w")

        m.save(out)

        out.close()
    for iy, y0 in enumerate(np.arange(0, img_h, 5000)):
        for ix, x0 in enumerate(np.arange(0, img_w, 5000)):
            origins.append((x0, y0))

    alg = 'cellprofiler'

    big_labelmap = np.zeros((img_h, img_w), dtype=np.int64)
    n = 0
    for i, input_fp in enumerate(input_fps):
        prefix = os.path.splitext(input_fp)[0]
        labelmap = labelmap_alltiles[i].astype(np.int64) # astype(np.int64) is important, otherwise results in negative label values.
        x0, y0 = origins[i]
        big_labelmap[y0:y0+5000, x0:x0+5000][labelmap != 0] = labelmap[labelmap != 0] + n
        n += labelmap.max()

    labelmap_fp = os.path.splitext(input_img_fp)[0] + '_labelmap_%(alg)s.bp' % dict(alg=alg)
    bp.pack_ndarray_file(big_labelmap, labelmap_fp)
    upload_to_s3(labelmap_fp)
    
    for fp in input_fps:
        execute_command('rm ' + fp)        

t = time.time()

pool = Pool(NUM_CORES // 2)
pool.map(detect_cells, filenames)
pool.close()
pool.join()

sys.stderr.write('Overall time: %.2f seconds.\n' % (time.time()-t))
Example No. 47
def compute_jaccard_list_vs_all(seed_indices):
    pool = Pool(14)
    affinities_to_seeds = np.array(pool.map(lambda i: compute_jaccard_i_vs_all(i), seed_indices))
    pool.close()
    pool.join()
    return affinities_to_seeds
Example No. 48
def balance(cool_uri, nproc=1, chunksize=int(1e7), mad_max=5, min_nnz=10,
            min_count=0, ignore_diags=1, tol=1e-5, max_iters=200):
    """
    Cooler contact matrix balancing.
    
    Parameters
    ----------
    cool_uri : str
        URI of cooler group.
    nproc : int
        Number of processes. (Default: 1)
        
    """
    cool_path, group_path = parse_cooler_uri(cool_uri)
    # pre-check the weight column
    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        if 'weight' in grp['bins']:
            del grp['bins']['weight'] # Overwrite the weight column
    
    log.info('Balancing {0}'.format(cool_uri))
    
    clr = Cooler(cool_uri)
    
    try:
        if nproc > 1:
            pool = Pool(nproc)
            map_ = pool.imap_unordered
        else:
            map_ = map
        
        if clr.info['metadata']['onlyIntra']=='True':
            onlyIntra = True
        else:
            onlyIntra = False
        
        bias, stats = ice.iterative_correction(
                clr,
                chunksize=chunksize,
                cis_only=onlyIntra,
                trans_only=False,
                tol=tol,
                min_nnz=min_nnz,
                min_count=min_count,
                blacklist=None,
                mad_max=mad_max,
                max_iters=max_iters,
                ignore_diags=ignore_diags,
                rescale_marginals=True,
                use_lock=False,
                map=map_)
    finally:
        if nproc > 1:
            pool.close()
    
    if not stats['converged']:
        log.error('Iteration limit reached without convergence')
        log.error('Storing final result. Check log to assess convergence.')
    
    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        # add the bias column to the file
        h5opts = dict(compression='gzip', compression_opts=6)
        grp['bins'].create_dataset('weight', data=bias, **h5opts)
        grp['bins']['weight'].attrs.update(stats)
Example No. 49
def fmultiprocess(
        log,
        function,
        inputArray,
        poolSize=False,
        timeout=3600,
        **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- limit the number of CPU that are used in multiprocess job
        - ``timeout`` -- time in sec after which to raise a timeout error if the processes have not completed

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python 

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName, poolSize=10, timeout=300,
                                  inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.debug('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    if not poolSize:
        poolSize = psutil.cpu_count()

    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    cpuCount = psutil.cpu_count()
    chunksize = int((len(inputArray) + 1) / (cpuCount * 3))

    if chunksize == 0:
        chunksize = 1

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    if "log" in inspect.getargspec(function)[0]:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)
    else:
        mapfunc = partial(function, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)

    resultArray = resultArray.get(timeout=timeout)

    p.close()
    p.terminate()

    log.debug('completed the ``multiprocess`` function')
    return resultArray
Example No. 50
def test():
    print('cpuCount() = %d\n' % cpuCount())
    
    #
    # Create pool
    #
    
    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)    

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')
    
    t = time.time()
    A = list(map(pow3, xrange(N)))
    print('\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))
    
    t = time.time()
    B = pool.map(pow3, xrange(N))
    print('\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print('\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))
    
    assert A == B == C, (len(A), len(B), len(C))
    print()
    
    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')
    
    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))
    
    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')
            
    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')
            
    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()
    
    #
    # Testing timeouts
    #
    
    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
            
    #
    # Testing callback
    #

    print('Testing callback:')
    
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
        
    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))
    
    #
    # Check there are no outstanding tasks
    #
    
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool
    
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)
    
    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')
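
# A minimal sketch (assumes Python 3.3+; not part of the original test script)
# showing the same close()/terminate() semantics with Pool as a context manager:
# close() stops new submissions but lets already-queued tasks finish before join(),
# while terminate(), which the with-block calls on exit, stops workers immediately.
from multiprocessing import Pool as _SketchPool

def _sketch_square(x):
    return x * x

if __name__ == '__main__':
    with _SketchPool(2) as sketch_pool:          # __exit__ calls terminate()
        res = sketch_pool.apply_async(_sketch_square, (3,))
        print(res.get(timeout=5))                # -> 9
    sketch_pool.join()                           # reap workers after the block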
def compute_spm_histograms(labelmap, sample_locs, patch_size, M):
    """
    Args:
        labelmap (2d-ndarray of int):
        sample_locs (2d-ndarray): List of (x,y) locations at which to sample the SPM histograms
        M (int): number of unique SIFT descriptor words, aka. size of vocabulary
        
    Returns:
        hists_arr0 ((1,M)-array of int)
        hists_arr1 ((4,M)-array of int)
        hists_arr2 ((16,M)-array of int)
    """

    global labelmap_global
    labelmap_global = labelmap

    # compute level-2 histograms
    l = 2

    grid_size = patch_size // 2**l  # integer side length of one level-l grid cell

    if l == 2:
        rx = [-2, -1, 0, 1]
        ry = [-2, -1, 0, 1]
    elif l == 1:
        rx = [-1, 0]
        ry = [-1, 0]
    elif l == 0:
        rx = [-.5]
        ry = [-.5]

    rxs, rys = np.meshgrid(rx, ry)

    patch_coords_allGrid = []

    for grid_i, (rx, ry) in enumerate(np.c_[rxs.flat, rys.flat]):

        patch_xmin = sample_locs[:,0] + rx * grid_size
        patch_ymin = sample_locs[:,1] + ry * grid_size
        patch_xmax = sample_locs[:,0] + (rx + 1) * grid_size
        patch_ymax = sample_locs[:,1] + (ry + 1) * grid_size

        patch_coords_allGrid.append([patch_xmin, patch_ymin, patch_xmax, patch_ymax])


    all_coords = np.hstack(patch_coords_allGrid)
    patch_xmin = all_coords[0]
    patch_ymin = all_coords[1]
    patch_xmax = all_coords[2]
    patch_ymax = all_coords[3]

    def compute_histogram_particular_label(i):
        # Count occurrences of word i inside every grid cell using an integral
        # image: the sum over a box is I(y1,x1) + I(y0,x0) - I(y1,x0) - I(y0,x1).
        m = (labelmap_global == i).astype(np.uint8)
        mi = cv2.integral(m)
        ci = mi[patch_ymin, patch_xmin] + mi[patch_ymax, patch_xmax] - mi[patch_ymax, patch_xmin] - mi[patch_ymin, patch_xmax]
        return ci

    t = time.time()
    pool = Pool(8)
    hists = pool.map(compute_histogram_particular_label, range(1, M+1))
    pool.close()
    pool.join()
    sys.stderr.write('done in %f seconds\n' % (time.time() - t))  # ~13 seconds

    n_grid = (2**l)**2
    # (M, n_grid, n_locs) -> (n_locs, n_grid, M)
    hists_arr2 = np.transpose(np.reshape(hists, (M, n_grid, -1)))
    print(hists_arr2.shape)

    # compute level-1 histograms based on level-2 histograms

    hists_arr1 = np.transpose([hists_arr2[:, [0,1,4,5], :].sum(axis=1),
                               hists_arr2[:, [2,3,6,7], :].sum(axis=1),
                               hists_arr2[:, [8,9,12,13], :].sum(axis=1),
                               hists_arr2[:, [10,11,14,15], :].sum(axis=1)],
                              [1,0,2])
    print(hists_arr1.shape)

    # compute level-0 histograms based on level-1 histograms

    hists_arr0 = hists_arr1.sum(axis=1)
    print(hists_arr0.shape)

    return hists_arr0, hists_arr1, hists_arr2
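
# A minimal usage sketch for compute_spm_histograms, with hypothetical data.
# Assumptions: the labelmap holds per-pixel visual-word labels in 1..M, and the
# Pool used inside the function can map over a closure (e.g. a thread pool such
# as multiprocessing.dummy.Pool, as in the other examples here).
import numpy as np

M = 100                                              # vocabulary size (assumed)
labelmap = np.random.randint(1, M + 1, size=(2000, 2000)).astype(np.int32)
sample_locs = np.array([[500, 600], [1200, 800]])    # (x, y) sample locations (assumed)
patch_size = 224                                     # patch side length in pixels (assumed)

h0, h1, h2 = compute_spm_histograms(labelmap, sample_locs, patch_size, M)

# Per location: 1 level-0 cell, 4 level-1 cells, 16 level-2 cells; summing the
# finer cells over the cell axis recovers the coarser level.
assert h0.shape == (2, M) and h1.shape == (2, 4, M) and h2.shape == (2, 16, M)
assert np.array_equal(h0, h1.sum(axis=1))
assert np.array_equal(h1.sum(axis=1), h2.sum(axis=1))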
Exemplo n.º 52
0
    def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                           grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                           sampling_strategy='random-choice', n_samples=1000,
                           bin_count=50, return_metadata=False,
                           progressbar=True, variance_type='estimate'):

        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            These feature ids must be available in the class's associated DataSet.
            Note that the algorithm's complexity scales exponentially with the number of
            features, and currently only 1 or 2 features at a time are supported.
        modelinstance: skater.model.model.Model subtype
            an estimator of a fitted model used to derive predictions, i.e.
            predictions = predict_fn(data). Supports classification (binary and
            multi-class) and regression.

            Can either be a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        grid: numpy.ndarray
            2-dimensional array of values at which to fix the features. If not given,
            it is determined automatically from percentiles of the dataset.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally sized bins. Defaults to 30.
        n_jobs: int
            The number of CPUs to use to compute the PDs. Defaults to -1 (all CPUs).
        grid_range: tuple
            the percentile extrema to consider: a 2-element increasing tuple, bounded
            between 0 and 1.
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Note this is
            only active if sample = True and sampling strategy = 'uniform'. If
            using 'uniform-over-similarity-ranks', use samples per bin
        bin_count: int
            The number of bins to use when using the similarity based sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'.
            total samples = bin_count * samples per bin.
        samples_per_bin: int
            The number of samples to collect for each bin within the sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'. If using
            sampling_strategy = 'uniform', use n_samples.
            total samples = bin_count * samples per bin.
        variance_type: string

        return_metadata: boolean
            if True, also return a metadata dict describing the partial dependence run.

        :Examples:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestClassifier()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf.predict_proba, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(feature_ids, model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                           "Please call Interpretation.load_data \n" \
                                           "before running this method."
            raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be \n" \
                      "members of modelinstance.classes. \n" \
                      "Expected members of: \n" \
                      "{0}\n" \
                      "got: \n" \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            filter_classes = list(filter_classes)
            assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence; try \n"
                                        "creating one with skater.model.local.InMemoryModel or \n"
                                        "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot \n'
                                            'be None'))
            self.interpreter.logger.warn("Classifiers with probability scores can be explained \n"
                                         "more granularly than those without scores. If a prediction method with \n"
                                         "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in \n" \
                                         "Interpretation.data_set.feature_ids \n" \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise(KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (0.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(grid_range)
                raise(exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples=10)

            examples = DataManager(examples, feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you don't pass a grid, build one.
        grid = np.array(grid)
        if not grid.any():
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
        self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples=n_samples,
                                                    bin_count=bin_count)

        assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                             "There's a type mismatch between\n" \
                                                             "the sampled data and the original\n" \
                                                             "training set. Check Skater.models\n"

        _pdp_metadata = self._build_metadata_dict(modelinstance,
                                                  feature_ids,
                                                  self.data_set.feature_ids,
                                                  filter_classes,
                                                  variance_type)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value; " \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        n_jobs = None if n_jobs < 0 else n_jobs  # Pool(None) uses all available CPUs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        arg_list = [i for i in range(grid_expanded.shape[0])]
        executor_instance = Pool(n_jobs)

        if progressbar:
            self.interpreter.logger.warn("Progress bars slow down runs by 10-20%. For slightly \n"
                                         "faster runs, do progress_bar=False")
            mapper = executor_instance.imap
            p = ProgressBar(len(arg_list), units='grid cells')
        else:
            mapper = executor_instance.map

        pd_list = []
        try:
            # n_jobs == 1 means the caller explicitly asked for single-process
            # execution, so jump straight to the fallback path below.
            if n_jobs == 1:
                raise ValueError("Skipping to single processing")
            for pd_row in mapper(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        except Exception:
            self.interpreter.logger.warn("Multiprocessing failed, going single process")
            pd_list = []  # discard any partial results from the failed parallel run
            for pd_row in map(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()
        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))
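
# A standalone sketch (hypothetical work function `score_cell` and helper
# `run_with_fallback`) of the parallel-map-with-fallback pattern used by
# partial_dependence above: try a worker pool first, fall back to single-process
# map() if multiprocessing fails, and always clean the pool up in `finally`.
from multiprocessing import Pool

def score_cell(i):
    return i * i                 # stand-in for the per-grid-cell PD computation

def run_with_fallback(args, n_jobs=None):
    pool = Pool(n_jobs)          # Pool(None) uses all available CPUs
    results = []
    try:
        if n_jobs == 1:
            # mirror partial_dependence: jump straight to single-process mode
            raise ValueError("Skipping to single processing")
        for row in pool.imap(score_cell, args):
            results.append(row)
    except Exception:
        results = [score_cell(a) for a in args]
    finally:
        pool.close()
        pool.join()
    return results

if __name__ == '__main__':
    print(run_with_fallback(list(range(8))))     # -> [0, 1, 4, 9, 16, 25, 36, 49]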