Example 1
    def _calc_threads(self):
        if mp.cpu_count() == 1:
            max = 1
        else:
            max = mp.cpu_count() - 1

        return max
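
The branch above boils down to "use every core but one, never fewer than one". A minimal standalone sketch of the same rule (the helper name default_worker_count is illustrative, not from the snippet):

import multiprocessing as mp

def default_worker_count() -> int:
    # max(1, ...) covers the single-core case without an explicit branch
    return max(1, mp.cpu_count() - 1)

if __name__ == '__main__':
    print(default_worker_count())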
Example 2
def exposure_analysis(rail=False):
    """
    Get exposure statistics for all road or railway assets in all regions.
    
    Optional Arguments:
        *rail* : Default is **False**. Set to **True** if you would like to 
        intersect the railway assets in a region.
    
    """
    # set data path
    data_path = load_config()['paths']['data']

    # load shapefile with all regions
    global_regions = geopandas.read_file(
        os.path.join(data_path, 'input_data', 'global_regions_v2.shp'))

    # load csv with income group data and assign income group to regions
    incomegroups = pandas.read_csv(os.path.join(data_path, 'input_data',
                                                'incomegroups_2018.csv'),
                                   index_col=[0])
    income_dict = dict(zip(incomegroups.index, incomegroups.GroupCode))
    global_regions['wbincome'] = global_regions.GID_0.apply(
        lambda x: income_dict[x])

    # only keep regions for which we have data
    global_regions = global_regions.loc[global_regions.GID_2.isin([
        (x.split('.')[0])
        for x in os.listdir(os.path.join(data_path, 'region_osm'))
    ])]

    # create dictionary with information on protection standards
    prot_lookup = dict(
        zip(global_regions['GID_2'], global_regions['prot_stand']))

    # create lists for the parallelization
    regions = list(global_regions.index)
    prot_lookups = [prot_lookup] * len(regions)
    data_paths = [data_path] * len(regions)

    # run exposure analysis parallel
    if not rail:
        with Pool(cpu_count() - 1) as pool:
            collect_output = pool.starmap(regional_roads,
                                          zip(regions, prot_lookups,
                                              data_paths),
                                          chunksize=1)

        pandas.concat(collect_output).to_csv(
            os.path.join(data_path, 'summarized', 'total_exposure_road.csv'))

    else:
        with Pool(cpu_count() - 1) as pool:
            collect_output = pool.starmap(regional_railway,
                                          zip(regions, prot_lookups,
                                              data_paths),
                                          chunksize=1)

        pandas.concat(collect_output).to_csv(
            os.path.join(data_path, 'summarized',
                         'total_exposure_railway.csv'))
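
The parallel step above follows a common pattern: zip the per-region values with repeated copies of the shared lookup and data path, then fan out with Pool.starmap. A self-contained sketch of just that pattern (the worker _work and the literal values are placeholders; max(1, cpu_count() - 1) additionally guards the single-core case, where cpu_count() - 1 alone would ask for a zero-process pool):

from multiprocessing import Pool, cpu_count

def _work(region, prot_lookup, data_path):
    # stand-in for regional_roads / regional_railway
    return region, prot_lookup.get(region), data_path

if __name__ == '__main__':
    regions = ['R1', 'R2', 'R3']
    prot_lookups = [{'R1': 10, 'R2': 100, 'R3': 2}] * len(regions)
    data_paths = ['/tmp/data'] * len(regions)

    with Pool(max(1, cpu_count() - 1)) as pool:
        out = pool.starmap(_work, zip(regions, prot_lookups, data_paths),
                           chunksize=1)
    print(out)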
Example 3
    def _calc_num_threads(self,
                          df_size: int,
                          query_size: int,
                          max_threads=None) -> int:
        num_queries = df_size * query_size

        if mp.cpu_count() == 1:
            max = 1
        else:
            max = mp.cpu_count() - 1

        calc = int(num_queries / 5000)
        if calc > max:
            r = max
        elif calc <= 1:
            if num_queries > 1000:
                r = 2
            else:
                r = 1
        else:
            r = calc

        if max_threads is not None and r > max_threads:
            return max_threads

        return r
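
The heuristic is roughly one worker per 5,000 queries, capped at cpu_count() - 1 and at the optional max_threads, with a floor of two workers once the load passes 1,000 queries. A free-function restatement with an explicit core count, so the example values are machine-independent (illustrative only, not part of the original class):

import multiprocessing as mp

def calc_num_threads(num_queries: int, max_threads=None, cores=None) -> int:
    cores = mp.cpu_count() if cores is None else cores
    upper = 1 if cores == 1 else cores - 1     # leave one core free
    calc = num_queries // 5000                 # ~one worker per 5000 queries
    if calc > upper:
        r = upper
    elif calc <= 1:
        r = 2 if num_queries > 1000 else 1     # small jobs: one or two workers
    else:
        r = calc
    return min(r, max_threads) if max_threads is not None else r

# e.g. with cores=8: 900 queries -> 1, 20_000 -> 4, 1_000_000 -> 7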
Example 4
 def generate(self, sample_size):
     self.size = sample_size  # number of samples to be collected
     print(mp.cpu_count())
     pool = mp.Pool(mp.cpu_count())
     self.samples = pool.map(self.method, range(
         self.size))  # container for the collected samples
     #print(self.samples) --> debug
     pool.close()
     pool.join()
Example 5
    def run_preprocess(self, database_destination, table_destination):
        """

        :return:
        """
        list_of_dfs = []
        for journal, path in self._mydict.iteritems():
            list_of_dfs.append(self._identify_files_and_titles(path, journal))

        dataframe = pd.concat(list_of_dfs)

        if self._n_cores != 1:
            if self._n_cores == mp.cpu_count():
                print "Going to WARP SPEED!!!!"
            else:
                print "Meh, multiprocessing, but not good enough!"
            titles_to_process = dataframe['file_complete_path'].values
            list_features = self._pool.map(
                self._feat_gen.gen_features_from_pdf_file, titles_to_process)
        else:
            print "Why you didn't choose multiprocessing??? WHY???? WHY???????!"
            print ''
            list_features = []
            total_size = len(dataframe['file_complete_path'].values)
            count_ite = 1
            for t in dataframe['file_complete_path'].values:
                try:
                    list_features.append(
                        self._feat_gen.gen_features_from_pdf_file(t))
                    count_ite += 1
                    print 'Done ' + str(t)
                    print str(count_ite) + ' of ' + str(total_size)
                except:
                    print ' '
                    print 'Problems with pdf: ' + str(t)
                    print 'Iteration: ' + str(count_ite)
                    print ' '

        file_features = pd.concat(list_features)

        print "Sorry, no multiprocessing implemented for this part :/ "
        list_of_features_from_db = []
        total_size = len(dataframe['title'].values)
        count_ite = 1
        for t in dataframe['title'].values:
            list_of_features_from_db.append(
                self._feat_gen.gen_features_from_database(t))
            count_ite += 1
            print 'Done ' + str(t)
            print str(count_ite) + ' of ' + str(total_size)

        db_features = pd.concat(list_of_features_from_db)

        rtn_dataframe = file_features.set_index('title').join(
            db_features.set_index('title'))

        rtn_dataframe = rtn_dataframe.join(dataframe.set_index('title'))
        rtn_dataframe.reset_index(level=0, inplace=True)

        return rtn_dataframe
Example 6
    def preprocess_from_files(self,shot_files,use_shots):
        #all shots, including invalid ones
        all_signals = self.conf['paths']['all_signals'] 
        shot_list = ShotList()
        shot_list.load_from_shot_list_files_objects(shot_files,all_signals)
        shot_list_picked = shot_list.random_sublist(use_shots)

        #empty
        used_shots = ShotList()

        use_cores = max(1,mp.cpu_count()-2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(pool._processes))
        start_time = time.time()
        for (i,shot) in enumerate(pool.imap_unordered(self.preprocess_single_file,shot_list_picked)):
        #for (i,shot) in enumerate(map(self.preprocess_single_file,shot_list_picked)):
            sys.stdout.write('\r{}/{}'.format(i,len(shot_list_picked)))
            used_shots.append_if_valid(shot)

        pool.close()
        pool.join()
        print('Finished Preprocessing {} files in {} seconds'.format(len(shot_list_picked),time.time()-start_time))
        print('Omitted {} shots of {} total.'.format(len(shot_list_picked) - len(used_shots),len(shot_list_picked)))
        print('{}/{} disruptive shots'.format(used_shots.num_disruptive(),len(used_shots)))
        if len(used_shots) == 0:
            print("WARNING: All shots were omitted, please ensure raw data is complete and available at {}.".format(self.conf['paths']['signal_prepath']))
        return used_shots 
Example 7
def correction_factor(p, number_of_runs, method, X, y, n_jobs=None):

    # Setup parallel job
    if n_jobs == -1:
        n_jobs = cpu_count()
    elif n_jobs == None:
        n_jobs = 1

    pool = Pool(n_jobs, maxtasksperchild=1000)

    def run(_):

        # Artificially falsify
        y_f = falsify(y, p, random_state=_)

        # Correct labels
        y_corrected = method.fit_transform(X, y_f)

        N = X.shape[0]
        return ((y == y_corrected).sum() - (1 - p) * N) / (p * N)

    factor = np.array(pool.map(run, range(number_of_runs)))

    # Close the pool again
    pool.close()
    pool.join()
    pool.clear()

    return np.mean(factor), np.std(factor)
Example 8
def bin_func(raster_folder, raster_filename, output_folder, binary_functions, binary_args):
    start_time = datetime.now()
    input_raster = os.path.join(raster_folder, raster_filename)
    hdr_file = ""  # input_raster + ".hdr"        # only used for ENVI stacks
    outname = os.path.join(output_folder, raster_filename)
    if outname.find(".tif") != -1:
        outname = outname[0:len(outname) - 4]

    # arr: full size numpy array 3D XxYxZ 200x300x100
    arr = preprocessing.rio_array(input_raster, hdr_file=hdr_file)

    # activate to get list of dates from .hdr file (.hdr file needs to be specified above)
    dates = arr[1]

    for i, func in enumerate(binary_functions):
        # threshold_size = str(statistical_args[i]['threshold_size'])
        # creating results with calling wanted algorithm in parallel_apply_along_axis for quick runtime
        result = apply_along_axis.parallel_apply_along_axis(func1d=func, arr=arr[0], axis=0,
                                                            cores=mp.cpu_count(), **binary_args[i])

        # selecting dtype based on result
        dtype = type(result[0][0])

        func_name_end = str(func).find(" at")
        func_name_start = 10
        func_name = str(func)[func_name_start:func_name_end]

        # exporting result to new raster
        export_arr.functions_out_array(outname=outname + "_" + func_name + str(17), arr=result, input_file=input_raster,
                                       dtype=dtype)

    # print time to this point
    statistics_time = datetime.now()
    print("breakpoint-time = ", statistics_time - start_time, "Hr:min:sec")
Example 9
def _on_done(artifacts, graph, node, done, rem, value, apf=False, **kwargs):
    done.add(node)
    artifacts[node] = value

    frontier = rem | set(graph[node])
    batch = {n for n in frontier if not _pendencies(graph, n, done)}
    rem = frontier - batch

    if not batch:
        return

    pool = mp.Pool(processes=min(max(1, mp.cpu_count() - 1), len(batch)))

    each(
        lambda node: pool.apply_async(
            _resolve,
            args=(node, artifacts),
            callback=partial(
                _on_done, artifacts, graph, node, done, rem, apf=apf, **kwargs
            ),
        ),
        batch,
    )

    pool.close()
    pool.join()
Example 10
def _build_async(artifacts, apf=False, processes=None):
    graph = u.to_graph(artifacts)
    frontier = u.initial(graph)

    if not frontier:
        return {}

    done, rem = set(), set()
    if processes is None:
        processes = min(max(1, mp.cpu_count() - 1), len(frontier))
    pool = mp.Pool(processes=processes)

    each(
        lambda node: pool.apply_async(
            partial(_resolve, apf=apf),
            args=(node, artifacts),
            callback=partial(
                _on_done, artifacts, graph, node, done, rem, apf=apf),
        ),
        frontier,
    )

    pool.close()
    pool.join()

    artifacts.update({n: _resolve(n, artifacts, apf=apf) for n in rem})

    return artifacts
Example 11
    def clear_stack(cls):

        cpu = cpu_count()
        pool = Pool(cpu)

        def classify_set(c, pd, pc):

            pd = Directory.classify_product_auto(config, pd)
#           if product.part_id:
            pc.product = pd
            Directory.set_price(pc)
#           else:
#               manuals.append((config, product, price))

#       manuals = []

        for config, product, price in cls.STACK:

            pool.apply_async(classify_set, args=(config, product, price))

#       for config, product, price in manuals:
#           product = Directory.classify_product_manual(config, product)
#           if product.part_id:
#               price.product = product
#               Directory.set_price(price)
#           else:
#               Directory.set_product(product)

        cls.STACK = []
Example 12
def construct_adjacency(fips_data,
                        filename=os.path.join(resourceDir,
                                              'fips_2019_adj.pkl.gz')):
    """
    Creates, and then stores (or loads) the adjacency dictionary of all US counties and territorial units. If the storage file, which is by default :download:`fips_2019_adj.pkl.gz </_static/gis/fips_2019_adj.pkl.gz>`, does not exist, then will create and store this data into the storage file. Will return the data in the end.

    :param dict fips_data: the US county :py:class:`dict` produced by, for example, :py:meth:`create_and_store_fips_2018 <covid19_stats.engine.gis.create_and_store_fips_2018>`.
    :param str filename: the location of the adjacency dictionary file, which is by default :download:`fips_2019_adj.pkl.gz </_static/gis/fips_2019_adj.pkl.gz>` located in the ``covid19_stats`` resource directory.
    :returns: a :py:class:`dict` of adjacency. Each key is a `FIPS code`_ of a county, and each value is a :py:class:`set` of counties and other territories adjacent to it. See :py:meth:`get_fips_adjacency <covid19_stats.engine.gis.get_fips_adjacency>` to see an example of this adjacency information for a single county.
    :rtype: dict
    """
    if os.path.isfile(filename):
        return pickle.load(gzip.open(filename, 'rb'))
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        all_adj = dict(
            map(lambda fips: (fips, get_fips_adjacency(fips, fips_data)),
                fips_data))
        set_of_adjacents = set(
            chain.from_iterable(
                map(
                    lambda fips: map(
                        lambda fips2: tuple(sorted([fips, fips2])), all_adj[
                            fips]), all_adj)))
        if filename is not None:
            pickle.dump(set_of_adjacents, gzip.open(filename, 'wb'))
        return set_of_adjacents
Example 13
def map_list_as_chunks(l, f, extra_data, cpus=None, max_chunk_size=None):
    '''
    A wrapper around `pathos.multiprocessing.ProcessPool.uimap` that processes a list in chunks.
    Differs from `map_list_in_chunks` in that this method calls `f` once for each chunk.

    uimap already chunks but if you have extra data to pass in it will pickle
    it for every item. This function passes in the extra data to each chunk
    which significantly saves on pickling.
    https://stackoverflow.com/questions/53604048/iterating-the-results-of-a-multiprocessing-list-is-consuming-large-amounts-of-me

    Parameters
    ----------
    l : list
      the list
    f : function
      the function to process each item
      takes two parameters: chunk, extra_data
    extra_data : object
      the extra data to pass to each f
    cpus : int
      the number of cores to use to split the chunks across
    max_chunk_size : int
      the maximum size for each chunk
    '''
    cpus = cpu_count() if cpus is None else cpus
    max_chunk_size = float('inf') if max_chunk_size is None else max_chunk_size
    chunk_length = min(max_chunk_size, max(1, ceil(len(l) / cpus)))
    chunks = [l[x:x + chunk_length] for x in range(0, len(l), chunk_length)]
    pool = Pool(nodes=cpus)
    f_dumps = cloudpickle.dumps(f)
    tuples = [(chunk, f_dumps, extra_data) for chunk in chunks]
    return pool.map(_process_whole_chunk, tuples)
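
A hedged usage sketch, assuming the module's _process_whole_chunk helper (not shown here) unpickles f and calls it as f(chunk, extra_data); scale_chunk and the numbers are made up for illustration:

def scale_chunk(chunk, factor):
    return [x * factor for x in chunk]

results = map_list_as_chunks(list(range(1000)), scale_chunk, extra_data=10,
                             cpus=4, max_chunk_size=100)
# one result list per chunk; flatten if a single flat list is needed
flat = [item for chunk_result in results for item in chunk_result]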
Example 14
    def preprocess_from_files(self,shot_files,use_shots):
        #all shots, including invalid ones
        all_signals = self.conf['paths']['all_signals'] 
        shot_list = ShotList()
        shot_list.load_from_shot_list_files_objects(shot_files,all_signals)
        shot_list_picked = shot_list.random_sublist(use_shots)

        #empty
        used_shots = ShotList()

        use_cores = max(1,mp.cpu_count()-2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(pool._processes))
        start_time = time.time()
        for (i,shot) in enumerate(pool.imap_unordered(self.preprocess_single_file,shot_list_picked)):
        #for (i,shot) in enumerate(map(self.preprocess_single_file,shot_list_picked)):
            sys.stdout.write('\r{}/{}'.format(i,len(shot_list_picked)))
            used_shots.append_if_valid(shot)

        pool.close()
        pool.join()
        print('Finished Preprocessing {} files in {} seconds'.format(len(shot_list_picked),time.time()-start_time))
        print('Omitted {} shots of {} total.'.format(len(shot_list_picked) - len(used_shots),len(shot_list_picked)))
        print('{}/{} disruptive shots'.format(used_shots.num_disruptive(),len(used_shots)))
        if len(used_shots) == 0:
            print("WARNING: All shots were omitted, please ensure raw data is complete and available at {}.".format(self.conf['paths']['signal_prepath']))
        return used_shots 
Example 15
def _movie_casedeaths( msas_or_conus, dirname, time0, type_disp = 'cases' ):
    all_msas = set(map(lambda msa_or_conus: msa_or_conus.lower( ), msas_or_conus))
    with ThreadPool(processes = min(8, cpu_count( ), len( all_msas ) ) ) as pool:
        _ = list(pool.map(lambda msa_or_conus: _movie_casedeaths_metro_or_conus(
            msa_or_conus, dirname, time0, type_disp = type_disp ), all_msas ) )
    logging.info( 'at %0.3f seconds to create all %d movies of %s.' % (
        time.time( ) - time0, len( all_msas ), type_disp.upper( ) ) )
Example 16
    def parallel_search(self, query):
        self.parallel = 1

        processes = cpu_count()
        data_set_split = []
        query_filename = "query"
        query_bytes = objectToBytes(query, self.predinstance.group)

        for j in range(processes):
            start = ceil(j * self.num_records / processes)
            end = ceil((j + 1) * self.num_records / processes)
            if end > self.num_records:
                end = self.num_records
            data_set_split.append((start, end))
        overall_return_list = []
        (matrix_str, generator_bytes) = self.serialize_key()
        with Pool(processes) as p:
            with concurrent.futures.ProcessPoolExecutor(processes) as executor:
                future_list = {
                    executor.submit(self.augment_search, self.vector_length, self.predicate_scheme, self.group_name,
                                    matrix_str, generator_bytes, query_bytes, self.public_parameters, start,
                                    end)
                    for (start, end) in data_set_split
                    }
                for future in concurrent.futures.as_completed(future_list):
                    res = future.result()
                    if res is not None and len(res) > 0:
                        overall_return_list = overall_return_list + res

        return overall_return_list
Example 17
def _summarize( msas_or_conus, dirname, time0 ):
    all_msas = set(map(lambda msa_or_conus: msa_or_conus.lower( ), msas_or_conus))
    with ThreadPool(processes = min(8, cpu_count( ), len( all_msas ) ) ) as pool:
        _ = list(pool.map(lambda msa_or_conus: _summarize_metro_or_conus(
            msa_or_conus, dirname, time0 ), all_msas ) )
    logging.info( 'at %0.3f seconds to create all %d summaries.' % (
        time.time( ) - time0, len( all_msas ) ) )
Example 18
def runLGB(train_X, train_y, test_X, test_y, test_X2, label, dev_index,
           val_index):
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    params = {
        'learning_rate': 0.05,
        'application': 'binary',
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'auc',
        'data_random_seed': 3,
        'bagging_fraction': 1.0,
        'feature_fraction': 0.4,
        'nthread': min(mp.cpu_count() - 1, 6),
        'lambda_l1': 1,
        'lambda_l2': 1
    }
    rounds_lookup = {
        'toxic': 1400,
        'severe_toxic': 500,
        'obscene': 550,
        'threat': 380,
        'insult': 500,
        'identity_hate': 480
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=10)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
Example 19
def ProcessFrag( f ):

  with file_handle(f) as fi:
    Tmp = [l.rstrip() for l in fi]
  Frags = Tmp[10:]    # remove first 10 lines, which are just file formatting
  del Tmp

  mpi = multiprocessing.Pool(processes=multiprocessing.cpu_count())

  start = time.perf_counter()
#  Tmp = [frag(x) for x in tqdm(Frags, total=len(Frags))]
  Tmp = [x for x in tqdm(mpi.imap(read_in_json, Frags, chunk),total=len(Frags))]
  mpi.close()
  mpi.join()
  end = time.perf_counter()
  print((end-start)/1000, 'ms')

  Tmp2 = [frag for frag in Tmp if frag is not None]
  Tmp3 = list(itertools.chain.from_iterable(Tmp2))    # flatten nested lists
  FgSele = list(set(tuple(Tmp3)))                     # select unique smiles

  del Tmp
  del Tmp2
  del Tmp3
  del Frags
  gc.collect()

  return FgSele
Example 20
def get_n_cpu():
    try:
        available_cpus = int(getenv('SLURM_NTASKS'))
    except:
        available_cpus = int(pp.cpu_count() / 2)

    return available_cpus
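
A quick way to exercise both branches, assuming the snippet's own imports (getenv and pp) are in place; setting SLURM_NTASKS by hand is only for illustration, since in a real SLURM job the scheduler exports it:

import os

os.environ['SLURM_NTASKS'] = '8'   # pretend we run inside an 8-task allocation
print(get_n_cpu())                 # -> 8

del os.environ['SLURM_NTASKS']     # outside SLURM the except branch kicks in
print(get_n_cpu())                 # -> half of the detected CPUs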
Example 21
    def evaluate_summarizer(self, parsers, **kwargs):
        """
        :param parsers: list
            List stores newssum.parser.StoryParser.
        :param kwargs:
            See below

        :Keyword Arguments:
            * *w_threshod* (``int``)
                word length threshold
            * *s_threshod* (``int``)
                sentence length threshold
        """
        print("Evaluating summarizer...")
        p = multiprocessing.ProcessingPool(multiprocessing.cpu_count() - 2)

        def get_rouges(parser):
            print("Get rouges...")
            print(parser)
            selected_sents = self.get_best_sents(parser, **kwargs)
            rouge = Rouge(selected_sents, parser.highlights).get_rouge()

            return rouge

        rouges = p.map(get_rouges, parsers)
        # p.close()
        # p.join()

        avg_rouge = Rouge.cal_avg_rouge(rouges)
        Rouge.print("InfoFilter", avg_rouge)
Example 22
def single_show_summary_dataframe( df_sub, showname, mode_dataformat = DATAFORMAT.IS_LATER ):
    df_show = df_sub[ df_sub.shows == showname ].copy( )
    assert( len( df_show ) != 0 )
    if mode_dataformat == DATAFORMAT.IS_AVI_OR_MPEG:
        return df_show
    #
    def get_ffprobe_json( filename ):
        stdout_val = subprocess.check_output(
            [ _ffprobe_exec, '-v', 'quiet', '-show_streams',
             '-show_format', '-print_format', 'json', filename ],
            stderr = subprocess.STDOUT )
        file_info = json.loads( stdout_val )
        return file_info

    def is_hevc( filename ):
        data = get_ffprobe_json( filename )
        return data['streams'][0]['codec_name'].lower( ) == 'hevc'

    with Pool( processes = min( cpu_count( ), len( df_show ) ) ) as pool:
        dict_of_episodes_hevc = dict( pool.map(
            lambda filename: ( filename, is_hevc( filename ) ),
            list( df_show.paths ) ) )
        df_show[ 'is hevc' ] = list(map(lambda filename: dict_of_episodes_hevc[ filename ],
                                        list( df_show.paths ) ) )
        return df_show
Example 23
    def direct(self):
        def browse_each(config):

            log.info(Directory.INFO_MAP[0] % (self.market.name, config.name))

            try:

                map_strs = self.PRODUCT_MAP[config.name]

                for map_str in map_strs:

                    results = self.get_products_prices(map_str)

                    for product, price in results:

                        product = Directory.check_product(product)

                        if not product.id:
                            Directory.STACK.append((config, product, price))

                        elif product.part_id:
                            price.product = product
                            Directory.set_price(price)

            except KeyError:
                log.error(Directory.ERROR_MAP[1] % config.name)

        cpu = cpu_count()
        pool = _ThreadPool(cpu)
        for c in self.configs:
            pool.apply_async(browse_each, args=(c, ))
        pool.close()
        pool.join()
Example 24
 def __init__(self,
              journal_paths_dict,
              feature_generation_instance,
              n_cores=-1):
     """
     Class constructor
     :param mypath: system path to the folder with the files to be be organized
     """
     if isinstance(journal_paths_dict, dict):
         self._mypaths = journal_paths_dict.keys()
         self._myjournals = journal_paths_dict.values()
         self._mydict = journal_paths_dict
     else:
         print 'Input must be a dictionary, with the keys being the journal and the values being the respective ' \
               'paths'
         raise ValueError
     self._feat_gen = feature_generation_instance
     if n_cores == 1:
         self._n_cores = 1
         self._pool = None
     elif n_cores == -1:
         self._n_cores = mp.cpu_count()
         self._pool = mp.ProcessingPool(self._n_cores)
     else:
         self._n_cores = n_cores
         self._pool = mp.ProcessingPool(self._n_cores)
Example 25
def DomainDistances(Ref_Coords, PDB_Coords, RefReg2, Reg2, Data, parm, output):

    # Input_Coords = [pdb_name, H_Crds, N_Crds, C_Crds, G_Crds, R_Crds, T_Crds]
    #     x_Coords = [resname, resid, bb_crds, ca_crd, cg_crd, avg_crd, cb_crd]

    print(
        '##################################################################\n')

    # Create distance object for MPI
    Ref = CalculateDist([Ref_Coords, RefReg2])

    if parm['MPICPU'][0] == 1:
        Tmp = [
            CalculateDist([Tgt, Reg2[idx]])
            for idx, Tgt in enumerate(PDB_Coords)
        ]
    else:
        if parm['MPICPU'][0] == 0:
            mpi_cpu = multiprocessing.cpu_count()
        else:
            mpi_cpu = parm['MPICPU'][0]
        mpi = multiprocessing.Pool(mpi_cpu)
        Tmp = [
            x
            for x in tqdm(mpi.imap(CalculateDist, list(zip(PDB_Coords, Reg2))),
                          total=len(Reg2))
        ]
        mpi.close()
        mpi.join()

    Tgt_List = [x for x in Tmp if x is not None]
    print('\n ## Domain Distances return: {0}\n'.format(len(Tgt_List)))
    CollectDomain(Ref, Tgt_List, Data)
Example 26
def map_list_in_chunks(l, f, extra_data):
    '''
    A wrapper around ProcessPool.uimap that processes a list in chunks.
    Differs from `map_list_as_chunks` in that this method calls `f` once for each item in `l`.

    uimap already chunks but if you have extra data to pass in it will pickle
    it for every item. This function passes in the extra data to each chunk
    which significantly saves on pickling.
    https://stackoverflow.com/questions/53604048/iterating-the-results-of-a-multiprocessing-list-is-consuming-large-amounts-of-me

    Parameters
    ----------
    l : list
      the list
    f : function
      the function to process each item
      takes two parameters: item, extra_data
    extra_data : object
      the extra data to pass to each f
    '''
    cpus = cpu_count()
    chunk_length = max(1, int(len(l) / cpus))
    chunks = [l[x:x + chunk_length] for x in range(0, len(l), chunk_length)]
    pool = Pool(nodes=cpus)
    f_dumps = cloudpickle.dumps(f)
    tuples = [(chunk, f_dumps, extra_data) for chunk in chunks]
    mapped_chunks = pool.map(_process_chunk, tuples)
    return (item for chunk in mapped_chunks for item in chunk)
Example 27
def relative_flammability( firescar_list, output_filename, ncores=None, mask_arr=None, mask_value=None, crs=None ):
	'''
	run relative flammability.
	Arguments:
		firescar_list = [list] string paths to all GeoTiff FireScar outputs to be processed
		output_filename = [str] path to output relative flammability filename to be generated. 
						* only GTiff supported. *
		ncores = [int] number of cores to use if None multiprocessing.cpu_count() used.
		mask_arr = [numpy.ndarray] numpy ndarray with dimensions matching the rasters' arrays
					listed in firescar_list and masked where 1=dontmask 0=mask (this is opposite
					numpy mask behavior, but follows common GIS patterns ) * THIS MAY CHANGE. *
		mask_value = [numeric] single numeric value determining the value of the newly masked out
					regions. If None, the nodata value from the firescar outputs will be used and 
					if this is an invalid value, it will be set to -9999.
		crs=[dict] rasterio-compatible crs dict object i.e.: {'init':'epsg:3338'}
	
	Returns:
		output_filename, with the side effect of the relative flammability raster being written to 
		disk in that location.
	'''
	tmp_rst = rasterio.open( firescar_list[0] )

	if ncores == None:
		ncores = multiprocessing.cpu_count() - 1

	out = sum_firescars( firescar_list, ncores=ncores )

	# calculate the relative flammability -- and fill in the mask with -9999
	relative_flammability = ( out.astype( np.float32 ) / len( firescar_list ) ).filled()

	if mask_value == None:
		mask_value = tmp_rst.nodata
		if mask_value == None or mask_value == '':
			print( 'setting mask_value to -9999')
			mask_value = -9999

	if mask_arr:
		relative_flammability[ mask_arr == 0 ] = mask_value

	meta = tmp_rst.meta
	# pop out transform to overcome warning
	if 'transform' in meta.keys():
		_ = meta.pop( 'transform' )

	meta.update( compress='lzw', count=1, dtype='float32', nodata=mask_value )

	if crs:
		meta.update( crs=crs )
	
	try:
		dirname = os.path.dirname( output_filename )
		if not os.path.exists( dirname ):
			os.makedirs( dirname )
	except:
		pass

	with rasterio.open( output_filename, 'w', **meta ) as out_rst:
		out_rst.write( np.around( relative_flammability, 4 ), 1 )

	return output_filename
Example 28
    def __init__(self,
                 directory=os.getcwd(),
                 model_file=os.path.join(os.getcwd(), 'model'),
                 classifierType='gradientboosting',
                 verbose=False,
                 num_threads=mp.cpu_count()):
        self.directory = directory
        self.model_file = model_file
        self.classifierType = classifierType
        self.verbose = verbose
        self.num_threads = num_threads

        try:
            db = MySQLdb.connect(host=host,
                                 user=user,
                                 passwd=passwd,
                                 db=database)
        except _mysql_exceptions.OperationalError, e:
            if e[0] != 1049:
                raise
            else:
                with MySQLdb.connect(host=host,
                                     user=user,
                                     passwd=passwd,
                                     db='') as cur:
                    cur.execute("CREATE DATABASE %s;" % database)
Example 29
def bootstrap(data, fun, n_resamples=10000, alpha=0.05):
    """Compute confidence interval for values of function fun

    Parameters
    ==========
    data: list of arguments to fun

    """
    assert isinstance(data, list)
    n_samples = len(data[0])
    idx = np.random.randint(0, n_samples, (n_resamples, n_samples))

    def select(data, sample):
        return [d[sample] for d in data]

    def evaluate(sample):
        return fun(*select(data, sample))

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    values = pool.map(evaluate, idx)
    pool.terminate()

    idx = idx[np.argsort(values, axis=0, kind='mergesort')]
    values = np.sort(values, axis=0, kind='mergesort')

    stat = namedtuple('stat', ['value', 'index'])
    low = stat(value=values[int((alpha/2.0)*n_resamples)],
               index=idx[int((alpha/2.0)*n_resamples)])
    high = stat(value=values[int((1-alpha/2.0)*n_resamples)],
                index=idx[int((1-alpha/2.0)*n_resamples)])

    return low, high
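
A usage sketch for the function above: a 95% confidence interval for the mean of a single array. Note that evaluate is a nested function, so the plain multiprocessing.Pool call only pickles it on fork-based platforms (e.g. Linux); the numbers are toy data:

import numpy as np

x = np.random.normal(loc=5.0, scale=2.0, size=500)   # 500 draws from N(5, 2)
low, high = bootstrap([x], np.mean, n_resamples=2000, alpha=0.05)
print(low.value, high.value)   # roughly 4.8 .. 5.2 for this sample size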
Example 30
    def encrypt_dataset_parallel(self, data_set):
        self.parallel = 1
        for data_item in data_set:
            if len(data_item) != self.vector_length:
                raise ValueError("Improper Vector Size")
        self.enc_data = {}

        processes = cpu_count()
        data_set_split = []
        data_set_len = len(data_set)
        self.num_records = data_set_len
        for j in range(processes):
            start = ceil(j * data_set_len / processes)
            end = ceil((j + 1) * data_set_len / processes)
            if end > data_set_len:
                end = data_set_len
            data_set_split.append((start, end, data_set[start:end]))
        total_data_size = 0
        (matrix_str, generator_bytes) = self.serialize_key()
        with Pool(processes) as p:
            with concurrent.futures.ProcessPoolExecutor(processes) as executor:
                future_list = {executor.submit(self.augment_encrypt, self.vector_length, self.predicate_scheme,
                                               self.group_name, matrix_str, generator_bytes,
                                               self.public_parameters, data_set_component, start, end)
                               for (start, end, data_set_component) in data_set_split
                               }
                for future in concurrent.futures.as_completed(future_list):
                    res = future.result()
                    if res is not None:
                        total_data_size = total_data_size + res
        self.enc_data_size = total_data_size
Example 31
    def createPNGPicObjects(cls, pImgClient):
        """
        :param PlexIMGClient pImgClient: the :py:class:`PlexIMGClient <nprstuff.npremail.email_imgur.NPRStuffIMGClient>` used to access and manipulate (add, delete, rename) images in the main Imgur_ album.
        :returns: a :py:class:`list` of :py:class:`PNGPicObject <nprstuff.npremail.PNGPicObject>` representing the images in the main Imgur_ album.
        :rtype: list
        """
        pngPICObjects = []

        def _create_object(imgMD5):
            imgName, imgID, imgurlLink, imgDateTime = pImgClient.imghashes[
                imgMD5]
            try:
                newObj = PNGPicObject(
                    {
                        'initialization': 'SERVER',
                        'imgurlLink': imgurlLink,
                        'imgName': imgName,
                        'imgMD5': imgMD5,
                        'imgDateTime': imgDateTime
                    }, pImgClient)
                return newObj
            except:
                return None

        with multiprocessing.Pool(
                processes=multiprocessing.cpu_count()) as pool:
            #
            ## doesn't work with multiprocessing for some reason...
            pngPICObjects = list(
                filter(None, map(_create_object, pImgClient.imghashes)))
            return pngPICObjects
Example 32
def calculatePearsonCorrelationMatrixMultiprocessing(matrix, axis=0, symmetrical=True, getpvalmat=False):

    if axis == 1:
        matrix = matrix.T

    nRows = matrix.shape[0]

    # create shared array that can be used from multiple processes
    output_r_arr = Array(ctypes.c_double, matrix.shape[0] * matrix.shape[0])
    # then in each new process create a new numpy array using:
    output_r = np.frombuffer(output_r_arr.get_obj())  # mp_arr and arr share the same memory
    # make it two-dimensional
    output_r = output_r.reshape((matrix.shape[0], matrix.shape[0]))  # b and arr share the same memory
    # output_r = np.zeros((nRows,nRows))  # old version

    output_p_arr = Array(ctypes.c_double, matrix.shape[0] * matrix.shape[0])
    output_p = np.frombuffer(output_p_arr.get_obj())
    output_p = output_p.reshape((matrix.shape[0], matrix.shape[0]))

    print 'Calculating Pearson R for each row, multithreaded'
    print mp.cpu_count(), 'processes in pool'

    pool = None
    try:
        pool = mp.Pool(mp.cpu_count(),
                       initializer=_init_pool,
                       initargs=(matrix, output_r_arr, output_p_arr,
                                 nRows, symmetrical))

        # bar = tqdm(total=nRows*nRows/2)
        # tqdm.write('Calculating Pearson R for each row, multithreaded')
        for result in tqdm(pool.imap_unordered(_f, range(0, nRows)), total=nRows):
            # bar.update(result)
            pass
        # bar.close()
    finally:  # To make sure processes are closed in the end, even if errors happen
        pool.close()
        pool.join()

    print output_r

    if getpvalmat:
        return output_r, output_p
    else:
        return output_r
Example 33
 def _check_threads(self, message_end=None):
     """
     Checks number of threads
     :param message_end: closing part of the error message.
     """
     if PATHOS_FOUND:
         threads = AbinsModules.AbinsParameters.threads
         if not (isinstance(threads, six.integer_types) and 1 <= threads <= mp.cpu_count()):
             raise RuntimeError("Invalid number of threads for parallelisation over atoms" + message_end)
Example 34
    def _calc_num_threads(self, df_size: int, query_size: int) -> int:
        num_queries = df_size * query_size

        if mp.cpu_count() == 1:
            max = 1
        else:
            max = mp.cpu_count() - 1

        calc = int(num_queries / 5000)
        if calc > max:
            r = max
        elif calc <= 1:
            if num_queries > 1000:
                r = 2
            else:
                r = 1
        else:
            r = calc

        return r
Example 35
def parallelmap(func, data, nodes = None):
    """
    Return the averaged signal and background (based on blank frames) over the given runs
    """
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, data)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
Example 36
File: utils.py Project: hoidn/utils
def parallelmap(func, lst, nodes = None):
    """
    Return the averaged signal and background (based on blank frames) over the given runs using
    multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
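
Because pathos serialises with dill, the mapped function does not have to be a module-level def; a small usage sketch with arbitrary demo values:

squares = parallelmap(lambda x: x * x, list(range(10)), nodes=2)
print(squares)   # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]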
Example 37
def make_predictions(conf,shot_list,loader):
    loader.set_inference_mode(True)

    use_cores = max(1,mp.cpu_count()-2)

    if backend == 'tf' or backend == 'tensorflow':
        first_time = "tensorflow" not in sys.modules
        if first_time:
                import tensorflow as tf
                os.environ['KERAS_BACKEND'] = 'tensorflow'
                from keras.backend.tensorflow_backend import set_session
                config = tf.ConfigProto(device_count={"CPU":use_cores})
                set_session(tf.Session(config=config))
    else:
        os.environ['THEANO_FLAGS'] = 'device=cpu'
        import theano

    from plasma.models.builder import ModelBuilder
    specific_builder = ModelBuilder(conf) 

    y_prime = []
    y_gold = []
    disruptive = []

    model = specific_builder.build_model(True)
    model.compile(optimizer=optimizer_class(),loss=conf['data']['target'].loss)

    specific_builder.load_model_weights(model)
    model_save_path = specific_builder.get_latest_save_path()

    start_time = time.time()
    pool = mp.Pool(use_cores)
    fn = partial(make_single_prediction,builder=specific_builder,loader=loader,model_save_path=model_save_path)

    print('running in parallel on {} processes'.format(pool._processes))
    for (i,(y_p,y,is_disruptive)) in enumerate(pool.imap(fn,shot_list)):
        print('Shot {}/{}'.format(i,len(shot_list)))
        sys.stdout.flush()
        y_prime.append(y_p)
        y_gold.append(y)
        disruptive.append(is_disruptive)
    pool.close()
    pool.join()
    print('Finished Predictions in {} seconds'.format(time.time()-start_time))
    loader.set_inference_mode(False)
    return y_prime,y_gold,disruptive
Example 38
    def __init__(self, logger, directory, urls, max_cores = 0):
        self.logger = logger # default logger
        self.directory = directory # working directory
        self.urls = urls # service URLs
        self.results = [] # parallel execution results
        self.returnVal = {} # final execution results
        self.task_list =[] # parallel task list

        #self.clearCommonParams()
        # user defined parameters
        self.common_params = None
        self.method_params = None
        self.num_threads = None
        self.num_jobs = None
	#self.callback_url = os.environ['SDK_CALLBACK_URL']
        # default parameters
        self.num_cores = mp.cpu_count()
        self.max_cores = max_cores
Example 39
def bootstrap(data, fun, n_resamples=10000, alpha=0.05):
  """
    Compute confidence interval for values of function fun on data

    Args:
        data : list of numpy arrays
            Each numpy array will be subsampled and then passed to fun.

        fun :
            Function taking len(data) numpy arrays -> float

        n_resamples : int, 10000 is default
            Number of times to resample data to produce intervals.

        alpha : float, 0.05 is default
            Confidence level parameter for confidence intervals.

    """
  assert isinstance(data, list)
  n_samples = len(data[0])
  idx = np.random.randint(0, n_samples, (n_resamples, n_samples))

  def select(data, sample):
    return [d[sample] for d in data]

  def evaluate(sample):
    return fun(*select(data, sample))

  pool = multiprocessing.Pool(multiprocessing.cpu_count())
  values = pool.map(evaluate, idx)
  pool.terminate()

  idx = idx[np.argsort(values, axis=0, kind='mergesort')]
  values = np.sort(values, axis=0, kind='mergesort')

  stat = namedtuple('stat', ['value', 'index'])
  low = stat(
      value=values[int((alpha / 2.0) * n_resamples)],
      index=idx[int((alpha / 2.0) * n_resamples)])
  high = stat(
      value=values[int((1 - alpha / 2.0) * n_resamples)],
      index=idx[int((1 - alpha / 2.0) * n_resamples)])

  return low, high
Example 40
    def train_on_files(self,shot_files,use_shots,all_machines):
        conf = self.conf
        all_signals = conf['paths']['all_signals'] 
        shot_list = ShotList()
        shot_list.load_from_shot_list_files_objects(shot_files,all_signals)
        shot_list_picked = shot_list.random_sublist(use_shots) 

        previously_saved,machines_saved = self.previously_saved_stats()
        machines_to_compute = all_machines - machines_saved
        recompute = conf['data']['recompute_normalization']
        if recompute:
            machines_to_compute = all_machines
            previously_saved = False

        if not previously_saved or len(machines_to_compute) > 0:
            if previously_saved:
                self.load_stats()
            print('computing normalization for machines {}'.format(machines_to_compute))
            use_cores = max(1,mp.cpu_count()-2)
            pool = mp.Pool(use_cores)
            print('running in parallel on {} processes'.format(pool._processes))
            start_time = time.time()

            for (i,stats) in enumerate(pool.imap_unordered(self.train_on_single_shot,shot_list_picked)):
            #for (i,stats) in enumerate(map(self.train_on_single_shot,shot_list_picked)):
                if stats.machine in machines_to_compute:
                    self.incorporate_stats(stats)
                    self.machines.add(stats.machine)
                sys.stdout.write('\r' + '{}/{}'.format(i,len(shot_list_picked)))

            pool.close()
            pool.join()
            print('Finished Training Normalizer on {} files in {} seconds'.format(len(shot_list_picked),time.time()-start_time))
            self.save_stats()
        else:
            self.load_stats()
        print(self)
Example 41
    def calculate_centroids(self, p=None):
        """
        Perform integration to find centroid at all turns up to N. Multiprocessing pool used to calculate independent
        turn values.
        Will automatically use `integrate_first_order` or `integrate_second_order` if appropriate.
        Args:
            p: Specify number of processes for pool. If not given then `cpu_count` is used.

        Returns:
            array of floats
        """
        if p:
            pool_size = p
        else:
            pool_size = cpu_count()

        pool = Pool(pool_size)

        #  attempt to speed things up by spreading out difficult integration values at the end of range
        #  appeared to not work
        #     x = []
        #     for i in range(cpu_count()):
        #         x += range(N)[i::4]

        if len(self.mu) == 1:
            integration_function = self.integrate_first_order
        elif len(self.mu) == 2:
            integration_function = self.integrate_second_order
        else:
            integration_function = self.integrate_any_order

        x = range(self.N)
        results = pool.map(integration_function, x)
        pool.close()

        return results
Example 42
 def __init__(self,process):
     self.size = mp.cpu_count()
     self.process = process
     self.phase=None
     self.pool = mp.ProcessingPool(mp.cpu_count())
Example 43
def populate(centres, masses, halomodel=None, profile=None, hodmod=None, edges=None):
    """
    Populate a series of DM halos with galaxies given a HOD model.

    Parameters
    ----------
    centres : (N,3)-array
        The cartesian co-ordinates of the centres of the halos

    masses : array_like
        The masses (in M_sun/h) of the halos

    halomodel : type :class:`halomod.HaloModel`
        A HaloModel object pre-instantiated. One can either use this, or
        *both* `profile` and `hodmod` arguments.

    profile : type :class:`profile.Profile`
        A density profile to use.

    hodmod : object of type :class:`hod.HOD`
        A HOD model to use to populate the dark matter.

    edges : float, len(2) iterable, or (2,3)-array
        Periodic box edges. If float, defines the upper limit of cube, with lower limit at zero.
        If len(2) iterable, defines edges of cube.
        If (2,3)-array, specifies edges of arbitrary rectangular prism.

    Returns
    -------
    pos : array
        (N,3)-array of positions of galaxies.

    halo : array
        (N)-array of associated haloes (by index)

    H : int
        Number of central galaxies. The first H galaxies in pos/halo correspond to centrals.
    """
    if halomodel is not None:
        profile = halomodel.profile
        hodmod = halomodel.hod

    masses = np.array(masses)

    # Define which halos have central galaxies.
    cgal = np.random.binomial(1, hodmod.nc(masses))
    cmask = cgal > 0
    central_halos = np.arange(len(masses))[cmask]

    if hodmod._central:
        masses = masses[cmask]
        centres = centres[cmask]

    # Calculate the number of satellite galaxies in halos
    # Using _ns, rather than ns, gives the correct answer for both central condition and not.
    # Note that other parts of the algorithm also need to be changed if central condition is not true.
    # if hodmod._central:
    #     sgal = poisson.rvs(hodmod._ns(masses[cmask]))
    # else:
    sgal = poisson.rvs(hodmod._ns(masses))

    # Get an array ready, hopefully speeds things up a bit
    ncen = np.sum(cgal)
    nsat = np.sum(sgal)

    pos = np.empty((ncen + nsat, 3))
    halo = np.empty(ncen+nsat)

    # Assign central galaxy positions
    halo[:ncen] = central_halos
    if hodmod._central:
        pos[:ncen, :] = centres
    else:
        pos[:ncen, :] = centres[cmask]


    smask = sgal > 0
    # if hodmod._central:
    #     sat_halos = central_halos[np.arange(len(masses[cmask]))[smask]]
    # else:
    if hodmod._central:
        sat_halos = central_halos[np.arange(len(masses))[smask]]
    else:
        sat_halos = np.arange(len(masses))[smask]

    sgal = sgal[smask]
    centres = centres[smask]
    masses = masses[smask]

    # Now go through each halo and calculate galaxy positions
    start = time.time()
    halo[ncen:] = np.repeat(sat_halos,sgal)
    indx = np.concatenate(([0],np.cumsum(sgal))) + ncen

#    print "SMASHING THIS NOW"
    def fill_array(i):
        m,n,ctr = masses[i], sgal[i],centres[i]
        pos[indx[i]:indx[i+1],:] = profile.populate(n, m, ba=1, ca=1, centre=ctr)

    if HAVE_POOL:
        mp.ProcessingPool(mp.cpu_count()).map(fill_array,range(len(masses)))
    else:
        for i in range(len(masses)):
            fill_array(i)

    nhalos_with_gal = len(set(central_halos.tolist()+sat_halos.tolist()))

    print "Took ", time.time() - start, " seconds, or ", (time.time() - start)/nhalos_with_gal, " each halo."
    print "NhalosWithGal: ", nhalos_with_gal, ", Ncentrals: ", ncen,", NumGal: ", len(halo), ", MeanGal: ", float(
        len(halo))/nhalos_with_gal, ", MostGal: ", sgal.max() + 1 if len(sgal)>0 else 1

    if edges is None:
        pass
    elif np.isscalar(edges):
        edges = np.array([[0, 0, 0], [edges, edges, edges]])
    elif np.array(edges).shape == (2,):
        edges = np.array([[edges[0]]*3, [edges[1]]*3])

    if edges is not None:
        for j in range(3):
            d = pos[:, j] - edges[0][j]
            pos[d < 0, j] = edges[1][j] + d[d < 0]
            d = pos[:, j] - edges[1][j]
            pos[d > 0, j] = edges[0][j] + d[d > 0]

    return pos, halo.astype("int"), ncen
Example 44
# parsing configuration file to import dir of cross-corr results
from seissuite.ant import (pscrosscorr, psutils)

process_type = "serial"

multiprocess = True
#import multiprocessing as mp
if multiprocess:
    
    try:
        import pathos.multiprocessing as mp
    except:
        import multiprocessing as mp
    
    no_of_cores = int(mp.cpu_count())

    process_type = "parallel"



# import CONFIG class initalised in ./configs/tmp_config.pickle
config_pickle = 'configs/tmp_config.pickle'
f = open(name=config_pickle, mode='rb')
CONFIG = pickle.load(f)
f.close()
    
# import variables from initialised CONFIG class.
MSEED_DIR = CONFIG.MSEED_DIR
DATABASE_DIR = CONFIG.DATABASE_DIR
DATALESS_DIR = CONFIG.DATALESS_DIR
Example 45
 def __init__(self, source, config, processes=cpu_count()):
     self._pool = Pool(processes)
     self._results = self._pool.imap(_merge_records, izip(source, repeat(config)))
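
The idiom here is pairing every record with one shared config object via repeat, so each pool task receives a (record, config) tuple. A Python 3 sketch of the same pattern (zip instead of izip; _merge is a placeholder for _merge_records):

from itertools import repeat
from multiprocessing import Pool

def _merge(args):
    record, config = args              # each task gets (record, shared config)
    return dict(record, **config)

if __name__ == '__main__':
    source = [{'id': 1}, {'id': 2}]
    config = {'origin': 'demo'}
    with Pool(2) as pool:
        merged = list(pool.imap(_merge, zip(source, repeat(config))))
    print(merged)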
Example 46

def genseq(idx):

    first = np.where(np.random.multinomial(1,pvals=pops)==1)[0][0]
    last = first
    last_ts = datetime.now()
    result = {'artist_idx':[first],'ts':[last_ts]}
    for i in xrange(seq_length-1):
        next_listen = draw(last)
        last = next_listen
        gap_bin = 120*np.where(np.random.multinomial(1,pvals=td)==1)[0][0]
        gap = np.random.randint(gap_bin,gap_bin+120)
        result['artist_idx'].append(next_listen)
        new_ts = last_ts+timedelta(0,gap)
        result['ts'].append(new_ts)
        last_ts = new_ts

    df = pd.DataFrame(result)
    df['block'] = ((df['artist_idx'].shift(1) != df['artist_idx']).astype(int).cumsum())-1
    df.to_pickle(str(idx)+'.pkl')
    logging.info('idx {} complete'.format(idx))

pool = Pool(cpu_count())
indices = range(n)
pool.map(genseq,indices)
pool.close()



Example 47
#        if os.path.isfile(filename):
#            data = _merge(filename, data)

        if data:
            dill_save_obj(data, filename)


def load_finland_ids():
    with open('finland_ids.csv', 'r') as f:
        lines = f.readlines()
    return map(lambda l: l.split(',')[0].strip(), lines)

if __name__ == '__main__':

    chunk_size = pmp.cpu_count() - 2
    id_list = load_finland_ids()
    start_time = datetime(2013,11,1)
    end_time = datetime(2013,11,02)

    id_list_group = (lambda l, s=chunk_size: [l[i:i+s] for i in range(0, len(l), s)])(id_list)

    print "chuck size : " + str(chunk_size) + " len(id_list_group) : " + str(len(id_list_group))

    try:
        for id_group in id_list_group:
            p = pmp.Pool(chunk_size)
            p.map(lambda fin_id: read_vtt_data(fin_id, start_time, end_time), id_group)
            p.close()
            p.join()
            p.terminate()
Example 48
    print('done reading simulation cases')

    df['sampled_std']=None
    df['mse']=None
    df['error_per']=None
    df['pearson']=None

    if 0:
        '''removing non interesting cases'''
        df=df[(df.quant_size*df.number_of_bins<15)&(df.quant_size*df.number_of_bins>6)]
        df=df[df.quant_size*df.number_of_bins>8.9]
        print('dropping non interesting cases - running only %d simulations'%df.shape[0])
        df.reset_index(drop=True,inplace=True)
    if not u.run_serial and u.number_of_splits==1:
        '''multi cores'''
        num_of_cpu = mp.cpu_count()
        if 0:
            print('disabling parallel work!!!!!')
            num_of_cpu = 1
        print('number of cpus: %d' % num_of_cpu)
        p = mp.Pool(num_of_cpu)
        args = [dict(df=df.iloc[inx], A_rows=A_rows, samples=u.samples) for inx in np.array_split(range(df.shape[0]), num_of_cpu)]
        df[['sampled_std','mse','error_per','pearson']] = pd.concat(p.map(lambda y: y['df'].apply(lambda x:pd.Series(globals()[x.method](samples=y['samples'] ,quant_size=x.quant_size, std_threshold=x.std_threshold,number_of_bins=x.number_of_bins,A_rows=y['A_rows']),index=['sampled_std','mse','error_per','pearson']),axis=1),args))
    else:
        for inx in tqdm(np.array_split(range(df.shape[0]),100)):
            df.loc[inx+df.index.values[0],['sampled_std','mse','error_per','pearson']]=df.iloc[inx].apply(lambda x:pd.Series(globals()[x.method](samples=u.samples ,quant_size=x.quant_size,number_of_bins=x.number_of_bins, std_threshold=x.std_threshold,A_rows=A_rows),index=['sampled_std','mse','error_per','pearson']),axis=1)
    print(df.head())
    df.to_csv('resutls_%08d.csv.gz'%u.number_of_split,compression='gzip')
    # df.dropna(how='any',inplace=True)
    # res=df.pivot_table(values='mse',columns=['number_of_bins','method'],index=['quant_size'],aggfunc=[np.mean,np.median])
    # res.to_csv('7.2 resutls.csv')
Example 49
 def __init__(self, source, config, single_process=False, processes=cpu_count()):
     if single_process:
         self.results = imap(transform, izip(source, repeat(config)))
     else:
         self.pool = Pool(processes)
         self.results = self.pool.imap(transform, izip(source, repeat(config)))
Example 50
print(clock)
'''
# --------------------------------------- #
#    This approach uses multiple cores    #
from pathos.multiprocessing import ProcessingPool, cpu_count

if __name__ == '__main__': # This is essential if Used on windows!
    clock = time() # starting time
    
    # creates a worker pool from the given command-line parameter. If the given
    # parameter is too large, all detectable CPUs will be utilised. If the given
    # parameter is nonsense, only 1 core will be utilized.
    workers = 1
    if len(sys.argv) >= 2 and sys.argv[1].isdigit() and int(sys.argv[1]) > 0:
        workers = cpu_count()
        if int(sys.argv[1]) <= workers:
            workers = int(sys.argv[1])
    
    print 'N:  ' + str(N)
    print 'PW: ' + str(workers)
    sleep(3) # just 3 seconds pause to read the input again.

    # All the magic happens here:
    pool = ProcessingPool(workers)
    Ys = pool.map(steadyState,y0)   

    clock = time()-clock # elapsed time
    print 'Seconds: ' + str(clock) # Not essential but useful.

    # Serilisation of results and stats:
Example 51
    def set_experiment(self, experiment, options, mstats, logbook, pset, toolbox, population,  invalid_ind, hof, START_GEN=None, POP_SIZE=None,  SEED=42):        

        if options:
            self.options = options

        if not self.re_positive_dataset:
            print "Loading datasets"
            self.load_dataset(self.options)

        if experiment:
            self.experiment = experiment


        if hof:
            self.hof = hof
        else:
            self.hof = deap.tools.ParetoFront(self.pareto_similar)

        if START_GEN:
            self.START_GEN = START_GEN
        if POP_SIZE:
            self.POP_SIZE = POP_SIZE

        self.CXPB = options.cxpb           
        self.MUTPB = options.mutpb
            

        if mstats:
            self.mstats = mstats
        else:
            stats = {}

            def lambda_factory(idx):
                return lambda ind: ind.fitness.values[idx]
                
            for tag in self.FITNESS_TAGS:
                s = deap.tools.Statistics( key=lambda_factory(
                    self.FITNESS_TAGS.index(tag) 
                ))
                stats[tag] = s
    
            #stats["Discrimination"] = deap.tools.Statistics(key=lambda ind: ind.fitness.values[0])
            #stats["Support"] = deap.tools.Statistics(key=lambda ind: ind.fitness.values[1])

            #stats_accuracy = deap.tools.Statistics(key=lambda ind: ind.fitness.values[0])
            #stats_coverage = deap.tools.Statistics(key=lambda ind: ind.fitness.values[1])

            self.mstats = deap.tools.MultiStatistics( **stats  ) #accuracy=stats_accuracy, coverage=stats_coverage )

            self.mstats.register("avg", numpy.mean, axis=0)
            self.mstats.register("std", numpy.std , axis=0)
            self.mstats.register("min", numpy.min , axis=0)
            self.mstats.register("max", numpy.max , axis=0)

        if logbook:
            self.logbook = logbook
        else:
            self.logbook = deap.tools.Logbook()
            #self.logbook.header = "gen", "evals", "hypervolume", "memoize"
            self.logbook.header = "gen", "evals", "memoize"
            self.logbook.header += tuple((self.FITNESS_TAGS)) #tuple(["accuracy","coverage"]) #

        if pset:
            self.pset = pset

        if population:
            self.population = population

        if invalid_ind:
            self.invalid_ind = invalid_ind

        #creator.create("FitnessMax", base.Fitness, weights=self.FITNESS_WEIGHTS)
        #creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax, pset=self.pset)

        if toolbox:
            self.toolbox = toolbox
        else:
            self.toolbox = deap.base.Toolbox()
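            # If multiprocessing is requested (ncpu == "auto" or > 1) and pathos was imported
            # successfully, swap the toolbox's default serial map for a pathos pool map so
            # fitness evaluations run in parallel.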
            if self.options.ncpu and ( self.options.ncpu == "auto" or int(self.options.ncpu) > 1):
                """
                Check if PATHOS was loaded
                """
                global PATHOS
                if PATHOS:
                    if self.options.ncpu == "auto":
                        n_cpu = cpu_count()
                    else:
                        n_cpu = int(self.options.ncpu)
                    print "Using pathos.multiprocessing with", n_cpu, "cpus."
                    pool = Pool(n_cpu)
                    self.toolbox.register("map", pool.map)
                else:
                    print "WARNING: Failed to load pathos; using a single processor."

            """ Basic toolbox functions"""
            self.toolbox.register("expr", deap.gp.genHalfAndHalf, pset=self.pset, type_=self.pset.ret, min_=1, max_=4)
            self.toolbox.register("individual", deap.tools.initIterate, deap.creator.Individual, self.toolbox.expr)
            self.toolbox.register("population", deap.tools.initRepeat, list, self.toolbox.individual)
            self.toolbox.register("compile", deap.gp.compile, pset=self.pset)
            
            # Set evaluation 
            if self.options.grammar == "pssm":
                self.toolbox.register("evaluate", self.eval_pssm_match)
            else:
                try:
                #self.toolbox.register("evaluate", self.memoize_pfm_match)
                #self.toolbox.register("evaluate", self.eval_regex_match)
                #self.toolbox.register("evaluate", self.memoize_regex_match)
                #self.toolbox.register("evaluate", self.memoize_python_matcher)
                    if self.options.matcher == "grep":
                        print "Loading grep matcher"
                        self.test_grep()
                        self.toolbox.register("evaluate", self.memoize_grep_match)
                    elif self.options.matcher == "python" :
                        print "Loading Python matcher"
                        self.toolbox.register("evaluate", self.memoize_python_matcher)
                    else:
                        raise Exception("Unknown matching mode" + str(self.options.matcher))
                    print "Loaded."
                except Exception as e:
                    print "Exception occured while setting matcher: ({0}): {1}".format( e.message, str(e.args))
                    print "Use --matcher=python as argument."
                    exit(-1)
                    #print "Falling back to python matcher"                    
                    #self.toolbox.register("evaluate", self.memoize_python_matcher)
                        
                    

                # Add penalties
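                # DeltaPenality wraps "evaluate": individuals that fail the feasibility test
                # receive a flat zero fitness vector instead of being evaluated.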
                for penalty_feasible in self.FITNESS_PENALTIES:
                    if penalty_feasible is not None:
                        self.toolbox.decorate("evaluate", 
                                              deap.tools.DeltaPenality(penalty_feasible, (0.0,)*len(self.FITNESS_TAGS)))
                
                #self.toolbox.register("evaluate", self.eval_pfm_match)
                #self.toolbox.register("evaluate", self.eval_pfm_match)
                #self.toolbox.register("evaluate", self.eval_precomputed_pfm_matches)
            
            #self.toolbox.register("mate", deap.gp.cxOnePoint)
            self.toolbox.register("mate", deap.gp.cxOnePointLeafBiased, termpb=0.1 )
            self.toolbox.register("expr_mut", deap.gp.genGrow, min_=1, max_=4, pset=self.pset)
            self.toolbox.register("mutate", deap.gp.mutUniform, expr=self.toolbox.expr_mut, pset=self.pset)
            #self.toolbox.register("mutate", self.multimutate, pset=self.pset)


            """ Static Limit for GP Tree """
            MAX_HEIGHT = 17 # Koza 
            #MAX_HEIGHT = 8 
            #MAX_HEIGHT = 89
            #MAX_HEIGHT = 25
            self.toolbox.decorate("mate", deap.gp.staticLimit(operator.attrgetter('height'), MAX_HEIGHT))
            self.toolbox.decorate("mutate", deap.gp.staticLimit(operator.attrgetter('height'), MAX_HEIGHT))

            """ Seeding """
            self.toolbox.register("init_seeds", self.initSeeds, deap.creator.Individual)
            self.toolbox.register("init_seeded_population", self.initSeededPopulation, list, self.toolbox.init_seeds)                                                
            
            self.population = self.toolbox.population(self.POP_SIZE)  # Random pop for algorithms that need an initial pop
            self.toolbox.register("memoizecount", lambda: self.memoize_count)

            if self.options.moo == "SPEA2":
                self.toolbox.register("select", deap.tools.selSPEA2)
            elif self.options.moo == "NSGA2":
                self.toolbox.register("select", deap.tools.selNSGA2)
            elif self.options.moo == "MOEAD":
                pass
            else:
                if self.options.moo == "ESCMA":
                    from deap import cma
                    """
                    # We start with a centroid in [-4, 4]**D
                    sigma = 2 * 10**(-2 * numpy.random.rand())
                    strategy = cma.StrategyMultiObjective(self.population, 
                                                          #centroid=numpy.random.uniform(-4, 4, len(self.population)), 
                                                          sigma=sigma, 
                                                          lambda_=len(self.population)
                                                     )

                    #self.toolbox.register("generate", strategy.generate, deap.creator.Individual)
                    self.toolbox.register("update", strategy.update)
                    for ind in self.population:
                        ind.fitness.values = self.toolbox.evaluate(ind)
                    """

                    strategy = cma.StrategyMultiObjective(self.population, sigma=1.0, mu=self.POP_SIZE, lambda_=self.POP_SIZE)
                    self.toolbox.register("generate", strategy.generate, deap.creator.Individual)
                    self.toolbox.register("update", strategy.update)
                    pass
                    
                else: #DEFAULT
                    if self.options.moo != "NSGAR":
                        print "Unknown algorithm", self.options.moo, ". Defaulting to NSGAR"
                    self.toolbox.register("preselect", nsgafortin.selTournamentFitnessDCD)
                    self.toolbox.register("select", nsgafortin.selNSGA2)
                    # Evaluate the individuals with an invalid fitness
                    self.population = self.toolbox.population(self.POP_SIZE) #Random pop
                    invalid_ind = [ind for ind in self.population if not ind.fitness.valid]
                    fitnesses = self.toolbox.map(self.toolbox.evaluate, invalid_ind)
                    for ind, fit in zip(invalid_ind, fitnesses):
                        ind.fitness.values = fit
                    self.population = self.toolbox.select(self.population, self.POP_SIZE)
                
            # Set REVCOMP
            self.REVCOMP = options.revcomp


        if not self.options.no_seed:

            seed_options = copy.deepcopy(self.options)
            seed_options.no_seed=True
            seed_options.grammar="alpha"
            #seed_options.moo="SPEA2" #FIXME There's currently a bug preventing seeds from being produced by the FORTIN selection algorithm # Nope, doesn't work at all, just fix it already.
            seedengine = Engine(None)
            #seedengine
            #seedengine.boot(self.options)
            #print self.population
            #raw_input()
            seedengine.set_experiment("regex", seed_options,None, None, None, self.toolbox, self.toolbox.population(self.POP_SIZE), None, None, 0)
            ENGINE_GEN = 50
            print "Seeded for", ENGINE_GEN," generations using grammar:", seed_options.grammar

            self_seeds, self.logbook = seedengine.run(ENGINE_GEN)
            self_seeds = [ind for ind in seedengine.hof]
            #print self_seeds
            #raw_input()
            self.population = self_seeds #TODO: Problem with small seed counts
            pass
        else:
            if len(self.initial_seeds) > 0:
                #self.initial_seeds = ["add,C,A", "C", "add,A,add,C,add,T,G" ]
                print "Seeded population initated"
                self.population = self.toolbox.init_seeded_population()                 
            elif not self.population:
                self.population = self.toolbox.population(self.POP_SIZE)
                
        print "Initial pop size:", len(self.population)