def run_N(self, nb_execution=10, loop=100, grphq=False, pas=10, duration_gif=0.5):
    """Run k-means ``nb_execution`` times and keep the best configuration.

    Each run starts from random initial centres, so outcomes differ from
    one execution to the next; the centres that produce the smallest
    classification error are stored on the instance.

    Parameters
    ----------
    nb_execution : int
        Number of independent k-means runs to perform.
    loop, grphq, pas :
        Forwarded to each individual run (same meaning as in ``run``).
    duration_gif : float
        Seconds per frame when building the GIF (only used if ``grphq``).

    Returns
    -------
    The minimal classification error found over all runs.
    """
    worker = partial(self.__k_run, loop=loop, grphq=grphq, pas=pas)
    workers = Pool(self.cpu)
    # Consume the unordered map fully before shutting the pool down.
    outcomes = list(workers.uimap(worker, range(nb_execution)))
    workers.close()
    workers.join()
    # Each outcome is (error, means); keep the run with the smallest error.
    best = int(np.argmin([outcome[0] for outcome in outcomes]))
    self.means = outcomes[best][1]
    self.calc_grp()
    if grphq:
        self.grphq.create_gif(duration=duration_gif)
    del workers
    return outcomes[best][0]
def test_pathos_pp_callable():
    """Test parallel processing with pathos: ParallelPool.

    Maps a callable object's bound method over ``inputs`` with a
    ``pathos.pools.ParallelPool``, merges the partial histograms and
    draws the result.  Returns ``None`` (skips) when pathos is not
    available or when the known DILL/ROOT/PY3 issue is present.
    """
    logger = getLogger("ostap.test_pathos_pp_callable")
    if not pathos:
        logger.error("pathos is not available")
        return
    logger.info('Test job submission with %s' % pathos)
    if DILL_PY3_issue:
        logger.warning("test is disabled (DILL/ROOT/PY3 issue)")
        return

    from pathos.helpers import cpu_count
    ncpus = cpu_count()

    from pathos.pools import ParallelPool as Pool
    pool = Pool(ncpus)
    logger.info("Pool is %s" % (type(pool).__name__))
    # pathos caches pool instances: restart to make sure workers are fresh.
    pool.restart(True)

    mh = MakeHisto()
    # Fixed: the original built the argument list with an identity
    # comprehension `[ (i, n) for (i, n) in enumerate(inputs) ]`;
    # list(enumerate(...)) yields exactly the same list of pairs.
    jobs = pool.uimap(mh.process, list(enumerate(inputs)))

    # Fold the partial histograms into one as they arrive (unordered).
    result = None
    for h in progress_bar(jobs, max_value=len(inputs)):
        if not result:
            result = h
        else:
            result.Add(h)

    pool.close()
    pool.join()
    pool.clear()

    logger.info("Histogram is %s" % result.dump(80, 10))
    logger.info("Entries %s/%s" % (result.GetEntries(), sum(inputs)))
    with wait(1), use_canvas('test_pathos_pp_callable'):
        result.draw()
    return result
def test_pathos_mp_function():
    """Test parallel processing with pathos: ProcessPool.

    Maps the free function ``make_histo`` over ``inputs`` with a
    ``pathos.pools.ProcessPool`` (managed by ``pool_context``), merges
    the partial histograms and draws the result.  Skips when pathos is
    unavailable or the DILL/ROOT/PY3 issue applies.
    """
    logger = getLogger("ostap.test_pathos_mp_function")
    if not pathos:
        logger.error("pathos is not available")
        return
    logger.info('Test job submission with %s' % pathos)
    if DILL_PY3_issue:
        logger.warning("test is disabled (DILL/ROOT/PY3 issue)")
        return

    from pathos.helpers import cpu_count
    from pathos.pools import ProcessPool as Pool

    worker_pool = Pool(cpu_count())
    logger.info("Pool is %s" % (type(worker_pool).__name__))

    total = None
    with pool_context(worker_pool):
        pending = worker_pool.uimap(make_histo, zip(count(), inputs))
        # Accumulate results in completion order.
        for histo in progress_bar(pending, max_value=len(inputs)):
            if not total:
                total = histo
            else:
                total.Add(histo)

    logger.info("Histogram is %s" % total.dump(80, 10))
    logger.info("Entries %s/%s" % (total.GetEntries(), sum(inputs)))
    with wait(1), use_canvas('test_pathos_mp_function'):
        total.draw()
    return total
class ReMap(ReIterBase):
    def __init__(self, fn, iterable_input, proc_type=None, n_proc=1, per_proc_buffer=1,
                 ordered=True, name='reMap', verbose=True):
        """
        This is a map function that can be iterated over more than once.
        Returns an iterator.

        Parameters
        ----------
        fn
            callable applied to every item of the input
        iterable_input
            iterable input
        proc_type
            if 'sub' then uses a pathos ProcessPool to map function
            if 'thread' then uses standard multiprocessing ThreadPool
            else uses regular map
        n_proc
            number of workers in a pool (ignored if no pool)
        per_proc_buffer
            since pool's map function does not know limits, there is a forced
            stop-and-yield-all after this many processed tasks per process/thread
        ordered
            use ordered map by default, uses `imap_unordered` otherwise
        name
            name to use for logging messages
        verbose
            passed through to the ReIterBase logging machinery
        """
        # Tag the logging name with the pool flavour when one is in use.
        name += '' if proc_type not in ('sub', 'proc', 'subprocess', 'th', 'thread') else ' ' + proc_type
        super().__init__(iterable_input=iterable_input, name=name, verbose=verbose)
        self.fn = fn
        self.proc_type = proc_type
        self.per_proc_buffer = per_proc_buffer
        self.n_proc = n_proc
        self.ordered = ordered

    def _iter(self):
        # Generator backing each fresh iteration pass (ReIterBase calls this
        # anew on every iteration — presumably; confirm against the base class).
        if self.proc_type in ('thread', 'th') and self.n_proc > 0:
            with ThreadPool(self.n_proc) as p:
                # this is a workaround for limiting input iterator consumption, got it from SO:
                # buffer at most per_proc_buffer * n_proc items, map that chunk,
                # yield everything, then refill — so the input iterator is never
                # drained unboundedly ahead of the consumer.
                buff = []
                for itm in self.iterable_input:
                    buff.append(itm)
                    if len(buff) >= self.per_proc_buffer * self.n_proc:
                        if self.ordered:
                            for itm in p.imap(self.fn, buff):
                                yield itm
                        else:
                            for itm in p.imap_unordered(self.fn, buff):
                                yield itm
                        buff = []
                # feed the remaining buffer after input is exhausted
                if self.ordered:
                    for itm in p.imap(self.fn, buff):
                        yield itm
                else:
                    for itm in p.imap_unordered(self.fn, buff):
                        yield itm
        elif self.proc_type in ('sub', 'proc', 'subprocess') and self.n_proc > 0:
            try:
                log.info("Trying to terminate previous pool")
                # this is stupid, but that's how pathos is built:
                # pathos caches pools, so the pool from a previous pass must be
                # terminated and cleared before a fresh one can be created.
                self.pool.terminate()
                self.pool.clear()
                log.info("Yay! Cleared previous process pool")
            except AttributeError:
                # self.pool does not exist yet — first iteration pass.
                log.warning("Is this the first time creating a pool...")
            self.pool = ProcessPool(nodes=self.n_proc)
            # this is a workaround for limiting input iterator consumption, got it from SO
            # (same chunking scheme as the thread branch above).
            buff = []
            for itm in self.iterable_input:
                buff.append(itm)
                if len(buff) >= self.per_proc_buffer * self.n_proc:
                    if self.ordered:
                        for itm in self.pool.imap(self.fn, buff):
                            yield itm
                    else:
                        # pathos spells imap_unordered as `uimap`.
                        for itm in self.pool.uimap(self.fn, buff):
                            yield itm
                    buff = []
            # feed the remaining buffer after input is exhausted
            if self.ordered:
                for itm in self.pool.imap(self.fn, buff):
                    yield itm
            else:
                for itm in self.pool.uimap(self.fn, buff):
                    yield itm
        else:
            # No pool requested (or n_proc <= 0): plain lazy map.
            for itm in map(self.fn, self.iterable_input):
                yield itm
def run_global_automated(self, grphq=False, duration_gif=0.5, pas=1, B=10, loop=100):
    """Modified implementation of ``run_global`` where the number of clusters
    is chosen automatically from gap statistics computed along the way.

    Parameters: ``grphq``, ``duration_gif``, ``pas``, ``B``, ``loop`` — same
    definitions as in ``run_global``.

    Returns: the optimal ``Kmeans`` instance (a copy of this object configured
    with the chosen number of clusters).
    """
    # Pool is created once and closed right away; it is restarted before each
    # use below (and presumably inside gap_stat_mono as well — TODO confirm).
    pool = Pool(self.cpu)
    pool.close()
    pool.join()
    # Per-feature bounds and data shape, needed for the gap-statistic
    # reference distribution.
    mini, maxi = np.min(self.data.data, axis=0), np.max(self.data.data, axis=0)
    shape = self.data.data.shape
    # Step 1: a single cluster at the data centre.
    i = 1
    self.set_nb_cluster(i)
    self.choose_means_initiate()
    self.calc_grp()
    self.choose_means()
    self.calc_grp()
    means = self.means
    if grphq:
        self.grphq.plot_graph(self.data.data, self.grp, self.means.reshape((1, -1)), 1)
        self.print_meta_data()
    gap, var = self.gap_stat_mono(self.error, i, mini, maxi, shape, pool, B)
    cond = True
    # Keep a snapshot of the best configuration seen so far.
    km_cpy = self.copy(erase_dir=False)
    print("Fin de l'étape {}".format(i))
    while cond:
        i += 1
        self.set_nb_cluster(i)
        pool.restart()
        # Try every pas-th individual as the new centre, in parallel, and keep
        # the candidate (index j) with the smallest resulting error.
        f = partial(self.__multi_j, loop=loop, means=means)
        s = pool.uimap(f, range(0, self.L, pas))
        pool.close()
        pool.join()
        s = np.array(list(s))
        arg = np.argmin(s[:, 1])
        j = int(s[arg, 0])
        means_cpy = np.vstack((means, self.data.data[j]))
        self.means = means_cpy
        # Lloyd iterations until convergence (cond_conv) or loop exhausted.
        k = 0
        backup = (None, None, -1, -1)  # sentinel: forces at least one pass
        self.calc_grp()
        while (self.cond_conv(backup)) and (k < loop):
            k += 1
            backup = self.backup_metadata()
            self.choose_means()
            # The *_true centre-update variants already recompute the groups;
            # for every other variant we must do it explicitly.
            if ((self.choose_means != self.choose_means_moy_true)
                    and (self.choose_means != self.choose_means_med_true)):
                self.calc_grp()
            # Convergence diagnostics: how many points changed group, and
            # whether the centres moved at all.
            self.migration = np.count_nonzero(
                (self.grp[:, 1] - backup[1][:, 1]))
            self.same_means = np.array_equal(self.means, backup[0])
        means = self.means
        # Gap-statistic stopping rule: stop once gap(k-1) >= gap(k) - var(k).
        gap_f, var_f = self.gap_stat_mono(self.error, i, mini, maxi, shape, pool, B)
        diff = gap - (gap_f - var_f)
        print("Gap statistical (étape {}) : {}".format(i - 1, diff))
        if grphq:
            self.grphq.plot_graph(self.data.data, self.grp, self.means, i)
            self.print_meta_data()
        print("Fin de l'étape {}".format(i))
        if diff >= 0:
            break
        else:
            gap = gap_f
            km_cpy = self.copy(erase_dir=False)
    if grphq:
        self.grphq.create_gif(duration=duration_gif)
    # NOTE(review): rebinding `self` only changes the local name — the caller's
    # instance is untouched; the optimal configuration travels via the return.
    self = km_cpy.copy(erase_dir=False)
    self.calc_grp()
    print("Le nombre optimal de classes est : {}".format(self.nb_cluster))
    del pool
    return self
def run_global(self, loop=100, grphq=False, duration_gif=0.5, pas=1,
               choose_nb_graph=False, B=10):
    """Implements the global k-means algorithm, which incrementally computes
    the optimal group configuration for a given number of clusters.

    The algorithm proceeds as follows:
    0) Set the number of clusters to 1 and compute the centre of the data
       matrix.
    1) Increment the number of clusters.  Take the centres of the previous
       step as initial centres.  Successively set each individual of the data
       matrix as the last centre, run k-means with each candidate set of
       centres and keep the set that minimises the error.
    i+1) Repeat the previous step until the requested number of groups is
       reached.

    !!! Very resource-intensive.

    Input parameters:
    loop : int, number of iterations inside each k-means computation before
        the computation is stopped, default = 100.
    grphq : bool, whether graphs should be displayed and saved.
    duration_gif : float, duration of each frame in the final GIF; unused if
        grphq = False.
    pas : int, stride between individuals tested as candidate centres.
    choose_nb_graph : bool, display a set of statistics helping to choose the
        ideal number of clusters.
    B : int, enters the computation of the statistics mentioned above.

    Output:
    err : per-step statistics array (transposed, sorted by step number) with
        rows [step, error, relative error, variance].
    """
    # Pool is created once and closed right away; it is restarted at the top
    # of every step before being used.
    pool = Pool(self.cpu)
    pool.close()
    pool.join()
    err = []
    # Target number of clusters; the incremental search runs from 1 up to n.
    n = self.nb_cluster
    # Step 1: a single cluster at the data centre.
    self.set_nb_cluster(1)
    self.choose_means_initiate()
    self.calc_grp()
    self.choose_means()
    self.calc_grp()
    means = self.means
    err.append([1, self.error, self.clustering_error_rel(), self.var])
    if grphq:
        self.grphq.plot_graph(self.data.data, self.grp, self.means.reshape((1, -1)), 1)
        self.print_meta_data()
    print("Fin de l'étape {}".format(1))
    for i in range(2, n + 1):
        self.set_nb_cluster(i)
        pool.restart()
        # Try every pas-th individual as the new centre, in parallel, and keep
        # the candidate (index j) with the smallest resulting error.
        f = partial(self.__multi_j, loop=loop, means=means)
        s = pool.uimap(f, range(0, self.L, pas))
        pool.close()
        pool.join()
        s = np.array(list(s))
        arg = np.argmin(s[:, 1])
        j = int(s[arg, 0])
        means_cpy = np.vstack((means, self.data.data[j]))
        self.means = means_cpy
        # Lloyd iterations until convergence (cond_conv) or loop exhausted.
        k = 0
        backup = (None, None, -1, -1)  # sentinel: forces at least one pass
        self.calc_grp()
        while (self.cond_conv(backup)) and (k < loop):
            k += 1
            backup = self.backup_metadata()
            self.choose_means()
            # The *_true centre-update variants already recompute the groups;
            # for every other variant we must do it explicitly.
            if ((self.choose_means != self.choose_means_moy_true)
                    and (self.choose_means != self.choose_means_med_true)):
                self.calc_grp()
            # Convergence diagnostics: how many points changed group, and
            # whether the centres moved at all.
            self.migration = np.count_nonzero(
                (self.grp[:, 1] - backup[1][:, 1]))
            self.same_means = np.array_equal(self.means, backup[0])
        means = self.means
        if grphq:
            self.grphq.plot_graph(self.data.data, self.grp, self.means, i)
            self.print_meta_data()
        err.append([i, self.error, self.clustering_error_rel(), self.var])
        print("Fin de l'étape {}".format(i))
    # Sort rows by step number, then transpose: columns become per-step stats.
    err = np.array(err)
    err = err[np.argsort(err[:, 0]), :].T
    if grphq:
        self.grphq.create_gif(duration=duration_gif)
    if choose_nb_graph:
        self.grphq.plot_crb_err_cluster(self.gap_stat(err, B))
    del pool
    return err