def compute_good_turing(self, labels, batch_size=stg.JOBLIB_BATCH_SIZE,
                        parallel=True):
    """
    Compute the Good-Turing estimator of the probability of unseen labels.

    Sets ``self.good_turing`` to ``n1 / n_samples`` where ``n1`` is the
    number of labels appearing exactly once, and updates
    ``self.good_turing_smooth`` with an exponential moving average using
    ``self.alpha`` (assumes ``self.good_turing_smooth`` was previously
    initialized).

    Parameters
    ----------
    labels : array_like
        Labels assigned to the samples.
    batch_size : int, optional
        Joblib batch size. Defaults to stg.JOBLIB_BATCH_SIZE.
    parallel : bool, optional
        Compute frequencies in parallel. Default True.
    """
    stg.logger.info("Computing Good Turing Estimator")

    n_jobs = u.get_n_processes() if parallel else 1

    stg.logger.info("Compute frequencies")

    # Frequency of each label
    freq = self.frequencies(labels, batch_size, n_jobs=n_jobs)

    # Check if there are labels appearing only once.
    # BUG FIX: the previous check `any(np.where(freq == 1)[0])` tested the
    # truthiness of the *indices* where freq == 1, so a singleton label at
    # index 0 was silently missed (any([0]) is False).
    if not np.any(freq == 1):
        stg.logger.info("No labels appearing only once")
        n1 = 0
    else:
        stg.logger.info("Compute frequencies of frequencies")
        # Frequency of frequencies; first entry counts the labels that
        # appear exactly once (frequency 1 is the smallest present here).
        freq_freq = self.frequencies(freq, batch_size=batch_size,
                                     n_jobs=n_jobs)
        n1 = freq_freq[0]

    # Good Turing estimator: fraction of samples whose label is a singleton
    self.good_turing = n1 / self.n_samples

    # Exponentially smoothed Good Turing estimator
    self.good_turing_smooth = self.alpha * n1 / self.n_samples + \
        (1 - self.alpha) * self.good_turing_smooth
def solve_parametric(self, theta,
                     batch_size=stg.JOBLIB_BATCH_SIZE,
                     parallel=True,
                     message="Solving for all theta",
                     ):
    """
    Solve one parametric problem per row of theta.

    Parameters
    ----------
    theta : DataFrame
        Parameter values, one sample per row.
    batch_size : int, optional
        Joblib batch size. Defaults to stg.JOBLIB_BATCH_SIZE.
    parallel : bool, optional
        Solve problems in parallel. Default True.
    message : str, optional
        Message to be printed on progress bar.

    Returns
    -------
    dict
        Results dictionary.
    """
    # Number of parameter samples to solve for
    n_points = len(theta)

    n_jobs = u.get_n_processes() if parallel else 1

    stg.logger.info(message + " (n_jobs = %d)" % n_jobs)

    # Fan the per-sample solves out over the joblib workers
    executor = Parallel(n_jobs=n_jobs, batch_size=batch_size)
    tasks = (delayed(self.populate_and_solve)(theta.iloc[idx])
             for idx in tqdm(range(n_points)))
    results = executor(tasks)

    return results
def encode_strategies(strategies, batch_size=stg.JOBLIB_BATCH_SIZE,
                      parallel=True):
    """
    Encode strategies with integer labels.

    Parameters
    ----------
    strategies : Strategies array
        Array of strategies to be encoded.
    batch_size : int, optional
        Joblib batch size. Defaults to stg.JOBLIB_BATCH_SIZE.
    parallel : bool, optional
        Assign samples to unique strategies in parallel. Default True.

    Returns
    -------
    numpy array
        Encodings for each strategy in strategies.
    Strategies array
        Array of unique strategies.
    """
    stg.logger.info("Encoding strategies")
    # NOTE: removed dead local `N = len(strategies)` (never used).

    stg.logger.info("Getting unique set of strategies")
    start_time = time()
    unique = unique_strategies(strategies)
    end_time = time()
    stg.logger.info("Extraction time %.3f sec" % (end_time - start_time))

    n_unique_strategies = len(unique)
    stg.logger.info("Found %d unique strategies" % n_unique_strategies)

    # Map each strategy to the index of its unique representative
    n_jobs = u.get_n_processes() if parallel else 1

    stg.logger.info("Assign samples to unique strategies (n_jobs = %d)"
                    % n_jobs)

    results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
        delayed(assign_to_unique_strategy)(s, unique)
        for s in tqdm(strategies))
    y = np.array(results)

    return y, unique
def assign_samples(self, discarded_samples, selected_strategies,
                   batch_size, parallel=True):
    """
    Assign discarded samples to the selected strategies, choosing for
    each one the strategy minimizing the cost degradation.

    Relabels ``self.y_train`` in place: labels kept in
    ``selected_strategies`` become their index in that array, discarded
    labels become -1 and are then reassigned to the best strategy.

    Parameters
    ----------
    discarded_samples : array_like
        Indices (into the training set) of the samples whose strategy
        was discarded.
    selected_strategies : numpy array
        Labels of the strategies that were kept.
    batch_size : int
        Joblib batch size.
    parallel : bool, optional
        Evaluate the samples in parallel. Default True.

    Returns
    -------
    numpy array
        Cost degradation for each discarded sample.
    """
    # Reassign y_labels:
    # - label kept in selected_strategies -> its new (compacted) index
    # - discarded label -> -1 (placeholder until reassigned below)
    self.y_train = np.array([
        np.where(selected_strategies == label)[0][0]
        if label in selected_strategies else -1
        for label in self.y_train
    ])

    degradation = np.zeros(len(discarded_samples))

    n_jobs = u.get_n_processes() if parallel else 1

    stg.logger.info("Assign samples to selected strategies (n_jobs = %d)"
                    % n_jobs)

    # BUG FIX: evaluate the discarded samples themselves
    # (X_train.iloc[sample_idx]) instead of the first
    # len(discarded_samples) rows of the training set
    # (the old code used range(len(discarded_samples)) as row indices,
    # while the write-back loop below targets discarded_samples[i]).
    results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
        delayed(best_strategy)(self.X_train.iloc[sample_idx],
                               self.obj_train[sample_idx],
                               self.encoding,
                               self.problem)
        for sample_idx in tqdm(discarded_samples))

    # Write the chosen strategy and its degradation back per sample
    for i, sample_idx in enumerate(discarded_samples):
        self.y_train[sample_idx], degradation[i] = results[i]

    return degradation
def choose_best(self, problem_data, labels, parallel=False,
                batch_size=stg.JOBLIB_BATCH_SIZE, use_cache=True):
    """
    Choose best strategy between provided ones.

    Solves the problem once per candidate strategy and picks the one with
    the best cost among the feasible candidates (infeasibility within
    stg.INFEAS_TOL), or the least infeasible one if none is feasible.

    Parameters
    ----------
    problem_data : dict
        Data of the problem instance to solve.
    labels : list
        Strategy labels to compare.
    parallel : bool, optional
        Perform `n_best` strategies evaluation in parallel.
        False by default.
    batch_size : int, optional
        Joblib batch size. Defaults to stg.JOBLIB_BATCH_SIZE.
    use_cache : bool, optional
        Use solver cache if available. True by default.

    Returns
    -------
    dict
        Results as a dictionary with keys 'x', 'time' (total over all
        candidates), 'strategy', 'cost' and 'infeasibility'.
    """
    n_best = self._learner.options['n_best']

    # NOTE: removed dead initializations (x/time/infeas/cost were
    # immediately overwritten); `time` also shadowed the time() function.
    strategies = [self.encoding[label] for label in labels]

    # One solver cache per candidate strategy (None when unavailable)
    cache = [None] * n_best
    if self._solver_cache and use_cache:
        cache = [self._solver_cache[label] for label in labels]

    n_jobs = u.get_n_processes(n_best) if parallel else 1

    results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
        delayed(self._problem.solve)(problem_data,
                                     strategy=strategies[j],
                                     cache=cache[j])
        for j in range(n_best))

    x = [r["x"] for r in results]
    solve_times = [r["time"] for r in results]
    infeas = np.array([r["infeasibility"] for r in results])
    cost = np.array([r["cost"] for r in results])

    # Candidates that are feasible within tolerance
    idx_filter = np.where(infeas <= stg.INFEAS_TOL)[0]
    if len(idx_filter) > 0:
        # Case 1: feasible points exist
        # -> pick the best-cost solution among the feasible ones
        if self._problem.sense() == Minimize:
            idx_pick = idx_filter[np.argmin(cost[idx_filter])]
        elif self._problem.sense() == Maximize:
            idx_pick = idx_filter[np.argmax(cost[idx_filter])]
        else:
            e.value_error('Objective type not understood')
    else:
        # Case 2: no feasible point
        # -> pick the solution with minimum infeasibility
        idx_pick = np.argmin(infeas)

    # Store values we are interested in
    result = {}
    result['x'] = x[idx_pick]
    result['time'] = np.sum(solve_times)
    result['strategy'] = strategies[idx_pick]
    result['cost'] = cost[idx_pick]
    result['infeasibility'] = infeas[idx_pick]

    return result
def __init__(self, **options):
    """
    Initialize OptimalTrees class.

    Parameters
    ----------
    options : dict
        Learner options as a dictionary.
    """
    if not OptimalTree.is_installed():
        e.value_error("Interpretable AI not installed")

    # Lazy imports: julia / IAI are only pulled in when this learner
    # is actually constructed
    from interpretableai import iai
    self.iai = iai
    from julia import Distributed
    self.nprocs = Distributed.nprocs

    # Define name
    self.name = stg.OPTIMAL_TREE

    # Required settings (raise KeyError if missing)
    self.n_input = options.pop('n_input')
    self.n_classes = options.pop('n_classes')

    # Optional settings with their defaults (pops kept in one place)
    self.options = {
        'hyperplanes': options.pop('hyperplanes', False),
        'parallel': options.pop('parallel_trees', True),
        'cp': options.pop('cp', None),
        'max_depth': options.pop(
            'max_depth', octstg.DEFAULT_TRAINING_PARAMS['max_depth']),
        'minbucket': options.pop(
            'minbucket', octstg.DEFAULT_TRAINING_PARAMS['minbucket']),
        # Never evaluate more candidate strategies than classes
        'n_best': min(options.pop('n_best', stg.N_BEST), self.n_classes),
        'save_svg': options.pop('save_svg', False),
        # Fraction between training and validation
        'frac_train': options.pop('frac_train', stg.FRAC_TRAIN),
    }

    # Spawn extra julia workers until the cpu count is matched
    n_cpus = get_n_processes()
    n_cur_procs = self.nprocs()
    if n_cur_procs < n_cpus and self.options['parallel']:
        Distributed.addprocs(n_cpus - n_cur_procs)

    # Translate the settings into OptimalTrees keyword arguments
    self.optimaltrees_options = {
        'random_seed': 1,
        'max_depth': self.options['max_depth'],
        'minbucket': self.options['minbucket'],
    }
    if self.options['hyperplanes']:
        self.optimaltrees_options['hyperplane_config'] = \
            {'sparsity': 'all'}
    if self.options['cp']:
        self.optimaltrees_options['cp'] = self.options['cp']