class GA_FeatureSelection():
    """Genetic algorithm that evolves one gene per UCM column.

    Each individual is a list with ``UCM.shape[1]`` genes.  During fitness
    evaluation the UCM columns are scaled by the genes, a Tversky similarity
    is computed from the scaled matrix, and the resulting recommendations are
    scored by the project ``Evaluator`` (mean track-level precision over
    categories 1 and 2).

    Args:
        UCM: sparse matrix whose columns are the features being evolved
            (must support ``.shape``, ``.copy()`` and ``.astype``).
        URM_train: matrix multiplied with the similarity to obtain the eurm.
        test_playlists_indices: array of row indices to evaluate; converted
            to an integer array.
        logFile: path of the per-generation statistics log (opened in
            append mode).
        bestIndividualFile: path of the best-individual log (append mode).
        mode: ``"selection"`` (bit-flip mutation) or ``"weighting"``
            (custom mutation).
        numGenerations: number of generations to run.
        populationSize: number of individuals per generation.
        initialRandomDistribution: zero-argument callable returning one gene.
            ``None`` (default) picks ``random.randint(0, 1)`` for
            ``"selection"`` and ``random.random()`` for ``"weighting"``.
        verbose: print timing/progress information while evaluating.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes.
        TypeError: if ``initialRandomDistribution`` is given but not callable.
    """

    _VALID_MODES = ("weighting", "selection")

    def __init__(self, UCM, URM_train, test_playlists_indices, logFile,
                 bestIndividualFile, mode="selection", numGenerations=30,
                 populationSize=30, initialRandomDistribution=None,
                 verbose=True):
        # Validate the cheap arguments first so a bad call fails before any
        # file is opened or any data is loaded.
        if mode not in self._VALID_MODES:
            raise ValueError(
                "mode must be one of %s, got %r" % (self._VALID_MODES, mode))
        if (initialRandomDistribution is not None
                and not callable(initialRandomDistribution)):
            # The previous default, np.random.uniform(0, 1), was evaluated
            # once at definition time and produced a float; a float cannot be
            # registered as a gene generator.
            raise TypeError(
                "initialRandomDistribution must be a zero-argument callable")

        self.mode = mode
        self.UCM = UCM
        self.URM_train = URM_train
        # np.int was only an alias of the builtin int and no longer exists in
        # recent NumPy releases.
        self.test_playlists_indices = test_playlists_indices.astype(int)

        self.logFile = open(logFile, "a")
        try:
            self.bestIndividualFile = open(bestIndividualFile, "a")
        except Exception:
            # Do not leak the first handle if the second open fails.
            self.logFile.close()
            raise

        self.initialRandomDistribution = initialRandomDistribution
        self.verbose = verbose

        self.top = 0
        self.current = 0

        self.evaluator = Evaluator(
            Datareader(mode='offline', only_load=True, verbose=False))

        self.NUM_VARIABLES = UCM.shape[1]

        # Crossover probability
        self.CXPB = 0.5
        # Mutation probability
        self.MUTPB = 0.2
        # Number of generations for which the evolution runs
        self.NGEN = numGenerations
        self.POPULATION_SIZE = populationSize

    def close(self):
        """Close both log files; safe to call more than once."""
        for handle in (self.logFile, self.bestIndividualFile):
            if not handle.closed:
                handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def writeOnLogFile(self, stringToLog):
        """Append ``stringToLog`` plus a newline to the log file and flush."""
        self.logFile.write(stringToLog + "\n")
        self.logFile.flush()

    def writeOnBestIndividualFile(self, stringToLog):
        """Append ``stringToLog`` plus a newline to the best-individual file and flush."""
        self.bestIndividualFile.write(stringToLog + "\n")
        self.bestIndividualFile.flush()

    def isIndividualAccettable(self, individual):
        """Return whether the genes of ``individual`` sum to less than 10000.

        This caps the number (or total weight) of active features.  It is not
        registered on the toolbox by ``setupParameters``; it is available as
        a feasibility predicate for a penalty decorator.
        """
        return np.sum(np.array(individual)) < 10000

    def _default_gene_generator(self):
        """Return the zero-argument callable used to draw one initial gene."""
        if self.initialRandomDistribution is not None:
            return self.initialRandomDistribution
        if self.mode == "selection":
            # 0 or 1 with equal probability
            return lambda: random.randint(0, 1)
        # Uniform float in [0, 1)
        return random.random

    def fitnessFunction(self, individual):
        """Score one individual.

        Args:
            individual: sequence of ``NUM_VARIABLES`` genes.

        Returns:
            A one-element tuple ``(score,)`` where ``score`` is the mean of
            the category-1 and category-2 track precision reported by the
            evaluator.
        """
        genes = np.array(individual)

        # Work on a float copy so the stored UCM is never modified.
        if self.verbose:
            print('Filtering UCM...')
            start = time.time()
        UCM_filtered = self.UCM.copy().astype(np.float64)
        inplace_csr_column_scale(UCM_filtered, genes)
        if self.verbose:
            print('UCM filtered in', time.time() - start, 'sec')

        # Compute similarity
        if self.verbose:
            print('Computing similarity...')
            start = time.time()
        similarity = tversky_similarity(
            UCM_filtered, shrink=200, alpha=0.1, beta=1,
            target_items=self.test_playlists_indices, binary=False)
        similarity = similarity.tocsr()
        if self.verbose:
            print('Similarity computed in', time.time() - start, 'sec')

        # Compute eurm
        if self.verbose:
            print('Computing eurm...')
            start = time.time()
        eurm = dot_product(similarity, self.URM_train, k=500)
        if self.verbose:
            print('eurm computed in', time.time() - start, 'sec')
            print('Converting eurm in csr...')
            start = time.time()
        eurm = eurm.tocsr()
        eurm = eurm[self.test_playlists_indices, :]
        if self.verbose:
            print('eurm converted in', time.time() - start, 'sec')

        # Evaluate
        rec_list = eurm_to_recommendation_list(eurm)
        print('current', self.current)
        category_scores = [
            self.evaluator.evaluate_single_metric(
                rec_list, name='Genetic', metric='prec', level='track',
                cat=category, verbose=False)
            for category in (1, 2)
        ]
        score = (category_scores[0] + category_scores[1]) / 2
        self.current += 1

        if self.verbose:
            print(score)
            print("Numfeatures {}".format(np.sum(genes)))
            print('\n')

        # DEAP expects fitness values as a tuple, one entry per objective.
        return score,

    def setupParameters(self):
        """Create the DEAP fitness/individual types and register all operators."""
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        self.toolbox = base.Toolbox()

        # Attribute generator: one call produces one gene.  An individual is
        # NUM_VARIABLES genes, i.e. one per UCM column.
        gene_generator = self._default_gene_generator()
        gene_name = "attr_float" if self.mode == "weighting" else "attr_bool"
        self.toolbox.register(gene_name, gene_generator)
        self.toolbox.register(
            "individual", tools.initRepeat, creator.Individual,
            getattr(self.toolbox, gene_name), self.NUM_VARIABLES)

        # The population is a list of individuals
        self.toolbox.register(
            "population", tools.initRepeat, list, self.toolbox.individual)

        # ----------
        # Operator registration
        # ----------
        # Goal / fitness function
        self.toolbox.register("evaluate", self.fitnessFunction)

        # Crossover operator
        self.toolbox.register("mate", tools.cxTwoPoint)

        # Mutation operator; each gene is mutated with probability 0.05
        if self.mode == "weighting":
            self.toolbox.register("mutate", randomMutationCustom, indpb=0.05)
        elif self.mode == "selection":
            self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

        # Selection: each slot of the next generation is filled with the
        # fittest of three individuals drawn at random from the current one.
        self.toolbox.register("select", tools.selTournament, tournsize=3)

    def main(self):
        """Run the evolution for ``NGEN`` generations, logging each one."""
        self.start_time = time.time()

        random.seed(64)

        self.setupParameters()

        self.writeOnLogFile(time.strftime("%Y-%m-%d %H:%M") + "\n")
        self.writeOnBestIndividualFile(time.strftime("%Y-%m-%d %H:%M") + "\n")

        # Initial population of POPULATION_SIZE individuals
        pop = self.toolbox.population(n=self.POPULATION_SIZE)

        print("Start of evolution")

        self.current = 0

        # Evaluate the entire population
        for ind in pop:
            ind.fitness.values = self.toolbox.evaluate(ind)

        print(" Evaluated %i individuals" % len(pop))

        # Begin the evolution
        for g in range(self.NGEN):
            print("-- Generation %i --" % g)
            self.writeOnLogFile("-- Generation %i --" % g)

            # Select the next generation and clone it so that the operators
            # below never modify individuals still referenced by `pop`.
            offspring = self.toolbox.select(pop, len(pop))
            offspring = [self.toolbox.clone(ind) for ind in offspring]

            # Crossover on consecutive pairs with probability CXPB
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < self.CXPB:
                    self.toolbox.mate(child1, child2)
                    # Fitness of the children must be recalculated later
                    del child1.fitness.values
                    del child2.fitness.values

            # Mutation with probability MUTPB
            for mutant in offspring:
                if random.random() < self.MUTPB:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate only the individuals whose fitness was invalidated
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            for ind in invalid_ind:
                ind.fitness.values = self.toolbox.evaluate(ind)

            print(" Evaluated %i individuals" % len(invalid_ind))

            # The population is entirely replaced by the offspring
            pop[:] = offspring

            # Gather all the fitnesses in one list and compute the stats
            fits = [ind.fitness.values[0] for ind in pop]

            length = len(pop)
            mean = sum(fits) / length
            sum2 = sum(x * x for x in fits)
            # abs() guards against a tiny negative variance from rounding
            std = abs(sum2 / length - mean ** 2) ** 0.5
            lowest = min(fits)
            highest = max(fits)

            # Update top value
            if highest > self.top:
                self.top = highest

            # Write on log file
            best_ind = tools.selBest(pop, 1)[0]
            self.writeOnBestIndividualFile(
                'GEN ' + str(g) + ' | ' + str(self.top))
            self.writeOnBestIndividualFile("%s" % str(best_ind) + '\n')

            print(" Top %s" % self.top)
            print(" Min %s" % lowest)
            print(" Max %s" % highest)
            print(" Avg %s" % mean)
            print(" Std %s" % std)

            self.writeOnLogFile(" Top %s\n" % self.top
                                + " Min %s\n" % lowest
                                + " Max %s\n" % highest
                                + " Avg %s\n" % mean
                                + " Std %s\n" % std)

        print("-- End of (successful) evolution --")

        best_ind = tools.selBest(pop, 1)[0]
        print("Best individual is %s, %s"
              % (best_ind, best_ind.fitness.values[0]))
        print("Elapsed time" + str(time.time()-self.start_time))
# Evaluate one ensemble weighting given on the command line and remember it
# in "best.npy" if it beats the stored one.
#
# Usage: seven float weights as argv[1..7].  `matrix`, `dr` and `cat` are not
# defined in this span and must already be in scope.
a, b, c, d, e, f, g = (float(sys.argv[i]) for i in range(1, 8))

res = ensembler(matrix, [a, b, c, d, e, f, g], normalization_type="max")

ev = Evaluator(dr)
# The metric is negated so that "lower is better" when comparing runs.
ret = [
    -ev.evaluate_single_metric(eurm_to_recommendation_list(res, cat=cat),
                               cat=cat,
                               name="ens" + str(cat),
                               metric='prec',
                               level='track')
]

# "best.npy" holds the raw argv strings followed by the score as last item.
is_new_best = True
if os.path.isfile("best.npy"):
    best = np.load("best.npy")
    # np.float no longer exists in recent NumPy; the builtin float parses the
    # stored element just the same.
    is_new_best = ret[0] < float(best[-1])

if is_new_best:
    b = sys.argv[1:]
    b.append(ret[0])
    np.save("best", b)
class Optimizer(object):
    """Search ensemble weights for one category with ``gp_minimize``.

    Every input matrix is stripped of seed tracks, restricted to the rows of
    the chosen category (``(cat - 1) * 1000`` up to ``cat * 1000``) and
    normalised with ``norm_max_row``.  The objective is the negated target
    metric (``'ndcg'``) of the weighted sum of those matrices.

    Args:
        matrices_names: one name per matrix; used as search-space dimension
            names and as keys of the saved best-parameter dict.
        matrices_array: the matrices to combine, in the same order as
            ``matrices_names``.
        dr: datareader passed to ``Evaluator`` and ``eurm_remove_seed``.
        cat: 1-based category number.
        start: lower bound of every weight.
        end: upper bound of every weight.
        n_calls: total number of objective evaluations.
        n_random_starts: fraction of ``n_calls`` used as random starts.
        n_points: forwarded to ``gp_minimize`` as ``n_points``.
        step: forwarded to ``gp_minimize`` as ``xi``.
        verbose: print every evaluation, not only new bests.
    """

    def __init__(self, matrices_names, matrices_array, dr, cat, start, end,
                 n_calls=1000, n_random_starts=0.1, n_points=50, step=0.001,
                 verbose=True):
        self.target_metric = 'ndcg'
        self.best_score = 0
        self.best_params = 0
        self.norm = norm_max_row
        self.verbose = verbose

        # Use a tenth of the available cores, but always at least one.
        self.n_cpu = max(1, int(multiprocessing.cpu_count() / 10))

        # Do not edit
        self.start = start
        self.end = end
        self.cat = cat
        self.global_counter = 0
        self.start_index = (cat - 1) * 1000
        self.end_index = cat * 1000
        self.matrices_names = matrices_names
        self.n_calls = n_calls
        self.x0 = None
        self.y0 = None
        self.n_random_starts = int(n_calls * n_random_starts)
        self.n_points = n_points
        self.step = step
        # memory_on_disk= False
        self.memory_on_notebook = True

        # The datareader is only needed during construction, so it is not
        # kept as an attribute.
        self.ev = Evaluator(dr)
        self.matrices_array = [
            self.norm(
                eurm_remove_seed(matrix, datareader=dr)[
                    self.start_index:self.end_index])
            for matrix in matrices_array
        ]

    def run(self):
        """Run the optimisation and store the result in ``self.res``."""
        self.x0 = None
        self.y0 = None
        space = [
            Real(self.start, self.end, name=x) for x in self.matrices_names
        ]
        self.res = gp_minimize(self.obiettivo,
                               space,
                               base_estimator=None,
                               n_calls=self.n_calls,
                               n_random_starts=self.n_random_starts,
                               acq_func='gp_hedge',
                               acq_optimizer='auto',
                               x0=self.x0,
                               y0=self.y0,
                               random_state=None,
                               verbose=self.verbose,
                               callback=None,
                               n_points=self.n_points,
                               n_restarts_optimizer=10,
                               xi=self.step,
                               kappa=1.96,
                               noise='gaussian',
                               n_jobs=self.n_cpu)

    def _remember(self, x, ris):
        """Append one evaluated point and its score to ``x0`` / ``y0``."""
        if self.x0 is None:
            # Start both histories as flat lists of points/scores; the first
            # point used to be wrapped in an extra list, which made x0
            # inconsistently nested.
            self.x0 = []
            self.y0 = []
        self.x0.append(list(x))
        self.y0.append(ris)

    def obiettivo(self, x):
        """Objective: negated target metric of the ``x``-weighted ensemble.

        Args:
            x: one weight per matrix, in ``matrices_names`` order.

        Returns:
            The negated metric value (lower is better).
        """
        eurm = sum(weight * matrix
                   for weight, matrix in zip(x, self.matrices_array))

        # real objective function
        ris = -self.ev.evaluate_single_metric(
            eurm_to_recommendation_list(eurm,
                                        cat=self.cat,
                                        remove_seed=False,
                                        verbose=False),
            verbose=False,
            cat=self.cat,
            name="ens" + str(self.cat),
            metric=self.target_metric,
            level='track')

        # memory variables
        self._remember(x, ris)
        self.global_counter += 1

        if ris < self.best_score:
            print("[NEW BEST]")
            self.pretty_print(ris, x)
            self.best_score = ris
            self.best_params = x.copy()
            self.best_params_dict = dict(zip(self.matrices_names, x.copy()))
            # Persist the new best; whether a previous file exists made no
            # difference (both branches of the old check were identical).
            save_obj([self.best_params_dict, ris],
                     "best/cat" + str(self.cat))
        elif self.verbose:
            self.pretty_print(ris, x)

        return ris

    def pretty_print(self, ris, x):
        """Print category, iteration, score and each named weight."""
        print("CAT:", self.cat, "ITER:", self.global_counter, "RES:", ris,
              end="\tvals:\t")
        for name, value in zip(self.matrices_names, x):
            print(name, "%.4f" % (value), end="\t")
        print()
        print("-" * 80)
        print()