def evaluate(bit_vector):
    """Return the J score of a bitarray individual.

    The bit vector is converted to a gene (list of ints), decoded through
    the grammar into a SMILES string, canonicalized, and finally scored.
    """
    gene = BITtoGene(bit_vector)
    smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
    return score_util.calc_score(smile)
def Rollout(self):
    """Beam-search rollout from the current partial rule sequence.

    Replays ``self.moves`` through the RNN to restore its hidden state,
    then expands up to ``sequence_length`` grammar-masked steps keeping
    the ``beam_width`` most likely candidates. Finished candidates are
    decoded to SMILES and scored in parallel with rdock.

    Returns:
        list of (score, smiles) pairs, one per surviving beam candidate.
        Also sets self.moves_rollout, self.rollout_scores,
        self.rollout_smiles as side effects.
    """
    # Re-feed the prefix so the RNN's hidden state matches self.moves.
    self.rnn.reset_state()
    for m in self.moves:
        self.rnn.forward(np.array([m]).astype(np.int32))
    beam_width = 16  # must be bigger than 3!
    eps = 1e-100  # floor so log() of masked-out entries stays finite
    lhs_list = zinc_grammar.lhs_list
    lhs_map = zinc_grammar.lhs_map
    initial_stack = self.stack
    initial_rule = self.moves
    # Candidate tuple: (rnn model copy, nonterminal stack, rule list, log-likelihood)
    candidates = [(self.rnn, initial_stack, initial_rule, 0.0)]
    sequence_length = 250
    for t in range(sequence_length):
        next_candidates = []
        for previous_model, previous_stack, rules, log_likelihood in candidates:
            # Empty stack = derivation finished; carry the candidate forward
            # unchanged (model dropped, it is no longer needed).
            if len(previous_stack) == 0:
                next_candidates.append(
                    (None, previous_stack, rules, log_likelihood))
                continue
            # Each expansion needs its own RNN state, so copy the model.
            model = previous_model.copy()
            x = np.asarray([rules[-1]]).astype(np.int32)
            with chainer.using_config('train', False):
                with chainer.no_backprop_mode():
                    unmasked_probability = model.forward(x).data[0]
            stack = copy.copy(previous_stack)
            # Only rules whose LHS is the top-of-stack nonterminal are legal.
            next_nonterminal = lhs_map[stack.pop()]
            mask = zinc_grammar.masks[next_nonterminal]
            masked_log_probability = np.log(unmasked_probability * mask + eps)
            # Indices of the highest-probability rules, best first.
            # NOTE(review): this slice yields beam_width-1 entries, not
            # beam_width — presumably intentional given the comment above,
            # but worth confirming.
            order = masked_log_probability.argsort()[:-beam_width:-1]
            for sampled_rule in order:
                # Skip rules that are only above the eps floor (masked out).
                if masked_log_probability[sampled_rule] > np.log(
                        eps) + eps:
                    # Nonterminals on the rule's RHS, excluding the 'None'
                    # padding symbol.
                    rhs = filter(
                        lambda a: (type(a) == nltk.grammar.Nonterminal) and
                        (str(a) != 'None'),
                        zinc_grammar.GCFG.productions()
                        [sampled_rule].rhs())
                    # Push RHS nonterminals reversed so the leftmost one is
                    # expanded next (leftmost derivation).
                    next_candidates.append(
                        (model, stack + list(map(str, rhs))[::-1],
                         rules + [sampled_rule],
                         log_likelihood +
                         masked_log_probability[sampled_rule]))
        # Keep only the beam_width most likely candidates.
        candidates = sorted(next_candidates, key=lambda x: -x[3])[:beam_width]
        # Stop early once every candidate has an empty stack (all finished).
        if all([len(candidate[1]) == 0 for candidate in candidates]):
            break
    smiles = []
    self.moves_rollout = []
    for candidate in candidates:
        self.moves_rollout.append(candidate[2])
        smiles.append(cfg_util.decode(candidate[2]))
    # Score all rollout SMILES in parallel with rdock.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    scores = pool.map(rdock_util.score, smiles)
    pool.close()
    pool.terminate()
    self.rollout_scores = scores
    self.rollout_smiles = smiles
    return [(score, smiles) for score, smiles in zip(scores, smiles)]
def GetResult(self):  # Get the result
    """Decode ``self.moves`` into a SMILES string and score it.

    Returns:
        (score, smiles): score is the calculated score floored at 1.0,
        or 0.0 when scoring fails (e.g. the SMILES is invalid).
    """
    smiles = cfg_util.decode(self.moves)
    try:
        # NOTE(review): the 1.0 floor means every scorable molecule gets
        # at least 1.0 — confirm this is the intended reward scheme.
        score = max(1.0, smiles_util.calc_score(smiles))
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Any scoring failure yields 0.0.
        score = 0.0
    return (score, smiles)
def save_log(population):
    """Interactively save the final population: a pickle of the unique,
    valid SMILES, a tab-separated score log (plus the run's seed), and a
    montage image of the molecules.

    Args:
        population: iterable of bitarray individuals.
    """
    save = input(
        "Save logs and image of final population ? (press 'y' or 'n') : ")
    if save == 'n':
        return
    directory = input("Please input log file name (or directory) : ")
    # Creating a folder for this log (no shell call; tolerate existing dir)
    os.makedirs(directory, exist_ok=True)
    file_name = directory
    # Remove duplicates and non-valid smiles from the list before storing it
    ms = []
    smile_list = []
    for bit_vector in population:
        gene = BITtoGene(bit_vector)
        smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
        if smile != '' and smile is not None and smile not in smile_list:
            mol = MolFromSmiles(smile)  # parse once, reuse for the image list
            if mol is not None:
                smile_list.append(smile)
                ms.append(mol)
    # Storing the final population in a pickle object
    with open(directory + '/' + file_name + ".p", 'wb') as f:
        pickle.dump(smile_list, f)
    # Storing the random seed of this experiment, then the scored population
    with open(directory + '/' + 'seed.txt', 'w') as f:
        # `time` is the module-global seed value set in main()
        f.write(str(time) + '\n')
        f.write('smile' + '\t' + 'score' + '\n')
        for smile in smile_list:
            score = score_util.calc_score(smile)
            f.write(smile + '\t' + str(score) + '\n')
    # Saving population Image: one PNG per molecule, then combine them
    for i in range(len(ms)):
        Draw.MolToFile(ms[i], directory + '/' + str(i) + '.png',
                       size=(120, 120))
    # NOTE(review): `directory` is user input interpolated into a shell
    # command (ImageMagick montage) — acceptable for a trusted local tool,
    # unsafe if this ever runs on untrusted input.
    os.system(
        'montage ' + directory + '/*.png ' + directory + '/final.png'
    )  # Execute this command in the shell. Put all images of the molecules in a unique image
def main():
    """Sample rule sequences from the trained RNN with no grammar mask
    and report, every 100 trials, how many decoded to valid SMILES."""
    n_rules = len(zinc_grammar.GCFG.productions())
    rnn = RNN(rule_size=n_rules)
    serializers.load_npz("model-9.npz", rnn)
    rule_size = n_rules

    valid_smiles = []
    for trial in range(10000):
        sampled = [0]  # sequence starts from rule id 0
        rnn.reset_state()
        for _ in range(280):
            with chainer.no_backprop_mode():
                prev = np.array([sampled[-1]]).astype(np.int32)
                prob = rnn.get_probability(prev).data[0]
            sampled.append(np.random.choice(rule_size, p=prob))
        smiles = cfg_util.decode(sampled)
        if is_valid_smiles(smiles):
            valid_smiles.append(smiles)
            print(smiles, file=sys.stderr)
        if trial % 100 == 0:
            # trial count, valid count, unique valid count
            print("{},{},{}".format(trial, len(valid_smiles),
                                    len(set(valid_smiles))))
def main():
    """Sample grammar-masked rule sequences from the trained RNN and
    report, every 100 trials, how many decoded to valid SMILES.

    Unlike the unmasked sampler, a nonterminal stack constrains each step
    to rules whose left-hand side matches the top of the stack.
    """
    rnn = RNN(rule_size=len(zinc_grammar.GCFG.productions()))
    serializers.load_npz("model-9.npz", rnn)
    rule_size = len(zinc_grammar.GCFG.productions())
    valid_smiles = []
    for trial in range(10000):
        print(trial, file=sys.stderr)
        rules_sampled = [0]  # sequence starts from rule id 0
        stack = ['chain']  # start symbol of the grammar
        rnn.reset_state()
        for _ in range(280):
            rule_prev = np.array([rules_sampled[-1]]).astype(np.int32)
            with chainer.no_backprop_mode():
                unmasked_prob = rnn.get_probability(rule_prev).data[0]
            # 'Nothing' is the padding nonterminal used once the
            # derivation has finished (empty stack).
            if len(stack) > 0:
                p = stack.pop()
            else:
                p = 'Nothing'
            next_nonterminal = zinc_grammar.lhs_map[p]
            # Zero out rules whose LHS is not the required nonterminal,
            # then renormalize so np.random.choice gets a distribution.
            mask = zinc_grammar.masks[next_nonterminal]
            masked_prob_unnormalized = unmasked_prob * mask
            Z = np.sum(masked_prob_unnormalized)
            masked_prob = masked_prob_unnormalized / Z
            rule_sampled = np.random.choice(rule_size, p=masked_prob)
            # Nonterminals on the sampled rule's RHS (excluding 'None'),
            # pushed reversed so the leftmost is expanded first.
            rhs = filter(
                lambda a: (type(a) == nltk.grammar.Nonterminal) and
                (str(a) != 'None'),
                zinc_grammar.GCFG.productions()[rule_sampled].rhs())
            stack.extend(list(map(str, rhs))[::-1])
            rules_sampled.append(rule_sampled)
        smiles = cfg_util.decode(rules_sampled)
        if is_valid_smiles(smiles):
            valid_smiles.append(smiles)
            print(smiles, file=sys.stderr)
        if trial % 100 == 0:
            # trial count, valid count, unique valid count
            print("{},{},{}".format(trial, len(valid_smiles),
                                    len(set(valid_smiles))))
def __init__(self, rules):
    """Keep the rule sequence, its decoded SMILES, and precompute the
    size of the derivation subtree rooted at each rule."""
    self.rules = list(rules)
    self.smiles = cfg_util.decode(rules)[0]
    # -1 marks "not yet computed"; filled in by __calc_subtreesize.
    self.subtreesize = [-1] * len(self.rules)
    self.__calc_subtreesize(0)
def main():
    """(mu + lambda) genetic algorithm over grammar genes, scored with
    rdock via qsub batch submission.

    Scores are sorted ascending (lower rdock score = better binding).
    Prints a '%generation,max_score,elapsed' header line per generation,
    the surviving population, and finally every SMILES ever generated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--smifile', default='250k_rndm_zinc_drugs_clean.smi')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--mu', type=int, default=32)
    parser.add_argument('--lam', type=int, default=64)
    parser.add_argument('--generation', type=int, default=1000)
    args = parser.parse_args()
    np.random.seed(args.seed)
    gene_length = 300
    N_mu = args.mu
    N_lambda = args.lam
    # initialize population from random seed molecules
    seed_smiles = []
    with open(args.smifile) as f:
        for line in f:
            smiles = line.rstrip()
            seed_smiles.append(smiles)
    start_time = time.time()
    initial_smiles = np.random.choice(seed_smiles, N_mu + N_lambda)
    # materialize as a plain Python list of str (not a numpy array)
    initial_smiles = [s for s in initial_smiles]
    initial_genes = [
        CFGtoGene(cfg_util.encode(s), max_len=gene_length)
        for s in initial_smiles
    ]
    # batch-score the whole initial population on the cluster
    initial_scores = rdock_util.score_qsub(initial_smiles)
    population = []
    for score, gene, smiles in zip(initial_scores, initial_genes,
                                   initial_smiles):
        population.append((score, smiles, gene))
    # keep the N_mu best (lowest-score) individuals
    population = sorted(population, key=lambda x: x[0])[:N_mu]
    # all_smiles prevents re-evaluating duplicates across generations
    all_smiles = [canonicalize(p[1]) for p in population]
    all_result = [(p[0], s) for p, s in zip(population, all_smiles)]
    scores = [p[0] for p in population]
    max_score = np.max(scores)
    elapsed_time = time.time() - start_time
    print("%{},{},{}".format(0, max_score, elapsed_time))
    for p in population:
        print("{},{}".format(p[0], p[1]))
    for generation in range(args.generation):
        new_population_smiles = []
        new_population_genes = []
        # generate N_lambda mutated children from random parents
        for _ in range(N_lambda):
            p = population[np.random.randint(len(population))]
            p_gene = p[2]
            c_gene = mutation(p_gene)
            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            # keep only decodable, never-seen-before children
            if c_smiles != '' and c_smiles not in all_smiles:
                new_population_smiles.append(c_smiles)
                new_population_genes.append(c_gene)
                all_smiles.append(c_smiles)
        new_population_scores = rdock_util.score_qsub(new_population_smiles)
        for score, gene, smiles in zip(new_population_scores,
                                       new_population_genes,
                                       new_population_smiles):
            population.append((score, smiles, gene))
            all_result.append((score, smiles))
        # (mu + lambda) selection: parents and children compete together
        population = sorted(population, key=lambda x: x[0])[:N_mu]
        scores = [i[0] for i in population]
        max_score = np.max(scores)
        elapsed_time = time.time() - start_time
        print("%{},{},{}".format(generation + 1, max_score, elapsed_time))
        for p in population:
            print("{},{}".format(p[0], p[1]))
    print("list of generated smiles:")
    for r in all_result:
        print("{},{}".format(r[0], r[1]))
def main(Pipes, island_id, nb_of_island, mig_interval, logn=-1):
    """Island-model (mu + lambda) GA worker.

    Runs one island of the distributed GA: evolves its own subpopulation
    and exchanges individuals with other islands through ``Pipes`` every
    ``mig_interval`` generations. Stops after 8 hours of wall time and
    dumps the final population to a CSV named from the run parameters.

    Args:
        Pipes: inter-island communication pipes consumed by migration().
        island_id: index of this island (also offsets the RNG seed).
        nb_of_island: total number of islands (splits the global
            population budget of 1000 mu / 2000 lambda).
        mig_interval: generations between migrations.
        logn: -1 for a deterministic seed, otherwise seed from the clock
            and append logn to the output file name.
    """
    #parser = argparse.ArgumentParser()
    #parser.add_argument('--smifile', default='250k_rndm_zinc_drugs_clean.smi')
    #parser.add_argument('--seed', type=int, default=t.time())
    #args = parser.parse_args()
    smifile = '250k_rndm_zinc_drugs_clean.smi'
    # Deterministic per-island seed by default; clock-based when logn set.
    if logn == -1:
        np.random.seed(0 + island_id)
    else:
        np.random.seed(int(t.time()))
    #np.random.seed(0)
    # Shared with the current_best timer thread below.
    global best_smiles
    global best_score
    global all_smiles
    gene_length = 300
    N_mu = int(1000 / nb_of_island)
    N_lambda = int(2000 / nb_of_island)
    # initialize population from random seed molecules
    seed_smiles = []
    with open(smifile) as f:
        for line in f:
            smiles = line.rstrip()
            seed_smiles.append(smiles)
    initial_smiles = np.random.choice(seed_smiles, N_mu + N_lambda)
    initial_smiles = [canonicalize(s) for s in initial_smiles]
    initial_genes = [
        CFGtoGene(cfg_util.encode(s), max_len=gene_length)
        for s in initial_smiles
    ]
    initial_scores = [score_util.calc_score(s) for s in initial_smiles]
    #print(initial_scores)
    population = []
    for score, gene, smiles in zip(initial_scores, initial_genes,
                                   initial_smiles):
        population.append((score, smiles, gene))
    # Higher score is better here (reverse=True), unlike the rdock runs.
    population = sorted(population, key=lambda x: x[0], reverse=True)[:N_mu]
    # Periodic reporter thread; reads the globals set in the loop below.
    th = threading.Timer(60, current_best, [])
    th.start()
    print("Start!")
    all_smiles = [p[1] for p in population]
    #print([p[0] for p in population])
    #mig_interval = 5 # A migration every 1000 iteration
    x = [i for i in range(mig_interval, 1000000000, mig_interval)
         ]  # All the generations in which a migration should occur
    k = 1  # First migration target offset
    t0 = t.time()
    for generation in range(1000000000):
        scores = [p[0] for p in population]
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        std_score = np.std(scores)
        best_score = np.max(scores)
        idx = np.argmax(scores)
        best_smiles = population[idx][1]
        print("%{},{},{},{},{}".format(generation, best_score, mean_score,
                                       min_score, std_score))
        new_population = []
        # generate N_lambda mutated children from random parents
        for _ in range(N_lambda):
            p = population[np.random.randint(len(population))]
            p_gene = p[2]
            c_gene = mutation(p_gene)
            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            # skip anything already seen on this island
            if c_smiles not in all_smiles:
                c_score = score_util.calc_score(c_smiles)
                c = (c_score, c_smiles, c_gene)
                new_population.append(c)
                all_smiles.append(c_smiles)
        population.extend(new_population)
        population = sorted(population, key=lambda x: x[0],
                            reverse=True)[:N_mu]
        # Every mig_interval generations, exchange individuals with the
        # island k steps away; k cycles through 1..nb_of_island-1.
        if generation in x:
            print('Starting Migration')
            if k >= nb_of_island:
                k = 1
            population = migration(Pipes, island_id, nb_of_island,
                                   population, k)
            k += 1
        # hard wall-time limit: 8 hours
        if t.time() - t0 >= 3600 * 8:
            break
    # Dump the final population; file name encodes the run parameters.
    if logn == -1:
        f = open(
            str(island_id) + '_final_pop' + '_' + str(nb_of_island) + '_' +
            str(mig_interval) + '.csv', 'w')
    if logn != -1:
        f = open(
            str(island_id) + '_final_pop' + '_' + str(nb_of_island) + '_' +
            str(mig_interval) + '_' + str(logn) + '.csv', 'w')
    population = pd.DataFrame(population)
    population.to_csv(f)
    f.close()
def main():
    """PBIL-style optimization of SMILES bit vectors.

    Maintains a probability vector P over bit positions; each generation
    samples a population from P, updates P toward the best individuals
    (information-geometric rule), applies mutation to P, and repeats
    until convergence or the 8-hour time budget runs out.

    Returns:
        The best bit vector found in the last generation.
    """
    global time
    time = t.time()
    print(time)
    random.seed(time)
    max_generation = 1000000
    max_time = 8 * 3600  # 8 hours
    population_size = 100
    bit_vector_size = 2400  # Maximum length of vectors in the population (should be a multiple of 8)
    P = [0.5 for _ in range(0, bit_vector_size)]  # Probability vector
    LR = 0.1  # Learning Rate (typically 0.1-0.4)
    MS = 0.05  # Degree of mutation (typical value is 0.05)
    Pr_mutation = 0.08  # Probability of mutation (typically 0.02)
    mu = 2  # Number of vectors used to make P evolve
    k = 0
    duration = 0
    converge = False
    best_fitness = -1e11
    best_bit_vector = None
    # k < max_generation or duration < max_time depending on what you want
    while converge is not True and duration < max_time:
        population = []
        score_smile = []
        best_bit_vector = None
        best_fitness = -1e10
        for i in range(0, population_size):
            # Create a new vector which represents an individual
            bit_vector = generate_bit_vector(P)
            population.append(bit_vector)
            # Evaluate the fitness of the new vector
            fitness = evaluate(population[i])
            # Only decodable individuals (score above the sentinel) count
            if fitness > -1e10:
                score_smile.append([fitness, bit_vector])
            if fitness > best_fitness:  # /!\ '<' and '>'
                # Update the best individual (i.e. max fitness)
                best_fitness = fitness
                best_bit_vector = bit_vector
        gene = BITtoGene(best_bit_vector)
        smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
        print(best_fitness, '=', smile)
        try:
            # Evolution: the sort and the sample count N do not depend on
            # j, so they are hoisted out of the per-bit loop (was
            # re-sorted and re-printed bit_vector_size times).
            score_smile = sorted(
                score_smile, key=lambda x: x[0],
                reverse=False)  # The best smile is at the end of the list
            print(score_smile)
            N = len(score_smile) if len(score_smile) < mu else mu
            for j in range(0, len(P)):
                #P[j] = P[j]*(1 - LR) + int(best_bit_vector[j])*LR # Update the probability vector with the best indiv
                X = 0
                for i in range(N):
                    X += (i + 1) * (score_smile[i][1][j] - P[j])
                P[j] = P[j] + LR / (
                    P[j] * (1 - P[j])) * X  # Information Geometric implementation
            # Mutation: nudge each bit probability toward a random bit
            for j in range(0, len(P)):
                if random.random() < Pr_mutation:
                    P[j] = P[j] * (1 - MS) + random.randint(0, 1) * MS
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C still works. The
            # update step can fail e.g. with ZeroDivisionError when P[j]
            # saturates at 0 or 1, or when no valid SMILE was generated.
            print('No valid SMILE generated : pass')
        converge = convergence(P)
        k += 1
        duration = t.time() - time
        print(k, ' time : ', duration, ' s')
    gene = BITtoGene(best_bit_vector)
    smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
    print(smile)
    save_log(population)
    return best_bit_vector
def main():
    """Single-population (mu + lambda) GA over grammar genes, scored with
    score_util (higher is better).

    Runs forever (no stop condition); a timer thread periodically reports
    the best individual via the module-level globals set here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--smifile', default='250k_rndm_zinc_drugs_clean.smi')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()
    np.random.seed(args.seed)
    # Shared with the current_best timer thread below.
    global best_smiles
    global best_score
    global all_smiles
    gene_length = 300
    N_mu = 100
    N_lambda = 200
    # initialize population from random seed molecules
    seed_smiles = []
    with open(args.smifile) as f:
        for line in f:
            smiles = line.rstrip()
            seed_smiles.append(smiles)
    initial_smiles = np.random.choice(seed_smiles, N_mu + N_lambda)
    initial_smiles = [canonicalize(s) for s in initial_smiles]
    initial_genes = [
        CFGtoGene(cfg_util.encode(s), max_len=gene_length)
        for s in initial_smiles
    ]
    initial_scores = [score_util.calc_score(s) for s in initial_smiles]
    population = []
    for score, gene, smiles in zip(initial_scores, initial_genes,
                                   initial_smiles):
        population.append((score, smiles, gene))
    # keep the N_mu best (highest-score) individuals
    population = sorted(population, key=lambda x: x[0], reverse=True)[:N_mu]
    # Periodic reporter thread; reads the globals set in the loop below.
    t = threading.Timer(60, current_best, [])
    t.start()
    print("Start!")
    # all_smiles prevents re-evaluating duplicates across generations
    all_smiles = [p[1] for p in population]
    for generation in range(1000000000):
        scores = [p[0] for p in population]
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        std_score = np.std(scores)
        best_score = np.max(scores)
        idx = np.argmax(scores)
        best_smiles = population[idx][1]
        print("%{},{},{},{},{}".format(generation, best_score, mean_score,
                                       min_score, std_score))
        new_population = []
        # generate N_lambda mutated children from random parents
        for _ in range(N_lambda):
            p = population[np.random.randint(len(population))]
            p_gene = p[2]
            c_gene = mutation(p_gene)
            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            if c_smiles not in all_smiles:
                c_score = score_util.calc_score(c_smiles)
                c = (c_score, c_smiles, c_gene)
                new_population.append(c)
                all_smiles.append(c_smiles)
        population.extend(new_population)
        # (mu + lambda) selection: parents and children compete together
        population = sorted(population, key=lambda x: x[0],
                            reverse=True)[:N_mu]
def main():
    """GA over grammar genes with crossover and mutation, scored in
    parallel with rdock (lower score = better, sorted ascending).

    Each generation produces roughly one child per CPU core: ~80% by
    single-point crossover and the rest by single-gene mutation.
    """
    population = []
    rules = np.load("rules.npz")['arr_0']
    initial_rules = np.random.choice(rules, 100)
    initial_genes = [CFGtoGene(rule, max_len=288) for rule in initial_rules]
    initial_scores = []
    # score the initial genes in CPU-sized batches, one fresh pool each
    for i in range(0, len(initial_genes), multiprocessing.cpu_count()):
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        initial_scores.extend(
            pool.map(rdock_util.score, [
                cfg_util.decode(GenetoCFG(gene))[0]
                for gene in initial_genes[i:i + multiprocessing.cpu_count()]
            ]))
        pool.close()
        pool.terminate()
    for s, m in zip(initial_scores, initial_genes):
        population.append((s, m))
    trial = 0
    valid_smiles = []
    scores = []
    # all_smiles prevents re-evaluating duplicate children
    all_smiles = []
    # Periodic reporter thread; shares the `scores` list it is passed.
    t = threading.Timer(60, current_best, [scores])
    t.start()
    for generation in range(100):
        print("generation", generation)
        # keep the 100 best (lowest-score) individuals
        population = sorted(population, key=lambda x: x[0])[:100]
        for s, g in population:
            print(s, cfg_util.decode(GenetoCFG(g))[0])
        cpu_count = multiprocessing.cpu_count()
        # crossover: single cut point between two random parents
        children_smiles = []
        children_genes = []
        while len(children_smiles) < cpu_count * 0.8:
            idx1, idx2 = np.random.choice(len(population), size=2)
            score1, gene1 = population[idx1]
            score2, gene2 = population[idx2]
            cut_point = np.random.choice(len(gene1))
            gene_child = gene1[:cut_point] + gene2[cut_point:]
            smiles_child = cfg_util.decode(GenetoCFG(gene_child))[0]
            # only accept decodable, never-seen-before children
            if is_valid_smiles(
                    smiles_child) and smiles_child not in all_smiles:
                children_smiles.append(smiles_child)
                children_genes.append(gene_child)
                all_smiles.append(smiles_child)
        # mutation: fill the batch by flipping one gene position
        while len(children_smiles) < cpu_count:
            idx = np.random.choice(len(population))
            score, gene = population[idx]
            mutation_idx = np.random.choice(len(gene))
            gene_mutant = copy.deepcopy(gene)
            # 80 = number of grammar rules a gene entry can take
            # (presumably matches len(GCFG.productions()) — confirm)
            gene_mutant[mutation_idx] = np.random.choice(80)
            smiles_mutant = cfg_util.decode(GenetoCFG(gene_mutant))[0]
            if is_valid_smiles(
                    smiles_mutant) and smiles_mutant not in all_smiles:
                children_smiles.append(smiles_mutant)
                children_genes.append(gene_mutant)
                all_smiles.append(smiles_mutant)
        # score the whole batch of children in parallel with rdock
        pool = multiprocessing.Pool(cpu_count)
        scores_child = pool.map(rdock_util.score, children_smiles)
        pool.close()
        pool.terminate()
        scores.extend(scores_child)
        assert (len(scores_child) == len(children_genes))
        for s, g in zip(scores_child, children_genes):
            if (s, g) not in population:
                population.append((s, g))