def init_grammar(self, max_num_mols):
    """Run SMILES strings through the grammar and accumulate rule statistics.

    Strings come from ``get_smiles_from_database(max_num_mols)``; iteration
    stops once the index reaches ``max_num_mols``. Indices that are not
    greater than ``self.last_processed`` are skipped, so a previous run is
    not repeated.

    For every newly processed string this method:

    * converts it with ``self.grammar.raw_strings_to_actions`` and reads the
      resulting tree from ``self.grammar.last_tree_processed``;
    * increments ``self.grammar.conditional_frequencies[(p, nt)][c]`` for each
      ``(p, nt, c)`` tuple derived from that tree;
    * increments ``self.grammar.rule_frequency_dict`` for each action;
    * updates ``self.total_len`` and ``self.max_len``.

    Any exception raised during those steps is printed and the string is
    still marked as processed. Every tenth index triggers ``self.save()``
    and every hundredth (non-zero) index stores a snapshot in ``self.stats``.
    After the loop the grammar's conditional frequencies are normalized and
    its terminal distances are recomputed.

    Args:
        max_num_mols: Upper bound on the number of strings to consume.

    Returns:
        ``self.max_len``, the maximum observed molecule length.
    """
    for idx, smiles in enumerate(get_smiles_from_database(max_num_mols)):
        if idx >= max_num_mols:
            break

        if idx > self.last_processed:  # don't repeat
            try:
                # Parsing makes the grammar remember every rule that occurs
                # in this molecule.
                action_seqs = self.grammar.raw_strings_to_actions([smiles])
                parsed_tree = self.grammar.last_tree_processed
                rule_tuples = tree_with_rule_inds_to_list_of_tuples(
                    parsed_tree)

                # Conditional counts, keyed by the first two tuple fields.
                for parent, nonterm, child in rule_tuples:
                    per_child = self.grammar.conditional_frequencies.setdefault(
                        (parent, nonterm), {})
                    per_child[child] = per_child.get(child, 0) + 1

                # Count the frequency of the occurring rules.
                for seq in action_seqs:
                    for action in seq:
                        rule_counts = self.grammar.rule_frequency_dict
                        rule_counts[action] = rule_counts.get(action, 0) + 1

                seq_lengths = list(map(len, action_seqs))
                longest = max(seq_lengths)
                self.total_len += sum(seq_lengths)
                if longest > self.max_len:
                    self.max_len = longest
                    print("Max len so far:", self.max_len)
            except Exception as err:  # TODO: fix this, make errors not happen ;)
                print(err)

            self.last_processed = idx

            # If we discovered a new rule, remember that: record an entry
            # when nothing is recorded yet, or when the last field of the
            # newest rate_tracker row exceeds the last recorded one.
            latest_rate = self.grammar.rate_tracker[-1]
            nothing_recorded = not self.new_rules
            if nothing_recorded or latest_rate[-1] > self.new_rules[-1][-1]:
                self.new_rules.append((idx, *latest_rate))
                print(self.new_rules[-1])

        # Periodic checkpoint.
        if idx % 10 == 9:
            self.save()

        # Periodic statistics snapshot; idx > 0 also avoids dividing by zero.
        # NOTE(review): avg_len divides by the zero-based index rather than a
        # count of processed molecules - confirm this is intended.
        if idx > 0 and idx % 100 == 0:
            self.stats[idx] = {
                'max_len': self.max_len,
                'avg_len': self.total_len / idx,
                'num_rules': len(self.grammar.rules),
            }

    self.grammar.normalize_conditional_frequencies()
    self.grammar.calc_terminal_distance()
    return self.max_len  # maximum observed molecule length
def count_rule_frequencies(self, trees):
    """Accumulate rule statistics from ``trees`` into this object's counters.

    For each tree, every ``(p, nt, c)`` tuple produced by
    ``tree_with_rule_inds_to_list_of_tuples`` increments
    ``self.conditional_frequencies[(p, nt)][c]``, and the ``rule_id`` of
    every rule returned by ``tree.rules()`` increments
    ``self.rule_frequency_dict[rule_id]``. Missing entries start at zero.

    Args:
        trees: Iterable of trees; each must be accepted by
            ``tree_with_rule_inds_to_list_of_tuples`` and provide a
            ``rules()`` method whose items have a ``rule_id`` attribute.
            An empty iterable leaves both counters untouched.

    Returns:
        None. The counters on ``self`` are updated in place.
    """
    for tree in trees:
        for p, nt, c in tree_with_rule_inds_to_list_of_tuples(tree):
            # Create the inner dict on the same mapping that is read and
            # incremented; creating it on a different object would leave
            # this lookup failing for a previously unseen (p, nt) key.
            child_counts = self.conditional_frequencies.setdefault((p, nt), {})
            child_counts[c] = child_counts.get(c, 0) + 1

        # Collect the ids first so a failure while walking the tree's rules
        # does not leave this tree only partially counted.
        rule_ids = [rule.rule_id for rule in tree.rules()]
        for rule_id in rule_ids:
            self.rule_frequency_dict[rule_id] = (
                self.rule_frequency_dict.get(rule_id, 0) + 1)