def main_(num_nodes=10):
    """Build a VectorGraph populated with `num_nodes` nodes.

    The node count was previously hard-coded; it is now a parameter with
    the same default, so existing callers are unaffected.
    """
    from vectorgraph import VectorGraph
    graph = VectorGraph()
    ids = [str(i) for i in range(num_nodes)]
    # `node_id` instead of `id`: avoid shadowing the builtin.
    for node_id in ids:
        node = graph.create_node(node_id)
        graph.add(node)
def sim2():
    """Small edge-weight simulation on a 10-node VectorGraph.

    Bumps two edges (1->2 and 2->3) and prints the resulting edge weight
    for every pair drawn from {1, 2} x {2, 3, 4}.
    """
    # Imported locally, matching the style of the other drivers in this
    # module — there is no module-level import of VectorGraph.
    from vectorgraph import VectorGraph
    graph = VectorGraph(DIM=500)
    ids = [str(i) for i in range(10)]
    for c in ids:
        node = graph.create_node(c)
        graph.add(node)
    n1, n2, n3, n4 = map(graph.get, '1234')
    n1.bump_edge(n2)
    # A leftover debug `print(...)` + `exit()` previously aborted the
    # interpreter here, leaving everything below unreachable; removed.
    n2.bump_edge(n3)
    for x in (n1, n2):
        for y in (n2, n3, n4):
            print(x, '->', y, '=', x.edge_weight(y))
def sim():
    """Generator simulation over a 1000-node VectorGraph.

    For every fifth node, bumps an edge to each later node, then yields a
    dict of {'number of edges', 'total edge weight'} per sampled node.
    """
    # Imported locally, matching the style of the other drivers in this
    # module — there is no module-level import of VectorGraph, so the
    # previous version would have raised NameError.
    from vectorgraph import VectorGraph
    graph = VectorGraph(DIM=500)
    ids = [str(i) for i in range(1000)]
    for c in ids:
        node = graph.create_node(c)
        graph.add(node)
    counts = {c: 0 for c in ids}
    # Sample every fifth id (index divisible by 5).
    samples = [(i, c) for i, c in enumerate(ids) if not i % 5]
    for i, c in samples:
        n1 = graph[c]
        for c2 in ids[i:]:
            n2 = graph[c2]
            n1.bump_edge(n2)
            counts[c] += 1
    for i, c in samples:
        n1 = graph[c]
        total_edge = sum(n1.edge_weight(n2) for n2 in graph.nodes)
        yield {'number of edges': counts[c],
               'total edge weight': total_edge}
class Numila(object):
    """A chunking, graphical model of language acquisition.

    Parameterized by params.yml. The two critical parameters are a graph
    class that matches the interface in abstract_graph.py and a parsing
    class.
    """

    def __init__(self, param_file='params.yml', name='numila',
                 log_stream='WARNING', log_file='WARNING', **params):
        """Load parameters from `param_file`, overridden by **params.

        Raises ValueError for any keyword not present in the param file,
        or for an unrecognized GRAPH/PARSE parameter value.
        """
        self.name = name
        self.log = utils.get_logger(name, stream=log_stream, file=log_file)

        # Read default params from file, overwriting with keyword arguments.
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted input; consider yaml.safe_load if
        # params.yml uses no custom tags — confirm before changing.
        with open(param_file) as f:
            self.params = yaml.load(f.read())
        # Previously a redundant bare `raise ValueError` ran before this
        # loop, so the informative message was unreachable; the message
        # was also missing a space after the key. Both fixed — callers
        # still see a ValueError for unknown parameters.
        for k in params:
            if k not in self.params:
                raise ValueError(k + ' is not a valid parameter.')
        self.params.update(params)
        self.log.info('parameters:\n\n%s\n',
                      yaml.dump(self.params, default_flow_style=False))

        # The GRAPH parameter determines which implementation of a Graph
        # this Numila instance should use. Thus, Numila is a class that
        # is parameterized by another class, similarly to how a functor
        # in OCaml is a module parameterized by another module.
        graph = self.params['GRAPH'].lower()
        if graph.startswith('vector'):
            from vectorgraph import VectorGraph as Graph
        elif graph.startswith('prob'):
            from probgraph import ProbGraph as Graph
        else:
            raise ValueError('Invalid GRAPH parameter: {}'.format(self.params['GRAPH']))
        self.graph = Graph(edges=['ftp', 'btp'], **self.params)

        # Same deal for Parse.
        parse = self.params['PARSE'].lower()
        if parse == 'greedy':
            from greedy_parse import GreedyParse as Parse
        elif parse == 'full':
            from full_parse import FullParse as Parse
            if self.graph.HIERARCHICAL:
                self.log.warning('FullParse can only be used with non-hierarchical merge')
                self.graph.HIERARCHICAL = False
        else:
            raise ValueError('Invalid PARSE parameter: {}'.format(self.params['PARSE']))
        self.Parse = Parse
        self._debug = {'speak_chunks': 0}  # speak() records chunk counts here

    def parse(self, utterance, learn=True):
        """Parses the utterance and returns the result (a self.Parse)."""
        if self.params['DECAY']:
            self.graph.decay()
        if isinstance(utterance, str):
            utterance = utterance.split(' ')
        if self.params['ADD_BOUNDARIES']:
            # 'ø' marks utterance boundaries.
            utterance = ['ø'] + utterance + ['ø']
        return self.Parse(self, utterance, learn=learn)

    def fit(self, training_corpus, lap=None):
        """Trains the model on a training corpus.

        If `lap` is given, timing is logged every `lap` utterances.
        Training can be interrupted with Ctrl-C; the model keeps
        everything learned so far. Returns self.
        """
        count = 0  # previously unbound (NameError) if the corpus was empty
        with utils.Timer(print_func=None) as timer:
            try:
                for count, utt in enumerate(training_corpus, 1):
                    self.parse(utt)
                    if lap and count % lap == 0:
                        timer.lap(count)
            except KeyboardInterrupt:
                pass  # allow interruption of training
        self.log.warning('Trained on %s utterances in %s seconds',
                         count, timer.elapsed)
        return self

    def score(self, utt, **kwargs):
        """Returns a grammaticality score for an utterance."""
        return self.parse(utt, learn=False).score(**kwargs)

    def map_score(self, utts, **kwargs):
        """Scores each utterance in `utts`, logging the total time taken."""
        with utils.Timer(print_func=None) as timer:
            result = [self.score(u, **kwargs) for u in utts]
        self.log.warning('Scored %s utterances in %s seconds',
                         len(utts), timer.elapsed)
        return result

    @utils.contract(lambda x: 0 <= x <= 1)
    def chunkiness(self, node1, node2):
        """How well two nodes form a chunk.

        The geometric mean of forward transitional probability and
        backward transitional probability. BTP_PREFERENCE weights btp
        relative to ftp; the special value 'only' uses btp exclusively.
        """
        ftp_weight = 1
        btp_weight = self.params['BTP_PREFERENCE']
        if btp_weight == 'only':
            ftp_weight, btp_weight = 0, 1
        generalize = self.params['GENERALIZE']
        ftp = node1.edge_weight(node2, 'ftp', generalize=generalize)
        btp = node2.edge_weight(node1, 'btp', generalize=generalize)
        # Weighted geometric mean of the two transitional probabilities.
        sum_weights = btp_weight + ftp_weight
        gmean = (ftp ** ftp_weight * btp ** btp_weight) ** (1 / sum_weights)
        return gmean

    def get_chunk(self, node1, node2, *, create=False, add=False):
        """Returns a chunk of node1 and node2 if the chunk is in the graph.

        If `create` is False, we only return the desired chunk if it has
        been stored as an exemplar in the graph (otherwise None).
        Otherwise, we always return a chunk, creating it if necessary.
        If `add` is also True, a newly created chunk is added to the graph.
        """
        existing_chunk = self.graph.get_chunk(node1, node2)
        if existing_chunk:
            return existing_chunk
        # Boundary markers must never be chunked.
        assert not (node1.id_string == 'ø' or node2.id_string == 'ø')
        if create:
            # If a node with the same id is already in the graph but is a
            # different object, use the graph's canonical node instead.
            if node1.id_string in self.graph and node1 is not self.graph[node1.id_string]:
                self.log.debug('Fixing a chunk node')
                node1 = self.graph[node1.id_string]
            if node2.id_string in self.graph and node2 is not self.graph[node2.id_string]:
                self.log.debug('Fixing a chunk node')
                node2 = self.graph[node2.id_string]
            chunk = self.graph.bind(node1, node2)
            if add:
                self.graph.add(chunk)
            return chunk

    def add_chunk(self, chunk):
        """Adds a chunk node to the graph."""
        self.graph.add(chunk)
        self.log.debug('new chunk: %s', chunk)

    def speak(self, words, verbose=False, return_flat=True,
              preshuffled=False, order_func=None):
        """Returns the list of words ordered properly.

        Greedily combines the chunkiest pairs of nodes into known chunks,
        then orders the remaining nodes with `order_func` (defaulting to
        the SPEAK parameter's strategy: 'markov' or 'outward').
        """
        # Get all the base token nodes.
        def get_node(token):
            try:
                return self.graph[token]
            except KeyError:
                self.log.debug('Unknown token while speaking: %s', token)
                return self.graph.create_node(token)

        nodes = [get_node(w) for w in words]
        if not preshuffled:
            # In the case of a tie, the first pair is chosen, thus we shuffle
            # to make this effect random.
            np.random.shuffle(nodes)

        # Convert as many nodes as possible into chunks by combining
        # the two chunkiest nodes into a chunk until can't chunk again.
        while len(nodes) > 1:
            self.log.debug('nodes: %s', nodes)
            pairs = list(itertools.permutations(nodes, 2))
            best_pair = max(pairs, key=lambda pair: self.chunkiness(*pair))
            node1, node2 = best_pair
            # create=False: only use chunks already stored in the graph.
            chunk = self.get_chunk(node1, node2, create=False)
            self.log.debug('chunk: %s', chunk)
            if not chunk:
                break
            nodes.remove(node1)
            nodes.remove(node2)
            nodes.append(chunk)

        self._debug['speak_chunks'] = sum(1 for n in nodes if n.children)

        if order_func is None:
            # ordering function is a parameter
            order_func = {'markov': self._order_markov,
                          'outward': self._order_outward}[self.params['SPEAK']]
        utterance = list(order_func(nodes))
        if return_flat:
            return utils.flatten_parse(utterance)
        else:
            return utterance

    def _order_markov(self, nodes):
        """Yields nodes in order, always taking the chunkiest successor."""
        last_node = self.graph['ø']
        while nodes:
            #next_node = max(nodes, key=lambda n: self.chunkiness(last_node, n))
            best_idx = np.argmax([self.chunkiness(last_node, n) for n in nodes])
            next_node = nodes.pop(best_idx)
            yield next_node
            last_node = next_node

    def _order_outward(self, nodes):
        """Orders nodes by growing the utterance outward from its ends."""
        # most_common = max(nodes, key=node.weight)  # TODO
        utterance = [nodes.pop(0)]
        while nodes:
            # Add a node to the beginning or end of the utterance,
            # whichever placement is chunkiest.
            begin_chunkinesses = [self.chunkiness(n, utterance[0]) for n in nodes]
            end_chunkinesses = [self.chunkiness(utterance[-1], n) for n in nodes]
            best_idx = np.argmax(begin_chunkinesses + end_chunkinesses)
            if best_idx >= len(nodes):
                # Index fell in end_chunkinesses: append to the end.
                utterance.append(nodes.pop(best_idx % len(nodes)))
            else:
                utterance.insert(0, nodes.pop(best_idx))
        return utterance