def add_words_to_nx_graph(graph, phrase):
    """Add every sentence of *phrase* to *graph* as a word chain.

    Each sentence becomes a path START -> w1 -> w2 -> ... -> wn -> END,
    where START and END are shared sentinel nodes looked up (or created)
    as needed and reused across sentences.

    Args:
        graph: graph object exposing ``find_node_by_key``/``add_node``;
            its nodes expose ``add_neighbor``. (Project type — interface
            inferred from the calls below; confirm against its class.)
        phrase: raw text; tokenized into sentences of at most 50 tokens,
            lowercased via ``transform_call``.
    """
    # TODO: decide if 50 is the right choice here
    for sentence in tokenize_sentences(phrase, 50, transform_call=lambda s: s.lower()):
        words = sentence.split()
        if not words:
            # Whitespace-only sentence: nothing to chain. The original
            # code would raise IndexError on words.pop(0) here.
            continue
        # Resolve (or lazily create) the shared "start" sentinel.
        start_node = graph.find_node_by_key(Symbols.START)
        if not start_node:
            start_node = graph.add_node(Symbols.START)
        # First word of the sentence hangs off the START sentinel.
        prev_node = graph.add_node(words[0])
        start_node.add_neighbor(prev_node)
        # Chain each subsequent word to its predecessor.
        for w in words[1:]:
            node = graph.add_node(w)
            prev_node.add_neighbor(node)
            prev_node = node
        # Terminate the chain with the shared "end" sentinel.
        end_node = graph.find_node_by_key(Symbols.END)
        if not end_node:
            end_node = graph.add_node(Symbols.END)
        prev_node.add_neighbor(end_node)
def ingest(self, phrase):
    """Fold *phrase* into this 2nd-order Markov chain's tables.

    Each sentence is tokenized (max 50 tokens, lowercased), terminated
    with ``Symbols.END``, and counted into:

    * ``self.__heads`` — occurrence counts of sentence-opening word pairs
      (via its ``increment`` method);
    * ``self.__words`` — ``{(w1, w2): {w3: count}}`` transition counts.

    Args:
        phrase: raw text to tokenize and count.
    """
    for sentence in tokenize_sentences(phrase, 50, lowercase=True):
        phrase_words = sentence.split()
        phrase_words.append(Symbols.END)
        phrase_len = len(phrase_words)
        # Phrases under 3 words are of no use to a 2nd-order chain.
        # BUG FIX: the original `return` here silently aborted processing
        # of every remaining sentence in the phrase; skip only this one.
        if phrase_len < 3:
            continue
        # Remember the sentence-opening pair for generation later.
        self.__heads.increment((phrase_words[0], phrase_words[1]))
        # Count every (w1, w2) -> w3 transition in the sentence.
        for i in range(phrase_len - 2):
            w_pair = (phrase_words[i], phrase_words[i + 1])
            w3 = phrase_words[i + 2]
            if w_pair in self.__words:
                trailing_words = self.__words[w_pair]
                # trailing_words is a plain dict (created below), so
                # .get with a default collapses the inner if/else.
                trailing_words[w3] = trailing_words.get(w3, 0) + 1
            else:
                self.__words[w_pair] = {w3: 1}