def build(self, text=None):
    """
    Take text (for example as returned by corpus() ) and build a trie.

    Starting at every character position of text, one n-gram is grown
    character by character and added to a fresh TreeStructure.trie().
    The n-gram stops growing as soon as it holds self.max_chars characters
    or has passed self.max_words word endings (a space that follows a
    non-space), whichever happens first, or when the text runs out.

    text: the corpus to model; if None, self.read_corpus() is used.

    Stores the trie in self.prefixtree, the text in self.corpus and the
    first six fields of time.localtime() in self.timestamp.
    Returns self.
    """
    if text is None:
        text = self.read_corpus()
    t = TreeStructure.trie()
    istop = len(text)
    pr = progress(istop, 'building language model')
    for i in range(istop):
        substr = ''
        nwords = 0
        # Start as if a space preceded the n-gram, so that a leading space
        # is not counted as the end of a word.
        lastchar = ' '
        for j in range(i, istop):
            char = text[j]
            substr += char
            if char == ' ' and lastchar != ' ':
                nwords += 1
            lastchar = char
            if len(substr) >= self.max_chars or nwords >= self.max_words:
                break
        t.add(substr)
        if i % 5000 == 0:
            pr.update(i, '(%d nodes)' % len(t.nodes))
    # Finish the progress display, as clean() does for its own progress object.
    pr.done()
    self.prefixtree = t
    self.corpus = text
    self.timestamp = time.localtime()[:6]
    return self
def clean(self, text):
    """
    Return a cleaned-up copy of text as a single string.

    The text is first passed through self.match_case(). If newline is not
    in self.alphabet, newlines are replaced by spaces. The text is then
    handled one character at a time, with a '. ' piece put in front and a
    ' ' piece put at the end:

    * a piece that is not in self.alphabet is replaced by its entry in
      self.translations, or by '' if it has none;
    * if self.space_delimited is true, punctuation is spaced out (see the
      comment in the loop);
    * a piece that follows a piece ending in a space or newline loses its
      own leading spaces.
    """
    text = self.match_case(text)
    if '\n' not in self.alphabet:
        text = text.replace('\n', ' ')
    pieces = ['. '] + list(text) + [' ']
    total = len(pieces)
    pr = progress(total, 'cleaning')
    cleaned = []
    for pos, piece in enumerate(pieces):
        # The neighbour before is taken in its already-cleaned form, the
        # neighbour after in its raw form.
        before = cleaned[-1] if cleaned else ''
        after = pieces[pos + 1] if pos + 1 < total else ''
        if piece not in self.alphabet:
            piece = self.translations.get(piece, '')
        if self.space_delimited:
            # In space_delimited mode, a space unambiguously marks the end
            # of a word. Ending the sentence is a separate matter (only
            # likely to happen after the end of a word), so each
            # punctuation concept ("sentence ends now", "sentence ends now
            # as a question", etc.) is effectively turned into a "word" of
            # its own. For now the default is *not* to work in
            # space_delimited mode, but in future it would be desirable.
            # However, TODO: how to undo this at the prediction stage??
            if piece in '.,?!;:' and after == ' ':
                piece = ' ' + piece
            elif piece in '"()[]{}' and before == ' ':
                piece = piece + ' '
        if before.endswith((' ', '\n')):
            # collapse multiple spaces
            piece = piece.lstrip(' ')
        cleaned.append(piece)
        if pos % 5000 == 0:
            pr.update(pos)
    pr.done()
    return ''.join(cleaned)
def build(self, text=None):
    """
    Take text (for example as returned by corpus() ) and build a trie.

    Starting at every character position of text, one n-gram is grown
    character by character and added to a fresh TreeStructure.trie().
    The n-gram stops growing as soon as it holds self.max_chars characters
    or has passed self.max_words word endings (a space that follows a
    non-space), whichever happens first, or when the text runs out.

    text: the corpus to model; if None, self.read_corpus() is used.

    Stores the trie in self.prefixtree, the text in self.corpus and the
    first six fields of time.localtime() in self.timestamp.
    Returns self.
    """
    if text is None:
        text = self.read_corpus()
    t = TreeStructure.trie()
    istop = len(text)
    pr = progress(istop, 'building language model')
    for i in range(istop):
        substr = ''
        nwords = 0
        # Start as if a space preceded the n-gram, so that a leading space
        # is not counted as the end of a word.
        lastchar = ' '
        for j in range(i, istop):
            char = text[j]
            substr += char
            if char == ' ' and lastchar != ' ':
                nwords += 1
            lastchar = char
            if len(substr) >= self.max_chars or nwords >= self.max_words:
                break
        t.add(substr)
        if i % 5000 == 0:
            pr.update(i, '(%d nodes)' % len(t.nodes))
    # Finish the progress display, as clean() does for its own progress object.
    pr.done()
    self.prefixtree = t
    self.corpus = text
    self.timestamp = time.localtime()[:6]
    return self
def clean(self, text):
    """
    Return a cleaned-up copy of text as a single string.

    The text is first passed through self.match_case(). If newline is not
    in self.alphabet, newlines are replaced by spaces. The text is then
    handled one character at a time, with a '. ' piece put in front and a
    ' ' piece put at the end:

    * a piece that is not in self.alphabet is replaced by its entry in
      self.translations, or by '' if it has none;
    * if self.space_delimited is true, punctuation is spaced out (see the
      comment in the loop);
    * a piece that follows a piece ending in a space or newline loses its
      own leading spaces.
    """
    text = self.match_case(text)
    if '\n' not in self.alphabet:
        text = text.replace('\n', ' ')
    pieces = ['. '] + list(text) + [' ']
    total = len(pieces)
    pr = progress(total, 'cleaning')
    cleaned = []
    for pos, piece in enumerate(pieces):
        # The neighbour before is taken in its already-cleaned form, the
        # neighbour after in its raw form.
        before = cleaned[-1] if cleaned else ''
        after = pieces[pos + 1] if pos + 1 < total else ''
        if piece not in self.alphabet:
            piece = self.translations.get(piece, '')
        if self.space_delimited:
            # In space_delimited mode, a space unambiguously marks the end
            # of a word. Ending the sentence is a separate matter (only
            # likely to happen after the end of a word), so each
            # punctuation concept ("sentence ends now", "sentence ends now
            # as a question", etc.) is effectively turned into a "word" of
            # its own. For now the default is *not* to work in
            # space_delimited mode, but in future it would be desirable.
            # However, TODO: how to undo this at the prediction stage??
            if piece in '.,?!;:' and after == ' ':
                piece = ' ' + piece
            elif piece in '"()[]{}' and before == ' ':
                piece = piece + ' '
        if before.endswith((' ', '\n')):
            # collapse multiple spaces
            piece = piece.lstrip(' ')
        cleaned.append(piece)
        if pos % 5000 == 0:
            pr.update(pos)
    pr.done()
    return ''.join(cleaned)
def packtrie(t):
    """
    Helper function for trie.pack()

    Calls packnode() on every node of t.nodes, in order, and returns the
    results joined into one string. If the module-level name progress is
    not None, it is used to report progress every 5000 nodes.

    Raises TrieError if t.ispacked() is true.
    """
    if t.ispacked():
        # Call form of raise: valid on both Python 2 and Python 3.
        raise TrieError('already packed')
    # lut[k] is the sum of packed_size() over nodes 0..k-1 -- presumably
    # the offset at which node k starts in the packed output.
    lut = [0]
    for node in t.nodes[:-1]:
        lut.append(lut[-1] + node.packed_size())
    show_progress = progress is not None
    if show_progress:
        pr = progress(len(t.nodes), ' packing')
    # Collect the pieces and join once, rather than growing a string with
    # += for every node.
    parts = []
    for i, node in enumerate(t.nodes):
        parts.append(packnode(node, lut=lut))
        if show_progress and i % 5000 == 0:
            pr.update(i, '(%d nodes)' % i)
    if show_progress:
        pr.done()
    return ''.join(parts)
def packtrie(t):
    """
    Helper function for trie.pack()

    Calls packnode() on every node of t.nodes, in order, and returns the
    results joined into one string. If the module-level name progress is
    not None, it is used to report progress every 5000 nodes.

    Raises TrieError if t.ispacked() is true.
    """
    if t.ispacked():
        # Call form of raise: valid on both Python 2 and Python 3.
        raise TrieError('already packed')
    # lut[k] is the sum of packed_size() over nodes 0..k-1 -- presumably
    # the offset at which node k starts in the packed output.
    lut = [0]
    for node in t.nodes[:-1]:
        lut.append(lut[-1] + node.packed_size())
    show_progress = progress is not None
    if show_progress:
        pr = progress(len(t.nodes), ' packing')
    # Collect the pieces and join once, rather than growing a string with
    # += for every node.
    parts = []
    for i, node in enumerate(t.nodes):
        parts.append(packnode(node, lut=lut))
        if show_progress and i % 5000 == 0:
            pr.update(i, '(%d nodes)' % i)
    if show_progress:
        pr.done()
    return ''.join(parts)
def unpacktrie(x):
    """
    Helper function for trie.unpack()

    Reads nodes out of x one after another with unpacknode() until the end
    of x is reached, and returns a new trie() holding them in t.nodes. If
    the module-level name progress is not None, it is used to report
    progress every 5000 nodes.
    """
    t = trie()
    t.nodes = []
    # rlut maps the offset in x at which a node starts to that node's
    # index in t.nodes.
    rlut = {}
    total = len(x)
    show_progress = progress is not None
    if show_progress:
        pr = progress(total, 'unpacking')
    ind = 0
    while ind < total:
        rlut[ind] = len(t.nodes)
        node, ind = unpacknode(x, ind)
        t.nodes.append(node)
        nnodes = len(t.nodes)
        if show_progress and nnodes % 5000 == 0:
            pr.update(ind, '(%d nodes)' % nnodes)
    # Finish the progress display, as packtrie() does.
    if show_progress:
        pr.done()
    # Second pass: the parent and child references of the freshly unpacked
    # nodes are keys of rlut (start offsets in x); replace each one by the
    # corresponding index into t.nodes.
    for node in t.nodes:
        node.parent = rlut[node.parent]
        for key, val in node.children.items():
            node.children[key] = rlut[val]
    return t
def unpacktrie(x):
    """
    Helper function for trie.unpack()

    Reads nodes out of x one after another with unpacknode() until the end
    of x is reached, and returns a new trie() holding them in t.nodes. If
    the module-level name progress is not None, it is used to report
    progress every 5000 nodes.
    """
    t = trie()
    t.nodes = []
    # rlut maps the offset in x at which a node starts to that node's
    # index in t.nodes.
    rlut = {}
    total = len(x)
    show_progress = progress is not None
    if show_progress:
        pr = progress(total, 'unpacking')
    ind = 0
    while ind < total:
        rlut[ind] = len(t.nodes)
        node, ind = unpacknode(x, ind)
        t.nodes.append(node)
        nnodes = len(t.nodes)
        if show_progress and nnodes % 5000 == 0:
            pr.update(ind, '(%d nodes)' % nnodes)
    # Finish the progress display, as packtrie() does.
    if show_progress:
        pr.done()
    # Second pass: the parent and child references of the freshly unpacked
    # nodes are keys of rlut (start offsets in x); replace each one by the
    # corresponding index into t.nodes.
    for node in t.nodes:
        node.parent = rlut[node.parent]
        for key, val in node.children.items():
            node.children[key] = rlut[val]
    return t