Пример #1
0
    def build(self, text=None):
        """
        Build a trie of character n-grams from text and store it on self.

        One n-gram is added to the trie for every start position in text.
        Each n-gram grows one character at a time and is only cut off at the
        end of a word (a space that follows a non-space character), and only
        once it is at least self.max_chars characters long or contains at
        least self.max_words words. Otherwise it runs to the end of text.

        text -- the characters to index. When None, the result of
                self.read_corpus() is used instead.

        Sets self.prefixtree (the trie), self.corpus (the text that was
        indexed) and self.timestamp (the first six fields of
        time.localtime()), then returns self.
        """
        if text is None:
            text = self.read_corpus()
        t = TreeStructure.trie()
        istop = len(text)
        pr = progress(istop, 'building language model')
        for i in range(istop):
            substr = ''
            j = i
            nwords = 0
            # Starting from ' ' means a space at the very start of the n-gram
            # is not counted as the end of a word.
            lastchar = ' '
            while j < istop:
                char = text[j]
                substr += char
                endword = (char == ' ' and lastchar != ' ')
                lastchar = char
                j += 1
                if endword:
                    nwords += 1
                    # Limits are only tested at word boundaries, so an n-gram
                    # never stops in the middle of a word.
                    if len(substr) >= self.max_chars or nwords >= self.max_words:
                        break
            t.add(substr)
            if i % 5000 == 0:
                pr.update(i, '(%d nodes)' % len(t.nodes))
        # Finish the progress report, as clean() does for its own reporter.
        pr.done()
        self.prefixtree = t
        self.corpus = text
        self.timestamp = time.localtime()[:6]
        return self
Пример #2
0
    def clean(self, text):
        """
        Return text reduced to what self.alphabet / self.translations allow.

        Steps, in order:
          * text is passed through self.match_case();
          * newlines become spaces unless the newline character is in
            self.alphabet;
          * the characters are bracketed by a leading '. ' element and a
            trailing ' ' element;
          * every element that is not in self.alphabet is replaced by its
            entry in self.translations, or by '' when it has none;
          * when self.space_delimited is true, extra spaces are inserted
            around punctuation (see the comments in the loop);
          * spaces at the start of an element are stripped when the previous,
            already cleaned, element ends with a space or a newline.

        The cleaned elements are joined and returned as one string.
        """
        text = self.match_case(text)
        if '\n' not in self.alphabet:
            text = text.replace('\n', ' ')

        pieces = ['. ']
        pieces.extend(text)
        pieces.append(' ')

        total = len(pieces)
        pr = progress(total, 'cleaning')
        for pos in range(total):
            # `before` has already been cleaned by an earlier iteration;
            # `after` is still the raw, unprocessed element.
            before = pieces[pos - 1] if pos > 0 else ''
            after = pieces[pos + 1] if pos + 1 < total else ''

            item = pieces[pos]
            if item not in self.alphabet:
                item = self.translations.get(item, '')

            if self.space_delimited:
                # In this mode a space unambiguously marks the end of a word,
                # and each punctuation concept ("sentence ends now",
                # "sentence ends now as a question", ...) effectively becomes
                # a word of its own. The original author notes that this mode
                # is not the default for now, but would be desirable later.
                # TODO: how to undo this at the prediction stage?
                # NOTE: an element emptied by the translation step also passes
                # these `in` tests, because '' is a substring of any string.
                if item in '.,?!;:' and after == ' ':
                    item = ' ' + item
                elif item in '"()[]{}' and before == ' ':
                    item += ' '

            if before.endswith((' ', '\n')):
                # collapse multiple spaces
                item = item.lstrip(' ')

            pieces[pos] = item
            if pos % 5000 == 0:
                pr.update(pos)
        pr.done()
        return ''.join(pieces)
	def build(self, text=None):
		"""
		Build a trie of character n-grams from text and store it on self.

		One n-gram is added to the trie for every start position in text.
		Each n-gram grows one character at a time and is only cut off at the
		end of a word (a space that follows a non-space character), and only
		once it is at least self.max_chars characters long or contains at
		least self.max_words words. Otherwise it runs to the end of text.

		text -- the characters to index. When None, the result of
		        self.read_corpus() is used instead.

		Sets self.prefixtree (the trie), self.corpus (the text that was
		indexed) and self.timestamp (the first six fields of
		time.localtime()), then returns self.
		"""
		if text is None:
			text = self.read_corpus()
		t = TreeStructure.trie()
		istop = len(text)
		pr = progress(istop, 'building language model')
		for i in range(istop):
			substr = ''
			j = i
			nwords = 0
			# Starting from ' ' means a space at the very start of the n-gram
			# is not counted as the end of a word.
			lastchar = ' '
			while j < istop:
				char = text[j]
				substr += char
				endword = (char == ' ' and lastchar != ' ')
				lastchar = char
				j += 1
				if endword:
					nwords += 1
					# Limits are only tested at word boundaries, so an n-gram
					# never stops in the middle of a word.
					if len(substr) >= self.max_chars or nwords >= self.max_words:
						break
			t.add(substr)
			if i % 5000 == 0:
				pr.update(i, '(%d nodes)' % len(t.nodes))
		# Finish the progress report, as clean() does for its own reporter.
		pr.done()
		self.prefixtree = t
		self.corpus = text
		self.timestamp = time.localtime()[:6]
		return self
	def clean(self, text):
		"""
		Return text reduced to what self.alphabet / self.translations allow.

		Steps, in order:
		  * text is passed through self.match_case();
		  * newlines become spaces unless the newline character is in
		    self.alphabet;
		  * the characters are bracketed by a leading '. ' element and a
		    trailing ' ' element;
		  * every element that is not in self.alphabet is replaced by its
		    entry in self.translations, or by '' when it has none;
		  * when self.space_delimited is true, extra spaces are inserted
		    around punctuation (see the comments in the loop);
		  * spaces at the start of an element are stripped when the previous,
		    already cleaned, element ends with a space or a newline.

		The cleaned elements are joined and returned as one string.
		"""
		text = self.match_case(text)
		if '\n' not in self.alphabet:
			text = text.replace('\n', ' ')

		pieces = ['. ']
		pieces.extend(text)
		pieces.append(' ')

		total = len(pieces)
		pr = progress(total, 'cleaning')
		for pos in range(total):
			# `before` has already been cleaned by an earlier iteration;
			# `after` is still the raw, unprocessed element.
			before = pieces[pos - 1] if pos > 0 else ''
			after = pieces[pos + 1] if pos + 1 < total else ''

			item = pieces[pos]
			if item not in self.alphabet:
				item = self.translations.get(item, '')

			if self.space_delimited:
				# In this mode a space unambiguously marks the end of a word,
				# and each punctuation concept ("sentence ends now",
				# "sentence ends now as a question", ...) effectively becomes
				# a word of its own. The original author notes that this mode
				# is not the default for now, but would be desirable later.
				# TODO: how to undo this at the prediction stage?
				# NOTE: an element emptied by the translation step also passes
				# these `in` tests, because '' is a substring of any string.
				if item in '.,?!;:' and after == ' ':
					item = ' ' + item
				elif item in '"()[]{}' and before == ' ':
					item += ' '

			if before.endswith((' ', '\n')):
				# collapse multiple spaces
				item = item.lstrip(' ')

			pieces[pos] = item
			if pos % 5000 == 0:
				pr.update(pos)
		pr.done()
		return ''.join(pieces)
def packtrie(t):
	"""
	Helper function for trie.pack(): pack every node of trie t into one string.

	t -- the trie to pack; it must not be packed already.

	Returns the concatenation of packnode() applied to each node of t.nodes,
	in order. Raises TrieError if t.ispacked() is true.

	Progress is reported through the module-level `progress` reporter unless
	that name is None.
	"""
	if t.ispacked():
		# Call form of raise: valid on Python 2 and Python 3 alike.
		raise TrieError('already packed')

	# lut[k] is the summed packed_size() of the nodes before node k
	# (presumably the offset at which node k starts in the packed string).
	lut = [0]
	for node in t.nodes[:-1]:
		lut.append(lut[-1] + node.packed_size())

	show_progress = progress is not None
	if show_progress:
		pr = progress(len(t.nodes), ' packing')

	# Collect the pieces and join once rather than growing a string with +=.
	pieces = []
	for i, node in enumerate(t.nodes):
		pieces.append(packnode(node, lut=lut))
		if show_progress and i % 5000 == 0:
			pr.update(i, '(%d nodes)' % i)
	if show_progress:
		pr.done()
	return ''.join(pieces)
Пример #6
0
def packtrie(t):
    """
    Helper function for trie.pack(): pack every node of trie t into one string.

    t -- the trie to pack; it must not be packed already.

    Returns the concatenation of packnode() applied to each node of t.nodes,
    in order. Raises TrieError if t.ispacked() is true.

    Progress is reported through the module-level `progress` reporter unless
    that name is None.
    """
    if t.ispacked():
        # Call form of raise: valid on Python 2 and Python 3 alike.
        raise TrieError('already packed')

    # lut[k] is the summed packed_size() of the nodes before node k
    # (presumably the offset at which node k starts in the packed string).
    lut = [0]
    for node in t.nodes[:-1]:
        lut.append(lut[-1] + node.packed_size())

    show_progress = progress is not None
    if show_progress:
        pr = progress(len(t.nodes), ' packing')

    # Collect the pieces and join once rather than growing a string with +=.
    pieces = []
    for i, node in enumerate(t.nodes):
        pieces.append(packnode(node, lut=lut))
        if show_progress and i % 5000 == 0:
            pr.update(i, '(%d nodes)' % i)
    if show_progress:
        pr.done()
    return ''.join(pieces)
def unpacktrie(x):
	"""
	Helper function for trie.unpack(): rebuild a trie from packed data x.

	x -- packed data, read node by node with unpacknode() from offset 0
	     until the end of x is reached.

	Returns a new trie whose nodes list holds the unpacked nodes in the order
	they were read, with each node's parent and children expressed as
	indices into that list.

	Progress is reported through the module-level `progress` reporter unless
	that name is None.
	"""
	t = trie()
	t.nodes = []
	ind = 0
	# Offset in x at which a node starts -> index of that node in t.nodes.
	rlut = {}
	show_progress = progress is not None
	if show_progress:
		pr = progress(len(x), 'unpacking')
	while ind < len(x):
		rlut[ind] = len(t.nodes)
		node, ind = unpacknode(x, ind)
		t.nodes.append(node)
		nnodes = len(t.nodes)
		if show_progress and nnodes % 5000 == 0:
			pr.update(ind, '(%d nodes)' % nnodes)
	# Finish the progress report, as packtrie() does for its own reporter.
	if show_progress:
		pr.done()
	# Freshly unpacked nodes refer to parent and children by starting offset
	# in x (the keys of rlut); convert those references to list indices.
	for node in t.nodes:
		node.parent = rlut[node.parent]
		for key, val in node.children.items():
			node.children[key] = rlut[val]
	return t
Пример #8
0
def unpacktrie(x):
    """
    Helper function for trie.unpack(): rebuild a trie from packed data x.

    x -- packed data, read node by node with unpacknode() from offset 0
         until the end of x is reached.

    Returns a new trie whose nodes list holds the unpacked nodes in the order
    they were read, with each node's parent and children expressed as
    indices into that list.

    Progress is reported through the module-level `progress` reporter unless
    that name is None.
    """
    t = trie()
    t.nodes = []
    ind = 0
    # Offset in x at which a node starts -> index of that node in t.nodes.
    rlut = {}
    show_progress = progress is not None
    if show_progress:
        pr = progress(len(x), 'unpacking')
    while ind < len(x):
        rlut[ind] = len(t.nodes)
        node, ind = unpacknode(x, ind)
        t.nodes.append(node)
        nnodes = len(t.nodes)
        if show_progress and nnodes % 5000 == 0:
            pr.update(ind, '(%d nodes)' % nnodes)
    # Finish the progress report, as packtrie() does for its own reporter.
    if show_progress:
        pr.done()
    # Freshly unpacked nodes refer to parent and children by starting offset
    # in x (the keys of rlut); convert those references to list indices.
    for node in t.nodes:
        node.parent = rlut[node.parent]
        for key, val in node.children.items():
            node.children[key] = rlut[val]
    return t