Exemplo n.º 1
0
    def build(self, text=None):
        """
        Build the prefix trie (the language model) from a text corpus.

        For every start position in ``text`` a substring is collected
        character by character and added to a fresh trie. Collection stops at
        the end of the first word at which either the substring has reached
        ``self.max_chars`` characters or ``self.max_words`` words have been
        completed, or at the end of the text. The limits are only tested at
        word ends, so a substring can be longer than ``self.max_chars`` when
        the last word is long.

        text: the corpus to index; if None, ``self.read_corpus()`` is used.

        Stores the new trie in ``self.prefixtree``, the text in
        ``self.corpus`` and the build time in ``self.timestamp``, then
        returns ``self``.
        """
        if text is None:
            text = self.read_corpus()
        # The limits do not change while building, so look them up once.
        max_chars = self.max_chars
        max_words = self.max_words
        t = TreeStructure.trie()
        istop = len(text)
        pr = progress(istop, 'building language model')
        for i in range(istop):
            substr = ''
            j = i
            nwords = 0
            # Starting from "space" means leading spaces never count as the
            # end of a word.
            lastchar = ' '
            while j < istop:
                char = text[j]
                substr += char
                # A word ends on the first space after a non-space character.
                endword = (char == ' ' and lastchar != ' ')
                lastchar = char
                j += 1
                if endword:
                    nwords += 1
                    if len(substr) >= max_chars or nwords >= max_words:
                        break
            t.add(substr)
            # Report progress only every 5000 start positions.
            if i % 5000 == 0:
                pr.update(i, '(%d nodes)' % len(t.nodes))
        self.prefixtree = t
        self.corpus = text
        # Year, month, day, hour, minute, second of the local build time.
        self.timestamp = time.localtime()[:6]
        return self
Exemplo n.º 2
0
    def __init__(self,
                 filename=None,
                 alphabet=None,
                 translation_file=None,
                 space_delimited=False,
                 max_chars=16,
                 max_words=3,
                 trim=True,
                 verbose=False):
        """
        Store the model settings, create an empty trie and optionally load a
        saved one.

        filename: if not None, passed to ``self.loadtrie()`` to load a trie.
        alphabet: the symbol set to use; None selects ``default_alphabet``.
            Note that ``loadtrie()`` replaces ``self.alphabet`` with the
            alphabet found in the loaded trie, so this value only survives
            when no filename is given.
        translation_file: resolved with ``FindFile()`` and then handed to
            ``self.read_translations()``.
        space_delimited, trim, verbose: stored as attributes of the same
            name; not used in this method.
        max_chars, max_words: stored as attributes of the same name; they are
            the n-gram limits read by ``build()``.
        """
        if alphabet is None:
            alphabet = default_alphabet
        self.alphabet = alphabet
        self.space_delimited = space_delimited
        self.translation_file = FindFile(translation_file)
        self.max_chars = max_chars
        self.max_words = max_words
        self.trim = trim
        self.verbose = verbose
        # Filled in by build().
        self.corpus = None
        self.timestamp = None

        self.prefixtree = TreeStructure.trie()
        if filename is not None:
            self.loadtrie(filename)
        self.translations = self.read_translations(self.translation_file)
        # Names of attributes; presumably the ones that identify a model
        # configuration -- confirm against the code that reads hashattr.
        self.hashattr = [
            'space_delimited', 'max_words', 'max_chars', 'trim', 'translations'
        ]
	def build(self, text=None):
		"""
		Build the prefix trie (the language model) from a text corpus.

		For every start position in ``text`` a substring is collected
		character by character and added to a fresh trie. Collection stops at
		the end of the first word at which either the substring has reached
		``self.max_chars`` characters or ``self.max_words`` words have been
		completed, or at the end of the text. The limits are only tested at
		word ends, so a substring can be longer than ``self.max_chars`` when
		the last word is long.

		text: the corpus to index; if None, ``self.read_corpus()`` is used.

		Stores the new trie in ``self.prefixtree``, the text in
		``self.corpus`` and the build time in ``self.timestamp``, then
		returns ``self``.
		"""
		if text is None:
			text = self.read_corpus()
		# The limits do not change while building, so look them up once.
		max_chars = self.max_chars
		max_words = self.max_words
		t = TreeStructure.trie()
		istop = len(text)
		pr = progress(istop, 'building language model')
		for i in range(istop):
			substr = ''
			j = i
			nwords = 0
			# Starting from "space" means leading spaces never count as the
			# end of a word.
			lastchar = ' '
			while j < istop:
				char = text[j]
				substr += char
				# A word ends on the first space after a non-space character.
				endword = (char == ' ' and lastchar != ' ')
				lastchar = char
				j += 1
				if endword:
					nwords += 1
					if len(substr) >= max_chars or nwords >= max_words:
						break
			t.add(substr)
			# Report progress only every 5000 start positions.
			if i % 5000 == 0:
				pr.update(i, '(%d nodes)' % len(t.nodes))
		self.prefixtree = t
		self.corpus = text
		# Year, month, day, hour, minute, second of the local build time.
		self.timestamp = time.localtime()[:6]
		return self
Exemplo n.º 4
0
def convert(fileName):
  """
    Converts an input file from standard RAVEN XML to xml-converted GetPot by parsing it
    with TS.parse and calling printGetPot() on the parsed tree.
    NOTE(review): an earlier version of this docstring said a ".i" file is produced; no file
    is written in this function itself, so confirm whether printGetPot() or the caller does it.
    @ In, fileName, the name of the XML file to convert
    @ Out, the value returned by the parsed tree's printGetPot()
  """
  return TS.parse(fileName).printGetPot()
	def __init__(self, filename=None, alphabet=None, translation_file=None, space_delimited=False, max_chars=16, max_words=3, trim=True, verbose=False):
		"""
		Store the model settings, create an empty trie and optionally load a
		saved one.

		filename: if not None, passed to ``self.loadtrie()`` to load a trie.
		alphabet: the symbol set to use; None selects ``default_alphabet``.
			Note that ``loadtrie()`` replaces ``self.alphabet`` with the
			alphabet found in the loaded trie, so this value only survives
			when no filename is given.
		translation_file: resolved with ``FindFile()`` and then handed to
			``self.read_translations()``.
		space_delimited, trim, verbose: stored as attributes of the same
			name; not used in this method.
		max_chars, max_words: stored as attributes of the same name; they are
			the n-gram limits read by ``build()``.
		"""
		if alphabet is None:
			alphabet = default_alphabet
		self.alphabet = alphabet
		self.space_delimited = space_delimited
		self.translation_file = FindFile(translation_file)
		self.max_chars = max_chars
		self.max_words = max_words
		self.trim = trim
		self.verbose = verbose
		# Filled in by build().
		self.corpus = None
		self.timestamp = None

		self.prefixtree = TreeStructure.trie()
		if filename is not None:
			self.loadtrie(filename)
		self.translations = self.read_translations(self.translation_file)
		# Names of attributes; presumably the ones that identify a model
		# configuration -- confirm against the code that reads hashattr.
		self.hashattr = ['space_delimited', 'max_words', 'max_chars', 'trim', 'translations']
Exemplo n.º 6
0
 def loadtrie(self, filename):
     """
     Load a saved trie and take the alphabet from it.

     ``filename`` is resolved with ``FindFile()``, the result is used to
     construct a ``TreeStructure.trie`` stored in ``self.prefixtree``, and
     ``self.alphabet`` is set to the sorted keys of
     ``self.prefixtree.dist('')`` (presumably the symbols that can follow
     the empty prefix -- confirm in TreeStructure).
     """
     # TODO: @@@ more graceful behaviour if not found?
     located = FindFile(filename)
     self.prefixtree = TreeStructure.trie(located)
     root_dist = self.prefixtree.dist('')
     self.alphabet = sorted(root_dist.keys())
	def loadtrie(self, filename):
		"""
		Load a saved trie and take the alphabet from it.

		``filename`` is resolved with ``FindFile()``, the result is used to
		construct a ``TreeStructure.trie`` stored in ``self.prefixtree``, and
		``self.alphabet`` is set to the sorted keys of
		``self.prefixtree.dist('')`` (presumably the symbols that can follow
		the empty prefix -- confirm in TreeStructure).
		"""
		# TODO: @@@ more graceful behaviour if not found?
		located = FindFile(filename)
		self.prefixtree = TreeStructure.trie(located)
		root_dist = self.prefixtree.dist('')
		self.alphabet = sorted(root_dist.keys())