class TamilVUNgram:
    """Unigram, bigram and trigram models over ``tamilvu_dictionary_words.txt``.

    All three models are built when the object is constructed; ``save``
    writes their counts out as text files.
    """

    def __init__(self):
        """Build the three models in order, printing progress to stdout."""
        source = u'tamilvu_dictionary_words.txt'
        self.filename = source

        self.unigram = Unigram(source)
        self.unigram.frequency_model()
        print(u"--- completed Unigram model ---")

        self.bigram = Bigram(source)
        self.bigram.language_model(verbose=False)

        self.trigram = Trigram(source)
        self.trigram.language_model(verbose=False)

        print(u"--- completed Bigram,Trigram model ---")

    def save(self):
        """Write the model counts to text files in the working directory.

        ``tvu_bigram.txt`` is filled from ``self.bigram.letter2`` and
        ``tvu_unigram.txt`` from ``self.unigram.letter``: one
        ``<key> - <count>`` line per non-zero entry, highest count first.
        The trigram model writes ``tvu_trigram.txt`` through its own
        ``save`` method.
        """
        by_count = operator.itemgetter(1)

        with codecs.open("tvu_bigram.txt", "w", "utf-8") as fp:
            # Flatten the nested table into {first + second: count},
            # leaving out pairs whose count is zero.
            pair_counts = {
                first + second: count
                for first, followers in self.bigram.letter2.items()
                for second, count in followers.items()
                if count != 0
            }
            ranked_pairs = sorted(pair_counts.items(), key=by_count, reverse=True)
            for pair, count in ranked_pairs:
                fp.write(u"%s - %d\n" % (pair, count))

        with codecs.open("tvu_unigram.txt", "w", "utf-8") as fp:
            ranked_letters = sorted(
                self.unigram.letter.items(), key=by_count, reverse=True
            )
            for letter, count in ranked_letters:
                if count != 0:
                    fp.write(u"%s - %d\n" % (letter, count))

        self.trigram.save(u'tvu_trigram.txt')
        print(u"SAVED tvu_unigram.txt, tvu_bigram.txt")
示例#2
0
def get_stats():
    """Dump the unigram table for ``out-tamil-words.txt`` to two files.

    The ``letter`` table of the model is pretty-printed to
    ``ta_data_freq.txt``; it is then passed to ``proc_stats`` together
    with the name ``ta_data_freq2.txt``.
    """
    model = Unigram("out-tamil-words.txt")
    model.frequency_model()

    with codecs.open("ta_data_freq.txt", "w", "utf-8") as report:
        pprint.pprint(model.letter, stream=report)

    proc_stats(model.letter, u"ta_data_freq2.txt")
def get_stats():
    """Build a unigram model of ``out-tamil-words.txt`` and write its stats.

    Writes a pretty-printed copy of the model's ``letter`` table to
    ``ta_data_freq.txt`` and then calls ``proc_stats`` with that table
    and the file name ``ta_data_freq2.txt``.
    """
    unigram = Unigram("out-tamil-words.txt")
    unigram.frequency_model()

    with codecs.open("ta_data_freq.txt", "w", "utf-8") as out:
        # Same call pprint.pprint() makes internally with its defaults.
        printer = pprint.PrettyPrinter(stream=out)
        printer.pprint(unigram.letter)

    proc_stats(unigram.letter, u"ta_data_freq2.txt")
示例#4
0
    def __init__(self):
        """Build unigram, bigram and trigram models from the word list.

        Each model is constructed from ``tamilvu_dictionary_words.txt``
        and built immediately; progress messages are printed to stdout.
        """
        self.filename = source = u'tamilvu_dictionary_words.txt'

        self.unigram = unigram = Unigram(source)
        unigram.frequency_model()
        print(u"--- completed Unigram model ---")

        self.bigram = bigram = Bigram(source)
        bigram.language_model(verbose=False)

        self.trigram = trigram = Trigram(source)
        trigram.language_model(verbose=False)

        print(u"--- completed Bigram,Trigram model ---")
 def __init__(self):
     """Create and build the unigram, bigram and trigram models.

     All models read ``tamilvu_dictionary_words.txt``; a progress line
     is printed after the unigram step and after the last model.
     """
     words_path = u'tamilvu_dictionary_words.txt'
     self.filename = words_path

     self.unigram = Unigram(words_path)
     self.unigram.frequency_model()
     print(u"--- completed Unigram model ---")

     # Bigram and trigram go through the same two steps, in that order.
     for attr, model_cls in (("bigram", Bigram), ("trigram", Trigram)):
         model = model_cls(words_path)
         setattr(self, attr, model)
         model.language_model(verbose=False)

     print(u"--- completed Bigram,Trigram model ---")
示例#6
0
class TamilVUNgram:
    """Holds unigram, bigram and trigram models of one word-list file.

    The models are built from ``tamilvu_dictionary_words.txt`` in the
    constructor and can be written to disk with ``save``.
    """

    def __init__(self):
        """Construct and build each model, printing progress as it goes."""
        self.filename = "tamilvu_dictionary_words.txt"

        unigram = Unigram(self.filename)
        self.unigram = unigram
        unigram.frequency_model()
        print("--- completed Unigram model ---")

        bigram = Bigram(self.filename)
        self.bigram = bigram
        bigram.language_model(verbose=False)

        trigram = Trigram(self.filename)
        self.trigram = trigram
        trigram.language_model(verbose=False)

        print("--- completed Bigram,Trigram model ---")

    def save(self):
        """Dump the model counts as text files in the working directory.

        Non-zero entries of ``self.bigram.letter2`` go to
        ``tvu_bigram.txt`` and non-zero entries of ``self.unigram.letter``
        go to ``tvu_unigram.txt``, each as ``<key> - <count>`` lines
        ordered by descending count. ``self.trigram.save`` is asked to
        write ``tvu_trigram.txt``.
        """
        with codecs.open("tvu_bigram.txt", "w", "utf-8") as out:
            # Merge the two-level table into one mapping keyed by the
            # concatenated pair; zero counts are dropped here.
            combined = {}
            for head, tails in self.bigram.letter2.items():
                combined.update(
                    (head + tail, n) for tail, n in tails.items() if n != 0
                )
            ranked = sorted(
                combined.items(), key=operator.itemgetter(1), reverse=True
            )
            for key, n in ranked:
                out.write("%s - %d\n" % (key, n))

        with codecs.open("tvu_unigram.txt", "w", "utf-8") as out:
            ranked = sorted(
                self.unigram.letter.items(),
                key=operator.itemgetter(1),
                reverse=True,
            )
            for key, n in ranked:
                if n != 0:
                    out.write("%s - %d\n" % (key, n))

        self.trigram.save("tvu_trigram.txt")
        print("SAVED tvu_unigram.txt, tvu_bigram.txt")
def run(parent, outputfile):
    """Run one unigram model over every ``*.word`` file in ``parent``.

    A single ``Unigram`` object is created for the first matching file;
    for each later file its ``corpus`` is replaced and
    ``frequency_model()`` is called again on the same object (presumably
    accumulating counts across files -- confirm against ``Unigram``).
    The model is then saved to ``outputfile`` and ``proc_stats`` is
    called with ``get_prob(x.letter)`` and the same ``outputfile``.

    Args:
        parent: directory searched (non-recursively) for ``*.word`` files.
        outputfile: path handed to both ``x.save`` and ``proc_stats``.

    Raises:
        ValueError: if ``parent`` contains no ``*.word`` files.
    """
    x = None
    for filename in glob.glob(os.path.join(parent, "*.word")):
        # Compare against None explicitly so that a model object which
        # happens to be falsy is not rebuilt from scratch.
        if x is None:
            x = Unigram(filename)
        else:
            x.corpus = Corpus(filename)  # point the same model at the next file
        x.frequency_model()

    if x is None:
        # Previously this fell through to ``None.save(...)`` and failed
        # with an uninformative AttributeError.
        raise ValueError("no *.word files found in %r" % (parent,))

    x.save(outputfile)
    # NOTE(review): proc_stats receives the same path that x.save() just
    # wrote -- confirm that overwriting/appending there is intended.
    proc_stats(get_prob(x.letter), outputfile)
    return
示例#8
0
 def __init__(self, filename):
     """Run ``Unigram.__init__`` for ``filename``, then add an empty ``bigram`` dict."""
     Unigram.__init__(self, filename)
     self.bigram = {}
 def __init__(self, filename):
     """Set up the ``Unigram`` base for ``filename`` plus a ``bigram`` table."""
     # Base initialisation runs first; the bigram table starts out empty.
     Unigram.__init__(self, filename)
     self.bigram = {}