示例#1
0
 def __init__(self):
     """Set up the n-gram model, vocabulary, confusion matrix and dictionary.

     The vocabulary is the sorted, de-duplicated word list of the n-gram
     model with its first 3246 entries dropped.
     """
     self.ng = nGram(True, True, False, False, False)
     # De-duplicate and order the model's words before slicing off the head.
     vocabulary = sorted(set(self.ng.words))
     self.words = vocabulary[3246:]
     # Called for its effect on the instance; any return value is discarded.
     self.loadConfusionMatrix()
     self.dict = self.loadDict()
示例#2
0
 def __init__(self):
     """Set up the n-gram model, vocabulary, confusion matrix and dictionary.

     The vocabulary is the sorted, de-duplicated word list of the n-gram
     model with its first 3246 entries dropped.
     """
     self.ng = nGram(True, True, False, False, False)
     # De-duplicate and order the model's words before slicing off the head.
     vocabulary = sorted(set(self.ng.words))
     self.words = vocabulary[3246:]
     # Called for its effect on the instance; any return value is discarded.
     self.loadConfusionMatrix()
     self.dict = self.loadDict()
示例#3
0
 def __init__(self):
     """Load the 3-gram table and create the pinyin and n-gram helpers."""
     # The CSV path is relative, so it resolves against the working directory.
     self.grams = pd.read_csv('3gram.csv')
     self.py = pinyin()
     self.ng = nGram()
     # Punctuation marks; presumably used as segment delimiters -- confirm
     # against the callers that read self.segment.
     self.segment = [
         ',',
         '。',
         '?',
         '!',
         ':',
         ';',
         '……',
         '【',
         '】',
         '(',
         ')',
         '“',
         '”',
         '《',
         '》',
         '、',
     ]
     print('init over')
示例#4
0
            for i, key in enumerate(self.dic.keys()):
                f_csv.writerow({
                    'one': key[0],
                    'two': key[1],
                    'three': key[2],
                    'num': self.dic[key]
                })
                bar.bar(i, length, "Preprocessed ")
        print("\nfinish write: " + self.path)


if __name__ == '__main__':
    # Script entry point: feed the n-grams of every whitespace-separated token
    # found in the selected raw-data files into the generator, then save it.
    # These imports stay at module scope under the guard.
    from ngram import nGram
    from visualization import Progress_bar

    ng = nGram()
    ngg = NGramGenerator(3)

    file_dir = 'raw_data'

    # Only the last ten entries, in whatever order os.listdir returns them,
    # are processed.
    for filename in os.listdir(file_dir)[-10:]:
        progress = Progress_bar()
        source_path = os.path.join(file_dir, filename)
        with open(source_path, 'r', encoding='utf-8') as handle:
            tokens = handle.read().split()
            total = len(tokens)
            label = "Preprocessed " + filename
            for position, token in enumerate(tokens):
                ngg.generate(ng.ngram(token))
                progress.bar(position, total, label)
        print("\nfinish index: " + filename)
    ngg.save()
import ngram
import sys
import pickle


# Command-line script: build an ngram.nGram model from N and save it to
# ModuleFilename.
# Usage: <script> ModuleFilename N
if len(sys.argv) < 3:
    print("Error: Expected arguments: ModuleFilename,  N")
    # Exit with a non-zero status so a calling shell/script can detect the
    # usage error (a bare sys.exit() reports success).
    sys.exit(1)

moduleFilename = sys.argv[1]
n = int(sys.argv[2])  # raises ValueError when N is not an integer string

model = ngram.nGram(n)

# Create/truncate the output file before saving, closing the handle
# immediately instead of leaving an unreferenced open file object behind.
with open(moduleFilename, 'w'):
    pass

model.printInfo()

ngram.saveObject(model, moduleFilename)

示例#6
0
文件: tests.py 项目: thientu/ngram
import unittest
from ngram import nGram
# Module-level model instance used by the test methods below; it is built
# once, when this module is imported.
ng = nGram(n=5, corpus_file=None, cache=False)


class TestNgram(unittest.TestCase):
    def test_uni_log(self):
        # n=1, log form: compare against the stored reference value.
        log_prob = ng.sentence_probability(
            sentence='hold your horses', n=1, form='log')
        self.assertAlmostEqual(log_prob, -24.9337710989)

    def test_uni_antilog(self):
        # n=1, antilog form: compare against the stored reference value.
        plain_prob = ng.sentence_probability(
            sentence='hold your horses', n=1, form='antilog')
        self.assertAlmostEqual(plain_prob, 1.48388689281e-11)

    def test_bi_log(self):
        # n=2, log form: compare against the stored reference value.
        log_prob = ng.sentence_probability(
            sentence='hold your horses', n=2, form='log')
        self.assertAlmostEqual(log_prob, -18.655540764)

    def test_bi_antilog(self):
        # n=2, antilog form: compare against the stored reference value.
        plain_prob = ng.sentence_probability(
            sentence='hold your horses', n=2, form='antilog')
        self.assertAlmostEqual(plain_prob, 7.90681521418e-09)

    def test_tri_log(self):