def index(text):
    """Run ``train.language`` on lower-cased ``text`` and format the result.

    One ``kenlm.LanguageModel`` is constructed per code in ``train.corp``
    from the path ``lm/<code>.binary``; the resulting models and the
    lower-cased text are handed to ``train.language``.

    Args:
        text: String to classify. It is lower-cased before being scored.

    Returns:
        A string ``"(<name>,<value>)"`` where ``<name>`` is
        ``dict(train.valid)`` looked up with element 0 of the
        ``train.language`` result and ``<value>`` is ``str()`` of element 1.

    Raises:
        KeyError: If element 0 of the result is not a key of
            ``dict(train.valid)``.
    """
    # NOTE(review): a second, equivalent ``index`` definition follows this
    # one in the file; at module level the later definition stays bound.
    # NOTE(review): the models are rebuilt on every call, whereas ``hello``
    # reads ``train.models`` -- presumably the same models; confirm before
    # sharing them.
    # A list (not a lazy ``map`` object) so the models can be iterated more
    # than once on Python 3 as well as Python 2.
    models = [
        kenlm.LanguageModel('lm/' + code + ".binary") for code in train.corp
    ]
    langs = dict(train.valid)
    results = train.language(models, text.lower())
    return "(" + langs[results[0]] + "," + str(results[1]) + ")"
def index(text):
    """Run ``train.language`` on lower-cased ``text`` and format the result.

    One ``kenlm.LanguageModel`` is constructed per code in ``train.corp``
    from the path ``lm/<code>.binary``; the resulting models and the
    lower-cased text are handed to ``train.language``.

    Args:
        text: String to classify. It is lower-cased before being scored.

    Returns:
        A string ``"(<name>,<value>)"`` where ``<name>`` is
        ``dict(train.valid)`` looked up with element 0 of the
        ``train.language`` result and ``<value>`` is ``str()`` of element 1.

    Raises:
        KeyError: If element 0 of the result is not a key of
            ``dict(train.valid)``.
    """
    # NOTE(review): an equivalent ``index`` definition also appears right
    # before this one in the file; at module level this later one stays bound.
    # NOTE(review): the models are rebuilt on every call, whereas ``hello``
    # reads ``train.models`` -- presumably the same models; confirm before
    # sharing them.
    # A list (not a lazy ``map`` object) so the models can be iterated more
    # than once on Python 3 as well as Python 2.
    models = [
        kenlm.LanguageModel('lm/' + code + ".binary") for code in train.corp
    ]
    langs = dict(train.valid)
    results = train.language(models, text.lower())
    return "(" + langs[results[0]] + "," + str(results[1]) + ")"
def test():
    """Count right/wrong ``train.language`` predictions on random word runs.

    For every code ``c`` in the module-level ``corp``, the file
    ``testcorpus/<c>`` is read as UTF-8 and split on whitespace.  1000
    distinct start positions are sampled; from each one a run of 1 to 24
    consecutive tokens is joined with single spaces and passed, together
    with the module-level ``models``, to ``train.language``.  The prediction
    (element 0 of the result) counts as right when it equals ``c``.

    Returns:
        dict mapping each corpus code to a ``(right, wrong)`` tuple.

    Raises:
        ValueError: From ``random.sample`` when a corpus has fewer than
            1024 tokens (fewer than 1000 candidate start positions).
    """
    # NOTE(review): a second, equivalent ``test`` definition follows this
    # one in the file; at module level the later definition stays bound.
    counts = {}
    for c in corp:
        right = 0
        wrong = 0
        # Close the corpus file deterministically instead of leaking the
        # handle until garbage collection.
        with io.open('testcorpus/' + c, encoding='utf-8') as corpus_file:
            raw = corpus_file.read()
        # Original author's caveat: because Chinese is logographic,
        # tokenizing by space is inappropriate for it.
        tokens = raw.split()
        # Start positions stop 24 tokens short of the end so the longest
        # window still fits inside the token list.
        for start in random.sample(range(1, len(tokens) - 23), 1000):
            length = random.randint(1, 24)
            randogram = tokens[start:start + length]
            ans = train.language(models, ' '.join(randogram))[0]
            if ans != c:
                wrong += 1
            else:
                right += 1
        counts[c] = (right, wrong)
    return counts
def test():
    """Count right/wrong ``train.language`` predictions on random word runs.

    For every code ``c`` in the module-level ``corp``, the file
    ``testcorpus/<c>`` is read as UTF-8 and split on whitespace.  1000
    distinct start positions are sampled; from each one a run of 1 to 24
    consecutive tokens is joined with single spaces and passed, together
    with the module-level ``models``, to ``train.language``.  The prediction
    (element 0 of the result) counts as right when it equals ``c``.

    Returns:
        dict mapping each corpus code to a ``(right, wrong)`` tuple.

    Raises:
        ValueError: From ``random.sample`` when a corpus has fewer than
            1024 tokens (fewer than 1000 candidate start positions).
    """
    # NOTE(review): an equivalent ``test`` definition also appears right
    # before this one in the file; at module level this later one stays bound.
    counts = {}
    for c in corp:
        right = 0
        wrong = 0
        # Close the corpus file deterministically instead of leaking the
        # handle until garbage collection.
        with io.open('testcorpus/' + c, encoding='utf-8') as corpus_file:
            raw = corpus_file.read()
        # Original author's caveat: because Chinese is logographic,
        # tokenizing by space is inappropriate for it.
        tokens = raw.split()
        # Start positions stop 24 tokens short of the end so the longest
        # window still fits inside the token list.
        for start in random.sample(range(1, len(tokens) - 23), 1000):
            length = random.randint(1, 24)
            randogram = tokens[start:start + length]
            ans = train.language(models, ' '.join(randogram))[0]
            if ans != c:
                wrong += 1
            else:
                right += 1
        counts[c] = (right, wrong)
    return counts
def hello(text):
    """Run ``train.language`` on ``text`` and format its first two elements.

    The models held in ``train.models`` are passed along with ``text``.

    Returns:
        The string ``"(<first>, <second>)"`` where ``<first>`` is element 0
        of the result (concatenated as-is) and ``<second>`` is ``str()`` of
        element 1.
    """
    # NOTE(review): a second, equivalent ``hello`` definition follows this
    # one in the file; at module level the later definition stays bound.
    outcome = train.language(train.models, text)
    opening = "(" + outcome[0]
    closing = str(outcome[1]) + ")"
    return opening + ", " + closing
def hello(text):
    """Format the ``train.language`` result for ``text`` as a pair string.

    ``train.language`` is called with ``train.models`` and ``text``; element
    0 of what it returns is concatenated directly and element 1 is converted
    with ``str()``, giving ``"(<first>, <second>)"``.
    """
    # NOTE(review): an equivalent ``hello`` definition also appears right
    # before this one in the file; at module level this later one stays bound.
    guess = train.language(train.models, text)
    first = guess[0]
    second = str(guess[1])
    return "(" + first + ", " + second + ")"