class TestLangID(unittest.TestCase):
    """Unit tests for LangID, run against one model per ``unk`` setting."""

    def setUp(self):
        # lm1 is built with unk=False, lm2 with unk=True; both are trained
        # with the default arguments before every test.
        self.lm1 = LangID(unk=False)
        self.lm2 = LangID(unk=True)
        for classifier in (self.lm1, self.lm2):
            classifier.train()

    def test_lang_set(self):
        # Compare order-independently: both sides are sorted first.
        actual = sorted(self.lm1.get_lang_set())
        self.assertEqual(actual, sorted(['en', 'es', 'ar', 'pt']))

    def test_classify_en(self):
        self.assertEqual(self.lm1.classify(u'hello world'), 'en')

    def test_classify_es(self):
        self.assertEqual(self.lm1.classify(u'hola mis amigos'), 'es')

    def test_classify_unk(self):
        # The same mixed-language sentence is expected to come back as 'en'
        # from the unk=False model and as 'unk' from the unk=True model.
        mixed = u'this is la fiesta del mundo'
        self.assertEqual(self.lm1.classify(mixed), 'en')
        self.assertEqual(self.lm2.classify(mixed), 'unk')
class GetLanguageDyslBolt(BasicBolt):
    """Bolt that tags each incoming tuple with the language of its text."""

    def __init__(self, *args, **kwargs):
        # The positional/keyword arguments are accepted but unused, and the
        # BasicBolt initializer is deliberately not invoked here.
        classifier = LangID()
        classifier.train()
        # Trained once per bolt instance so process() can reuse it.
        self.l = classifier

    def process(self, tup):
        """Classify ``tup.values[1]`` and emit ``[tup.values[0], language]``."""
        detected = self.l.classify(tup.values[1])
        storm.emit([tup.values[0], detected])
def setUp(self):
    """Create two LangID models (unk=False / unk=True) and train both."""
    self.lm1 = LangID(unk=False)
    self.lm2 = LangID(unk=True)
    for classifier in (self.lm1, self.lm2):
        classifier.train()
def __init__(self, *args, **kwargs):
    """Build and train the LangID instance stored on ``self.l``.

    ``*args`` and ``**kwargs`` are accepted but not used, and the base-class
    initializer is not called.
    """
    classifier = LangID()
    classifier.train()
    self.l = classifier
def main():
    """Command-line entry point.

    Parses the command line and performs the first matching action:

    * ``--version``: print the package version and exit with status 0.
    * ``--list-langs``: train a model and print its language set.
    * ``--lang LANG`` plus input text: add the text as a training sample for
      LANG, call ``save_training_samples()`` and exit with status 0.
    * input text only: train a model, classify the text and print the result.
    * anything else: print the help text and exit with a non-zero status.

    ``--corpus`` is forwarded to ``LangID.train(root=...)`` and ``--unk`` is
    forwarded to ``LangID(unk=...)`` as a boolean.
    """
    parser = ArgumentParser(description='Do you speak London? A library for Natural Language Identification.')
    parser.add_argument('--version', action='store_true',
                        help='Show version')
    parser.add_argument('--list-langs', action='store_true',
                        help='List supported languages in training data')
    # The original help string here was a copy of the positional argument's.
    parser.add_argument('--unk', choices=['y', 'n'], default='n',
                        help='Whether to allow "unk" (unknown) as a classification result (default: n)')
    parser.add_argument('--corpus', default='',
                        help='Specify path to custom training-set')
    parser.add_argument('--lang',
                        help='Add training sample for the language specified')
    parser.add_argument('input', nargs='*',
                        help='Input text to classify')
    args = parser.parse_args()

    unk = args.unk == 'y'
    input_text = decode_input(args.input)

    if args.version:
        # sys.exit(<str>) would write to stderr and exit with status 1;
        # showing the version is a successful outcome.
        print(__version__)
        sys.exit(0)

    if not (args.list_langs or input_text):
        # Nothing to do (this also covers --lang given without any text).
        parser.print_help()
        sys.exit('\n')

    # Every remaining action needs a trained model, so build it once.
    l = LangID(unk=unk)
    l.train(root=args.corpus)

    if args.list_langs:
        print('Languages: [' + '-'.join(l.get_lang_set()) + ']')
        sys.exit()

    if args.lang:
        l.add_training_sample(text=input_text, lang=args.lang)
        l.save_training_samples()
        # Report success on stdout with exit status 0 rather than through
        # sys.exit(<str>), which signals failure to the calling shell.
        print('Training Sample for "%s" added successfully.\n' % args.lang)
        sys.exit(0)

    lang = l.classify(input_text)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Input text: %s' % (input_text,))
    print('Language: %s' % (lang,))