def test_klass():
    """Each item returned by ``Dataset.klass`` must belong to ``Dataset.klasses``."""
    dataset = Dataset()
    # NOTE: only the word list is registered in this test; emojis are not loaded.
    dataset.add(dataset.tm_words())
    registered = dataset.klasses
    found = dataset.klass("xxx good xxx morning xxT")
    # Short-circuits on the first item that is not a registered class.
    assert all(label in registered for label in found)
def test_add():
    """Check how ``Dataset.add`` changes the size of ``Dataset.klasses``.

    A fresh dataset starts with no classes, gains some after the emojis are
    added, and after the word list is added holds at most as many classes as
    the word list length plus the emoji-only class count.
    """
    dataset = Dataset()
    assert len(dataset.klasses) == 0

    dataset.add(dataset.load_emojis())
    emoji_total = len(dataset.klasses)
    assert emoji_total > 0

    word_list = dataset.tm_words()
    dataset.add(word_list)
    # Printed for diagnostics: class count, word-list size, emoji-only count.
    print(len(dataset.klasses), len(word_list), emoji_total)
    assert len(dataset.klasses) <= len(word_list) + emoji_total
def test_remove():
    """After ``Dataset.remove("~good~")`` one class fewer is found in the text."""
    dataset = Dataset()
    # Load and register each source in turn: emojis first, then the word list.
    for loader in (dataset.load_emojis, dataset.tm_words):
        dataset.add(loader())

    before = dataset.klass("xxx good morning xxx asdfa")
    print(before)
    assert len(before) == 2

    dataset.remove("~good~")

    after = dataset.klass("xxx good xxx morning xxx")
    print(after)
    assert len(after) == 1
def test_process():
    """Compare the pieces produced by ``Dataset.process`` with expected strings.

    Two cases are exercised: one with the class ``"~x~"`` and one with the
    emoji obtained from ``convert_emoji('1F600')`` embedded in the text.
    """
    from microtc.emoticons import convert_emoji

    dataset = Dataset()
    # Load and register each source in turn: emojis first, then the word list.
    for loader in (dataset.load_emojis, dataset.tm_words):
        dataset.add(loader())

    pieces = dataset.process("xxx good 9 morning xxx fax x la", "~x~")
    expected = ["~xxx~good~9~morning~xxx~fax~", "~la~", "~la~"]
    # zip stops at the shorter sequence, so only the leading pairs are compared.
    for got, want in zip(pieces, expected):
        print(got, want)
        assert got == want

    text = 'xxx good {} morning xxx fax x la'.format(convert_emoji('1F600'))
    pieces = dataset.process(text, convert_emoji('1F600'))
    print(pieces)
    expected = ["~xxx~good~", "~morning~xxx~fax~x~la~"]
    for got, want in zip(pieces, expected):
        assert got == want