def test_remove():
    """Exercise ``Dataset.remove`` through the labels ``klass`` reports.

    With the emoji and tm_words mappings added, the first text must yield
    two labels; after ``remove("~good~")`` the second text must yield one.
    """
    dataset = Dataset()
    # Call each loader and register its result immediately, in this order.
    for loader in (dataset.load_emojis, dataset.tm_words):
        dataset.add(loader())

    before = dataset.klass("xxx good morning xxx asdfa")
    print(before)
    assert len(before) == 2

    dataset.remove("~good~")

    after = dataset.klass("xxx good xxx morning xxx")
    print(after)
    assert len(after) == 1
def test_klass():
    """Every label returned by ``klass`` must be contained in ``klasses``."""
    dataset = Dataset()
    # Only the tm_words mapping is registered; emojis are not loaded here.
    dataset.add(dataset.tm_words())
    known = dataset.klasses
    labels = dataset.klass("xxx good xxx morning xxT")
    assert all(label in known for label in labels)
def emo_data(lang='zh'):
    """Label the tweets stored under ``data/<lang>/*.gz`` with emoji classes.

    Each matching file is read with ``load_model`` and iterated as a mapping
    from a key to a collection of tweets, where every tweet is indexed by
    ``'text'``.  The text is classified with a ``Dataset`` that only has the
    emoji mapping loaded; tweets that receive at least one label get that
    label stored under ``'klass'`` (the tweet object is modified in place),
    while tweets without labels are dropped.

    Keys left without tweets are omitted, and files left without keys are
    skipped.  Otherwise the result is written with ``save_model`` to an
    ``emo`` directory placed next to the input file, keeping the input's
    base name.

    :param lang: Name of the sub-directory of ``data`` to process.
    :type lang: str
    """
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    for fname in glob(join('data', lang, '*.gz')):
        output = {}
        for key, tweets in load_model(fname).items():
            inner = []
            # Single pass: classify and filter each tweet as it is visited,
            # so the tweets never need to be traversed twice.
            for tweet in tweets:
                label = ds.klass(tweet['text'])
                # len() is used on purpose: the concrete container type
                # returned by klass is not visible from this function.
                if len(label) == 0:
                    continue
                tweet['klass'] = label
                inner.append(tweet)
            if inner:
                output[key] = inner
        if not output:
            continue
        output_dir = join(dirname(fname), 'emo')
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(output_dir, exist_ok=True)
        save_model(output, join(output_dir, basename(fname)))
def test_map():
    """A word added with a plain mapping is reported as its mapped label."""
    dataset = Dataset()
    dataset.add({"buenos": "malos"})
    labels = dataset.klass("en estos buenos dias")
    print(labels)
    assert "malos" in labels