def main(self):
    """Predict the classes (or decision functions) of a test set.

    Loads the model given on the command line, transforms the test set
    with the model's text model, and writes one JSON document per line
    to the output file (gzip-compressed when the name ends in ``.gz``).
    """
    self.data = self.parser.parse_args()
    svc = load_model(self.data.model)
    X = [svc.model[x] for x in read_data(self.data.test_set)]
    output = self.get_output()
    # gzip streams require bytes; plain text files require str
    gzip_flag = output.endswith('.gz')
    if gzip_flag:
        output = gzip.open(output, 'wb')
    else:
        output = open(output, 'w')

    def write(fpt, tweet):
        # Serialize one tweet as a JSON line, encoding for gzip output.
        # (This was duplicated in both branches of the original.)
        cdn = json.dumps(tweet) + "\n"
        fpt.write(bytes(cdn, encoding='utf-8') if gzip_flag else cdn)

    with output as fpt:
        if not self.data.decision_function:
            hy = svc.predict(X)
            for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                tweet['klass'] = str(klass)
                write(fpt, tweet)
        else:
            hy = svc.decision_function(X)
            for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                try:
                    # numpy arrays are not JSON serializable
                    o = klass.tolist()
                except AttributeError:
                    o = klass
                tweet['decision_function'] = o
                write(fpt, tweet)
def output(self):
    """Print the mean score of every group of predictions, appending a
    star when the difference is statistically significant."""
    gold = [x[self._klass] for x in tweet_iterator(self.data.output)]
    le = base.LabelEncoderWrapper().fit(gold)
    perf = getattr(self, "_%s" % self.data.score)
    y = le.transform(gold)
    scores = []
    group = []
    for fname in self.data.predictions:
        # '-' acts as a separator between groups of prediction files
        if fname == '-':
            scores.append(group)
            group = []
            continue
        hy = le.transform([x[self._klass] for x in tweet_iterator(fname)])
        group.append(perf(y, hy))
    if group:
        scores.append(group)
    scores = np.array(scores).T
    p, alpha = compute_p(scores)
    self._p = p
    self._alpha = alpha
    for _p, _alpha, mu in zip(p, alpha, scores.mean(axis=0)):
        marker = " *" if np.isfinite(_alpha) else ''
        print("%0.4f" % mu, marker)
def main(self):
    """Fit an EvoMSA model on the first training set and save it."""
    fnames = self.data.training_set
    fname = fnames[0]
    pairs = [[x, x[self._klass]] for x in tweet_iterator(fname)]
    D = [a for a, _ in pairs]
    Y = [b for _, b in pairs]
    test_set = None
    if self.data.test_set is not None:
        if os.path.isfile(self.data.test_set):
            test_set = list(tweet_iterator(self.data.test_set))
        else:
            test_set = self.data.test_set
    kwargs = dict(n_jobs=self.data.n_jobs)
    if self.data.kwargs is not None:
        kwargs.update(json.loads(self.data.kwargs))
    evo_kwargs = dict()
    # The default stacked method (EvoDAGE) needs a working directory
    if kwargs.get("stacked_method",
                  "EvoDAG.model.EvoDAGE") == "EvoDAG.model.EvoDAGE":
        evo_kwargs = dict(tmpdir=self.data.output_file + '_dir')
    if "stacked_method_args" in kwargs:
        evo_kwargs.update(kwargs["stacked_method_args"])
        del kwargs["stacked_method_args"]
    evo = base.EvoMSA(stacked_method_args=evo_kwargs, **kwargs)
    evo.fit(D, Y, test_set=test_set)
    save_model(evo, self.data.output_file)
def transform(self):
    """Transform the training set with the stored model and write the
    resulting vectors (key ``vec``) one JSON document per line."""
    predict_file = self.data.training_set[0]
    data = [x for x in tweet_iterator(predict_file)]
    evo = self.load_model(self.data.model)
    evo.exogenous = self._exogenous
    vectors = evo.transform(data)
    with open(self.data.output_file, 'w') as fpt:
        for tweet, vec in zip(tweet_iterator(predict_file), vectors):
            tweet.update(dict(vec=vec.tolist()))
            fpt.write(json.dumps(tweet) + '\n')
def main(self):
    """Entry point: dispatch to --transform / --fitness, or compute the
    b4msa decision-function representation of the training (and test)
    sets, writing one JSON document per line with the vector under
    ``vec``."""
    if self.data.transform:
        return self.transform()
    elif self.data.fitness:
        return self.fitness()
    if not self.data.b4msa_df:
        return
    fnames = self.data.training_set
    if not isinstance(fnames, list):
        fnames = [fnames]
    D = []
    Y = []
    for fname in fnames:
        _ = [[x, x[self._klass]] for x in tweet_iterator(fname)]
        D.append([x[0] for x in _])
        Y.append([x[1] for x in _])
    self._logger.info('Reading test_set %s' % self.data.test_set)
    if self.data.test_set is not None:
        test_set = [x for x in tweet_iterator(self.data.test_set)]
    else:
        test_set = None
    kwargs = dict(n_jobs=self.data.n_jobs)
    if self.data.kwargs is not None:
        kwargs.update(json.loads(self.data.kwargs))
    b4msa_kwargs = {}
    if self.data.b4msa_kwargs is not None:
        b4msa_kwargs.update(json.loads(self.data.b4msa_kwargs))
    evo = base.EvoMSA(b4msa_args=b4msa_kwargs, **kwargs)
    evo.fit_svm(D, Y)

    def store(source, hy, output):
        # Attach the transformed vector to each tweet and write it as a
        # JSON line.  (This loop appeared three times in the original.)
        with open(output, 'w') as fpt:
            for x, v in zip(tweet_iterator(source), hy):
                x.update(dict(vec=v.tolist()))
                fpt.write(json.dumps(x) + '\n')

    output = self.data.output_file
    if self.data.test_set is None:
        store(fnames[0], evo.transform(D[0]), output)
    else:
        # With a test set, write train.json and test.json in a directory
        if not os.path.isdir(output):
            os.mkdir(output)
        store(fnames[0], evo.transform(D[0]),
              os.path.join(output, 'train.json'))
        store(self.data.test_set, evo.transform(test_set),
              os.path.join(output, 'test.json'))
def main(self, args=None):
    """Compute K-fold decision functions over the training sets and
    write them (key ``decision_function``) one JSON document per line."""
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    if self.data.conf:
        best = json.loads(self.data.conf)
    else:
        best = load_json(self.data.params_fname)[0]
    best = clean_params(best)
    corpus, labels = [], []
    for train in self.data.training_set:
        X_, y_ = read_data_labels(train)
        corpus.extend([x for x in tweet_iterator(train)])
        labels.extend(y_)
    le = LabelEncoder()
    if self.data.labels:
        le.fit(self.data.labels.split(','))
    else:
        le.fit(labels)
    y = le.transform(labels)
    model_klasses = os.environ.get('TEXTMODEL_KLASSES')
    if model_klasses:
        # Build the text model only from the selected classes
        model_klasses = le.transform(model_klasses.split(','))
        docs_ = [doc for doc, klass in zip(corpus, y)
                 if klass in model_klasses]
        t = TextModel(docs_, **best)
    else:
        t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    hy = [None for _ in y]
    folds = KFold(n_splits=self.data.kratio, shuffle=True,
                  random_state=self.data.seed)
    for tr, ts in folds.split(X):
        c = ClassifierWrapper()
        c.fit([X[i] for i in tr], [y[i] for i in tr])
        df = c.decision_function([X[i] for i in ts])
        for k, v in zip(ts, df):
            hy[k] = v
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for train in self.data.training_set:
            for tweet in tweet_iterator(train):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_bernoulli():
    """Bernoulli decision functions stay within [-1, 1]."""
    import numpy as np
    from EvoMSA.model import Corpus, Bernoulli
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.transform([x['text'] for x in tweet_iterator(TWEETS)])
    labels = [x['klass'] for x in tweet_iterator(TWEETS)]
    y = LabelEncoder().fit(labels).transform(labels)
    model = Bernoulli()
    model.fit(X, y)
    pr = model.decision_function(X)
    assert pr.shape[0] == 1000 and pr.shape[1] == 4
    assert np.all((pr <= 1) & (pr >= -1))
def test_multinomial():
    """Multinomial decision functions stay within [-1, 1]."""
    import numpy as np
    from EvoMSA.model import Corpus, Multinomial
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.tonp([c[x['text']] for x in tweet_iterator(TWEETS)])
    labels = [x['klass'] for x in tweet_iterator(TWEETS)]
    y = LabelEncoder().fit(labels).transform(labels)
    model = Multinomial()
    model.fit(X, y)
    pr = model.decision_function(X)
    print(pr.shape[0], pr, model.num_terms)
    assert pr.shape[0] == 1000 and pr.shape[1] == 4
    assert np.all((pr <= 1) & (pr >= -1))
def test_predict():
    """End-to-end train + predict through the command line."""
    import numpy as np
    sys.argv = ['EvoMSA',
                '--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, '
                '"time_limit": 5, "n_estimators": 5}',
                '-ot.model', '-n2', TWEETS, TWEETS]
    train(output=True)
    sys.argv = ['EvoMSA', '-mt.model', '-ot1.json', TWEETS]
    predict()
    hy = np.array([x['klass'] for x in tweet_iterator('t1.json')])
    # The output must also carry the decision functions
    [x['decision_function'] for x in tweet_iterator('t1.json')]
    y = np.array([x['klass'] for x in tweet_iterator(TWEETS)])
    acc = (y == hy).mean()
    print(acc)
    assert 0.8 < acc <= 1
    os.unlink('t1.json')
    os.unlink('t.model')
def test_TokenCount_process():
    """Bigram counting over the sample tweets."""
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    bigram_count = TokenCount.bigrams()
    bigram_count.process(tweet_iterator(TWEETS))
    print(bigram_count.counter.most_common(10))
    assert bigram_count.counter["in~the"] == 313
def tokens(self, corpus):
    """Tokens used for modeling"""
    fname = os.path.join(ConPATH, 'data', 'en.affective.words.json')
    words = []
    for entry in tweet_iterator(fname):
        words.extend(entry['words'])
    return words
def test_tweet_iterator():
    """tweet_iterator reads gzip files transparently."""
    import os
    import gzip
    from microtc.utils import tweet_iterator
    fname = os.path.dirname(__file__) + '/text.json'
    plain = list(tweet_iterator(fname))
    fname_gz = fname + '.gz'
    with open(fname, 'r') as src, gzip.open(fname_gz, 'w') as dst:
        dst.write(src.read().encode('ascii'))
    gzipped = list(tweet_iterator(fname_gz))
    assert len(plain) == len(gzipped)
    for a0, b0 in zip(plain, gzipped):
        assert a0['text'] == b0['text']
    os.unlink(fname_gz)
def __init__(self, *args, **kwargs):
    """Initialize with the Spanish aggressiveness word lists."""
    fname = os.path.join(os.path.dirname(__file__), 'conf',
                         'aggressiveness.es')
    corpus = []
    for entry in tweet_iterator(fname):
        corpus.extend(entry['words'])
    super(AggressivenessEs, self).__init__(corpus)
def test_utils_b4msa_df():
    """b4msa-df utils: single-file output vs. directory output agree."""
    from EvoMSA.command_line import utils
    import shutil
    sys.argv = ['EvoMSA', '--kw={"seed": 1}', '-omodel.json',
                '--b4msa-df', TWEETS]
    utils(output=True)
    assert os.path.isfile('model.json')
    sys.argv = ['EvoMSA', '-omodel', '--b4msa-df', '--test_set',
                TWEETS, TWEETS]
    utils(output=True)
    assert os.path.isdir('model')
    train_json = os.path.join('model', 'train.json')
    for a, b in zip(tweet_iterator('model.json'),
                    tweet_iterator(train_json)):
        for v, w in zip(a['vec'], b['vec']):
            print(v, w)
            assert_almost_equals(v, w, places=3)
    shutil.rmtree('model')
    os.unlink('model.json')
def test_numeric_klass():
    """The pipeline accepts integer class labels."""
    from microtc.utils import tweet_iterator
    from microtc.command_line import params, train, predict
    from sklearn.preprocessing import LabelEncoder
    import os
    import json
    import tempfile
    import sys
    numeric = tempfile.mktemp() + '.json'
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    D = list(tweet_iterator(fname))
    klasses = [x['klass'] for x in D]
    y = LabelEncoder().fit(klasses).transform(klasses)
    for tweet, klass in zip(D, y):
        tweet['klass'] = int(klass)
    with open(numeric, 'w') as fpt:
        for tweet in D:
            fpt.write(json.dumps(tweet) + '\n')
    sys.argv = ['microtc', '-o', output, '-k', '2', numeric, '-s', '2']
    params()
    sys.argv = ['microtc', '-m', output, numeric, '-o', output]
    train()
    output2 = tempfile.mktemp()
    sys.argv = ['microtc', '-m', output, fname, '-o', output2]
    predict()
    os.unlink(numeric)
    os.unlink(output)
    os.unlink(output2)
def test_tfidf_corpus():
    """TFIDF built from docs and from a Counter give the same weights."""
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    # BUG FIX: the original checked tfidf2's vocabulary against itself
    # (trivially true); the intent is that both share the same vocabulary.
    for k in tfidf2.word2id.keys():
        assert k in tfidf.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
def fit_from_file(cls, fname, textModel_params=None):
    """Build a text model from *fname* and fit the classifier on it.

    :param fname: Path to a file with one JSON tweet per line
    :param textModel_params: Optional keyword arguments for TextModel
    :return: The fitted classifier instance
    """
    # BUG FIX: avoid a mutable default argument ({}); use None sentinel.
    if textModel_params is None:
        textModel_params = {}
    D = [x for x in tweet_iterator(fname)]
    y = [x['klass'] for x in D]
    model = TextModel(D, **textModel_params)
    svc = cls(model)
    return svc.fit([model[x] for x in D], y)
def main(self, args=None):
    """K-fold decision functions over a single training set using SVC."""
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    best = load_json(self.data.params_fname)
    if isinstance(best, list):
        best = best[0]
    best = clean_params(best)
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    hy = [None for _ in y]
    folds = KFold(n_splits=self.data.kratio, shuffle=True,
                  random_state=self.data.seed)
    for tr, ts in folds.split(X):
        c = SVC(model=t)
        c.fit([X[i] for i in tr], [y[i] for i in tr])
        df = c.decision_function([X[i] for i in ts])
        for k, v in zip(ts, df):
            hy[k] = v
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for tweet in tweet_iterator(self.data.training_set):
            tweet['decision_function'] = hy[i].tolist()
            i += 1
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_semantic_token():
    """SemanticTokenEs backed by a test emotion space."""
    from EvoMSA.model import SemanticTokenEs, EmoSpaceEs

    class EmoTest(EmoSpaceEs):
        @staticmethod
        def model_fname():
            return 'test.evoemo'

    class STest(SemanticTokenEs):
        @property
        def semantic_space(self):
            """Semantic space

            :rtype: instance
            """
            try:
                return self._semantic_space
            except AttributeError:
                self._semantic_space = EmoTest()
                return self._semantic_space

    corpus = list(tweet_iterator(TWEETS))
    semantic = STest(corpus)
    print(semantic._weight.shape[0])
    assert semantic._weight.shape[0] == 999
    tr = semantic.transform([dict(text='buenos dias')])[0]
    print(tr)
    assert len(tr) == 3
    print([semantic.id2token[x[0]] for x in tr])
def test_decision_function_gzip():
    """Decision functions can be written to a gzip output file."""
    from b4msa.command_line import params, train, test
    from microtc.utils import tweet_iterator
    import os
    import sys
    import tempfile
    model_file = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    sys.argv = ['b4msa', '-H', '-lspanish', '-o', model_file, '-k', '2',
                fname, '-s', '2', '-n0']
    params()
    sys.argv = ['b4msa', '-m', model_file, fname, '-o', model_file]
    train()
    predictions = tempfile.mktemp() + '.gz'
    sys.argv = ['b4msa', '-m', model_file, fname, '-o', predictions,
                '--decision-function']
    test()
    d = list(tweet_iterator(predictions))
    os.unlink(model_file)
    os.unlink(predictions)
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])
def test_semantic_affective_es():
    """SemanticAffectiveEs backed by a test emotion space."""
    from EvoMSA.model import SemanticAffectiveEs, EmoSpaceEs

    class EmoTest(EmoSpaceEs):
        @staticmethod
        def model_fname():
            return 'test.evoemo'

    class STest(SemanticAffectiveEs):
        @property
        def semantic_space(self):
            """Semantic space

            :rtype: instance
            """
            try:
                return self._semantic_space
            except AttributeError:
                self._semantic_space = EmoTest()
                return self._semantic_space

    corpus = list(tweet_iterator(TWEETS))
    semantic = STest(corpus)
    tokens = semantic.tokens(None)
    assert tokens
    print(semantic._weight.shape[0])
    assert semantic._weight.shape[0] == 1386
def test_numeric_klass():
    """Integer class labels with an explicit dist_vector parameter."""
    from microtc.utils import tweet_iterator
    from microtc.params import DefaultParams, Fixed
    from microtc.command_line import params, train, predict
    from sklearn.preprocessing import LabelEncoder
    import os
    import json
    import tempfile
    import sys
    numeric = tempfile.mktemp() + '.json'
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    D = list(tweet_iterator(fname))
    klasses = [x['klass'] for x in D]
    y = LabelEncoder().fit(klasses).transform(klasses)
    for tweet, klass in zip(D, y):
        tweet['klass'] = int(klass)
    with open(numeric, 'w') as fpt:
        for tweet in D:
            fpt.write(json.dumps(tweet) + '\n')
    P = DefaultParams.copy()
    P["dist_vector"] = Fixed("entropy+0+1")
    params('-o', output, '-k', '2', numeric, '-s', '2', **P)
    train('-m', output, numeric, '-o', output)
    output2 = tempfile.mktemp()
    predict('-m', output, fname, '-o', output2)
    os.unlink(numeric)
    os.unlink(output)
    os.unlink(output2)
def test_textmodel_weighting_key():
    """TextModel accepts every weighting scheme by name."""
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus = list(tweet_iterator(fname))
    for weighting in ['tfidf', 'tf', 'entropy']:
        TextModel(token_list=[-2, -1], weighting=weighting).fit(corpus)
def test_OutputClassifier():
    """OutputClassifier writes train/test CSV files as a side effect."""
    from EvoMSA.model import Corpus, OutputClassifier
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.transform([x['text'] for x in tweet_iterator(TWEETS)])
    y = [x['klass'] for x in tweet_iterator(TWEETS)]
    le = LabelEncoder().fit(y)
    y = le.transform(y)
    b = OutputClassifier(output='xx')
    assert b._output == 'xx'
    b.fit(X, y)
    assert os.path.isfile('xx_train.csv')
    pr = b.decision_function(X)
    assert os.path.isfile('xx_test.csv')
    # BUG FIX: close the file handle; the original leaked it via
    # open('xx_test.csv').readlines()
    with open('xx_test.csv') as fpt:
        assert len(fpt.readlines()) == pr.shape[0]
    os.unlink('xx_train.csv')
    os.unlink('xx_test.csv')
def test_TokenCount_clean():
    """clean() removes entries from the co-occurrence counter."""
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    token_count = TokenCount.single_co_ocurrence()
    token_count.process(tweet_iterator(TWEETS))
    before = len(token_count.counter)
    token_count.clean()
    after = len(token_count.counter)
    assert before > after
def main(self):
    """Write, for each tweet, its sparse model representation: pairs of
    (term id, weight) plus the total number of terms, as JSON lines."""
    self.data = self.parser.parse_args()
    svc = load_model(self.data.model)
    with open(self.get_output(), 'w') as fpt:
        for tw in tweet_iterator(self.data.test_set):
            pairs = [(int(a), float(b)) for a, b in svc.model[tw['text']]]
            tw.update(dict(pairs + [('num_terms', svc.num_terms)]))
            fpt.write(json.dumps(tw) + "\n")
def tweet_iterator(self, fname):
    """Yield tweets from *fname* that match the configured language,
    replacing their text and skipping retweets."""
    lang = self._lang
    for tw in tweet_iterator(fname):
        if tw.get('lang', '') != lang:
            continue
        text = get_text(tw)
        # Retweets start with 'RT'; skip them
        if text[:2] == 'RT':
            continue
        tw['text'] = text
        yield tw
def test_textmodel_entropy():
    """Vocabulary size after entropy thresholding (threshold=0.01)."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus = list(tweet_iterator(fname))
    model = TextModel(corpus, threshold=0.01)
    assert isinstance(model, TextModel)
    print(len(model.model._w2id))
    assert len(model.model._w2id) == 39
def test_textmodel():
    """Indexing a TextModel with a text returns a list of pairs."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus = list(tweet_iterator(fname))
    model = TextModel([tweet['text'] for tweet in corpus])
    assert isinstance(model[corpus[0]['text']], list)
def test_label_encoder():
    """LabelEncoderWrapper maps arbitrary labels to 0..k-1."""
    import numpy as np
    from EvoMSA.base import LabelEncoderWrapper
    y = [2, 2, -1, 0, 0]
    wrapper = LabelEncoderWrapper().fit(y)
    print(wrapper._m)
    transformed = wrapper.transform(y)
    assert np.all(np.array([2, 2, 0, 1, 1]) == transformed)
    klasses = [x['klass'] for x in tweet_iterator(TWEETS)]
    LabelEncoderWrapper().fit(klasses)
def test_textmodel_entropy():
    """Vocabulary size after entropy thresholding (threshold=0.01)."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus = list(tweet_iterator(fname))
    model = TextModel(corpus, threshold=0.01)
    assert isinstance(model, TextModel)
    print(len(model.model._w2id))
    assert len(model.model._w2id) == 299
def test_textmodel_token_min_filter():
    """token_min_filter as an absolute count and as a fraction."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus = list(tweet_iterator(fname))
    model = TextModel(corpus, token_min_filter=1)
    print(len(model.model._w2id))
    assert len(model.model._w2id) == 59
    model = TextModel(corpus, token_min_filter=0.3)
    print(len(model.model._w2id))
    assert len(model.model._w2id) == 12
    # Combined with an entropy threshold it must still construct
    model = TextModel(corpus, token_min_filter=1, threshold=0.01)
def test_params():
    """TextModel accepts every combination of the basic options."""
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    options = sorted(dict(del_diac=[True, False],
                          usr_option=BASIC_OPTIONS,
                          url_option=BASIC_OPTIONS).items())
    fname = os.path.dirname(__file__) + '/text.json'
    text = [tw['text'] for tw in tweet_iterator(fname)]
    names = [name for name, _ in options]
    for combo in itertools.product(*[values for _, values in options]):
        ins = TextModel(text, **dict(zip(names, combo)))
        assert isinstance(ins[text[0]], list)
def test_decision_function():
    """Every prediction carries a decision_function entry."""
    from b4msa.command_line import params, train, test
    from microtc.utils import tweet_iterator
    import os
    import sys
    import tempfile
    model_file = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    sys.argv = ['b4msa', '-o', model_file, '-k', '2', fname, '-s', '2']
    params()
    sys.argv = ['b4msa', '-m', model_file, fname, '-o', model_file]
    train()
    predictions = tempfile.mktemp()
    sys.argv = ['b4msa', '-m', model_file, fname, '-o', predictions,
                '--decision-function']
    test()
    d = list(tweet_iterator(predictions))
    os.unlink(model_file)
    os.unlink(predictions)
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])