def test_tweet_iterator():
    """Check that ``tweet_iterator`` reads plain and gzip files identically.

    The ``text.json`` fixture that sits next to this module is compressed into
    a sibling ``text.json.gz`` file.  Both files are read with
    ``tweet_iterator`` and the ``text`` field of every record is compared
    pairwise.
    """
    import os
    import gzip
    from b4msa.utils import tweet_iterator

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    a = list(tweet_iterator(fname))
    fname_gz = fname + '.gz'
    try:
        # Copy raw bytes: the compressed file is then an exact image of the
        # fixture and the copy cannot fail on non-ASCII content.
        with open(fname, 'rb') as fpt, gzip.open(fname_gz, 'wb') as fpt2:
            fpt2.write(fpt.read())
        b = list(tweet_iterator(fname_gz))
    finally:
        # Always remove the generated file so a failing run does not leave
        # it behind in the test directory.
        if os.path.exists(fname_gz):
            os.unlink(fname_gz)
    assert len(a) == len(b)
    for a0, b0 in zip(a, b):
        assert a0['text'] == b0['text']
def test_decision_function_gzip():
    """Run params -> train -> test through the CLI with gzip output.

    The three command-line entry points are driven by setting ``sys.argv``.
    The last step writes its predictions to a ``.gz`` file with
    ``--decision-function``; the test checks that the file is not empty and
    that every record carries a ``decision_function`` key.
    """
    from b4msa.command_line import params, train, test
    from b4msa.utils import tweet_iterator
    import os
    import sys
    import tempfile

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    try:
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp and guarantees every generated file is removed,
        # even when one of the steps raises.
        with tempfile.TemporaryDirectory() as tmpdir:
            # The same path holds the parameter file first and is then
            # overwritten with the trained model, as the CLI calls below show.
            output = os.path.join(tmpdir, 'model')
            output2 = os.path.join(tmpdir, 'predictions.gz')
            sys.argv = [
                'b4msa', '-H', '-lspanish', '-o', output, '-k', '2',
                fname, '-s', '2', '-n0'
            ]
            params()
            sys.argv = ['b4msa', '-m', output, fname, '-o', output]
            train()
            sys.argv = [
                'b4msa', '-m', output, fname, '-o', output2,
                '--decision-function'
            ]
            test()
            # Materialize the records before the directory is deleted.
            d = list(tweet_iterator(output2))
    finally:
        # Do not leak the fake command line into other tests.
        sys.argv = saved_argv
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])
def main(self, args=None):
    """Compute out-of-fold decision values and store them with each tweet.

    The first entry of the parameter file configures a ``TextModel`` built
    on the training corpus.  For every ``KFold`` split an ``SVC`` is fitted
    on the training part and its ``decision_function`` output for the held
    out part is stored at the matching positions.  Each tweet of the
    training set is then written, one JSON document per line, with a
    ``decision_function`` entry.

    :param args: argument list handed to the parser (``None`` lets the
        parser use its default source)
    :return: list with one decision-function value per training example
    """
    self.data = self.parser.parse_args(args=args)
    opts = self.data
    assert not opts.update_klass
    logging.basicConfig(level=opts.verbose)
    logging.getLogger('b4msa').setLevel(opts.verbose)

    best = load_json(opts.params_fname)[0]
    print(opts.params_fname, opts.training_set)
    corpus, labels = read_data_labels(opts.training_set)

    encoder = LabelEncoder()
    encoder.fit(labels)
    y = encoder.transform(labels)

    model = TextModel(corpus, **best)
    X = [model[text] for text in corpus]
    hy = [None] * len(y)

    folds = KFold(n_splits=opts.kratio, shuffle=True,
                  random_state=opts.seed)
    for train_idx, test_idx in folds.split(X):
        classifier = SVC(model=model)
        classifier.fit([X[j] for j in train_idx],
                       [y[j] for j in train_idx])
        scores = classifier.decision_function([X[j] for j in test_idx])
        # Put each held-out score back at the position of its example.
        for position, score in zip(test_idx, scores):
            hy[position] = score

    with open(self.get_output(), 'w') as fpt:
        for index, tweet in enumerate(tweet_iterator(opts.training_set)):
            tweet['decision_function'] = hy[index].tolist()
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_textmodel():
    """A ``TextModel`` built on the fixture maps a tweet's text to a list."""
    import os
    from b4msa.utils import tweet_iterator
    from b4msa.textmodel import TextModel

    fixture = os.path.dirname(__file__) + '/text.json'
    tweets = [tweet for tweet in tweet_iterator(fixture)]
    model = TextModel([tweet['text'] for tweet in tweets])
    first_text = tweets[0]['text']
    assert isinstance(model[first_text], list)
def main(self):
    """Predict the test set with a pickled model and write JSON lines.

    Every item of the test set is transformed with the model's
    ``transform_q_voc_ratio``; element 0 of the result is the classifier
    input and element 1 is stored as ``q_voc_ratio``.  Depending on the
    ``decision_function`` option each tweet receives either a
    ``decision_function`` entry or a ``klass`` entry (the prediction as a
    string).  The output is gzip-compressed when its name ends in ``.gz``.
    """
    self.data = self.parser.parse_args()
    logging.basicConfig(level=self.data.verbose)
    logger = logging.getLogger('b4msa')
    logger.setLevel(self.data.verbose)
    # NOTE(review): pickle.load can execute arbitrary code; the model path
    # comes from the command line, so only trusted model files must be used.
    with open(self.data.model, 'rb') as fpt:
        svc = pickle.load(fpt)
    transformed = [
        svc.model.transform_q_voc_ratio(x)
        for x in read_data(self.data.test_set)
    ]
    qv = [x[1] for x in transformed]
    X = [x[0] for x in transformed]
    output = self.get_output()
    gzip_flag = output.endswith('.gz')
    # The gzip stream is binary while the plain file is text, hence the
    # per-line conversion to bytes below.
    output = gzip.open(output, 'wb') if gzip_flag else open(output, 'w')
    use_decision = self.data.decision_function
    with output as fpt:
        if use_decision:
            key = 'decision_function'
            hy = svc.decision_function(X)
        else:
            key = 'klass'
            hy = svc.predict(X)
        for tweet, value, r in zip(tweet_iterator(self.data.test_set),
                                   hy, qv):
            if use_decision:
                try:
                    value = value.tolist()
                except AttributeError:
                    # Values without a tolist method are stored unchanged.
                    pass
            else:
                value = str(value)
            tweet[key] = value
            tweet['q_voc_ratio'] = r
            cdn = json.dumps(tweet) + "\n"
            if gzip_flag:
                cdn = bytes(cdn, encoding='utf-8')
            fpt.write(cdn)
def main(self):
    """Write every test tweet extended with its text-model representation.

    The pickled model is loaded, each tweet's ``text`` is looked up in
    ``svc.model`` and the resulting pairs, plus a ``num_terms`` entry taken
    from the loaded object, are merged into the tweet.  One JSON document
    is written per line.
    """
    self.data = self.parser.parse_args()
    opts = self.data
    logging.basicConfig(level=opts.verbose)
    with open(opts.model, 'rb') as model_file:
        svc = pickle.load(model_file)
    with open(self.get_output(), 'w') as out:
        for tw in tweet_iterator(opts.test_set):
            pairs = svc.model[tw['text']] + [('num_terms', svc.num_terms)]
            tw.update(dict(pairs))
            line = json.dumps(tw) + "\n"
            out.write(line)
def test_params():
    """Build a ``TextModel`` for every combination of a small option grid.

    The grid covers ``strip_diac``, ``usr_option`` and ``url_option``; for
    each combination the model must map the first fixture text to a list.
    """
    import itertools
    import os
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from b4msa.utils import tweet_iterator

    grid = dict(strip_diac=[True, False],
                usr_option=BASIC_OPTIONS,
                url_option=BASIC_OPTIONS)
    # Fix the option order once so names and values stay aligned.
    names = sorted(grid)
    choices = [grid[name] for name in names]

    fixture = os.path.dirname(__file__) + '/text.json'
    texts = [tweet['text'] for tweet in tweet_iterator(fixture)]

    for combination in itertools.product(*choices):
        kwargs = dict(zip(names, combination))
        model = TextModel(texts, **kwargs)
        assert isinstance(model[texts[0]], list)