Пример #1
0
def test_tweet_iterator():
    """Check that ``tweet_iterator`` reads a gzip copy like the plain file.

    The ``text.json`` fixture that sits next to this test module is
    compressed into ``text.json.gz``; both files are read with
    ``tweet_iterator`` and must yield the same number of items with the
    same ``'text'`` values.
    """
    import os
    import gzip
    import shutil
    from b4msa.utils import tweet_iterator

    fname = os.path.dirname(__file__) + '/text.json'
    a = [x for x in tweet_iterator(fname)]
    fname_gz = fname + '.gz'
    try:
        # Copy the raw bytes so the compressed file is an exact replica of
        # the fixture, whatever characters it contains.
        with open(fname, 'rb') as fpt:
            with gzip.open(fname_gz, 'wb') as fpt2:
                shutil.copyfileobj(fpt, fpt2)
        b = [x for x in tweet_iterator(fname_gz)]
        assert len(a) == len(b)
        for a0, b0 in zip(a, b):
            assert a0['text'] == b0['text']
    finally:
        # Remove the generated file even when an assertion above fails, so
        # a failing run does not leave artefacts in the test directory.
        if os.path.exists(fname_gz):
            os.unlink(fname_gz)
Пример #2
0
def test_decision_function_gzip():
    """Run the params/train/test commands and check the gzip output.

    The three command-line entry points are driven through ``sys.argv``.
    The last one is asked to write to a ``.gz`` path with
    ``--decision-function``; every record read back from that file must
    carry a ``'decision_function'`` key and the file must not be empty.
    """
    from b4msa.command_line import params, train, test
    from b4msa.utils import tweet_iterator
    import os
    import sys
    import tempfile

    fname = os.path.dirname(__file__) + '/text.json'
    saved_argv = sys.argv
    try:
        # A private temporary directory avoids the race inherent to
        # tempfile.mktemp() and guarantees cleanup even if a command raises.
        with tempfile.TemporaryDirectory() as tmpdir:
            output = os.path.join(tmpdir, 'model')
            output2 = os.path.join(tmpdir, 'decision_function.json.gz')
            sys.argv = [
                'b4msa', '-H', '-lspanish', '-o', output, '-k', '2', fname,
                '-s', '2', '-n0'
            ]
            params()
            sys.argv = ['b4msa', '-m', output, fname, '-o', output]
            train()
            sys.argv = [
                'b4msa', '-m', output, fname, '-o', output2,
                '--decision-function'
            ]
            test()
            d = [x for x in tweet_iterator(output2)]
    finally:
        # Do not leak the fake command line into other tests.
        sys.argv = saved_argv
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])
Пример #3
0
    def main(self, args=None):
        """Compute out-of-fold decision values for the training set.

        Parses ``args``, takes the first entry of the parameters file as the
        keyword arguments of ``TextModel``, and runs a shuffled K-fold split
        (``kratio`` splits, seeded with ``seed``).  For each fold an ``SVC``
        is fitted on the training indices and its ``decision_function`` on
        the held-out indices is stored.  Every tweet of the training set is
        then written to the output file as one JSON line with an added
        ``'decision_function'`` entry.

        Returns the list of per-sample decision values.
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        logging.basicConfig(level=self.data.verbose)
        logger = logging.getLogger('b4msa')
        logger.setLevel(self.data.verbose)
        best = load_json(self.data.params_fname)[0]
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels)
        model = TextModel(corpus, **best)
        X = [model[text] for text in corpus]
        # One slot per sample; each slot is filled by the fold that holds
        # that sample out.
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True,
                      random_state=self.data.seed)
        for train_idx, test_idx in folds.split(X):
            classifier = SVC(model=model)
            classifier.fit([X[j] for j in train_idx],
                           [y[j] for j in train_idx])
            scores = classifier.decision_function([X[j] for j in test_idx])
            for position, score in zip(test_idx, scores):
                hy[position] = score

        with open(self.get_output(), 'w') as fpt:
            tweets = tweet_iterator(self.data.training_set)
            for index, tweet in enumerate(tweets):
                tweet['decision_function'] = hy[index].tolist()
                fpt.write(json.dumps(tweet) + "\n")
        return hy
Пример #4
0
def test_textmodel():
    """Indexing a ``TextModel`` with a text must return a list.

    The model is built from the ``'text'`` field of every record in the
    ``text.json`` fixture and then queried with the first of those texts.
    """
    from b4msa.textmodel import TextModel
    from b4msa.utils import tweet_iterator
    import os
    fixture = os.path.dirname(__file__) + '/text.json'
    tweets = [tweet for tweet in tweet_iterator(fixture)]
    model = TextModel([tweet['text'] for tweet in tweets])
    first_text = tweets[0]['text']
    assert isinstance(model[first_text], list)
Пример #5
0
 def main(self):
     """Label the test set with a pickled model and write JSON lines.

     Each text of the test set goes through
     ``svc.model.transform_q_voc_ratio``; the first element of the result
     is fed to the classifier and the second is stored under
     ``'q_voc_ratio'``.  Depending on the ``decision_function`` option,
     every tweet receives either a ``'klass'`` entry (stringified
     prediction) or a ``'decision_function'`` entry.  When the output path
     ends in ``.gz`` the lines are UTF-8 encoded and gzip-compressed.
     """
     self.data = self.parser.parse_args()
     logging.basicConfig(level=self.data.verbose)
     logger = logging.getLogger('b4msa')
     logger.setLevel(self.data.verbose)
     # NOTE(review): pickle.load can execute arbitrary code; the model file
     # must come from a trusted source.
     with open(self.data.model, 'rb') as model_file:
         svc = pickle.load(model_file)
     transformed = [
         svc.model.transform_q_voc_ratio(text)
         for text in read_data(self.data.test_set)
     ]
     ratios = [pair[1] for pair in transformed]
     X = [pair[0] for pair in transformed]
     path = self.get_output()
     gzip_flag = path.endswith('.gz')
     # The gzip handle is binary, the plain handle is text.
     sink = gzip.open(path, 'wb') if gzip_flag else open(path, 'w')
     with sink as fpt:
         want_scores = self.data.decision_function
         if want_scores:
             hy = svc.decision_function(X)
         else:
             hy = svc.predict(X)
         records = zip(tweet_iterator(self.data.test_set), hy, ratios)
         for tweet, outcome, ratio in records:
             if want_scores:
                 # Values exposing tolist() are converted so that
                 # json.dumps receives plain Python objects.
                 try:
                     value = outcome.tolist()
                 except AttributeError:
                     value = outcome
                 tweet['decision_function'] = value
             else:
                 tweet['klass'] = str(outcome)
             tweet['q_voc_ratio'] = ratio
             line = json.dumps(tweet) + "\n"
             if gzip_flag:
                 line = line.encode('utf-8')
             fpt.write(line)
Пример #6
0
 def main(self):
     """Write each test tweet enriched with its model representation.

     The pickled classifier is loaded from ``self.data.model``.  For every
     tweet, the pairs returned by ``svc.model[text]`` plus a
     ``('num_terms', svc.num_terms)`` pair are merged into the tweet, which
     is then written to the output file as one JSON line.
     """
     self.data = self.parser.parse_args()
     logging.basicConfig(level=self.data.verbose)
     # NOTE(review): pickle.load can execute arbitrary code; the model file
     # must come from a trusted source.
     with open(self.data.model, 'rb') as model_file:
         svc = pickle.load(model_file)
     with open(self.get_output(), 'w') as out:
         for tw in tweet_iterator(self.data.test_set):
             pairs = svc.model[tw['text']] + [('num_terms', svc.num_terms)]
             tw.update(dict(pairs))
             line = json.dumps(tw) + "\n"
             out.write(line)
Пример #7
0
def test_params():
    """Build a ``TextModel`` for every combination of a small option grid.

    The grid covers ``strip_diac``, ``usr_option`` and ``url_option``.  For
    each combination the model is built from the ``text.json`` fixture and
    indexing it with the first text must return a list.
    """
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from b4msa.utils import tweet_iterator

    grid = sorted(dict(strip_diac=[True, False],
                       usr_option=BASIC_OPTIONS,
                       url_option=BASIC_OPTIONS).items())
    names = [name for name, _ in grid]
    choices = [values for _, values in grid]
    fixture = os.path.dirname(__file__) + '/text.json'
    tweets = [tweet for tweet in tweet_iterator(fixture)]
    text = [tweet['text'] for tweet in tweets]
    for combination in itertools.product(*choices):
        kwargs = dict(zip(names, combination))
        model = TextModel(text, **kwargs)
        assert isinstance(model[text[0]], list)