Example #1
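A command-line main that loads a saved model, vectorizes the test set, and writes either the predicted class or the decision-function value of each tweet as JSON lines, gzip-compressing the output when the file name ends in .gz.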
 def main(self):
     self.data = self.parser.parse_args()
     svc = load_model(self.data.model)
     X = [svc.model[x] for x in read_data(self.data.test_set)]
     output = self.get_output()
     if output.endswith('.gz'):
         gzip_flag = True
         output = gzip.open(output, 'wb')
     else:
         gzip_flag = False
         output = open(output, 'w')
     with output as fpt:
         if not self.data.decision_function:
             hy = svc.predict(X)
             for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                 tweet['klass'] = str(klass)
                 cdn = json.dumps(tweet)+"\n"
                 cdn = bytes(cdn, encoding='utf-8') if gzip_flag else cdn
                 fpt.write(cdn)
         else:
             hy = svc.decision_function(X)
             for tweet, klass in zip(tweet_iterator(self.data.test_set), hy):
                 try:
                     o = klass.tolist()
                 except AttributeError:
                     o = klass
                 tweet['decision_function'] = o
                 cdn = json.dumps(tweet)+"\n"
                 cdn = bytes(cdn, encoding='utf-8') if gzip_flag else cdn
                 fpt.write(cdn)
Example #2
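Scores several prediction files against the gold labels in self.data.output; a '-' entry starts a new group, and the method prints each group's mean score, flagging the statistically significant ones (finite alpha from compute_p) with '*'.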
 def output(self):
     y = [x[self._klass] for x in tweet_iterator(self.data.output)]
     le = base.LabelEncoderWrapper().fit(y)
     perf = getattr(self, "_%s" % self.data.score)
     y = le.transform(y)
     D = []
     I = []
     for fname in self.data.predictions:
         if fname == '-':
             D.append(I)
             I = []
             continue
         hy = le.transform([x[self._klass] for x in tweet_iterator(fname)])
         I.append(perf(y, hy))
     if len(I):
         D.append(I)
     D = np.array(D).T
     p, alpha = compute_p(D)
     self._p = p
     self._alpha = alpha
     for _p, _alpha, mu in zip(p, alpha, D.mean(axis=0)):
         cdn = ''
         if np.isfinite(_alpha):
             cdn = " *"
         print("%0.4f" % mu, cdn)
Example #3
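Command-line main that trains an EvoMSA model on the first training set, optionally passing a test set and extra JSON-encoded keyword arguments, and saves the fitted model.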
 def main(self):
     fnames = self.data.training_set
     fname = fnames[0]
     _ = [[x, x[self._klass]] for x in tweet_iterator(fname)]
     D = [x[0] for x in _]
     Y = [x[1] for x in _]
     if self.data.test_set is not None:
         if os.path.isfile(self.data.test_set):
             test_set = [x for x in tweet_iterator(self.data.test_set)]
         else:
             test_set = self.data.test_set
     else:
         test_set = None
     kwargs = dict(n_jobs=self.data.n_jobs)
     if self.data.kwargs is not None:
         _ = json.loads(self.data.kwargs)
         kwargs.update(_)
     evo_kwargs = dict()
     if kwargs.get("stacked_method",
                   "EvoDAG.model.EvoDAGE") == "EvoDAG.model.EvoDAGE":
         evo_kwargs = dict(tmpdir=self.data.output_file + '_dir')
     if "stacked_method_args" in kwargs:
         evo_kwargs.update(kwargs["stacked_method_args"])
         del kwargs["stacked_method_args"]
     evo = base.EvoMSA(stacked_method_args=evo_kwargs, **kwargs)
     evo.fit(D, Y, test_set=test_set)
     save_model(evo, self.data.output_file)
Example #4
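Loads a fitted model, transforms every tweet in the input file, and writes each tweet back with its vector representation under the key vec.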
 def transform(self):
     predict_file = self.data.training_set[0]
     D = [x for x in tweet_iterator(predict_file)]
     evo = self.load_model(self.data.model)
     evo.exogenous = self._exogenous
     D = evo.transform(D)
     with open(self.data.output_file, 'w') as fpt:
         for x, v in zip(tweet_iterator(predict_file), D):
             _ = dict(vec=v.tolist())
             x.update(_)
             fpt.write(json.dumps(x) + '\n')
Example #5
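Command-line main that dispatches to transform or fitness when requested and otherwise implements --b4msa-df: it fits only the SVM stage of EvoMSA and writes the resulting vectors (vec) for the training set, and for the test set when one is given, as JSON lines.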
 def main(self):
     if self.data.transform:
         return self.transform()
     elif self.data.fitness:
         return self.fitness()
     if not self.data.b4msa_df:
         return
     fnames = self.data.training_set
     if not isinstance(fnames, list):
         fnames = [fnames]
     D = []
     Y = []
     for fname in fnames:
         _ = [[x, x[self._klass]] for x in tweet_iterator(fname)]
         D.append([x[0] for x in _])
         Y.append([x[1] for x in _])
     self._logger.info('Reading test_set %s' % self.data.test_set)
     if self.data.test_set is not None:
         test_set = [x for x in tweet_iterator(self.data.test_set)]
     else:
         test_set = None
     kwargs = dict(n_jobs=self.data.n_jobs)
     if self.data.kwargs is not None:
         _ = json.loads(self.data.kwargs)
         kwargs.update(_)
     b4msa_kwargs = {}
     if self.data.b4msa_kwargs is not None:
         _ = json.loads(self.data.b4msa_kwargs)
         b4msa_kwargs.update(_)
     evo = base.EvoMSA(b4msa_args=b4msa_kwargs, **kwargs)
     evo.fit_svm(D, Y)
     output = self.data.output_file
     if self.data.test_set is None:
         hy = evo.transform(D[0])
         with open(output, 'w') as fpt:
             for x, y in zip(tweet_iterator(fnames[0]), hy):
                 x.update(dict(vec=y.tolist()))
                 fpt.write(json.dumps(x) + '\n')
     else:
         if not os.path.isdir(output):
             os.mkdir(output)
         train = os.path.join(output, 'train.json')
         hy = evo.transform(D[0])
         with open(train, 'w') as fpt:
             for x, y in zip(tweet_iterator(fnames[0]), hy):
                 x.update(dict(vec=y.tolist()))
                 fpt.write(json.dumps(x) + '\n')
         test = os.path.join(output, 'test.json')
         hy = evo.transform(test_set)
         with open(test, 'w') as fpt:
             for x, y in zip(tweet_iterator(self.data.test_set), hy):
                 x.update(dict(vec=y.tolist()))
                 fpt.write(json.dumps(x) + '\n')
Example #6
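Builds a TextModel from the training sets (optionally restricted to the classes listed in the TEXTMODEL_KLASSES environment variable), computes out-of-fold decision functions with k-fold cross-validation, and stores them in each tweet under decision_function.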
    def main(self, args=None):
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        if self.data.conf:
            best = json.loads(self.data.conf)
        else:
            best = load_json(self.data.params_fname)[0]
        best = clean_params(best)
        corpus, labels = [], []
        for train in self.data.training_set:
            X_, y_ = read_data_labels(train)
            corpus.extend([x for x in tweet_iterator(train)])
            labels.extend(y_)
        le = LabelEncoder()
        if self.data.labels:
            le.fit(self.data.labels.split(','))
        else:
            le.fit(labels)
        y = le.transform(labels)
        model_klasses = os.environ.get('TEXTMODEL_KLASSES')

        if model_klasses:
            model_klasses = le.transform(model_klasses.split(','))
            docs_ = []
            labels_ = []
            for i in range(len(corpus)):
                if y[i] in model_klasses:
                    docs_.append(corpus[i])
                    labels_.append(y[i])

            t = TextModel(docs_, **best)
        else:
            t = TextModel(corpus, **best)

        X = [t[x] for x in corpus]
        hy = [None for x in y]
        for tr, ts in KFold(n_splits=self.data.kratio,
                            shuffle=True,
                            random_state=self.data.seed).split(X):
            c = ClassifierWrapper()
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            _ = c.decision_function([X[x] for x in ts])
            for k, v in zip(ts, _):
                hy[k] = v

        i = 0
        with open(self.get_output(), 'w') as fpt:
            for train in self.data.training_set:
                for tweet in tweet_iterator(train):
                    tweet['decision_function'] = hy[i].tolist()
                    i += 1
                    fpt.write(json.dumps(tweet) + "\n")
        return hy
Example #7
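Checks that Bernoulli.decision_function returns one row per tweet and one column per class, with all values in [-1, 1].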
def test_bernoulli():
    import numpy as np
    from EvoMSA.model import Corpus, Bernoulli
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.transform([x['text'] for x in tweet_iterator(TWEETS)])
    y = [x['klass'] for x in tweet_iterator(TWEETS)]
    le = LabelEncoder().fit(y)
    y = le.transform(y)
    b = Bernoulli()
    b.fit(X, y)
    pr = b.decision_function(X)
    assert pr.shape[0] == 1000 and pr.shape[1] == 4
    assert np.all((pr <= 1) & (pr >= -1))
Example #8
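The same check as Example #7 for the Multinomial classifier, vectorizing with Corpus.tonp instead of Corpus.transform.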
def test_multinomial():
    import numpy as np
    from EvoMSA.model import Corpus, Multinomial
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.tonp([c[x['text']] for x in tweet_iterator(TWEETS)])
    y = [x['klass'] for x in tweet_iterator(TWEETS)]
    le = LabelEncoder().fit(y)
    y = le.transform(y)
    b = Multinomial()
    b.fit(X, y)
    pr = b.decision_function(X)
    print(pr.shape[0], pr, b.num_terms)
    assert pr.shape[0] == 1000 and pr.shape[1] == 4
    assert np.all((pr <= 1) & (pr >= -1))
Example #9
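End-to-end command-line test: trains a model on TWEETS, predicts on the same data, and checks that the accuracy is between 0.8 and 1.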
def test_predict():
    import numpy as np
    sys.argv = ['EvoMSA', '--evodag-kw={"popsize": 10, "early_stopping_rounds": 10, "time_limit": 5, "n_estimators": 5}',
                '-ot.model', '-n2', TWEETS, TWEETS]
    train(output=True)
    sys.argv = ['EvoMSA', '-mt.model', '-ot1.json', TWEETS]
    predict()
    hy = np.array([x['klass'] for x in tweet_iterator('t1.json')])
    # every record must carry a decision_function
    assert all('decision_function' in x for x in tweet_iterator('t1.json'))
    y = np.array([x['klass'] for x in tweet_iterator(TWEETS)])
    acc = (y == hy).mean()
    print(acc)
    assert acc <= 1 and acc > 0.8
    os.unlink('t1.json')
    os.unlink('t.model')
Example #10
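Feeds a stream of tweets to a bigram TokenCount and checks the frequency of the bigram in~the.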
def test_TokenCount_process():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.bigrams()
    tcount.process(tweet_iterator(TWEETS))
    print(tcount.counter.most_common(10))
    assert tcount.counter["in~the"] == 313
Example #11
File: model.py Project: MDAlvarezH/EvoMSA
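Builds the token list for an English affective model by concatenating the word lists stored in en.affective.words.json.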
 def tokens(self, corpus):
     """Tokens used for modeling"""
     fname = os.path.join(ConPATH, 'data', 'en.affective.words.json')
     lst = []
     for x in tweet_iterator(fname):
         lst += x['words']
     return lst
Example #12
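Checks that tweet_iterator yields the same records from a plain JSON-lines file and from its gzip-compressed copy.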
def test_tweet_iterator():
    import os
    import gzip
    from microtc.utils import tweet_iterator
    
    fname = os.path.dirname(__file__) + '/text.json'
    a = [x for x in tweet_iterator(fname)]
    fname_gz = fname + '.gz'
    with open(fname, 'r') as fpt:
        with gzip.open(fname_gz, 'w') as fpt2:
            fpt2.write(fpt.read().encode('ascii'))
    b = [x for x in tweet_iterator(fname_gz)]
    assert len(a) == len(b)
    for a0, b0 in zip(a, b):
        assert a0['text'] == b0['text']
    os.unlink(fname_gz)
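For context, a minimal sketch of the JSON-lines input that tweet_iterator consumes throughout these examples: one JSON object per line. The file name toy.json and the records are illustrative, not taken from any example above.

import json
from microtc.utils import tweet_iterator

# Write two records, one JSON object per line (the format used above).
with open('toy.json', 'w') as fpt:
    for rec in [dict(text='good morning', klass='P'),
                dict(text='so sad', klass='N')]:
        fpt.write(json.dumps(rec) + '\n')

# tweet_iterator yields each line back as a dict.
for tw in tweet_iterator('toy.json'):
    print(tw['klass'], tw['text'])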
Example #13
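Constructor that builds the Spanish aggressiveness corpus by concatenating the word lists in the aggressiveness.es configuration file.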
 def __init__(self, *args, **kwargs):
     fname = os.path.join(os.path.dirname(__file__), 'conf',
                          'aggressiveness.es')
     corpus = []
     for x in tweet_iterator(fname):
         corpus += x['words']
     super(AggressivenessEs, self).__init__(corpus)
Example #14
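Command-line test of --b4msa-df: one run writes a single file, a second run with --test_set writes a directory, and the vectors in model.json and model/train.json must agree to three decimal places.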
def test_utils_b4msa_df():
    from EvoMSA.command_line import utils
    import shutil
    sys.argv = ['EvoMSA', '--kw={"seed": 1}', '-omodel.json', '--b4msa-df', TWEETS]
    utils(output=True)
    assert os.path.isfile('model.json')
    sys.argv = ['EvoMSA', '-omodel', '--b4msa-df', '--test_set', TWEETS, TWEETS]
    utils(output=True)
    assert os.path.isdir('model')
    dos = os.path.join('model', 'train.json')
    for a, b in zip(tweet_iterator('model.json'), tweet_iterator(dos)):
        for v, w in zip(a['vec'], b['vec']):
            print(v, w)
            assert_almost_equals(v, w, places=3)
    shutil.rmtree('model')
    os.unlink('model.json')
Example #15
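Checks that the params/train/predict command-line pipeline works when the class labels are numeric.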
def test_numeric_klass():
    from microtc.utils import tweet_iterator
    from microtc.command_line import params, train, predict
    from sklearn.preprocessing import LabelEncoder
    import os
    import json
    import tempfile
    import sys
    numeric = tempfile.mktemp() + '.json'
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    D = [x for x in tweet_iterator(fname)]
    encoder = LabelEncoder().fit([x['klass'] for x in D])
    y = encoder.transform([x['klass'] for x in D])
    for x, k in zip(D, y):
        x['klass'] = int(k)
    with open(numeric, 'w') as fpt:
        for x in D:
            fpt.write(json.dumps(x) + '\n')
    sys.argv = ['microtc', '-o', output, '-k', '2', numeric, '-s', '2']
    params()
    sys.argv = ['microtc', '-m', output, numeric, '-o', output]
    train()
    output2 = tempfile.mktemp()
    sys.argv = ['microtc', '-m', output, fname, '-o', output2]
    predict()
    os.unlink(numeric)
    os.unlink(output)
    os.unlink(output2)
Example #16
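Checks that a TFIDF model built directly from tokenized documents matches one rebuilt from a Counter of the same documents, term by term.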
def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    import numpy as np
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    for x in docs:
        counter.update(set(x))
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    # every term of the rebuilt model must exist in the original
    for k in tfidf2.word2id:
        assert k in tfidf.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
Example #17
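Class method that fits a classifier from a file: it builds a TextModel over the tweets and fits on their vectorized form and klass labels.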
 def fit_from_file(cls, fname, textModel_params={}):
     D = [x for x in tweet_iterator(fname)]
     # X, y = read_data_labels(fname)
     y = [x['klass'] for x in D]
     model = TextModel(D, **textModel_params)
     svc = cls(model)
     return svc.fit([model[x] for x in D], y)
Example #18
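A simpler variant of Example #6: a single training set, an SVC wrapper, and out-of-fold decision functions written as JSON lines.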
    def main(self, args=None):
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        best = load_json(self.data.params_fname)
        if isinstance(best, list):
            best = best[0]
        best = clean_params(best)
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)
        t = TextModel(corpus, **best)
        X = [t[x] for x in corpus]
        hy = [None for x in y]
        for tr, ts in KFold(n_splits=self.data.kratio,
                            shuffle=True, random_state=self.data.seed).split(X):
            c = SVC(model=t)
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            _ = c.decision_function([X[x] for x in ts])
            for k, v in zip(ts, _):
                hy[k] = v

        i = 0
        with open(self.get_output(), 'w') as fpt:
            for tweet in tweet_iterator(self.data.training_set):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet)+"\n")
        return hy
Example #19
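Exercises a SemanticTokenEs subclass backed by a test emotion space, checking the shape of its weight matrix and the transform of a short text.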
def test_semantic_token():
    from EvoMSA.model import SemanticTokenEs, EmoSpaceEs

    class EmoTest(EmoSpaceEs):
        @staticmethod
        def model_fname():
            return 'test.evoemo'

    class STest(SemanticTokenEs):
        @property
        def semantic_space(self):
            """Semantic space

            :rtype: instance
            """

            try:
                return self._semantic_space
            except AttributeError:
                self._semantic_space = EmoTest()
            return self._semantic_space

    corpus = [x for x in tweet_iterator(TWEETS)]
    semantic = STest(corpus)
    print(semantic._weight.shape[0])
    assert semantic._weight.shape[0] == 999
    tr = semantic.transform([dict(text='buenos dias')])[0]
    print(tr)
    assert len(tr) == 3
    print([semantic.id2token[x[0]] for x in tr])
Example #20
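Command-line test that --decision-function output can be written to, and read back from, a gzip-compressed file.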
def test_decision_function_gzip():
    from b4msa.command_line import params, train, test
    from microtc.utils import tweet_iterator
    import os
    import sys
    import tempfile
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    sys.argv = [
        'b4msa', '-H', '-lspanish', '-o', output, '-k', '2', fname, '-s', '2',
        '-n0'
    ]
    params()
    sys.argv = ['b4msa', '-m', output, fname, '-o', output]
    train()
    output2 = tempfile.mktemp() + '.gz'
    sys.argv = [
        'b4msa', '-m', output, fname, '-o', output2, '--decision-function'
    ]
    test()
    d = [x for x in tweet_iterator(output2)]
    os.unlink(output)
    os.unlink(output2)
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])
Example #21
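The same setup as Example #19 for SemanticAffectiveEs, additionally checking that tokens(None) is non-empty.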
def test_semantic_affective_es():
    from EvoMSA.model import SemanticAffectiveEs, EmoSpaceEs

    class EmoTest(EmoSpaceEs):
        @staticmethod
        def model_fname():
            return 'test.evoemo'

    class STest(SemanticAffectiveEs):
        @property
        def semantic_space(self):
            """Semantic space

            :rtype: instance
            """

            try:
                return self._semantic_space
            except AttributeError:
                self._semantic_space = EmoTest()
            return self._semantic_space

    corpus = [x for x in tweet_iterator(TWEETS)]
    semantic = STest(corpus)
    tokens = semantic.tokens(None)
    assert tokens
    print(semantic._weight.shape[0])
    assert semantic._weight.shape[0] == 1386
Example #22
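A variant of Example #15 that passes a modified parameter space (a Fixed dist_vector) directly to params, train, and predict instead of going through sys.argv.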
def test_numeric_klass():
    from microtc.utils import tweet_iterator
    from microtc.params import DefaultParams, Fixed
    from microtc.command_line import params, train, predict
    from sklearn.preprocessing import LabelEncoder
    import os
    import json
    import tempfile
    import sys
    numeric = tempfile.mktemp() + '.json'
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    D = [x for x in tweet_iterator(fname)]
    encoder = LabelEncoder().fit([x['klass'] for x in D])
    y = encoder.transform([x['klass'] for x in D])
    for x, k in zip(D, y):
        x['klass'] = int(k)

    with open(numeric, 'w') as fpt:
        for x in D:
            fpt.write(json.dumps(x) + '\n')

    P = DefaultParams.copy()
    P["dist_vector"] = Fixed("entropy+0+1")
    params('-o', output, '-k', '2', numeric, '-s', '2', **P)
    train('-m', output, numeric, '-o', output)
    output2 = tempfile.mktemp()
    predict('-m', output, fname, '-o', output2)
    os.unlink(numeric)
    os.unlink(output)
    os.unlink(output2)
Example #23
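Checks that a TextModel can be fitted with each of the supported weighting schemes.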
def test_textmodel_weighting_key():
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    for w in ['tfidf', 'tf', 'entropy']:
        TextModel(token_list=[-2, -1], weighting=w).fit(tw)
Example #24
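Tests OutputClassifier, which dumps its training and test matrices to CSV files as a side effect of fit and decision_function.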
def test_OutputClassifier():
    from EvoMSA.model import Corpus, OutputClassifier
    from sklearn.preprocessing import LabelEncoder
    c = Corpus([x['text'] for x in tweet_iterator(TWEETS)])
    X = c.transform([x['text'] for x in tweet_iterator(TWEETS)])
    y = [x['klass'] for x in tweet_iterator(TWEETS)]
    le = LabelEncoder().fit(y)
    y = le.transform(y)
    b = OutputClassifier(output='xx')
    assert b._output == 'xx'
    b.fit(X, y)
    assert os.path.isfile('xx_train.csv')
    pr = b.decision_function(X)
    assert os.path.isfile('xx_test.csv')
    assert len(open('xx_test.csv').readlines()) == pr.shape[0]
    os.unlink('xx_train.csv')
    os.unlink('xx_test.csv')
Example #25
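Checks that TokenCount.clean shrinks the co-occurrence counter.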
def test_TokenCount_clean():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.single_co_ocurrence()    
    tcount.process(tweet_iterator(TWEETS))
    ant = len(tcount.counter)
    tcount.clean()
    act = len(tcount.counter)
    assert ant > act
Example #26
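Command-line main that writes, for each tweet, its sparse vector (term id to weight) together with the model's num_terms.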
 def main(self):
     self.data = self.parser.parse_args()
     svc = load_model(self.data.model)
     with open(self.get_output(), 'w') as fpt:
         for tw in tweet_iterator(self.data.test_set):
             extra = dict([(int(a), float(b)) for a, b in svc.model[tw['text']]]
                          + [('num_terms', svc.num_terms)])
             tw.update(extra)
             fpt.write(json.dumps(tw) + "\n")
Example #27
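A filtering wrapper around tweet_iterator that keeps only tweets in the target language and skips retweets.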
 def tweet_iterator(self, fname):
     lang = self._lang
     for tw in tweet_iterator(fname):
         if tw.get('lang', '') == lang:
             text = get_text(tw)
             if text[:2] == 'RT':
                 continue
             tw['text'] = text
             yield tw
Example #28
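Checks the vocabulary size of a TextModel fitted with an entropy threshold.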
def test_textmodel_entropy():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, threshold=0.01)
    assert isinstance(text, TextModel)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 39
Example #29
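Smoke test: vectorizing a text with a fitted TextModel returns a list of (term, weight) pairs.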
def test_textmodel():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel([x['text'] for x in tw])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
Example #30
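Tests LabelEncoderWrapper on numeric labels (mapped to a contiguous range) and on the string labels in TWEETS.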
def test_label_encoder():
    import numpy as np
    from EvoMSA.base import LabelEncoderWrapper
    y = [2, 2, -1, 0, 0]
    l = LabelEncoderWrapper().fit(y)
    print(l._m)
    yy = l.transform(y)
    assert np.all(np.array([2, 2, 0, 1, 1]) == yy)
    y = [x['klass'] for x in tweet_iterator(TWEETS)]
    l = LabelEncoderWrapper().fit(y)
Example #31
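The same check as Example #28 with a different expected vocabulary size, apparently from another version of the test data.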
def test_textmodel_entropy():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, threshold=0.01)
    assert isinstance(text, TextModel)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 299
Example #32
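Checks the vocabulary sizes produced by different token_min_filter settings.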
def test_textmodel_token_min_filter():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, token_min_filter=1)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 59
    text = TextModel(tw, token_min_filter=0.3)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 12
    text = TextModel(tw, token_min_filter=1, threshold=0.01)
Example #33
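Builds a TextModel for every combination of del_diac, usr_option, and url_option and checks that vectorization returns a list.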
def test_params():
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator

    params = dict(del_diac=[True, False], usr_option=BASIC_OPTIONS,
                  url_option=BASIC_OPTIONS)
    params = sorted(params.items())
    fname = os.path.dirname(__file__) + '/text.json'
    tw = [x for x in tweet_iterator(fname)]
    text = [x['text'] for x in tw]
    for x in itertools.product(*[x[1] for x in params]):
        args = dict(zip([x[0] for x in params], x))
        ins = TextModel(text, **args)
        assert isinstance(ins[text[0]], list)
Example #34
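The plain-text counterpart of Example #20: every record written by --decision-function must contain a decision_function field.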
def test_decision_function():
    from b4msa.command_line import params, train, test
    from microtc.utils import tweet_iterator
    import os
    import sys
    import tempfile
    output = tempfile.mktemp()
    fname = os.path.dirname(__file__) + '/text.json'
    sys.argv = ['b4msa', '-o', output, '-k', '2', fname, '-s', '2']
    params()
    sys.argv = ['b4msa', '-m', output, fname, '-o', output]
    train()
    output2 = tempfile.mktemp()
    sys.argv = ['b4msa', '-m', output, fname,
                '-o', output2, '--decision-function']
    test()
    d = [x for x in tweet_iterator(output2)]
    os.unlink(output)
    os.unlink(output2)
    assert len(d)
    assert len(d) == len([x for x in d if 'decision_function' in x])