Example #1
    def run(self, X, y):
        """Perform the search using X and y to guide it

        :param X: Dataset - independent variables
        :type X: list
        :param y: Dataset - dependent variable
        :type y: list or np.array
        :rtype: :py:class:`EvoMSA.model_selection.Node`
        """

        self._logger.info("Starting the search")
        self._le = self._le.fit(y)
        y = self._le.transform(y)
        r = [(node.performance(X, y), node) for node in self._nodes]
        node = max(r, key=lambda x: x[0])[1]
        while True:
            self._logger.info("Model: %s perf: %0.4f" % (node, node.perf))
            nodes = list(node)
            if len(nodes) == 0:
                if self._output:
                    save_model(node, self._output)
                return node
            r = [(xx.performance(X, y), xx) for xx in nodes]
            perf, comp = max(r, key=lambda x: x[0])
            if perf < node.perf:
                break
            node = comp
        if self._output:
            save_model(node, self._output)
        return node
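
A minimal usage sketch for run; NodeSearch and its constructor argument are hypothetical stand-ins for the enclosing class, which this excerpt does not show.

# Hypothetical usage; `NodeSearch` stands in for the enclosing class.
search = NodeSearch(output='best.model')   # assumed constructor
best = search.run(X_train, y_train)        # X_train, y_train: labeled data
print(best.perf)                           # performance of the selected Node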
Example #2
    def main(self):
        fnames = self.data.training_set
        fname = fnames[0]
        _ = [[x, x[self._klass]] for x in tweet_iterator(fname)]
        D = [x[0] for x in _]
        Y = [x[1] for x in _]
        if self.data.test_set is not None:
            if os.path.isfile(self.data.test_set):
                test_set = [x for x in tweet_iterator(self.data.test_set)]
            else:
                test_set = self.data.test_set
        else:
            test_set = None
        kwargs = dict(n_jobs=self.data.n_jobs)
        if self.data.kwargs is not None:
            _ = json.loads(self.data.kwargs)
            kwargs.update(_)
        evo_kwargs = dict()
        if kwargs.get("stacked_method",
                      "EvoDAG.model.EvoDAGE") == "EvoDAG.model.EvoDAGE":
            evo_kwargs = dict(tmpdir=self.data.output_file + '_dir')
        if "stacked_method_args" in kwargs:
            evo_kwargs.update(kwargs["stacked_method_args"])
            del kwargs["stacked_method_args"]
        evo = base.EvoMSA(stacked_method_args=evo_kwargs, **kwargs)
        evo.fit(D, Y, test_set=test_set)
        save_model(evo, self.data.output_file)
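
Roughly the same training flow without the command-line plumbing; the file names and the 'klass' key are placeholders, while base.EvoMSA, fit, and save_model are used exactly as above.

# Sketch under the assumptions stated above.
from microtc.utils import tweet_iterator, save_model
from EvoMSA import base

D = [x for x in tweet_iterator('train.json')]   # hypothetical path
Y = [x['klass'] for x in D]                     # assumed label key
evo = base.EvoMSA(n_jobs=4).fit(D, Y)
save_model(evo, 'model.evomsa')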
Example #3
def bow(lang='zh', num_terms=2**14):
    tweets = data_bow(lang=lang)
    token_min_filter = 0
    if lang == 'zh':
        token_list = [1, 2, 3]
        q_grams_words = False
        # token_min_filter=0.0005
    else:
        token_list = [-1, 2, 3, 4]
        q_grams_words = True
        # token_min_filter=0.001
    tm = TextModel(token_list=token_list,
                   token_max_filter=len(tweets),
                   token_min_filter=token_min_filter,
                   q_grams_words=q_grams_words,
                   **TM_ARGS).fit(tweets)

    model = tm.model
    id2word = {v: k for k, v in model.word2id.items()}
    N = model._ndocs
    # N / 2**v inverts what appears to be a log2-scaled IDF weight, so the
    # vocabulary can be ranked by document frequency.
    word_weight = [[N / 2**v, id2word[k]] for k, v in model.wordWeight.items()]
    word_weight.sort(key=lambda x: x[0], reverse=True)
    word_weight = word_weight[:num_terms]
    # Keep the num_terms highest-ranked tokens and re-index them.
    model.word2id = {token: k for k, (w, token) in enumerate(word_weight)}
    model.wordWeight = {k: w for k, (w, token) in enumerate(word_weight)}

    save_model(tm, join('models', f'{lang}_{microtc.__version__}.microtc'))
    return tm
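
A hypothetical call; it assumes the installed microtc version provides TextModel.transform (the AttributeError fallback in Example #6 suggests some versions may not).

tm = bow(lang='zh', num_terms=2**14)
vec = tm.transform(['an example text'])  # sparse representation (assumed API)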
Example #4
    def run(self, X, y, early_stopping=1000, **kwargs):
        """

        :param early_stopping: Number of rounds to perform early stopping
        :type early_stopping: int
        :rtype: :py:class:`EvoMSA.model_selection.Node`
        """
        visited = set([(node.performance(X, y, **kwargs), node)
                       for node in self._nodes])
        _ = max(visited, key=lambda x: x[0])[1]
        best = None
        nodes = LifoQueue()
        nodes.put(_)
        index = len(visited)
        while not nodes.empty() and (len(visited) - index) < early_stopping:
            node = nodes.get()
            if best is None or node > best:
                index = len(visited)
                best = node
                if self._output:
                    save_model(best, self._output)
            self._logger.info("Model: %s perf: %0.4f " % (best, best.perf) +
                              "visited: %s " % len(visited) +
                              "size: %s " % nodes.qsize() + "Rounds: %s" %
                              (len(visited) - index))
            nn = [(xx, xx.fit(self._X, self._y,
                              **self._kwargs).performance(X, y, **kwargs))
                  for xx in node if xx not in visited]
            for x, _ in nn:
                visited.add(x)
            nn = [xx for xx, perf in nn if perf >= self.perf(node)]
            if len(nn) == 0:
                continue
            nn.sort()
            for x in nn:
                nodes.put(x)
        return best
Example #5
def test_textmodel_save_load():
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel().fit(tw)
    save_model(tm, 't.model')
    assert isinstance(load_model('t.model'), TextModel)
    os.unlink('t.model')
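
For reference, a rough stand-in for save_model/load_model, assuming models are stored as gzip-compressed pickles; microtc's actual implementation may differ.

import gzip
import pickle

def save_model_sketch(obj, fname):
    # Assumed storage format: gzip-compressed pickle.
    with gzip.open(fname, 'wb') as fpt:
        pickle.dump(obj, fpt)

def load_model_sketch(fname):
    with gzip.open(fname, 'rb') as fpt:
        return pickle.load(fpt)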
Example #6
def vector_space(args):
    k, t, X, output = args
    # Reuse a cached transformation when one is already on disk.
    if output is not None and os.path.isfile(output):
        return k, load_model(output)
    try:
        res = t.transform(X)
    except AttributeError:
        # Fall back when the model does not provide transform.
        res = t.tonp([t[_] for _ in X])
    if output is not None:
        save_model(res, output)
    return k, res
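
A hypothetical way to drive vector_space in parallel; the text models tm1 and tm2, the corpus X, and the cache paths are placeholders.

from multiprocessing import Pool

jobs = [(0, tm1, X, 'cache_0.vs'), (1, tm2, X, 'cache_1.vs')]
with Pool(2) as pool:
    results = dict(pool.map(vector_space, jobs))  # {k: transformed corpus}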
Example #7
    def create_space(cls, fname, output=None, **kwargs):
        """Create the space from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model, it is cls.model_fname if None
        :type output: str
        :param kwargs: Keywords pass to TextModel
        """
        tm, coef, intercept, klass = cls._create_space(fname, **kwargs)
        if output is None:
            output = cls.model_fname()
        save_model([tm, coef, intercept, klass], output)
Example #8
def test_counter():
    from microtc.utils import Counter, save_model, load_model
    import os
    c = Counter()
    c.update([1, 2, 3, 1])
    c.update([3])
    assert c[1] == 2
    print(c.update_calls)
    assert c.update_calls == 2
    save_model(c, "t.voc")
    cc = load_model("t.voc")
    os.unlink("t.voc")
    print(cc.update_calls, "**")
    assert cc.update_calls == 2
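
A minimal sketch of a Counter that records its update calls, consistent with the assertions above; microtc's actual Counter may be implemented differently.

from collections import Counter as BaseCounter

class CountingCounter(BaseCounter):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_calls = 0  # reset after the update triggered by __init__

    def update(self, *args, **kwargs):
        # Count every call, including the one made by Counter.__init__.
        self.update_calls = getattr(self, 'update_calls', 0) + 1
        super().update(*args, **kwargs)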
Example #9
def get_model(basename, data, labels, args):
    modelfile = get_filename(args, os.path.join("models", os.path.basename(basename)))

    if not os.path.exists(modelfile):

        if not os.path.isdir("models"):
            os.mkdir("models")

        args['docs'] = data
        model = TextModel(**args)
        save_model(model, modelfile)
    else:
        model = load_model(modelfile)
    return model
Example #10
    def main(self):
        self.data = self.parser.parse_args()
        params_fname = self.data.params_fname
        if params_fname is not None:
            best = load_json(params_fname)
            if isinstance(best, list):
                best = best[0]
        else:
            best = dict()
        best = clean_params(best)
        kw = json.loads(self.data.kwargs) if self.data.kwargs is not None else dict()
        best.update(kw)
        svc = SVC.fit_from_file(self.data.training_set, best)
        save_model(svc, self.get_output())
Example #11
    def create_space(cls, fname, output, **kwargs):
        """Create the model from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model
        :type output: str
        :param kwargs: Keywords pass to TextModel
        """

        X = [x for x in tweet_iterator(fname)]
        m = cls(**kwargs)
        m.fit(X, [x['klass'] for x in X])
        save_model(m, output)
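
A hypothetical invocation; SomeModel stands in for whichever class defines this classmethod, and token_list is one example of a keyword forwarded to TextModel.

SomeModel.create_space('train.json', 'space.model', token_list=[-1, 2, 3])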
Example #12
def test_evomsa_wrapper():
    from microtc.utils import save_model
    from EvoMSA.base import EvoMSA
    from test_base import get_data
    X, y = get_data()
    model = EvoMSA(stacked_method="sklearn.naive_bayes.GaussianNB",
                   n_jobs=2).fit(X, y)
    save_model(model, 'tmp.evomsa')
    assert os.path.isfile('tmp.evomsa')
    evo = EvoMSA(models=[["tmp.evomsa", "EvoMSA.model.Identity"]],
                 stacked_method="sklearn.naive_bayes.GaussianNB",
                 n_jobs=2).fit(X, y)
    assert evo
    os.unlink("tmp.evomsa")
Example #13
    def fit_svm(self, Xvs, y):
        svc_models = []
        for (_, cl), X, output in zip(self.models, Xvs, self.cache.ml_train()):
            # Reuse a previously trained classifier when cached on disk.
            if output is not None and os.path.isfile(output):
                svc_models.append(load_model(output))
                continue
            try:
                c = cl(random_state=self._seed)
            except TypeError:
                # Some classifiers do not accept random_state.
                c = cl()
            c.fit(X, y)
            svc_models.append(c)
            if output is not None:
                save_model(c, output)
        self._svc_models = svc_models
Example #14
def recall_emo(lang='zh', n_jobs=1):
    def predict(fname, ds, tm, emoji):
        D = []
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            _ = [[x['text'], label] for label, x in zip(labels, tweets)
                 if len(klasses.intersection(label))]
            D.extend(_)
        X = tm.transform([x for x, _ in D])
        y = [y for _, y in D]
        hy = []
        for k, emo in enumerate(emoji):
            output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            m = load_model(f'{output}.LinearSVC')
            hy.append(m.predict(X))
        return y, hy

    def performance(emo, y, hy):
        y_emo = [emo in i for i in y]
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    # Keep only the emojis that occur at least 2**10 times.
    info = load_model(join('models', f'{lang}_emo.info'))
    emoji = [x for x, v in info.most_common() if v >= 2**10]
    klasses = set(emoji)
    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)
    y = []
    for x, _ in predictions:
        y.extend(x)
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])
    _ = Parallel(n_jobs=n_jobs)(delayed(performance)(emo, y, hy)
                                for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {
            'recall': perf,
            'ratio': ratio
        }
        for emo, (perf, ratio) in zip(emoji, _)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
Example #15
def store_tweets(lang, date, output_path=create_output_path):
    output = output_path(lang)
    _ = '{year:d}{month:02d}{day:02d}.gz'.format(**date)
    output = join(output, _)
    if isfile(output):
        return
    tw_iterator = TweetIterator(lang)
    freq = GeoFrequency([], reader=tw_iterator.tweet_iterator)
    freq.data = defaultdict(Process)
    freq.compute_file(date)
    output_dict = defaultdict(list)
    for k, v in freq.data.items():
        key = k.split('-')[0]
        output_dict[key].extend(v.data)
    save_model(output_dict, output)
    return output
Example #16
    def main(self):
        fnames = self.data.training_set
        if not isinstance(fnames, list):
            fnames = [fnames]
        D = []
        Y = []
        for fname in fnames:
            _ = [[x, x[self._klass]] for x in tweet_iterator(fname)]
            D.append([x[0] for x in _])
            Y.append([x[1] for x in _])
        if self.data.test_set is not None:
            if os.path.isfile(self.data.test_set):
                test_set = [x for x in tweet_iterator(self.data.test_set)]
            else:
                test_set = self.data.test_set
        else:
            test_set = None
        kwargs = dict(n_jobs=self.data.n_jobs)
        if self.data.kwargs is not None:
            _ = json.loads(self.data.kwargs)
            kwargs.update(_)
        evo_kwargs = dict(tmpdir=self.data.output_file + '_dir',
                          fitness_function='macro-F1')
        if self.data.evo_kwargs is not None:
            _ = json.loads(self.data.evo_kwargs)
            evo_kwargs.update(_)
        b4msa_kwargs = {}
        if self.data.b4msa_kwargs is not None:
            _ = json.loads(self.data.b4msa_kwargs)
            b4msa_kwargs.update(_)
        evo = base.EvoMSA(b4msa_args=b4msa_kwargs,
                          evodag_args=evo_kwargs,
                          **kwargs)
        evo.exogenous = self._exogenous
        if self.data.exogenous_model is not None:
            evo.exogenous_model = [
                self.load_model(x) for x in self.data.exogenous_model
            ]
        evo.fit(D, Y, test_set=test_set)
        evo.exogenous = None
        save_model(evo, self.data.output_file)
Example #17
    def kfold_supervised_learning(self, X_vector_space, y):
        """KFold to train the stacked_method, i.e., training set

        :rtype: np.array
        """

        D = None
        for (_, cl), Xvs, output in zip(self.models, X_vector_space,
                                        self.cache.ml_kfold()):
            if output is not None and os.path.isfile(output):
                d = load_model(output)
            else:
                d = self.kfold_decision_function(cl, Xvs, y)
                if output is not None:
                    save_model(d, output)
            if D is None:
                D = d
            else:
                # Accumulate each model's decision values in place.
                for v, w in zip(D, d):
                    v += w
        D = np.array(D)
        D[~np.isfinite(D)] = 0
        return D
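
Conceptually, each cached block could be produced with scikit-learn's cross_val_predict; this call is an illustrative stand-in for kfold_decision_function, not the actual implementation.

from sklearn.model_selection import cross_val_predict

# Out-of-fold decision values on the training set (illustrative only).
d = cross_val_predict(cl(), Xvs, y, cv=5, method='decision_function')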
Example #18
def test_EvoMSA_fit():
    from EvoMSA.model import Bernulli
    from EvoDAG.model import EvoDAGE
    from microtc.utils import load_model, save_model
    X, y = get_data()
    print('starting')
    evo = EvoMSA(evodag_args=dict(popsize=10,
                                  early_stopping_rounds=10,
                                  time_limit=5,
                                  n_estimators=5),
                 models=[['EvoMSA.model.Corpus', 'EvoMSA.model.Bernulli']],
                 n_jobs=1).fit(X, y)
    print("Termine fit")
    assert evo
    assert isinstance(evo._svc_models[1], Bernulli)
    assert isinstance(evo._evodag_model, EvoDAGE)
    save_model(evo, 'test.evomodel')
    print("Guarde modelo")
    evo = load_model('test.evomodel')
    print("Cargue modelo")
    assert isinstance(evo._svc_models[1], Bernulli)
    assert isinstance(evo._evodag_model, EvoDAGE)
    os.unlink('test.evomodel')
Example #19
def emo_data(lang='zh'):
    fnames = glob(join('data', lang, '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    for fname in fnames:
        output = dict()
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            inner = []
            for tweet, label in zip(tweets, labels):
                if len(label) == 0:
                    continue
                tweet['klass'] = label
                inner.append(tweet)
            if len(inner):
                output[key] = inner
        if len(output) == 0:
            continue
        output_fname = join(dirname(fname), 'emo')
        if not isdir(output_fname):
            os.mkdir(output_fname)
        output_fname = join(output_fname, basename(fname))
        save_model(output, output_fname)
Example #20
def emo(k, lang='zh', size=2**19):
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    dd = load_model(join('models', f'{lang}_emo.info'))
    _ = [x for x, v in dd.most_common() if v >= 2**10]
    tot = sum([v for x, v in dd.most_common() if v >= 2**10])
    if k >= len(_):
        return
    pos = _[k]
    neg = set([x for i, x in enumerate(_) if i != k])
    POS, NEG, ADD = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) == 1:
                    klass = klass.pop()
                    if klass == pos:
                        POS.append(ds.process(d['text']))
                    elif klass in neg:
                        NEG.append(ds.process(d['text']))
                elif tot < size:
                    if pos not in klass and len(klass.intersection(neg)):
                        ADD.append(ds.process(d['text']))
    shuffle(POS)
    shuffle(NEG)
    shuffle(ADD)
    size2 = size // 2
    POS = POS[:size2]
    if len(NEG) < size2:
        NEG.extend(ADD)
    NEG = NEG[:size2]
    y = [1] * len(POS)
    y.extend([-1] * len(NEG))
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(POS + NEG)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{output}.LinearSVC')
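
A hypothetical driver; emo returns early once k runs past the list of frequent emojis, so an upper bound is enough.

for k in range(64):  # 64 is an arbitrary upper bound
    emo(k, lang='zh')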
Example #21
# Multivariate Normal

# Imports this snippet needs; save_model/load_model are assumed to come
# from microtc.utils, as in the other examples on this page.
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
from scipy.stats import multivariate_normal
from microtc.utils import save_model, load_model

pos = multivariate_normal([2, 2], [[1, -1], [-1, 5]])
neg = multivariate_normal([-2, -2], [[3, .5], [.5, 0.3]])

D = [(x, 1) for x in pos.rvs(size=1000)]
D += [(x, 0) for x in neg.rvs(size=5000)]

plt.plot([x[0] for x, y in D if y == 1], [x[1] for x, y in D if y == 1], 'b.')
plt.plot([x[0] for x, y in D if y == 0], [x[1] for x, y in D if y == 0], 'r.')
plt.grid()
plt.legend(['Positive', 'Negative'])
plt.tight_layout()
plt.savefig('two_classes_multivariate.png', dpi=300)

save_model(D, join('dataset', 'two_classes_multivariate.gz'))

D = load_model(join('dataset', 'two_classes_multivariate.gz'))

l_pos_m = np.mean(np.array([x for x, y in D if y == 1]), axis=0)
l_pos_cov = np.cov(np.array([x for x, y in D if y == 1]).T)
l_pos = multivariate_normal(mean=l_pos_m, cov=l_pos_cov)
l_neg_m = np.mean(np.array([x for x, y in D if y == 0]), axis=0)
l_neg_cov = np.cov(np.array([x for x, y in D if y == 0]).T)
l_neg = multivariate_normal(mean=l_neg_m, cov=l_neg_cov)

_, priors = np.unique([k for _, k in D], return_counts=True)
N = priors.sum()
prior_pos = priors[1] / N
prior_neg = priors[0] / N
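
A short follow-on, added here for illustration: classifying a point with the fitted Gaussians and class priors via Bayes' rule.

# Posterior-odds classification using the quantities computed above.
x = np.array([0.0, 0.0])
p_pos = l_pos.pdf(x) * prior_pos
p_neg = l_neg.pdf(x) * prior_neg
label = 1 if p_pos > p_neg else 0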
Example #22
    def compute(self, output: str) -> None:
        for fname in tqdm(self._fnames):
            self.compute_file(fname)
        save_model([self.matrix(), self.num_users], output)
Example #23
    def func(data, output):
        from b4msa.textmodel import TextModel
        from microtc.utils import tweet_iterator, save_model

        tm = TextModel().fit(list(tweet_iterator(data)))
        save_model(tm, output)
Example #24
# Build a mapping from every emoji variant to its fully-qualified form.
for x in emojis['fully-qualified']:
    key = remove_components(x)
    emojis_filter[key].append(x)

components.add('FE0F')
m_qual = {remove_components(x): x for x in emojis_filter.keys()}

for x in emojis['minimally-qualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)

for x in emojis['unqualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)

output = dict()

for k, v in emojis_filter.items():
    ident = convert_emoji(k).strip()
    for item in v:
        output[convert_emoji(item).strip()] = ident

save_model(output, 'emojis.dict')
ds = Dataset()
ds.add(output)

ds.process('buenos xx 12 dias. {} todos! acción'.format(
    convert_emoji('1F44B 1F3FC')))