def run(self, X, y):
    """Perform the search using X and y to guide it

    :param X: Dataset set - independent variables
    :type X: list
    :param y: Dataset set - dependent variable
    :type y: list or np.array
    :rtype: :py:class:`EvoMSA.model_selection.Node`
    """
    self._logger.info("Starting the search")
    self._le = self._le.fit(y)
    y = self._le.transform(y)
    # Start from the best-performing initial node.
    scored = [(candidate.performance(X, y), candidate) for candidate in self._nodes]
    node = max(scored, key=lambda item: item[0])[1]
    while True:
        self._logger.info("Model: %s perf: %0.4f" % (node, node.perf))
        children = list(node)
        if not children:
            # Nothing left to expand: keep the current node.
            break
        scored = [(child.performance(X, y), child) for child in children]
        best_perf, best_child = max(scored, key=lambda item: item[0])
        if best_perf < node.perf:
            # No child improves on the current node: stop the climb.
            break
        node = best_child
    if self._output:
        save_model(node, self._output)
    return node
def main(self):
    """Train an EvoMSA model from the command-line arguments and store it."""
    fname = self.data.training_set[0]
    pairs = [(tw, tw[self._klass]) for tw in tweet_iterator(fname)]
    D = [tw for tw, _ in pairs]
    Y = [label for _, label in pairs]
    # The test set may be a file of tweets or an already-built structure.
    test_set = None
    if self.data.test_set is not None:
        if os.path.isfile(self.data.test_set):
            test_set = list(tweet_iterator(self.data.test_set))
        else:
            test_set = self.data.test_set
    kwargs = dict(n_jobs=self.data.n_jobs)
    if self.data.kwargs is not None:
        kwargs.update(json.loads(self.data.kwargs))
    evo_kwargs = dict()
    if kwargs.get("stacked_method", "EvoDAG.model.EvoDAGE") == "EvoDAG.model.EvoDAGE":
        # EvoDAGE needs a working directory next to the output file.
        evo_kwargs = dict(tmpdir=self.data.output_file + '_dir')
    if "stacked_method_args" in kwargs:
        evo_kwargs.update(kwargs.pop("stacked_method_args"))
    evo = base.EvoMSA(stacked_method_args=evo_kwargs, **kwargs)
    evo.fit(D, Y, test_set=test_set)
    save_model(evo, self.data.output_file)
def bow(lang='zh', num_terms=2**14):
    """Fit a bag-of-words TextModel, keep the `num_terms` heaviest tokens,
    save it under ``models`` and return it."""
    tweets = data_bow(lang=lang)
    token_min_filter = 0
    if lang == 'zh':
        token_list, q_grams_words = [1, 2, 3], False
        # token_min_filter=0.0005
    else:
        token_list, q_grams_words = [-1, 2, 3, 4], True
        # token_min_filter=0.001
    tm = TextModel(token_list=token_list,
                   token_max_filter=len(tweets),
                   token_min_filter=token_min_filter,
                   q_grams_words=q_grams_words,
                   **TM_ARGS).fit(tweets)
    model = tm.model
    id2word = {ident: token for token, ident in model.word2id.items()}
    ndocs = model._ndocs
    # Rank tokens by weight (heaviest first) and keep the top num_terms.
    ranked = sorted(([ndocs / 2**weight, id2word[ident]]
                     for ident, weight in model.wordWeight.items()),
                    key=lambda pair: pair[0], reverse=True)[:num_terms]
    model.word2id = {token: idx for idx, (weight, token) in enumerate(ranked)}
    model.wordWeight = {idx: weight for idx, (weight, token) in enumerate(ranked)}
    save_model(tm, join('models', f'{lang}_{microtc.__version__}.microtc'))
    return tm
def run(self, X, y, early_stopping=1000, **kwargs):
    """Best-first search over the model space guided by performance on (X, y).

    :param early_stopping: Number of rounds to perform early stopping
    :type early_stopping: int
    :rtype: :py:class:`EvoMSA.model_selection.Node`
    """
    # Evaluate the starting nodes; the best one seeds the stack.
    # NOTE(review): `visited` starts with (perf, node) tuples, but below
    # membership is tested against bare nodes (`xx not in visited`) and bare
    # nodes are what gets added, so the initial tuples never match --
    # confirm whether the initial nodes are meant to be re-visitable.
    visited = set([(node.performance(X, y, **kwargs), node) for node in self._nodes])
    _ = max(visited, key=lambda x: x[0])[1]
    best = None
    nodes = LifoQueue()
    nodes.put(_)
    # `index` records how many evaluations had happened at the last
    # improvement; the loop stops after `early_stopping` evaluations
    # without one.
    index = len(visited)
    while not nodes.empty() and (len(visited) - index) < early_stopping:
        node = nodes.get()
        if best is None or node > best:
            # New incumbent: reset the early-stopping counter and persist it.
            index = len(visited)
            best = node
            if self._output:
                save_model(best, self._output)
            self._logger.info("Model: %s perf: %0.4f " % (best, best.perf) +
                              "visited: %s " % len(visited) +
                              "size: %s " % nodes.qsize() +
                              "Rounds: %s" % (len(visited) - index))
        # Expand the node: fit and evaluate every not-yet-visited neighbor.
        nn = [(xx, xx.fit(self._X, self._y, **self._kwargs).performance(X, y, **kwargs))
              for xx in node if xx not in visited]
        [visited.add(x) for x, _ in nn]
        # Keep only neighbors that do not degrade the expanded node.
        nn = [xx for xx, perf in nn if perf >= self.perf(node)]
        if len(nn) == 0:
            continue
        # Ascending sort: the greatest (best, per the `>` above) neighbor is
        # pushed last and therefore popped first from the LIFO queue.
        nn.sort()
        [nodes.put(x) for x in nn]
    return best
def test_textmodel_save_load():
    """TextModel round-trips through save_model/load_model.

    Fix: the temporary model file is now removed in a ``finally`` block, so
    it no longer leaks when the assertion fails.
    """
    import os
    from microtc.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model, load_model

    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    tm = TextModel().fit(tw)
    save_model(tm, 't.model')
    try:
        # The loaded object must come back as a TextModel instance.
        assert isinstance(load_model('t.model'), TextModel)
    finally:
        # Clean up even if the assertion (or loading) fails.
        os.unlink('t.model')
def vector_space(args):
    """Transform a dataset with a text model, caching the result on disk.

    `args` is a tuple ``(k, t, X, output)``; returns ``(k, representation)``.
    When `output` names an existing file the cached representation is
    returned instead of recomputing it.
    """
    key, model, data, cache_path = args
    if cache_path is not None and os.path.isfile(cache_path):
        return key, load_model(cache_path)
    try:
        rep = model.transform(data)
    except AttributeError:
        # Fallback for models exposing item access instead of transform().
        rep = model.tonp([model[item] for item in data])
    if cache_path is not None:
        save_model(rep, cache_path)
    return key, rep
def create_space(cls, fname, output=None, **kwargs):
    """Create the space from a file of json

    :param fname: Path to the file containing the json
    :type fname: str
    :param output: Path to store the model, it is cls.model_fname if None
    :type output: str
    :param kwargs: Keywords pass to TextModel
    """
    tm, coef, intercept, klass = cls._create_space(fname, **kwargs)
    destination = cls.model_fname() if output is None else output
    save_model([tm, coef, intercept, klass], destination)
def test_counter():
    """Counter preserves counts and update_calls across save/load.

    Fix: the temporary file is removed in a ``finally`` block, so it no
    longer leaks when loading fails.
    """
    from microtc.utils import Counter, save_model, load_model
    import os

    c = Counter()
    c.update([1, 2, 3, 1])
    c.update([3])
    assert c[1] == 2
    print(c.update_calls)
    assert c.update_calls == 2
    save_model(c, "t.voc")
    try:
        cc = load_model("t.voc")
    finally:
        # Clean up even if loading raises.
        os.unlink("t.voc")
    print(cc.update_calls, "**")
    # update_calls must survive serialization.
    assert cc.update_calls == 2
def get_model(basename, data, labels, args):
    """Return a TextModel for `basename`, training and caching it on first use.

    Fix: ``os.makedirs(..., exist_ok=True)`` replaces the ``isdir``/``mkdir``
    pair, avoiding the race when several processes create the cache
    directory concurrently.

    :param basename: Base name used to derive the cached model filename
    :param data: Training documents (stored in ``args['docs']``)
    :param labels: Unused here; kept for interface compatibility
    :param args: TextModel keyword arguments
    """
    modelfile = get_filename(args, os.path.join("models", os.path.basename(basename)))
    if not os.path.exists(modelfile):
        os.makedirs("models", exist_ok=True)
        args['docs'] = data
        model = TextModel(**args)
        save_model(model, modelfile)
    else:
        model = load_model(modelfile)
    return model
def main(self):
    """Fit an SVC from the command-line arguments and persist it."""
    self.data = self.parser.parse_args()
    params_fname = self.data.params_fname
    best = dict()
    if params_fname is not None:
        loaded = load_json(params_fname)
        # A list means several parameter configurations; keep the first.
        best = loaded[0] if isinstance(loaded, list) else loaded
    best = clean_params(best)
    if self.data.kwargs is not None:
        best.update(json.loads(self.data.kwargs))
    svc = SVC.fit_from_file(self.data.training_set, best)
    save_model(svc, self.get_output())
def create_space(cls, fname, output, **kwargs):
    """Create the model from a file of json

    :param fname: Path to the file containing the json
    :type fname: str
    :param output: Path to store the model
    :type output: str
    :param kwargs: Keywords pass to TextModel
    """
    data = list(tweet_iterator(fname))
    model = cls(**kwargs)
    model.fit(data, [item['klass'] for item in data])
    save_model(model, output)
def test_evomsa_wrapper():
    """A saved EvoMSA model can be reused as a text model inside another EvoMSA."""
    from microtc.utils import save_model
    from EvoMSA.base import EvoMSA
    from test_base import get_data

    X, y = get_data()
    first = EvoMSA(stacked_method="sklearn.naive_bayes.GaussianNB",
                   n_jobs=2).fit(X, y)
    save_model(first, 'tmp.evomsa')
    assert os.path.isfile('tmp.evomsa')
    wrapper = EvoMSA(models=[["tmp.evomsa", "EvoMSA.model.Identity"]],
                     stacked_method="sklearn.naive_bayes.GaussianNB",
                     n_jobs=2).fit(X, y)
    assert wrapper
    os.unlink("tmp.evomsa")
def fit_svm(self, Xvs, y):
    """Fit one classifier per text model, reusing cached models when present."""
    fitted = []
    for (_, klass), X, cache_path in zip(self.models, Xvs, self.cache.ml_train()):
        if cache_path is not None and os.path.isfile(cache_path):
            fitted.append(load_model(cache_path))
            continue
        try:
            clf = klass(random_state=self._seed)
        except TypeError:
            # The classifier does not accept a random_state argument.
            clf = klass()
        clf.fit(X, y)
        fitted.append(clf)
        if cache_path is not None:
            save_model(clf, cache_path)
    self._svc_models = fitted
def get_model(basename, data, labels, args):
    """Return a TextModel for `basename`, training and caching it on first use.

    Fix: ``os.makedirs(..., exist_ok=True)`` replaces the ``isdir``/``mkdir``
    pair, avoiding the race when several processes create the cache
    directory concurrently.

    :param basename: Base name used to derive the cached model filename
    :param data: Training documents (stored in ``args['docs']``)
    :param labels: Unused here; kept for interface compatibility
    :param args: TextModel keyword arguments
    """
    modelfile = get_filename(
        args, os.path.join("models", os.path.basename(basename)))
    if not os.path.exists(modelfile):
        os.makedirs("models", exist_ok=True)
        args['docs'] = data
        model = TextModel(**args)
        save_model(model, modelfile)
    else:
        model = load_model(modelfile)
    return model
def recall_emo(lang='zh', n_jobs=1):
    """Compute per-emoji recall of the stored LinearSVC models on the test
    partition and save the result as ``models/{lang}_emo.perf``."""

    def predict(fname, ds, tm, emoji):
        # Collect (text, label) pairs whose labels intersect the frequent
        # emojis (`klasses`, from the enclosing scope), then run every
        # per-emoji classifier on the transformed texts.
        D = []
        for key, tweets in load_model(fname).items():
            labels = [ds.klass(x['text']) for x in tweets]
            _ = [[x['text'], label] for label, x in zip(labels, tweets)
                 if len(klasses.intersection(label))]
            D.extend(_)
        X = tm.transform([x for x, _ in D])
        y = [y for _, y in D]
        hy = []
        for k, emo in enumerate(emoji):
            output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
            m = load_model(f'{output}.LinearSVC')
            hy.append(m.predict(X))
        return y, hy

    def performance(emo, y, hy):
        # Recall on the positive class plus the positive-class ratio.
        y_emo = [emo in i for i in y]
        perf = recall_score(y_emo, hy > 0, pos_label=True)
        return perf, sum(y_emo) / len(y)

    # Emojis with at least 2**10 occurrences define the classes of interest.
    info = load_model(join('models', f'{lang}_emo.info'))
    info = [[k, v] for k, (v, _) in enumerate(info.most_common()) if _ >= 2**10]
    klasses = set([v for k, v in info])
    fnames = glob(join('data', lang, 'test', '*.gz'))
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    dd = load_model(join('models', f'{lang}_emo.info'))
    emoji = [x for x, v in dd.most_common() if v >= 2**10]
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict)(fname, ds, tm, emoji) for fname in fnames)
    y = []
    [y.extend(x) for x, hy in predictions]
    # Stack the per-file predictions: rows are examples, columns are emojis.
    hys = np.vstack([np.vstack(hy).T for _, hy in predictions])
    output = dict()  # NOTE(review): dead assignment; overwritten below.
    _ = Parallel(n_jobs=n_jobs)(delayed(performance)(emo, y, hy)
                                for emo, hy in zip(emoji, hys.T))
    output = {
        emo: {
            'recall': perf,
            'ratio': ratio
        }
        for emo, (perf, ratio) in zip(emoji, _)
    }
    save_model(output, join('models', f'{lang}_emo.perf'))
def store_tweets(lang, date, output_path=create_output_path):
    """Collect the tweets of `date`, group them by the prefix before the
    first '-' in each key, and store the result as a per-day .gz file."""
    day_file = '{year:d}{month:02d}{day:02d}.gz'.format(**date)
    output = join(output_path(lang), day_file)
    if isfile(output):
        # Already computed for this day.
        return
    reader = TweetIterator(lang)
    freq = GeoFrequency([], reader=reader.tweet_iterator)
    freq.data = defaultdict(Process)
    freq.compute_file(date)
    grouped = defaultdict(list)
    for label, proc in freq.data.items():
        grouped[label.split('-')[0]].extend(proc.data)
    save_model(grouped, output)
    return output
def main(self):
    """Train an EvoMSA model from one or more training sets and store it."""
    fnames = self.data.training_set
    if not isinstance(fnames, list):
        fnames = [fnames]
    D, Y = [], []
    for fname in fnames:
        pairs = [(tw, tw[self._klass]) for tw in tweet_iterator(fname)]
        D.append([tw for tw, _ in pairs])
        Y.append([label for _, label in pairs])
    # The test set may be a file of tweets or an already-built structure.
    test_set = None
    if self.data.test_set is not None:
        if os.path.isfile(self.data.test_set):
            test_set = list(tweet_iterator(self.data.test_set))
        else:
            test_set = self.data.test_set
    kwargs = dict(n_jobs=self.data.n_jobs)
    if self.data.kwargs is not None:
        kwargs.update(json.loads(self.data.kwargs))
    evo_kwargs = dict(tmpdir=self.data.output_file + '_dir',
                      fitness_function='macro-F1')
    if self.data.evo_kwargs is not None:
        evo_kwargs.update(json.loads(self.data.evo_kwargs))
    b4msa_kwargs = {}
    if self.data.b4msa_kwargs is not None:
        b4msa_kwargs.update(json.loads(self.data.b4msa_kwargs))
    evo = base.EvoMSA(b4msa_args=b4msa_kwargs, evodag_args=evo_kwargs, **kwargs)
    evo.exogenous = self._exogenous
    if self.data.exogenous_model is not None:
        evo.exogenous_model = [self.load_model(x)
                               for x in self.data.exogenous_model]
    evo.fit(D, Y, test_set=test_set)
    # Drop the exogenous data before persisting the fitted model.
    evo.exogenous = None
    save_model(evo, self.data.output_file)
def kfold_supervised_learning(self, X_vector_space, y):
    """KFold to train the stacked_method, i.e., training set

    :rtype: np.array
    """
    acc = None
    for (_, cl), Xvs, cache_path in zip(self.models, X_vector_space,
                                        self.cache.ml_kfold()):
        if cache_path is not None and os.path.isfile(cache_path):
            d = load_model(cache_path)
        else:
            d = self.kfold_decision_function(cl, Xvs, y)
            if cache_path is not None:
                save_model(d, cache_path)
        if acc is None:
            acc = d
        else:
            # Accumulate the decision functions element-wise, in place.
            for row, extra in zip(acc, d):
                row += extra
    acc = np.array(acc)
    # Replace NaN / inf entries so downstream learners get finite inputs.
    acc[~np.isfinite(acc)] = 0
    return acc
def test_EvoMSA_fit():
    """EvoMSA keeps its Bernulli and EvoDAGE components across save/load."""
    from EvoMSA.model import Bernulli
    from EvoDAG.model import EvoDAGE
    from microtc.utils import load_model, save_model

    X, y = get_data()
    print('iniciando')
    model = EvoMSA(evodag_args=dict(popsize=10, early_stopping_rounds=10,
                                    time_limit=5, n_estimators=5),
                   models=[['EvoMSA.model.Corpus', 'EvoMSA.model.Bernulli']],
                   n_jobs=1).fit(X, y)
    print("Termine fit")
    assert model
    assert isinstance(model._svc_models[1], Bernulli)
    assert isinstance(model._evodag_model, EvoDAGE)
    save_model(model, 'test.evomodel')
    print("Guarde modelo")
    restored = load_model('test.evomodel')
    print("Cargue modelo")
    assert isinstance(restored._svc_models[1], Bernulli)
    assert isinstance(restored._evodag_model, EvoDAGE)
    os.unlink('test.evomodel')
def emo_data(lang='zh'):
    """Attach an emoji-derived 'klass' label to each tweet and store the
    labelled tweets under a per-language ``emo`` directory."""
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    for fname in glob(join('data', lang, '*.gz')):
        labelled = dict()
        for key, tweets in load_model(fname).items():
            kept = []
            for tweet in tweets:
                label = ds.klass(tweet['text'])
                if len(label) == 0:
                    # No emoji found: drop the tweet.
                    continue
                tweet['klass'] = label
                kept.append(tweet)
            if len(kept):
                labelled[key] = kept
        if len(labelled) == 0:
            continue
        emo_dir = join(dirname(fname), 'emo')
        if not isdir(emo_dir):
            os.mkdir(emo_dir)
        save_model(labelled, join(emo_dir, basename(fname)))
def emo(k, lang='zh', size=2**19):
    """Train a one-vs-rest LinearSVC for the k-th most frequent emoji and
    store it as ``models/{lang}_emo_{k}_mu{version}.LinearSVC``."""
    ds = Dataset(text_transformations=False)
    ds.add(ds.load_emojis())
    output = join('models', f'{lang}_emo_{k}_mu{microtc.__version__}')
    dd = load_model(join('models', f'{lang}_emo.info'))
    # Emojis with at least 2**10 occurrences define the classes.
    _ = [x for x, v in dd.most_common() if v >= 2**10]
    tot = sum([v for x, v in dd.most_common() if v >= 2**10])
    if k >= len(_):
        # No k-th frequent emoji: nothing to train.
        return
    pos = _[k]
    neg = set([x for i, x in enumerate(_) if i != k])
    POS, NEG, ADD = [], [], []
    for fname in glob(join('data', lang, 'emo', '*.gz')):
        for key, data in load_model(fname).items():
            for d in data:
                klass = d['klass']
                if len(klass) == 1:
                    # Single-emoji tweet: a clean positive or negative.
                    klass = klass.pop()
                    if klass == pos:
                        POS.append(ds.process(d['text']))
                    elif klass in neg:
                        NEG.append(ds.process(d['text']))
                elif tot < size:
                    # Multi-emoji tweet: usable as an extra negative when it
                    # lacks the positive emoji but shares one with `neg`.
                    if pos not in klass and len(klass.intersection(neg)):
                        ADD.append(ds.process(d['text']))
    shuffle(POS), shuffle(NEG), shuffle(ADD)
    # Balanced sample: at most size/2 positives and size/2 negatives.
    size2 = size // 2
    POS = POS[:size2]
    if len(NEG) < size2:
        # Not enough single-emoji negatives; pad with the multi-emoji ones.
        NEG.extend(ADD)
    NEG = NEG[:size2]
    y = [1] * len(POS)
    y.extend([-1] * len(NEG))
    tm = load_model(join('models', f'{lang}_{microtc.__version__}.microtc'))
    X = tm.transform(POS + NEG)
    m = LinearSVC().fit(X, y)
    save_model(m, f'{output}.LinearSVC')
# Multivariate Normal pos = multivariate_normal([2, 2], [[1, -1], [-1, 5]]) neg = multivariate_normal([-2, -2], [[3, .5], [.5, 0.3]]) D = [(x, 1) for x in pos.rvs(size=1000)] D += [(x, 0) for x in neg.rvs(size=5000)] plt.plot([x[0] for x, y in D if y == 1], [x[1] for x, y in D if y == 1], 'b.') plt.plot([x[0] for x, y in D if y == 0], [x[1] for x, y in D if y == 0], 'r.') plt.grid() plt.legend(['Positive', 'Negative']) plt.tight_layout() plt.savefig('two_classes_multivariate.png', dpi=300) save_model(D, join('dataset', 'two_classes_multivariate.gz')) D = load_model(join('dataset', 'two_classes_multivariate.gz')) l_pos_m = np.mean(np.array([x for x, y in D if y == 1]), axis=0) l_pos_cov = np.cov(np.array([x for x, y in D if y == 1]).T) l_pos = multivariate_normal(mean=l_pos_m, cov=l_pos_cov) l_neg_m = np.mean(np.array([x for x, y in D if y == 0]), axis=0) l_neg_cov = np.cov(np.array([x for x, y in D if y == 0]).T) l_neg = multivariate_normal(mean=l_neg_m, cov=l_neg_cov) _, priors = np.unique([k for _, k in D], return_counts=True) N = priors.sum() prior_pos = priors[1] / N prior_neg = priors[0] / N
def compute(self, output: str) -> None:
    """Process every input file and store the resulting matrix together
    with the number of users at `output`."""
    for fname in tqdm(self._fnames):
        self.compute_file(fname)
    payload = [self.matrix(), self.num_users]
    save_model(payload, output)
def func(data, output):
    """Fit a TextModel on the tweets in `data` and persist it to `output`."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model

    tweets = [tw for tw in tweet_iterator(data)]
    model = TextModel().fit(tweets)
    save_model(model, output)
# Build a mapping from every emoji variant (fully-qualified,
# minimally-qualified, and unqualified) to its fully-qualified form.
for x in emojis['fully-qualified']:
    key = remove_components(x)
    emojis_filter[key].append(x)
# Treat FE0F (variation selector) as a skippable component from here on.
components.add('FE0F')
# Map component-stripped code points back to the canonical (filter) key.
# NOTE(review): emojis_filter keys were already produced by
# remove_components above; applying it again presumably strips the newly
# added FE0F as well -- confirm.
m_qual = {remove_components(x): x for x in emojis_filter.keys()}
for x in emojis['minimally-qualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)
for x in emojis['unqualified']:
    key = remove_components(x)
    value = m_qual[key]
    emojis_filter[value].append(x)
# Render each variant and point it at the rendered canonical emoji.
output = dict()
for k, v in emojis_filter.items():
    ident = convert_emoji(k).strip()
    for item in v:
        output[convert_emoji(item).strip()] = ident
save_model(output, 'emojis.dict')
# Smoke test: process a sample text containing one of the mapped emojis.
ds = Dataset()
ds.add(output)
ds.process('buenos xx 12 dias. {} todos! acción'.format(
    convert_emoji('1F44B 1F3FC')))