def test_negations_italian():
    """Check the tokens produced for an Italian sentence containing negations.

    A ``TextModel`` is built with ``lang='italian'``, ``negation=True`` and
    ``stemming=True``; the tokens of a fixed sentence are printed and compared
    against the expected list.
    """
    from b4msa.textmodel import TextModel

    options = {
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False,
        'token_list': [-1],
        'usr_option': 'group',
        'del_dup': False,
        'emo_option': 'group',
        'lang': 'italian',
        'url_option': 'delete',
    }
    corpus = ["XD"]
    model = TextModel(corpus, **options)

    sentence = """@User Come non condividere; me ne frega niente"""
    observed = model.tokenize(sentence)
    print("Input:", sentence)
    print("Output:", observed)

    expected = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert observed == expected
def test_lang():
    """Tokenize a Spanish sentence and compare it with the expected tokens.

    The expected list contains the ``_sw``, ``_pos`` and ``_usr`` markers as
    well as a ``no_``-prefixed token, matching the options given to the model.
    """
    from b4msa.textmodel import TextModel

    corpus = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    settings = dict(
        del_dup=True,
        emo_option="group",
        lc=True,
        negation=True,
        num_option="group",
        stemming=True,
        stopwords="group",
        del_diac=False,
        token_list=[-1],
        url_option="group",
        usr_option="group",
        lang="spanish",
    )
    model = TextModel(corpus, **settings)

    sentence = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    observed = model.tokenize(sentence)
    expected = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
        'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda',
    ]
    print(sentence)
    assert observed == expected, "got: {0}, expected: {1}".format(observed, expected)
def test_negations():
    """Check the tokens produced for a Spanish sentence containing negations.

    The model is built with ``negation=True``, ``stemming=True`` and
    ``stopwords='delete'``; the resulting tokens must equal the expected list.
    """
    from b4msa.textmodel import TextModel

    options = {
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False,
        'token_list': [-1],
        'usr_option': 'group',
        'del_dup': False,
        'emo_option': 'group',
        'lang': 'spanish',
        'url_option': 'delete',
    }
    corpus = ["el alma de la fiesta XD"]
    model = TextModel(corpus, **options)

    sentence = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    observed = model.tokenize(sentence)
    expected = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech', 'no_respect']
    print(observed, expected)
    assert observed == expected
def __init__(self, corpus, **kwargs):
    """Initialise an empty vocabulary and index ``corpus`` through ``fit``.

    :param corpus: Items handed to ``self.fit``
    :param kwargs: Accepted but not used by this constructor
    """
    # Bookkeeping state: token map, number of known terms, training flag.
    self._training = True
    self._num_terms = 0
    self._m = {}
    # Key used to read the text of a document; taken from the TEXT
    # environment variable, falling back to 'text'.
    self._text = os.getenv('TEXT', default='text')
    self._textModel = TextModel([''], token_list=[-1])
    self.fit(corpus)
def test_lang():
    """Tokenize a Spanish sentence and compare it with the expected tokens.

    The model groups stopwords, users, urls, numbers and emoticons and has
    negation and stemming switched on, as given in the options below.
    """
    from b4msa.textmodel import TextModel

    training_texts = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD",
    ]
    options = {
        "del_dup": True,
        "emo_option": "group",
        "lc": True,
        "negation": True,
        "num_option": "group",
        "stemming": True,
        "stopwords": "group",
        "del_diac": False,
        "token_list": [-1],
        "url_option": "group",
        "usr_option": "group",
        "lang": "spanish",
    }
    model = TextModel(training_texts, **options)

    sentence = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    got = model.tokenize(sentence)
    want = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc',
        '_sw', 'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda',
    ]
    print(sentence)
    assert got == want, "got: {0}, expected: {1}".format(got, want)
class HA(BaseTextModel):
    """Pipeline made of a b4msa ``TextModel`` followed by a ``LinearSVC``."""

    def __init__(self, **kwargs):
        """Create the text model with ``kwargs`` and an untrained classifier."""
        self._tm = TextModel(**kwargs)
        self._cl = LinearSVC()

    def fit(self, X, y):
        """Fit the text model on ``X``, then the classifier on its output.

        :returns: ``self``
        """
        text_model = self._tm
        text_model.fit(X)
        self._cl.fit(text_model.transform(X), y)
        return self

    def tonp(self, X):
        """Return ``X`` unchanged."""
        return X

    def transform(self, X):
        """Return the classifier's decision-function values for ``X``.

        A one-dimensional result is reshaped into a single column so the
        output always has two dimensions.
        """
        scores = self._cl.decision_function(self._tm.transform(X))
        return np.atleast_2d(scores).T if scores.ndim == 1 else scores

    @classmethod
    def create_space(cls, fname, output, **kwargs):
        """Train a model from a json file and store it

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model
        :type output: str
        :param kwargs: Keywords pass to TextModel
        """
        docs = list(tweet_iterator(fname))
        model = cls(**kwargs)
        labels = [doc['klass'] for doc in docs]
        model.fit(docs, labels)
        save_model(model, output)
def test_negations_italian():
    """Compare the tokens of an Italian sentence with the expected list.

    The ``TextModel`` is configured for Italian with negation and stemming
    enabled and stopwords deleted.
    """
    from b4msa.textmodel import TextModel

    settings = dict(
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='italian',
        url_option='delete',
    )
    model = TextModel(["XD"], **settings)

    sentence = """@User Come non condividere; me ne frega niente"""
    tokens = model.tokenize(sentence)
    print("Input:", sentence)
    print("Output:", tokens)

    wanted = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert tokens == wanted
def train_predict_pool(cls, args):
    """Train on the ``tr`` indices and predict the ``ts`` indices.

    :param args: Tuple ``(X, y, tr, ts, textModel_params)``; only the entries
        of ``textModel_params`` listed in ``TextModel.params()`` are used
    :returns: ``(ts, predictions)`` with the predictions as a numpy array
    """
    X, y, tr, ts, textModel_params = args
    allowed = TextModel.params()
    tm_kwargs = {}
    for name, value in textModel_params.items():
        if name in allowed:
            tm_kwargs[name] = value

    model = TextModel([X[i] for i in tr], **tm_kwargs)
    estimator = cls(model)
    train_vectors = [model[X[i]] for i in tr]
    train_labels = [y[i] for i in tr]
    fitted = estimator.fit(train_vectors, train_labels)

    test_vectors = [model[X[i]] for i in ts]
    return ts, np.array(fitted.predict(test_vectors))
def test_textmodel_default():
    """Check that ``TextModel`` picks up the per-language default token list.

    For each language the instance's ``token_list`` must match the
    ``token_list`` entry of ``TextModel.default_parameters``. Finally, an
    explicit ``stopwords`` keyword must be stored in ``_lang_kw``.
    """
    from b4msa.textmodel import TextModel

    for lang in ['spanish', 'english', 'arabic']:
        text = TextModel(lang=lang)
        expected = TextModel.default_parameters(lang=lang)['token_list']
        print(text.token_list, expected)
        # zip() stops at the shorter sequence, so a missing or extra token
        # would go unnoticed without this explicit length comparison.
        assert len(text.token_list) == len(expected)
        for a, b in zip(text.token_list, expected):
            print(a, b)
            assert a == b
    text = TextModel(lang='arabic', stopwords='xxx')
    assert text._lang_kw['stopwords'] == 'xxx'
def test_stopwords():
    """Check the ``delete`` and ``group`` stopword modes of ``tm.lang.transform``.

    The same pre-processed text is transformed with each mode and the result
    is compared with the expected ``~``-delimited string.
    """
    from b4msa.textmodel import TextModel

    tm = TextModel(lang='es', del_dup=False)
    normalized = tm.text_transformations('como esta mi carro')
    print(normalized)

    cases = (
        ('delete', '~carro~'),
        ('group', '~_sw~_sw~_sw~carro~'),
    )
    for mode, expected in cases:
        result = tm.lang.transform(normalized, stopwords=mode)
        print(result)
        assert result == expected
def test_stopwords():
    """Verify stopword deletion and grouping on a short Spanish text.

    ``tm.lang.transform`` is called once per stopword mode on the output of
    ``text_transformations``; each result is printed and asserted.
    """
    from b4msa.textmodel import TextModel

    tm = TextModel(lang='es', del_dup=False)
    prepared = tm.text_transformations('como esta mi carro')
    print(prepared)

    expected_by_mode = [
        ('delete', '~carro~'),
        ('group', '~_sw~_sw~_sw~carro~'),
    ]
    for stopword_mode, wanted in expected_by_mode:
        transformed = tm.lang.transform(prepared, stopwords=stopword_mode)
        print(transformed)
        assert transformed == wanted
def test_textmodel_default():
    """Check the default ``token_list`` per language and keyword overriding.

    The instance's ``token_list`` is compared element by element with the one
    returned by ``TextModel.default_parameters``; then a ``stopwords``
    keyword passed to the constructor must show up in ``_lang_kw``.
    """
    from b4msa.textmodel import TextModel

    for lang in ['spanish', 'english', 'arabic']:
        text = TextModel(lang=lang)
        defaults = TextModel.default_parameters(lang=lang)['token_list']
        print(text.token_list, defaults)
        # Guard against zip() silently truncating to the shorter sequence,
        # which would hide a token list of the wrong length.
        assert len(text.token_list) == len(defaults)
        for a, b in zip(text.token_list, defaults):
            print(a, b)
            assert a == b
    text = TextModel(lang='arabic', stopwords='xxx')
    assert text._lang_kw['stopwords'] == 'xxx'
def test_textmodel_token_min_filter():
    """Check the vocabulary size obtained with different ``token_min_filter``.

    The model is trained on ``text.json`` (located next to this file) and the
    size of ``text.model._w2id`` is compared with the expected counts.
    """
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))

    text = TextModel(tw, token_min_filter=1)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 62

    text = TextModel(tw, token_min_filter=0.3)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 13

    # No assertion here: only checks that combining both options does not raise.
    text = TextModel(tw, token_min_filter=1, threshold=0.01)
class Corpus(BaseTextModel):
    """Text model using only words"""

    def __init__(self, corpus, **kwargs):
        """Initialise the vocabulary state and index ``corpus`` via ``fit``.

        :param corpus: Items handed to ``fit``
        :param kwargs: Accepted but not used
        """
        self._training = True
        self._num_terms = 0
        self._m = {}
        self._text = os.getenv('TEXT', default='text')
        self._textModel = TextModel([''], token_list=[-1])
        self.fit(corpus)

    def get_text(self, text):
        """Return the entry of ``text`` stored under the configured key."""
        return text[self._text]

    def fit(self, c):
        """Index every item of ``c`` and leave training mode.

        :returns: List with the ``__getitem__`` result of each item
        """
        encoded = [self.__getitem__(item) for item in c]
        self._training = False
        return encoded

    @property
    def num_terms(self):
        """Number of distinct tokens registered so far."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize ``text`` with the inner text model.

        :param text: A dict (its text is read with ``get_text``), a list or
            tuple of texts (tokens are concatenated), or a single text
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        tokens = []
        for piece in text:
            tokens += self._textModel.tokenize(piece)
        return tokens

    def __getitem__(self, d):
        """Return ``[index, count]`` pairs for the tokens of ``d``.

        While training, unseen tokens receive the next free index and known
        tokens have their stored count incremented; the returned count is the
        value read before the increment. Once training is over, unseen tokens
        are skipped.
        """
        pairs = []
        for token in self.tokenize(d):
            entry = self._m.get(token)
            if entry is not None:
                index, count = entry
                if self._training:
                    self._m[token] = [index, count + 1]
            elif self._training:
                index, count = self._num_terms, 1
                self._m[token] = [index, count]
                self._num_terms += 1
            else:
                continue
            pairs.append([index, count])
        return pairs
def fit_from_file(cls, fname, textModel_params={}):
    """Build a text model and a classifier from a json file.

    :param fname: Path read with ``tweet_iterator``; every item must have a
        ``'klass'`` entry
    :param textModel_params: Keywords passed to ``TextModel`` (only unpacked
        here, never modified)
    :returns: Whatever ``cls(model).fit(...)`` returns
    """
    docs = list(tweet_iterator(fname))
    labels = [doc['klass'] for doc in docs]
    text_model = TextModel(docs, **textModel_params)
    classifier = cls(text_model)
    vectors = [text_model[doc] for doc in docs]
    return classifier.fit(vectors, labels)
def tokenize(self, text):
    """Tokenize a text

    A dict is first reduced to its text with ``get_text``; a list or tuple is
    tokenized item by item and the tokens are concatenated.

    :param text: Text
    :type text: dict or str
    """
    if isinstance(text, dict):
        text = self.get_text(text)
    if not isinstance(text, (list, tuple)):
        return TextModel.tokenize(self, text)
    collected = []
    for item in text:
        collected += TextModel.tokenize(self, item)
    return collected
def train_predict_pool(cls, args):
    """Fit a classifier on one split and predict the other.

    :param args: Tuple ``(X, y, tr, ts, textModel_params)``; keys of
        ``textModel_params`` not present in ``TextModel.params()`` are dropped
    :returns: ``(ts, predictions)`` where the predictions are a numpy array
    """
    X, y, tr, ts, textModel_params = args
    known = TextModel.params()
    textModel_params = dict(
        (key, val) for key, val in textModel_params.items() if key in known
    )

    train_texts = [X[idx] for idx in tr]
    t = TextModel(train_texts, **textModel_params)
    clf = cls(t)
    features = [t[X[idx]] for idx in tr]
    targets = [y[idx] for idx in tr]
    m = clf.fit(features, targets)

    held_out = [t[X[idx]] for idx in ts]
    predictions = np.array(m.predict(held_out))
    return ts, predictions
def main(self, args=None):
    """Compute cross-validated decision-function values and write them out.

    The best parameter set (first entry of the params file) is used to build
    the text model; an ``SVC`` is trained per fold and the decision-function
    values of the held-out items are stored. Each tweet of the training set
    is then written to the output file, one json document per line, with a
    ``'decision_function'`` entry added.

    :param args: Arguments handed to the parser
    :returns: List with the decision-function values per training item
    """
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass

    logging.basicConfig(level=self.data.verbose)
    logger = logging.getLogger('b4msa')
    logger.setLevel(self.data.verbose)

    best = load_json(self.data.params_fname)[0]
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)

    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)

    t = TextModel(corpus, **best)
    X = [t[doc] for doc in corpus]
    hy = [None] * len(y)

    splitter = KFold(n_splits=self.data.kratio, shuffle=True,
                     random_state=self.data.seed)
    for tr, ts in splitter.split(X):
        c = SVC(model=t)
        c.fit([X[pos] for pos in tr], [y[pos] for pos in tr])
        scores = c.decision_function([X[pos] for pos in ts])
        for pos, score in zip(ts, scores):
            hy[pos] = score

    with open(self.get_output(), 'w') as fpt:
        for i, tweet in enumerate(tweet_iterator(self.data.training_set)):
            tweet['decision_function'] = hy[i].tolist()
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def textModel(self):
    """Text model used to process the texts.

    Created from ``TM_ARGS`` the first time it is requested and cached in
    ``self._tm`` afterwards.
    """
    if not hasattr(self, '_tm'):
        self._tm = TextModel(**TM_ARGS)
    return self._tm
def _create_space(cls, fname, **kwargs):
    """Create the space from a file of json

    The data is shuffled, a ``TextModel`` is fitted on at most the first
    128000 texts, and one ``LinearSVC`` is trained per class: items of the
    class are labelled ``1`` and a capped number of items of every other
    class are labelled ``-1``.

    :param fname: Path to the file containing the json
    :type fname: str
    :param kwargs: Keywords pass to TextModel
    :returns: ``(tm, coef, intercept, klass)`` where ``coef`` and
        ``intercept`` come from ``linearSVC_array`` and ``klass`` is the
        sorted list of class labels
    """
    import random
    from .utils import linearSVC_array
    from collections import Counter
    try:
        from tqdm import tqdm
    except ImportError:
        # Fallback without a progress bar: hand the iterable back untouched.
        def tqdm(x, **kwargs):
            return x

    data = list(tweet_iterator(fname))
    random.shuffle(data)
    tm = TextModel(**kwargs).fit([x['text'] for x in data[:128000]])
    tm._num_terms = tm.model.num_terms

    # Class labels in sorted order together with their frequencies.
    freq = sorted(Counter(x['klass'] for x in data).items(),
                  key=lambda pair: pair[0])
    klass = [label for label, _ in freq]
    nele = [count for _, count in freq]
    position = {label: idx for idx, label in enumerate(klass)}

    MODELS = []
    for ident, k in tqdm(enumerate(klass)):
        taken = [0] * len(klass)
        # Cap of negative examples taken from each of the other classes.
        quota = int(nele[ident] / (len(klass) - 1))
        D = [(x, 1) for x in data if x['klass'] == k]
        for x in data:
            label = x['klass']
            if label == k:
                continue
            slot = position[label]
            if taken[slot] > quota:
                continue
            taken[slot] += 1
            D.append((x, -1))
        classifier = LinearSVC()
        features = tm.tonp([tm[x[0]['text']] for x in D])
        targets = [x[1] for x in D]
        MODELS.append(classifier.fit(features, targets))

    coef, intercept = linearSVC_array(MODELS)
    return tm, coef, intercept, klass
def test_textmodel_entropy():
    """Check the vocabulary size obtained with ``threshold=0.01``.

    The model is trained on ``text.json`` (located next to this file).
    """
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, threshold=0.01)
    assert isinstance(text, TextModel)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 39
def test_textmodel():
    """Check that indexing a ``TextModel`` with a text returns a list.

    The model is trained on the texts of ``text.json`` (located next to this
    file).
    """
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    text = TextModel([x['text'] for x in tw])
    assert isinstance(text[tw[0]['text']], list)
def test_model_instance():
    """Check that an already fitted text model is kept as given by EvoMSA.

    The fitted ``TextModel`` instance is passed in ``models``; after fitting,
    the first model of the first entry must compare equal to that instance.
    """
    from microtc.textmodel import TextModel

    X, y = get_data()
    tm = TextModel().fit(X)
    config = dict(
        tm_n_jobs=1,
        n_jobs=1,
        TR=False,
        lang="es",
        models=[[tm, "sklearn.svm.LinearSVC"]],
        stacked_method="sklearn.svm.LinearSVC",
    )
    evo = EvoMSA(**config).fit(X, y)
    first_text_model = evo.models[0][0]
    assert first_text_model == tm
def test_negations():
    """Compare the tokens of a Spanish sentence with negations to the expected list.

    The ``TextModel`` is configured for Spanish with negation and stemming
    enabled and stopwords deleted.
    """
    from b4msa.textmodel import TextModel

    settings = dict(
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='spanish',
        url_option='delete',
    )
    model = TextModel(["el alma de la fiesta XD"], **settings)

    sentence = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    tokens = model.tokenize(sentence)
    wanted = [
        '_usr', 'poll', 'vac', 'hub',
        'no_permit', 'hub', 'no_hech', 'no_respect',
    ]
    print(tokens, wanted)
    assert tokens == wanted
def test_SVC_predict():
    """Train an ``SVC`` on ``text.json`` and check the label of one sentence.

    ``text.json`` is expected next to this file.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    # Only the texts are needed here; the labels are read again by fit_file.
    X, _ = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    label = c.predict_text('Excelente dia b4msa')
    assert label == 'POS'
def test_lang():
    """Smoke-test tokenization of a Portuguese sentence.

    The model is trained on the single sentence it later tokenizes. No
    expected token list is asserted; the tokens are only printed, so the test
    fails only when building the model or tokenizing raises.
    """
    from b4msa.textmodel import TextModel

    sentence = "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    model = TextModel(
        [sentence],
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [-1],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    a = model.tokenize(sentence)
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the former ``print a`` statement is a SyntaxError on Python 3.
    print(a)
def test_SVC_predict_from_file():
    """Train an ``SVC`` on ``text.json`` and predict the same file.

    Every predicted label must be one of ``'POS'``, ``'NEU'`` or ``'NEG'``.
    ``text.json`` is expected next to this file.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file(fname)
    for i in y:
        assert i in ['POS', 'NEU', 'NEG']
def test_SVC_predict_from_file():
    """Train an ``SVC`` on ``text.json`` and print the labels of ``test_text.json``.

    Both files are looked up relative to the current working directory. No
    assertion is made; the predicted labels are only printed.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels

    fname = 'text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file("test_text.json")
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statements are a SyntaxError on Python 3.
    print("Final Labels")
    print(y)
def test_params():
    """Build a ``TextModel`` for every combination of a few options.

    All combinations of ``del_diac``, ``usr_option`` and ``url_option`` are
    tried on the texts of ``text.json`` (located next to this file); indexing
    the model with the first text must return a list each time.
    """
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator

    params = dict(del_diac=[True, False], usr_option=BASIC_OPTIONS,
                  url_option=BASIC_OPTIONS)
    params = sorted(params.items())
    names = [name for name, _ in params]
    choices = [values for _, values in params]

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain concatenation would produce '/text.json' at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    text = [x['text'] for x in tweet_iterator(fname)]

    for combination in itertools.product(*choices):
        args = dict(zip(names, combination))
        ins = TextModel(text, **args)
        assert isinstance(ins[text[0]], list)
def __init__(self, corpus, threshold=0.001, token_min_filter=0.001,
             token_list=[-2, -1], num_option='delete', usr_option='delete',
             url_option='delete', emo_option='delete', **kwargs):
    """Create the inner ``TextModel`` and initialise the model with ``corpus``.

    :param corpus: Passed to ``self.init``
    :param threshold: Forwarded to ``TextModel`` and stored in ``_threshold``
    :param token_min_filter: Forwarded to ``TextModel``
    :param token_list: Forwarded to ``TextModel``
    :param num_option: Forwarded to ``TextModel``
    :param usr_option: Forwarded to ``TextModel``
    :param url_option: Forwarded to ``TextModel``
    :param emo_option: Forwarded to ``TextModel``
    :param kwargs: Extra keywords forwarded to ``TextModel``
    """
    self._text = os.getenv('TEXT', default='text')
    tm_options = dict(
        token_list=token_list,
        threshold=threshold,
        token_min_filter=token_min_filter,
        num_option=num_option,
        usr_option=usr_option,
        url_option=url_option,
        emo_option=emo_option,
        **kwargs
    )
    # The inner text model is created without a corpus (None).
    self._textmodel = TextModel(None, **tm_options)
    self._threshold = threshold
    self.init(corpus)
# Stemming: stem every whitespace-separated word and join them again.
text = 'I like playing football'
output = " ".join(stemmer.stem(word) for word in text.split())
output

# Word n-grams: consecutive groups of n words joined with '~'.
text = 'I like playing football on Saturday'
words = text.split()
n = 3
shifted_words = [words[i:] for i in range(n)]
n_grams = ["~".join(a) for a in zip(*shifted_words)]
n_grams

# Character q-grams: every run of q consecutive characters.
text = 'I like playing'
q = 4
shifted_text = [text[i:] for i in range(q)]
q_grams = ["".join(a) for a in zip(*shifted_text)]
q_grams

# Same ideas through TextModel: token_list=[-1, 5] with users grouped and
# stemming enabled.
text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5], lang='english',
               usr_option=OPTION_GROUP, stemming=True)
tm.text_transformations(text)
tm.tokenize(text)
class Corpus(BaseTextModel):
    """Text model using only words"""

    def __init__(self, corpus=None, **kwargs):
        """Initialise the vocabulary state and optionally fit ``corpus``.

        :param corpus: Items handed to ``fit``; nothing is fitted when None
        :param kwargs: Accepted but not used
        """
        self._training = True
        self._num_terms = 0
        self._m = {}
        self._text = os.getenv('TEXT', default='text')
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is None:
            return
        self.fit(corpus)

    def get_text(self, text):
        """Return the entry of ``text`` stored under the configured key."""
        return text[self._text]

    def fit(self, c):
        """Index every item of ``c`` and leave training mode.

        :returns: ``self``
        """
        for item in c:
            self.__getitem__(item)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens registered so far."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize ``text`` with the inner text model.

        :param text: A dict (its text is read with ``get_text``), a list or
            tuple of texts (tokens are concatenated), or a single text
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        tokens = []
        for piece in text:
            tokens += self._textModel.tokenize(piece)
        return tokens

    def transform(self, texts):
        """Convert text into a vector

        Every text is encoded with ``__getitem__`` and the result is passed to
        the inner text model's ``tonp``.

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        encoded = [self.__getitem__(item) for item in texts]
        return self._textModel.tonp(encoded)

    def __getitem__(self, d):
        """Return ``[index, count]`` pairs for the tokens of ``d``.

        While training, unseen tokens receive the next free index and known
        tokens have their stored count incremented; the returned count is the
        value read before the increment. Once training is over, unseen tokens
        are skipped.
        """
        pairs = []
        for token in self.tokenize(d):
            entry = self._m.get(token)
            if entry is not None:
                index, count = entry
                if self._training:
                    self._m[token] = [index, count + 1]
            elif self._training:
                index, count = self._num_terms, 1
                self._m[token] = [index, count]
                self._num_terms += 1
            else:
                continue
            pairs.append([index, count])
        return pairs
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from matplotlib import pylab as plt
from wordcloud import WordCloud as WC
from collections import Counter
from scipy.stats import rankdata
import numpy as np
import Orange

# (text, label) pairs read from the tweets file.
D = [(x['text'], x['klass']) for x in tweet_iterator(TWEETS)]
y = [label for _text, label in D]
le = LabelEncoderWrapper().fit(y)
y = le.transform(y)

# Text model with token_list=[-1] and 'tf' weighting, fitted on all texts.
tm = TextModel(token_list=[-1], weighting='tf').fit([text for text, _label in D])

# Stratified k-fold: train on each training split, predict its validation
# split, and gather the predictions in hy.
folds = StratifiedKFold(shuffle=True, random_state=0)
hy = np.empty(len(D))
for tr, val in folds.split(D, y):
    train_texts = [D[idx][0] for idx in tr]
    X = tm.transform(train_texts)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    # Alternative classifier: m = LinearSVC().fit(X, y[tr])
    val_texts = [D[idx][0] for idx in val]
    hy[val] = m.predict(tm.transform(val_texts))

ci = bootstrap_confidence_interval(y, hy)
ci
# Value shown for ci: (0.2839760475399691, 0.30881116416736665)
def func(data, output):
    """Fit a default ``TextModel`` on a tweets file and save it.

    :param data: Path read with ``tweet_iterator``
    :param output: Destination handed to ``save_model``
    """
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator, save_model

    model = TextModel()
    tweets = list(tweet_iterator(data))
    fitted = model.fit(tweets)
    save_model(fitted, output)
def train_predict_pool(cls, args):
    """Train on the ``tr`` indices and predict the ``ts`` indices.

    :param args: Tuple ``(X, y, tr, ts, textModel_params)``;
        ``textModel_params`` is passed to ``TextModel`` as keywords
    :returns: ``(ts, predictions)`` with the predictions as a numpy array
    """
    X, y, tr, ts, textModel_params = args

    model = TextModel([X[i] for i in tr], **textModel_params)
    estimator = cls(model)
    train_vectors = [model[X[i]] for i in tr]
    train_labels = [y[i] for i in tr]
    fitted = estimator.fit(train_vectors, train_labels)

    test_vectors = [model[X[i]] for i in ts]
    return ts, np.array(fitted.predict(test_vectors))
def fit_from_file(cls, fname, textModel_params={}):
    """Build a text model and a classifier from a labelled file.

    :param fname: Path read with ``read_data_labels``
    :param textModel_params: Keywords passed to ``TextModel`` (only unpacked
        here, never modified)
    :returns: Whatever ``cls(model).fit(...)`` returns
    """
    texts, labels = read_data_labels(fname)
    text_model = TextModel(texts, **textModel_params)
    classifier = cls(text_model)
    vectors = [text_model[text] for text in texts]
    return classifier.fit(vectors, labels)
def clean_params(kw):
    """Return the entries of ``kw`` whose key is in ``TextModel.params()``.

    :param kw: Mapping of keyword names to values
    :type kw: dict
    :rtype: dict
    """
    allowed = TextModel.params()
    cleaned = {}
    for name, value in kw.items():
        if name in allowed:
            cleaned[name] = value
    return cleaned