class Corpus(BaseTextModel):
    """Text model using only words.

    Tokenization is delegated to a word-level :class:`TextModel`
    (``token_list=[-1]``); this class only maintains the vocabulary,
    a mapping ``token -> [index, frequency]`` built while fitting.
    """

    def __init__(self, corpus=None, **kwargs):
        # Key used to pull the text field out of dict records;
        # overridable through the TEXT environment variable.
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # current vocabulary size
        self._training = True   # while True, unseen tokens grow the vocabulary
        self._textModel = TextModel([''], token_list=[-1])
        # corpus is now optional so an empty model can be built and
        # fitted later (consistent with the sklearn-style fit() API).
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Return the text field of a dict record."""
        return text[self._text]

    def fit(self, c):
        """Build the vocabulary from corpus *c* and return *self*.

        The previous version accumulated and returned every document's
        tokenization, which wasted memory and broke the fluent
        ``Corpus().fit(corpus)`` idiom used by the rest of the project.
        """
        for x in c:
            self.__getitem__(x)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens seen during fitting."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize *text*, which may be a str, a dict, or a list of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._textModel.tokenize(_text))
            return tokens
        return self._textModel.tokenize(text)

    def __getitem__(self, d):
        """Map document *d* to a list of ``[index, frequency]`` pairs.

        While training, unseen tokens are appended to the vocabulary and
        counts are incremented; afterwards unseen tokens are skipped.
        The frequency reported is the count *before* the increment.
        """
        tokens = []
        for t in self.tokenize(d):
            try:
                index, k = self._m[t]
                if self._training:
                    self._m[t] = [index, k + 1]
            except KeyError:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[t] = [index, k]
                self._num_terms += 1
            tokens.append([index, k])
        return tokens
def test_lang():
    """Spanish pipeline: grouped stopwords/emoticons, negation, stemming."""
    from b4msa.textmodel import TextModel
    corpus = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    params = dict(del_dup=True,
                  emo_option="group",
                  lc=True,
                  negation=True,
                  num_option="group",
                  stemming=True,
                  stopwords="group",
                  del_diac=False,
                  token_list=[-1],
                  url_option="group",
                  usr_option="group",
                  lang="spanish")
    model = TextModel(corpus, **params)
    text = ("El alma de la fiesta :) conociendo la maquinaria "
            "@user bebiendo nunca manches que onda")
    print(model.tokenize)
    got = model.tokenize(text)
    expected = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
                'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert got == expected, "got: {0}, expected: {1}".format(got, expected)
def test_negations_italian():
    """Italian negation handling prefixes negated stems with ``no_``."""
    from b4msa.textmodel import TextModel
    conf = {'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'italian',
            'url_option': 'delete'}
    model = TextModel(["XD"], **conf)
    text = """@User Come non condividere; me ne frega niente"""
    tokens = model.tokenize(text)
    print("Input:", text)
    print("Output:", tokens)
    expected = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert tokens == expected
def test_negations():
    """Spanish negation handling prefixes negated stems with ``no_``."""
    from b4msa.textmodel import TextModel
    conf = {'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'spanish',
            'url_option': 'delete'}
    model = TextModel(["el alma de la fiesta XD"], **conf)
    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    tokens = model.tokenize(text)
    expected = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub',
                'no_hech', 'no_respect']
    print(tokens, expected)
    assert tokens == expected
def test_lang():
    """Spanish tokenization end-to-end: the model built from a tiny corpus
    must reproduce the recorded token sequence."""
    from b4msa.textmodel import TextModel
    training = ["Hi :) :P XD",
                "excelente dia xc",
                "el alma de la fiesta XD"]
    model = TextModel(training,
                      del_dup=True,
                      emo_option="group",
                      lc=True,
                      negation=True,
                      num_option="group",
                      stemming=True,
                      stopwords="group",
                      del_diac=False,
                      token_list=[-1],
                      url_option="group",
                      usr_option="group",
                      lang="spanish")
    text = ("El alma de la fiesta :) conociendo la maquinaria "
            "@user bebiendo nunca manches que onda")
    print(model.tokenize)
    a = model.tokenize(text)
    b = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc',
         '_sw', 'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
def test_negations_italian():
    """Italian negation: ``non``/``ne`` attach ``no_`` to the following stem."""
    from b4msa.textmodel import TextModel
    model = TextModel(["XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='italian',
                      url_option='delete')
    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    assert a == ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
def tokenize(self, text):
    """Tokenize a text

    :param text: Text
    :type text: dict or str
    """
    # Dict records carry the text in a named field.
    if isinstance(text, dict):
        text = self.get_text(text)
    # Plain string: delegate straight to the base tokenizer.
    if not isinstance(text, (list, tuple)):
        return TextModel.tokenize(self, text)
    # Sequence of texts: concatenate the per-text token streams in order.
    return [tok for piece in text
            for tok in TextModel.tokenize(self, piece)]
def test_negations():
    """Spanish negation: ``nunca``/``no``/``nada`` mark the following stem."""
    from b4msa.textmodel import TextModel
    model = TextModel(["el alma de la fiesta XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='spanish',
                      url_option='delete')
    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech',
         'no_respect']
    print(a, b)
    assert a == b
def test_lang():
    """Portuguese pipeline smoke test.

    NOTE(review): no expected output has been recorded for the
    Portuguese tokenizer yet, so this only checks that tokenization
    runs without raising and prints the result.  TODO: capture the
    real expected token list and assert against it.
    """
    from b4msa.textmodel import TextModel
    text = ["vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq "
            "jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("]
    # NOTE(review): 'del_dup1' and 'strip_diac' look like older option
    # names; the sibling tests use 'del_dup' and 'del_diac' — confirm
    # which spelling the installed b4msa version accepts.
    model = TextModel(
        text,
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [-1],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    # Previously `print a` (Python 2 syntax) made this file a
    # SyntaxError under Python 3; the stale Spanish expected list that
    # was never asserted has been dropped.
    a = model.tokenize(text[0])
    print(a)
# Visualize the SemEval-2017 English training set as word clouds
# (IDF-style term weights, then raw term frequencies), and collect the
# algorithm/dataset performance table into a matrix.
# (A stray 'ci' token at the start of this cell — extraction junk that
# made it a SyntaxError — has been removed.)
D = list(tweet_iterator('../../../datasets/semeval/semeval2017_En_train.json'))
tm = TextModel(token_list=[-1]).fit(D)
# Map token ids back to strings so the word cloud can label them.
id2word = {v: k for k, v in tm.model.word2id.items()}
weights = {id2word[k]: v for k, v in tm.model.wordWeight.items()}
wc = WC().generate_from_frequencies(weights)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_idf.png', dpi=300)

# Raw token frequencies over the same corpus.
cnt = Counter()
for x in D:  # plain loop: update() is a side effect, not a mapping
    cnt.update(tm.tokenize(x))
wc = WC().generate_from_frequencies(cnt)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_tf.png', dpi=300)

# Performance matrix: one row per algorithm, one column per dataset.
perf = load_model('dataset/performance.gz')
algs = list(perf.keys())
datasets = list(perf[algs[0]][1].keys())
data = np.array([[perf[alg][1][dataset] for dataset in datasets]
                 for alg in algs])
class Corpus(BaseTextModel):
    """Text model using only words."""

    def __init__(self, corpus=None, **kwargs):
        # Name of the text field for dict records; configurable via $TEXT.
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # vocabulary size so far
        self._training = True   # unseen tokens extend the vocabulary only while True
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Extract the text field from a dict record."""
        return text[self._text]

    def fit(self, c):
        """Learn the vocabulary from corpus *c*; returns *self*."""
        for doc in c:
            self.__getitem__(doc)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens learned during fit."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a str, a dict record, or a list/tuple of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            return [tok for part in text
                    for tok in self._textModel.tokenize(part)]
        return self._textModel.tokenize(text)

    def transform(self, texts):
        """Convert test into a vector

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self._textModel.tonp([self.__getitem__(x) for x in texts])

    def __getitem__(self, d):
        """Return ``[index, frequency]`` pairs for the tokens of *d*.

        The reported frequency is the count *before* this occurrence's
        increment; out-of-vocabulary tokens are skipped after training.
        """
        pairs = []
        for tok in self.tokenize(d):
            entry = self._m.get(tok)
            if entry is None:
                # Unknown token: only training extends the vocabulary.
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[tok] = [index, k]
                self._num_terms += 1
            else:
                index, k = entry
                if self._training:
                    self._m[tok] = [index, k + 1]
            pairs.append([index, k])
        return pairs
# Stem every word of `text` (both `stemmer` and `text` are defined in an
# earlier cell) and echo the joined result.
output = " ".join(stemmer.stem(word) for word in text.split())
output

# Word n-grams: slide a window of n words and join with '~'.
text = 'I like playing football on Saturday'
words = text.split()
n = 3
n_grams = ["~".join(gram) for gram in zip(*[words[i:] for i in range(n)])]
n_grams

# Character q-grams: same sliding-window trick over characters.
text = 'I like playing'
q = 4
q_grams = ["".join(gram) for gram in zip(*[text[i:] for i in range(q)])]
q_grams

# Full TextModel pipeline on a tweet-like text: transformations, then tokens.
text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5], lang='english',
               usr_option=OPTION_GROUP, stemming=True)
tm.text_transformations(text)
tm.tokenize(text)