Example #1
def test_negations_italian():
    from b4msa.textmodel import TextModel

    text = ["XD"]

    model = TextModel(
        text, **{
            'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'italian',
            'url_option': 'delete'
        })

    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    b = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert a == b
Example #2
def test_lang():
    from b4msa.textmodel import TextModel

    text = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        "del_dup": True,
        "emo_option": "group",
        "lc": True,
        "negation": True,
        "num_option": "group",
        "stemming": True,
        "stopwords": "group",
        "del_diac": False,
        "token_list": [
            -1,
            # 5,
        ],
        "url_option": "group",
        "usr_option": "group",
        "lang": "spanish",
    })
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    a = model.tokenize(text)
    b = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw', 'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
Example #3
def test_negations():
    from b4msa.textmodel import TextModel

    text = ["el alma de la fiesta XD"]
    model = TextModel(
        text, **{
            'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'spanish',
            'url_option': 'delete'
        })

    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = [
        '_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech',
        'no_respect'
    ]
    print(a, b)
    assert a == b
Example #4
def __init__(self, corpus, **kwargs):
    self._text = os.getenv('TEXT', default='text')
    self._m = {}
    self._num_terms = 0
    self._training = True
    self._textModel = TextModel([''], token_list=[-1])
    self.fit(corpus)
Example #5
def test_lang():
    from b4msa.textmodel import TextModel

    text = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    model = TextModel(
        text,
        **{
            "del_dup": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "del_diac": False,
            "token_list": [
                -1,
                # 5,
            ],
            "url_option": "group",
            "usr_option": "group",
            "lang": "spanish",
        })
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    a = model.tokenize(text)
    b = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
        'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda'
    ]
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
Example #6
class HA(BaseTextModel):
    """Wrapper of b4msa.textmodel.TextModel and LinearSVC"""
    def __init__(self, **kwargs):
        self._tm = TextModel(**kwargs)
        self._cl = LinearSVC()

    def fit(self, X, y):
        self._tm.fit(X)
        self._cl.fit(self._tm.transform(X), y)
        return self

    def tonp(self, X):
        return X

    def transform(self, X):
        res = self._cl.decision_function(self._tm.transform(X))
        if res.ndim == 1:
            return np.atleast_2d(res).T
        return res

    @classmethod
    def create_space(cls, fname, output, **kwargs):
        """Create the model from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model
        :type output: str
        :param kwargs: Keywords pass to TextModel
        """

        X = [x for x in tweet_iterator(fname)]
        m = cls(**kwargs)
        m.fit(X, [x['klass'] for x in X])
        save_model(m, output)
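
A minimal usage sketch of the HA wrapper above. The texts and labels are hypothetical, and it assumes numpy, LinearSVC, and b4msa's TextModel are imported as in the source module.

# Hypothetical data; HA forwards its keyword arguments to TextModel.
X = ['good movie', 'boring plot', 'great acting', 'awful ending']
y = ['POS', 'NEG', 'POS', 'NEG']
ha = HA(lang='english', token_list=[-1]).fit(X, y)
scores = ha.transform(['nice film'])  # 2-d array of decision-function scores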
Example #7
def test_negations_italian():
    from b4msa.textmodel import TextModel

    text = [
        "XD"
    ]

    model = TextModel(text, **{
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False, 'token_list': [-1],
        'usr_option': 'group',
        'del_dup': False,
        'emo_option': 'group',
        'lang': 'italian',
        'url_option': 'delete'
    })

    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    b = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert a == b
Example #8
def train_predict_pool(cls, args):
    X, y, tr, ts, textModel_params = args
    params = TextModel.params()
    textModel_params = {
        k: v
        for k, v in textModel_params.items() if k in params
    }
    t = TextModel([X[x] for x in tr], **textModel_params)
    m = cls(t).fit([t[X[x]] for x in tr], [y[x] for x in tr])
    return ts, np.array(m.predict([t[X[x]] for x in ts]))
Example #9
def test_textmodel_default():
    from b4msa.textmodel import TextModel
    for lang in ['spanish', 'english', 'arabic']:
        text = TextModel(lang=lang)
        print(text.token_list, TextModel.default_parameters(lang=lang)['token_list'])
        for a, b in zip(text.token_list,
                        TextModel.default_parameters(lang=lang)['token_list']):
            print(a, b)
            assert a == b
    text = TextModel(lang='arabic', stopwords='xxx')
    assert text._lang_kw['stopwords'] == 'xxx'
Example #10
def test_stopwords():
    from b4msa.textmodel import TextModel
    tm = TextModel(lang='es', del_dup=False)
    text = tm.text_transformations('como esta mi carro')
    print(text)
    text1 = tm.lang.transform(text, stopwords='delete')
    print(text1)
    assert text1 == '~carro~'
    text1 = tm.lang.transform(text, stopwords='group')
    print(text1)
    assert text1 == '~_sw~_sw~_sw~carro~'
Example #11
def test_textmodel_default():
    from b4msa.textmodel import TextModel
    for lang in ['spanish', 'english', 'arabic']:
        text = TextModel(lang=lang)
        print(text.token_list,
              TextModel.default_parameters(lang=lang)['token_list'])
        for a, b in zip(text.token_list,
                        TextModel.default_parameters(lang=lang)['token_list']):
            print(a, b)
            assert a == b
    text = TextModel(lang='arabic', stopwords='xxx')
    assert text._lang_kw['stopwords'] == 'xxx'
Example #12
def test_textmodel_token_min_filter():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, token_min_filter=1)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 62
    text = TextModel(tw, token_min_filter=0.3)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 13
    text = TextModel(tw, token_min_filter=1, threshold=0.01)
Example #13
class Corpus(BaseTextModel):
    """Text model using only words"""
    def __init__(self, corpus, **kwargs):
        self._text = os.getenv('TEXT', default='text')
        self._m = {}
        self._num_terms = 0
        self._training = True
        self._textModel = TextModel([''], token_list=[-1])
        self.fit(corpus)

    def get_text(self, text):
        return text[self._text]

    def fit(self, c):
        r = [self.__getitem__(x) for x in c]
        self._training = False
        return r

    @property
    def num_terms(self):
        return self._num_terms

    def tokenize(self, text):
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._textModel.tokenize(_text))
            return tokens
        else:
            return self._textModel.tokenize(text)

    def __getitem__(self, d):
        tokens = []
        for t in self.tokenize(d):
            try:
                index, k = self._m[t]
                if self._training:
                    self._m[t] = [index, k + 1]
            except KeyError:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[t] = [index, k]
                self._num_terms += 1
            tokens.append([index, k])
        return tokens
Example #14
def fit_from_file(cls, fname, textModel_params={}):
    D = [x for x in tweet_iterator(fname)]
    # X, y = read_data_labels(fname)
    y = [x['klass'] for x in D]
    model = TextModel(D, **textModel_params)
    svc = cls(model)
    return svc.fit([model[x] for x in D], y)
Example #15
    def tokenize(self, text):
        """Tokenize a text

        :param text: Text
        :type text: dict or str
        """

        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(TextModel.tokenize(self, _text))
            return tokens
        else:
            return TextModel.tokenize(self, text)
Example #16
def train_predict_pool(cls, args):
    X, y, tr, ts, textModel_params = args
    params = TextModel.params()
    textModel_params = {k: v for k, v in textModel_params.items() if k in params}
    t = TextModel([X[x] for x in tr], **textModel_params)
    m = cls(t).fit([t[X[x]] for x in tr], [y[x] for x in tr])
    return ts, np.array(m.predict([t[X[x]] for x in ts]))
Example #17
    def main(self, args=None):
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        logging.basicConfig(level=self.data.verbose)
        logger = logging.getLogger('b4msa')
        logger.setLevel(self.data.verbose)
        best = load_json(self.data.params_fname)[0]
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)
        t = TextModel(corpus, **best)
        X = [t[x] for x in corpus]
        hy = [None for x in y]
        for tr, ts in KFold(n_splits=self.data.kratio,
                            shuffle=True,
                            random_state=self.data.seed).split(X):
            c = SVC(model=t)
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            _ = c.decision_function([X[x] for x in ts])
            [hy.__setitem__(k, v) for k, v in zip(ts, _)]

        i = 0
        with open(self.get_output(), 'w') as fpt:
            for tweet in tweet_iterator(self.data.training_set):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet) + "\n")
        return hy
Example #18
    def textModel(self):
        "Text model used to process the texts"

        try:
            return self._tm
        except AttributeError:
            self._tm = TextModel(**TM_ARGS)
        return self._tm
Example #19
    def _create_space(cls, fname, **kwargs):
        """Create the space from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param kwargs: Keywords pass to TextModel
        """
        import random
        from .utils import linearSVC_array
        from collections import Counter
        try:
            from tqdm import tqdm
        except ImportError:

            def tqdm(x, **kwargs):
                return x

        data = [x for x in tweet_iterator(fname)]
        random.shuffle(data)
        tm = TextModel(**kwargs).fit([x['text'] for x in data[:128000]])
        tm._num_terms = tm.model.num_terms
        # klass, nele = np.unique([x['klass'] for x in data], return_counts=True)
        _ = [(k, v) for k, v in Counter([x['klass'] for x in data]).items()]
        _.sort(key=lambda x: x[0])
        klass = [x[0] for x in _]
        nele = [x[1] for x in _]
        h = {v: k for k, v in enumerate(klass)}
        MODELS = []
        for ident, k in tqdm(enumerate(klass)):
            elepklass = [0 for __ in klass]
            cnt = nele[ident]
            cntpklass = int(cnt / (len(klass) - 1))
            D = [(x, 1) for x in data if x['klass'] == k]
            for x in data:
                if x['klass'] == k:
                    continue
                if elepklass[h[x['klass']]] > cntpklass:
                    continue
                elepklass[h[x['klass']]] = elepklass[h[x['klass']]] + 1
                D.append((x, -1))
            m = LinearSVC().fit(tm.tonp([tm[x[0]['text']] for x in D]),
                                [x[1] for x in D])
            MODELS.append(m)
        coef, intercept = linearSVC_array(MODELS)
        return tm, coef, intercept, klass
Example #20
def test_textmodel_entropy():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel(tw, threshold=0.01)
    assert isinstance(text, TextModel)
    print(len(text.model._w2id))
    assert len(text.model._w2id) == 39
Example #21
def test_textmodel():
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tw = list(tweet_iterator(fname))
    text = TextModel([x['text'] for x in tw])
    # print(text.tokenize("hola amiguitos gracias por venir :) http://hello.com @chanfle"))
    # assert False
    assert isinstance(text[tw[0]['text']], list)
Example #22
def test_model_instance():
    from microtc.textmodel import TextModel
    X, y = get_data()
    tm = TextModel().fit(X)
    evo = EvoMSA(tm_n_jobs=1,
                 n_jobs=1,
                 TR=False,
                 lang="es",
                 models=[[tm, "sklearn.svm.LinearSVC"]],
                 stacked_method="sklearn.svm.LinearSVC").fit(X, y)
    assert evo.models[0][0] == tm
Example #23
def test_negations():
    from b4msa.textmodel import TextModel

    text = [
        "el alma de la fiesta XD"
    ]
    model = TextModel(text, **{
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False, 'token_list': [-1],
        'usr_option': 'group', 'del_dup': False, 'emo_option': 'group', 'lang': 'spanish', 'url_option': 'delete'
    })

    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech', 'no_respect']
    print(a, b)
    assert a == b
Example #24
def test_SVC_predict():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_text('Excelente dia b4msa')
    assert y == 'POS'
Example #25
def test_lang():
    from b4msa.textmodel import TextModel

    #text = [
    #    "Hi :) :P XD",
    #    "excelente dia xc",
    #    "el alma de la fiesta XD"
    #]
    text = [
        "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    ]

    model = TextModel(
        text,
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [
                -1,
                # 5,
            ],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    #text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    text = "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    a = model.tokenize(text)
    b = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
        'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda'
    ]
    print(a)
Example #26
def test_SVC_predict_from_file():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file(fname)
    for i in y:
        assert i in ['POS', 'NEU', 'NEG']
Example #27
def test_SVC_predict_from_file():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    #fname = os.path.dirname(__file__) + '/text.json'
    fname = 'text.json'
    #fname = 'test_text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file("test_text.json")
    print("Final Labels")
    print(y)
Example #28
def test_params():
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator

    params = dict(del_diac=[True, False],
                  usr_option=BASIC_OPTIONS,
                  url_option=BASIC_OPTIONS)
    params = sorted(params.items())
    fname = os.path.dirname(__file__) + '/text.json'
    tw = [x for x in tweet_iterator(fname)]
    text = [x['text'] for x in tw]
    for x in itertools.product(*[x[1] for x in params]):
        args = dict(zip([x[0] for x in params], x))
        ins = TextModel(text, **args)
        assert isinstance(ins[text[0]], list)
Example #29
def __init__(self,
             corpus,
             threshold=0.001,
             token_min_filter=0.001,
             token_list=[-2, -1],
             num_option='delete',
             usr_option='delete',
             url_option='delete',
             emo_option='delete',
             **kwargs):
    self._text = os.getenv('TEXT', default='text')
    self._textmodel = TextModel(None,
                                token_list=token_list,
                                threshold=threshold,
                                token_min_filter=token_min_filter,
                                num_option=num_option,
                                usr_option=usr_option,
                                url_option=url_option,
                                emo_option=emo_option,
                                **kwargs)
    self._threshold = threshold
    self.init(corpus)
Example #30
# Imports assumed for this snippet: the stemmer, TextModel, and
# OPTION_GROUP are not defined above; Snowball is one plausible choice.
from nltk.stem.snowball import SnowballStemmer
from microtc.params import OPTION_GROUP
from b4msa.textmodel import TextModel

stemmer = SnowballStemmer('english')

text = 'I like playing football'
output = []
for word in text.split():
    w = stemmer.stem(word)
    output.append(w)
output = " ".join(output)
output

text = 'I like playing football on Saturday'
words = text.split()
n = 3
n_grams = []
for a in zip(*[words[i:] for i in range(n)]):
    n_grams.append("~".join(a))
n_grams

text = 'I like playing'
q = 4
q_grams = []
for a in zip(*[text[i:] for i in range(q)]):
    q_grams.append("".join(a))
q_grams

text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5],
               lang='english',
               usr_option=OPTION_GROUP,
               stemming=True)
tm.text_transformations(text)

tm.tokenize(text)
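
For reference, the n-gram and q-gram lists above follow deterministically from the loops; the stemmed string assumes NLTK's English Snowball stemmer:

# output  -> 'i like play footbal'
# n_grams -> ['I~like~playing', 'like~playing~football',
#             'playing~football~on', 'football~on~Saturday']
# q_grams -> ['I li', ' lik', 'like', 'ike ', 'ke p', 'e pl',
#             ' pla', 'play', 'layi', 'ayin', 'ying']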
Example #31
class Corpus(BaseTextModel):
    """Text model using only words"""
    def __init__(self, corpus=None, **kwargs):
        self._text = os.getenv('TEXT', default='text')
        self._m = {}
        self._num_terms = 0
        self._training = True
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        return text[self._text]

    def fit(self, c):
        [self.__getitem__(x) for x in c]
        self._training = False
        return self

    @property
    def num_terms(self):
        return self._num_terms

    def tokenize(self, text):
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._textModel.tokenize(_text))
            return tokens
        else:
            return self._textModel.tokenize(text)

    def transform(self, texts):
        """Convert test into a vector

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self._textModel.tonp([self.__getitem__(x) for x in texts])

    def __getitem__(self, d):
        tokens = []
        for t in self.tokenize(d):
            try:
                index, k = self._m[t]
                if self._training:
                    self._m[t] = [index, k + 1]
            except KeyError:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[t] = [index, k]
                self._num_terms += 1
            tokens.append([index, k])
        return tokens
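
A short usage sketch of the Corpus model above; the two-document corpus is hypothetical, and the snippet assumes os, BaseTextModel, and TextModel are imported as in the source module.

corpus = ['buenos dias', 'buenos dias catedras']
c = Corpus(corpus)
c.num_terms       # vocabulary size collected during fit
c['buenos dias']  # [[index, frequency], ...] pairs for the known tokens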
Example #32
# The imports after scipy below are assumed: the original excerpt also uses
# StratifiedKFold, tweet_iterator, TextModel, TWEETS, and EvoMSA's helpers
# without showing where they come from.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from matplotlib import pylab as plt
from wordcloud import WordCloud as WC
from collections import Counter
from scipy.stats import rankdata
from microtc.utils import tweet_iterator
from b4msa.textmodel import TextModel
from EvoMSA.utils import LabelEncoderWrapper, bootstrap_confidence_interval
from EvoMSA.tests.test_base import TWEETS
import numpy as np
import Orange


D = [(x['text'], x['klass']) for x in tweet_iterator(TWEETS)]
y = [y for _, y in D]
le = LabelEncoderWrapper().fit(y)
y = le.transform(y)

tm = TextModel(token_list=[-1], 
               weighting='tf').fit([x for x, _ in D])

folds = StratifiedKFold(shuffle=True, random_state=0)

hy = np.empty(len(D))
for tr, val in folds.split(D, y):
    _ = [D[x][0] for x in tr]
    X = tm.transform(_)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    # m = LinearSVC().fit(X, y[tr])
    _ = [D[x][0] for x in val]
    hy[val] = m.predict(tm.transform(_))

ci = bootstrap_confidence_interval(y, hy)
ci  # -> (0.2839760475399691, 0.30881116416736665)
Example #33
    def func(data, output):
        from b4msa.textmodel import TextModel
        from microtc.utils import tweet_iterator, save_model

        tm = TextModel().fit(list(tweet_iterator(data)))
        save_model(tm, output)
Example #34
def train_predict_pool(cls, args):
    X, y, tr, ts, textModel_params = args
    t = TextModel([X[x] for x in tr], **textModel_params)
    m = cls(t).fit([t[X[x]] for x in tr], [y[x] for x in tr])
    return ts, np.array(m.predict([t[X[x]] for x in ts]))
Example #35
def fit_from_file(cls, fname, textModel_params={}):
    X, y = read_data_labels(fname)
    model = TextModel(X, **textModel_params)
    svc = cls(model)
    return svc.fit([model[x] for x in X], y)
Example #36
def clean_params(kw):
    params = TextModel.params()
    return {k: v for k, v in kw.items() if k in params}
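
A quick sketch of clean_params in action; the dictionary is hypothetical, and the unknown key is dropped because it is not among TextModel.params():

kw = {'token_list': [-1], 'del_diac': True, 'not_a_textmodel_param': 1}
clean_params(kw)  # -> {'token_list': [-1], 'del_diac': True}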