Exemplos de TextModel em Python, exemplos de b4msa.textmodel.TextModel em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_negations_italian():
    """Tokenize an Italian sentence with negations and check the tokens."""
    from b4msa.textmodel import TextModel

    # Only model.tokenize() is exercised below, so a one-item corpus is
    # enough to build the model.
    model = TextModel(
        ["XD"],
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='italian',
        url_option='delete',
    )

    sentence = """@User Come non condividere; me ne frega niente"""
    tokens = model.tokenize(sentence)
    print("Input:", sentence)
    print("Output:", tokens)
    expected = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert tokens == expected

Exemplo n.º 2

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: INGEOTEC/b4msa

def test_lang():
    """Tokenize a Spanish sentence and compare it with the expected tokens."""
    from b4msa.textmodel import TextModel

    corpus = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    options = dict(
        del_dup=True,
        emo_option="group",
        lc=True,
        negation=True,
        num_option="group",
        stemming=True,
        stopwords="group",
        del_diac=False,
        token_list=[-1],
        url_option="group",
        usr_option="group",
        lang="spanish",
    )
    model = TextModel(corpus, **options)
    sentence = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    observed = model.tokenize(sentence)
    expected = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc',
        '_sw', 'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda',
    ]
    print(sentence)
    assert observed == expected, "got: {0}, expected: {1}".format(observed, expected)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_negations():
    """Tokenize a Spanish sentence with negations and check the tokens."""
    from b4msa.textmodel import TextModel

    settings = dict(
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='spanish',
        url_option='delete',
    )
    model = TextModel(["el alma de la fiesta XD"], **settings)

    sentence = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    observed = model.tokenize(sentence)
    expected = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech', 'no_respect']
    print(observed, expected)
    assert observed == expected

Exemplo n.º 4

0

Exibir arquivo

Arquivo: model.py Projeto: MDAlvarezH/EvoMSA

 def __init__(self, corpus, **kwargs):
     """Initialise empty bookkeeping and then fit on ``corpus``.

     ``kwargs`` is accepted but not used by this constructor.
     """
     # Term table and counter start empty; training mode is on.
     self._m, self._num_terms = {}, 0
     self._training = True
     # Name taken from the TEXT environment variable, 'text' when unset.
     self._text = os.getenv('TEXT', default='text')
     self._textModel = TextModel([''], token_list=[-1])
     self.fit(corpus)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_lang():
    """Check the tokens produced for a Spanish sentence."""
    from b4msa.textmodel import TextModel

    model = TextModel(
        ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"],
        del_dup=True,
        emo_option="group",
        lc=True,
        negation=True,
        num_option="group",
        stemming=True,
        stopwords="group",
        del_diac=False,
        token_list=[-1],
        url_option="group",
        usr_option="group",
        lang="spanish",
    )
    sentence = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    got = model.tokenize(sentence)
    want = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
            'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(sentence)
    assert got == want, "got: {0}, expected: {1}".format(got, want)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: model.py Projeto: MDAlvarezH/EvoMSA

class HA(BaseTextModel):
    """Chains a b4msa.textmodel.TextModel with a LinearSVC."""

    def __init__(self, **kwargs):
        # All keywords are forwarded untouched to the text model.
        self._tm = TextModel(**kwargs)
        self._cl = LinearSVC()

    def fit(self, X, y):
        """Fit the text model on ``X``, then the classifier on its output.

        :return: self
        """
        text_model = self._tm
        text_model.fit(X)
        features = text_model.transform(X)
        self._cl.fit(features, y)
        return self

    def tonp(self, X):
        """Return ``X`` unchanged."""
        return X

    def transform(self, X):
        """Return the classifier's decision values for ``X``.

        A one-dimensional result is turned into a single-column 2-D array.
        """
        scores = self._cl.decision_function(self._tm.transform(X))
        return np.atleast_2d(scores).T if scores.ndim == 1 else scores

    @classmethod
    def create_space(cls, fname, output, **kwargs):
        """Fit a model on the items of a JSON file and save it.

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model
        :type output: str
        :param kwargs: Keywords pass to TextModel
        """
        records = list(tweet_iterator(fname))
        instance = cls(**kwargs)
        # The 'klass' entry of every record is used as its label.
        labels = [record['klass'] for record in records]
        instance.fit(records, labels)
        save_model(instance, output)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: INGEOTEC/b4msa

def test_negations_italian():
    """Check the tokens produced for an Italian sentence with negations."""
    from b4msa.textmodel import TextModel

    options = dict(
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='italian',
        url_option='delete',
    )
    model = TextModel(["XD"], **options)

    sentence = """@User Come non condividere; me ne frega niente"""
    observed = model.tokenize(sentence)
    print("Input:", sentence)
    print("Output:", observed)
    assert observed == ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']

Exemplo n.º 8

0

Exibir arquivo

Arquivo: classifier.py Projeto: valjime95/b4msa

 def train_predict_pool(cls, args):
     """Train on the ``tr`` indices and predict the ``ts`` indices.

     ``args`` is the tuple ``(X, y, tr, ts, textModel_params)``; keys of
     ``textModel_params`` not listed by ``TextModel.params()`` are dropped.

     :return: ``ts`` and the predictions as a numpy array
     """
     X, y, tr, ts, textModel_params = args
     allowed = TextModel.params()
     accepted = {}
     for key, value in textModel_params.items():
         if key in allowed:
             accepted[key] = value
     text_model = TextModel([X[i] for i in tr], **accepted)
     estimator = cls(text_model)
     train_vectors = [text_model[X[i]] for i in tr]
     train_labels = [y[i] for i in tr]
     fitted = estimator.fit(train_vectors, train_labels)
     test_vectors = [text_model[X[i]] for i in ts]
     return ts, np.array(fitted.predict(test_vectors))

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: INGEOTEC/b4msa

def test_textmodel_default():
    """Compare each language's token_list with its default parameters."""
    from b4msa.textmodel import TextModel
    for lang in ('spanish', 'english', 'arabic'):
        model = TextModel(lang=lang)
        print(model.token_list, TextModel.default_parameters(lang=lang)['token_list'])
        defaults = TextModel.default_parameters(lang=lang)['token_list']
        for got, want in zip(model.token_list, defaults):
            print(got, want)
            assert got == want
    # An explicit keyword must end up in the model's _lang_kw mapping.
    model = TextModel(lang='arabic', stopwords='xxx')
    assert model._lang_kw['stopwords'] == 'xxx'

Exemplo n.º 10

0

Exibir arquivo

Arquivo: test_lang_dependency.py Projeto: INGEOTEC/b4msa

def test_stopwords():
    """Check the 'delete' and 'group' stopword modes on a Spanish phrase."""
    from b4msa.textmodel import TextModel
    tm = TextModel(lang='es', del_dup=False)
    normalized = tm.text_transformations('como esta mi carro')
    print(normalized)
    cases = (
        ('delete', '~carro~'),
        ('group', '~_sw~_sw~_sw~carro~'),
    )
    for mode, expected in cases:
        result = tm.lang.transform(normalized, stopwords=mode)
        print(result)
        assert result == expected

Exemplo n.º 11

0

Exibir arquivo

Arquivo: test_lang_dependency.py Projeto: valjime95/b4msa

def test_stopwords():
    """Verify stopword deletion and grouping for a short Spanish phrase."""
    from b4msa.textmodel import TextModel
    tm = TextModel(lang='es', del_dup=False)
    prepared = tm.text_transformations('como esta mi carro')
    print(prepared)

    # Stopwords removed altogether.
    deleted = tm.lang.transform(prepared, stopwords='delete')
    print(deleted)
    assert deleted == '~carro~'

    # Stopwords replaced by the _sw marker.
    grouped = tm.lang.transform(prepared, stopwords='group')
    print(grouped)
    assert grouped == '~_sw~_sw~_sw~carro~'

Exemplo n.º 12

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_textmodel_default():
    """Check token_list against the per-language default parameters."""
    from b4msa.textmodel import TextModel
    languages = ['spanish', 'english', 'arabic']
    for lang in languages:
        instance = TextModel(lang=lang)
        print(instance.token_list,
              TextModel.default_parameters(lang=lang)['token_list'])
        reference = TextModel.default_parameters(lang=lang)['token_list']
        for current, default in zip(instance.token_list, reference):
            print(current, default)
            assert current == default
    instance = TextModel(lang='arabic', stopwords='xxx')
    assert instance._lang_kw['stopwords'] == 'xxx'

Exemplo n.º 13

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_textmodel_token_min_filter():
    """Check the vocabulary size obtained with two token_min_filter values."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tweets = list(tweet_iterator(fname))
    for min_filter, vocabulary_size in ((1, 62), (0.3, 13)):
        model = TextModel(tweets, token_min_filter=min_filter)
        print(len(model.model._w2id))
        assert len(model.model._w2id) == vocabulary_size
    # Combining both filters is only required to build without raising.
    TextModel(tweets, token_min_filter=1, threshold=0.01)

Exemplo n.º 14

0

Exibir arquivo

Arquivo: model.py Projeto: MDAlvarezH/EvoMSA

class Corpus(BaseTextModel):
    """Word-only text model that assigns an index and a count to each token."""

    def __init__(self, corpus, **kwargs):
        # kwargs is accepted but not used here.
        self._text = os.getenv('TEXT', default='text')
        self._m, self._num_terms = {}, 0
        self._training = True
        self._textModel = TextModel([''], token_list=[-1])
        self.fit(corpus)

    def get_text(self, text):
        """Return the entry of ``text`` stored under the configured key."""
        return text[self._text]

    def fit(self, c):
        """Index every item of ``c`` and leave training mode.

        :return: the per-item results of ``__getitem__``
        """
        indexed = list(map(self.__getitem__, c))
        self._training = False
        return indexed

    @property
    def num_terms(self):
        """Number of distinct tokens registered so far."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a dict, a string, or a list/tuple of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        return [token
                for piece in text
                for token in self._textModel.tokenize(piece)]

    def __getitem__(self, d):
        """Return ``[index, count]`` pairs for the tokens of ``d``.

        While training, unseen tokens are registered and seen ones have
        their stored count increased; afterwards unseen tokens are skipped.
        """
        pairs = []
        for token in self.tokenize(d):
            if token in self._m:
                index, k = self._m[token]
                if self._training:
                    # The pair emitted below carries the count before this update.
                    self._m[token] = [index, k + 1]
            elif self._training:
                index, k = self._num_terms, 1
                self._m[token] = [index, k]
                self._num_terms += 1
            else:
                continue
            pairs.append([index, k])
        return pairs

Exemplo n.º 15

0

Exibir arquivo

Arquivo: classifier.py Projeto: valjime95/b4msa

 def fit_from_file(cls, fname, textModel_params=None):
     """Build a TextModel from a file and fit a classifier on it.

     :param fname: file read with ``tweet_iterator``; every item must have
         a ``'klass'`` entry, which is used as its label
     :param textModel_params: keyword arguments for ``TextModel``; ``None``
         (the default) means no extra arguments
     :return: whatever ``cls(model).fit(...)`` returns
     """
     # A None sentinel avoids sharing one dict object across calls.
     if textModel_params is None:
         textModel_params = {}
     D = list(tweet_iterator(fname))
     y = [x['klass'] for x in D]
     model = TextModel(D, **textModel_params)
     svc = cls(model)
     return svc.fit([model[x] for x in D], y)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: model.py Projeto: MDAlvarezH/EvoMSA

    def tokenize(self, text):
        """Tokenize a text

        A dict is first reduced through ``get_text``; a list or tuple is
        tokenized item by item and the tokens are concatenated.

        :param text: Text
        :type text: dict or str
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return TextModel.tokenize(self, text)
        tokens = []
        for item in text:
            tokens += TextModel.tokenize(self, item)
        return tokens

Exemplo n.º 17

0

Exibir arquivo

Arquivo: classifier.py Projeto: INGEOTEC/b4msa

 def train_predict_pool(cls, args):
     """Fit on the training indices and predict the test indices.

     ``args`` unpacks to ``(X, y, tr, ts, textModel_params)``. Only the
     parameters named by ``TextModel.params()`` reach the text model.

     :return: ``ts`` together with the predictions as a numpy array
     """
     X, y, tr, ts, textModel_params = args
     known = TextModel.params()
     usable = dict((k, v) for k, v in textModel_params.items() if k in known)
     tm = TextModel([X[idx] for idx in tr], **usable)
     classifier = cls(tm)
     fitted = classifier.fit([tm[X[idx]] for idx in tr], [y[idx] for idx in tr])
     predictions = fitted.predict([tm[X[idx]] for idx in ts])
     return ts, np.array(predictions)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: command_line.py Projeto: vashisht7/b4msa

    def main(self, args=None):
        """Compute k-fold decision values and write them next to each tweet.

        The first entry of the parameter file configures the TextModel. For
        every fold an SVC is trained on the training part and its decision
        function on the held-out part is stored at the held-out positions.
        Each tweet of the training set is then written as one JSON line with
        an added ``decision_function`` entry.

        :param args: argument list handed to the parser
        :return: list of decision values, one per training item
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        logging.basicConfig(level=self.data.verbose)
        logger = logging.getLogger('b4msa')
        logger.setLevel(self.data.verbose)
        best = load_json(self.data.params_fname)[0]
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)
        t = TextModel(corpus, **best)
        X = [t[x] for x in corpus]
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True,
                      random_state=self.data.seed)
        for tr, ts in folds.split(X):
            c = SVC(model=t)
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            scores = c.decision_function([X[x] for x in ts])
            for position, score in zip(ts, scores):
                hy[position] = score

        with open(self.get_output(), 'w') as fpt:
            tweets = tweet_iterator(self.data.training_set)
            for i, tweet in enumerate(tweets):
                tweet['decision_function'] = hy[i].tolist()
                fpt.write(json.dumps(tweet) + "\n")
        return hy

Exemplo n.º 19

0

Exibir arquivo

Arquivo: dataset.py Projeto: INGEOTEC/text_models

    def textModel(self):
        """Text model used to process the texts.

        Built from ``TM_ARGS`` on first access and kept in ``_tm``.
        """
        try:
            model = self._tm
        except AttributeError:
            model = self._tm = TextModel(**TM_ARGS)
        return model

Exemplo n.º 20

0

Exibir arquivo

Arquivo: model.py Projeto: JuanCalderon/EvoMSA

    def _create_space(cls, fname, **kwargs):
        """Create the space from a file of json

        A TextModel is fitted on the shuffled data and one LinearSVC is
        trained per class (items of the class labelled 1, a sample of the
        remaining items labelled -1).

        :param fname: Path to the file containing the json
        :type fname: str
        :param kwargs: Keywords pass to TextModel
        :return: the text model, the coefficients and intercepts returned by
            ``linearSVC_array``, and the sorted list of classes
        """
        import random
        from .utils import linearSVC_array
        from collections import Counter
        try:
            from tqdm import tqdm
        except ImportError:

            # Fallback used when tqdm is not installed: iterate unchanged.
            def tqdm(x, **kwargs):
                return x

        data = [x for x in tweet_iterator(fname)]
        random.shuffle(data)
        # Only the first 128000 shuffled texts are used to fit the text model.
        tm = TextModel(**kwargs).fit([x['text'] for x in data[:128000]])
        tm._num_terms = tm.model.num_terms
        # klass, nele = np.unique([x['klass'] for x in data], return_counts=True)
        # (class, count) pairs sorted by class.
        _ = [(k, v) for k, v in Counter([x['klass'] for x in data]).items()]
        _.sort(key=lambda x: x[0])
        klass = [x[0] for x in _]
        nele = [x[1] for x in _]
        # Maps each class to its position in ``klass``.
        h = {v: k for k, v in enumerate(klass)}
        MODELS = []
        for ident, k in tqdm(enumerate(klass)):
            # Number of negative examples already taken from each class.
            elepklass = [0 for __ in klass]
            cnt = nele[ident]
            # NOTE(review): raises ZeroDivisionError when the data holds a
            # single class, because len(klass) - 1 is then 0.
            cntpklass = int(cnt / (len(klass) - 1))
            D = [(x, 1) for x in data if x['klass'] == k]
            for x in data:
                if x['klass'] == k:
                    continue
                # A class stops contributing once it has supplied more than
                # cntpklass items, i.e. at most cntpklass + 1 negatives each.
                if elepklass[h[x['klass']]] > cntpklass:
                    continue
                elepklass[h[x['klass']]] = elepklass[h[x['klass']]] + 1
                D.append((x, -1))
            m = LinearSVC().fit(tm.tonp([tm[x[0]['text']] for x in D]),
                                [x[1] for x in D])
            MODELS.append(m)
        coef, intercept = linearSVC_array(MODELS)
        return tm, coef, intercept, klass

Exemplo n.º 21

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_textmodel_entropy():
    """Check the vocabulary size of a TextModel built with threshold=0.01."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tweets = list(tweet_iterator(fname))
    model = TextModel(tweets, threshold=0.01)
    assert isinstance(model, TextModel)
    vocabulary_size = len(model.model._w2id)
    print(vocabulary_size)
    assert len(model.model._w2id) == 39

Exemplo n.º 22

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_textmodel():
    """Indexing a TextModel with a text must return a list."""
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    tweets = list(tweet_iterator(fname))
    model = TextModel([tweet['text'] for tweet in tweets])
    first_text = tweets[0]['text']
    assert isinstance(model[first_text], list)

Exemplo n.º 23

0

Exibir arquivo

def test_model_instance():
    """A fitted TextModel passed in ``models`` must be kept by EvoMSA."""
    from microtc.textmodel import TextModel
    X, y = get_data()
    tm = TextModel().fit(X)
    settings = dict(tm_n_jobs=1,
                    n_jobs=1,
                    TR=False,
                    lang="es",
                    models=[[tm, "sklearn.svm.LinearSVC"]],
                    stacked_method="sklearn.svm.LinearSVC")
    evo = EvoMSA(**settings).fit(X, y)
    first_model = evo.models[0][0]
    assert first_model == tm

Exemplo n.º 24

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: INGEOTEC/b4msa

def test_negations():
    """Check the tokens produced for a Spanish sentence with negations."""
    from b4msa.textmodel import TextModel

    model = TextModel(
        ["el alma de la fiesta XD"],
        num_option='group',
        del_diac=False,
        stopwords='delete',
        negation=True,
        stemming=True,
        lc=False,
        token_list=[-1],
        usr_option='group',
        del_dup=False,
        emo_option='group',
        lang='spanish',
        url_option='delete',
    )

    sentence = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    got = model.tokenize(sentence)
    want = [
        '_usr', 'poll', 'vac', 'hub',
        'no_permit', 'hub', 'no_hech', 'no_respect',
    ]
    print(got, want)
    assert got == want

Exemplo n.º 25

0

Exibir arquivo

def test_SVC_predict():
    """An SVC fitted from text.json must label a short text as 'POS'."""
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    texts, _labels = read_data_labels(fname)
    classifier = SVC(TextModel(texts))
    classifier.fit_file(fname)
    prediction = classifier.predict_text('Excelente dia b4msa')
    assert prediction == 'POS'

Exemplo n.º 26

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: dougcirqueira/b4msa

def test_lang():
    """Print the tokens produced for a Portuguese sentence.

    No assertion is made on the tokens; the test only fails when building
    the model or tokenizing raises.
    """
    from b4msa.textmodel import TextModel

    text = [
        "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    ]

    model = TextModel(
        text,
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [
                -1,
            ],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    text = "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    a = model.tokenize(text)
    # Call form of print: a print statement does not parse under Python 3,
    # while a single-argument call prints the same on Python 2 and 3.
    print(a)

Exemplo n.º 27

0

Exibir arquivo

def test_SVC_predict_from_file():
    """Every label predicted for text.json must be POS, NEU or NEG."""
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    texts, _labels = read_data_labels(fname)
    classifier = SVC(TextModel(texts))
    classifier.fit_file(fname)
    valid_labels = ['POS', 'NEU', 'NEG']
    for label in classifier.predict_file(fname):
        assert label in valid_labels

Exemplo n.º 28

0

Exibir arquivo

def test_SVC_predict_from_file():
    """Fit an SVC on text.json and print its labels for test_text.json.

    Both file names are relative, so they are resolved against the current
    working directory. Nothing is asserted on the predicted labels.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    fname = 'text.json'
    # Only the texts are needed to build the text model.
    X, _ = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file("test_text.json")
    # Call form of print: print statements do not parse under Python 3,
    # while single-argument calls print the same on Python 2 and 3.
    print("Final Labels")
    print(y)

Exemplo n.º 29

0

Exibir arquivo

Arquivo: test_textmodel.py Projeto: valjime95/b4msa

def test_params():
    """Build a TextModel for every combination of a small option grid."""
    import os
    import itertools
    from b4msa.params import BASIC_OPTIONS
    from b4msa.textmodel import TextModel
    from microtc.utils import tweet_iterator

    grid = dict(del_diac=[True, False],
                usr_option=BASIC_OPTIONS,
                url_option=BASIC_OPTIONS)
    # Option names in sorted order with their candidate values alongside.
    names, candidates = zip(*sorted(grid.items()))
    fname = os.path.dirname(__file__) + '/text.json'
    tweets = list(tweet_iterator(fname))
    texts = [tweet['text'] for tweet in tweets]
    for combination in itertools.product(*candidates):
        args = dict(zip(names, combination))
        instance = TextModel(texts, **args)
        assert isinstance(instance[texts[0]], list)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: model.py Projeto: MDAlvarezH/EvoMSA

 def __init__(self, corpus, threshold=0.001, token_min_filter=0.001,
              token_list=[-2, -1], num_option='delete',
              usr_option='delete', url_option='delete',
              emo_option='delete', **kwargs):
     """Create the wrapped TextModel and initialise from ``corpus``.

     The named options and any extra ``kwargs`` are forwarded to
     ``TextModel``, which is created with ``None`` as its first argument.
     """
     # NOTE(review): the default token_list is a single list object shared
     # by every call that relies on the default.
     options = dict(token_list=token_list,
                    threshold=threshold,
                    token_min_filter=token_min_filter,
                    num_option=num_option,
                    usr_option=usr_option,
                    url_option=url_option,
                    emo_option=emo_option)
     options.update(kwargs)
     self._text = os.getenv('TEXT', default='text')
     self._textmodel = TextModel(None, **options)
     self._threshold = threshold
     self.init(corpus)

Exemplo n.º 31

0

Exibir arquivo

Arquivo: 05TextNormalization.py Projeto: INGEOTEC/NLP-Course

# Stem each whitespace-separated word and join the stems with spaces.
text = 'I like playing football'
output = " ".join([stemmer.stem(word) for word in text.split()])
output

# Word n-grams: n shifted views of the word list zipped together.
text = 'I like playing football on Saturday'
words = text.split()
n = 3
shifted_words = [words[i:] for i in range(n)]
n_grams = ["~".join(gram) for gram in zip(*shifted_words)]
n_grams

# Character q-grams built the same way from shifted views of the string.
text = 'I like playing'
q = 4
shifted_text = [text[i:] for i in range(q)]
q_grams = ["".join(gram) for gram in zip(*shifted_text)]
q_grams

# The same text run through a TextModel instead of by hand.
text = 'I like playing football with @mgraffg'
tm = TextModel(
    token_list=[-1, 5],
    lang='english',
    usr_option=OPTION_GROUP,
    stemming=True,
)
tm.text_transformations(text)

tm.tokenize(text)

Exemplo n.º 32

0

Exibir arquivo

Arquivo: model.py Projeto: JuanCalderon/EvoMSA

class Corpus(BaseTextModel):
    """Word-only text model keeping an index and a count per token."""

    def __init__(self, corpus=None, **kwargs):
        # kwargs is accepted but not used here.
        self._text = os.getenv('TEXT', default='text')
        self._m, self._num_terms = {}, 0
        self._training = True
        self._textModel = TextModel([''], token_list=[-1])
        # Fitting is skipped when no corpus is supplied.
        if corpus is None:
            return
        self.fit(corpus)

    def get_text(self, text):
        """Return the entry of ``text`` stored under the configured key."""
        return text[self._text]

    def fit(self, c):
        """Index every item of ``c``, leave training mode and return self."""
        for document in c:
            self.__getitem__(document)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens registered so far."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a dict, a string, or a list/tuple of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        collected = []
        for piece in text:
            collected += self._textModel.tokenize(piece)
        return collected

    def transform(self, texts):
        """Convert texts into vectors

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        rows = [self.__getitem__(text) for text in texts]
        return self._textModel.tonp(rows)

    def __getitem__(self, d):
        """Return ``[index, count]`` pairs for the tokens of ``d``.

        While training, unseen tokens are registered and seen ones have
        their stored count increased; afterwards unseen tokens are skipped.
        """
        pairs = []
        for token in self.tokenize(d):
            if token in self._m:
                index, k = self._m[token]
                if self._training:
                    # The pair emitted below carries the count before this update.
                    self._m[token] = [index, k + 1]
            elif self._training:
                index, k = self._num_terms, 1
                self._m[token] = [index, k]
                self._num_terms += 1
            else:
                continue
            pairs.append([index, k])
        return pairs

Exemplo n.º 33

0

Exibir arquivo

Arquivo: 07BoW.py Projeto: INGEOTEC/NLP-Course

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from matplotlib import pylab as plt
from wordcloud import WordCloud as WC
from collections import Counter
from scipy.stats import rankdata
import numpy as np
import Orange


# (text, label) pairs read from the TWEETS file.
D = [(x['text'], x['klass']) for x in tweet_iterator(TWEETS)]
y = [y for _, y in D]
# Labels are encoded with LabelEncoderWrapper (defined outside this excerpt).
le = LabelEncoderWrapper().fit(y)
y = le.transform(y)

# Text model fitted on every text, using 'tf' weighting.
tm = TextModel(token_list=[-1], 
               weighting='tf').fit([x for x, _ in D])

folds = StratifiedKFold(shuffle=True, random_state=0)

# hy receives one prediction per item, filled in fold by fold.
hy = np.empty(len(D))
for tr, val in folds.split(D, y):
    # Train on the texts of the training indices ...
    _ = [D[x][0] for x in tr]
    X = tm.transform(_)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    # m = LinearSVC().fit(X, y[tr])
    # ... and predict the texts of the validation indices.
    _ = [D[x][0] for x in val]
    hy[val] = m.predict(tm.transform(_))

# bootstrap_confidence_interval is defined outside this excerpt.
ci = bootstrap_confidence_interval(y, hy)
ci
# NOTE(review): the tuple below looks like pasted notebook output of `ci`;
# as code it is an expression statement with no effect - confirm and remove.
(0.2839760475399691, 0.30881116416736665)

Exemplo n.º 34

0

Exibir arquivo

    def func(data, output):
        """Fit a TextModel on the items read from ``data`` and save it."""
        from b4msa.textmodel import TextModel
        from microtc.utils import tweet_iterator, save_model

        model = TextModel()
        tweets = list(tweet_iterator(data))
        fitted = model.fit(tweets)
        save_model(fitted, output)

Exemplo n.º 35

0

Exibir arquivo

Arquivo: classifier.py Projeto: dougcirqueira/b4msa

 def train_predict_pool(cls, args):
     """Train on the ``tr`` indices and predict the ``ts`` indices.

     ``args`` unpacks to ``(X, y, tr, ts, textModel_params)``; the
     parameters are passed to ``TextModel`` as given.

     :return: ``ts`` and the predictions as a numpy array
     """
     X, y, tr, ts, textModel_params = args
     text_model = TextModel([X[i] for i in tr], **textModel_params)
     estimator = cls(text_model)
     train_vectors = [text_model[X[i]] for i in tr]
     train_labels = [y[i] for i in tr]
     fitted = estimator.fit(train_vectors, train_labels)
     test_vectors = [text_model[X[i]] for i in ts]
     return ts, np.array(fitted.predict(test_vectors))

Exemplo n.º 36

0

Exibir arquivo

Arquivo: classifier.py Projeto: dougcirqueira/b4msa

 def fit_from_file(cls, fname, textModel_params=None):
     """Build a TextModel from a labelled file and fit a classifier on it.

     :param fname: file read with ``read_data_labels``
     :param textModel_params: keyword arguments for ``TextModel``; ``None``
         (the default) means no extra arguments
     :return: whatever ``cls(model).fit(...)`` returns
     """
     # A None sentinel avoids sharing one dict object across calls.
     if textModel_params is None:
         textModel_params = {}
     X, y = read_data_labels(fname)
     model = TextModel(X, **textModel_params)
     svc = cls(model)
     return svc.fit([model[x] for x in X], y)

Exemplo n.º 37

0

Exibir arquivo

Arquivo: command_line.py Projeto: INGEOTEC/b4msa

def clean_params(kw):
    """Return the entries of ``kw`` whose key is in ``TextModel.params()``."""
    accepted = TextModel.params()
    cleaned = {}
    for name, value in kw.items():
        if name in accepted:
            cleaned[name] = value
    return cleaned