Example #1
# Imports for the names used below; the location of BaseTextModel (EvoMSA.model) is an assumption.
import numpy as np
from sklearn.svm import LinearSVC
from b4msa.textmodel import TextModel
from microtc.utils import tweet_iterator, save_model
from EvoMSA.model import BaseTextModel


class HA(BaseTextModel):
    """Wrapper of b4msa.textmodel.TextModel and LinearSVC"""
    def __init__(self, **kwargs):
        self._tm = TextModel(**kwargs)
        self._cl = LinearSVC()

    def fit(self, X, y):
        self._tm.fit(X)
        self._cl.fit(self._tm.transform(X), y)
        return self

    def tonp(self, X):
        # Identity mapping; kept to comply with the BaseTextModel interface.
        return X

    def transform(self, X):
        # Decision-function scores of the texts; binary problems yield a 1-d
        # array, which is reshaped into a single column.
        res = self._cl.decision_function(self._tm.transform(X))
        if res.ndim == 1:
            return np.atleast_2d(res).T
        return res

    @classmethod
    def create_space(cls, fname, output, **kwargs):
        """Create the model from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param output: Path to store the model
        :type output: str
        :param kwargs: Keyword arguments passed to TextModel
        """

        X = list(tweet_iterator(fname))
        m = cls(**kwargs)
        m.fit(X, [x['klass'] for x in X])
        save_model(m, output)
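
A minimal usage sketch of the class above; the file names and the language keyword are placeholders, not part of the original. create_space expects a JSON-lines file whose records carry the text and a 'klass' label.

# Hypothetical paths; any JSON-lines file readable by tweet_iterator works.
HA.create_space('train.json', 'ha.model', lang='spanish')

# Or fit in memory and project new texts onto the decision functions.
data = list(tweet_iterator('train.json'))
ha = HA(lang='spanish').fit(data, [x['klass'] for x in data])
scores = ha.transform(['an example text'])
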
Example #2

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from b4msa.textmodel import TextModel
from microtc.utils import tweet_iterator
from EvoMSA.utils import LabelEncoderWrapper, bootstrap_confidence_interval
# TWEETS is assumed to be the path to the example tweets bundled with EvoMSA's tests.
from EvoMSA.tests.test_base import TWEETS

D = [(x['text'], x['klass']) for x in tweet_iterator(TWEETS)]
y = [y for _, y in D]
le = LabelEncoderWrapper().fit(y)
y = le.transform(y)

tm = TextModel(token_list=[-1], 
               weighting='tf').fit([x for x, _ in D])
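
# Note: in microtc/b4msa, token_list=[-1] keeps only word unigrams and
# weighting='tf' uses raw term frequency instead of the default TF-IDF.
# A quick way to inspect which tokens are indexed (the sentences below are
# arbitrary placeholders):
tm_demo = TextModel(token_list=[-1], weighting='tf').fit(['good day', 'bad day'])
tm_demo.tokenize('good day')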

folds = StratifiedKFold(shuffle=True, random_state=0)

hy = np.empty(len(D))
for tr, val in folds.split(D, y):
    _ = [D[x][0] for x in tr]          # texts of the training split
    X = tm.transform(_)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    # m = LinearSVC().fit(X, y[tr])
    _ = [D[x][0] for x in val]         # texts of the validation split
    hy[val] = m.predict(tm.transform(_))

ci = bootstrap_confidence_interval(y, hy)
ci
(0.2839760475399691, 0.30881116416736665)
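
# For intuition: bootstrap_confidence_interval resamples the (gold, prediction)
# pairs and reports a percentile interval of a metric.  The function below is a
# simplified sketch of that idea (accuracy, 500 resamples), not EvoMSA's exact
# implementation; y and hy are numpy arrays as above.
from sklearn.metrics import accuracy_score

def bootstrap_ci(y, hy, metric=accuracy_score, nboot=500, alpha=0.05, seed=0):
    rng = np.random.default_rng(seed)
    n = len(y)
    stats = []
    for _ in range(nboot):
        idx = rng.integers(0, n, n)            # indices sampled with replacement
        stats.append(metric(y[idx], hy[idx]))  # metric on the resampled pairs
    return tuple(np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)]))

# e.g. bootstrap_ci(y, hy); the metric and defaults may differ from EvoMSA's.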

# Same experiment, now with the default TF-IDF weighting.
tm = TextModel(token_list=[-1]).fit([x for x, _ in D])
hy = np.empty(len(D))
for tr, val in folds.split(D, y):
    _ = [D[x][0] for x in tr]          # texts of the training split
    X = tm.transform(_)
    m = LogisticRegression(multi_class='multinomial').fit(X, y[tr])
    _ = [D[x][0] for x in val]         # texts of the validation split
    hy[val] = m.predict(tm.transform(_))

bootstrap_confidence_interval(y, hy)
Example #3
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
import numpy as np
from microtc.utils import tweet_iterator
from b4msa.textmodel import TextModel
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold
from EvoMSA.utils import bootstrap_confidence_interval

D = list(tweet_iterator("semeval2017_En_train.json"))
tm = TextModel(lang="english").fit(D)
# Alternative configuration: TextModel(lang="english", token_list=[-1]).fit(D)

le = LabelEncoder().fit([x['klass'] for x in D])
X = tm.transform(D)
y = le.transform([x['klass'] for x in D])

# Recall per class on the training data (an optimistic estimate).
m = LinearSVC().fit(X, y)
recall_score(y, m.predict(X), average=None)

# Plain 5-fold cross-validation (KFold does not stratify by class).
kf = KFold()
hy = np.empty_like(y)
for train, test in kf.split(X):
    m = LinearSVC().fit(X[train], y[train])
    hy[test] = m.predict(X[test])

bootstrap_confidence_interval(
    y, hy, metric=lambda y, hy: recall_score(y, hy, average=None)[0])
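
The call above reports the confidence interval for the recall of the first class only (index 0 in le.classes_); looping over the class index gives one interval per label (k is bound as a default argument to avoid late binding in the lambda):

for k, label in enumerate(le.classes_):
    ci = bootstrap_confidence_interval(
        y, hy, metric=lambda y, hy, k=k: recall_score(y, hy, average=None)[k])
    print(label, ci)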