Example #1
    def fit(self, sentences, labels):
        # Module-level imports assumed by this snippet:
        #   from keras.models import Sequential
        #   from keras.layers import Dense, Flatten
        #   from keras.utils import to_categorical
        #   from gensim.models import Word2Vec
        #   import numpy as np

        if self.verbose:
            print("Building NN")

        # Feed-forward network over a flattened window of word vectors.
        model = Sequential()
        model.add(Flatten(input_shape=(self.window, self.word_dim)))

        for units in self.layers:
            model.add(Dense(units, activation='relu'))

        # Three output units, one per class. Sigmoid with binary
        # cross-entropy scores each class independently; for mutually
        # exclusive classes, softmax with categorical cross-entropy is
        # the more conventional pairing.
        model.add(Dense(3, activation='sigmoid'))
        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

        self.model = model

        if self.verbose:
            model.summary()

        # Normalization pipeline: lowercase, tokenize, stem, lemmatize,
        # then drop uncommon words.
        p1 = lower_pipe(sentences)
        p2 = tokenize_pipe(p1)
        p3 = stem_pipe(p2)
        p4 = lemmatize_pipe(p3)
        p5 = uncommon_pipe(p4)

        clean_sens = list(p5)
        self.clean_sens = clean_sens

        if self.verbose:
            print("Building word embedding")

        # Use a pre-trained encoder if configured, otherwise train
        # Word2Vec on the cleaned sentences.
        self.encoder = None
        if self.pte:
            self.encoder = pte()
        else:
            self.encoder = Word2Vec(
                clean_sens,
                size=self.word_dim,  # gensim < 4.0; newer releases call this vector_size
                min_count=0)

        # Encode tokens as vectors and slice each sentence into
        # fixed-size windows paired with its label.
        enc = encode_pipe(clean_sens, self.encoder)
        self.enc = enc
        windows = list(window_pipe(enc, labels, self.window))

        win_sens = [w[0] for w in windows]
        win_labs = [w[1] for w in windows]

        y_inp = to_categorical(win_labs)

        if self.verbose:
            print("Training NN")

        model.fit(
            np.array(win_sens),
            np.array(y_inp),
            epochs=self.epochs,
            batch_size=self.batch,
            verbose=self.verbose)
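
A minimal usage sketch for fit. The class name and constructor arguments below are assumptions inferred from the self.* attributes the method reads; they are not part of the original snippet:

# Hypothetical: WindowNN and its constructor arguments are assumed names.
clf = WindowNN(window=5, word_dim=100, layers=[64, 32],
               epochs=10, batch=32, pte=False, verbose=1)
clf.fit(["first training sentence", "second training sentence"], [0, 1])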
Example #2
    def pipeline_factory(self, sens):
        # Build the normalization pipeline, with stemming and
        # lemmatization toggled by per-instance flags.
        p = lower_pipe(sens)
        p = tokenize_pipe(p)
        if self.stem:
            p = stem_pipe(p)
        if self.lemma:
            p = lemmatize_pipe(p)
        return p
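
A hedged usage sketch; the owning class is not shown, so the name and flag defaults here are assumptions:

# Hypothetical: TextPipeline is an assumed name for the class that
# defines pipeline_factory and sets the stem/lemma flags.
proc = TextPipeline(stem=True, lemma=False)
tokens = list(proc.pipeline_factory(["Some raw SENTENCES to clean"]))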
Example #3
    def _pred_sen(self, s):
        # Run a single sentence through the same normalization pipeline
        # used during training.
        s_array = [s]
        p1 = lower_pipe(s_array)
        p2 = tokenize_pipe(p1)
        p3 = stem_pipe(p2)
        p4 = lemmatize_pipe(p3)
        p5 = p4
        if not self.pte:
            # Drop words the trained Word2Vec encoder has never seen
            # (gensim < 4.0 vocabulary attribute).
            p5 = cull_words_pipe(p4, self.encoder.wv.vocab)
        p6 = encode_pipe(p5, self.encoder)
        windows = np.array(list(window_pipe_nolabel(p6, self.window)))

        preds = self.model.predict(windows, batch_size=len(windows))

        # Combine per-window predictions by summing log-probabilities,
        # i.e. multiplying the probabilities. A predicted probability of
        # exactly 0 becomes -inf here.
        logs = np.log(preds)
        flat = np.sum(logs, 0)

        winner_index = np.argmax(flat)
        if self.index_out:
            return winner_index
        else:
            return flat
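
To make the combination step concrete, here is a small standalone sketch of the same log-sum/argmax reduction over per-window predictions (dummy numbers, not model output):

import numpy as np

# Dummy per-window probabilities for 3 classes over 2 windows.
preds = np.array([[0.7, 0.2, 0.1],
                  [0.6, 0.3, 0.1]])
flat = np.log(preds).sum(axis=0)  # joint log-probability per class
print(np.argmax(flat))            # -> 0, the winning class index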
Example #4
                            lemmatize_pipe, strip_stopwords_pipe)
from collections import Counter
from matplotlib import rc

font = {'size': 12}

rc('font', **font)

tr, te = import_data()

sens = list(tr['text'])
authors = list(tr['author'])

# Sentence counts per author.
n_auths = Counter(authors)

# Group sentences by author.
s_by_a = {a: [s for s, a1 in zip(sens, authors) if a1 == a] for a in n_auths}

# Lowercase and tokenize each author's sentences.
tok_s_by_a = {k: list(tokenize_pipe(lower_pipe(v))) for k, v in s_by_a.items()}


def senlens(X):
    # Distribution of sentence lengths, in tokens.
    out = Counter([len(s) for s in X])
    return out


def wordcounts(X):
    # Raw word counts over all sentences.
    ctr = Counter([w for s in X for w in s])
    return ctr


def wordfreq(X):
    count = wordcounts(X)
    sumc = sum(count.values())
    # The original excerpt is cut off after the line above; normalizing
    # counts to relative frequencies is the natural completion.
    return {w: c / sumc for w, c in count.items()}
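
A short follow-on sketch using the helpers above to list each author's most frequent words (illustrative only):

# Illustrative use of the helpers defined above.
for author, toks in tok_s_by_a.items():
    freqs = wordfreq(toks)
    top = sorted(freqs, key=freqs.get, reverse=True)[:10]
    print(author, top)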
Example #5
def pipeline(text):
    # Lowercase, strip stopwords, then stem.
    return stem_pipe(strip_stopwords_pipe(lower_pipe(text)))
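
A hedged usage sketch, assuming (as in the other examples) that the pipes stream over an iterable of sentences:

# Assumes the pipe functions accept an iterable of sentence strings.
for tokens in pipeline(["The quick brown foxes were running"]):
    print(tokens)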