示例#1
0
def make_sig_words(stem=False, lemma=False, other_data=None):
    """Compute a per-class significance score for every vocabulary word.

    For each word w and class cc the score is
        beta_cc(w) / (1 - sqrt(prod_{c != cc}(1 - beta_c(w))))
    where beta_c comes from ``wordbeta`` over that class's (optionally
    stemmed/lemmatized) token lists.

    Args:
        stem: apply ``stem_pipe`` to every class's sentences first.
        lemma: apply ``lemmatize_pipe`` to every class's sentences first.
        other_data: mapping class -> iterable of token lists; defaults to
            the module-level ``tok_s_by_a`` when None.

    Returns:
        dict mapping class -> {word: score}.
    """
    words = tok_s_by_a if other_data is None else other_data

    # Snapshot the class labels as a list: the original kept a keys() view
    # bound to the pre-transform dict, which only worked by accident once
    # `words` was rebound below.
    classes = list(words)

    if stem:
        words = {k: list(stem_pipe(v)) for k, v in words.items()}

    if lemma:
        words = {k: list(lemmatize_pipe(v)) for k, v in words.items()}

    betas = {c: wordbeta(words[c]) for c in classes}

    # Vocabulary = every word appearing in any sentence of any class.
    vocab = {w for sents in words.values() for s in sents for w in s}

    res = {c: {} for c in classes}
    for w in vocab:
        for cc in classes:
            num = betas[cc][w]
            # FIX: compare classes by value (!=), not identity (is not) —
            # identity only held because both loops shared the same objects.
            denom = 1 - sqrt(
                prod([1 - betas[c][w] for c in classes if c != cc]))

            res[cc][w] = num / denom

    return res
示例#2
0
    def fit(self, sentences, labels):
        """Build and train the windowed NN classifier on raw sentences.

        Side effects: sets self.model, self.clean_sens, self.encoder,
        self.enc. `sentences` is an iterable of raw text; `labels` is the
        per-sentence label sequence consumed by window_pipe.
        """

        if self.verbose:
            print("Building NN")
        # Flatten a (window, word_dim) slab of word vectors, then a stack of
        # configurable dense ReLU layers.
        model = Sequential()
        model.add(Flatten(input_shape=(self.window, self.word_dim)))

        for l in self.layers:
            model.add(Dense(l, activation='relu'))

        # NOTE(review): 3-way output with sigmoid + binary_crossentropy and
        # to_categorical targets looks like multi-label treatment of a
        # 3-class problem — confirm softmax/categorical_crossentropy was not
        # intended.
        model.add(Dense(3, activation='sigmoid'))
        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

        self.model = model

        if self.verbose:
            model.summary()

        # Text-normalization pipeline: lowercase -> tokenize -> stem ->
        # lemmatize -> drop uncommon words. Pipes are lazy; list() below
        # materializes them.
        p1 = lower_pipe(sentences)
        p2 = tokenize_pipe(p1)
        p3 = stem_pipe(p2)
        p4 = lemmatize_pipe(p3)
        p5 = uncommon_pipe(p4)

        clean_sens = list(p5)
        self.clean_sens = clean_sens

        if self.verbose:
            print("Building word embedding")

        # Either a pre-trained encoder (pte) or a Word2Vec trained on the
        # cleaned sentences; min_count=0 keeps every token in the vocab.
        self.encoder = None
        if self.pte:
            self.encoder = pte()
        else:
            self.encoder = Word2Vec(
                clean_sens,
                size=self.word_dim,
                min_count=0)

        # Encode tokens to vectors, then slice into fixed-size training
        # windows paired with their labels.
        enc = encode_pipe(clean_sens, self.encoder)
        self.enc = enc
        windows = list(window_pipe(enc, labels, self.window))

        win_sens = [w[0] for w in windows]
        win_labs = [w[1] for w in windows]

        y_inp = to_categorical(win_labs)

        if self.verbose:
            print("Training NN")

        model.fit(
            np.array(win_sens),
            np.array(y_inp),
            epochs=self.epochs,
            batch_size=self.batch,
            verbose=self.verbose)
 def pipeline_factory(self, sens):
     """Wire up the text pipeline: lowercase + tokenize, then optional
     stemming and lemmatization per the instance's flags."""
     stage = tokenize_pipe(lower_pipe(sens))
     if self.stem:
         stage = stem_pipe(stage)
     if self.lemma:
         stage = lemmatize_pipe(stage)
     return stage
示例#4
0
    def _pred_sen(self, s):
        """Predict for one sentence: normalize, encode, window, then combine
        per-window class probabilities by summing log-probs.

        Returns the argmax class index when self.index_out is set, else the
        summed log-probability vector.
        """
        # Normalization chain over a one-element batch.
        tokens = lemmatize_pipe(stem_pipe(tokenize_pipe(lower_pipe([s]))))
        if not self.pte:
            # Drop words the trained encoder has never seen.
            tokens = cull_words_pipe(tokens, self.encoder.wv.vocab)
        encoded = encode_pipe(tokens, self.encoder)
        windows = np.array(list(window_pipe_nolabel(encoded, self.window)))

        preds = self.model.predict(windows, batch_size=len(windows))

        # Sum of logs across windows ~ log of the product of probabilities.
        totals = np.log(preds).sum(axis=0)

        winner = np.argmax(totals)
        return winner if self.index_out else totals
示例#5
0
 def pipeline(text):
     """Normalize text: lowercase, remove stopwords, then stem."""
     lowered = lower_pipe(text)
     no_stops = strip_stopwords_pipe(lowered)
     return stem_pipe(no_stops)