def fit(self, sentences, labels):
    """Build and train the windowed sentence classifier.

    Pipeline: lowercase -> tokenize -> stem -> lemmatize -> drop
    uncommon words, then embed each token (pretrained encoder or a
    Word2Vec model fit on the cleaned sentences), slide fixed-size
    windows over the embedded streams, and train a dense NN on the
    windows.

    Args:
        sentences: iterable of raw sentence strings.
        labels: per-sentence class labels, consumed by ``window_pipe``
            and one-hot encoded with ``to_categorical`` (3 classes).

    Side effects: sets ``self.model``, ``self.encoder``,
    ``self.clean_sens`` and ``self.enc``.
    """
    if self.verbose:
        print("Building NN")
    model = Sequential()
    # Each sample is a (window, word_dim) matrix of embeddings;
    # flatten it before the dense stack.
    model.add(Flatten(input_shape=(self.window, self.word_dim)))
    for l in self.layers:
        model.add(Dense(l, activation='relu'))
    # BUG FIX: the three classes are mutually exclusive (labels are
    # one-hot via to_categorical), so the output must be a softmax
    # trained with categorical cross-entropy. The original
    # sigmoid + binary_crossentropy treats each class as an
    # independent binary problem and reports misleading accuracy.
    model.add(Dense(3, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    self.model = model
    if self.verbose:
        model.summary()

    # Text normalisation pipeline (lazy generators until list()).
    p1 = lower_pipe(sentences)
    p2 = tokenize_pipe(p1)
    p3 = stem_pipe(p2)
    p4 = lemmatize_pipe(p3)
    p5 = uncommon_pipe(p4)
    clean_sens = list(p5)
    self.clean_sens = clean_sens

    if self.verbose:
        print("Building word embedding")
    self.encoder = None
    if self.pte:
        # Pretrained embedding requested.
        self.encoder = pte()
    else:
        # min_count=0 so every token in the cleaned corpus gets a
        # vector (no OOV during encoding below).
        self.encoder = Word2Vec(
            clean_sens, size=self.word_dim, min_count=0)
    enc = encode_pipe(clean_sens, self.encoder)
    self.enc = enc

    # Slide fixed-size windows over the embedded sentences; each
    # window carries the label of its source sentence.
    windows = list(window_pipe(enc, labels, self.window))
    win_sens = [w[0] for w in windows]
    win_labs = [w[1] for w in windows]
    y_inp = to_categorical(win_labs)
    if self.verbose:
        print("Training NN")
    model.fit(
        np.array(win_sens),
        np.array(y_inp),
        epochs=self.epochs,
        batch_size=self.batch,
        verbose=self.verbose)
def make_sig_words(stem=False, lemma=False, other_data=None):
    """Score how significant each vocabulary word is for each class.

    For every word ``w`` and class ``cc`` the score is::

        beta_cc(w) / (1 - sqrt(prod_{c != cc} (1 - beta_c(w))))

    where ``beta_c`` comes from ``wordbeta`` over that class's
    (optionally stemmed/lemmatized) token lists.

    Args:
        stem: apply ``stem_pipe`` to each class's token lists.
        lemma: apply ``lemmatize_pipe`` to each class's token lists.
        other_data: optional ``{class: [token_list, ...]}`` mapping;
            defaults to the module-level ``tok_s_by_a``.

    Returns:
        ``{class: {word: score}}`` for every word in the joint vocab.
    """
    words = other_data
    if other_data is None:
        words = tok_s_by_a
    # Snapshot the keys: ``words`` is rebound below, and a live
    # dict view would otherwise keep referencing the old mapping.
    classes = list(words.keys())
    if stem:
        words = {k: list(stem_pipe(v)) for k, v in words.items()}
    if lemma:
        words = {k: list(lemmatize_pipe(v)) for k, v in words.items()}
    res = {c: {} for c in classes}
    betas = {c: wordbeta(words[c]) for c in classes}
    # Joint vocabulary across every class's sentences.
    vocab = {w for X in words.values() for s in X for w in s}
    for w in vocab:
        for cc in classes:
            num = betas[cc][w]
            # BUG FIX: compare classes by equality, not identity
            # (``is not`` silently misbehaves for equal-but-distinct
            # keys such as tuples or non-interned strings).
            denom = 1 - sqrt(
                prod([1 - betas[c][w] for c in classes if c != cc]))
            res[cc][w] = num / denom
    return res
def pipeline_factory(self, sens):
    """Assemble the text-normalisation pipeline for raw sentences.

    Always lowercases and tokenizes; stemming and lemmatization are
    appended only when the corresponding flags are set on ``self``.
    Returns the (lazy) pipeline generator.
    """
    stream = tokenize_pipe(lower_pipe(sens))
    if self.stem:
        stream = stem_pipe(stream)
    if self.lemma:
        stream = lemmatize_pipe(stream)
    return stream
def _pred_sen(self, s):
    """Classify a single sentence with the trained window model.

    The sentence is normalised with the same pipeline used at fit
    time, embedded, split into windows, and scored by the NN. The
    per-window log-probabilities are summed per class (log of the
    product of window probabilities).

    Returns the argmax class index when ``self.index_out`` is set,
    otherwise the raw summed log-probability vector.
    """
    tokens = lemmatize_pipe(stem_pipe(tokenize_pipe(lower_pipe([s]))))
    if not self.pte:
        # A locally trained Word2Vec only knows its own vocabulary,
        # so drop any token it cannot embed.
        tokens = cull_words_pipe(tokens, self.encoder.wv.vocab)
    encoded = encode_pipe(tokens, self.encoder)
    windows = np.array(list(window_pipe_nolabel(encoded, self.window)))
    # Score every window in one forward pass.
    probs = self.model.predict(windows, batch_size=len(windows))
    # Summing logs across windows == log of the joint product.
    scores = np.sum(np.log(probs), 0)
    if self.index_out:
        return np.argmax(scores)
    return scores