def inference_html(self, tag):
    # Tokenize the tag's attribute values and convert them into a dense
    # bag-of-words vector over the training dictionary.
    input_tag_tokenizer = tokenizer.InputTagTokenizer()
    tokens = input_tag_tokenizer.get_attrs_value(tag.html)
    bow = self.dictionary.doc2bow(tokens)
    vec = matutils.corpus2dense([bow], self.in_units).T[0]
    x = chainer.Variable(np.asarray([vec], dtype=np.float32))
    # Run the network in test mode and return the most probable label name.
    with chainer.using_config('train', False):
        y = self.classifier.predictor(x)
    i = np.argmax(y.data, axis=1).tolist()[0]
    return self._label_name_from_id(i)
def inference_html(self, tag):
    # Tokenize the tag, project its bag-of-words vector into the LSI topic
    # space, and classify the dense topic vector with logistic regression.
    input_tag_tokenizer = tokenizer.InputTagTokenizer()
    tokens = input_tag_tokenizer.get_attrs_value(tag.html)
    vec_bow = self.dictionary.doc2bow(tokens)
    vec_lsi = self.__sparse_to_dense(self.lsi[vec_bow])
    if len(vec_lsi) == 0:
        # None of the tokens are known to the dictionary; nothing to classify.
        return 'unknown'
    predict_value = self.lr.predict([vec_lsi])[0]
    return self._label_name_from_id(predict_value)
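# --- Standalone sketch (assumption, not from the original source) ---
# Both inference paths above rely on the same gensim conversion: doc2bow over a
# fixed dictionary, then corpus2dense to get a fixed-length float vector. The
# tokens and dictionary below are invented for illustration; only the gensim
# calls mirror the methods above.
from gensim import corpora, matutils as gensim_matutils

_example_docs = [['type=text', 'name=email'], ['type=password', 'name=pass']]
_example_dictionary = corpora.Dictionary(_example_docs)  # token -> id map
_example_bow = _example_dictionary.doc2bow(['type=text', 'name=unknown'])  # unseen tokens are dropped
_example_vec = gensim_matutils.corpus2dense([_example_bow], num_terms=len(_example_dictionary)).T[0]
# _example_vec is a dense vector of length len(_example_dictionary), ready for a classifier.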
def __convert_to_word_vecs(self, records, with_topic=False):
    input_tag_tokenizer = tokenizer.InputTagTokenizer()
    word_vecs = []
    topics = []
    for r in records:
        word_vecs.append(input_tag_tokenizer.get_attrs_value(r.html))
        if with_topic:
            # Note: use canonical topic instead of raw topic in mysql
            topics.append(r.canonical_topic)
    return (word_vecs, topics)
def __convert_training(self, training):
    input_tag_tokenizer = tokenizer.InputTagTokenizer()
    word_vecs = []
    labels = []
    for r in training:
        word_vecs.append(input_tag_tokenizer.get_attrs_value(r.html))
        labels.append(r.label)
    # Map label names to integer ids for the classifier.
    label_types = list(set(labels))
    label_ids = [label_types.index(x) for x in labels]
    return (word_vecs, label_ids, label_types)
def __convert_tests(self, tests):
    input_tag_tokenizer = tokenizer.InputTagTokenizer()
    data = []
    labels = []
    for r in tests:
        if r.label not in self.label_types:
            continue  # skip labels undefined in training data
        tokens = input_tag_tokenizer.get_attrs_value(r.html)
        bow = self.dictionary.doc2bow(tokens)
        vec = matutils.corpus2dense([bow], self.in_units).T[0]
        label_id = self.label_types.index(r.label)
        data.append(np.array(vec).astype(np.float32))
        labels.append(np.int32(label_id))
    return tuple_dataset.TupleDataset(data, labels)
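# --- Standalone sketch (assumption, not from the original source) ---
# How a TupleDataset like the one __convert_tests builds might be evaluated
# with Chainer. ToyMLP, the random data, and the sizes below are invented;
# only TupleDataset / Classifier / Evaluator mirror what the methods above use.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import iterators
from chainer.datasets import tuple_dataset
from chainer.training import extensions


class ToyMLP(chainer.Chain):
    """Stand-in for the predictor behind self.classifier."""

    def __init__(self, n_in, n_out):
        super(ToyMLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, 32)
            self.l2 = L.Linear(32, n_out)

    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))


def _evaluate_sketch(in_units=50, n_labels=3):
    # Fabricated test vectors/labels in the same format __convert_tests produces.
    data = [np.random.rand(in_units).astype(np.float32) for _ in range(10)]
    labels = [np.int32(i % n_labels) for i in range(10)]
    test_set = tuple_dataset.TupleDataset(data, labels)

    model = L.Classifier(ToyMLP(in_units, n_labels))
    it = iterators.SerialIterator(test_set, batch_size=5, repeat=False, shuffle=False)
    result = extensions.Evaluator(it, model)()  # e.g. {'main/loss': ..., 'main/accuracy': ...}
    return result['main/accuracy']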