def build_token2id_label2id_dict(self,
                                 x_train: List[List[str]],
                                 y_train: List[str],
                                 x_validate: List[List[str]] = None,
                                 y_validate: List[str] = None):
    """
    Build the embedding's token->index dict and this model's label->index dict
    from the training (plus optional validation) corpus.

    :param x_train: training samples (token sequences; may be nested one level
                    deeper for multi-input models)
    :param y_train: training labels (iterables of labels when ``self.multi_label``)
    :param x_validate: optional validation samples, merged into the vocabulary
    :param y_validate: optional validation labels, merged into the label set
    """
    if x_validate:
        x_data = [*x_train, *x_validate]
        y_data = [*y_train, *y_validate]
    else:
        x_data = x_train
        y_data = y_train

    # Multi-input data is nested one extra level; flatten it down to a plain
    # list of token sequences (depth 2) before building the vocabulary.
    x_data_level = depth_count(x_data)
    if x_data_level > 2:
        for _ in range(x_data_level - 2):
            x_data = list(chain(*x_data))
    # NOTE: min_count is hard-coded to 3 here (the embedding's own default is 5).
    self.embedding.build_token2idx_dict(x_data, 3)

    if self.multi_label:
        # Each sample carries several labels; collect the union of all of them.
        label_set = set()
        for labels in y_data:
            label_set = label_set.union(list(labels))
    else:
        label_set = set(y_data)

    # Only build the mapping once; index 0 is reserved for the PAD marker.
    if not len(self.label2idx):
        label2idx = {
            k.PAD: 0,
        }
        # FIX: iterate labels in sorted order so the label->index mapping is
        # deterministic across runs (bare set iteration order varies with
        # string hash randomization, which breaks model save/load round-trips).
        for idx, label in enumerate(sorted(label_set)):
            label2idx[label] = idx + 1
        self._label2idx = label2idx
        self._idx2label = {val: key for key, val in label2idx.items()}
    self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))
def build_token2idx_dict(self, x_data: List[TextSeqType], min_count: int = 5):
    """
    Build the token -> index mapping from a corpus, then (re)build the embedding.

    Tokens are ranked by descending frequency and appended after the reserved
    entries in ``self.base_dict``; tokens occurring fewer than ``min_count``
    times are dropped. If ``self.token2idx`` already exists, the vocabulary is
    left untouched and only ``self.build()`` runs.

    :param x_data: token sequences; may be nested deeper for multi-input models
    :param min_count: minimum corpus frequency for a token to be kept
    """
    if self.token2idx is None:
        # Flatten nested (multi-input) data down to one flat token stream.
        data_depth = helper.depth_count(x_data)
        x_items = x_data
        if data_depth > 1:
            for _ in range(data_depth - 1):
                x_items = list(chain(*x_items))
        word_freq = Counter(x_items)
        word2idx = self.base_dict.copy()
        offset = len(word2idx)
        # most_common() yields tokens in descending frequency order (stable on
        # ties), so every token meeting min_count precedes every token that
        # does not — the kept indices stay contiguous starting at ``offset``.
        for idx, (word, freq) in enumerate(word_freq.most_common()):
            if freq >= min_count:
                word2idx[word] = idx + offset
        self.token2idx = word2idx
    self.build()
def predict(self,
            sentence: Union[List[str], List[List[str]], List[List[List[str]]]],
            batch_size=None,
            output_dict=False,
            multi_label_threshold=0.6,
            debug_info=False) -> Union[List[str], str, List[Dict], Dict]:
    """
    Predict labels for a single sentence or a batch of sentences.

    :param sentence: single sentence as List[str], a batch as List[List[str]],
                     or multi-input data nested one level deeper
    :param batch_size: predict batch_size
    :param output_dict: return dict with result with confidence
    :param multi_label_threshold: score cutoff used to binarize multi-label output
    :param debug_info: print debug info using logging.info when True
    :return: a single result when the input was one sentence, else a list;
             dicts when ``output_dict`` is True, otherwise label strings
    """
    # Normalize input to the multi-input shape: a list of input streams, each
    # a list of token sequences. depth 1 = one sentence, depth 2 = a batch.
    sentence_level = depth_count(sentence)
    if sentence_level == 2:
        sentence = [sentence]
    elif sentence_level == 1:
        sentence = [[sentence]]
    padded_tokens = []
    for i, sent_part in enumerate(sentence):
        tokens = self.embedding.tokenize(sent_part)
        if isinstance(self.embedding.sequence_length, int):
            # Single shared sequence length for every input stream.
            padded_tokens_part = sequence.pad_sequences(tokens, maxlen=self.embedding.sequence_length, padding='post')
            padded_tokens.append(padded_tokens_part)
            if self.embedding.is_bert:
                # BERT expects an additional all-zero segment-id input per stream.
                padded_tokens.append(np.zeros(shape=(len(padded_tokens_part), self.embedding.sequence_length)))
        elif isinstance(self.embedding.sequence_length, list):
            # Per-stream sequence lengths for multi-input models.
            padded_tokens_part = sequence.pad_sequences(tokens, maxlen=self.embedding.sequence_length[i], padding='post')
            padded_tokens.append(padded_tokens_part)
            if self.embedding.is_bert:
                padded_tokens.append(np.zeros(shape=(len(padded_tokens_part), self.embedding.sequence_length[i])))
    x = padded_tokens
    res = self.model.predict(x, batch_size=batch_size)
    if self.multi_label:
        if debug_info:
            logging.info('raw output: {}'.format(res))
        # Binarize the sigmoid scores in place against the threshold.
        res[res >= multi_label_threshold] = 1
        res[res < multi_label_threshold] = 0
        predict_result = res
    else:
        predict_result = res.argmax(-1)
    if debug_info:
        logging.info('input: {}'.format(x))
        logging.info('output: {}'.format(res))
        logging.info('output argmax: {}'.format(predict_result))
    if output_dict:
        # NOTE(review): only the first input stream's tokens are echoed back in
        # the dict output — presumably fine for single-input models; confirm
        # the intent for multi-input ones.
        words_list: List[List[str]] = sentence[0]
        results = []
        for index in range(len(words_list)):
            results.append(self._format_output_dic(words_list[index], res[index]))
        # Batched input -> list of results; single sentence -> unwrap to one.
        if sentence_level >= 2:
            return results
        elif sentence_level == 1:
            return results[0]
    else:
        if self.multi_label:
            results = self.multi_label_binarizer.inverse_transform(predict_result)
        else:
            results = self.convert_idx_to_label(predict_result)
        if sentence_level >= 2:
            return results
        elif sentence_level == 1:
            return results[0]
def fit(self,
        x_train: Union[List[List[str]], List[List[List[str]]]],
        y_train: Union[List[str], List[List[str]], List[Tuple[str]]],
        x_validate: Union[List[List[str]], List[List[List[str]]]] = None,
        y_validate: Union[List[str], List[List[str]], List[Tuple[str]]] = None,
        batch_size: int = 64,
        epochs: int = 5,
        class_weight: bool = False,
        fit_kwargs: Dict = None,
        **kwargs):
    """
    Train the model with generators built from the given data.

    :param x_train: list of training data.
    :param y_train: list of training target label data.
    :param x_validate: list of validation data.
    :param y_validate: list of validation target label data.
    :param batch_size: batch size for trainer model
    :param epochs: Number of epochs to train the model.
    :param class_weight: set class weights for imbalanced classes
    :param fit_kwargs: additional kwargs to be passed to
        :func:`~keras.models.Model.fit`
    :param kwargs: ignored (accepted for interface compatibility)
    :return:
    """
    x_train_level = depth_count(x_train)
    if x_train_level == 2:
        assert len(x_train) == len(y_train)
    elif x_train_level > 2:
        # Multi-input model: every input stream must align with the labels.
        for x_part in x_train:
            assert len(x_part) == len(y_train)
    else:
        raise Exception('x_train type error')

    if len(y_train) < batch_size:
        # FIX: shrink the batch for tiny datasets but never below 1 — a zero
        # batch_size (e.g. len(y_train) == 1) crashed steps_per_epoch below
        # with ZeroDivisionError.
        batch_size = max(len(y_train) // 2, 1)

    if not self.model:
        if isinstance(self.embedding.sequence_length, int):
            if self.embedding.sequence_length == 0:
                # Auto-size to the 95th percentile of training lengths.
                self.embedding.sequence_length = sorted([len(x) for x in x_train])[int(0.95 * len(x_train))]
                logging.info('sequence length set to {}'.format(self.embedding.sequence_length))
        elif isinstance(self.embedding.sequence_length, list):
            seq_len = []
            for i, x_part in enumerate(x_train):
                if self.embedding.sequence_length[i] == 0:
                    part_len = max(sorted([len(x) for x in x_part])[int(0.95 * len(x_part))], 1)
                    seq_len.append(part_len)
                    # FIX: log the freshly computed length; the old code logged
                    # self.embedding.sequence_length[i], which is still 0 here.
                    logging.info(f'sequence_{i} length set to {part_len}')
                else:
                    seq_len.append(self.embedding.sequence_length[i])
            self.embedding.sequence_length = seq_len
        self.build_model(x_train, y_train, x_validate, y_validate)

    train_generator = self.get_data_generator(x_train,
                                              y_train,
                                              batch_size,
                                              is_bert=self.embedding.is_bert)

    if fit_kwargs is None:
        fit_kwargs = {}

    if x_validate:
        validation_generator = self.get_data_generator(x_validate,
                                                       y_validate,
                                                       batch_size,
                                                       is_bert=self.embedding.is_bert)
        fit_kwargs['validation_data'] = validation_generator
        fit_kwargs['validation_steps'] = max(len(y_validate) // batch_size, 1)

    if class_weight:
        if self.multi_label:
            # Flatten multi-label samples into one label list for weighting.
            y_list = [self.convert_label_to_idx(y) for y in y_train]
            y_list = [y for ys in y_list for y in ys]
        else:
            y_list = self.convert_label_to_idx(y_train)
        class_weights = class_weight_calculte.compute_class_weight('balanced',
                                                                   np.unique(y_list),
                                                                   y_list)
    else:
        class_weights = None

    self.model.fit_generator(train_generator,
                             steps_per_epoch=max(len(y_train) // batch_size, 1),
                             epochs=epochs,
                             class_weight=class_weights,
                             **fit_kwargs)
def get_data_generator(self,
                       x_data: Union[List[List[str]], List[List[List[str]]]],
                       y_data: List[str],
                       batch_size: int = 64,
                       is_bert: bool = False):
    """
    Endless batch generator for ``fit_generator``: shuffles page order each
    epoch and yields ``(x_input_data, padded_y)`` tuples.

    :param x_data: token sequences; single-input (depth 2) data is wrapped
                   into the multi-input shape internally
    :param y_data: labels aligned with every input stream
    :param batch_size: number of samples per yielded batch
    :param is_bert: when True, interleave an all-zero segment-id array after
                    each padded input, as BERT expects
    """
    x_data_level = depth_count(x_data)
    if x_data_level == 2:
        # Normalize single-input data to the multi-input shape.
        x_data = [x_data]
    data_len = len(y_data)
    for x in x_data:
        assert len(x) == data_len
    while True:
        # Visit pages in a fresh random order every epoch.
        page_list = list(range((data_len // batch_size) + 1))
        random.shuffle(page_list)
        for page in page_list:
            start_index = page * batch_size
            end_index = start_index + batch_size
            target_x = [x[start_index: end_index] for x in x_data]
            target_y = y_data[start_index: end_index]
            if len(target_x[0]) == 0:
                # Empty last page (data_len divisible by batch_size): fall back
                # to the first page. FIX: rebuild the whole list — the old code
                # popped a single element before re-appending every stream,
                # yielding duplicated inputs for multi-input models.
                target_x = [x[0: batch_size] for x in x_data]
                target_y = y_data[0: batch_size]
            padded_x = []
            for i, x in enumerate(target_x):
                tokenized_x = self.embedding.tokenize(x)
                if isinstance(self.embedding.sequence_length, int):
                    padded_x.append(sequence.pad_sequences(tokenized_x,
                                                           maxlen=self.embedding.sequence_length,
                                                           padding='post'))
                elif isinstance(self.embedding.sequence_length, list):
                    padded_x.append(sequence.pad_sequences(tokenized_x,
                                                           maxlen=self.embedding.sequence_length[i],
                                                           padding='post'))
            if self.multi_label:
                padded_y = self.multi_label_binarizer.fit_transform(target_y)
            else:
                tokenized_y = self.convert_label_to_idx(target_y)
                # FIX: np.int was removed in NumPy 1.24; the builtin int is the
                # exact dtype np.int aliased.
                padded_y = to_categorical(tokenized_y,
                                          num_classes=len(self.label2idx),
                                          dtype=int)
            if is_bert:
                if isinstance(self.embedding.sequence_length, int):
                    padded_x_seg = [np.zeros(shape=(len(padded_x_i), self.embedding.sequence_length))
                                    for padded_x_i in padded_x]
                elif isinstance(self.embedding.sequence_length, list):
                    padded_x_seg = [np.zeros(shape=(len(padded_x_i), self.embedding.sequence_length[i]))
                                    for i, padded_x_i in enumerate(padded_x)]
                # Interleave each input with its segment-id array.
                x_input_data = list(chain(*zip(padded_x, padded_x_seg)))
            else:
                x_input_data = padded_x[0] if x_data_level == 2 else padded_x
            yield (x_input_data, padded_y)