def __init__(self, model_path, tag_lookup_file=None, vocabulary_lookup_file=None):
    self.tag_lookup_table = Lookuper.load_from_file(tag_lookup_file)
    self.vocabulary_lookup_table = Lookuper.load_from_file(vocabulary_lookup_file)

    super(KerasInferenceBase, self).__init__(model_path)
def __init__(self, model_path, tag_lookup_file=None, vocabulary_lookup_file=None):
    # load model
    self.model_dir = model_path  # TODO: temp bugfix
    self.model = tf.keras.models.load_model(
        model_path,
        custom_objects={
            "crf_accuracy": crf_accuracy,
            "sequence_span_accuracy": sequence_span_accuracy,
        },
    )
    self.predict_fn = self.model.predict

    self.tag_lookup_table = Lookuper.load_from_file(tag_lookup_file)
    self.vocabulary_lookup_table = Lookuper.load_from_file(vocabulary_lookup_file)
def __init__(self, model_path, tag_lookup_file=None, vocabulary_lookup_file=None):
    # load model
    self.model_dir = model_path  # TODO: temp bugfix
    self.model = tf.keras.models.load_model(model_path)
    self.predict_fn = self.model.predict

    self.tag_lookup_table = Lookuper.load_from_file(tag_lookup_file)
    self.vocabulary_lookup_table = Lookuper.load_from_file(vocabulary_lookup_file)
def _keras_train(self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any) -> None:
    from tensorflow.python.keras.layers import Input, Masking
    from tensorflow.python.keras.models import Sequential
    from tf_crf_layer.layer import CRF
    from tf_crf_layer.loss import crf_loss
    from tf_crf_layer.metrics import crf_accuracy
    from seq2annotation.input import generate_tagset
    from seq2annotation.input import build_input_func
    from seq2annotation.input import Lookuper

    config = self.component_config
    if 'result_dir' not in config:
        config['result_dir'] = tempfile.mkdtemp()

    # read data according to the configuration
    train_data_generator_func = kwargs.get('addons_tf_input_fn')
    corpus_meta_data = kwargs.get('addons_tf_input_meta')

    config['tags_data'] = generate_tagset(corpus_meta_data['tags'])

    # train and evaluate model
    train_input_func = build_input_func(train_data_generator_func, config)

    tag_lookuper = Lookuper({v: i for i, v in enumerate(config['tags_data'])})

    maxlen = 25

    offset_data = train_input_func()
    train_x, train_y = self._keras_data_preprocss(offset_data, tag_lookuper, maxlen)

    EPOCHS = 1
    tag_size = tag_lookuper.size()

    model = Sequential()
    model.add(Input(shape=(maxlen, 768)))
    model.add(Masking())
    model.add(CRF(tag_size))
    # compile once, with the CRF loss and accuracy metric
    model.compile('adam', loss=crf_loss, metrics=[crf_accuracy])
    model.summary()

    model.fit(train_x, train_y, epochs=EPOCHS)
def get_input_data():
    config = read_configure()
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()
    tags_data = generate_tagset(corpus_meta_data['tags'])

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    tag_lookup = Lookuper({v: i for i, v in enumerate(tags_data)})

    vocab_data_file = './data/unicode_char_list.txt'
    vocabulary_lookup = index_table_from_file(vocab_data_file)

    train_x, train_y = preprocss(train_data, tag_lookup, vocabulary_lookup)
    if eval_data:
        test_x, test_y = preprocss(eval_data, tag_lookup, vocabulary_lookup)
    else:
        test_x, test_y = None, None

    return config, (train_x, train_y), (test_x, test_y), tag_lookup, vocabulary_lookup
def preprocss(data, maxlen=None, intent_lookup_table=None):
    # NOTE: relies on the module-level `config`, `tag_lookuper` and `vocabulary_lookuper`
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        label = (
            offset_data.extra_attr[config['intent_field']]
            if config['intent_field'] not in ["label"]
            else getattr(offset_data, config['intent_field'])
        )

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        raw_intent_set = list(set(raw_intent))
        intent_lookup_table = Lookuper({v: i for i, v in enumerate(raw_intent_set)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    if not maxlen:
        maxlen = max(len(s) for s in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding

    # left padded with -1. Indeed, any integer works as it will be masked
    # y_pos = pad_sequences(y_pos, maxlen, value=-1)
    # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    intent_np_array = np.array(intent_int_list)
    intent_one_hot = one_hot(intent_np_array, np.max(intent_np_array) + 1)

    return x, intent_one_hot, y, intent_lookup_table
def test_build(datadir, tmpdir):
    processor_builder = ProcessorBuilder()

    # setup test processor
    lookup_processor = LookupProcessor()

    vocabulary_lookup_table = Lookuper({"a": 1, "b": 2, "c": 3})
    lookup_processor.add_vocabulary_lookup_table(vocabulary_lookup_table)

    tag_lookup_table = Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3})
    lookup_processor.add_tag_lookup_table(tag_lookup_table)

    lookup_processor_handle = processor_builder.add_processor(lookup_processor)
    processor_builder.add_preprocess(lookup_processor_handle)
    processor_builder.add_postprocess(lookup_processor_handle)

    processor_builder.save()

    config = processor_builder.serialize(tmpdir)

    dircmp_obj = filecmp.dircmp(datadir, tmpdir)
    assert not dircmp_obj.diff_files

    assert config == {
        "version": "1.0",
        "instance": {
            "LookupProcessor_0": {
                "class": "deliverable_model.builtin.processor.lookup_processor.LookupProcessor",
                "parameter": {
                    "lookup_table": ["vocabulary", "tag"],
                    "padding_parameter": {},
                },
            }
        },
        "pipeline": {
            "pre": ["LookupProcessor_0"],
            "post": ["LookupProcessor_0"],
        },
    }

    assert processor_builder.get_dependency() == ["seq2annotation"]
def index_table_from_corpus(corpus=None):
    from seq2annotation.input import Lookuper

    index_table = {}

    tmp_text_list = [sample.text for sample in corpus]

    text_list = []
    for text in tmp_text_list:
        text_list.extend(text)

    text_list = sorted(set(text_list))
    for index, word in enumerate(text_list):
        # start indices at 1 so that id 0 stays free (e.g. for padding)
        index_table[word] = index + 1

    return Lookuper(index_table)
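# Hedged usage sketch for `index_table_from_corpus` above: build a character-level
# vocabulary straight from a corpus. `Corpus.read_from_file` and per-character
# `lookup` are the APIs used elsewhere in this code base; the file path is a placeholder.
#
#     corpus = Corpus.read_from_file("./data/train.conllx")
#     vocabulary_lookuper = index_table_from_corpus(corpus)
#     word_ids = [vocabulary_lookuper.lookup(char) for char in "some text"]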
def test_build(datadir, tmpdir):
    lookup_processor = LookupProcessor()

    vocabulary_lookup_table = Lookuper({"a": 1, "b": 2, "c": 3})
    lookup_processor.add_vocabulary_lookup_table(vocabulary_lookup_table)

    tag_lookup_table = Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3})
    lookup_processor.add_tag_lookup_table(tag_lookup_table)

    assert lookup_processor.get_config() == {
        "lookup_table": ["vocabulary", "tag"],
        "padding_parameter": {},
    }

    lookup_processor.serialize(tmpdir)

    match, mismatch, errors = filecmp.cmpfiles(
        datadir, tmpdir, ["tag", "vocabulary"], shallow=False)
    assert len(match) == 2
def preprocss(data, intent_lookup_table=None):
    # NOTE: relies on the module-level `tag_lookuper` and `vocabulary_lookuper`
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        label = offset_data.label

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        raw_intent_set = list(set(raw_intent))
        intent_lookup_table = Lookuper({v: i for i, v in enumerate(raw_intent_set)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    maxlen = max(len(s) for s in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding

    # left padded with -1. Indeed, any integer works as it will be masked
    # y_pos = pad_sequences(y_pos, maxlen, value=-1)
    # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
def build_vacablookuper_from_corpus(*corpus_tuples):
    char_list = []
    for corpus in corpus_tuples:
        for sample in corpus:
            char_list.extend(sample.text)

    char_list = sorted(set(char_list))
    char_list.insert(0, '<pad>')  # '<pad>' maps to index 0, matching the padding value used elsewhere

    index_table = {}
    index_counter = 0
    for key in char_list:
        index_table[key] = index_counter
        index_counter += 1

    return Lookuper(index_table)
def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    raw_x = []
    raw_y = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text

        tag_ids = [tag_lookup_table.lookup(i) for i in tags]
        word_ids = [vocabulary_look_table.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)

    if not seq_maxlen:
        seq_maxlen = max(len(s) for s in raw_x)

    print(">>> maxlen: {}".format(seq_maxlen))

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, seq_maxlen, padding="post")  # right padding

    # left padded with -1. Indeed, any integer works as it will be masked
    # y_pos = pad_sequences(y_pos, maxlen, value=-1)
    # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, seq_maxlen, value=0, padding="post")

    return x, y, seq_maxlen
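# Hedged usage sketch for `preprocess` above, wiring it to the lookupers built
# elsewhere in this code base. `train_data` / `eval_data` are assumed to be lists of
# offset-annotated samples such as those yielded by corpus.get_generator_func();
# the vocabulary path is a placeholder.
#
#     tag_lookup_table = Lookuper({v: i for i, v in enumerate(generate_tagset(corpus_meta_data["tags"]))})
#     vocabulary_lookup_table = index_table_from_file("data/unicode_char_list.txt")
#     train_x, train_y, seq_maxlen = preprocess(train_data, tag_lookup_table, vocabulary_lookup_table)
#     # reuse the training maxlen so train/eval tensors share the same sequence length
#     test_x, test_y, _ = preprocess(eval_data, tag_lookup_table, vocabulary_lookup_table, seq_maxlen)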
def build_vacablookuper_from_list(*lists):
    char_list = []
    for ls in lists:
        char_list.extend(ls)

    char_list = sorted(set(char_list))
    if char_list[0] != '<pad>':
        char_list.insert(0, '<pad>')

    index_table = {}
    index_counter = 1
    for key in char_list:
        index_table[key] = index_counter
        index_counter += 1

    return Lookuper(index_table)
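# Hedged usage sketch for `build_vacablookuper_from_list` above (its corpus-based
# twin is `build_vacablookuper_from_corpus`). Assuming Lookuper.lookup returns the
# mapped index and Lookuper.size the table size, note that ids here start at 1,
# so '<pad>' gets 1 and id 0 is left unused:
#
#     vocab = build_vacablookuper_from_list("abc", "bcd")
#     assert vocab.lookup("<pad>") == 1
#     assert vocab.lookup("a") == 2
#     assert vocab.size() == 5  # <pad>, a, b, c, d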
def load(cls, parameter: dict, asset_dir) -> "ProcessorBase":
    from seq2annotation.input import Lookuper

    lookup_table_registry = {}
    for instance_name in parameter["lookup_table"]:
        instance_asset = asset_dir / instance_name
        lookup_table_instance = Lookuper.load_from_file(instance_asset)
        lookup_table_registry[instance_name] = lookup_table_instance

    init_parameter = copy.deepcopy(parameter)
    init_parameter.pop("lookup_table")
    init_parameter["lookup_table_registry"] = lookup_table_registry

    self = cls(**init_parameter)
    return self
config = read_configure()
corpus = get_corpus_processor(config)
corpus.prepare()
train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

corpus_meta_data = corpus.get_meta_info()
raw_tag_data = corpus_meta_data['tags']
tags_data = generate_tagset(corpus_meta_data['tags'])

train_data = list(train_data_generator_func())
eval_data = list(eval_data_generator_func())

tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})

vocab_data_file = os.path.join(os.path.dirname(__file__), '../data/unicode_char_list.txt')
vocabulary_lookuper = index_table_from_file(vocab_data_file)


def classification_report(y_true, y_pred, labels):
    """
    Similar to the one in sklearn.metrics,
    reports per class recall, precision and F1 score
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    corrects = Counter(yt for yt, yp in zip(y_true, y_pred) if yt == yp)
    y_true_counts = Counter(y_true)
    y_pred_counts = Counter(y_pred)
def test_build(datadir, tmpdir):
    deliverable_model_builder = DeliverableModelBuilder(tmpdir)

    # metadata builder
    metadata_builder = MetadataBuilder()
    meta_content = MetaContent("algorithmId-corpusId-configId-runId")
    metadata_builder.set_meta_content(meta_content)
    metadata_builder.save()

    # processor builder
    processor_builder = ProcessorBuilder()

    lookup_processor = LookupProcessor()

    vocabulary_lookup_table = Lookuper({"a": 1, "b": 2, "c": 3})
    lookup_processor.add_vocabulary_lookup_table(vocabulary_lookup_table)

    tag_lookup_table = Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3})
    lookup_processor.add_tag_lookup_table(tag_lookup_table)

    lookup_processor_handle = processor_builder.add_processor(lookup_processor)
    processor_builder.add_preprocess(lookup_processor_handle)
    processor_builder.add_postprocess(lookup_processor_handle)

    processor_builder.save()

    # model builder
    model_builder = ModelBuilder()
    model_builder.add_keras_h5_model(datadir / "fixture" / "keras_h5_model")
    model_builder.save()

    # assemble the deliverable model
    deliverable_model_builder.add_processor(processor_builder)
    deliverable_model_builder.add_metadata(metadata_builder)
    deliverable_model_builder.add_model(model_builder)

    metadata = deliverable_model_builder.save()

    assert metadata == {
        "version": "1.0",
        "dependency": ["seq2annotation", "tensorflow"],
        "processor": {
            "version": "1.0",
            "instance": {
                "LookupProcessor_0": {
                    "class": "deliverable_model.builtin.processor.lookup_processor.LookupProcessor",
                    "parameter": {
                        "lookup_table": ["vocabulary", "tag"],
                        "padding_parameter": {},
                    },
                }
            },
            "pipeline": {
                "pre": ["LookupProcessor_0"],
                "post": ["LookupProcessor_0"],
            },
        },
        "model": {
            "version": "1.0",
            "type": "keras_h5_model",
            "custom_object_dependency": [],
            "converter_for_request": {
                "class_name": "deliverable_model.builder.model.model_builder.SimpleConverterForRequest",
                "config": {},
            },
            "converter_for_response": {
                "class_name": "deliverable_model.builder.model.model_builder.SimpleConverterForResponse",
                "config": {},
            },
        },
        "metadata": {
            "version": "1.0",
            "id": "algorithmId-corpusId-configId-runId",
        },
    }

    # compare the serialized artifacts with the expected fixture
    dircmp_obj = filecmp.dircmp(datadir / "expected", tmpdir)
    assert not dircmp_obj.diff_files
def main():
    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")
    if not vocab_data_file:
        # load built-in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt"
        )

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocss(data, maxlen, **kwargs):
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post"
        )

        from keras.utils import to_categorical
        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]
        y_cls = to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x, y_ner, y_cls

    # get parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})

    # get train/test data for training the model
    vacab_size = vocabulary_lookuper.size()
    tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    train_x, train_y_ner, train_y_cls = preprocss(
        train_data, MAX_SENTENCE_LEN, **{'cls_dims': label_size})
    test_x, test_y_ner, test_y_cls = preprocss(
        eval_data, MAX_SENTENCE_LEN, **{'cls_dims': label_size})

    # build model
    input_length = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_length,), dtype='float', name='input_layer')

    # encoder
    with tf.keras.backend.name_scope("Encoder"):
        embedding_layer = Embedding(vacab_size,
                                    EMBED_DIM,
                                    mask_zero=True,
                                    input_length=input_length,
                                    name='embedding')(input_layer)

    # feature extractor
    with tf.keras.backend.name_scope("biLSTM"):
        if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
            embedding_layer = BatchNormalization()(embedding_layer)

        biLSTM = embedding_layer
        for bilstm_config in BiLSTM_STACK_CONFIG:
            biLSTM = Bidirectional(
                LSTM(return_sequences=True, **bilstm_config, name='biLSTM'))(biLSTM)

        if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
            biLSTM = BatchNormalization()(biLSTM)

    if USE_ATTENTION_LAYER:
        biLSTM = GlobalAttentionLayer()(biLSTM)

    # NER branch
    with tf.keras.backend.name_scope("NER_branch"):
        crf = CRF(tag_size, name="crf", **CRF_PARAMS)(biLSTM)
        loss_func = ConditionalRandomFieldLoss()

    # classification branch
    chosen = 'lstm_cls'
    with tf.keras.backend.name_scope("CLS_branch"):
        from tensorflow.keras.layers import Dense, Flatten, Dropout

        # add paragraph vector
        #paragraph_vector = get_paragraph_vector(embedding_layer)

        if chosen == "lstm_cls":
            cls_flat_lstm = Flatten()(biLSTM)
            #cls_flat_lstm = tf.keras.layers.concatenate([cls_flat_lstm, paragraph_vector])
            classification_dense = Dropout(0.2)(cls_flat_lstm)
            classification_dense = SetLearningRate(
                Dense(label_size, activation='sigmoid', name='CLS'),
                lr=0.001,
                is_ada=True)(classification_dense)

        elif chosen == "conv_cls":
            from tensorflow.keras.layers import Conv1D, MaxPooling1D

            embedding_layer = BatchNormalization()(embedding_layer)
            cls_conv_emb = Conv1D(32, 3, activation='relu', padding='same')(embedding_layer)
            cls_conv_emb = Conv1D(64, 3, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=1, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=2, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=5, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(256, 1, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_flat = BatchNormalization()(cls_conv_emb)
            cls_flat = Flatten()(cls_flat)
            classification_dense = Dropout(0.2)(cls_flat)
            classification_dense = Dense(label_size, activation='sigmoid', name='CLS')(classification_dense)

    # merge NER and classification branches
    model = Model(inputs=[input_layer], outputs=[crf, classification_dense])

    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir='.\\results\\summary_log_dir',
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []
    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    # early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',  # early stop index
    #                                               patience=3,          # early stop delay epoch
    #                                               verbose=2,           # display mode
    #                                               mode='auto')
    # callbacks_list.append(early_stop)

    from mtnlpmodel.trainer.loss_func_util import FocalLoss
    adam_optimizer = tf.keras.optimizers.Adam(
        learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)

    model.compile(
        optimizer=adam_optimizer,
        #loss={'crf': loss_func, 'CLS': 'sparse_categorical_crossentropy'},
        loss={'crf': loss_func, 'CLS': FocalLoss()},
        loss_weights={'crf': 1., 'CLS': 100},  # set weight of loss
        #metrics={'crf': SequenceCorrectness(), 'CLS': 'sparse_categorical_accuracy'}
        metrics={'crf': SequenceCorrectness(), 'CLS': 'categorical_accuracy'})

    model.fit(
        train_x,
        {'crf': train_y_ner, 'CLS': train_y_cls},
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x, {'crf': test_y_ner, 'CLS': test_y_cls}],
        callbacks=callbacks_list,
    )

    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"])
    )

    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForRequest(),
        converter_for_response=ConverterForMTResponse(),
        lookup_tables={'vocab_lookup': vocabulary_lookuper,
                       'tag_lookup': ner_tag_lookuper,
                       'label_lookup': cls_tag_lookuper},
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
def id_to_str(id_list: List[int], vocabulary_look_table: Lookuper) -> List[str]:
    str_list = [vocabulary_look_table.inverse_lookup(id) for id in id_list]
    return str_list
def str_to_id(string: Union[str, List[str]], vocabulary_look_table: Lookuper) -> List[int]:
    id_list = [vocabulary_look_table.lookup(i) for i in string]
    return id_list
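# Hedged round-trip sketch for str_to_id / id_to_str above. It assumes the Lookuper's
# `lookup` / `inverse_lookup` pair is a bijection for characters that are actually in
# the vocabulary file (the path below is a placeholder):
#
#     vocabulary_look_table = index_table_from_file("data/unicode_char_list.txt")
#     ids = str_to_id("abc", vocabulary_look_table)
#     assert id_to_str(ids, vocabulary_look_table) == ["a", "b", "c"]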
def main():
    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")
    if not vocab_data_file:
        # load built-in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt")

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocss(data, maxlen):
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post")

        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]

        return x, y_ner, y_cls

    # get parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.0001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])

    # get train/test data for training the model
    train_x, train_y_ner, train_y_cls = preprocss(train_data, MAX_SENTENCE_LEN)
    test_x, test_y_ner, test_y_cls = preprocss(eval_data, MAX_SENTENCE_LEN)

    vacab_size = vocabulary_lookuper.size()
    # tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    # fine-tuning related code
    adam_optimizer = tf.keras.optimizers.Adam(
        learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)

    index_dict = {
        'optimizer': adam_optimizer,
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['sparse_categorical_accuracy'],
    }

    warm_start_list = ['embedding', 'bidirectional', 'batch_normalization']  # layers in this list are frozen
    backbone_model_path = './mtnlpmodel/trainer/fine_tuning_trainer/save_weights/weights.h5'
    output_dims = label_size

    # model structure related code
    # define a new output layer for the task
    new_task_output_layer = Dense(output_dims, activation='softmax')  # new softmax layer -> output

    input_shape = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_shape,), dtype='int32', name='input_layer')  # input

    # The backbone plus the transfer_learning function can reuse the backbone for a different task:
    # all you need is to define a new output layer (new_task_output_layer above).
    # This sample only covers text classification: the backbone output is the biLSTM output,
    # so transfer_learning adds a Flatten layer to connect the Dense layer.
    # Modify the structure inside transfer_learning to match your own task.
    base_model = backbone_network(
        BiLSTM_STACK_CONFIG,
        input_layer=input_layer,
        vacab_size=vacab_size,
        EMBED_DIM=EMBED_DIM,
        input_length=MAX_SENTENCE_LEN,
    )

    new_model = transfer_learning(input_shape, input_layer, base_model,
                                  new_task_output_layer, index_dict,
                                  backbone_model_path, warm_start_list)

    # model output info related code
    new_model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir='.\\results\\summary_log_dir',
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []
    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',  # early stop index
        patience=3,          # early stop delay epoch
        verbose=2,           # display mode
        mode='auto')
    callbacks_list.append(early_stop)

    new_model.fit(
        train_x,
        train_y_cls,
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x, test_y_cls],
        callbacks=callbacks_list,
    )

    new_model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        new_model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=ner_tag_lookuper,
        label_lookup_table=cls_tag_lookuper,
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post",
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
def input_data_process(config, **hyperparams):
    # read NER/CLS data individually (only *.conllx is supported)
    input_mode = config['input_mode']

    if input_mode == 'multi':  # multi input
        data_ner = Corpus.read_from_file(config['ner_data'])
        data_cls = Corpus.read_from_file(config['cls_data'])
    else:  # single input
        data = Corpus.read_from_file(config['data'])
        data_ner = data
        data_cls = data

    # get train/test corpus
    ner_tags = get_tag_from_corpus(data_ner)
    cls_labels = get_label_from_corpus(data_cls)

    test_ratio = config['test_ratio']
    ner_train_data, ner_eval_data = data_ner.train_test_split(
        test_size=test_ratio, random_state=50)
    cls_train_data, cls_eval_data = data_cls.train_test_split(
        test_size=test_ratio, random_state=50)

    ner_data_tuple, cls_data_tuple = random_sampling_to_samesize(
        (ner_train_data, ner_eval_data),  # mainly for multi input
        (cls_train_data, cls_eval_data))  # make sure the cls & ner datasets have the same size
    ner_train_data, ner_eval_data = ner_data_tuple
    cls_train_data, cls_eval_data = cls_data_tuple

    # build lookupers
    ner_tag_lookuper = Lookuper(
        {v: i for i, v in enumerate(generate_tagset(ner_tags))})
    cls_label_lookuper = Lookuper({v: i for i, v in enumerate(cls_labels)})

    vocab_data_file = config.get("vocabulary_file", None)
    if not vocab_data_file:
        # build vocabulary from the corpus
        vocabulary_lookuper = build_vacablookuper_from_corpus(*(data_ner, data_cls))  # from corpus
    else:
        vocabulary_lookuper = index_table_from_file(vocab_data_file)  # from vocab_file

    # ner (data & tag) str -> int
    def ner_preprocss(data, maxlen, cls_info_len):
        raw_x_ner = []
        raw_y_ner = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x_ner.append(word_ids)
            raw_y_ner.append(tag_ids)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x_ner)

        maxlen_mt = maxlen + cls_info_len

        print(">>> maxlen: {}".format(maxlen))

        x_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x_ner, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post")
        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            y_ner, maxlen_mt, value=0, padding="pre")

        return x_ner, y_ner

    # cls (data & label) str -> int
    def cls_preprocss(data, maxlen, **kwargs):
        raw_x_cls = []
        raw_y_cls = []

        for offset_data in data:
            label = offset_data.label
            words = offset_data.text

            label_id = cls_label_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x_cls.append(word_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x_cls)

        print(">>> maxlen: {}".format(maxlen))

        x_cls = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x_cls, maxlen, padding="post")  # right padding

        from keras.utils import to_categorical
        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]
        y_cls = to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x_cls, y_cls

    ner_train_x, ner_train_y = ner_preprocss(
        ner_train_data, hyperparams['MAX_SENTENCE_LEN'],
        hyperparams['CLS2NER_KEYWORD_LEN'])
    ner_test_x, ner_test_y = ner_preprocss(
        ner_eval_data, hyperparams['MAX_SENTENCE_LEN'],
        hyperparams['CLS2NER_KEYWORD_LEN'])

    cls_train_x, cls_train_y = cls_preprocss(
        cls_train_data, hyperparams['MAX_SENTENCE_LEN'],
        **{'cls_dims': cls_label_lookuper.size()})
    cls_test_x, cls_test_y = cls_preprocss(
        cls_eval_data, hyperparams['MAX_SENTENCE_LEN'],
        **{'cls_dims': cls_label_lookuper.size()})

    #cls_class_weight = get_class_weight(cls_train_data, cls_label_lookuper)

    output_dict = {
        'ner_train_x': ner_train_x,
        'ner_train_y': ner_train_y,
        'ner_test_x': ner_test_x,
        'ner_test_y': ner_test_y,
        'cls_train_x': cls_train_x,
        'cls_train_y': cls_train_y,
        'cls_test_x': cls_test_x,
        'cls_test_y': cls_test_y,
        'ner_tag_lookuper': ner_tag_lookuper,
        'cls_label_lookuper': cls_label_lookuper,
        'vocabulary_lookuper': vocabulary_lookuper,
    }

    return output_dict
def main():
    config = read_configure()

    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    tags_data = generate_tagset(corpus_meta_data["tags"])

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})

    vocab_data_file = config.get("vocabulary_file")
    if not vocab_data_file:
        # load built-in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt")

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocss(data, maxlen):
        raw_x = []
        raw_y = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            tag_ids = [tag_lookuper.lookup(i) for i in tags]
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y.append(tag_ids)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        # left padded with -1. Indeed, any integer works as it will be masked
        # y_pos = pad_sequences(y_pos, maxlen, value=-1)
        # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
        y = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y, maxlen, value=0, padding="post")

        return x, y

    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    train_x, train_y = preprocss(train_data, MAX_SENTENCE_LEN)
    test_x, test_y = preprocss(eval_data, MAX_SENTENCE_LEN)

    EPOCHS = config["epochs"]
    EMBED_DIM = config["embedding_dim"]
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})

    vacab_size = vocabulary_lookuper.size()
    tag_size = tag_lookuper.size()

    model = Sequential()

    model.add(
        Embedding(vacab_size, EMBED_DIM, mask_zero=True, input_length=MAX_SENTENCE_LEN))

    if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
        model.add(BatchNormalization())

    for bilstm_config in BiLSTM_STACK_CONFIG:
        model.add(Bidirectional(LSTM(return_sequences=True, **bilstm_config)))

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        model.add(BatchNormalization())

    if USE_ATTENTION_LAYER:
        model.add(GlobalAttentionLayer())

    model.add(CRF(tag_size, name="crf", **CRF_PARAMS))

    # print model summary
    model.summary()

    callbacks_list = []

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=create_dir_if_needed(config["summary_log_dir"]))
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    metrics_list = []
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(SequenceSpanAccuracy())

    loss_func = ConditionalRandomFieldLoss()
    # loss_func = crf_loss

    model.compile("adam", loss={"crf": loss_func}, metrics=metrics_list)
    model.fit(
        train_x,
        train_y,
        epochs=EPOCHS,
        validation_data=[test_x, test_y],
        callbacks=callbacks_list,
    )

    # Save the model
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_dir_if_needed(config["saved_model_dir"]))

    export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=tag_lookuper,
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post",
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )