示例#1
0
    def __init__(self,
                 model_path,
                 tag_lookup_file=None,
                 vocabulary_lookup_file=None):
        """Load the tag and vocabulary lookup tables, then initialise the base class.

        Args:
            model_path: forwarded unchanged to the parent ``__init__``.
            tag_lookup_file: passed to ``Lookuper.load_from_file`` to build
                ``self.tag_lookup_table``.
            vocabulary_lookup_file: passed to ``Lookuper.load_from_file`` to
                build ``self.vocabulary_lookup_table``.
        """
        load_table = Lookuper.load_from_file

        # the tables are set up before the parent constructor runs
        self.tag_lookup_table = load_table(tag_lookup_file)
        self.vocabulary_lookup_table = load_table(vocabulary_lookup_file)

        super(KerasInferenceBase, self).__init__(model_path)
示例#2
0
    def __init__(self, model_path, tag_lookup_file=None, vocabulary_lookup_file=None):
        """Load a Keras model (with its custom metrics) and both lookup tables.

        Args:
            model_path: location handed to ``tf.keras.models.load_model``; also
                kept on the instance as ``model_dir``.
            tag_lookup_file: passed to ``Lookuper.load_from_file``.
            vocabulary_lookup_file: passed to ``Lookuper.load_from_file``.
        """
        self.model_dir = model_path

        # TODO: temp bugfix
        # the metric functions are registered under the names given here so
        # that load_model can resolve them
        metric_objects = {
            "crf_accuracy": crf_accuracy,
            "sequence_span_accuracy": sequence_span_accuracy,
        }
        self.model = tf.keras.models.load_model(model_path, custom_objects=metric_objects)
        self.predict_fn = self.model.predict

        load_table = Lookuper.load_from_file
        self.tag_lookup_table = load_table(tag_lookup_file)
        self.vocabulary_lookup_table = load_table(vocabulary_lookup_file)
    def __init__(self,
                 model_path,
                 tag_lookup_file=None,
                 vocabulary_lookup_file=None):
        """Load a Keras model and the tag / vocabulary lookup tables.

        Args:
            model_path: location handed to ``tf.keras.models.load_model``; also
                kept on the instance as ``model_dir``.
            tag_lookup_file: passed to ``Lookuper.load_from_file``.
            vocabulary_lookup_file: passed to ``Lookuper.load_from_file``.
        """
        self.model_dir = model_path

        # TODO: temp bugfix
        loaded_model = tf.keras.models.load_model(model_path)
        self.model = loaded_model
        self.predict_fn = loaded_model.predict

        load_table = Lookuper.load_from_file
        self.tag_lookup_table = load_table(tag_lookup_file)
        self.vocabulary_lookup_table = load_table(vocabulary_lookup_file)
示例#4
0
    def _keras_train(self, training_data: TrainingData,
                     cfg: RasaNLUModelConfig, **kwargs: Any) -> None:
        """Fit a Masking + CRF Keras model on the data supplied through ``kwargs``.

        Args:
            training_data: not read by this method (kept for the interface).
            cfg: not read by this method (kept for the interface).
            **kwargs: must carry ``addons_tf_input_fn`` (a data generator
                function) and ``addons_tf_input_meta`` (a mapping with a
                ``'tags'`` entry).

        Side effects:
            Writes ``result_dir`` (a fresh temp dir, if absent) and
            ``tags_data`` into ``self.component_config``.
        """
        from tensorflow.python.keras.layers import Input, Masking
        from tensorflow.python.keras.models import Sequential
        from tf_crf_layer.layer import CRF
        from tf_crf_layer.loss import crf_loss
        from tf_crf_layer.metrics import crf_accuracy
        from seq2annotation.input import generate_tagset
        from seq2annotation.input import build_input_func
        from seq2annotation.input import Lookuper

        config = self.component_config

        if 'result_dir' not in config:
            config['result_dir'] = tempfile.mkdtemp()

        # read data according configure
        train_data_generator_func = kwargs.get('addons_tf_input_fn')
        corpus_meta_data = kwargs.get('addons_tf_input_meta')

        config['tags_data'] = generate_tagset(corpus_meta_data['tags'])

        # train and evaluate model
        train_input_func = build_input_func(train_data_generator_func, config)

        tag_lookuper = Lookuper(
            {tag: index
             for index, tag in enumerate(config['tags_data'])})

        # sequence length and per-step feature width of the model input;
        # maxlen is shared by preprocessing and the Input layer so that the
        # two can never disagree
        maxlen = 25
        feature_dim = 768

        offset_data = train_input_func()
        train_x, train_y = self._keras_data_preprocss(offset_data,
                                                      tag_lookuper, maxlen)

        EPOCHS = 1

        tag_size = tag_lookuper.size()

        model = Sequential()
        model.add(Input(shape=(maxlen, feature_dim)))
        model.add(Masking())
        model.add(CRF(tag_size))
        model.summary()

        # compile once, directly with the metric (the original compiled twice
        # and the first call was immediately overridden)
        model.compile('adam', loss=crf_loss, metrics=[crf_accuracy])
        model.fit(train_x, train_y, epochs=EPOCHS)
示例#5
0
def get_input_data():
    """Read the configuration and corpus, and return encoded train / test data.

    Returns:
        A 5-tuple ``(config, (train_x, train_y), (test_x, test_y),
        tag_lookup, vocabulary_lookup)``. The test pair is ``(None, None)``
        when the evaluation split is empty.
    """
    config = read_configure()

    corpus = get_corpus_processor(config)
    corpus.prepare()
    make_train_samples = corpus.get_generator_func(corpus.TRAIN)
    make_eval_samples = corpus.get_generator_func(corpus.EVAL)

    meta_info = corpus.get_meta_info()
    tags_data = generate_tagset(meta_info['tags'])

    # materialise both splits so they can be measured and iterated repeatedly
    train_data = list(make_train_samples())
    eval_data = list(make_eval_samples())

    tag_lookup = Lookuper(
        {tag: index for index, tag in enumerate(tags_data)})
    vocabulary_lookup = index_table_from_file('./data/unicode_char_list.txt')

    train_x, train_y = preprocss(train_data, tag_lookup, vocabulary_lookup)

    if not eval_data:
        return (config, (train_x, train_y), (None, None), tag_lookup,
                vocabulary_lookup)

    test_x, test_y = preprocss(eval_data, tag_lookup, vocabulary_lookup)
    return (config, (train_x, train_y), (test_x, test_y), tag_lookup,
            vocabulary_lookup)
示例#6
0
def preprocss(data, maxlen=None, intent_lookup_table=None):
    """Encode samples into padded id matrices plus one-hot intent labels.

    Relies on the module-level ``config``, ``tag_lookuper`` and
    ``vocabulary_lookuper``.

    Args:
        data: iterable of samples exposing ``text`` and either ``label`` or
            ``extra_attr[config['intent_field']]``.
        maxlen: padded sequence length; when falsy, the longest sample in
            ``data`` is used.
        intent_lookup_table: existing intent lookup table; when falsy, one is
            built from the intents found in ``data``.

    Returns:
        ``(x, intent_one_hot, y, intent_lookup_table)``.
    """
    raw_x = []
    raw_y = []
    raw_intent = []

    intent_field = config['intent_field']

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text

        # the "label" field is a plain attribute; any other intent field is
        # stored in the sample's extra attributes
        if intent_field == "label":
            label = getattr(offset_data, intent_field)
        else:
            label = offset_data.extra_attr[intent_field]

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # intent ids are reproducible across runs (set() ordering is not)
        intent_lookup_table = Lookuper(
            {intent: index
             for index, intent in enumerate(dict.fromkeys(raw_intent))})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    if not maxlen:
        maxlen = max(len(s) for s in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding

    # tag ids are right-padded with 0 as well
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      maxlen,
                                                      value=0,
                                                      padding='post')

    intent_np_array = np.array(intent_int_list)

    # size the one-hot axis from the lookup table so that a split which lacks
    # the highest intent id still gets the same width as the other splits;
    # never go below what the ids in this batch require
    num_intents = max(np.max(intent_np_array) + 1, intent_lookup_table.size())
    intent_one_hot = one_hot(intent_np_array, num_intents)

    return x, intent_one_hot, y, intent_lookup_table
示例#7
0
def test_build(datadir, tmpdir):
    """ProcessorBuilder with one LookupProcessor serializes to the expected files and config."""
    builder = ProcessorBuilder()

    # the only processor: a lookup processor holding a vocabulary and a tag table
    processor = LookupProcessor()
    processor.add_vocabulary_lookup_table(Lookuper({"a": 1, "b": 2, "c": 3}))
    processor.add_tag_lookup_table(
        Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3}))

    # the same processor instance serves as both pre- and post-processing step
    handle = builder.add_processor(processor)
    builder.add_preprocess(handle)
    builder.add_postprocess(handle)

    builder.save()

    config = builder.serialize(tmpdir)

    expected_processor_entry = {
        "class":
        "deliverable_model.builtin.processor.lookup_processor.LookupProcessor",
        "parameter": {
            "lookup_table": ["vocabulary", "tag"],
            "padding_parameter": {},
        },
    }
    expected_config = {
        "version": "1.0",
        "instance": {
            "LookupProcessor_0": expected_processor_entry
        },
        "pipeline": {
            "pre": ["LookupProcessor_0"],
            "post": ["LookupProcessor_0"]
        },
    }

    directory_comparison = filecmp.dircmp(datadir, tmpdir)
    assert not directory_comparison.diff_files

    assert config == expected_config

    assert builder.get_dependency() == ["seq2annotation"]
def index_table_from_corpus(corpus=None):
    """Build a Lookuper mapping every distinct element of the samples' ``text`` to an id.

    Elements are sorted and numbered from 1 upwards.

    Args:
        corpus: iterable of samples exposing a ``text`` attribute.

    Returns:
        A ``Lookuper`` wrapping the element-to-id dictionary.
    """
    from seq2annotation.input import Lookuper

    distinct_tokens = set()
    for sample in corpus:
        distinct_tokens.update(sample.text)

    return Lookuper({
        token: position
        for position, token in enumerate(sorted(distinct_tokens), start=1)
    })
示例#9
0
def test_build(datadir, tmpdir):
    """LookupProcessor reports its config and serializes both tables to disk."""
    processor = LookupProcessor()
    processor.add_vocabulary_lookup_table(Lookuper({"a": 1, "b": 2, "c": 3}))
    processor.add_tag_lookup_table(
        Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3}))

    expected_config = {
        "lookup_table": ["vocabulary", "tag"],
        "padding_parameter": {},
    }
    assert processor.get_config() == expected_config

    processor.serialize(tmpdir)

    # compare file contents (shallow=False), not just os.stat signatures
    serialized_names = ["tag", "vocabulary"]
    matched, _mismatched, _errors = filecmp.cmpfiles(datadir,
                                                     tmpdir,
                                                     serialized_names,
                                                     shallow=False)

    assert len(matched) == 2
def preprocss(data, intent_lookup_table=None):
    """Encode samples into padded id matrices plus integer intent labels.

    Relies on the module-level ``tag_lookuper`` and ``vocabulary_lookuper``.

    Args:
        data: iterable of samples exposing ``text`` and ``label``.
        intent_lookup_table: existing intent lookup table; when falsy, one is
            built from the labels found in ``data``.

    Returns:
        ``(x, intent_ids, y, intent_lookup_table)`` where ``x`` and ``y`` are
        right-padded to the longest sample and ``intent_ids`` is a numpy array.
    """
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        label = offset_data.label

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # intent ids are reproducible across runs (set() ordering is not)
        intent_lookup_table = Lookuper(
            {intent: index
             for index, intent in enumerate(dict.fromkeys(raw_intent))})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    maxlen = max(len(s) for s in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding

    # tag ids are right-padded with 0 as well
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      maxlen,
                                                      value=0,
                                                      padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
def build_vacablookuper_from_corpus(*corpus_tuples):
    """Build a vocabulary Lookuper from the ``text`` of every sample in the corpora.

    ``'<pad>'`` always receives id 0; the remaining distinct elements are
    sorted and numbered contiguously from 1.

    Args:
        *corpus_tuples: any number of iterables of samples exposing ``text``.

    Returns:
        A ``Lookuper`` wrapping the element-to-id dictionary.
    """
    pad_token = '<pad>'

    distinct_tokens = set()
    for corpus in corpus_tuples:
        for sample in corpus:
            distinct_tokens.update(sample.text)

    # a literal pad token in the data must not be listed twice: the second
    # occurrence would overwrite id 0 and leave a hole in the id range
    distinct_tokens.discard(pad_token)

    ordered_tokens = [pad_token] + sorted(distinct_tokens)

    return Lookuper(
        {token: index for index, token in enumerate(ordered_tokens)})
示例#12
0
def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Turn samples into right-padded word-id and tag-id matrices.

    Args:
        data: samples exposing ``text``; tags are derived with
            ``offset_to_biluo``.
        tag_lookup_table: maps each tag to its id via ``lookup``.
        vocabulary_look_table: maps each element of ``text`` to its id via
            ``lookup``.
        seq_maxlen: padded length; when falsy, the longest sample is used.

    Returns:
        ``(x, y, seq_maxlen)`` with the length that was actually applied.
    """
    encoded_words = []
    encoded_tags = []

    for sample in data:
        biluo_tags = offset_to_biluo(sample)
        tokens = sample.text

        sample_tag_ids = [tag_lookup_table.lookup(tag) for tag in biluo_tags]
        sample_word_ids = [
            vocabulary_look_table.lookup(token) for token in tokens
        ]

        encoded_words.append(sample_word_ids)
        encoded_tags.append(sample_tag_ids)

    if not seq_maxlen:
        seq_maxlen = max(len(word_ids) for word_ids in encoded_words)

    print(">>> maxlen: {}".format(seq_maxlen))

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

    # both matrices are padded on the right; tags are padded with 0
    x = pad_sequences(encoded_words, maxlen=seq_maxlen, padding="post")
    y = pad_sequences(encoded_tags,
                      maxlen=seq_maxlen,
                      value=0,
                      padding="post")

    return x, y, seq_maxlen
def build_vacablookuper_from_list(*lists):
    """Build a vocabulary Lookuper from the elements of one or more lists.

    ``'<pad>'`` always comes first; the remaining distinct elements follow in
    sorted order. Ids start at 1, so ``'<pad>'`` maps to 1.

    Args:
        *lists: any number of iterables of vocabulary elements. With no
            input the table contains only ``'<pad>'``.

    Returns:
        A ``Lookuper`` wrapping the element-to-id dictionary.
    """
    pad_token = '<pad>'

    distinct_tokens = set()
    for tokens in lists:
        distinct_tokens.update(tokens)

    # Remove the pad token wherever it sorts, then put it first exactly once.
    # Checking only the first sorted element missed a pad token that sorts
    # after digits/punctuation and crashed on empty input.
    distinct_tokens.discard(pad_token)
    ordered_tokens = [pad_token] + sorted(distinct_tokens)

    return Lookuper({
        token: index
        for index, token in enumerate(ordered_tokens, start=1)
    })
    def load(cls, parameter: dict, asset_dir) -> "ProcessorBase":
        """Rebuild an instance from its serialized parameters and asset directory.

        Args:
            parameter: serialized configuration; its ``"lookup_table"`` entry
                lists the names of the lookup tables to load. The mapping
                itself is not modified.
            asset_dir: directory-like object supporting ``/``; each table is
                read from ``asset_dir / <name>``.

        Returns:
            ``cls`` constructed from the remaining parameters plus a
            ``lookup_table_registry`` mapping each name to its loaded table.
        """
        from seq2annotation.input import Lookuper

        registry = {
            table_name: Lookuper.load_from_file(asset_dir / table_name)
            for table_name in parameter["lookup_table"]
        }

        # work on a deep copy so the caller's dictionary stays untouched
        constructor_kwargs = copy.deepcopy(parameter)
        del constructor_kwargs["lookup_table"]
        constructor_kwargs["lookup_table_registry"] = registry

        return cls(**constructor_kwargs)
# Module-level setup: every statement below runs when the module is imported.
config = read_configure()

# Prepare the corpus and fetch the generator factories for both splits.
corpus = get_corpus_processor(config)
corpus.prepare()
train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

corpus_meta_data = corpus.get_meta_info()

# raw_tag_data keeps the tags as reported by the corpus; tags_data is the
# output of generate_tagset for the same tags.
raw_tag_data = corpus_meta_data['tags']
tags_data = generate_tagset(corpus_meta_data['tags'])

# Materialise both splits as lists.
train_data = list(train_data_generator_func())
eval_data = list(eval_data_generator_func())

# Tag -> id table; ids follow the order of tags_data, starting at 0.
tag_lookuper = Lookuper({v: i for i, v in enumerate(tags_data)})

# The vocabulary file is resolved relative to this source file.
vocab_data_file = os.path.join(os.path.dirname(__file__), '../data/unicode_char_list.txt')
vocabulary_lookuper = index_table_from_file(vocab_data_file)


def classification_report(y_true, y_pred, labels):
    """
    Similar to the one in sklearn.metrics,
    reports per class recall, precision and F1 score

    NOTE(review): only the counting prologue is visible in this excerpt; the
    code that turns the counters into a report (and any use of ``labels``)
    is not shown here.
    """
    # flatten both inputs to 1-D so they can be compared element by element
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    # per-class number of positions where the prediction equals the truth
    corrects = Counter(yt for yt, yp in zip(y_true, y_pred) if yt == yp)
    # per-class occurrence counts in the truth and in the predictions
    y_true_counts = Counter(y_true)
    y_pred_counts = Counter(y_pred)
示例#16
0
def test_build(datadir, tmpdir):
    """DeliverableModelBuilder combines metadata, processor and model builders as expected."""
    deliverable_builder = DeliverableModelBuilder(tmpdir)

    # metadata builder
    meta_builder = MetadataBuilder()
    meta_builder.set_meta_content(
        MetaContent("algorithmId-corpusId-configId-runId"))
    meta_builder.save()

    # processor builder: one lookup processor used for pre- and post-processing
    proc_builder = ProcessorBuilder()

    processor = LookupProcessor()
    processor.add_vocabulary_lookup_table(Lookuper({"a": 1, "b": 2, "c": 3}))
    processor.add_tag_lookup_table(
        Lookuper({"tag-a": 1, "tag-b": 2, "tag-c": 3}))

    handle = proc_builder.add_processor(processor)
    proc_builder.add_preprocess(handle)
    proc_builder.add_postprocess(handle)

    proc_builder.save()

    # model builder
    keras_model_builder = ModelBuilder()
    keras_model_builder.add_keras_h5_model(datadir / "fixture" /
                                           "keras_h5_model")
    keras_model_builder.save()

    # assemble the three parts
    deliverable_builder.add_processor(proc_builder)
    deliverable_builder.add_metadata(meta_builder)
    deliverable_builder.add_model(keras_model_builder)

    metadata = deliverable_builder.save()

    expected_processor = {
        "version": "1.0",
        "instance": {
            "LookupProcessor_0": {
                "class":
                "deliverable_model.builtin.processor.lookup_processor.LookupProcessor",
                "parameter": {
                    "lookup_table": ["vocabulary", "tag"],
                    "padding_parameter": {},
                },
            }
        },
        "pipeline": {
            "pre": ["LookupProcessor_0"],
            "post": ["LookupProcessor_0"]
        },
    }
    expected_model = {
        "version": "1.0",
        "type": "keras_h5_model",
        "custom_object_dependency": [],
        "converter_for_request": {
            "class_name":
            "deliverable_model.builder.model.model_builder.SimpleConverterForRequest",
            "config": {},
        },
        "converter_for_response": {
            "class_name":
            "deliverable_model.builder.model.model_builder.SimpleConverterForResponse",
            "config": {},
        },
    }
    expected_metadata = {
        "version": "1.0",
        "dependency": ["seq2annotation", "tensorflow"],
        "processor": expected_processor,
        "model": expected_model,
        "metadata": {
            "version": "1.0",
            "id": "algorithmId-corpusId-configId-runId"
        },
    }

    assert metadata == expected_metadata

    # the files written to tmpdir must match the expected fixture directory
    directory_comparison = filecmp.dircmp(datadir / "expected", tmpdir)
    assert not directory_comparison.diff_files
示例#17
0
def main():
    """Train a joint NER (CRF) + text-classification Keras model and export it.

    Steps, in order: read the configuration, load the train/eval corpus,
    build the tag / label / vocabulary lookup tables, encode both splits,
    assemble a two-output model (``crf`` and ``CLS``), fit it, then save the
    h5 model, the weights, a SavedModel and a deliverable model.
    """

    # get configure
    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    # ids follow the order of the tag / label lists, starting at 0
    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(
            os.path.dirname(__file__), "../data/unicode_char_list.txt"
        )

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocss(data, maxlen, **kwargs):
        """Encode samples into padded word ids, padded tag ids and one-hot labels.

        ``maxlen`` of None means "use the longest sample". The one-hot width
        is ``kwargs['cls_dims']`` and falls back to 81 when it is not given.
        """
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post"
        )

        # NOTE(review): this imports from standalone `keras` while the rest of
        # the function uses `tf.keras` — confirm both are meant to be installed
        from keras.utils import to_categorical
        y_cls = np.array(raw_y_cls)
        # add a trailing axis before one-hot encoding
        y_cls = y_cls[:, np.newaxis]
        y_cls = to_categorical(y_cls, kwargs.get('cls_dims', 81))

        return x, y_ner, y_cls


    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    USE_ATTENTION_LAYER = config.get("use_attention_layer", False)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])
    BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG = config.get(
        "use_batch_normalization_after_embedding", False)
    BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG = config.get(
        "use_batch_normalization_after_bilstm", False)
    CRF_PARAMS = config.get("crf_params", {})


    # get train/test data for training model
    vacab_size = vocabulary_lookuper.size()
    tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    train_x, train_y_ner, train_y_cls = preprocss(train_data, MAX_SENTENCE_LEN, **{'cls_dims':label_size})
    test_x, test_y_ner, test_y_cls = preprocss(eval_data, MAX_SENTENCE_LEN, **{'cls_dims':label_size})


    # build model
    input_length = MAX_SENTENCE_LEN
    # NOTE(review): the input is declared as 'float' although it feeds an
    # Embedding layer with integer ids — confirm this dtype is intended
    input_layer = Input(shape=(input_length,), dtype='float', name='input_layer')

    # encoder
    with tf.keras.backend.name_scope("Encoder"):

        embedding_layer = Embedding(vacab_size,
                                    EMBED_DIM,
                                    mask_zero=True,
                                    input_length=input_length,
                                    name='embedding')(input_layer)

    # feature extractor
    with tf.keras.backend.name_scope("biLSTM"):
        if BATCH_NORMALIZATION_AFTER_EMBEDDING_CONFIG:
            embedding_layer = BatchNormalization()(embedding_layer)

        # one Bidirectional LSTM per entry in the stack config; with an empty
        # config the embedding output is passed through unchanged
        biLSTM = embedding_layer
        for bilstm_config in BiLSTM_STACK_CONFIG:
               biLSTM = Bidirectional(LSTM(return_sequences=True, **bilstm_config, name='biLSTM'))(biLSTM)

    if BATCH_NORMALIZATION_AFTER_BILSTM_CONFIG:
        biLSTM = BatchNormalization()(biLSTM)

    if USE_ATTENTION_LAYER:
        biLSTM = GlobalAttentionLayer()(biLSTM)

    # NER branch
    with tf.keras.backend.name_scope("NER_branch"):
        crf = CRF(tag_size, name="crf", **CRF_PARAMS)(biLSTM)
        loss_func = ConditionalRandomFieldLoss()


    # classification branch

    # `chosen` is fixed here, so only the "lstm_cls" branch below ever runs
    chosen = 'lstm_cls'
    with tf.keras.backend.name_scope("CLS_branch"):
        from tensorflow.keras.layers import Dense, Flatten, Dropout
        # add paragraph vector
        #paragraph_vector = get_paragraph_vector(embedding_layer)

        if chosen == "lstm_cls":
            cls_flat_lstm = Flatten()(biLSTM)
            #cls_flat_lstm = tf.keras.layers.concatenate([cls_flat_lstm, paragraph_vector])
            classification_dense = Dropout(0.2)(cls_flat_lstm)
            classification_dense = SetLearningRate(Dense(label_size, activation='sigmoid', name='CLS'), lr=0.001, is_ada=True)(classification_dense)

        elif chosen == "conv_cls":
            from tensorflow.keras.layers import Conv1D, MaxPooling1D
            embedding_layer = BatchNormalization()(embedding_layer)
            cls_conv_emb = Conv1D(32, 3, activation='relu', padding='same')(embedding_layer)
            cls_conv_emb = Conv1D(64, 3, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=1, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=2, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(128, 3, activation='relu', dilation_rate=5, padding='same')(cls_conv_emb)
            cls_conv_emb = Conv1D(256, 1, activation='relu', padding='same')(cls_conv_emb)
            cls_conv_emb = MaxPooling1D(2)(cls_conv_emb)

            cls_flat = BatchNormalization()(cls_conv_emb)
            cls_flat = Flatten()(cls_flat)
            classification_dense = Dropout(0.2)(cls_flat)
            classification_dense = Dense(label_size, activation='sigmoid', name='CLS')(classification_dense)



    # merge NER and Classification
    model = Model(inputs=[input_layer], outputs=[crf, classification_dense])


    model.summary()

    callbacks_list = []

    # NOTE(review): the log directory is a hard-coded, Windows-style relative
    # path; the config-driven variant is commented out
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir='.\\results\\summary_log_dir',
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    # NOTE(review): metrics_list is filled but never passed to compile() below
    metrics_list = []

    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    # early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',  # early stop index
    #                                               patience=3,          # early stop delay epoch
    #                                               verbose=2,           # display mode
    #                                               mode='auto')
    # callbacks_list.append(early_stop)

    from mtnlpmodel.trainer.loss_func_util import FocalLoss
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
    # losses and metrics are keyed by output layer name ('crf' and 'CLS')
    model.compile(optimizer=adam_optimizer,
                  #loss={'crf': loss_func, 'CLS': 'sparse_categorical_crossentropy'},
                  loss={'crf': loss_func, 'CLS': FocalLoss()},
                  loss_weights={'crf': 1., 'CLS': 100},  # set weight of loss
                  #metrics={'crf': SequenceCorrectness(), 'CLS': 'sparse_categorical_accuracy'} )
                  metrics={'crf': SequenceCorrectness(), 'CLS': 'categorical_accuracy'})

    model.fit(
        train_x,
        {'crf': train_y_ner, 'CLS': train_y_cls},
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x,  {'crf': test_y_ner, 'CLS': test_y_cls}],
        callbacks=callbacks_list,
    )


    # persist the trained model: full h5 model, weights only, SavedModel
    model.save(create_file_dir_if_needed(config["h5_model_file"]))
    model.save_weights(create_file_dir_if_needed(config["h5_weights_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_or_rm_dir_if_needed(config["saved_model_dir"])
    )


    # bundle the SavedModel with the lookup tables and padding settings
    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        converter_for_request=ConverterForRequest(),
        converter_for_response=ConverterForMTResponse(),
        lookup_tables={'vocab_lookup':vocabulary_lookuper,
                       'tag_lookup':ner_tag_lookuper,
                       'label_lookup':cls_tag_lookuper},
        padding_parameter={"maxlen": MAX_SENTENCE_LEN, "value": 0, "padding": "post"},
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
示例#18
0
def id_to_str(id_list: List[int],
              vocabulary_look_table: Lookuper) -> List[str]:
    """Translate each id back through ``vocabulary_look_table.inverse_lookup``.

    Args:
        id_list: ids to decode, in order.
        vocabulary_look_table: table providing ``inverse_lookup``.

    Returns:
        The decoded values, one per input id, in the same order.
    """
    decoded = []
    for token_id in id_list:
        decoded.append(vocabulary_look_table.inverse_lookup(token_id))

    return decoded
示例#19
0
def str_to_id(string: Union[str, List[str]],
              vocabulary_look_table: Lookuper) -> List[int]:
    """Map every element of ``string`` through ``vocabulary_look_table.lookup``.

    Args:
        string: a string (iterated character by character) or a list of
            strings (iterated item by item).
        vocabulary_look_table: table providing ``lookup``.

    Returns:
        The looked-up ids, one per element, in the same order.
    """
    encoded = []
    for element in string:
        encoded.append(vocabulary_look_table.lookup(element))

    return encoded
示例#20
0
def main():
    """Fine-tune a text classifier on top of a pre-trained backbone and export it.

    Steps, in order: read the configuration, load the train/eval corpus,
    build the tag / label / vocabulary lookup tables, encode both splits,
    build the backbone and attach a new softmax output through
    ``transfer_learning``, fit on the classification labels, then save the
    h5 model, a SavedModel and a deliverable model.
    """

    # get configure

    config = read_configure()

    # get train/test corpus
    corpus = get_corpus_processor(config)
    corpus.prepare()
    train_data_generator_func = corpus.get_generator_func(corpus.TRAIN)
    eval_data_generator_func = corpus.get_generator_func(corpus.EVAL)

    corpus_meta_data = corpus.get_meta_info()

    # process str data to onehot
    ner_tags_data = generate_tagset(corpus_meta_data["tags"])
    cls_tags_data = corpus_meta_data["labels"]

    train_data = list(train_data_generator_func())
    eval_data = list(eval_data_generator_func())

    # ids follow the order of the tag / label lists, starting at 0
    ner_tag_lookuper = Lookuper({v: i for i, v in enumerate(ner_tags_data)})
    cls_tag_lookuper = Lookuper({v: i for i, v in enumerate(cls_tags_data)})

    vocab_data_file = config.get("vocabulary_file")

    if not vocab_data_file:
        # load built in vocabulary file
        vocab_data_file = os.path.join(os.path.dirname(__file__),
                                       "../data/unicode_char_list.txt")

    vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def preprocss(data, maxlen):
        """Encode samples into padded word ids, padded tag ids and label ids.

        ``maxlen`` of None means "use the longest sample". Labels are
        returned as integer ids with a trailing axis of length 1.
        """
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            label = offset_data.label
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            label_id = cls_tag_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x.append(word_ids)
            raw_y_ner.append(tag_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(raw_y_ner,
                                                              maxlen,
                                                              value=0,
                                                              padding="post")

        y_cls = np.array(raw_y_cls)
        # add a trailing axis so each label is a length-1 row
        y_cls = y_cls[:, np.newaxis]

        return x, y_ner, y_cls

    # get Parameters (controller)
    EPOCHS = config.get("epochs", 10)
    BATCHSIZE = config.get("batch_size", 32)
    LEARNINGRATE = config.get("learning_rate", 0.0001)
    MAX_SENTENCE_LEN = config.get("max_sentence_len", 25)

    # get Parameters (model structure)
    EMBED_DIM = config.get("embedding_dim", 300)
    BiLSTM_STACK_CONFIG = config.get("bilstm_stack_config", [])

    # get train/test data for training model
    # (the NER targets are produced as well but only the CLS targets are used
    # for fitting below)
    train_x, train_y_ner, train_y_cls = preprocss(train_data, MAX_SENTENCE_LEN)
    test_x, test_y_ner, test_y_cls = preprocss(eval_data, MAX_SENTENCE_LEN)

    vacab_size = vocabulary_lookuper.size()
    # tag_size = ner_tag_lookuper.size()
    label_size = cls_tag_lookuper.size()

    # finetuning correlation code

    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNINGRATE,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              amsgrad=False)
    # optimizer / loss / metrics bundle handed to transfer_learning
    index_dict = {
        'optimizer': adam_optimizer,
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['sparse_categorical_accuracy']
    }

    # NOTE(review): how these names are matched and frozen is decided inside
    # transfer_learning, which is not visible here
    warm_start_list = ['embedding', 'bidirectional',
                       'batch_normalization']  # layer in list is frozen

    # NOTE(review): hard-coded relative path to the backbone weights
    backbone_model_path = './mtnlpmodel/trainer/fine_tuning_trainer/save_weights/weights.h5'

    output_dims = label_size

    # model structure correlation code

    # define new_layer for the task
    new_task_output_layer = Dense(
        output_dims, activation='softmax')  # new softmax layer -> output

    input_shape = MAX_SENTENCE_LEN
    input_layer = Input(shape=(input_shape, ),
                        dtype='int32',
                        name='input_layer')  # input

    # backbone + transfer_learning function can use backbone to do some different job
    # what you need is to define a new layer which is new_task_output_layer below
    # this code is a sample for only text classification
    # backbone output is biLSTM's output, so transfer_learning add a flatten layer to connect dense layer
    # you can modify the structure in function transfer_learning to match your task demand
    base_model = backbone_network(
        BiLSTM_STACK_CONFIG,
        input_layer=input_layer,
        vacab_size=vacab_size,
        EMBED_DIM=EMBED_DIM,
        input_length=MAX_SENTENCE_LEN,
    )

    new_model = transfer_learning(input_shape, input_layer, base_model,
                                  new_task_output_layer, index_dict,
                                  backbone_model_path, warm_start_list)

    # model output info correlation code
    new_model.summary()

    callbacks_list = []

    # NOTE(review): the log directory is a hard-coded, Windows-style relative
    # path; the config-driven variant is commented out
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #log_dir=create_dir_if_needed(config["summary_log_dir"])
        log_dir='.\\results\\summary_log_dir',
        batch_size=BATCHSIZE,
    )
    callbacks_list.append(tensorboard_callback)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(create_dir_if_needed(config["model_dir"]),
                     "cp-{epoch:04d}.ckpt"),
        load_weights_on_restart=True,
        verbose=1,
    )
    callbacks_list.append(checkpoint_callback)

    # NOTE(review): metrics_list is filled but not used anywhere below
    metrics_list = []

    metrics_list.append(crf_accuracy)
    metrics_list.append(SequenceCorrectness())
    metrics_list.append(sequence_span_accuracy)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',  # early stop index
        patience=3,  # early stop delay epoch
        verbose=2,  # display mode
        mode='auto')
    callbacks_list.append(early_stop)

    new_model.fit(
        train_x,
        train_y_cls,
        epochs=EPOCHS,
        batch_size=BATCHSIZE,
        validation_data=[test_x, test_y_cls],
        callbacks=callbacks_list,
    )

    # persist the trained model: full h5 model and SavedModel
    new_model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        new_model, create_or_rm_dir_if_needed(config["saved_model_dir"]))

    # bundle the SavedModel with the lookup tables and padding settings
    mt_export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=ner_tag_lookuper,
        label_lookup_table=cls_tag_lookuper,
        padding_parameter={
            "maxlen": MAX_SENTENCE_LEN,
            "value": 0,
            "padding": "post"
        },
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )
def input_data_process(config, **hyperparams):
    """Load the NER/CLS corpora and convert them to padded integer arrays.

    Args:
        config: mapping read with the keys ``input_mode``, ``test_ratio``,
            optionally ``vocabulary_file``, and either ``ner_data`` plus
            ``cls_data`` (when ``input_mode == 'multi'``) or ``data``
            (any other mode; the same corpus then feeds both tasks).
        **hyperparams: must contain ``MAX_SENTENCE_LEN`` (padding length, or
            ``None`` to use the longest sample of the split being processed)
            and ``CLS2NER_KEYWORD_LEN`` (number of extra leading positions
            added to the NER tag arrays).

    Returns:
        dict with the keys ``ner_train_x``, ``ner_train_y``, ``ner_test_x``,
        ``ner_test_y``, ``cls_train_x``, ``cls_train_y``, ``cls_test_x``,
        ``cls_test_y`` (padded arrays) and ``ner_tag_lookuper``,
        ``cls_label_lookuper``, ``vocabulary_lookuper`` (lookup tables).

    Raises:
        KeyError: if a required ``config`` or ``hyperparams`` key is missing.
    """
    # read NER/CLS individually (only support *.conllx)
    input_mode = config['input_mode']
    if input_mode == 'multi':
        # multi input: separate corpora for the two tasks
        data_ner = Corpus.read_from_file(config['ner_data'])
        data_cls = Corpus.read_from_file(config['cls_data'])
    else:
        # single input: one corpus shared by both tasks
        data = Corpus.read_from_file(config['data'])
        data_ner = data
        data_cls = data

    # collect the tag / label inventories before splitting
    ner_tags = get_tag_from_corpus(data_ner)
    cls_labels = get_label_from_corpus(data_cls)

    # fixed random_state so both corpora are split reproducibly
    test_ratio = config['test_ratio']
    ner_train_data, ner_eval_data = data_ner.train_test_split(
        test_size=test_ratio, random_state=50)
    cls_train_data, cls_eval_data = data_cls.train_test_split(
        test_size=test_ratio, random_state=50)

    # mainly for multi input: make sure cls & ner have same size dataset
    ner_data_tuple, cls_data_tuple = random_sampling_to_samesize(
        (ner_train_data, ner_eval_data),
        (cls_train_data, cls_eval_data))
    ner_train_data, ner_eval_data = ner_data_tuple
    cls_train_data, cls_eval_data = cls_data_tuple

    # build lookupers (string -> integer id)
    ner_tag_lookuper = Lookuper(
        {v: i
         for i, v in enumerate(generate_tagset(ner_tags))})
    cls_label_lookuper = Lookuper({v: i for i, v in enumerate(cls_labels)})

    vocab_data_file = config.get("vocabulary_file", None)

    if not vocab_data_file:
        # no vocabulary file configured: derive the vocabulary from the corpora
        vocabulary_lookuper = build_vacablookuper_from_corpus(
            *(data_ner, data_cls))
    else:
        vocabulary_lookuper = index_table_from_file(vocab_data_file)

    def ner_preprocss(data, maxlen, cls_info_len):
        """Encode NER samples; return (x, y) padded id arrays.

        ``x`` is right-padded to ``maxlen``.  ``y`` is right-padded to
        ``maxlen`` and then left-padded with zeros to
        ``maxlen + cls_info_len``.
        """
        raw_x_ner = []
        raw_y_ner = []

        for offset_data in data:
            tags = offset_to_biluo(offset_data)
            words = offset_data.text

            tag_ids = [ner_tag_lookuper.lookup(i) for i in tags]
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x_ner.append(word_ids)
            raw_y_ner.append(tag_ids)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x_ner)

        maxlen_mt = maxlen + cls_info_len
        print(">>> maxlen: {}".format(maxlen))

        x_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x_ner, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(raw_y_ner,
                                                              maxlen,
                                                              value=0,
                                                              padding="post")

        # second pass: prepend cls_info_len zero positions to every tag row
        y_ner = tf.keras.preprocessing.sequence.pad_sequences(y_ner,
                                                              maxlen_mt,
                                                              value=0,
                                                              padding="pre")

        return x_ner, y_ner

    def cls_preprocss(data, maxlen, **kwargs):
        """Encode CLS samples; return (x, y) with ``y`` one-hot encoded.

        ``kwargs['cls_dims']`` is the number of classes; the fallback of 81
        is inherited from the original code and is never hit by the calls
        below, which always pass ``cls_dims`` explicitly.
        """
        raw_x_cls = []
        raw_y_cls = []

        for offset_data in data:
            label = offset_data.label
            words = offset_data.text

            label_id = cls_label_lookuper.lookup(label)
            word_ids = [vocabulary_lookuper.lookup(i) for i in words]

            raw_x_cls.append(word_ids)
            raw_y_cls.append(label_id)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x_cls)

        print(">>> maxlen: {}".format(maxlen))

        x_cls = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x_cls, maxlen, padding="post")  # right padding

        # Use the tf.keras utility (not standalone keras) so this function
        # depends on the same framework as the padding calls above.
        y_cls = np.array(raw_y_cls)
        y_cls = y_cls[:, np.newaxis]
        y_cls = tf.keras.utils.to_categorical(y_cls,
                                              kwargs.get('cls_dims', 81))

        return x_cls, y_cls

    max_sentence_len = hyperparams['MAX_SENTENCE_LEN']
    cls2ner_keyword_len = hyperparams['CLS2NER_KEYWORD_LEN']

    ner_train_x, ner_train_y = ner_preprocss(ner_train_data, max_sentence_len,
                                             cls2ner_keyword_len)
    ner_test_x, ner_test_y = ner_preprocss(ner_eval_data, max_sentence_len,
                                           cls2ner_keyword_len)

    # the number of classes is the same for both splits; compute it once
    cls_dims = cls_label_lookuper.size()
    cls_train_x, cls_train_y = cls_preprocss(cls_train_data, max_sentence_len,
                                             cls_dims=cls_dims)
    cls_test_x, cls_test_y = cls_preprocss(cls_eval_data, max_sentence_len,
                                           cls_dims=cls_dims)

    output_dict = {
        'ner_train_x': ner_train_x,
        'ner_train_y': ner_train_y,
        'ner_test_x': ner_test_x,
        'ner_test_y': ner_test_y,
        'cls_train_x': cls_train_x,
        'cls_train_y': cls_train_y,
        'cls_test_x': cls_test_x,
        'cls_test_y': cls_test_y,
        'ner_tag_lookuper': ner_tag_lookuper,
        'cls_label_lookuper': cls_label_lookuper,
        'vocabulary_lookuper': vocabulary_lookuper,
    }

    return output_dict
示例#22
0
def main():
    """Train a CRF sequence tagger from the configured corpus and export it.

    Steps, in order: read the configuration, materialise the train/eval
    samples, build the tag and vocabulary lookup tables, encode and pad the
    samples, assemble the model (Embedding, optional BatchNormalization,
    optional Bidirectional-LSTM stack, optional attention, CRF), fit it with
    TensorBoard and checkpoint callbacks, then write the H5 file, the
    SavedModel and the deliverable model.
    """
    config = read_configure()

    corpus = get_corpus_processor(config)
    corpus.prepare()
    make_train_samples = corpus.get_generator_func(corpus.TRAIN)
    make_eval_samples = corpus.get_generator_func(corpus.EVAL)

    meta_info = corpus.get_meta_info()
    tagset = generate_tagset(meta_info["tags"])

    train_samples = list(make_train_samples())
    eval_samples = list(make_eval_samples())

    tag_lookuper = Lookuper({tag: index for index, tag in enumerate(tagset)})

    # fall back to the built-in vocabulary file when none is configured
    vocab_path = config.get("vocabulary_file") or os.path.join(
        os.path.dirname(__file__), "../data/unicode_char_list.txt")
    vocabulary_lookuper = index_table_from_file(vocab_path)

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

    def encode_and_pad(samples, maxlen):
        """Map samples to right-padded (token-id, tag-id) arrays."""
        token_rows = []
        tag_rows = []

        for sample in samples:
            biluo_tags = offset_to_biluo(sample)
            tokens = sample.text

            encoded_tags = [tag_lookuper.lookup(tag) for tag in biluo_tags]
            encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

            token_rows.append(encoded_tokens)
            tag_rows.append(encoded_tags)

        if maxlen is None:
            # no fixed length requested: use the longest sample
            maxlen = max(map(len, token_rows))

        print(">>> maxlen: {}".format(maxlen))

        # both arrays are padded on the right; tags are padded with id 0
        padded_tokens = pad_sequences(token_rows, maxlen=maxlen,
                                      padding="post")
        padded_tags = pad_sequences(tag_rows, maxlen=maxlen, value=0,
                                    padding="post")

        return padded_tokens, padded_tags

    max_sentence_len = config.get("max_sentence_len", 25)

    train_x, train_y = encode_and_pad(train_samples, max_sentence_len)
    test_x, test_y = encode_and_pad(eval_samples, max_sentence_len)

    # training / architecture settings
    epochs = config["epochs"]
    embed_dim = config["embedding_dim"]
    use_attention = config.get("use_attention_layer", False)
    bilstm_stack = config.get("bilstm_stack_config", [])
    batch_norm_after_embedding = config.get(
        "use_batch_normalization_after_embedding", False)
    batch_norm_after_bilstm = config.get(
        "use_batch_normalization_after_bilstm", False)
    crf_params = config.get("crf_params", {})

    vocab_size = vocabulary_lookuper.size()
    tag_count = tag_lookuper.size()

    model = Sequential()

    embedding = Embedding(vocab_size,
                          embed_dim,
                          mask_zero=True,
                          input_length=max_sentence_len)
    model.add(embedding)

    if batch_norm_after_embedding:
        model.add(BatchNormalization())

    # one Bidirectional LSTM per entry of the configured stack
    for lstm_kwargs in bilstm_stack:
        recurrent = LSTM(return_sequences=True, **lstm_kwargs)
        model.add(Bidirectional(recurrent))

    if batch_norm_after_bilstm:
        model.add(BatchNormalization())

    if use_attention:
        model.add(GlobalAttentionLayer())

    model.add(CRF(tag_count, name="crf", **crf_params))

    model.summary()

    summary_dir = create_dir_if_needed(config["summary_log_dir"])
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=summary_dir)

    checkpoint_pattern = os.path.join(
        create_dir_if_needed(config["model_dir"]), "cp-{epoch:04d}.ckpt")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_pattern,
        load_weights_on_restart=True,
        verbose=1,
    )

    callbacks = [tensorboard_callback, checkpoint_callback]
    metrics = [SequenceCorrectness(), SequenceSpanAccuracy()]
    crf_loss_fn = ConditionalRandomFieldLoss()

    model.compile("adam", loss={"crf": crf_loss_fn}, metrics=metrics)
    model.fit(
        train_x,
        train_y,
        epochs=epochs,
        validation_data=[test_x, test_y],
        callbacks=callbacks,
    )

    # persist the trained model in H5 and SavedModel form
    model.save(create_file_dir_if_needed(config["h5_model_file"]))

    tf.keras.experimental.export_saved_model(
        model, create_dir_if_needed(config["saved_model_dir"]))

    # padding settings stored with the deliverable mirror encode_and_pad
    padding_parameter = {
        "maxlen": max_sentence_len,
        "value": 0,
        "padding": "post"
    }

    export_as_deliverable_model(
        create_dir_if_needed(config["deliverable_model_dir"]),
        keras_saved_model=config["saved_model_dir"],
        vocabulary_lookup_table=vocabulary_lookuper,
        tag_lookup_table=tag_lookuper,
        padding_parameter=padding_parameter,
        addition_model_dependency=["tf-crf-layer"],
        custom_object_dependency=["tf_crf_layer"],
    )