Example #1
def train_model(genre, input_level, word_embed_type, word_embed_trainable, batch_size, learning_rate,
                optimizer_type, model_name, n_epoch=50, add_features=False, scale_features=False, overwrite=False,
                lr_range_test=False, callbacks_to_add=None, eval_on_train=False, **kwargs):
    config = ModelConfig()
    config.genre = genre
    config.input_level = input_level
    config.max_len = config.word_max_len[genre] if input_level == 'word' else config.char_max_len[genre]
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.callbacks_to_add = callbacks_to_add or []
    config.add_features = add_features
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.n_epoch = n_epoch
    config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre,
                                                     word_embed_type))
    vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, input_level))
    config.idx2token = dict((idx, token) for token, idx in vocab.items())

    # experiment name configuration
    config.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}'.format(genre, model_name, input_level, word_embed_type,
                                                       'tune' if word_embed_trainable else 'fix', batch_size,
                                                       '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()]),
                                                       optimizer_type)
    if config.add_features:
        config.exp_name += '_feature_scaled' if scale_features else '_featured'
    if len(config.callbacks_to_add) > 0:
        callback_str = '_' + '_'.join(config.callbacks_to_add)
        callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
        config.exp_name += callback_str

    input_config = kwargs.get('input_config', 'token')  # default input: token ids fed to a word embedding layer
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        # get elmo embeddings from an on-disk cache, so we first build an ELMoCache instance;
        # pop the two elmo options since the downstream models don't expect them in kwargs
        elmo_model_type = kwargs.pop('elmo_model_type', 'allennlp')
        elmo_output_mode = kwargs.pop('elmo_output_mode', 'elmo')
        elmo_cache = ELMoCache(options_file=config.elmo_options_file, weight_file=config.elmo_weight_file,
                               cache_dir=config.cache_dir, idx2token=config.idx2token,
                               max_sentence_length=config.max_len, elmo_model_type=elmo_model_type,
                               elmo_output_mode=elmo_output_mode)
    elif input_config in ['elmo_id', 'elmo_s', 'token_combine_elmo_id', 'token_combine_elmo_s']:
        # get elmo embeddings via tensorflow_hub, so a TF Hub URL must be provided
        kwargs['elmo_model_url'] = config.elmo_model_url

    # training log recording experiment configuration and results
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type, 'epoch': n_epoch,
                 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    if model_name == 'KerasInfersent':
        model = KerasInfersentModel(config, **kwargs)
    elif model_name == 'KerasEsim':
        model = KerasEsimModel(config, **kwargs)
    elif model_name == 'KerasDecomposable':
        model = KerasDecomposableAttentionModel(config, **kwargs)
    elif model_name == 'KerasSiameseBiLSTM':
        model = KerasSimaeseBiLSTMModel(config, **kwargs)
    elif model_name == 'KerasSiameseCNN':
        model = KerasSiameseCNNModel(config, **kwargs)
    elif model_name == 'KerasIACNN':
        model = KerasIACNNModel(config, **kwargs)
    elif model_name == 'KerasSiameseLSTMCNNModel':
        model = KerasSiameseLSTMCNNModel(config, **kwargs)
    elif model_name == 'KerasRefinedSSAModel':
        model = KerasRefinedSSAModel(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))
    # model.summary()

    train_input, dev_input, test_input = None, None, None
    if lr_range_test:   # run an LR range test to find a suitable learning rate instead of training
        train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
        dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
        model.lr_range_test(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                            y_valid=dev_input['y'])
        return

    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()

        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features)
            dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                      return_data=(input_config == 'token_combine_cache_elmo'),
                                      return_features=config.add_features)
            model.train_with_generator(train_input, dev_input)
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
            model.train(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                        y_valid=dev_input['y'])
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    def eval_on_data(eval_with_generator, input_data, data_type):
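        # evaluate the best checkpoint on one data split; if SWA or snapshot-ensemble
        # callbacks were used during training, also evaluate those model variants below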
        model.load_best_model()
        if eval_with_generator:
            acc = model.evaluate_with_generator(generator=input_data, y=input_data.input_label)
        else:
            acc = model.evaluate(x=input_data['x'], y=input_data['y'])
        train_log['%s_acc' % data_type] = acc

        swa_type = None
        if 'swa' in config.callbacks_to_add:
            swa_type = 'swa'
        elif 'swa_clr' in config.callbacks_to_add:
            swa_type = 'swa_clr'
        if swa_type:
            print('Logging Info - %s Model' % swa_type)
            model.load_swa_model(swa_type=swa_type)
            swa_acc = model.evaluate(x=input_data['x'], y=input_data['y'])
            train_log['%s_%s_acc' % (swa_type, data_type)] = swa_acc

        ensemble_type = None
        if 'sse' in config.callbacks_to_add:
            ensemble_type = 'sse'
        elif 'fge' in config.callbacks_to_add:
            ensemble_type = 'fge'
        if ensemble_type:
            print('Logging Info - %s Ensemble Model' % ensemble_type)
            ensemble_predict = {}
            for model_file in os.listdir(config.checkpoint_dir):
                if model_file.startswith(config.exp_name+'_%s' % ensemble_type):
                    match = re.match(r'(%s_%s_)(\d+)(\.hdf5)' % (config.exp_name, ensemble_type), model_file)
                    model_id = int(match.group(2))
                    model_path = os.path.join(config.checkpoint_dir, model_file)
                    print('Logging Info: Loading {} ensemble model checkpoint: {}'.format(ensemble_type, model_file))
                    model.load_model(model_path)
                    ensemble_predict[model_id] = model.predict(x=input_data['x'])
            '''
            We expect models saved towards the end of the run to perform better than those saved
            earlier, so we iterate over the snapshots in descending id order (most recent first)
            while incrementally averaging their predictions into the ensemble.
            '''
            sorted_ensemble_predict = sorted(ensemble_predict.items(), key=lambda x: x[0], reverse=True)
            model_predicts = []
            for model_id, model_predict in sorted_ensemble_predict:
                single_acc = eval_acc(model_predict, input_data['y'])
                print('Logging Info - %s_single_%d_%s Acc : %f' % (ensemble_type, model_id, data_type, single_acc))
                train_log['%s_single_%d_%s_acc' % (ensemble_type, model_id, data_type)] = single_acc

                model_predicts.append(model_predict)
                ensemble_acc = eval_acc(np.mean(np.array(model_predicts), axis=0), input_data['y'])
                print('Logging Info - %s_ensemble_%d_%s Acc : %f' % (ensemble_type, model_id, data_type, ensemble_acc))
                train_log['%s_ensemble_%d_%s_acc' % (ensemble_type, model_id, data_type)] = ensemble_acc

    if eval_on_train:
        # might take a long time
        print('Logging Info - Evaluate over train data:')
        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features, return_label=False)
            eval_on_data(eval_with_generator=True, input_data=train_input, data_type='train')
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features, scale_features)
            eval_on_data(eval_with_generator=False, input_data=train_input, data_type='train')

    print('Logging Info - Evaluate over valid data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                  return_data=(input_config == 'token_combine_cache_elmo'),
                                  return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=dev_input, data_type='dev')
    else:
        if dev_input is None:
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
        eval_on_data(eval_with_generator=False, input_data=dev_input, data_type='dev')

    print('Logging Info - Evaluate over test data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        test_input = ELMoGenerator(genre, input_level, 'test', config.batch_size, elmo_cache,
                                   return_data=(input_config == 'token_combine_cache_elmo'),
                                   return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=test_input, data_type='test')
    else:
        if test_input is None:
            test_input = load_input_data(genre, input_level, 'test', input_config, config.add_features, scale_features)
        eval_on_data(eval_with_generator=False, input_data=test_input, data_type='test')

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, genre), log=train_log, mode='a')
    return train_log
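
A hypothetical driver call for train_model (the genre, embedding, and model names below are illustrative placeholders, assuming the module's dependencies are importable):

train_log = train_model(genre='snli', input_level='word', word_embed_type='glove_cc',
                        word_embed_trainable=False, batch_size=128, learning_rate=0.001,
                        optimizer_type='adam', model_name='KerasEsim',
                        callbacks_to_add=['modelcheckpoint', 'earlystopping'])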
Example #2
def train_ensemble_model(ensemble_models,
                         model_name,
                         variation,
                         dev_data,
                         train_data=None,
                         test_data=None,
                         binary_threshold=0.5,
                         checkpoint_dir=None,
                         overwrite=False,
                         log_error=False,
                         save_log=True,
                         **kwargs):
    config = ModelConfig()
    config.binary_threshold = binary_threshold
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_{}_ensemble_with_{}'.format(variation, model_name,
                                                      ensemble_models)
    train_log = {
        'exp_name': config.exp_name,
        'binary_threshold': binary_threshold
    }
    print('Logging Info - Ensemble Experiment: ', config.exp_name)
    if model_name == 'svm':
        model = SVMModel(config, **kwargs)
    elif model_name == 'lr':
        model = LRModel(config, **kwargs)
    elif model_name == 'sgd':
        model = SGDModel(config, **kwargs)
    elif model_name == 'gnb':
        model = GaussianNBModel(config, **kwargs)
    elif model_name == 'mnb':
        model = MultinomialNBModel(config, **kwargs)
    elif model_name == 'bnb':
        model = BernoulliNBModel(config, **kwargs)
    elif model_name == 'rf':
        model = RandomForestModel(config, **kwargs)
    elif model_name == 'gbdt':
        model = GBDTModel(config, **kwargs)
    elif model_name == 'xgboost':
        model = XGBoostModel(config, **kwargs)
    elif model_name == 'lda':
        model = LDAModel(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if train_data is not None and (not path.exists(model_save_path)
                                   or overwrite):
        model.train(train_data)

    model.load_best_model()
    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(
        dev_data)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())

    if log_error:
        error_indexes, error_pred_probas = model.error_analyze(dev_data)
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes,
                                                error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(
                error_index, dev_text_input['sentence'][error_index],
                dev_text_input['label'][error_index], error_pred_prob)
    if save_log:
        write_log(format_filename(LOG_DIR,
                                  PERFORMANCE_LOG_TEMPLATE,
                                  variation=variation),
                  log=train_log,
                  mode='a')

    if test_data is not None:
        test_predictions = model.predict(test_data)
        writer_predict(
            format_filename(PREDICT_DIR, config.exp_name + '.labels'),
            test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
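
A hypothetical call to train_ensemble_model; the base-model list and variation name are illustrative, and dev_data/train_data are assumed to be stacked base-model predictions prepared elsewhere in the repo:

valid_metrics = train_ensemble_model(ensemble_models=['cnn', 'bilstm', 'rcnn'], model_name='lr',
                                     variation='simplified', dev_data=dev_data,
                                     train_data=train_data, overwrite=True)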
Example #3
def link(model_name,
         dev_pred_mentions,
         test_pred_mentions,
         predict_log,
         batch_size=32,
         n_epoch=50,
         learning_rate=0.001,
         optimizer_type='adam',
         embed_type=None,
         embed_trainable=True,
         use_relative_pos=False,
         n_neg=1,
         omit_one_cand=True,
         callbacks_to_add=None,
         swa_type=None,
         predict_on_final_test=True,
         **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            type=embed_type))
        config.embed_trainable = embed_trainable
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or [
        'modelcheckpoint', 'earlystopping'
    ]

    config.vocab = pickle_load(
        format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(
        format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    config.exp_name = '{}_{}_{}_{}_{}_{}'.format(
        model_name, embed_type if embed_type else 'random',
        'tune' if embed_trainable else 'fix', batch_size, optimizer_type,
        learning_rate)
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        config.exp_name += '_rel'
    config.n_neg = n_neg
    if config.n_neg > 1:
        config.exp_name += '_neg_{}'.format(config.n_neg)
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        config.exp_name += '_not_omit'
    if kwargs:
        config.exp_name += '_' + '_'.join(
            [str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint',
                                        '').replace('_earlystopping', '')
    config.exp_name += callback_str

    # prediction log recording experiment configuration and results
    predict_log.update({
        'el_exp_name': config.exp_name,
        'el_batch_size': batch_size,
        'el_optimizer': optimizer_type,
        'el_epoch': n_epoch,
        'el_learning_rate': learning_rate,
        'el_other_params': kwargs
    })

    print('Logging Info - Experiment: %s' % config.exp_name)
    model = LinkModel(config, **kwargs)

    model_save_path = os.path.join(config.checkpoint_dir,
                                   '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path):
        raise FileNotFoundError(
            'Link model checkpoint does not exist: {}'.format(model_save_path))
    if swa_type is None:
        model.load_best_model()
    elif 'swa' in config.callbacks_to_add:
        model.load_swa_model(swa_type)
        predict_log['er_exp_name'] += '_{}'.format(swa_type)

    dev_data_type = 'dev'
    dev_data = load_data(dev_data_type)
    dev_text_data, dev_gold_mention_entities = [], []
    for data in dev_data:
        dev_text_data.append(data['text'])
        dev_gold_mention_entities.append(data['mention_data'])

    if predict_on_final_test:
        test_data_type = 'test_final'
    else:
        test_data_type = 'test'
    test_data = load_data(test_data_type)
    test_text_data = [data['text'] for data in test_data]

    if dev_pred_mentions is not None:
        print(
            'Logging Info - Evaluate over valid data based on predicted mention:'
        )
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                                  dev_gold_mention_entities)
        dev_performance = 'dev_performance' if swa_type is None else '%s_dev_performance' % swa_type
        predict_log[dev_performance] = (r, p, f1)
    print('Logging Info - Generate submission for test data:')
    test_pred_mention_entities = model.predict(test_text_data,
                                               test_pred_mentions)
    test_submit_file = predict_log[
        'er_exp_name'] + '_' + config.exp_name + '_%s%ssubmit.json' % (
            swa_type + '_' if swa_type else '',
            'final_' if predict_on_final_test else '')
    submit_result(test_submit_file, test_data, test_pred_mention_entities)

    predict_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step'),
              log=predict_log,
              mode='a')
    return predict_log
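
A hypothetical two-step invocation of link; dev_mentions and test_mentions are assumed to come from a previously trained recognition model, and all values below are illustrative:

predict_log = {'er_exp_name': 'recognition_exp'}
link(model_name='mention_link', dev_pred_mentions=dev_mentions,
     test_pred_mentions=test_mentions, predict_log=predict_log,
     embed_type='c2v', n_neg=2, omit_one_cand=True)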
Example #4
def process_data():
    config = ModelConfig()

    # create dir
    if not path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR)
    if not path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not path.exists(MODEL_SAVED_DIR):
        os.makedirs(MODEL_SAVED_DIR)
    if not path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # load datasets
    data_train, data_dev = load_data()
    print('Logging Info - Data: train - {}, dev - {}'.format(
        data_train.shape, data_dev.shape))

    for variation in VARIATIONS:
        if variation not in data_train.index:
            continue

        analyze_result = {}
        variation_train = data_train.loc[variation]
        variation_dev = data_dev.loc[variation]

        print('Logging Info - Variation: {}, train - {}, dev - {}'.format(
            variation, variation_train.shape, variation_dev.shape))
        analyze_result.update({
            'train_set': len(variation_train),
            'dev_set': len(variation_dev)
        })

        variation_train_data = get_sentence_label(variation_train)
        variation_dev_data = get_sentence_label(variation_dev)

        if config.data_augment:
            variation_train_data = augment_data(variation_train_data)
            variation += '_aug'

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(
            variation_train_data['label'])
        analyze_result.update(
            dict(('train_cls_{}'.format(cls), percent)
                 for cls, percent in train_label_distribution.items()))
        dev_label_distribution = analyze_class_distribution(
            variation_dev_data['label'])
        analyze_result.update(
            dict(('dev_cls_{}'.format(cls), percent)
                 for cls, percent in dev_label_distribution.items()))

        # create tokenizer and vocabulary
        sentences_train = variation_train_data['sentence']
        sentences_dev = variation_dev_data['sentence']

        word_tokenizer = Tokenizer(char_level=False)
        char_tokenizer = Tokenizer(char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Variation: {}, word_vocab: {}, char_vocab: {}'.
              format(variation, len(word_tokenizer.word_index),
                     len(char_tokenizer.word_index)))
        analyze_result.update({
            'word_vocab': len(word_tokenizer.word_index),
            'char_vocab': len(char_tokenizer.word_index)
        })

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(
            sentences_train, level='word')
        analyze_result.update(
            dict(('word_{}'.format(k), v)
                 for k, v in word_len_distribution.items()))
        char_len_distribution, char_max_len = analyze_len_distribution(
            sentences_train, level='char')
        analyze_result.update(
            dict(('char_{}'.format(k), v)
                 for k, v in char_len_distribution.items()))

        one_hot = config.loss_function != 'binary_crossentropy'
        train_word_ids = create_data_matrices(word_tokenizer,
                                              variation_train_data,
                                              config.n_class, one_hot,
                                              word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer,
                                              variation_train_data,
                                              config.n_class, one_hot,
                                              char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, variation_dev_data,
                                            config.n_class, one_hot,
                                            word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, variation_dev_data,
                                            config.n_class, one_hot,
                                            char_max_len)

        # create embedding matrix by training on dataset
        w2v_data = train_w2v(sentences_train + sentences_dev,
                             lambda x: x.split(), word_tokenizer.word_index)
        c2v_data = train_w2v(sentences_train + sentences_dev,
                             lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_data = train_fasttext(sentences_train + sentences_dev,
                                         lambda x: x.split(),
                                         word_tokenizer.word_index)
        c_fasttext_data = train_fasttext(sentences_train + sentences_dev,
                                         lambda x: list(x),
                                         char_tokenizer.word_index)
        # w_glove_data = train_glove(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        # c_glove_data = train_glove(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_DATA_TEMPLATE,
                            variation=variation), variation_train_data)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_DATA_TEMPLATE,
                            variation=variation), variation_dev_data)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='word'), train_word_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='char'), train_char_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='word'), dev_word_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='char'), dev_char_ids)

        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='w2v_data'), w2v_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='c2v_data'), c2v_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='w_fasttext_data'), w_fasttext_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='c_fasttext_data'), c_fasttext_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        # type='w_glove_data'), w_glove_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        # type='c_glove_data'), c_glove_data)

        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TOKENIZER_TEMPLATE,
                            variation=variation,
                            level='word'), word_tokenizer)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TOKENIZER_TEMPLATE,
                            variation=variation,
                            level='char'), char_tokenizer)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            variation=variation,
                            level='word'), word_tokenizer.word_index)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            variation=variation,
                            level='char'), char_tokenizer.word_index)

        # prepare ngram feature
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['char', 'word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3), (2, 3), (1, 3),
                                    (2, 4), (1, 4), (4, 4), (5, 5), (6, 6),
                                    (7, 7), (8, 8)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range,
                                          variation_train_data,
                                          variation_dev_data, variation)

        # prepare skip ngram features
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word', 'char']:
                for ngram in [2, 3]:
                    for skip_k in [1, 2, 3]:
                        prepare_skip_ngram_feature(vectorizer_type, level,
                                                   ngram, skip_k,
                                                   variation_train_data,
                                                   variation_dev_data,
                                                   variation)

        # prepare pos ngram
        variation_train_pos_data = {
            'sentence': [
                get_pos(sentence)
                for sentence in variation_train_data['sentence']
            ],
            'label':
            variation_train_data['label']
        }
        variation_dev_pos_data = {
            'sentence':
            [get_pos(sentence) for sentence in variation_dev_data['sentence']],
            'label':
            variation_dev_data['label']
        }
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range,
                                          variation_train_pos_data,
                                          variation_dev_pos_data,
                                          variation + '_pos')

        # save analyze result
        write_log(
            format_filename(LOG_DIR,
                            ANALYSIS_LOG_TEMPLATE,
                            variation=variation), analyze_result)
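
A self-contained sketch of what one (vectorizer_type, level, ngram_range) combination in the loops above plausibly computes, assuming prepare_ngram_feature wraps scikit-learn vectorizers (an assumption, not confirmed by the source):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def ngram_features(train_texts, dev_texts, vectorizer_type='tfidf', level='char', ngram_range=(1, 3)):
    # analyzer='char' extracts character n-grams, 'word' extracts token n-grams
    analyzer = 'char' if level == 'char' else 'word'
    if vectorizer_type == 'binary':
        vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range, binary=True)
    elif vectorizer_type == 'tf':
        vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    else:  # 'tfidf'
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    train_x = vectorizer.fit_transform(train_texts)  # fit on train only to avoid leakage
    dev_x = vectorizer.transform(dev_texts)
    return train_x, dev_x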
Example #5
                  format(variation, max_dev_performance))

            vote_dev_pred_class = vote_ensemble(model_dev_pred_classes,
                                                fallback=fallback)
            vote_dev_performance = eval_all(dev_data_label,
                                            vote_dev_pred_class)
            ensemble_log['vote_ensemble'] = vote_dev_performance
            print(
                'Logging Info - {} - majority vote ensembling: (acc, f1, p, r):{}'
                .format(variation, vote_dev_performance))

            ensemble_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                       time.localtime())
            write_log(format_filename(LOG_DIR,
                                      PERFORMANCE_LOG_TEMPLATE,
                                      variation=variation + '_ensemble'),
                      ensemble_log,
                      mode='a')

            if len(model_test_pred_probas) != 0:
                mean_test_pred_class = mean_ensemble(model_test_pred_probas,
                                                     binary_threshold)
                writer_predict(
                    format_filename(
                        PREDICT_DIR, '%s_%s_mean_ensemble.labels' %
                        (variation,
                         '_'.join(dl_model_names + ml_model_names))),
                    mean_test_pred_class)

                max_test_pred_class = max_ensemble(model_test_pred_probas,
                                                   binary_threshold)
Example #6
def train_recognition(model_name, label_schema='BIOES', batch_size=32, n_epoch=50, learning_rate=0.001,
                      optimizer_type='adam', use_char_input=True, embed_type=None, embed_trainable=True,
                      use_bert_input=False, bert_type='bert', bert_trainable=True, bert_layer_num=1,
                      use_bichar_input=False, bichar_embed_type=None, bichar_embed_trainable=True,
                      use_word_input=False, word_embed_type=None, word_embed_trainable=True,
                      use_charpos_input=False, charpos_embed_type=None, charpos_embed_trainable=True,
                      use_softword_input=False, use_dictfeat_input=False, use_maxmatch_input=False,
                      callbacks_to_add=None, overwrite=False, swa_start=3, early_stopping_patience=3, **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.label_schema = label_schema
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    config.use_char_input = use_char_input
    if embed_type:
        config.embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=embed_type))
        config.embed_trainable = embed_trainable
        config.embed_dim = config.embeddings.shape[1]
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    if config.use_char_input:
        config.exp_name = '{}_{}_{}_{}_{}_{}_{}'.format(model_name, config.embed_type if config.embed_type else 'random',
                                                        'tune' if config.embed_trainable else 'fix', batch_size,
                                                        optimizer_type, learning_rate, label_schema)
    else:
        config.exp_name = '{}_{}_{}_{}_{}'.format(model_name, batch_size, optimizer_type, learning_rate, label_schema)
    if config.n_epoch != 50:
        config.exp_name += '_{}'.format(config.n_epoch)
    if kwargs:
        config.exp_name += '_' + '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
    config.exp_name += callback_str

    config.use_bert_input = use_bert_input
    config.bert_type = bert_type
    config.bert_trainable = bert_trainable
    config.bert_layer_num = bert_layer_num
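    # the model must consume at least one of char-level or BERT input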
    assert config.use_char_input or config.use_bert_input
    if config.use_bert_input:
        config.exp_name += '_{}_layer_{}_{}'.format(bert_type, bert_layer_num, 'tune' if config.bert_trainable else 'fix')
    config.use_bichar_input = use_bichar_input
    if config.use_bichar_input:
        config.bichar_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='bichar'))
        config.bichar_vocab_size = len(config.bichar_vocab) + 2
        if bichar_embed_type:
            config.bichar_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                               type=bichar_embed_type))
            config.bichar_embed_trainable = bichar_embed_trainable
            config.bichar_embed_dim = config.bichar_embeddings.shape[1]
        else:
            config.bichar_embeddings = None
            config.bichar_embed_trainable = True
        config.exp_name += '_bichar_{}_{}'.format(bichar_embed_type if bichar_embed_type else 'random',
                                                  'tune' if config.bichar_embed_trainable else 'fix')
    config.use_word_input = use_word_input
    if config.use_word_input:
        config.word_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='word'))
        config.word_vocab_size = len(config.word_vocab) + 2
        if word_embed_type:
            config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                             type=word_embed_type))
            config.word_embed_trainable = word_embed_trainable
            config.word_embed_dim = config.word_embeddings.shape[1]
        else:
            config.word_embeddings = None
            config.word_embed_trainable = True
        config.exp_name += '_word_{}_{}'.format(word_embed_type if word_embed_type else 'random',
                                                'tune' if config.word_embed_trainable else 'fix')
    config.use_charpos_input = use_charpos_input
    if config.use_charpos_input:
        config.charpos_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='charpos'))
        config.charpos_vocab_size = len(config.charpos_vocab) + 2
        if charpos_embed_type:
            config.charpos_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                                type=charpos_embed_type))
            config.charpos_embed_trainable = charpos_embed_trainable
            config.charpos_embed_dim = config.charpos_embeddings.shape[1]
        else:
            config.charpos_embeddings = None
            config.charpos_embed_trainable = True
        config.exp_name += '_charpos_{}_{}'.format(charpos_embed_type if charpos_embed_type else 'random',
                                                   'tune' if config.charpos_embed_trainable else 'fix')
    config.use_softword_input = use_softword_input
    if config.use_softword_input:
        config.exp_name += '_softword'
    config.use_dictfeat_input = use_dictfeat_input
    if config.use_dictfeat_input:
        config.exp_name += '_dictfeat'
    config.use_maxmatch_input = use_maxmatch_input
    if config.use_maxmatch_input:
        config.exp_name += '_maxmatch'

    # training log recording experiment configuration and results
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type, 'epoch': n_epoch,
                 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    model = RecognitionModel(config, **kwargs)

    train_data_type, dev_data_type = 'train', 'dev'
    train_generator = RecognitionDataGenerator(train_data_type, config.batch_size, config.label_schema,
                                               config.label_to_one_hot[config.label_schema],
                                               config.vocab if config.use_char_input else None,
                                               config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
                                               config.bert_seq_len, config.bichar_vocab, config.word_vocab,
                                               config.use_word_input, config.charpos_vocab, config.use_softword_input,
                                               config.use_dictfeat_input, config.use_maxmatch_input)
    valid_generator = RecognitionDataGenerator(dev_data_type, config.batch_size, config.label_schema,
                                               config.label_to_one_hot[config.label_schema],
                                               config.vocab if config.use_char_input else None,
                                               config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
                                               config.bert_seq_len, config.bichar_vocab, config.word_vocab,
                                               config.use_word_input, config.charpos_vocab, config.use_softword_input,
                                               config.use_dictfeat_input, config.use_maxmatch_input)

    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_generator, valid_generator)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    r, p, f1 = model.evaluate(valid_generator)
    train_log['dev_performance'] = (r, p, f1)

    swa_type = None
    if 'swa' in config.callbacks_to_add:
        swa_type = 'swa'
    elif 'swa_clr' in config.callbacks_to_add:
        swa_type = 'swa_clr'
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(valid_generator)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_er'), log=train_log, mode='a')

    del model
    gc.collect()
    K.clear_session()
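
A hypothetical invocation of train_recognition; flag values are illustrative:

train_recognition(model_name='bilstm_cnn_crf', label_schema='BIOES', batch_size=32,
                  embed_type='c2v', use_softword_input=True,
                  callbacks_to_add=['modelcheckpoint', 'earlystopping', 'swa'],
                  swa_start=3, overwrite=False)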
Example #7
def main():
    process_conf = ProcessConfig()
    # create directory
    if not os.path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR)
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not os.path.exists(MODEL_SAVED_DIR):
        os.makedirs(MODEL_SAVED_DIR)
    if not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # load SNLI, MultiNLI and MLI datasets
    data_train, data_dev, data_test = load_data()
    print('Logging Info - Data: train - {}, dev - {}, test - {}'.format(data_train.shape, data_dev.shape,
                                                                        data_test.shape))

    for genre in GENRES:
        if genre not in data_train.index:
            continue

        analyze_result = {}

        genre_train = data_train.loc[genre]
        genre_dev = data_dev.loc[genre]
        genre_test = data_test.loc[genre]   # might be None
        print('Logging Info - Genre: {}, train - {}, dev - {}, test - {}'.format(
            genre, genre_train.shape, genre_dev.shape,
            None if genre_test is None else genre_test.shape))
        analyze_result.update({'train_set': len(genre_train), 'dev_set': len(genre_dev),
                               'test_set': 0 if genre_test is None else len(genre_test)})

        genre_train_data = process_data(genre_train, process_conf.clean, process_conf.stem)
        genre_dev_data = process_data(genre_dev, process_conf.clean, process_conf.stem)

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(genre_train_data['label'])
        analyze_result.update(dict(('train_cls_{}'.format(cls), percent) for cls, percent in train_label_distribution.items()))
        dev_label_distribution = analyze_class_distribution(genre_dev_data['label'])
        analyze_result.update(dict(('dev_cls_{}'.format(cls), percent) for cls, percent in dev_label_distribution.items()))

        # create tokenizer and vocabulary
        sentences_train = genre_train_data['premise'] + genre_train_data['hypothesis']
        sentences_dev = genre_dev_data['premise'] + genre_dev_data['hypothesis']

        word_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=False)
        char_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)    # just fit on train data
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Genre: {}, word_vocab: {}, char_vocab: {}'.format(genre, len(word_tokenizer.word_index),
                                                                                len(char_tokenizer.word_index)))
        analyze_result.update({'word_vocab': len(word_tokenizer.word_index),
                               'char_vocab': len(char_tokenizer.word_index)})

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(sentences_train, level='word')
        analyze_result.update(dict(('word_{}'.format(k), v) for k, v in word_len_distribution.items()))
        char_len_distribution, char_max_len = analyze_len_distribution(sentences_train, level='char')
        analyze_result.update(dict(('char_{}'.format(k), v) for k, v in char_len_distribution.items()))

        train_word_ids = create_data_matrices(word_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, char_max_len)

        # create embedding matrix from pretrained word vectors
        glove_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['glove_cc'], word_tokenizer.word_index)
        fasttext_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_cc'], word_tokenizer.word_index)
        fasttext_wiki = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_wiki'], word_tokenizer.word_index)
        # create embedding matrices by training on the NLI data itself
        w2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre), genre_train_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre), genre_dev_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'word'), train_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'char'), train_char_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'word'), dev_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'char'), dev_char_ids)

        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'glove_cc'), glove_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_cc'), fasttext_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_wiki'), fasttext_wiki)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w2v_nil'), w2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c2v_nil'), c2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_fasttext_nil'), w_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_fasttext_nil'), c_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_glove_nil'), w_glove_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_glove_nil'), c_glove_nil)

        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'word'), word_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'char'), char_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'word'), word_tokenizer.word_index)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'char'), char_tokenizer.word_index)

        if genre_test is not None:
            genre_test_data = process_data(genre_test, process_conf.clean, process_conf.stem)
            test_label_distribution = analyze_class_distribution(genre_test_data['label'])
            analyze_result.update(
                dict(('test_cls_%d' % cls, percent) for cls, percent in test_label_distribution.items()))

            test_word_ids = create_data_matrices(word_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class,
                                                 word_max_len)
            test_char_ids = create_data_matrices(char_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class,
                                                 char_max_len)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre), genre_test_data)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'word'), test_word_ids)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'char'), test_char_ids)

        # save analyze result
        analyze_result['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        write_log(format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, genre), analyze_result)
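
A minimal standalone sketch of the tokenizer-to-id-matrix step above, assuming create_data_matrices wraps Keras texts_to_sequences plus pad_sequences (an assumption about its internals):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ['a man is walking', 'two dogs are running']
tokenizer = Tokenizer(lower=True, filters='', char_level=False)
tokenizer.fit_on_texts(texts)                  # build the vocabulary (train data only)
ids = tokenizer.texts_to_sequences(texts)      # tokens -> integer ids
matrix = pad_sequences(ids, maxlen=6, padding='post', truncating='post')
print(matrix.shape)                            # (2, 6)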
Example #8
def train_dl_model(variation,
                   input_level,
                   word_embed_type,
                   word_embed_trainable,
                   batch_size,
                   learning_rate,
                   optimizer_type,
                   model_name,
                   binary_threshold=0.5,
                   checkpoint_dir=None,
                   overwrite=False,
                   log_error=False,
                   save_log=True,
                   **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation,
                        type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.binary_threshold = binary_threshold
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate,
        'binary_threshold': binary_threshold
    }

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')
    test_input = load_processed_data(variation, input_level, 'test')

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(
        dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())

    if log_error:
        error_indexes, error_pred_probas = model.error_analyze(dev_input)
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes,
                                                error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(
                error_index, dev_text_input['sentence'][error_index],
                dev_text_input['label'][error_index], error_pred_prob)
    if save_log:
        write_log(format_filename(LOG_DIR,
                                  PERFORMANCE_LOG_TEMPLATE,
                                  variation=variation),
                  log=train_log,
                  mode='a')

    if test_input is not None:
        test_predictions = model.predict(test_input)
        writer_predict(
            format_filename(PREDICT_DIR, config.exp_name + '.labels'),
            test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
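
A hypothetical call to train_dl_model; the variation and embedding names are illustrative:

valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = train_dl_model(
    variation='simplified', input_level='word', word_embed_type='w2v_data',
    word_embed_trainable=True, batch_size=64, learning_rate=0.001,
    optimizer_type='adam', model_name='bilstm', overwrite=False)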
Example #9
def train_match_model(variation,
                      input_level,
                      word_embed_type,
                      word_embed_trainable,
                      batch_size,
                      learning_rate,
                      optimizer_type,
                      encoder_type='concat_attention',
                      metrics='euclidean',
                      checkpoint_dir=None,
                      overwrite=False):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = (config.aug_word_max_len if input_level == 'word'
                          else config.aug_char_max_len)
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation,
                        type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_dialect_match_{}_{}_{}_{}_{}'.format(
        variation, encoder_type, metrics, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')
    config.checkpoint_monitor = 'val_loss'
    config.early_stopping_monitor = 'val_loss'
    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'learning_rate': learning_rate
    }

    model = DialectMatchModel(config,
                              encoder_type=encoder_type,
                              metrics=metrics)
    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')

    model_save_path = path.join(config.checkpoint_dir,
                                '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1 = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())

    write_log(format_filename(LOG_DIR,
                              PERFORMANCE_LOG_TEMPLATE,
                              variation=variation + '_match'),
              log=train_log,
              mode='a')
    return valid_acc, valid_f1
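
For reference, a hypothetical call to train_match_model; every argument value below is a placeholder chosen for illustration, not a value taken from this code:

# Hypothetical usage sketch; argument values are placeholders.
valid_acc, valid_f1 = train_match_model(variation='dialect_aug',
                                        input_level='word',
                                        word_embed_type='w2v',
                                        word_embed_trainable=True,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        optimizer_type='adam',
                                        overwrite=False)
print('valid_acc: %s, valid_f1: %s' % (valid_acc, valid_f1))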
Example #10
def train_link(model_name,
               batch_size=32,
               n_epoch=50,
               learning_rate=0.001,
               optimizer_type='adam',
               embed_type=None,
               embed_trainable=True,
               callbacks_to_add=None,
               use_relative_pos=False,
               n_neg=1,
               omit_one_cand=True,
               overwrite=False,
               swa_start=5,
               early_stopping_patience=3,
               **kwargs):
    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            type=embed_type))
        config.embed_trainable = embed_trainable
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or [
        'modelcheckpoint', 'earlystopping'
    ]
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(
        format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
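    # +2 presumably reserves extra indices (e.g. padding and unknown tokens)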
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(
        format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    config.exp_name = '{}_{}_{}_{}_{}_{}'.format(
        model_name, embed_type if embed_type else 'random',
        'tune' if config.embed_trainable else 'fix', batch_size,
        optimizer_type, learning_rate)
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        config.exp_name += '_rel'
    config.n_neg = n_neg
    if config.n_neg > 1:
        config.exp_name += '_neg_{}'.format(config.n_neg)
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        config.exp_name += '_not_omit'
    if kwargs:
        config.exp_name += '_' + '_'.join(
            [str(k) + '_' + str(v) for k, v in kwargs.items()])
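    # strip the default callback names so exp_name only records extra
    # callbacks such as swa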
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint',
                                        '').replace('_earlystopping', '')
    config.exp_name += callback_str

    # logger to log output of training process
    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'epoch': n_epoch,
        'learning_rate': learning_rate,
        'other_params': kwargs
    }

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_save_path = os.path.join(config.checkpoint_dir,
                                   '{}.hdf5'.format(config.exp_name))
    model = LinkModel(config, **kwargs)

    train_data_type, dev_data_type = 'train', 'dev'
    train_generator = LinkDataGenerator(
        train_data_type, config.vocab, config.mention_to_entity,
        config.entity_desc, config.batch_size, config.max_desc_len,
        config.max_erl_len, config.use_relative_pos, config.n_neg,
        config.omit_one_cand)
    dev_data = load_data(dev_data_type)

    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_generator, dev_data)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' %
              time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S",
                                                time.gmtime(elapsed_time))

    model.load_best_model()
    dev_text_data, dev_pred_mentions, dev_gold_mention_entities = [], [], []
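    # the gold mention spans double as the "predicted" mentions below, so
    # this evaluates the entity-linking step in isolation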
    for data in dev_data:
        dev_text_data.append(data['text'])
        dev_pred_mentions.append(data['mention_data'])
        dev_gold_mention_entities.append(data['mention_data'])
    print('Logging Info - Evaluate over valid data:')
    r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                              dev_gold_mention_entities)
    train_log['dev_performance'] = (r, p, f1)

    swa_type = None
    if 'swa' in config.callbacks_to_add:
        swa_type = 'swa'
    elif 'swa_clr' in config.callbacks_to_add:
        swa_type = 'swa_clr'
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions,
                                  dev_gold_mention_entities)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_el'),
              log=train_log,
              mode='a')
    del model
    gc.collect()
    K.clear_session()
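
Finally, a hypothetical sweep over train_link; the model name and embedding type below are placeholders for illustration:

# Hypothetical usage sketch; 'link_cnn' and 'c2v' are placeholder names.
for lr in [1e-3, 1e-4]:
    train_link('link_cnn',
               batch_size=32,
               n_epoch=50,
               learning_rate=lr,
               optimizer_type='adam',
               embed_type='c2v',
               callbacks_to_add=['modelcheckpoint', 'earlystopping', 'swa'],
               overwrite=False)

Because train_link deletes the model and clears the Keras session before returning, repeated calls in a sweep like this should not accumulate stale graphs in memory.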