def load_single_ngram_data(variation, vectorizer_type, level, ngram_range, data_type):
    """Load the pickled n-gram data of one split, or ``None`` if the file is absent.

    :param variation: value passed as the ``variation`` file-name field.
    :param vectorizer_type: value passed as the ``type`` file-name field.
    :param level: value passed as the ``level`` file-name field.
    :param ngram_range: value passed as the ``ngram_range`` file-name field.
    :param data_type: ``'train'``, ``'valid'`` / ``'dev'`` or ``'test'``.
    :return: the object returned by ``pickle_load``, or ``None`` when the
        file does not exist.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    # Only the template differs between splits; pick it first.
    if data_type == 'train':
        template = TRAIN_NGRAM_DATA_TEMPLATE
    elif data_type in ('valid', 'dev'):
        template = DEV_NGRAM_DATA_TEMPLATE
    elif data_type == 'test':
        template = TEST_NGRAM_DATA_TEMPLATE
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))

    ngram_path = format_filename(PROCESSED_DATA_DIR, template,
                                 variation=variation, type=vectorizer_type,
                                 level=level, ngram_range=ngram_range)
    if not os.path.exists(ngram_path):
        return None
    return pickle_load(ngram_path)
def load_processed_data(genre, level, data_type):
    """Load the pickled ids-matrix file of one split.

    :param genre: first positional field of the file-name template.
    :param level: second positional field of the file-name template.
    :param data_type: ``'train'``, ``'valid'`` / ``'dev'`` or ``'test'``.
    :return: the object returned by ``pickle_load`` for the selected file.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    if data_type == 'train':
        template = TRAIN_IDS_MATRIX_TEMPLATE
    elif data_type in ('valid', 'dev'):
        template = DEV_IDS_MATRIX_TEMPLATE
    elif data_type == 'test':
        template = TEST_IDS_MATRIX_TEMPLATE
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))

    ids_matrix_path = format_filename(PROCESSED_DATA_DIR, template, genre, level)
    return pickle_load(ids_matrix_path)
def __init__(self, genre):
    """Load the processed train/dev/test data of ``genre`` and ensure ``FEATURE_DIR`` exists.

    :param genre: genre name; stored on the instance and used as the
        file-name field of the three data templates.
    """
    self.genre = genre
    self.train_data = pickle_load(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre))
    self.dev_data = pickle_load(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre))
    self.test_data = pickle_load(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre))

    # exist_ok=True replaces the former "exists? then create" pair, which
    # could fail if another process created the directory in between.
    os.makedirs(FEATURE_DIR, exist_ok=True)
def load_features(genre, data_type, scale_features):
    """Load the pickled feature file of one split.

    :param genre: first positional field of the file-name template.
    :param data_type: ``'train'``, ``'valid'`` / ``'dev'`` or ``'test'``.
    :param scale_features: truthy selects the ``'all_scaled'`` feature file,
        falsy the ``'all'`` one.
    :return: the object returned by ``pickle_load`` for the selected file.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    if scale_features:
        feat_type = 'all_scaled'
    else:
        feat_type = 'all'

    if data_type == 'train':
        template = TRAIN_FEATURES_TEMPLATE
    elif data_type in ('valid', 'dev'):
        template = DEV_FEATURES_TEMPLATE
    elif data_type == 'test':
        template = TEST_FEATURES_TEMPLATE
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))

    feature_path = format_filename(FEATURE_DIR, template, genre, feat_type)
    return pickle_load(feature_path)
def format_feature_file(self, data_type, feat_type):
    """Build the feature file name for one split of ``self.genre``.

    :param data_type: ``'train'``, ``'dev'`` / ``'valid'`` or ``'test'``.
    :param feat_type: second positional field of the file-name template.
    :return: the value returned by ``format_filename``.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    if data_type == 'train':
        template = TRAIN_FEATURES_TEMPLATE
    elif data_type in ('dev', 'valid'):
        template = DEV_FEATURES_TEMPLATE
    elif data_type == 'test':
        template = TEST_FEATURES_TEMPLATE
    else:
        raise ValueError('Data Type `{}` not understood'.format(data_type))
    return format_filename(FEATURE_DIR, template, self.genre, feat_type)
def load_data(data_type):
    """Load the pickled data file of one split.

    :param data_type: ``'train'``, ``'dev'``, ``'test'`` or ``'test_final'``.
    :return: the object returned by ``pickle_load`` for the selected file.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    # Select the file name first so the load itself is written only once.
    if data_type == 'train':
        data_filename = TRAIN_DATA_FILENAME
    elif data_type == 'dev':
        data_filename = DEV_DATA_FILENAME
    elif data_type == 'test':
        data_filename = TEST_DATA_FILENAME
    elif data_type == 'test_final':
        data_filename = TEST_FINAL_DATA_FILENAME
    else:
        # Message previously read "data tye not understood".
        raise ValueError('data type not understood: {}'.format(data_type))
    return pickle_load(format_filename(PROCESSED_DATA_DIR, data_filename))
def load_processed_text_data(variation, data_type):
    """Load the pickled processed text data of one split, or ``None`` if absent.

    :param variation: value passed as the ``variation`` file-name field.
    :param data_type: ``'train'``, ``'valid'`` / ``'dev'`` or ``'test'``.
    :return: the object returned by ``pickle_load``, or ``None`` when the
        file does not exist.
    :raises ValueError: if ``data_type`` is none of the accepted names.
    """
    if data_type == 'train':
        template = TRAIN_DATA_TEMPLATE
    elif data_type in ('valid', 'dev'):
        template = DEV_DATA_TEMPLATE
    elif data_type == 'test':
        template = TEST_DATA_TEMPLATE
    else:
        raise ValueError('Data Type Not Understood: {}'.format(data_type))

    text_path = format_filename(PROCESSED_DATA_DIR, template, variation=variation)
    if not os.path.exists(text_path):
        return None
    return pickle_load(text_path)
def train_ensemble_model(ensemble_models, model_name, variation, dev_data, train_data=None, test_data=None,
                         binary_threshold=0.5, checkpoint_dir=None, overwrite=False, log_error=False,
                         save_log=True, **kwargs):
    """Train (if needed) and evaluate one ensemble model, optionally predicting on test data.

    :param ensemble_models: only used here as the last field of the experiment name.
    :param model_name: one of ``'svm'``, ``'lr'``, ``'sgd'``, ``'gnb'``, ``'mnb'``,
        ``'bnb'``, ``'rf'``, ``'gbdt'``, ``'xgboost'``, ``'lda'``.
    :param variation: used in the experiment name, the log file name and to
        load the dev text for error logging.
    :param dev_data: data passed to ``model.evaluate`` (and ``model.error_analyze``).
    :param train_data: when not ``None``, the model is trained if no checkpoint
        file exists yet or ``overwrite`` is set.
    :param test_data: when not ``None``, predictions are written to ``PREDICT_DIR``.
    :param binary_threshold: stored on the config and in the log.
    :param checkpoint_dir: overrides ``config.checkpoint_dir`` when not ``None``.
    :param overwrite: retrain even if the checkpoint file exists.
    :param log_error: add one log entry per index reported by ``model.error_analyze``.
    :param save_log: append the log to the performance log file.
    :param kwargs: forwarded to the model constructor.
    :return: ``(valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r)`` as
        returned by ``model.evaluate``.
    :raises ValueError: if ``model_name`` is not recognised.
    """
    config = ModelConfig()
    config.binary_threshold = binary_threshold
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    # Ensure the checkpoint directory exists. exist_ok=True removes the
    # check-then-create race of the former `path.exists` + `os.makedirs` pair.
    os.makedirs(config.checkpoint_dir, exist_ok=True)

    config.exp_name = '{}_{}_ensemble_with_{}'.format(variation, model_name, ensemble_models)
    train_log = {
        'exp_name': config.exp_name,
        'binary_threshold': binary_threshold
    }
    print('Logging Info - Ensemble Experiment: ', config.exp_name)

    if model_name == 'svm':
        model = SVMModel(config, **kwargs)
    elif model_name == 'lr':
        model = LRModel(config, **kwargs)
    elif model_name == 'sgd':
        model = SGDModel(config, **kwargs)
    elif model_name == 'gnb':
        model = GaussianNBModel(config, **kwargs)
    elif model_name == 'mnb':
        model = MultinomialNBModel(config, **kwargs)
    elif model_name == 'bnb':
        model = BernoulliNBModel(config, **kwargs)
    elif model_name == 'rf':
        model = RandomForestModel(config, **kwargs)
    elif model_name == 'gbdt':
        model = GBDTModel(config, **kwargs)
    elif model_name == 'xgboost':
        model = XGBoostModel(config, **kwargs)
    elif model_name == 'lda':
        model = LDAModel(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if train_data is not None and (not path.exists(model_save_path) or overwrite):
        model.train(train_data)
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(dev_data)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    if log_error:
        error_indexes, error_pred_probas = model.error_analyze(dev_data)
        # NOTE(review): load_processed_text_data returns None when the dev
        # text file is missing, which would fail on the subscripts below.
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes, error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(
                error_index, dev_text_input['sentence'][error_index],
                dev_text_input['label'][error_index], error_pred_prob)

    if save_log:
        write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation),
                  log=train_log, mode='a')

    if test_data is not None:
        test_predictions = model.predict(test_data)
        writer_predict(format_filename(PREDICT_DIR, config.exp_name + '.labels'), test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
def predict_dl_model(data_type, variation, input_level, word_embed_type, word_embed_trainable, batch_size,
                     learning_rate, optimizer_type, model_name, checkpoint_dir=None, return_proba=True,
                     **kwargs):
    """Rebuild a trained deep model from its checkpoint and predict on one data split.

    :param data_type: split name forwarded to ``load_processed_data``.
    :param variation: used for the config, the embedding file name and the
        experiment name; a value containing ``'_aug'`` switches ``max_len``
        to the augmented lengths of the config.
    :param input_level: stored on the config and used in the experiment name.
    :param word_embed_type: selects the embedding matrix file.
    :param word_embed_trainable: stored on the config; contributes ``'tune'``
        or ``'fix'`` to the experiment name.
    :param batch_size: stored on the config.
    :param learning_rate: stored on the config and passed to ``get_optimizer``.
    :param optimizer_type: passed to ``get_optimizer``.
    :param model_name: one of ``'bilstm'``, ``'cnnrnn'``, ``'dcnn'``, ``'dpcnn'``,
        ``'han'``, ``'multicnn'``, ``'rcnn'``, ``'rnncnn'``, ``'cnn'``, ``'vdcnn'``.
    :param checkpoint_dir: overrides ``config.checkpoint_dir`` when not ``None``.
    :param return_proba: truthy uses ``model.predict_proba``, otherwise ``model.predict``.
    :param kwargs: forwarded to the model constructor.
    :return: ``(predictions, exp_name)``; ``predictions`` is ``None`` when
        ``load_processed_data`` returns ``None``.
    :raises ValueError: if ``model_name`` is not recognised.
    :raises FileNotFoundError: if the checkpoint file does not exist.
    """
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {'word': config.aug_word_max_len,
                          'char': config.aug_char_max_len}
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    embedding_path = format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                     variation=variation, type=word_embed_type)
    config.word_embeddings = np.load(embedding_path)
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir

    embed_mode = 'tune' if word_embed_trainable else 'fix'
    config.exp_name = '{}_{}_{}_{}_{}'.format(variation, model_name, input_level,
                                              word_embed_type, embed_mode)
    print('Logging Info - Experiment: ', config.exp_name)

    # Name -> model class lookup.
    model_classes = {
        'bilstm': BiLSTM,
        'cnnrnn': CNNRNN,
        'dcnn': DCNN,
        'dpcnn': DPCNN,
        'han': HAN,
        'multicnn': MultiTextCNN,
        'rcnn': RCNN,
        'rnncnn': RNNCNN,
        'cnn': TextCNN,
        'vdcnn': VDCNN,
    }
    model_class = model_classes.get(model_name)
    if model_class is None:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))
    model = model_class(config, **kwargs)

    checkpoint_file = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(checkpoint_file):
        raise FileNotFoundError('Model Not Found: {}'.format(checkpoint_file))
    # load the best model
    model.load_best_model()

    data = load_processed_data(variation, input_level, data_type)
    if data is None:
        return None, config.exp_name

    predictor = model.predict_proba if return_proba else model.predict
    return predictor(data), config.exp_name
print('Logging Info - {} - max ensembling: (acc, f1, p, r):{}'. format(variation, max_dev_performance)) vote_dev_pred_class = vote_ensemble(model_dev_pred_classes, fallback=fallback) vote_dev_performance = eval_all(dev_data_label, vote_dev_pred_class) ensemble_log['vote_ensemble'] = vote_dev_performance print( 'Logging Info - {} - majority vote ensembling: (acc, f1, p, r):{}' .format(variation, vote_dev_performance)) ensemble_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation + '_ensemble'), ensemble_log, mode='a') if len(model_test_pred_probas) != 0: mean_test_pred_class = mean_ensemble(model_test_pred_probas, binary_threshold) writer_predict( format_filename( PREDICT_DIR, '%s_%s_mean_ensemble.labels' % (variation, '_'.join(dl_model_names + ml_model_names))), mean_test_pred_class) max_test_pred_class = max_ensemble(model_test_pred_probas,
def main():
    """Pre-process every genre found in the training data and persist the results.

    For each genre in ``GENRES`` that appears in the training index this
    builds word/char tokenizers (fitted on the training sentences only),
    padded id matrices, pretrained and self-trained embedding matrices, and
    writes them together with an analysis log. Test data is processed only
    when the genre is present in the test index.
    """
    process_conf = ProcessConfig()

    # create directory; exist_ok avoids the check-then-create race
    for directory in (PROCESSED_DATA_DIR, LOG_DIR, MODEL_SAVED_DIR, IMG_DIR):
        os.makedirs(directory, exist_ok=True)

    # load SNLI, MultiNLI and MLI datasets
    data_train, data_dev, data_test = load_data()
    print('Logging Info - Data: train - {}, dev - {}, test - {}'.format(data_train.shape, data_dev.shape,
                                                                        data_test.shape))

    for genre in GENRES:
        if genre not in data_train.index:
            continue

        analyze_result = {}

        genre_train = data_train.loc[genre]
        genre_dev = data_dev.loc[genre]
        # `.loc` raises KeyError for a missing label instead of returning
        # None, so membership is tested explicitly to get the None that the
        # code below expects for a genre without test data.
        genre_test = data_test.loc[genre] if genre in data_test.index else None
        print('Logging Info - Genre: {}, train - {}, dev - {}, test - {}'.format(
            genre, genre_train.shape, genre_dev.shape,
            None if genre_test is None else genre_test.shape))
        analyze_result.update({'train_set': len(genre_train),
                               'dev_set': len(genre_dev),
                               'test_set': 0 if genre_test is None else len(genre_test)})

        genre_train_data = process_data(genre_train, process_conf.clean, process_conf.stem)
        genre_dev_data = process_data(genre_dev, process_conf.clean, process_conf.stem)

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(genre_train_data['label'])
        analyze_result.update({'train_cls_{}'.format(cls): percent
                               for cls, percent in train_label_distribution.items()})
        dev_label_distribution = analyze_class_distribution(genre_dev_data['label'])
        analyze_result.update({'dev_cls_{}'.format(cls): percent
                               for cls, percent in dev_label_distribution.items()})

        # create tokenizer and vocabulary
        sentences_train = genre_train_data['premise'] + genre_train_data['hypothesis']
        sentences_dev = genre_dev_data['premise'] + genre_dev_data['hypothesis']

        word_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=False)
        char_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)    # just fit on train data
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Genre: {}, word_vocab: {}, char_vocab: {}'.format(
            genre, len(word_tokenizer.word_index), len(char_tokenizer.word_index)))
        analyze_result.update({'word_vocab': len(word_tokenizer.word_index),
                               'char_vocab': len(char_tokenizer.word_index)})

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(sentences_train, level='word')
        analyze_result.update({'word_{}'.format(k): v for k, v in word_len_distribution.items()})
        char_len_distribution, char_max_len = analyze_len_distribution(sentences_train, level='char')
        analyze_result.update({'char_{}'.format(k): v for k, v in char_len_distribution.items()})

        train_word_ids = create_data_matrices(word_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, char_max_len)

        # create embedding matrix from pretrained word vectors
        glove_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['glove_cc'], word_tokenizer.word_index)
        fasttext_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_cc'], word_tokenizer.word_index)
        fasttext_wiki = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_wiki'], word_tokenizer.word_index)
        # create embedding matrix by training on this genre's train + dev sentences
        w2v_nil = train_w2v(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c2v_nil = train_w2v(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(),
                                        word_tokenizer.word_index)
        c_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: list(x),
                                        char_tokenizer.word_index)
        w_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: x.split(),
                                  word_tokenizer.word_index)
        c_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: list(x),
                                  char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre), genre_train_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre), genre_dev_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'word'), train_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'char'), train_char_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'word'), dev_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'char'), dev_char_ids)

        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'glove_cc'), glove_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_cc'), fasttext_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_wiki'),
                fasttext_wiki)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w2v_nil'), w2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c2v_nil'), c2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_fasttext_nil'),
                w_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_fasttext_nil'),
                c_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_glove_nil'), w_glove_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_glove_nil'), c_glove_nil)

        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'word'), word_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'char'), char_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'word'),
                    word_tokenizer.word_index)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'char'),
                    char_tokenizer.word_index)

        if genre_test is not None:
            genre_test_data = process_data(genre_test, process_conf.clean, process_conf.stem)
            test_label_distribution = analyze_class_distribution(genre_test_data['label'])
            analyze_result.update({'test_cls_%d' % cls: percent
                                   for cls, percent in test_label_distribution.items()})

            # The test split reuses the tokenizers and max lengths derived from the training split.
            test_word_ids = create_data_matrices(word_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class, word_max_len)
            test_char_ids = create_data_matrices(char_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class, char_max_len)

            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre), genre_test_data)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'word'),
                        test_word_ids)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'char'),
                        test_char_ids)

        # save analyze result
        analyze_result['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        write_log(format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, genre), analyze_result)
def recognition(model_name, predict_log, label_schema='BIOES', batch_size=32, n_epoch=50, learning_rate=0.001,
                optimizer_type='adam', use_char_input=True, embed_type=None, embed_trainable=True,
                use_bert_input=False, bert_type='bert', bert_trainable=True, bert_layer_num=1,
                use_bichar_input=False, bichar_embed_type=None, bichar_embed_trainable=True,
                use_word_input=False, word_embed_type=None, word_embed_trainable=True,
                use_charpos_input=False, charpos_embed_type=None, charpos_embed_trainable=True,
                use_softword_input=False, use_dictfeat_input=False, use_maxmatch_input=False,
                callbacks_to_add=None, swa_type=None, predict_on_dev=True, predict_on_final_test=True,
                **kwargs):
    """Rebuild a trained recognition model from its checkpoint and predict mentions.

    The configuration and the experiment name are assembled from the
    arguments; the experiment name determines the checkpoint file that must
    already exist.

    :param model_name: stored on the config; first field of the experiment name.
    :param predict_log: dict updated in place with ``er_*`` entries.
    :param label_schema: stored on the config; selects the one-hot label mapping.
    :param batch_size, n_epoch, learning_rate, optimizer_type: stored on the
        config / passed to ``get_optimizer`` and written to ``predict_log``.
    :param use_char_input, embed_type, embed_trainable: char input settings;
        a truthy ``embed_type`` loads the matching embedding matrix.
    :param use_bert_input, bert_type, bert_trainable, bert_layer_num: BERT
        input settings; at least one of char and BERT input must be enabled.
    :param use_bichar_input, bichar_embed_type, bichar_embed_trainable:
        bichar input settings.
    :param use_word_input, word_embed_type, word_embed_trainable: word input settings.
    :param use_charpos_input, charpos_embed_type, charpos_embed_trainable:
        charpos input settings.
    :param use_softword_input, use_dictfeat_input, use_maxmatch_input: flags
        stored on the config, each adding a suffix to the experiment name.
    :param callbacks_to_add: callback names; a falsy value falls back to
        ``['modelcheckpoint', 'earlystopping']``.
    :param swa_type: ``None`` loads the best checkpoint; otherwise the SWA
        weights of that type are loaded if ``'swa'`` is among the callbacks.
    :param predict_on_dev: whether to predict on the dev split.
    :param predict_on_final_test: use the ``'test_final'`` split instead of ``'test'``.
    :param kwargs: forwarded to ``RecognitionModel``; also appended to the
        experiment name and logged.
    :return: ``(dev_pred_mentions, test_pred_mentions)``; the first item is
        ``None`` when ``predict_on_dev`` is falsy.
    :raises FileNotFoundError: if the checkpoint file does not exist.
    """
    config = ModelConfig()
    config.model_name = model_name
    config.label_schema = label_schema
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    config.use_char_input = use_char_input
    if embed_type:
        config.embeddings = np.load(
            format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=embed_type))
        config.embed_trainable = embed_trainable
        config.embed_dim = config.embeddings.shape[1]
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    # The experiment name is built field by field; the order of the pieces
    # below determines the checkpoint file name.
    # NOTE(review): n_epoch is not part of the name here — confirm this
    # matches the naming used when the checkpoint was trained.
    if config.use_char_input:
        config.exp_name = '{}_{}_{}_{}_{}_{}_{}'.format(
            model_name, config.embed_type if config.embed_type else 'random',
            'tune' if config.embed_trainable else 'fix', batch_size, optimizer_type, learning_rate,
            label_schema)
    else:
        config.exp_name = '{}_{}_{}_{}_{}'.format(model_name, batch_size, optimizer_type, learning_rate,
                                                  label_schema)
    if kwargs:
        config.exp_name += '_' + '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()])
    # The two default callbacks do not contribute to the name.
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
    config.exp_name += callback_str

    config.use_bert_input = use_bert_input
    config.bert_type = bert_type
    config.bert_trainable = bert_trainable
    config.bert_layer_num = bert_layer_num
    assert config.use_char_input or config.use_bert_input
    if config.use_bert_input:
        config.exp_name += '_{}_layer_{}_{}'.format(bert_type, bert_layer_num,
                                                    'tune' if config.bert_trainable else 'fix')

    config.use_bichar_input = use_bichar_input
    if config.use_bichar_input:
        config.bichar_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                          level='bichar'))
        config.bichar_vocab_size = len(config.bichar_vocab) + 2
        if bichar_embed_type:
            config.bichar_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=bichar_embed_type))
            config.bichar_embed_trainable = bichar_embed_trainable
            config.bichar_embed_dim = config.bichar_embeddings.shape[1]
        else:
            config.bichar_embeddings = None
            config.bichar_embed_trainable = True
        config.exp_name += '_bichar_{}_{}'.format(bichar_embed_type if bichar_embed_type else 'random',
                                                  'tune' if config.bichar_embed_trainable else 'fix')

    config.use_word_input = use_word_input
    if config.use_word_input:
        config.word_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                        level='word'))
        config.word_vocab_size = len(config.word_vocab) + 2
        if word_embed_type:
            config.word_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=word_embed_type))
            config.word_embed_trainable = word_embed_trainable
            config.word_embed_dim = config.word_embeddings.shape[1]
        else:
            config.word_embeddings = None
            config.word_embed_trainable = True
        config.exp_name += '_word_{}_{}'.format(word_embed_type if word_embed_type else 'random',
                                                'tune' if config.word_embed_trainable else 'fix')

    config.use_charpos_input = use_charpos_input
    if config.use_charpos_input:
        config.charpos_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                           level='charpos'))
        config.charpos_vocab_size = len(config.charpos_vocab) + 2
        if charpos_embed_type:
            config.charpos_embeddings = np.load(
                format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=charpos_embed_type))
            config.charpos_embed_trainable = charpos_embed_trainable
            config.charpos_embed_dim = config.charpos_embeddings.shape[1]
        else:
            config.charpos_embeddings = None
            config.charpos_embed_trainable = True
        config.exp_name += '_charpos_{}_{}'.format(charpos_embed_type if charpos_embed_type else 'random',
                                                   'tune' if config.charpos_embed_trainable else 'fix')

    config.use_softword_input = use_softword_input
    if config.use_softword_input:
        config.exp_name += '_softword'
    config.use_dictfeat_input = use_dictfeat_input
    if config.use_dictfeat_input:
        config.exp_name += '_dictfeat'
    config.use_maxmatch_input = use_maxmatch_input
    if config.use_maxmatch_input:
        config.exp_name += '_maxmatch'

    # logger to log output of training process
    predict_log.update({
        'er_exp_name': config.exp_name,
        'er_batch_size': batch_size,
        'er_optimizer': optimizer_type,
        'er_epoch': n_epoch,
        'er_learning_rate': learning_rate,
        'er_other_params': kwargs
    })
    print('Logging Info - Experiment: %s' % config.exp_name)
    model = RecognitionModel(config, **kwargs)

    dev_data_type = 'dev'
    if predict_on_final_test:
        test_data_type = 'test_final'
    else:
        test_data_type = 'test'

    # NOTE(review): bichar_vocab / word_vocab / charpos_vocab are read from
    # the config even when the matching input is disabled; presumably
    # ModelConfig provides defaults for them — confirm.
    valid_generator = RecognitionDataGenerator(
        dev_data_type, config.batch_size, config.label_schema,
        config.label_to_one_hot[config.label_schema],
        config.vocab if config.use_char_input else None,
        config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
        config.bert_seq_len, config.bichar_vocab, config.word_vocab, config.use_word_input,
        config.charpos_vocab, config.use_softword_input, config.use_dictfeat_input,
        config.use_maxmatch_input)
    test_generator = RecognitionDataGenerator(
        test_data_type, config.batch_size, config.label_schema,
        config.label_to_one_hot[config.label_schema],
        config.vocab if config.use_char_input else None,
        config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
        config.bert_seq_len, config.bichar_vocab, config.word_vocab, config.use_word_input,
        config.charpos_vocab, config.use_softword_input, config.use_dictfeat_input,
        config.use_maxmatch_input)

    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path):
        raise FileNotFoundError('Recognition model not exist: {}'.format(model_save_path))

    if swa_type is None:
        model.load_best_model()
    # Test the resolved callback list: the raw `callbacks_to_add` argument
    # defaults to None, and `'swa' in None` raises TypeError.
    # NOTE(review): when swa_type is set but 'swa' is not among the
    # callbacks, neither load call runs — confirm that is intended.
    elif 'swa' in config.callbacks_to_add:
        model.load_swa_model(swa_type)
        predict_log['er_exp_name'] += '_{}'.format(swa_type)

    if predict_on_dev:
        print('Logging Info - Generate submission for valid data:')
        dev_pred_mentions = model.predict(valid_generator)
    else:
        dev_pred_mentions = None

    print('Logging Info - Generate submission for test data:')
    test_pred_mentions = model.predict(test_generator)

    return dev_pred_mentions, test_pred_mentions
def prepare_ngram_feature(vectorizer_type, level, ngram_range, train_data, dev_data, variation):
    """Fit an n-gram vectorizer on the training sentences and persist the results.

    The vectorizer is fitted on ``train_data['sentence']`` and applied to
    ``dev_data['sentence']``. The vectorizer and both feature sets are
    pickled under ``PROCESSED_DATA_DIR``.

    :param vectorizer_type: ``'binary'``, ``'tf'`` or ``'tfidf'``.
    :param level: analyzer, one of ``'word'``, ``'char'``, ``'char_wb'``.
    :param ngram_range: tuple forwarded to the vectorizer.
    :param train_data: mapping with ``'sentence'`` and ``'label'`` entries.
    :param dev_data: mapping with ``'sentence'`` and ``'label'`` entries.
    :param variation: used in the log message and the file names.
    :return: ``(vectorizer, train_ngram_data, dev_ngram_data)``.
    :raises ValueError: for an unknown ``level`` or ``vectorizer_type``, or
        when ``ngram_range`` is not a tuple.
    """
    if level not in ('word', 'char', 'char_wb'):
        raise ValueError('Vectorizer Level Not Understood: {}'.format(level))
    if not isinstance(ngram_range, tuple):
        raise ValueError('ngram_range should be a tuple, got {}'.format(type(ngram_range)))

    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer=level, ngram_range=ngram_range)
    elif vectorizer_type in ('binary', 'tf'):
        # 'binary' and 'tf' differ only in the `binary` flag.
        vectorizer = CountVectorizer(binary=(vectorizer_type == 'binary'), analyzer=level,
                                     ngram_range=ngram_range)
    else:
        raise ValueError('Vectorizer Type Not Understood: {}'.format(vectorizer_type))

    train_matrix = vectorizer.fit_transform(train_data['sentence'])
    train_ngram_data = {'sentence': train_matrix, 'label': train_data['label']}

    dev_matrix = vectorizer.transform(dev_data['sentence'])
    dev_ngram_data = {'sentence': dev_matrix, 'label': dev_data['label']}

    print('Logging info - {}_{}vectorizer_{}_{} : train_ngram_feature shape: {}, '
          'dev_ngram_feature shape: {}'.format(variation, vectorizer_type, level, ngram_range,
                                               train_matrix.shape, dev_matrix.shape))

    # All three output files share the same file-name fields.
    name_fields = dict(variation=variation, type=vectorizer_type, level=level, ngram_range=ngram_range)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, **name_fields), vectorizer)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_NGRAM_DATA_TEMPLATE, **name_fields),
                train_ngram_data)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_NGRAM_DATA_TEMPLATE, **name_fields),
                dev_ngram_data)
    return vectorizer, train_ngram_data, dev_ngram_data
def prepare_skip_ngram_feature(vectorizer_type, level, ngram, skip_k, train_data, dev_data, variation):
    """Fit a skip-n-gram vectorizer on the training sentences and persist the results.

    The vectorizer tokenizes with ``make_skip_tokenize(ngram, skip_k, level)``,
    is fitted on ``train_data['sentence']`` and applied to
    ``dev_data['sentence']``. The vectorizer is saved with ``dill`` and both
    feature sets are pickled under ``PROCESSED_DATA_DIR``.

    :param vectorizer_type: ``'binary'``, ``'tf'`` or ``'tfidf'``.
    :param level: ``'word'`` or ``'char'``.
    :param ngram: first argument of ``make_skip_tokenize``; formatted with
        ``%d`` in the file names.
    :param skip_k: second argument of ``make_skip_tokenize``; formatted with
        ``%d`` in the file names.
    :param train_data: mapping with ``'sentence'`` and ``'label'`` entries.
    :param dev_data: mapping with ``'sentence'`` and ``'label'`` entries.
    :param variation: used in the log message and the file names.
    :return: ``(vectorizer, train_ngram_data, dev_ngram_data)``.
    :raises ValueError: for an unknown ``level`` or ``vectorizer_type``.
    """
    if level not in ['word', 'char']:
        raise ValueError('Vectorizer Level Not Understood: {}'.format(level))
    if vectorizer_type not in ('binary', 'tf', 'tfidf'):
        raise ValueError('Vectorizer Type Not Understood: {}'.format(vectorizer_type))

    skip_tokenizer = make_skip_tokenize(ngram, skip_k, level)
    if vectorizer_type == 'tfidf':
        # The tokenizer must be passed by keyword: the first parameter of
        # TfidfVectorizer is `input`, so a positional argument never became
        # the tokenizer.
        vectorizer = TfidfVectorizer(tokenizer=skip_tokenizer)
    else:
        vectorizer = CountVectorizer(binary=(vectorizer_type == 'binary'), tokenizer=skip_tokenizer)

    train_ngram_feature = vectorizer.fit_transform(train_data['sentence'])
    train_ngram_data = {'sentence': train_ngram_feature, 'label': train_data['label']}

    dev_ngram_feature = vectorizer.transform(dev_data['sentence'])
    dev_ngram_data = {'sentence': dev_ngram_feature, 'label': dev_data['label']}

    print('Logging info - {}_{}vectorizer_{}_{}_{} : train_skip_ngram_feature shape: {}, '
          'dev_skip_ngram_feature shape: {}'.format(variation, vectorizer_type, level, ngram, skip_k,
                                                    train_ngram_feature.shape, dev_ngram_feature.shape))

    # All three output files share the same file-name fields.
    name_fields = dict(variation=variation, type=vectorizer_type, level=level,
                       ngram_range='%d_%d' % (ngram, skip_k))

    # pickle can't pickle lambda functions, so dill is used for the
    # vectorizer: https://github.com/uqfoundation/dill
    with open(format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, **name_fields), 'wb') as writer:
        dill.dump(vectorizer, writer)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_NGRAM_DATA_TEMPLATE, **name_fields),
                train_ngram_data)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_NGRAM_DATA_TEMPLATE, **name_fields),
                dev_ngram_data)
    return vectorizer, train_ngram_data, dev_ngram_data
def train_recognition(model_name, label_schema='BIOES', batch_size=32, n_epoch=50, learning_rate=0.001,
                      optimizer_type='adam', use_char_input=True, embed_type=None, embed_trainable=True,
                      use_bert_input=False, bert_type='bert', bert_trainable=True, bert_layer_num=1,
                      use_bichar_input=False, bichar_embed_type=None, bichar_embed_trainable=True,
                      use_word_input=False, word_embed_type=None, word_embed_trainable=True,
                      use_charpos_input=False, charpos_embed_type=None, charpos_embed_trainable=True,
                      use_softword_input=False, use_dictfeat_input=False, use_maxmatch_input=False,
                      callbacks_to_add=None, overwrite=False, swa_start=3, early_stopping_patience=3,
                      **kwargs):
    """Train (if needed) and evaluate a recognition model, then append a performance log.

    The configuration and the experiment name are assembled from the
    arguments; the experiment name determines the checkpoint file. Training
    runs only when that file is missing or ``overwrite`` is set. The best
    checkpoint is then evaluated on the dev split, and additionally the SWA
    weights when an SWA callback is configured. Returns ``None``.

    :param model_name: stored on the config; first field of the experiment name.
    :param label_schema: stored on the config; selects the one-hot label mapping.
    :param batch_size, n_epoch, learning_rate, optimizer_type: stored on the
        config / passed to ``get_optimizer`` and written to the log.
    :param use_char_input, embed_type, embed_trainable: char input settings;
        a truthy ``embed_type`` loads the matching embedding matrix.
    :param use_bert_input, bert_type, bert_trainable, bert_layer_num: BERT
        input settings; at least one of char and BERT input must be enabled.
    :param use_bichar_input, bichar_embed_type, bichar_embed_trainable:
        bichar input settings.
    :param use_word_input, word_embed_type, word_embed_trainable: word input settings.
    :param use_charpos_input, charpos_embed_type, charpos_embed_trainable:
        charpos input settings.
    :param use_softword_input, use_dictfeat_input, use_maxmatch_input: flags
        stored on the config, each adding a suffix to the experiment name.
    :param callbacks_to_add: callback names; a falsy value falls back to
        ``['modelcheckpoint', 'earlystopping']``.
    :param overwrite: retrain even if the checkpoint file exists.
    :param swa_start, early_stopping_patience: stored on the config when
        ``'swa'`` is among the callbacks.
    :param kwargs: forwarded to ``RecognitionModel``; also appended to the
        experiment name and logged.
    """
    config = ModelConfig()
    config.model_name = model_name
    config.label_schema = label_schema
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    config.use_char_input = use_char_input
    if embed_type:
        config.embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                    type=embed_type))
        config.embed_trainable = embed_trainable
        config.embed_dim = config.embeddings.shape[1]
    else:
        # No pretrained matrix: the embedding is always trainable.
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']
    # NOTE(review): both assignments are assumed to belong to the swa branch — confirm.
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    # The experiment name is built field by field; the order of the pieces
    # below determines the checkpoint file name.
    if config.use_char_input:
        config.exp_name = '{}_{}_{}_{}_{}_{}_{}'.format(model_name,
                                                        config.embed_type if config.embed_type else 'random',
                                                        'tune' if config.embed_trainable else 'fix',
                                                        batch_size, optimizer_type, learning_rate,
                                                        label_schema)
    else:
        config.exp_name = '{}_{}_{}_{}_{}'.format(model_name, batch_size, optimizer_type, learning_rate,
                                                  label_schema)
    # Only a non-default epoch count is recorded in the name.
    if config.n_epoch != 50:
        config.exp_name += '_{}'.format(config.n_epoch)
    if kwargs:
        config.exp_name += '_' + '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()])
    # The two default callbacks do not contribute to the name.
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
    config.exp_name += callback_str

    config.use_bert_input = use_bert_input
    config.bert_type = bert_type
    config.bert_trainable = bert_trainable
    config.bert_layer_num = bert_layer_num
    assert config.use_char_input or config.use_bert_input
    if config.use_bert_input:
        config.exp_name += '_{}_layer_{}_{}'.format(bert_type, bert_layer_num,
                                                    'tune' if config.bert_trainable else 'fix')

    config.use_bichar_input = use_bichar_input
    if config.use_bichar_input:
        config.bichar_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                          level='bichar'))
        config.bichar_vocab_size = len(config.bichar_vocab) + 2
        if bichar_embed_type:
            config.bichar_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                               type=bichar_embed_type))
            config.bichar_embed_trainable = bichar_embed_trainable
            config.bichar_embed_dim = config.bichar_embeddings.shape[1]
        else:
            config.bichar_embeddings = None
            config.bichar_embed_trainable = True
        config.exp_name += '_bichar_{}_{}'.format(bichar_embed_type if bichar_embed_type else 'random',
                                                  'tune' if config.bichar_embed_trainable else 'fix')

    config.use_word_input = use_word_input
    if config.use_word_input:
        config.word_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                        level='word'))
        config.word_vocab_size = len(config.word_vocab) + 2
        if word_embed_type:
            config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                             type=word_embed_type))
            config.word_embed_trainable = word_embed_trainable
            config.word_embed_dim = config.word_embeddings.shape[1]
        else:
            config.word_embeddings = None
            config.word_embed_trainable = True
        config.exp_name += '_word_{}_{}'.format(word_embed_type if word_embed_type else 'random',
                                                'tune' if config.word_embed_trainable else 'fix')

    config.use_charpos_input = use_charpos_input
    if config.use_charpos_input:
        config.charpos_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                                           level='charpos'))
        config.charpos_vocab_size = len(config.charpos_vocab) + 2
        if charpos_embed_type:
            config.charpos_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                                type=charpos_embed_type))
            config.charpos_embed_trainable = charpos_embed_trainable
            config.charpos_embed_dim = config.charpos_embeddings.shape[1]
        else:
            config.charpos_embeddings = None
            config.charpos_embed_trainable = True
        config.exp_name += '_charpos_{}_{}'.format(charpos_embed_type if charpos_embed_type else 'random',
                                                   'tune' if config.charpos_embed_trainable else 'fix')

    config.use_softword_input = use_softword_input
    if config.use_softword_input:
        config.exp_name += '_softword'
    config.use_dictfeat_input = use_dictfeat_input
    if config.use_dictfeat_input:
        config.exp_name += '_dictfeat'
    config.use_maxmatch_input = use_maxmatch_input
    if config.use_maxmatch_input:
        config.exp_name += '_maxmatch'

    # logger to log output of training process
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type,
                 'epoch': n_epoch, 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    model = RecognitionModel(config, **kwargs)

    train_data_type, dev_data_type = 'train', 'dev'
    # NOTE(review): bichar_vocab / word_vocab / charpos_vocab are read from
    # the config even when the matching input is disabled; presumably
    # ModelConfig provides defaults for them — confirm.
    train_generator = RecognitionDataGenerator(train_data_type, config.batch_size, config.label_schema,
                                               config.label_to_one_hot[config.label_schema],
                                               config.vocab if config.use_char_input else None,
                                               config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
                                               config.bert_seq_len, config.bichar_vocab, config.word_vocab,
                                               config.use_word_input, config.charpos_vocab,
                                               config.use_softword_input, config.use_dictfeat_input,
                                               config.use_maxmatch_input)
    valid_generator = RecognitionDataGenerator(dev_data_type, config.batch_size, config.label_schema,
                                               config.label_to_one_hot[config.label_schema],
                                               config.vocab if config.use_char_input else None,
                                               config.bert_vocab_file(config.bert_type) if config.use_bert_input else None,
                                               config.bert_seq_len, config.bichar_vocab, config.word_vocab,
                                               config.use_word_input, config.charpos_vocab,
                                               config.use_softword_input, config.use_dictfeat_input,
                                               config.use_maxmatch_input)

    # Train only when there is no checkpoint yet, or retraining was requested.
    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_generator, valid_generator)
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    r, p, f1 = model.evaluate(valid_generator)
    train_log['dev_performance'] = (r, p, f1)

    # Evaluate the SWA weights as well when an SWA callback is configured;
    # 'swa' takes precedence over 'swa_clr'.
    swa_type = None
    if 'swa' in config.callbacks_to_add:
        swa_type = 'swa'
    elif 'swa_clr' in config.callbacks_to_add:
        swa_type = 'swa_clr'
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(valid_generator)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_er'), log=train_log, mode='a')

    # Drop the model, then run the garbage collector and clear the backend session.
    del model
    gc.collect()
    K.clear_session()
def __init__(self, data_type, batch_size, label_schema, label_to_onehot, char_vocab=None, bert_vocab=None,
             bert_seq_len=None, bichar_vocab=None, word_vocab=None, use_word_input=False, charpos_vocab=None,
             use_softword_input=False, use_dictfeat_input=False, use_maxmatch_input=False, shuffle=True):
    """Set up a batch generator over the `data_type` split.

    The split is loaded with `load_data`. Which inputs are enabled is mostly
    derived from which vocabularies are supplied: a vocabulary that is not
    None switches on the matching `use_*_input` flag (char, bert, bichar,
    charpos). Word input is controlled by `use_word_input` and requires
    `word_vocab`. At least one of `char_vocab` / `bert_vocab` must be given.

    :param data_type: name of the split passed to `load_data`
    :param batch_size: number of samples per batch
    :param label_schema: 'BIO' or 'BIOES'
    :param label_to_onehot: label mapping, stored as-is
    :param bert_vocab: path of a vocabulary file (one token per line)
    :param bert_seq_len: only kept when bert input is enabled
    :param shuffle: stored as `self.shuffle`
    """
    # dataset and batching bookkeeping
    self.data_type = data_type
    self.data = load_data(data_type)
    self.data_size = len(self.data)
    self.batch_size = batch_size
    self.indices = np.arange(self.data_size)
    self.steps = int(np.ceil(self.data_size / self.batch_size))

    assert label_schema in ['BIO', 'BIOES']
    self.label_schema = label_schema
    self.label_to_onehot = label_to_onehot

    # main input: characters
    self.char_vocab = char_vocab
    self.use_char_input = self.char_vocab is not None

    # additional feature input: bert
    self.bert_vocab = bert_vocab
    self.use_bert_input = self.bert_vocab is not None
    self.bert_seq_len = bert_seq_len if self.use_bert_input else None
    assert self.use_char_input or self.use_bert_input

    if self.use_bert_input:
        # each token is numbered with the dict size at the time it is read
        self.token_dict = {}
        with codecs.open(self.bert_vocab, 'r', 'utf8') as vocab_reader:
            for vocab_line in vocab_reader:
                self.token_dict[vocab_line.strip()] = len(self.token_dict)
        self.bert_tokenizer = Tokenizer(self.token_dict)

    self.bichar_vocab = bichar_vocab
    self.use_bichar_input = self.bichar_vocab is not None

    self.word_vocab = word_vocab
    self.use_word_input = use_word_input
    assert not (self.use_word_input and self.word_vocab is None)

    self.charpos_vocab = charpos_vocab
    self.use_charpos_input = self.charpos_vocab is not None

    self.use_softword_input = use_softword_input
    self.use_dictfeat_input = use_dictfeat_input
    self.use_maxmatch_input = use_maxmatch_input

    self.mention_to_entity = None
    needs_segmentation = self.use_word_input or self.use_charpos_input or self.use_softword_input
    if needs_segmentation:
        # word-level features: register every KB mention in jieba's dictionary
        self.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
        for kb_mention in self.mention_to_entity.keys():
            jieba.add_word(kb_mention, freq=1000000)

    needs_dictionary = self.use_dictfeat_input or self.use_maxmatch_input
    if needs_dictionary and self.mention_to_entity is None:
        # dictionary features only need the mapping itself, not the jieba registration
        self.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))

    self.shuffle = shuffle
# make sure every output directory exists before anything is written
for required_dir in (PROCESSED_DATA_DIR, LOG_DIR, MODEL_SAVED_DIR, SUBMIT_DIR, IMG_DIR):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

# load knowledge base data and persist each of the four tables it yields
kb_tables = load_kb_data(KB_FILENAME)
mention_to_entity, entity_to_mention, entity_desc, entity_type = kb_tables
kb_dumps = (
    (MENTION_TO_ENTITY_FILENAME, mention_to_entity),
    (ENTITY_DESC_FILENAME, entity_desc),
    (ENTITY_TYPE_FILENAME, entity_type),
    (ENTITY_TO_MENTION_FILENAME, entity_to_mention),
)
for dump_name, dump_obj in kb_dumps:
    pickle_dump(format_filename(PROCESSED_DATA_DIR, dump_name), dump_obj)

# load training data
train_data = load_train_data(CCKS_TRAIN_FILENAME)

# prepare character embedding
char_vocab, idx2char, char_corpus = load_char_vocab_and_corpus(entity_desc, train_data)
def train_match_model(variation, input_level, word_embed_type, word_embed_trainable, batch_size, learning_rate,
                      optimizer_type, encoder_type='concat_attention', metrics='euclidean', checkpoint_dir=None,
                      overwrite=False):
    """Train (if needed) and evaluate a DialectMatchModel, then append the result to the performance log.

    Training is skipped when a checkpoint named after the experiment already
    exists and `overwrite` is False; in both cases the best checkpoint is
    loaded and evaluated on the dev split.

    :param variation: dataset variation; a name containing '_aug' switches to the augmented max lengths
    :param input_level: input granularity handed to `load_processed_data`
    :param word_embed_type: embedding matrix to load
    :param word_embed_trainable: whether embeddings are fine-tuned ('tune' / 'fix' in the experiment name)
    :param batch_size: training batch size
    :param learning_rate: learning rate passed to `get_optimizer`
    :param optimizer_type: optimizer name passed to `get_optimizer`
    :param encoder_type: encoder variant forwarded to the model
    :param metrics: distance metric forwarded to the model
    :param checkpoint_dir: optional override of `config.checkpoint_dir` (created if missing)
    :param overwrite: retrain even if a checkpoint exists
    :return: (valid_acc, valid_f1) as returned by `model.evaluate`
    """
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {'word': config.aug_word_max_len, 'char': config.aug_char_max_len}
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                     variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_dialect_match_{}_{}_{}_{}_{}'.format(variation, encoder_type, metrics, input_level,
                                                               word_embed_type,
                                                               'tune' if word_embed_trainable else 'fix')
    config.checkpoint_monitor = 'val_loss'
    config.early_stopping_monitor = 'val_loss'

    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type,
                 'learning_rate': learning_rate}

    # Build the model from the caller's arguments so that it matches the
    # encoder/metric recorded in exp_name (previously both were hard-coded).
    model = DialectMatchModel(config, encoder_type=encoder_type, metrics=metrics)

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        train_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
        # '%' (not ',') so the duration is interpolated instead of printing a literal '%s'
        print('Logging Info - Training time: %s' % train_time)
        train_log['train_time'] = train_time

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1 = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation + '_match'),
              log=train_log, mode='a')
    return valid_acc, valid_f1
def link(model_name, dev_pred_mentions, test_pred_mentions, predict_log, batch_size=32, n_epoch=50,
         learning_rate=0.001, optimizer_type='adam', embed_type=None, embed_trainable=True, use_relative_pos=False,
         n_neg=1, omit_one_cand=True, callbacks_to_add=None, swa_type=None, predict_on_final_test=True, **kwargs):
    """Run a trained LinkModel over predicted mentions and write a submission file.

    The configuration and experiment name are rebuilt from the arguments so
    the checkpoint of the matching training run can be located. `predict_log`
    is updated in place with the linking settings, the optional dev scores and
    a timestamp, appended to the performance log, and returned.

    :param model_name: model name, first component of the experiment name
    :param dev_pred_mentions: predicted mentions for the dev split, or None to skip dev evaluation
    :param test_pred_mentions: predicted mentions for the test split
    :param predict_log: dict that must already contain 'er_exp_name'
    :param callbacks_to_add: callbacks used at training time (defaults to modelcheckpoint + earlystopping)
    :param swa_type: None to load the best checkpoint, otherwise the SWA variant to load
    :param predict_on_final_test: use the 'test_final' split instead of 'test'
    :param kwargs: extra model parameters; they also become part of the experiment name
    :return: the updated `predict_log`
    :raises FileNotFoundError: when the checkpoint for this experiment does not exist
    :raises ValueError: when `swa_type` is given but no SWA callback was configured
    """
    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.embed_type = embed_type
    if embed_type:
        config.embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                    type=embed_type))
        config.embed_trainable = embed_trainable
    else:
        config.embeddings = None
        config.embed_trainable = True

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    config.exp_name = '{}_{}_{}_{}_{}_{}'.format(model_name, embed_type if embed_type else 'random',
                                                 'tune' if embed_trainable else 'fix', batch_size,
                                                 optimizer_type, learning_rate)
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        config.exp_name += '_rel'
    config.n_neg = n_neg
    if config.n_neg > 1:
        config.exp_name += '_neg_{}'.format(config.n_neg)
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        config.exp_name += '_not_omit'
    if kwargs:
        config.exp_name += '_' + '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()])
    callback_str = '_' + '_'.join(config.callbacks_to_add)
    callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
    config.exp_name += callback_str

    # logger to log output of training process
    predict_log.update({
        'el_exp_name': config.exp_name,
        'el_batch_size': batch_size,
        'el_optimizer': optimizer_type,
        'el_epoch': n_epoch,
        'el_learning_rate': learning_rate,
        'el_other_params': kwargs
    })
    print('Logging Info - Experiment: %s' % config.exp_name)

    model = LinkModel(config, **kwargs)
    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path):
        raise FileNotFoundError('Link model not exist: {}'.format(model_save_path))

    if swa_type is None:
        model.load_best_model()
    elif 'swa' in config.callbacks_to_add or swa_type in config.callbacks_to_add:
        # Test the defaulted list on config: the raw argument may be None.
        # Also accept the requested variant itself (e.g. 'swa_clr').
        model.load_swa_model(swa_type)
        predict_log['er_exp_name'] += '_{}'.format(swa_type)
    else:
        # Without this guard the model would be used with no weights loaded.
        raise ValueError('swa_type {} requested but no swa callback in callbacks_to_add: {}'.format(
            swa_type, config.callbacks_to_add))

    dev_data_type = 'dev'
    dev_data = load_data(dev_data_type)
    dev_text_data, dev_gold_mention_entities = [], []
    for data in dev_data:
        dev_text_data.append(data['text'])
        dev_gold_mention_entities.append(data['mention_data'])

    test_data_type = 'test_final' if predict_on_final_test else 'test'
    test_data = load_data(test_data_type)
    test_text_data = [data['text'] for data in test_data]

    if dev_pred_mentions is not None:
        print('Logging Info - Evaluate over valid data based on predicted mention:')
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions, dev_gold_mention_entities)
        dev_performance = 'dev_performance' if swa_type is None else '%s_dev_performance' % swa_type
        predict_log[dev_performance] = (r, p, f1)

    print('Logging Info - Generate submission for test data:')
    test_pred_mention_entities = model.predict(test_text_data, test_pred_mentions)
    test_submit_file = predict_log['er_exp_name'] + '_' + config.exp_name + '_%s%ssubmit.json' % (
        swa_type + '_' if swa_type else '', 'final_' if predict_on_final_test else '')
    submit_result(test_submit_file, test_data, test_pred_mention_entities)

    predict_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step'), log=predict_log, mode='a')
    return predict_log
def train_link(model_name, batch_size=32, n_epoch=50, learning_rate=0.001, optimizer_type='adam', embed_type=None,
               embed_trainable=True, callbacks_to_add=None, use_relative_pos=False, n_neg=1, omit_one_cand=True,
               overwrite=False, swa_start=5, early_stopping_patience=3, **kwargs):
    """Train a LinkModel (unless a checkpoint exists) and log its dev performance.

    The experiment name is assembled from the hyper-parameters, the optional
    flags, the extra `kwargs` and the non-default callbacks. After training,
    the best checkpoint is evaluated on the dev split using the gold mentions
    as "predicted" mentions; when an SWA callback is configured the SWA
    weights are evaluated too. The log is appended to the performance log and
    the Keras session is cleared. Nothing is returned.

    :param model_name: model name, first component of the experiment name
    :param callbacks_to_add: callback names; defaults to modelcheckpoint + earlystopping
    :param overwrite: retrain even if a checkpoint already exists
    :param swa_start: only applied when 'swa' is among the callbacks
    :param early_stopping_patience: only applied when 'swa' is among the callbacks
    :param kwargs: extra model parameters, forwarded to LinkModel and logged
    """
    clock_format = "%H:%M:%S"

    config = ModelConfig()
    config.model_name = model_name
    config.batch_size = batch_size
    config.n_epoch = n_epoch
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)

    config.embed_type = embed_type
    if not embed_type:
        # randomly initialised embeddings are always trained
        config.embeddings = None
        config.embed_trainable = True
    else:
        embedding_file = format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, type=embed_type)
        config.embeddings = np.load(embedding_file)
        config.embed_trainable = embed_trainable

    config.callbacks_to_add = callbacks_to_add or ['modelcheckpoint', 'earlystopping']
    if 'swa' in config.callbacks_to_add:
        config.swa_start = swa_start
        config.early_stopping_patience = early_stopping_patience

    config.vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, level='char'))
    config.vocab_size = len(config.vocab) + 2
    config.mention_to_entity = pickle_load(format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME))
    config.entity_desc = pickle_load(format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME))

    # experiment name: fixed prefix followed by optional suffixes, in this order
    name_pieces = ['{}_{}_{}_{}_{}_{}'.format(model_name,
                                              embed_type if embed_type else 'random',
                                              'tune' if config.embed_trainable else 'fix',
                                              batch_size, optimizer_type, learning_rate)]
    config.use_relative_pos = use_relative_pos
    if config.use_relative_pos:
        name_pieces.append('_rel')
    config.n_neg = n_neg
    if config.n_neg > 1:
        name_pieces.append('_neg_{}'.format(config.n_neg))
    config.omit_one_cand = omit_one_cand
    if not config.omit_one_cand:
        name_pieces.append('_not_omit')
    if kwargs:
        name_pieces.append('_' + '_'.join(str(key) + '_' + str(value) for key, value in kwargs.items()))
    # the two default callbacks never show up in the name
    callback_suffix = '_' + '_'.join(config.callbacks_to_add)
    for default_callback in ('_modelcheckpoint', '_earlystopping'):
        callback_suffix = callback_suffix.replace(default_callback, '')
    name_pieces.append(callback_suffix)
    config.exp_name = ''.join(name_pieces)

    # logger to log output of training process
    train_log = {
        'exp_name': config.exp_name,
        'batch_size': batch_size,
        'optimizer': optimizer_type,
        'epoch': n_epoch,
        'learning_rate': learning_rate,
        'other_params': kwargs,
    }
    print('Logging Info - Experiment: %s' % config.exp_name)

    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    model = LinkModel(config, **kwargs)

    train_data_type, dev_data_type = 'train', 'dev'
    train_generator = LinkDataGenerator(train_data_type, config.vocab, config.mention_to_entity,
                                        config.entity_desc, config.batch_size, config.max_desc_len,
                                        config.max_erl_len, config.use_relative_pos, config.n_neg,
                                        config.omit_one_cand)
    dev_data = load_data(dev_data_type)

    if overwrite or not os.path.exists(model_save_path):
        started_at = time.time()
        model.train(train_generator, dev_data)
        elapsed_time = time.time() - started_at
        print('Logging Info - Training time: %s' % time.strftime(clock_format, time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime(clock_format, time.gmtime(elapsed_time))

    model.load_best_model()

    # gold mentions double as the "predicted" mentions for this evaluation
    dev_text_data = [sample['text'] for sample in dev_data]
    dev_pred_mentions = [sample['mention_data'] for sample in dev_data]
    dev_gold_mention_entities = [sample['mention_data'] for sample in dev_data]

    print('Logging Info - Evaluate over valid data:')
    train_log['dev_performance'] = tuple(model.evaluate(dev_text_data, dev_pred_mentions,
                                                        dev_gold_mention_entities))

    # 'swa' takes precedence over 'swa_clr' when both are configured
    swa_type = next((name for name in ('swa', 'swa_clr') if name in config.callbacks_to_add), None)
    if swa_type:
        model.load_swa_model(swa_type)
        print('Logging Info - Evaluate over valid data based on swa model:')
        r, p, f1 = model.evaluate(dev_text_data, dev_pred_mentions, dev_gold_mention_entities)
        train_log['swa_dev_performance'] = (r, p, f1)

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, model_type='2step_el'), log=train_log, mode='a')

    # release the model and the backend session
    del model
    gc.collect()
    K.clear_session()
def process_data():
    """Pre-process every dialect variation and persist all derived artefacts.

    For each variation in VARIATIONS that is present in the training index
    this builds sentence/label data (optionally augmented, in which case the
    variation name gets an '_aug' suffix), fits word and char tokenizers,
    creates id matrices, trains w2v / fasttext embeddings, prepares n-gram,
    skip-n-gram and POS n-gram features, saves everything under
    PROCESSED_DATA_DIR and writes an analysis log.
    """
    config = ModelConfig()

    # create dir
    for required_dir in (PROCESSED_DATA_DIR, LOG_DIR, MODEL_SAVED_DIR, IMG_DIR):
        if not path.exists(required_dir):
            os.makedirs(required_dir)

    # load datasets
    data_train, data_dev = load_data()
    print('Logging Info - Data: train - {}, dev - {}'.format(data_train.shape, data_dev.shape))

    for variation in VARIATIONS:
        if variation not in data_train.index:
            continue

        analyze_result = {}
        variation_train = data_train.loc[variation]
        variation_dev = data_dev.loc[variation]

        print('Logging Info - Variation: {}, train - {}, dev - {}'.format(variation, variation_train.shape,
                                                                          variation_dev.shape))
        # dev_set must report the dev split size (it used to repeat the train size)
        analyze_result.update({'train_set': len(variation_train), 'dev_set': len(variation_dev)})

        variation_train_data = get_sentence_label(variation_train)
        variation_dev_data = get_sentence_label(variation_dev)

        if config.data_augment:
            variation_train_data = augment_data(variation_train_data)
            # every file saved below is keyed by the suffixed name
            variation += '_aug'

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(variation_train_data['label'])
        analyze_result.update({'train_cls_{}'.format(cls): percent
                               for cls, percent in train_label_distribution.items()})
        dev_label_distribution = analyze_class_distribution(variation_dev_data['label'])
        analyze_result.update({'dev_cls_{}'.format(cls): percent
                               for cls, percent in dev_label_distribution.items()})

        # create tokenizer and vocabulary (fitted on the training sentences only)
        sentences_train = variation_train_data['sentence']
        sentences_dev = variation_dev_data['sentence']

        word_tokenizer = Tokenizer(char_level=False)
        char_tokenizer = Tokenizer(char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Variation: {}, word_vocab: {}, char_vocab: {}'.format(
            variation, len(word_tokenizer.word_index), len(char_tokenizer.word_index)))
        analyze_result.update({'word_vocab': len(word_tokenizer.word_index),
                               'char_vocab': len(char_tokenizer.word_index)})

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(sentences_train, level='word')
        analyze_result.update({'word_{}'.format(k): v for k, v in word_len_distribution.items()})
        char_len_distribution, char_max_len = analyze_len_distribution(sentences_train, level='char')
        analyze_result.update({'char_{}'.format(k): v for k, v in char_len_distribution.items()})

        one_hot = False if config.loss_function == 'binary_crossentropy' else True
        train_word_ids = create_data_matrices(word_tokenizer, variation_train_data, config.n_class, one_hot,
                                              word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer, variation_train_data, config.n_class, one_hot,
                                              char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, variation_dev_data, config.n_class, one_hot,
                                            word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, variation_dev_data, config.n_class, one_hot,
                                            char_max_len)

        # create embedding matrix by training on dataset
        w2v_data = train_w2v(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c2v_data = train_w2v(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_data = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(),
                                         word_tokenizer.word_index)
        c_fasttext_data = train_fasttext(sentences_train + sentences_dev, lambda x: list(x),
                                         char_tokenizer.word_index)
        # w_glove_data = train_glove(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        # c_glove_data = train_glove(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, variation=variation),
                    variation_train_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, variation=variation),
                    variation_dev_data)

        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, variation=variation,
                                    level='word'), train_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, variation=variation,
                                    level='char'), train_char_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, variation=variation,
                                    level='word'), dev_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, variation=variation,
                                    level='char'), dev_char_ids)

        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
                                type='w2v_data'), w2v_data)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
                                type='c2v_data'), c2v_data)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
                                type='w_fasttext_data'), w_fasttext_data)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
                                type='c_fasttext_data'), c_fasttext_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        #                         type='w_glove_data'), w_glove_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        #                         type='c_glove_data'), c_glove_data)

        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, variation=variation, level='word'),
                    word_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, variation=variation, level='char'),
                    char_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, variation=variation, level='word'),
                    word_tokenizer.word_index)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, variation=variation, level='char'),
                    char_tokenizer.word_index)

        # prepare ngram feature
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['char', 'word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3), (2, 3), (1, 3), (2, 4), (1, 4), (4, 4), (5, 5),
                                    (6, 6), (7, 7), (8, 8)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range, variation_train_data,
                                          variation_dev_data, variation)

        # prepare skip ngram features
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word', 'char']:
                for ngram in [2, 3]:
                    for skip_k in [1, 2, 3]:
                        prepare_skip_ngram_feature(vectorizer_type, level, ngram, skip_k, variation_train_data,
                                                   variation_dev_data, variation)

        # prepare pos ngram: same labels, sentences replaced by the output of get_pos
        variation_train_pos_data = {
            'sentence': [get_pos(sentence) for sentence in variation_train_data['sentence']],
            'label': variation_train_data['label']
        }
        variation_dev_pos_data = {
            'sentence': [get_pos(sentence) for sentence in variation_dev_data['sentence']],
            'label': variation_dev_data['label']
        }
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range, variation_train_pos_data,
                                          variation_dev_pos_data, variation + '_pos')

        # save analyze result
        write_log(format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, variation=variation), analyze_result)
def train_dl_model(variation, input_level, word_embed_type, word_embed_trainable, batch_size, learning_rate,
                   optimizer_type, model_name, binary_threshold=0.5, checkpoint_dir=None, overwrite=False,
                   log_error=False, save_log=True, **kwargs):
    """Train (if needed) one deep-learning classifier, evaluate it on dev and predict on test.

    Training is skipped when a checkpoint named after the experiment exists
    and `overwrite` is False. The best checkpoint is then loaded, evaluated on
    the dev split and, when test input is available, used to write test
    predictions to PREDICT_DIR.

    :param variation: dataset variation; a name containing '_aug' switches to the augmented max lengths
    :param input_level: input granularity handed to `load_processed_data`
    :param word_embed_type: embedding matrix to load
    :param word_embed_trainable: whether embeddings are fine-tuned ('tune' / 'fix' in the experiment name)
    :param model_name: one of 'bilstm', 'cnnrnn', 'dcnn', 'dpcnn', 'han', 'multicnn', 'rcnn', 'rnncnn',
        'cnn', 'vdcnn'
    :param binary_threshold: stored on the config and in the log
    :param checkpoint_dir: optional override of `config.checkpoint_dir` (created if missing)
    :param overwrite: retrain even if a checkpoint exists
    :param log_error: also record every mis-classified dev sample in the log
    :param save_log: append the log to the performance log file
    :param kwargs: forwarded to the model constructor
    :return: (valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r)
    :raises ValueError: for an unknown `model_name`
    """
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {'word': config.aug_word_max_len, 'char': config.aug_char_max_len}
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                                                     variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.binary_threshold = binary_threshold
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
        if not os.path.exists(config.checkpoint_dir):
            os.makedirs(config.checkpoint_dir)
    config.exp_name = '{}_{}_{}_{}_{}'.format(variation, model_name, input_level, word_embed_type,
                                              'tune' if word_embed_trainable else 'fix')

    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type,
                 'learning_rate': learning_rate, 'binary_threshold': binary_threshold}

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    train_input = load_processed_data(variation, input_level, 'train')
    dev_input = load_processed_data(variation, input_level, 'dev')
    test_input = load_processed_data(variation, input_level, 'test')

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path) or overwrite:
        start_time = time.time()
        model.train(train_input, dev_input)
        elapsed_time = time.time() - start_time
        train_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
        # '%' (not ',') so the duration is interpolated instead of printing a literal '%s'
        print('Logging Info - Training time: %s' % train_time)
        train_log['train_time'] = train_time

    # load the best model
    model.load_best_model()

    print('Logging Info - Evaluate over valid data:')
    valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r = model.evaluate(dev_input)
    train_log['valid_acc'] = valid_acc
    train_log['valid_f1'] = valid_f1
    train_log['valid_macro_f1'] = valid_macro_f1
    train_log['valid_p'] = valid_p
    train_log['valid_r'] = valid_r
    train_log['time_stamp'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    if log_error:
        # one log entry per mis-classified dev sample: index, sentence, gold label, predicted probability
        error_indexes, error_pred_probas = model.error_analyze(dev_input)
        dev_text_input = load_processed_text_data(variation, 'dev')
        for error_index, error_pred_prob in zip(error_indexes, error_pred_probas):
            train_log['error_%d' % error_index] = '{},{},{},{}'.format(error_index,
                                                                       dev_text_input['sentence'][error_index],
                                                                       dev_text_input['label'][error_index],
                                                                       error_pred_prob)
    if save_log:
        write_log(format_filename(LOG_DIR, PERFORMANCE_LOG_TEMPLATE, variation=variation), log=train_log,
                  mode='a')

    if test_input is not None:
        test_predictions = model.predict(test_input)
        writer_predict(format_filename(PREDICT_DIR, config.exp_name + '.labels'), test_predictions)

    return valid_acc, valid_f1, valid_macro_f1, valid_p, valid_r
def train_model(genre, input_level, word_embed_type, word_embed_trainable, batch_size, learning_rate,
                optimizer_type, model_name, n_epoch=50, add_features=False, scale_features=False, overwrite=False,
                lr_range_test=False, callbacks_to_add=None, eval_on_train=False, **kwargs):
    """Train (if needed) one Keras model and log its accuracy on dev and test.

    Depending on `kwargs['input_config']` the model is fed either arrays from
    `load_input_data` or batches from an `ELMoGenerator` backed by an
    `ELMoCache`. When SWA ('swa' / 'swa_clr') or snapshot-ensemble
    ('sse' / 'fge') callbacks are configured, those weights are evaluated too.

    :param genre: dataset genre; selects max length, embeddings and vocabulary
    :param input_level: 'word' selects the word max length, anything else the char max length
    :param model_name: one of the 'Keras*' names handled below
    :param add_features: also feed hand-crafted features to the model
    :param scale_features: use the scaled feature files
    :param overwrite: retrain even if a checkpoint exists
    :param lr_range_test: only run the learning-rate range test, then return None
    :param callbacks_to_add: callback names; they become part of the experiment name
    :param eval_on_train: also evaluate on the training split
    :param kwargs: extra model parameters; 'elmo_model_type' and 'elmo_output_mode' are
        consumed here when a cached-ELMo input is used
    :return: the training log dict, or None when `lr_range_test` is set
    :raises ValueError: for an unknown `model_name`
    """
    config = ModelConfig()
    config.genre = genre
    config.input_level = input_level
    config.max_len = config.word_max_len[genre] if input_level == 'word' else config.char_max_len[genre]
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.callbacks_to_add = callbacks_to_add or []
    config.add_features = add_features
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    config.n_epoch = n_epoch
    config.word_embeddings = np.load(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre,
                                                     word_embed_type))
    vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, input_level))
    config.idx2token = dict((idx, token) for token, idx in vocab.items())

    # experiment name configuration
    config.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}'.format(genre, model_name, input_level, word_embed_type,
                                                       'tune' if word_embed_trainable else 'fix', batch_size,
                                                       '_'.join([str(k) + '_' + str(v) for k, v in kwargs.items()]),
                                                       optimizer_type)
    if config.add_features:
        config.exp_name = config.exp_name + '_feature_scaled' if scale_features else config.exp_name + '_featured'
    if len(config.callbacks_to_add) > 0:
        callback_str = '_' + '_'.join(config.callbacks_to_add)
        callback_str = callback_str.replace('_modelcheckpoint', '').replace('_earlystopping', '')
        config.exp_name += callback_str

    input_config = kwargs['input_config'] if 'input_config' in kwargs else 'token'  # input default is word embedding
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        # get elmo embedding based on cache, we first get a ELMoCache instance;
        # both options are popped because the model does not need them in kwargs
        elmo_model_type = kwargs.pop('elmo_model_type', 'allennlp')
        elmo_output_mode = kwargs.pop('elmo_output_mode', 'elmo')
        elmo_cache = ELMoCache(options_file=config.elmo_options_file, weight_file=config.elmo_weight_file,
                               cache_dir=config.cache_dir, idx2token=config.idx2token,
                               max_sentence_length=config.max_len, elmo_model_type=elmo_model_type,
                               elmo_output_mode=elmo_output_mode)
    elif input_config in ['elmo_id', 'elmo_s', 'token_combine_elmo_id', 'token_combine_elmo_s']:
        # get elmo embedding using tensorflow_hub, we must provide a tfhub_url
        kwargs['elmo_model_url'] = config.elmo_model_url

    # logger to log output of training process
    train_log = {'exp_name': config.exp_name, 'batch_size': batch_size, 'optimizer': optimizer_type,
                 'epoch': n_epoch, 'learning_rate': learning_rate, 'other_params': kwargs}

    print('Logging Info - Experiment: %s' % config.exp_name)
    if model_name == 'KerasInfersent':
        model = KerasInfersentModel(config, **kwargs)
    elif model_name == 'KerasEsim':
        model = KerasEsimModel(config, **kwargs)
    elif model_name == 'KerasDecomposable':
        model = KerasDecomposableAttentionModel(config, **kwargs)
    elif model_name == 'KerasSiameseBiLSTM':
        model = KerasSimaeseBiLSTMModel(config, **kwargs)
    elif model_name == 'KerasSiameseCNN':
        model = KerasSiameseCNNModel(config, **kwargs)
    elif model_name == 'KerasIACNN':
        model = KerasIACNNModel(config, **kwargs)
    elif model_name == 'KerasSiameseLSTMCNNModel':
        model = KerasSiameseLSTMCNNModel(config, **kwargs)
    elif model_name == 'KerasRefinedSSAModel':
        model = KerasRefinedSSAModel(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))
    # model.summary()

    train_input, dev_input, test_input = None, None, None
    if lr_range_test:
        # conduct lr range test to find optimal learning rate (not train model)
        train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features,
                                      scale_features)
        dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features, scale_features)
        model.lr_range_test(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                            y_valid=dev_input['y'])
        return

    model_save_path = os.path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not os.path.exists(model_save_path) or overwrite:
        start_time = time.time()

        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features)
            dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                      return_data=(input_config == 'token_combine_cache_elmo'),
                                      return_features=config.add_features)
            model.train_with_generator(train_input, dev_input)
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features,
                                          scale_features)
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features,
                                        scale_features)
            model.train(x_train=train_input['x'], y_train=train_input['y'], x_valid=dev_input['x'],
                        y_valid=dev_input['y'])
        elapsed_time = time.time() - start_time
        print('Logging Info - Training time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        train_log['train_time'] = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

    def eval_on_data(eval_with_generator, input_data, data_type):
        """Evaluate best, SWA and ensemble weights on `input_data`; results go into `train_log`."""
        model.load_best_model()
        if eval_with_generator:
            acc = model.evaluate_with_generator(generator=input_data, y=input_data.input_label)
        else:
            acc = model.evaluate(x=input_data['x'], y=input_data['y'])
        train_log['%s_acc' % data_type] = acc

        swa_type = None
        if 'swa' in config.callbacks_to_add:
            swa_type = 'swa'
        elif 'swa_clr' in config.callbacks_to_add:
            swa_type = 'swa_clr'
        if swa_type:
            # NOTE(review): this path indexes input_data['x'] even when input_data is a
            # generator - confirm ELMoGenerator supports that.
            print('Logging Info - %s Model' % swa_type)
            model.load_swa_model(swa_type=swa_type)
            swa_acc = model.evaluate(x=input_data['x'], y=input_data['y'])
            train_log['%s_%s_acc' % (swa_type, data_type)] = swa_acc

        ensemble_type = None
        if 'sse' in config.callbacks_to_add:
            ensemble_type = 'sse'
        elif 'fge' in config.callbacks_to_add:
            ensemble_type = 'fge'
        if ensemble_type:
            print('Logging Info - %s Ensemble Model' % ensemble_type)
            ensemble_predict = {}
            # Checkpoints are named '<exp_name>_<ensemble_type>_<id>.hdf5'. The id is captured
            # with (\d+) so multi-digit ids work (the old '[\d+]' matched one character only),
            # and exp_name is escaped because it may contain regex metacharacters.
            ensemble_pattern = re.compile(
                r'%s_(\d+)\.hdf5$' % re.escape('%s_%s' % (config.exp_name, ensemble_type)))
            for model_file in os.listdir(config.checkpoint_dir):
                match = ensemble_pattern.match(model_file)
                if match is None:
                    # unrelated file, or same prefix without a numeric id: not an ensemble member
                    continue
                model_id = int(match.group(1))
                model_path = os.path.join(config.checkpoint_dir, model_file)
                print('Logging Info: Loading {} ensemble model checkpoint: {}'.format(ensemble_type, model_file))
                model.load_model(model_path)
                ensemble_predict[model_id] = model.predict(x=input_data['x'])

            # we expect the models saved towards the end of run may have better performance than
            # models saved earlier in the run, so the models are sorted with the newest
            # (largest id) first.
            sorted_ensemble_predict = sorted(ensemble_predict.items(), key=lambda x: x[0], reverse=True)
            model_predicts = []
            for model_id, model_predict in sorted_ensemble_predict:
                single_acc = eval_acc(model_predict, input_data['y'])
                print('Logging Info - %s_single_%d_%s Acc : %f' % (ensemble_type, model_id, data_type,
                                                                  single_acc))
                train_log['%s_single_%d_%s_acc' % (ensemble_type, model_id, data_type)] = single_acc

                # running ensemble: average of all models from the newest down to this one
                model_predicts.append(model_predict)
                ensemble_acc = eval_acc(np.mean(np.array(model_predicts), axis=0), input_data['y'])
                print('Logging Info - %s_ensemble_%d_%s Acc : %f' % (ensemble_type, model_id, data_type,
                                                                    ensemble_acc))
                train_log['%s_ensemble_%d_%s_acc' % (ensemble_type, model_id, data_type)] = ensemble_acc

    if eval_on_train:
        # might take a long time
        print('Logging Info - Evaluate over train data:')
        if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
            train_input = ELMoGenerator(genre, input_level, 'train', config.batch_size, elmo_cache,
                                        return_data=(input_config == 'token_combine_cache_elmo'),
                                        return_features=config.add_features, return_label=False)
            eval_on_data(eval_with_generator=True, input_data=train_input, data_type='train')
        else:
            train_input = load_input_data(genre, input_level, 'train', input_config, config.add_features,
                                          scale_features)
            eval_on_data(eval_with_generator=False, input_data=train_input, data_type='train')

    print('Logging Info - Evaluate over valid data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        dev_input = ELMoGenerator(genre, input_level, 'dev', config.batch_size, elmo_cache,
                                  return_data=(input_config == 'token_combine_cache_elmo'),
                                  return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=dev_input, data_type='dev')
    else:
        if dev_input is None:
            # training was skipped, so the dev arrays have not been loaded yet
            dev_input = load_input_data(genre, input_level, 'dev', input_config, config.add_features,
                                        scale_features)
        eval_on_data(eval_with_generator=False, input_data=dev_input, data_type='dev')

    print('Logging Info - Evaluate over test data:')
    if input_config in ['cache_elmo', 'token_combine_cache_elmo']:
        test_input = ELMoGenerator(genre, input_level, 'test', config.batch_size, elmo_cache,
                                   return_data=(input_config == 'token_combine_cache_elmo'),
                                   return_features=config.add_features, return_label=False)
        eval_on_data(eval_with_generator=True, input_data=test_input, data_type='test')
    else:
        if test_input is None:
            test_input = load_input_data(genre, input_level, 'test', input_config, config.add_features,
                                         scale_features)
        eval_on_data(eval_with_generator=False, input_data=test_input, data_type='test')

    train_log['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    write_log(format_filename(LOG_DIR, PERFORMANCE_LOG, genre), log=train_log, mode='a')
    return train_log
if __name__ == '__main__': if not os.path.exists(PREDICT_DIR): os.makedirs(PREDICT_DIR) config = ModelConfig() raw_data = dict() raw_data['simplified'] = read_raw_test_data(SIMP_TEST_FILENAME) raw_data['traditional'] = read_raw_test_data(TRAD_TEST_FILENAME) for variation in raw_data.keys(): test_data = raw_data[variation] # prepare word embedding input word_tokenizer = pickle_load( format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, variation=variation, level='word')) word_ids_test = create_token_ids_matrix(word_tokenizer, raw_data[variation], config.word_max_len) # prepare n-gram input vectorizer = pickle_load( format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, variation=variation, type='binary', level='char', ngram_range=(2, 3))) n_gram_test = vectorizer.transform(raw_data[variation])