def main(logger):
    df_train, df_test = load_data(INPUT_DIR, logger)

    logger.info('Preprocess text')
    # NOTE: only the first 200k rows are used and the embedding matrix is
    # random; this script only exercises the SCDV pipeline end to end.
    df_train = preprocess_text(df_train.iloc[:200000])
    df_test = preprocess_text(df_test)

    seq_train, tokenizer = tokenize_text(df_train, logger)
    seq_test, _ = tokenize_text(df_test, logger, tokenizer=tokenizer)
    text_train = df_train['question_text'].values.tolist()

    embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300)
    scdv = SCDV(embedding_matrix, tokenizer, logger, num_clusters=50,
                gmm_path=DATA_DIR.joinpath('gmm_tmp.pkl'))

    with logger.timer('SCDV computation on train data'):
        scdv_train = scdv.fit_transform(text_train, seq_train)
    logger.post(f'Computing SCDV for train data has been done: shape = {scdv_train.shape}')

    with logger.timer('SCDV computation on test data'):
        scdv_test = scdv.transform(seq_test)
    logger.post(f'Computing SCDV for test data has been done: shape = {scdv_test.shape}')
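
# The SCDV class used above is defined elsewhere in the project. As a rough
# orientation, here is a minimal sketch of the SCDV idea (sparse composite
# document vectors): fit a GMM over word vectors, weight each word vector by
# its cluster posteriors (and idf), and average per document. All names and
# the simplified sparsity rule below are assumptions, not the project's code;
# the paper thresholds against a percentage of the average absolute maximum.
import numpy as np
from sklearn.mixture import GaussianMixture


def scdv_sketch(word_vectors, idf, docs, num_clusters=50, sparsity=0.04):
    """word_vectors: (V, D) matrix; idf: (V,) weights; docs: lists of word ids."""
    gmm = GaussianMixture(n_components=num_clusters, covariance_type='diag')
    probs = gmm.fit(word_vectors).predict_proba(word_vectors)       # (V, K)
    # Word-cluster vectors: idf-weighted outer product of posteriors and vectors
    wcv = probs[:, :, None] * word_vectors[:, None, :]              # (V, K, D)
    wcv = (idf[:, None, None] * wcv).reshape(len(word_vectors), -1)
    doc_vecs = np.stack([wcv[doc].mean(axis=0) for doc in docs])    # (N, K*D)
    # Hard-threshold small components to make the vectors sparse (simplified)
    t = sparsity * np.abs(doc_vecs).max()
    doc_vecs[np.abs(doc_vecs) < t] = 0.0
    return doc_vecs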
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)

    logger.info('Preprocess text')
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    embedding_matrix = load_multiple_embeddings(
        tokenizer.word_index, embed_types=[0, 1, 2], max_workers=args['max_workers'])
    embedding_matrix = np.array(embedding_matrix).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    set_seed(SEED)

    batch_size = args['batch_size'] * len(device_ids)
    max_workers = args['max_workers']
    epochs = 2 if args['debug'] else EPOCHS

    logger.info('Start training and evaluation loop')
    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(seq_train.shape[0])
    results = []

    for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(int), seq_train[index_valid].astype(int)
        y_train, y_valid = (label_train[index_train].astype(np.float32),
                            label_train[index_valid].astype(np.float32))

        dataset_train = SimpleDataset(x_train, y_train)
        dataset_valid = SimpleDataset(x_valid, y_valid)
        dataloader_train = DataLoader(dataset=dataset_train, batch_size=batch_size,
                                      shuffle=True, pin_memory=True,
                                      worker_init_fn=worker_init_fn)
        dataloader_valid = DataLoader(dataset=dataset_valid, batch_size=batch_size,
                                      shuffle=False, pin_memory=True,
                                      worker_init_fn=worker_init_fn)
        dataloaders = {'train': dataloader_train, 'valid': dataloader_valid}

        model = StackedNormalizedRNNFM(embedding_matrix, PADDING_LENGTH, hidden_size=64,
                                       out_hidden_dim=64, embed_drop=0.2, out_drop=0.3,
                                       residual=False)
        model.to(output_device)

        criteria = [[nn.BCEWithLogitsLoss(reduction='mean')], [1.0]]
        metric = f1_from_logits_optimized
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = None

        model_save_path = str(DATA_DIR.joinpath(f'models/{Path(__file__).stem}_fold_{fold}.model'))
        model_name = model._get_name()
        config = {
            'epochs': epochs,
            'loss_names': ['BCE Loss'],
            'metric_type': 'batch',
            'model_save_path': model_save_path,
            'output_device': output_device,
            'mode': 'max',
            'early_stopping': 200,
            'model_name': model_name,
            'reg_lambda': None,
            'fold': fold
        }

        model, valid_score, best_epoch = train_model(model, criteria, metric, optimizer,
                                                     scheduler, dataloaders, logger, config)
        results.append({'fold': fold, 'best_score': valid_score, 'best_epoch': best_epoch})

        message = f'Training and evaluation for the fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Validation F1 score: {valid_score}\n'
        logger.post(message)

        dataloader_valid = DataLoader(dataset=dataset_valid, batch_size=batch_size,
                                      shuffle=False, pin_memory=True)
        oof_preds[index_valid] = sp.special.expit(
            predict(model, dataloader_valid, config).reshape(-1, ))

    logger.post(f'K-Fold train and evaluation results: {results}')
    logger.info('Training and evaluation loop has been done. Start f1 threshold search.')
    search_result = threshold_search(label_train.reshape(-1, ), oof_preds)
    logger.post(f'Threshold search result - f1: {search_result["f1"]}, threshold: {search_result["threshold"]}')
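
# threshold_search is imported from the project's utilities and is not shown
# here. A minimal sketch of what it plausibly does, consistent with the call
# above (returns a dict with 'f1' and 'threshold'): sweep candidate thresholds
# over the OOF probabilities and keep the best F1. The real helper may use a
# finer grid or a different search strategy.
import numpy as np
from sklearn.metrics import f1_score


def threshold_search(y_true, y_proba):
    best = {'threshold': 0.5, 'f1': 0.0}
    for threshold in np.arange(0.1, 0.9, 0.01):
        score = f1_score(y_true, (y_proba > threshold).astype(int))
        if score > best['f1']:
            best = {'threshold': float(threshold), 'f1': float(score)}
    return best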
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:30000]

    logger.info('Extract nlp features')
    df_train = extract_nlp_features(df_train)

    logger.info('Preprocess text')
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index, embed_types=[0, 2], max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    continuous_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you'
    ]
    for col in continuous_columns:
        scaler = StandardScaler()
        df_train[col] = scaler.fit_transform(
            df_train[col].values.astype(np.float32).reshape(-1, 1)).reshape(-1, )
    x_continuous = [df_train[col].values.reshape(-1, 1) for col in continuous_columns]

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER
    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')
    model_specs = [{
        'nlp_dim': 16,
        'nlp_dropout': 0.2,
        'num_dense_layers': 2,
        'mask': True
    }]
    model_name_base = 'NLPFeaturesRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_nlpdim={spec["nlp_dim"]}_nlpdrop={spec["nlp_dropout"]}'
        model_name += f'_numlayers={spec["num_dense_layers"]}_mask={spec["mask"]}'

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []

        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'text': seq_train[index_train].astype(int),
                'continuous': [x[index_train] for x in x_continuous]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'continuous': [x[index_valid] for x in x_continuous]
            }
            y_train, y_valid = (label_train[index_train].astype(np.float32),
                                label_train[index_valid].astype(np.float32))

            model = NLPFeaturesRNN({'continuous': len(x_continuous)}, embedding_matrix,
                                   PADDING_LENGTH, hidden_size=64, out_hidden_dim=64,
                                   embed_drop=0.2, out_drop=0.3, dense_activate='relu',
                                   nlp_hidden_dim=spec['nlp_dim'], mask=spec['mask'],
                                   nlp_dropout=spec['nlp_dropout'], factorize=False,
                                   num_dense_layers=spec['num_dense_layers'])

            # The cyclic LR starts after `trigger` epochs; the remaining epochs
            # are split evenly across the snapshot cycles
            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)
            fold_results = calculate_fold_metrics(eval_results,
                                                  label_train[index_valid].reshape(-1, ))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'
            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'
            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'
            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}'
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                post_to_snapshot_metrics_table(data=res, project_id=BQ_PROJECT_ID,
                                               dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results, project_id=BQ_PROJECT_ID,
                                       dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)
        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)
        post_to_total_metrics_table(results, project_id=BQ_PROJECT_ID,
                                    dataset_name=BQ_DATASET)

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'
        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'
        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'
        mv_thresholds = ", ".join([str(th) for th in results["mv_thresholds_avg"]])
        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
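
# The Trainer's internals are not shown in this file. As orientation, here is
# a runnable sketch of the snapshot schedule the config above implies, under
# the assumption that the scheduler is stepped once per batch and one snapshot
# is taken per LR cycle. All numbers are illustrative, not the project's
# constants, and the stand-in model exists only to drive the scheduler.
from torch import nn, optim

steps_per_epoch, epochs, trigger, num_snapshots = 100, 10, 4, 3
scheduler_trigger_steps = steps_per_epoch * trigger                 # 400
step_size = steps_per_epoch * (epochs - trigger) // num_snapshots   # 200

model = nn.Linear(8, 1)                                             # stand-in model
optimizer = optim.Adam(model.parameters(), lr=0.003)
scheduler = optim.lr_scheduler.CyclicLR(
    optimizer, base_lr=0.0005, max_lr=0.003,
    step_size_up=step_size // 2,         # assumption: one full cycle per snapshot
    mode='triangular', cycle_momentum=False)  # Adam has no momentum to cycle

snapshots = []
for step in range(steps_per_epoch * epochs):
    # ... forward / backward / optimizer.step() would happen here ...
    if step >= scheduler_trigger_steps:
        scheduler.step()
        cycled = step - scheduler_trigger_steps + 1
        if cycled % step_size == 0:      # end of a cycle -> take a snapshot
            snapshots.append({k: v.clone() for k, v in model.state_dict().items()})
print(len(snapshots))  # -> 3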
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)

    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    # Preprocess in debug mode too; previously the debug branch skipped it
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300)
    else:
        embedding_matrix = load_multiple_embeddings(
            tokenizer.word_index, embed_types=[0, 2], max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrix).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS

    logger.info('Start training and evaluation loop')
    model_name = 'StackedCNNRNN'
    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds_optimized = np.zeros(seq_train.shape[0])
    oof_preds_majority = np.zeros(seq_train.shape[0])
    results = []

    for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(int), seq_train[index_valid].astype(int)
        y_train, y_valid = (label_train[index_train].astype(np.float32),
                            label_train[index_valid].astype(np.float32))

        model = StackedCNNRNN(embedding_matrix, PADDING_LENGTH, hidden_size=32,
                              out_hidden_dim=32, kernel_sizes=(3, 5), seq_dropout=0.2,
                              out_drop=0.2, embed_drop=0.1)
        config = {
            'epochs': epochs,
            'batch_size': batch_size,
            'output_device': output_device,
            'optimizer': 'adam',
            'optimizer_lr': 0.003,
            'num_snapshots': NUM_SNAPSHOTS,
            'scheduler_type': 'cyclic',
            'base_lr': 0.00001,
            'max_lr': 0.003,
            'step_size': 1200,
            'scheduler_mode': 'triangular',
            'scheduler_gamma': 0.9,
            'scheduler_trigger_steps': 4000,
            'sampler_type': 'normal',
            'seed': SEED
        }

        trainer = Trainer(model, logger, config)
        eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)

        # Majority voting: average the binary snapshot predictions, threshold at 0.5
        oof_preds_majority[index_valid] = np.array(
            [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
        oof_majority_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                   oof_preds_majority[index_valid])

        # Optimized threshold: average the snapshot probabilities and thresholds
        oof_preds_proba = np.array([res['preds_proba'] for res in eval_results]).mean(0)
        oof_threshold_mean: float = np.mean([res['best_threshold'] for res in eval_results])
        oof_preds_optimized[index_valid] = oof_preds_proba > oof_threshold_mean
        oof_optimized_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                    oof_preds_optimized[index_valid])

        message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
        logger.post(message)

        post_to_snapshot_spreadsheet(
            logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='SCORE',
            script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
            snapshot_info=[res['f1'] for res in eval_results])
        post_to_snapshot_spreadsheet(
            logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='THRESHOLD',
            script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
            snapshot_info=[res['best_threshold'] for res in eval_results])
        post_to_main_spreadsheet(
            logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
            model_name=model_name, fold=fold, f1_majority=oof_majority_f1,
            f1_optimized=oof_optimized_f1, threshold=oof_threshold_mean)

        results.append({
            'f1_majority': oof_majority_f1,
            'f1_optimized': oof_optimized_f1,
            'threshold': oof_threshold_mean
        })

    f1_majority_mean = np.mean([res['f1_majority'] for res in results])
    f1_majority_std = np.std([res['f1_majority'] for res in results])
    f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
    f1_optimized_std = np.std([res['f1_optimized'] for res in results])
    threshold_mean = np.mean([res['threshold'] for res in results])
    total_metrics = [
        f1_majority_mean, f1_majority_std, f1_optimized_mean, f1_optimized_std,
        threshold_mean
    ]
    post_to_main_spreadsheet(
        logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
        model_name=model_name, fold=-1, f1_majority=-1, f1_optimized=-1, threshold=-1,
        others=total_metrics)

    message = 'KFold training and evaluation has been done.\n'
    message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
    message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
    message += f'Threshold - Avg: {threshold_mean}'
    logger.post(message)
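
# Toy illustration of the two snapshot-combination schemes used above, with
# made-up numbers for three snapshots and four examples; the two schemes can
# disagree, which is why both are tracked.
import numpy as np

preds_binary = np.array([[1, 0, 1, 0],
                         [1, 1, 0, 0],
                         [1, 0, 0, 0]])           # per-snapshot hard votes
majority = preds_binary.mean(0) > 0.5             # -> [ True False False False]

preds_proba = np.array([[0.9, 0.6, 0.6, 0.2],
                        [0.8, 0.5, 0.3, 0.1],
                        [0.7, 0.4, 0.4, 0.2]])    # per-snapshot probabilities
best_thresholds = [0.45, 0.50, 0.40]
optimized = preds_proba.mean(0) > np.mean(best_thresholds)  # -> [ True True False False]
print(majority, optimized)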
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        # Use a small subset and skip the heavy text preprocessing in debug mode
        df_train = df_train.iloc[:30000]
        texts_train = df_train['question_text']
    else:
        logger.info('Preprocess text')
        texts_train = preprocess_text(df_train, return_df=False)

    seq_train, tokenizer = tokenize_texts(texts_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    embed_types = [0, 1, 2]
    logger.info('Start multiprocess nlp feature extraction and embedding matrices loading')
    with mp.Pool(processes=2) as p:
        results = p.map(parallel_apply,
                        [(extract_nlp_features, (df_train, )),
                         (load_multiple_embeddings,
                          (tokenizer.word_index, embed_types, args['debug']))])
    df_train_extracted = results[0]
    embedding_matrices = results[1]
    # Mean of the three embeddings, concatenated with embedding type 1 on its own
    embedding_matrix = np.concatenate(
        [np.array(embedding_matrices).mean(0), embedding_matrices[1]], axis=1)

    nlp_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you'
    ]
    for col in nlp_columns:
        scaler = StandardScaler()
        df_train_extracted[col] = scaler.fit_transform(
            df_train_extracted[col].values.astype(np.float32).reshape(-1, 1)).reshape(-1, )
    x_nlp = [df_train_extracted[col].values.reshape(-1, 1) for col in nlp_columns]
    nlp_size = len(x_nlp)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER
    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')
    # All six specs share the same NLP and RNN layers and differ only in the
    # upper (fully connected) layers
    nlp_layer_types = ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                       {'activation': 'relu', 'dim': 16, 'dropout': 0.2})
    rnn_layer_types = ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                       {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0})
    model_specs = [
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3})},
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 128, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3})},
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 128, 'dropout': 0.5},
                               {'dim': 128, 'dropout': 0.3})},
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 256, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3})},
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3})},
        {'nlp_layer_types': nlp_layer_types,
         'rnn_layer_types': rnn_layer_types,
         'upper_layer_types': ({'dim': 128, 'dropout': 0.5},
                               {'dim': 128, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3})},
    ]
    model_name_base = 'NLPFeaturesDeepRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}'

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []

        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'text': seq_train[index_train].astype(int),
                'nlp': [x[index_train] for x in x_nlp]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'nlp': [x[index_valid] for x in x_nlp]
            }
            y_train, y_valid = (label_train[index_train].astype(np.float32),
                                label_train[index_valid].astype(np.float32))

            model = NLPFeaturesDeepRNN(embedding_matrix, PADDING_LENGTH, nlp_size,
                                       embed_drop=0.2, mask=True,
                                       nlp_layer_types=spec['nlp_layer_types'],
                                       rnn_layer_types=spec['rnn_layer_types'],
                                       upper_layer_types=spec['upper_layer_types'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS
            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)
            fold_results = calculate_fold_metrics(eval_results,
                                                  label_train[index_valid].reshape(-1, ))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'
            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'
            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'
            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}'
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                post_to_snapshot_metrics_table(data=res, project_id=BQ_PROJECT_ID,
                                               dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results, project_id=BQ_PROJECT_ID,
                                       dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)
        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)

        if args['save_preds']:
            save_path = DATA_DIR.joinpath(f'predictions/{SCRIPT_NAME + "_" + model_name + ".pkl"}')
            predictions = {
                'proba': oof_preds_proba,
                'mv': oof_mv_preds,
                'opt': oof_opt_preds,
                'reopt': oof_reopt_preds
            }
            joblib.dump(predictions, str(save_path))

        post_to_total_metrics_table(results, project_id=BQ_PROJECT_ID,
                                    dataset_name=BQ_DATASET)
        logger.post(f'Spec ID: {spec_id}\nModel Spec: {spec}')

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'
        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'
        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'
        mv_thresholds = ", ".join([str(th) for th in results["mv_thresholds_avg"]])
        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
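
# Once several scripts have dumped their OOF predictions via joblib as above,
# they can be blended for a second-level model or a simple weighted average.
# A hedged sketch; the file paths, weights, and threshold grid are all
# hypothetical, and y_true must be the training labels aligned with the OOF arrays.
import joblib
import numpy as np
from sklearn.metrics import f1_score


def blend_and_score(paths, weights, y_true):
    """Weighted-average OOF probability arrays and search a decision threshold."""
    proba = sum(w * joblib.load(p)['proba'] for p, w in zip(paths, weights))
    thresholds = np.arange(0.2, 0.6, 0.01)
    scores = [f1_score(y_true, (proba > t).astype(int)) for t in thresholds]
    best = int(np.argmax(scores))
    return proba, thresholds[best], scores[best]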
if __name__ == '__main__':
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(0)
    print(device)

    whole_data_path = 'data/combined/whole.txt'
    train_data_path = 'data/combined/train.txt'
    dev_data_path = 'data/combined/dev.txt'
    test_data_path = 'data/combined/digitoday.2015.test.txt'
    wiki_data_path = 'data/combined/wikipedia.test.txt'
    whole_data_morph_path = 'utils/subword_segmentation/output/segmented/whole_vocab_segmented.txt'

    whole_data = prepare_data.load_data(whole_data_path)
    train_data = prepare_data.load_data(train_data_path)
    dev_data = prepare_data.load_data(dev_data_path)
    test_data = prepare_data.load_data(test_data_path)
    wiki_data = prepare_data.load_data(wiki_data_path)
    whole_data_morphs = prepare_data.load_data_morphs(whole_data_morph_path)

    # convert to lower case
    whole_data = prepare_data.to_lower(whole_data)
    train_data = prepare_data.to_lower(train_data)
    dev_data = prepare_data.to_lower(dev_data)
    test_data = prepare_data.to_lower(test_data)
    wiki_data = prepare_data.to_lower(wiki_data)
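
# The prepare_data helpers are project-specific and not shown here. As a
# rough sketch only: a plausible minimal load_data for a CoNLL-style NER file
# (one tab-separated "token<TAB>tag" pair per line, blank lines separating
# sentences), plus the matching to_lower. The tab separator is an assumption.
def load_data(path):
    sentences, current = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:                     # blank line = sentence boundary
                if current:
                    sentences.append(current)
                    current = []
                continue
            token, tag = line.split('\t')[:2]
            current.append((token, tag))
    if current:
        sentences.append(current)
    return sentences


def to_lower(sentences):
    # lower-case tokens only; tags are left untouched
    return [[(tok.lower(), tag) for tok, tag in sent] for sent in sentences]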
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)

    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    # Preprocess in debug mode too; previously the debug branch skipped it
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH,
                              padding='post', truncating='post')
    # 1-based position ids, zeroed wherever the post-padded sequence is padding
    pos_train = np.repeat([np.arange(PADDING_LENGTH) + 1], seq_train.shape[0], axis=0)
    pos_train = pos_train * np.not_equal(seq_train, 0)
    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index, embed_types=[0, 2], max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')
    model_specs = [{'attention_type': 'general', 'num_layers': 2},
                   {'attention_type': 'dot', 'num_layers': 2},
                   {'attention_type': 'general', 'num_layers': 1},
                   {'attention_type': 'dot', 'num_layers': 1}]
    model_name_base = 'TransformerRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_attentiontype={spec["attention_type"]}'
        model_name += f'_numlayers={spec["num_layers"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(seq_train.shape[0])
        oof_preds_majority = np.zeros(seq_train.shape[0])
        results = []

        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'sequence': seq_train[index_train].astype(int),
                'position': pos_train[index_train].astype(int)
            }
            x_valid = {
                'sequence': seq_train[index_valid].astype(int),
                'position': pos_train[index_valid].astype(int)
            }
            y_train, y_valid = (label_train[index_train].astype(np.float32),
                                label_train[index_valid].astype(np.float32))

            model = TransformerRNN(embedding_matrix, PADDING_LENGTH, hidden_dim=64,
                                   out_hidden_dim=64, out_drop=0.3, embed_drop=0.2,
                                   attention_type=spec['attention_type'],
                                   num_layers=spec['num_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS
            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [0.5, 0.5],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.001,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.001,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                       oof_preds_majority[index_valid])
            oof_preds_proba = np.array([res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean([res['best_threshold'] for res in eval_results])
            oof_preds_optimized[index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                        oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='SCORE',
                script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])
            post_to_snapshot_spreadsheet(
                logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='THRESHOLD',
                script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])
            post_to_main_spreadsheet(
                logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
                model_name=model_name, fold=fold, f1_majority=oof_majority_f1,
                f1_optimized=oof_optimized_f1, threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean, f1_optimized_std,
            threshold_mean
        ]
        post_to_main_spreadsheet(
            logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
            model_name=model_name, fold=-1, f1_majority=-1, f1_optimized=-1,
            threshold=-1, others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
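
# Quick check of the position-id construction used above: 1-based positions
# that become 0 wherever the post-padded sequence holds padding (id 0).
import numpy as np

PADDING_LENGTH = 6
seq = np.array([[5, 9, 3, 0, 0, 0],
                [7, 2, 8, 4, 1, 0]])
pos = np.repeat([np.arange(PADDING_LENGTH) + 1], seq.shape[0], axis=0)
pos = pos * np.not_equal(seq, 0)
print(pos)
# [[1 2 3 0 0 0]
#  [1 2 3 4 5 0]]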
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:200000]

    logger.info('Extract nlp features')
    df_train = extract_nlp_features(df_train)

    logger.info('Preprocess text')
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index, embed_types=[0, 2], max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    continuous_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you'
    ]
    for col in continuous_columns:
        scaler = StandardScaler()
        df_train[col] = scaler.fit_transform(
            df_train[col].values.astype(np.float32).reshape(-1, 1)).reshape(-1, )
    x_continuous = [df_train[col].values.reshape(-1, 1) for col in continuous_columns]

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')
    model_specs = [
        {'nlp_dim': 64, 'nlp_dropout': 0.2, 'num_dense_layers': 1, 'mask': False},
        {'nlp_dim': 32, 'nlp_dropout': 0.2, 'num_dense_layers': 1, 'mask': False},
        {'nlp_dim': 16, 'nlp_dropout': 0.2, 'num_dense_layers': 2, 'mask': False},
        {'nlp_dim': 32, 'nlp_dropout': 0.2, 'num_dense_layers': 2, 'mask': False},
        {'nlp_dim': 64, 'nlp_dropout': 0.5, 'num_dense_layers': 2, 'mask': False},
        {'nlp_dim': 32, 'nlp_dropout': 0.5, 'num_dense_layers': 1, 'mask': False},
        {'nlp_dim': 64, 'nlp_dropout': 0.2, 'num_dense_layers': 1, 'mask': True},
        {'nlp_dim': 32, 'nlp_dropout': 0.2, 'num_dense_layers': 2, 'mask': True},
    ]
    model_name_base = 'NLPFeaturesRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_nlpdim={spec["nlp_dim"]}_nlpdrop={spec["nlp_dropout"]}'
        model_name += f'_numlayers={spec["num_dense_layers"]}_mask={spec["mask"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(len(seq_train))
        oof_preds_majority = np.zeros(len(seq_train))
        results = []

        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'text': seq_train[index_train].astype(int),
                'continuous': [x[index_train] for x in x_continuous]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'continuous': [x[index_valid] for x in x_continuous]
            }
            y_train, y_valid = (label_train[index_train].astype(np.float32),
                                label_train[index_valid].astype(np.float32))

            model = NLPFeaturesRNN({'continuous': len(x_continuous)}, embedding_matrix,
                                   PADDING_LENGTH, hidden_size=64, out_hidden_dim=64,
                                   out_drop=0.3, embed_drop=0.1, dense_activate='relu',
                                   nlp_hidden_dim=spec['nlp_dim'], mask=spec['mask'],
                                   nlp_dropout=spec['nlp_dropout'], factorize=False,
                                   num_dense_layers=spec['num_dense_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS
            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                       oof_preds_majority[index_valid])
            oof_preds_proba = np.array([res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean([res['best_threshold'] for res in eval_results])
            oof_preds_optimized[index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(label_train.reshape(-1, )[index_valid],
                                        oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='SCORE',
                script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])
            post_to_snapshot_spreadsheet(
                logger, SPREADSHEET_SNAPSHOT_URL, eval_type='SNAPSHOT', tag='THRESHOLD',
                script_name=SCRIPT_NAME, model_name=model_name, fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])
            post_to_main_spreadsheet(
                logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
                model_name=model_name, fold=fold, f1_majority=oof_majority_f1,
                f1_optimized=oof_optimized_f1, threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean, f1_optimized_std,
            threshold_mean
        ]
        post_to_main_spreadsheet(
            logger, SPREADSHEET_MAIN_URL, eval_type='SNAPSHOT', script_name=SCRIPT_NAME,
            model_name=model_name, fold=-1, f1_majority=-1, f1_optimized=-1,
            threshold=-1, others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
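
# Worked example of the scheduler arithmetic shared by these scripts, with
# illustrative numbers (not the project's constants): 1,000,000 training rows,
# batch size 512, EPOCHS = 10, TRIGGER = 4, NUM_SNAPSHOTS = 3.
steps_per_epoch = 1_000_000 // 512              # 1953 batches per epoch
scheduler_trigger_steps = steps_per_epoch * 4   # 7812: cycling starts here
step_size = steps_per_epoch * (10 - 4) // 3     # 3906 steps per snapshot cycle
print(steps_per_epoch, scheduler_trigger_steps, step_size)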
if __name__ == '__main__':
    graph = tf.Graph()
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    config = SimpleConfig()
    model = SimpleModel(config, sess, graph)
    # model.sess.run(model.init)
    # print("\nGlobal Variables Initialized")
    model.restore()

    train_dogs, train_cats = load_data(config.image_size)
    train_batches = prepare_train_data(train_dogs, train_cats, config.batch_size)
    # train_batch = next_batch(train_batches)

    batch_images, batch_labels = map(list, zip(*train_batches[0]))
    batch_images = np.array(batch_images)
    batch_labels = np.array(batch_labels).reshape(-1, 1)

    pred, loss, acc = model.predict(batch_images, batch_labels)
    # zeros = np.zeros((8, 224, 224, 3), dtype=np.int)
    # pred, loss, acc = model.predict(
    #     zeros, np.array([1, 1, 1, 1, 1, 1, 1, 1]).reshape(-1, 1))
    print(pred, batch_labels)
    print(loss)
    print(acc)
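
# prepare_train_data is defined elsewhere; judging from how it is consumed
# above (a list of batches, each a list of (image, label) pairs that can be
# unzipped), a plausible minimal version could look like this sketch. The
# dog=1 / cat=0 label mapping is an assumption.
import random


def prepare_train_data(dogs, cats, batch_size, seed=0):
    samples = [(img, 1) for img in dogs] + [(img, 0) for img in cats]
    random.Random(seed).shuffle(samples)
    return [samples[i:i + batch_size] for i in range(0, len(samples), batch_size)]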
        all_probs.append(prob)

    all_probs = np.array(all_probs)
    print(f'All probs shape: {all_probs.shape}')
    os.makedirs(str(ROOT_PROBA_FOLDER / BIG_CATEGORY / MODEL_NAME), exist_ok=True)
    np.save(str(ROOT_PROBA_FOLDER / BIG_CATEGORY / MODEL_NAME / f'{subset}.npy'),
            all_probs)

    # sanity check: peek at the first ten predictions
    test = all_probs[:10]
    test = np.argmax(test, axis=1)
    print(test)


if __name__ == '__main__':
    x_train = load_data(TRAIN_CSV)
    x_valid = load_data(VALID_CSV)
    x_test = load_data(TEST_CSV)
    if MODEL_NAME == 'adv_abblstm':
        x_valid, x_test, vocab_freq, word2idx, vocab_size = \
            data_preprocessing_with_dict(x_train, x_valid, x_test, max_len=WORD_MAX_LEN)
    else:
        # NOTE: max_len is hard-coded to 16 here, unlike the branch above
        x_valid, x_test, vocab_size = data_preprocessing_v2(
            x_train, x_valid, x_test, max_len=16)
    # print(x_test)
    extract_and_save_probs(x_valid, subset='valid')
    extract_and_save_probs(x_test, subset='test')

"""OUTPUT LAYERS:
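
# Hedged sketch of how the saved probability files could later be consumed for
# stacking: one (n_samples, n_classes) matrix per model, concatenated into
# meta-features. The folder layout mirrors the np.save call above; the
# function name and parameters are assumptions.
import numpy as np
from pathlib import Path


def load_meta_features(root, big_category, model_names, subset):
    probs = [np.load(str(Path(root) / big_category / name / f'{subset}.npy'))
             for name in model_names]
    return np.concatenate(probs, axis=1)   # (n_samples, n_models * n_classes)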
def run():
    if FLAGS.log_file_name:
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()

    # Model Code Block
    word_id_mapping, word_embedding, pos_embedding = load_w2v(
        FLAGS.embedding_dim, FLAGS.embedding_dim_pos, FLAGS.train_file_path, FLAGS.w2v_file)
    word_embedding = tf.constant(word_embedding, dtype=tf.float32, name='word_embedding')
    if FLAGS.pos_trainable:
        print('pos_embedding trainable!')
        pos_embedding = tf.Variable(pos_embedding, dtype=tf.float32, name='pos_embedding')
    else:
        pos_embedding = tf.constant(pos_embedding, dtype=tf.float32, name='pos_embedding')

    print('build model...')
    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    word_dis = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    DGL = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.max_doc_len])
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    doc_len = tf.placeholder(tf.int32, [None])
    keep_prob1 = tf.placeholder(tf.float32)
    keep_prob2 = tf.placeholder(tf.float32)
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class])
    y_p = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, 102])
    placeholders = [x, word_dis, DGL, sen_len, doc_len, keep_prob1, keep_prob2, y, y_p]

    pred_c_tr, pred_c_te, pred_p, reg = build_model(
        word_embedding, pos_embedding, x, word_dis, DGL, sen_len, doc_len,
        keep_prob1, keep_prob2)

    # Cross-entropy for cause prediction and for position prediction, both
    # normalized by the number of valid (non-padded) sentences
    valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
    loss_cause = -tf.reduce_sum(y * tf.log(pred_c_tr)) / valid_num
    loss_position = -tf.reduce_sum(y_p * tf.log(pred_p)) / valid_num
    loss_op = loss_cause + loss_position * FLAGS.lambda1 + reg * FLAGS.l2_reg
    optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op)
    print('lambda1: {}'.format(FLAGS.lambda1))

    true_y_op = tf.argmax(y, 2)
    pred_y_op = tf.argmax(pred_c_tr, 2)
    pred_y_op_te = tf.argmax(pred_c_te, 2)
    print('build model done!\n')

    # Data Code Block
    y_p_data, y_data, x_data, sen_len_data, doc_len_data, word_distance, DGL_data = load_data(
        FLAGS.train_file_path, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)

    # Training Code Block
    print_training_info()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        kf, fold = KFold(n_splits=10), 1
        p_list, r_list, f1_list = [], [], []
        for train, test in kf.split(x_data):
            tr_x, tr_y, tr_y_p, tr_sen_len, tr_doc_len, tr_word_dis, tr_DGL = map(
                lambda x: x[train],
                [x_data, y_data, y_p_data, sen_len_data, doc_len_data, word_distance, DGL_data])
            te_x, te_y, te_y_p, te_sen_len, te_doc_len, te_word_dis, te_DGL = map(
                lambda x: x[test],
                [x_data, y_data, y_p_data, sen_len_data, doc_len_data, word_distance, DGL_data])

            sess.run(tf.global_variables_initializer())
            print('############# fold {} ###############'.format(fold))
            fold += 1
            max_f1 = 0.0
            print('train docs: {} test docs: {}'.format(len(tr_y), len(te_y)))

            for i in range(FLAGS.training_iter):  # range, not Python 2 xrange
                start_time = time.time()
                step = 1
                # train; the loop variable is renamed so it does not shadow
                # the KFold `train` index array
                for train_batch, _ in get_batch_data(
                        tr_x, tr_word_dis, tr_DGL, tr_sen_len, tr_doc_len,
                        FLAGS.keep_prob1, FLAGS.keep_prob2, tr_y, tr_y_p, FLAGS.batch_size):
                    _, loss, loss_c, loss_p, pred_y, true_y, doc_len_batch = sess.run(
                        [optimizer, loss_op, loss_cause, loss_position, pred_y_op,
                         true_y_op, doc_len],
                        feed_dict=dict(zip(placeholders, train_batch)))
                    acc, p, r, f1 = acc_prf(pred_y, true_y, doc_len_batch)
                    print('step {}: loss {:.4f} loss_cause {:.4f} loss_position {:.4f} '
                          'acc {:.4f} \np {:.4f} r {:.4f} f1 {:.4f}'.format(
                              step, loss, loss_c, loss_p, acc, p, r, f1))
                    step = step + 1

                # test (dropout disabled by feeding keep probabilities of 1.0)
                test_feed = [te_x, te_word_dis, te_DGL, te_sen_len, te_doc_len,
                             1., 1., te_y, te_y_p]
                loss, pred_y_te, true_y, doc_len_batch = sess.run(
                    [loss_op, pred_y_op_te, true_y_op, doc_len],
                    feed_dict=dict(zip(placeholders, test_feed)))
                acc, p, r, f1 = acc_prf(pred_y_te, true_y, doc_len_batch)
                if f1 > max_f1:
                    max_acc, max_p, max_r, max_f1 = acc, p, r, f1
                print('\nepoch {}: loss {:.4f} acc {:.4f}\np {:.4f} r {:.4f} f1 {:.4f} '
                      'max_f1 {:.4f}'.format(i, loss, acc, p, r, f1, max_f1))
                print("cost time: {:.1f}s\n".format(time.time() - start_time))

            print('Optimization Finished!\n')
            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)

    print_training_info()
    p, r, f1 = map(lambda x: np.array(x).mean(), [p_list, r_list, f1_list])
    print("f1_score in 10 fold: {}\naverage : p {} r {} f1 {}\n".format(
        np.array(f1_list).reshape(-1, 1), p, r, f1))
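
# acc_prf is defined elsewhere in the project. A plausible sketch, consistent
# with how it is called above: score only the valid (non-padded) sentences of
# each document, treating class 1 as the positive "cause" label. The label
# convention and smoothing constants are assumptions.
import numpy as np


def acc_prf(pred_y, true_y, doc_len):
    pred, true = [], []
    for p_doc, t_doc, n in zip(pred_y, true_y, doc_len):
        pred.extend(p_doc[:n])   # keep only the first n real sentences
        true.extend(t_doc[:n])
    pred, true = np.array(pred), np.array(true)
    acc = (pred == true).mean()
    tp = ((pred == 1) & (true == 1)).sum()
    precision = tp / max((pred == 1).sum(), 1)
    recall = tp / max((true == 1).sum(), 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-8)
    return acc, precision, recall, f1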