def test(model_filename: str, test_corpus: str, window_size: int = 5,
         _run: Run = None, _log: logger = None):
    """Evaluate a stored CRF model on ``test_corpus`` and record the scores.

    The corpus and ``{model_filename}.pkl`` are registered as resources on
    ``_run``. Features and gold labels are extracted sentence by sentence,
    a ``sklearn_crfsuite.CRF`` built with ``model_filename`` produces the
    predictions, and the output of ``evaluate`` is written to ``_run.info``,
    ``_run.log_scalar`` and ``_log``.

    Args:
        model_filename: Path handed to ``sklearn_crfsuite.CRF``; its
            ``.pkl`` sibling is only registered as a resource here.
        test_corpus: Path of the tagged corpus to evaluate on.
        window_size: Forwarded to ``sent2features``.
        _run: Run object that receives resources and metrics.
        _log: Logger that receives progress and metric messages.
    """
    pickle_path = f'{model_filename}.pkl'
    _run.add_resource(test_corpus)
    _run.add_resource(pickle_path)

    sentences, _ = get_tagged_sents_and_words(test_corpus)
    X_test = [sent2features(sentence, window_size) for sentence in sentences]
    y_test = [sent2labels(sentence) for sentence in sentences]

    _log.info(f'load from: {pickle_path}')
    tagger = sklearn_crfsuite.CRF(model_filename=model_filename)
    predictions = tagger.predict(X_test)
    overall, by_type = evaluate(y_test, predictions)

    # Overall scores: (suffix of the reported name, attribute on `overall`).
    overall_metrics = (
        ('f1', 'f1_score'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    for suffix, attribute in overall_metrics:
        score = getattr(overall, attribute)
        _run.info[f'overall_{suffix}'] = score
        _run.log_scalar(f'overall_{suffix}', score)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for type_name in sorted(by_type):
        type_scores = by_type[type_name]
        for field in type_scores._fields:
            score = getattr(type_scores, field)
            metric_name = f'{type_name}-{field}'
            _run.info[metric_name] = score
            _run.log_scalar(metric_name, score)
            _log.info(f'{metric_name}: {score}')
def train(train_corpus: str, dev_corpus: str, c1: float = 0.0, c2: float = 0.0,
          algorithm: str = 'lbfgs', max_iterations: int = 100,
          all_possible_transitions: bool = False, window_size: int = 1,
          model_filename: str = None, _run: Run = None, _log: logger = None):
    """Fit a CRF on ``train_corpus``, score it on ``dev_corpus``, optionally save it.

    Both corpora are registered as resources on ``_run``. A
    ``sklearn_crfsuite.CRF`` is fitted on the training features, the dev
    predictions are passed to ``evaluate``, and the resulting scores are
    written to ``_run.info``, ``_run.log_scalar`` and ``_log``. When
    ``model_filename`` is given, the fitted estimator is dumped with joblib
    to ``{model_filename}.pkl`` and attached to ``_run`` as an artifact.

    Args:
        train_corpus: Path of the tagged training corpus.
        dev_corpus: Path of the tagged development corpus.
        c1, c2, algorithm, max_iterations, all_possible_transitions:
            Forwarded unchanged to ``sklearn_crfsuite.CRF``.
        window_size: Forwarded to ``sent2features``.
        model_filename: Forwarded to ``sklearn_crfsuite.CRF`` and used as
            the stem of the pickle file; ``None`` skips saving.
        _run: Run object that receives resources, metrics and the artifact.
        _log: Logger that receives progress and metric messages.
    """
    _run.add_resource(train_corpus)
    _run.add_resource(dev_corpus)

    training_sentences, _ = get_tagged_sents_and_words(train_corpus)
    dev_sentences, _ = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(sentence, window_size) for sentence in training_sentences]
    y_train = [sent2labels(sentence) for sentence in training_sentences]
    X_dev = [sent2features(sentence, window_size) for sentence in dev_sentences]
    y_dev = [sent2labels(sentence) for sentence in dev_sentences]

    model = sklearn_crfsuite.CRF(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iterations,
        all_possible_transitions=all_possible_transitions,
        model_filename=model_filename,
    )
    model.fit(X_train, y_train)

    predictions = model.predict(X_dev)
    overall, by_type = evaluate(y_dev, predictions)

    # Overall scores: (suffix of the reported name, attribute on `overall`).
    overall_metrics = (
        ('f1', 'f1_score'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    for suffix, attribute in overall_metrics:
        score = getattr(overall, attribute)
        _run.info[f'overall_{suffix}'] = score
        _run.log_scalar(f'overall_{suffix}', score)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for type_name in sorted(by_type):
        type_scores = by_type[type_name]
        for field in type_scores._fields:
            score = getattr(type_scores, field)
            metric_name = f'{type_name}-{field}'
            _run.info[metric_name] = score
            _run.log_scalar(metric_name, score)
            _log.info(f'{metric_name}: {score}')

    if model_filename is None:
        return

    pickle_path = f'{model_filename}.pkl'
    _log.info(f'saving to: {pickle_path}')
    joblib.dump(model, pickle_path)
    _run.add_artifact(pickle_path)
def train(train_corpus: str, dev_corpus: str, char_int: int, save_path: str,
          test_corpus: str = None, dropout: float = 0.5, num_epochs: int = 10,
          lm_loss_scale=0.1, device: int = 0, save=False, _run: Run = None):
    """Run a ``TrainerMod`` training job and optionally chain the ``test`` command.

    The training and dev corpora are registered as resources on ``_run``
    before a ``TrainerMod`` is built and run. If ``test_corpus`` is truthy
    it is registered as well and ``ex.run_command('test', ...)`` is invoked
    with ``save_path``, ``test_corpus`` and ``device`` as config updates.

    Args:
        train_corpus: Path of the training corpus.
        dev_corpus: Path of the development corpus.
        char_int: Passed to ``TrainerMod`` as ``char_integration_method``.
        save_path: Passed to ``TrainerMod`` and to the ``test`` command.
        test_corpus: Optional corpus; a falsy value skips the test command.
        dropout, num_epochs, lm_loss_scale, save, device: Forwarded
            unchanged to ``TrainerMod``.
        _run: Run object that receives the resources; also given to ``TrainerMod``.
    """
    for corpus in (train_corpus, dev_corpus):
        _run.add_resource(corpus)

    trainer_options = {
        'num_epochs': num_epochs,
        'dropout': dropout,
        'char_integration_method': char_int,
        'lm_loss_scale': lm_loss_scale,
        'save': save,
        'device': device,
    }
    TrainerMod(_run, train_corpus, save_path, dev_corpus, **trainer_options).run()

    if not test_corpus:
        return

    _run.add_resource(test_corpus)
    test_config = {
        'save_path': save_path,
        'test_corpus': test_corpus,
        'device': device,
    }
    ex.run_command('test', config_updates=test_config)
def train_w_pretrained(train_corpus: str, dev_corpus: str, char_int: int,
                       pretrained_embeddings: str, save_path: str,
                       test_corpus: str = None, word_embedding_size: int = 300,
                       update_pretrained_embedding: bool = True,
                       dropout: float = 0.5, num_epochs: int = 10,
                       lm_loss_scale=0.1, device: int = 0, save=False,
                       _run: Run = None):
    """Run a ``TrainerMod`` job with pretrained embeddings, then optionally test.

    The training and dev corpora are registered as resources on ``_run``.
    A ``TrainerMod`` is built with ``model_class=NewSequenceLabeler`` and the
    pretrained-embedding settings, then run. If ``test_corpus`` is truthy it
    is registered as well and ``ex.run_command('test_w_pretrained', ...)`` is
    invoked with ``save_path``, ``test_corpus`` and ``device`` as config
    updates.

    Args:
        train_corpus: Path of the training corpus.
        dev_corpus: Path of the development corpus.
        char_int: Passed to ``TrainerMod`` as ``char_integration_method``.
        pretrained_embeddings: Forwarded to ``TrainerMod``.
        save_path: Passed to ``TrainerMod`` and to the test command.
        test_corpus: Optional corpus; a falsy value skips the test command.
        word_embedding_size, update_pretrained_embedding, dropout,
        num_epochs, lm_loss_scale, save, device: Forwarded unchanged to
            ``TrainerMod``.
        _run: Run object that receives the resources; also given to ``TrainerMod``.
    """
    for corpus in (train_corpus, dev_corpus):
        _run.add_resource(corpus)

    trainer_options = {
        'word_embedding_size': word_embedding_size,
        'num_epochs': num_epochs,
        'dropout': dropout,
        'char_integration_method': char_int,
        'lm_loss_scale': lm_loss_scale,
        'save': save,
        'device': device,
        'pretrained_embeddings': pretrained_embeddings,
        'update_pretrained_embedding': update_pretrained_embedding,
        'model_class': NewSequenceLabeler,
    }
    TrainerMod(_run, train_corpus, save_path, dev_corpus, **trainer_options).run()

    if not test_corpus:
        return

    _run.add_resource(test_corpus)
    test_config = {
        'save_path': save_path,
        'test_corpus': test_corpus,
        'device': device,
    }
    ex.run_command('test_w_pretrained', config_updates=test_config)
def train(train_corpus: str, dev_corpus: str, pacrf: str, model_filename: str,
          labels: List, c1: float = 0.0, c2: float = 1.0,
          algorithm: str = 'lbfgs', max_iterations: int = None,
          all_possible_transitions: bool = False, window_size: int = 0,
          _run: Run = None, _log: logger = None):
    """Train a CRF with the external ``pacrf`` binary and score it on the dev set.

    Both corpora are registered as resources on ``_run``. Training features
    are written to a temporary file with ``print_corpus`` and handed to
    ``pacrf learn``; the resulting model file is attached to ``_run`` as an
    artifact. The dev features are then tagged with ``pacrf tag``, the
    predictions are read back and compared with the gold labels via
    ``evaluate``, and the scores are written to ``_run.info``,
    ``_run.log_scalar`` and ``_log``.

    Args:
        train_corpus: Path of the tagged training corpus.
        dev_corpus: Path of the tagged development corpus.
        pacrf: Path of the external CRF executable.
        model_filename: Path where the binary stores / reads the model.
        labels: Forwarded to ``print_corpus``.
        c1, c2: Passed to the binary as ``-p c1=...`` / ``-p c2=...``.
        algorithm: Passed to the binary as ``-a``.
        max_iterations: Accepted for interface compatibility; not used here.
        all_possible_transitions: Accepted for interface compatibility;
            not used here.
        window_size: Forwarded to ``print_corpus``.
        _run: Run object that receives resources, metrics and the artifact.
        _log: Logger that receives metric messages.
    """
    _run.add_resource(train_corpus)
    _run.add_resource(dev_corpus)

    train_sents, _ = get_tagged_sents_and_words(train_corpus)
    dev_sents, _ = get_tagged_sents_and_words(dev_corpus)
    y_dev = [sent2labels_colmap(s, col=1) for s in dev_sents]

    with tempfile.NamedTemporaryFile(mode='w+') as tmp_train:
        print_corpus(train_sents, labels, tmp_train, window_size=window_size)
        # The binary opens the file by name, so buffered writes must reach
        # the disk before it starts.
        tmp_train.flush()
        learn_proc = Popen([pacrf, "learn", "-m", model_filename,
                            "-a", algorithm,
                            "-p", f"c1={c1}", "-p", f"c2={c2}",
                            tmp_train.name])
        # No pipes are attached, so the binary's output goes straight to the
        # inherited stdout/stderr; communicate() just waits for it to exit.
        learn_proc.communicate()

    _run.add_artifact(model_filename)

    with tempfile.NamedTemporaryFile(mode='w+') as tmp_dev, \
            tempfile.NamedTemporaryFile(mode='w+') as tmp_pred:
        print_corpus(dev_sents, labels, tmp_dev, window_size=window_size)
        tmp_dev.flush()
        # The original only had this step as a commented-out
        # `tag -m model dev > pred` call, leaving the prediction file empty.
        # NOTE(review): assumes `tag` writes predictions to stdout in a
        # layout get_tagged_sents_and_words can parse - confirm.
        tag_proc = Popen([pacrf, "tag", "-m", model_filename, tmp_dev.name],
                         stdout=tmp_pred)
        tag_proc.communicate()
        # The helper returns a (sentences, words) pair; only the sentences
        # carry the predicted labels.
        pred_sents, _ = get_tagged_sents_and_words(tmp_pred.name)

    y_pred = [sent2labels_colmap(s, 0) for s in pred_sents]
    overall, by_type = evaluate(y_dev, y_pred)

    _run.info['overall_f1'] = overall.f1_score
    _run.log_scalar('overall_f1', overall.f1_score)
    _run.info['overall_precision'] = overall.precision
    _run.log_scalar('overall_precision', overall.precision)
    _run.info['overall_recall'] = overall.recall
    _run.log_scalar('overall_recall', overall.recall)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for key in sorted(by_type):
        type_scores = by_type[key]
        for metric_key in type_scores._fields:
            metric_val = getattr(type_scores, metric_key)
            _run.info[f'{key}-{metric_key}'] = metric_val
            _run.log_scalar(f'{key}-{metric_key}', metric_val)
            _log.info(f'{key}-{metric_key}: {metric_val}')