예제 #1
0
파일: exp.py 프로젝트: hxiaom/ScienceIE
def run_exp_dev(crf, train_feat_dirs, dev_feat_dirs, target_label):
    """
    Run an experiment with training on the train data and testing on the dev data
    """
    # Collect data for running CRF classifier
    train_dir = join(LOCAL_DIR, 'train')
    true_iob_dir = join(train_dir, 'iob')
    X_train = collect_features(true_iob_dir, *train_feat_dirs)
    train_labels_fname = join(train_dir, 'train_labels.pkl')
    train_labels = read_labels(train_labels_fname)
    y_train_true = train_labels[target_label]

    dev_dir = join(LOCAL_DIR, 'dev')
    true_iob_dir = join(dev_dir, 'iob')
    X_dev = collect_features(true_iob_dir, *dev_feat_dirs)
    dev_labels_fname = join(dev_dir, 'dev_labels.pkl')
    dev_labels = read_labels(dev_labels_fname)
    y_dev_true = dev_labels[target_label]

    # Predict
    crf.fit(X_train, y_train_true)
    y_dev_pred = crf.predict(X_dev)
    print(
        flat_classification_report(y_dev_true,
                                   y_dev_pred,
                                   digits=3,
                                   labels=('B', 'I')))
    return y_dev_pred
예제 #2
0
파일: exp.py 프로젝트: hxiaom/ScienceIE
def eval_exp_train(preds, part='train', postproc=None, zip_fname=None):
    """
    Evaluate predictions from experiment

    Converts IOB tags predicted by CRF to Brat format and then calls the official scoring function.
    """
    part_dir = join(LOCAL_DIR, part)
    true_iob_dir = join(part_dir, 'iob')

    labels_fname = join(part_dir, part + '_labels.pkl')
    labels = read_labels(labels_fname)
    filenames = labels['__filenames__']

    # Convert CRF prediction to IOB tags
    pred_iob_dir = '_' + part + '/iob'
    pred_to_iob(preds, filenames, true_iob_dir, pred_iob_dir)

    if postproc:
        postproc_dir = '_' + part + '/iob_pp'
        postproc(pred_iob_dir, postproc_dir)
        pred_iob_dir = postproc_dir

    # Convert predicted IOB tags to predicted Brat annotations
    txt_dir = join(DATA_DIR, part)
    brat_dir = '_' + part + '/brat'
    iob_to_brat(pred_iob_dir, txt_dir, brat_dir)

    # Evaluate
    calculateMeasures(txt_dir, brat_dir, 'rel')

    if zip_fname:
        package(brat_dir, part, zip_fname)

    return brat_dir
예제 #3
0
파일: exp.py 프로젝트: hxiaom/ScienceIE
def run_exp_train_cv(crf, feat_dirs, target_label, n_folds=5, n_jobs=-1):
    """
    Run cross-validated experiment on training data
    """
    # Collect data for running CRF classifier
    train_dir = join(LOCAL_DIR, 'train')
    true_iob_dir = join(train_dir, 'iob')
    X = collect_features(true_iob_dir, *feat_dirs)
    labels_fname = join(train_dir, 'train_labels.pkl')
    labels = read_labels(labels_fname)
    y_true = labels[target_label]
    folds_fname = join(train_dir, 'folds.pkl')
    folds = read_folds(folds_fname, n_folds)

    # Predict]
    y_pred = cross_val_predict(crf,
                               X,
                               y_true,
                               cv=folds,
                               verbose=2,
                               n_jobs=n_jobs)
    print(
        flat_classification_report(y_true, y_pred, digits=3,
                                   labels=('B', 'I')))
    return y_pred
예제 #4
0
파일: exp.py 프로젝트: hxiaom/ScienceIE
def run_exp_test(crf, train_feat_dirs, dev_feat_dirs, test_feat_dirs,
                 target_label):
    """
    Run an experiment with training on the train and dev data combined and testing on the test data
    """
    # Collect data for running CRF classifier

    # train
    train_dir = join(LOCAL_DIR, 'train')
    true_iob_dir = join(train_dir, 'iob')
    X_train = collect_features(true_iob_dir, *train_feat_dirs)
    train_labels_fname = join(train_dir, 'train_labels.pkl')
    train_labels = read_labels(train_labels_fname)
    y_train_true = train_labels[target_label]

    # dev
    dev_dir = join(LOCAL_DIR, 'dev')
    true_iob_dir = join(dev_dir, 'iob')
    X_dev = collect_features(true_iob_dir, *dev_feat_dirs)
    dev_labels_fname = join(dev_dir, 'dev_labels.pkl')
    dev_labels = read_labels(dev_labels_fname)
    y_dev_true = dev_labels[target_label]

    # now combine train and dev data
    X_combined = X_train + X_dev
    y_combined_true = y_train_true + y_dev_true

    # test
    test_dir = join(LOCAL_DIR, 'test')
    true_iob_dir = join(test_dir, 'iob')
    X_test = collect_features(true_iob_dir, *test_feat_dirs)
    test_labels_fname = join(test_dir, 'test_labels.pkl')
    test_labels = read_labels(test_labels_fname)
    y_test_true = test_labels[target_label]

    # Predict
    crf.fit(X_combined, y_combined_true)
    y_test_pred = crf.predict(X_test)
    try:
        print(
            flat_classification_report(y_test_true,
                                       y_test_pred,
                                       digits=3,
                                       labels=('B', 'I')))
    except ZeroDivisionError:
        print('WARNING: no true annotation')
    return y_test_pred
예제 #5
0
파일: exp.py 프로젝트: hxiaom/ScienceIE
def run_exp_train(crf, feat_dirs, target_label):
    """
    Run an experiment with both training and testing on the train data
    """
    # Collect data for running CRF classifier
    train_dir = join(LOCAL_DIR, 'train')
    true_iob_dir = join(train_dir, 'iob')
    X = collect_features(true_iob_dir, *feat_dirs)
    labels_fname = join(train_dir, 'train_labels.pkl')
    labels = read_labels(labels_fname)
    y_true = labels[target_label]

    # Predict
    crf.fit(X, y_true)
    y_pred = crf.predict(X)
    print(
        flat_classification_report(y_true, y_pred, digits=3,
                                   labels=('B', 'I')))
    return y_pred
예제 #6
0
base_feats_dir = join(EXPS_DIR, 'crf1/_train/features1')
word_feats_dir = join(EXPS_DIR, 'wordfeats/_train/wordfeats1')
wn_feats_dir = join(EXPS_DIR, 'morewn/_train/wnfeats1')

feat_dirs = [
    base_feats_dir,
    word_feats_dir,
    wn_feats_dir
]

train_dir = join(LOCAL_DIR, 'train')
true_iob_dir = join(train_dir, 'iob')
X = collect_features(true_iob_dir, *feat_dirs)

labels_fname = join(train_dir, 'train_labels.pkl')
labels = read_labels(labels_fname)

n_folds = 5
folds_fname = join(train_dir, 'folds.pkl')
folds = read_folds(folds_fname, n_folds)

params_space = {
    'c1': [0.0, 0.001, 0.01, 0.1, 1.0, 10],
    'c2': [0.0, 0.001, 0.01, 0.1, 1.0, 10],
    'min_freq': [0, 1, 2, 3, 5, 10],
    #'all_possible_states': [True, False],
    #'all_possible_transitions': [True, False]
}

# use the same metric for evaluation
f1_scorer = make_scorer(flat_f1_score, average='micro', labels=('B', 'I'))