Exemplo n.º 1
0
def main():
    """Extract features and write them to the requested output files.

    Loads the linkage truth, the connective detector, the corpus and the
    vector file named by the parsed command-line arguments.  A feature file
    is then written for ``args.output`` and/or ``args.perfect_output``,
    whichever is set.  The perfect variant additionally passes
    ``args.output_ambig`` positionally and the linkage truth object as
    ``perfect``.
    """
    opts = process_commands()

    # load data
    gold_links = linkage.LinkageFile(opts.linkage)
    gold_words = gold_links.all_words()
    cnnct_detector = linkage.LinkageDetector(opts.connective)
    sentences = corpus.CorpusFile(
        opts.corpus,
        opts.corpus_pos,
        opts.corpus_parse,
    )
    word_vectors = corpus.VectorFile(opts.vector)

    # regular feature file
    if opts.output:
        candidates, targets, samples = get_features(
            cnnct_detector,
            sentences,
            word_vectors,
            gold_words,
            select=opts.select,
            reverse_select=opts.reverse_select,
        )
        output_file(opts.output, candidates, targets, samples)

    # feature file built with the linkage truth passed as ``perfect``
    if opts.perfect_output:
        candidates, targets, samples = get_features(
            cnnct_detector,
            sentences,
            word_vectors,
            gold_words,
            opts.output_ambig,
            select=opts.select,
            reverse_select=opts.reverse_select,
            perfect=gold_links,
        )
        output_file(opts.perfect_output, candidates, targets, samples)
Exemplo n.º 2
0
def main():
    """Run ``stat_all_detect`` over the corpus named on the command line.

    Builds the connective detector, the corpus, the linkage truth and the
    argument truth (initialised against the corpus) from the parsed
    arguments, then hands them to ``stat_all_detect`` together with the two
    output-count options.
    """
    opts = process_commands()

    cnnct_detector = linkage.LinkageDetector(opts.connective)
    sentences = corpus.CorpusFile(opts.corpus)
    gold_links = linkage.LinkageFile(opts.linkage)

    # the argument truth has to be initialised with the corpus before use
    gold_args = argument.ArgumentFile(opts.argument)
    gold_args.init_truth(sentences)

    stat_all_detect(
        cnnct_detector,
        sentences,
        gold_links,
        gold_args,
        opts.output_count,
        opts.output_cnnct_count,
    )
Exemplo n.º 3
0
def main():
    """Extract linkage features and write the requested feature files.

    Loads the linkage truth, folds helper, connective detector, vectors and
    corpus named by the parsed command-line arguments.  Features are written
    to ``args.output`` and, with ``perfect=True``, to ``args.perfect_output``
    when the respective option is set.  After each write, ``check_accuracy``
    is run if ``args.check_accuracy`` is set.
    """
    opts = process_commands()

    def write_and_check(path, candidates, targets, samples):
        # shared tail of both extraction branches
        output_file(path, candidates, targets, samples)
        if opts.check_accuracy:
            check_accuracy(samples, targets)

    # loading data
    gold_links = linkage.LinkageFile(opts.linkage)
    # NOTE(review): the folds helper is not used below; it is still
    # constructed in case loading it has effects — confirm before removing
    folds_helper = corpus.FoldsHelper(opts.folds)
    cnnct_detector = linkage.LinkageDetector(opts.connective)
    word_vectors = corpus.VectorFile(opts.vector)
    sentences = corpus.CorpusFile(
        opts.corpus, opts.corpus_pos, opts.corpus_parse)

    print('process file')

    if opts.output:
        candidates, targets, samples = get_linkage_features(
            sentences, cnnct_detector, word_vectors, gold_links,
            reverse_select=opts.reverse_select,
            select=opts.select,
            select_cnnct=opts.select_cnnct,
        )
        write_and_check(opts.output, candidates, targets, samples)

    # extract perfect features for sense experiments
    if opts.perfect_output:
        print('process perfect file')

        candidates, targets, samples = get_linkage_features(
            sentences, cnnct_detector, word_vectors, gold_links,
            select=opts.select,
            reverse_select=opts.reverse_select,
            perfect=True,
            select_cnnct=opts.select_cnnct,
        )
        write_and_check(opts.perfect_output, candidates, targets, samples)
Exemplo n.º 4
0
def _collect_fold_predictions(folds, y_true, y_pred, labels):
    """Group truth/prediction values by test fold, plain and length-weighted.

    For every ``(train, test)`` pair yielded by ``folds``, the test rows of
    ``y_true`` and ``y_pred`` are gathered into one list each.  A second
    pair of lists repeats every value ``len(item[1])`` times, where ``item``
    is the matching entry returned by ``extract_indices(labels, test_idx)``
    (presumably one repetition per word of the linkage — confirm).

    Returns ``(fold_truth, fold_pred, weighted_truth, weighted_pred)``;
    each is a list holding one inner list per fold.
    """
    fold_truth, fold_pred = [], []
    weighted_truth, weighted_pred = [], []

    for _, test_idx in folds:
        ys = list(y_true[test_idx])
        yps = list(y_pred[test_idx])
        fold_labels = extract_indices(labels, test_idx)

        fold_truth.append(ys)
        fold_pred.append(yps)

        wys, wyps = [], []
        for y, yp, item in zip(ys, yps, fold_labels):
            length = len(item[1])
            wys.extend([y] * length)
            wyps.extend([yp] * length)

        weighted_truth.append(wys)
        weighted_pred.append(wyps)

    return fold_truth, fold_pred, weighted_truth, weighted_pred


def main():
    """Cross-validate a type classifier on linkage features and print scores.

    Builds two data sets from the linkage truth and the linkage feature
    table: one labelled with ``truth.linkage_type`` and one labelled with
    the ``TRANS``-mapped ``truth.linkage_type2`` (entries whose second-level
    type is in ``NOT_COUNTED`` are left out).  Each set is predicted with a
    10-fold stratified cross validation, and the scores are printed per
    instance and in the length-weighted form.
    """
    args = process_commands()

    truth = linkage.LinkageFile(args.linkage)

    feature_tbl = features.load_features_table(args.linkage_features,
                                               lambda x: tuple(x.split('-')))

    X, Y, labels = [], [], []
    X2, Y2, labels2 = [], [], []

    for label, pset in sorted(truth.linkage.items()):
        feature_set = feature_tbl[label]
        x_set = {key: tbl for key, _, tbl in feature_set}

        for indices in sorted(pset):
            X.append(x_set[indices])
            Y.append(truth.linkage_type[label][indices])
            labels.append((label, indices))

            ctype2 = truth.linkage_type2[label][indices]
            if ctype2 not in NOT_COUNTED:
                X2.append(x_set[indices])
                Y2.append(TRANS[ctype2])
                labels2.append((label, indices))

    X = np.array(X)
    X2 = np.array(X2)
    Y = np.array(Y)
    Y2 = np.array(Y2)

    # the same estimator object is handed to both cross_val_predict calls
    lr = LogisticRegressionCV()

    print('predict 1-level...')
    folds = cross_validation.StratifiedKFold(
        Y, 10, shuffle=True, random_state=np.random.RandomState(1))
    Yp = cross_validation.cross_val_predict(lr, X, Y, cv=folds, n_jobs=10)

    print('predict 2-level...')
    folds2 = cross_validation.StratifiedKFold(
        Y2, 10, shuffle=True, random_state=np.random.RandomState(1))
    Y2p = cross_validation.cross_val_predict(lr, X2, Y2, cv=folds2, n_jobs=10)

    print('collect type predictions...')
    Ys, Yps, wYs, wYps = _collect_fold_predictions(folds, Y, Yp, labels)

    print('collect 2-level type predictions...')
    Y2s, Y2ps, wY2s, wY2ps = _collect_fold_predictions(
        folds2, Y2, Y2p, labels2)

    evaluate.print_sense_scores(Ys, Yps, 'Overall', print_accuracy=True)
    evaluate.print_sense_scores(Y2s,
                                Y2ps,
                                'Overall for 2nd-level',
                                print_accuracy=True)

    print('\n== word stats ==')

    evaluate.print_sense_scores(wYs, wYps, 'Overall', print_accuracy=True)
    evaluate.print_sense_scores(wY2s,
                                wY2ps,
                                'Overall for 2nd-level',
                                print_accuracy=True)
Exemplo n.º 5
0
def main():
    """Run the ranking or the pipeline cross-validation experiment.

    Loads the corpus, folds, linkage truth, detector, feature table, linkage
    counts/probabilities and word ambiguity data named by the parsed
    command-line arguments.  Without ``args.pipeline`` the ranking model is
    run.  With ``args.pipeline`` the pipeline model is run, but only when
    ``args.perfect`` is not set; with both set nothing is run.
    """
    args = process_commands()

    corpus_file = corpus.CorpusFile(
        args.corpus, args.corpus_pos, args.corpus_parse)
    fhelper = corpus.FoldsHelper(args.folds)
    truth = linkage.LinkageFile(args.linkage)
    words = truth.all_words()
    detector = linkage.LinkageDetector(args.connective)
    feature_tbl = features.load_features_table(
        args.linkage_features,
        lambda key: tuple(key.split('-')),
    )

    linkage_counts, lcdict = count_linkage(args.linkage)
    linkage_probs = load_linkage_probs(args.linkage_probs)
    linkage_class = load_linkage_probs(args.linkage_class)

    word_ambig = evaluate.WordAmbig(args.word_ambig)

    ranking_probs = compute_ranking_probs(linkage_probs)

    # The predicates below receive a candidate whose [0] and [1] items are
    # combined into (cand[0], w) keys for every w in cand[1]; the second
    # argument is ignored.
    if args.perfect:
        def cut(cand, _):
            # true when some (cand[0], w) pair is missing from the truth words
            return any((cand[0], w) not in words for w in cand[1])
    else:
        def cut(cand, _):
            # true when the candidate's class value is below the threshold
            return linkage_class[cand] < args.threshold

    # positional arguments shared by both experiment variants
    shared = (
        corpus_file,
        fhelper,
        feature_tbl,
        truth,
        detector,
        linkage_counts,
        lcdict,
        ranking_probs,
        word_ambig,
    )

    # The B2 model
    if not args.pipeline:
        print('===== ranking model =====')
        cross_validation(
            *shared,
            cut=cut,
            words=words,
            perfect=args.perfect,
            count_path=args.word_count,
            arg_output=args.arg_output,
            greedy=args.greedy,
            rank=args.rank,
            predict_sstats=args.predict_sense,
            predict_wstats=args.predict_wstats,
        )
        return

    # pipeline requested together with perfect: nothing is run
    if args.perfect:
        return

    word_probs, _word_truth = load_word_probs(args.word_probs)

    # cut by word probs
    def cut_by_word(cand, _):
        return any(
            word_probs[(cand[0], w)] < args.threshold for w in cand[1]
        )  # or linkage_class[x] < args.threshold

    # The B1 model
    print('\n===== pipeline model =====')
    cross_validation(
        *shared,
        cut=cut_by_word,
        words=words,
        count_path=args.word_count,
        perfect=args.perfect,
        greedy=args.greedy,
        rank=args.rank,
        predict_sstats=args.predict_sense,
        predict_wstats=args.predict_wstats,
    )