Python CorpusFile 예제들, corpus.CorpusFile Python 예제들

예제 #1

0

파일 보기

파일: arg_experiment.py 프로젝트: shaform/disambig

def main():
    """Load the experiment inputs named on the command line and run ``test``.

    The linkings, the two argument files, the corpus (with its POS, parse
    and dependency companions) and the folds helper are all built from
    paths found on the parsed options, in that order.
    """
    opts = process_commands()

    linkings = corpus.load_linking(opts.linking)
    train_arguments = argument.ArgumentFile(opts.argument)
    held_out_arguments = argument.ArgumentFile(opts.argument_test)
    corpus_file = corpus.CorpusFile(
        opts.corpus, opts.corpus_pos, opts.corpus_parse, opts.corpus_dep)
    folds = corpus.FoldsHelper(opts.folds)

    # Boundaries are kept only if requested AND the training and test
    # argument paths are the same.
    keep_boundary = (opts.keep_boundary
                     and opts.argument == opts.argument_test)

    # Keyword-only knobs forwarded verbatim to ``test``.
    tuning = {
        'use_baseline': opts.use_baseline,
        'use_feature': opts.select,
        'reverse_select': opts.reverse_select,
        'rstats': opts.rstats,
        'threshold': opts.threshold,
    }

    test(folds, train_arguments, held_out_arguments, corpus_file, linkings,
         opts.train, opts.test, opts.model, opts.crfsuite, opts.log,
         keep_boundary, **tuning)

예제 #2

0

파일 보기

파일: extract_word_features.py 프로젝트: shaform/disambig

def main():
    """Extract word features and write them to the requested output files.

    Up to two feature sets are produced: a regular one when ``args.output``
    is set, and a "perfect" one (passing the linkage file itself as
    ``perfect`` and ``args.output_ambig`` as an extra positional argument)
    when ``args.perfect_output`` is set.
    """
    opts = process_commands()

    # load data
    gold = linkage.LinkageFile(opts.linkage)
    gold_words = gold.all_words()
    detector = linkage.LinkageDetector(opts.connective)
    corpus_file = corpus.CorpusFile(
        opts.corpus, opts.corpus_pos, opts.corpus_parse)
    vectors = corpus.VectorFile(opts.vector)

    def _extract_and_write(path, *positional, **keyword):
        # Both outputs share the same inputs and selection options; only
        # the extra arguments differ between them.
        cands, Y, X = get_features(
            detector, corpus_file, vectors, gold_words, *positional,
            select=opts.select, reverse_select=opts.reverse_select,
            **keyword)
        output_file(path, cands, Y, X)

    if opts.output:
        _extract_and_write(opts.output)

    if opts.perfect_output:
        _extract_and_write(opts.perfect_output, opts.output_ambig,
                           perfect=gold)

예제 #3

0

파일 보기

파일: statistics.py 프로젝트: shaform/disambig

def main():
    """Load detector, corpus and ground truth, then run ``stat_all_detect``.

    The argument ground truth is initialised against the corpus before the
    statistics are gathered.
    """
    opts = process_commands()

    detector = linkage.LinkageDetector(opts.connective)
    corpus_file = corpus.CorpusFile(opts.corpus)
    linkage_truth = linkage.LinkageFile(opts.linkage)
    argument_truth = argument.ArgumentFile(opts.argument)
    argument_truth.init_truth(corpus_file)

    # The two trailing positional arguments are the count output paths.
    count_outputs = (opts.output_count, opts.output_cnnct_count)
    stat_all_detect(
        detector, corpus_file, linkage_truth, argument_truth, *count_outputs)

예제 #4

0

파일 보기

def preprocess(args):
    """Dump corpus spans as text lines and record the entry labels.

    For every ``(label, tokens)`` entry of the corpus, each ``(start, end)``
    pair in ``corpus_file.edu_corpus[label]`` becomes one line of
    space-joined tokens in ``args.output``. The labels are then written to
    ``args.label``, one per line, in the same iteration order.
    """
    corpus_file = corpus.CorpusFile(args.corpus)

    seen_labels = []
    with open(args.output, 'w') as span_out:
        for label, tokens in corpus_file.corpus.items():
            seen_labels.append(label)
            for start, end in corpus_file.edu_corpus[label]:
                span_out.write(' '.join(tokens[start:end]) + '\n')

    # The label file is only opened once the span file is complete.
    with open(args.label, 'w') as label_out:
        label_out.writelines(label + '\n' for label in seen_labels)

예제 #5

0

파일 보기

def main():
    """Extract linkage features and write them to the requested outputs.

    A regular feature file is produced when ``args.output`` is set and a
    "perfect" one (``perfect=True``) when ``args.perfect_output`` is set.
    After each write, ``check_accuracy`` runs if ``args.check_accuracy``
    is set.
    """
    opts = process_commands()

    # loading data
    truth = linkage.LinkageFile(opts.linkage)
    # Built as in the original, although nothing below reads it.
    fhelper = corpus.FoldsHelper(opts.folds)
    detector = linkage.LinkageDetector(opts.connective)
    vectors = corpus.VectorFile(opts.vector)
    corpus_file = corpus.CorpusFile(
        opts.corpus, opts.corpus_pos, opts.corpus_parse)

    def _extract_and_write(path, **extra):
        # Shared by both outputs; ``extra`` carries the per-output options.
        cands, Y, X = get_linkage_features(
            corpus_file, detector, vectors, truth,
            select=opts.select,
            reverse_select=opts.reverse_select,
            select_cnnct=opts.select_cnnct,
            **extra)
        output_file(path, cands, Y, X)
        if opts.check_accuracy:
            check_accuracy(X, Y)

    print('process file')

    if opts.output:
        _extract_and_write(opts.output)

    # extract perfect features for sense experiments
    if opts.perfect_output:
        print('process perfect file')
        _extract_and_write(opts.perfect_output, perfect=True)

예제 #6

0

파일 보기

def postprocess(args):
    """Regroup blank-line-separated input lines under their corpus labels.

    Labels are read from ``args.label`` (one per line, stripped). For each
    label, one group of lines is consumed from ``args.input`` per entry in
    ``corpus_file.edu_corpus[label]``; a group ends at the first line that
    is exactly a newline, or at end of file. Each group is joined with
    ``@@@@`` and the groups of a label are written tab-separated after the
    label on a single line of ``args.output``.
    """
    corpus_file = corpus.CorpusFile(args.corpus)

    with open(args.label) as label_in:
        labels = [raw.strip() for raw in label_in]

    def _next_group(stream):
        # Consume lines up to and including the next blank line; the blank
        # separator itself is discarded.
        group = []
        for raw in stream:
            if raw == '\n':
                break
            group.append(raw.strip())
        return group

    with open(args.input) as src, open(args.output, 'w') as out:
        for label in labels:
            joined_groups = [
                '@@@@'.join(_next_group(src))
                for _ in corpus_file.edu_corpus[label]
            ]
            out.write('{}\t{}\n'.format(label, '\t'.join(joined_groups)))

예제 #7

0

파일 보기

파일: experiment.py 프로젝트: shaform/disambig

def main():
    """Run cross-validation for the ranking model or the pipeline model.

    * Without ``args.pipeline`` the ranking model runs; candidates are cut
      either by membership in the ground-truth words (``args.perfect``) or
      by a threshold on the loaded linkage class values.
    * With ``args.pipeline`` and without ``args.perfect`` the pipeline
      model runs; candidates are cut by a threshold on word probabilities.
    * With both ``args.pipeline`` and ``args.perfect`` set, neither branch
      is taken and no cross-validation is run.
    """
    opts = process_commands()

    corpus_file = corpus.CorpusFile(
        opts.corpus, opts.corpus_pos, opts.corpus_parse)
    fhelper = corpus.FoldsHelper(opts.folds)
    truth = linkage.LinkageFile(opts.linkage)
    words = truth.all_words()
    detector = linkage.LinkageDetector(opts.connective)

    def _split_key(text):
        # Feature-table keys are '-'-separated strings turned into tuples.
        return tuple(text.split('-'))

    feature_tbl = features.load_features_table(
        opts.linkage_features, _split_key)

    linkage_counts, lcdict = count_linkage(opts.linkage)
    linkage_probs = load_linkage_probs(opts.linkage_probs)
    linkage_class = load_linkage_probs(opts.linkage_class)

    word_ambig = evaluate.WordAmbig(opts.word_ambig)

    ranking_probs = compute_ranking_probs(linkage_probs)

    def _cut_outside_truth(x, _):
        # Cut when any (x[0], word) pair is missing from the truth words.
        return any((x[0], w) not in words for w in x[1])

    def _cut_low_class_value(x, _):
        return linkage_class[x] < opts.threshold

    def _cross_validate(cut, **extra):
        # Everything except ``cut`` and ``extra`` is shared by both models.
        cross_validation(
            corpus_file,
            fhelper,
            feature_tbl,
            truth,
            detector,
            linkage_counts,
            lcdict,
            ranking_probs,
            word_ambig,
            cut=cut,
            words=words,
            perfect=opts.perfect,
            count_path=opts.word_count,
            greedy=opts.greedy,
            rank=opts.rank,
            predict_sstats=opts.predict_sense,
            predict_wstats=opts.predict_wstats,
            **extra
        )

    if not opts.pipeline:
        # The B2 model
        print('===== ranking model =====')
        if opts.perfect:
            chosen_cut = _cut_outside_truth
        else:
            chosen_cut = _cut_low_class_value
        _cross_validate(chosen_cut, arg_output=opts.arg_output)

    elif not opts.perfect:
        word_probs, _word_truth = load_word_probs(opts.word_probs)

        def _cut_low_word_prob(x, _):
            # cut by word probs; the original left an additional
            # linkage_class threshold disabled in a trailing comment.
            return any(
                word_probs[(x[0], w)] < opts.threshold for w in x[1])

        # The B1 model
        print('\n===== pipeline model =====')
        _cross_validate(_cut_low_word_prob)