Exemplo n.º 1
0
def cross_validate(num_iters, algo_name, corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir):
    """Run ``num_iters``-fold cross-validation and print a precision report.

    The files returned by ``get_corpus_files(corpus_dir)`` are shuffled in
    place and split into ``num_iters`` parts with ``split_seq``.  Each fold
    index is then handed to ``cross_validate_inner`` through a
    ``multiprocessing.Pool``; the remaining settings reach the workers via the
    module-level ``_NAIVE_CV_GLOBALS`` list, which this function overwrites.

    Every worker result is unpacked as a 6-tuple
    ``(correct_known, correct_unknown, total_known, total_unknown,
    upper_bound_known, upper_bound_unknown)``.  For each derived ratio the
    mean and the population standard deviation over the folds are printed as
    percentages, followed by the raw results.

    Args:
        num_iters: Number of folds (and of tasks submitted to the pool).
        algo_name: Algorithm identifier; forwarded to the workers and printed.
        corpus_dir: Directory passed to ``get_corpus_files``.
        algo_dir: Forwarded to the workers.
        morph_analysis_dir: Forwarded to the workers.
        N_func: Forwarded to the workers; also passed to
            ``get_tag_set_by_func`` for the final report line.
        error_dir: Forwarded to the workers.

    Returns:
        None.  The report is written to standard output.

    Raises:
        ZeroDivisionError: If any fold reports zero known or zero unknown
            items, or if there are no fold results at all.
    """
    global _NAIVE_CV_GLOBALS

    corpus_files = get_corpus_files(corpus_dir)
    shuffle(corpus_files)
    splits = split_seq(corpus_files, num_iters)

    # Workers are only given the fold index; everything else is read from this
    # module-level list.
    # NOTE(review): this appears to rely on worker processes inheriting module
    # state set here (fork start method) -- confirm for the target platform.
    _NAIVE_CV_GLOBALS = [corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir,
                         num_iters, corpus_files, splits, algo_name]

    pool = multiprocessing.Pool()
    try:
        results = pool.map(cross_validate_inner, range(num_iters))
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a fold fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    def summary(seq):
        """Return (mean, population standard deviation) of seq, both times 100."""
        q, s, n = 0.0, 0.0, 0.0
        for x in seq:
            q += x * x
            s += x
            n += 1.0
        avg = s / n
        # E[x^2] - E[x]^2 can come out slightly negative through rounding when
        # the values are (almost) equal; clamp so math.sqrt does not raise.
        variance = max(q / n - avg ** 2, 0.0)
        dev = math.sqrt(variance)
        return avg * 100, dev * 100

    # float() keeps the ratios exact even when the counts are plain integers
    # and "/" would otherwise truncate.
    prec         = [float(tck + tcu) / (tk + tu) for tck, tcu, tk, tu, _, _ in results]
    known_prec   = [float(tck) / tk              for tck, tcu, tk, tu, _, _ in results]
    unknown_prec = [float(tcu) / tu              for tck, tcu, tk, tu, _, _ in results]

    ub_known     = [float(ubk) / tk for tck, tcu, tk, tu, ubk, ubu in results]
    ub_unknown   = [float(ubu) / tu for tck, tcu, tk, tu, ubk, ubu in results]

    # Single-argument print(...) behaves the same as the print statement.
    print("RESULT:         total precision: {0:.4f}% +- {1:.4f}%".format(*summary(prec)))
    print("RESULT:      by-known precision: {0:.4f}% +- {1:.4f}%".format(*summary(known_prec)))
    print("RESULT:    by-unknown precision: {0:.4f}% +- {1:.4f}%".format(*summary(unknown_prec)))
    print("RESULT:   upper bound by knowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_known)))
    print("RESULT: upper bound by unknowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_unknown)))
    print("RESULT: ")  # Just a separator.
    print("RESULT: Finished {0} algorithm with {1} tagset".format(algo_name, get_tag_set_by_func(N_func)))
    print("RESULT: Raw: " + repr(results))
Exemplo n.º 2
0
def cross_validate_inner(i):
    """Train and evaluate the configured algorithm on fold ``i``.

    All settings except the fold index are read from the module-level
    ``_NAIVE_CV_GLOBALS`` list.  The algorithm is trained on every split other
    than ``splits[i]``, applied to the morphological-analysis files that exist
    for the files of ``splits[i]``, and scored with
    ``calculate_dir_precision``.

    Args:
        i: Index of the split held out for testing.

    Returns:
        The 6-tuple ``(total_correct_known, total_correct_unknown,
        total_known, total_unknown, upper_bound_known, upper_bound_unknown)``
        produced by ``calculate_dir_precision``.

    Raises:
        Exception: If ``algo_name`` is none of ``ALGONAMES.BASELINE``,
            ``ALGONAMES.HMM`` or ``ALGONAMES.MEMM``.
    """
    (corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir,
     num_iters, corpus_files, splits, algo_name) = _NAIVE_CV_GLOBALS

    # NOTE(review): every fold clears and later writes to the same algo_dir;
    # if folds run in parallel processes they may interfere -- confirm.
    remove_directory_content(algo_dir)
    print("Starting {0} fold".format(i))

    fold_indices = range(num_iters)
    train_fold_corpus_files = flatten(splits[j] for j in fold_indices if j != i)
    test_corpus_files = flatten(splits[j] for j in fold_indices if j == i)

    # Keep only the analysis files that actually exist for the test files.
    morph_analysis_files = []
    for test_file in test_corpus_files:
        analysis_path = os.path.join(morph_analysis_dir, os.path.basename(test_file))
        if os.path.exists(analysis_path):
            morph_analysis_files.append(analysis_path)

    if algo_name == ALGONAMES.BASELINE:
        algo = NaiveAlgorithm(N_func=N_func)
        algo.train_from_filelist(train_fold_corpus_files)
    elif algo_name == ALGONAMES.HMM:
        algo = HMMAlgorithm(N_filter_func=N_func)
        algo.train_model_from_filelist(corpus_files=train_fold_corpus_files)
    elif algo_name == ALGONAMES.MEMM:
        algo = MMEMAlgorithm(N_filter_func=N_func)
        algo.train_model_file_list(corpus_filelist=train_fold_corpus_files,
                                   ambiguity_dir=morph_analysis_dir)
    else:
        raise Exception("Not supported algorithm {0}".format(algo_name))

    print("Finished training. Starting testing phase!")
    remove_ambiguity_file_list(ambig_filelist=morph_analysis_files, output_dir=algo_dir, algo=algo)
    print("Finished working of algo. Starting measuring phase")

    # One error-context file per (algorithm, fold, tag set) combination.
    errors_file_name = "{1}_errors_context_{0}_{2}.txt".format(
        i, algo_name, get_tag_set_by_func(N_func))
    measurements = calculate_dir_precision(
        algo_dir=algo_dir,
        ambi_dir=morph_analysis_dir,
        gold_dir=corpus_dir,
        M=M_strict_mathcher,
        N=N_func,
        P=P_no_garbage,
        errors_context_filename=os.path.join(error_dir, errors_file_name))

    # Unpack explicitly so that exactly six values are returned as a tuple.
    (total_correct_known, total_correct_unknown, total_known,
     total_unknown, upper_bound_known, upper_bound_unknown) = measurements
    return (total_correct_known, total_correct_unknown, total_known,
            total_unknown, upper_bound_known, upper_bound_unknown)