Exemplo n.º 1
0
Arquivo: tweaked.py Projeto: laat/ex3
def get_features(in_file, idf_enabled=False):
    """Build the per-pair feature lists for ``in_file``.

    ``in_file`` is handed to ``load_xml.get_pairs``,
    ``create_tree.generate_syntax_tree`` and ``get_attributes_pair``.
    When ``idf_enabled`` is true, ``generate_idf_score`` is run on the
    loaded pairs before any scoring happens.

    Every scorer is expected to yield ``(key, value)`` tuples; each value
    is appended to the list stored under its key, in this order: word
    match, simple negation, tree edit distance, number match, BLEU with
    synonyms for n = 1, 2, 3, and BLEU without synonyms for n = 2.
    Finally the reference attributes ``ref[str(key)]`` are appended to
    every list.

    Returns a ``defaultdict(list)`` mapping key -> feature list.
    """
    print("loading xml...")
    lexical_tree = load_xml.get_pairs(in_file)
    syntax_tree = create_tree.generate_syntax_tree(in_file)
    print("done loading")

    if idf_enabled:
        generate_idf_score(lexical_tree)

    print("parsing reference")
    ref = get_attributes_pair(in_file)

    print("extracting features")
    features = defaultdict(list)

    def add_scores(scores):
        # Append each scorer value to the feature list of its key.
        for key, value in scores:
            features[key].append(value)

    # word matching
    add_scores(lexical.word_match(lexical_tree, idf_enabled=idf_enabled))

    # simple negation
    add_scores(lexical.get_simple_negations(lexical_tree))

    # tree edit distance
    add_scores(syntactic.tree_edit_distance(syntax_tree))

    # number match
    add_scores(lexical.number_match(lexical_tree))

    # NOTE(review): both bleu calls below pass idf_enabled=True regardless
    # of this function's idf_enabled argument -- confirm this is intended.

    # 1,2,3-gram with synonyms of lemmas
    for n in (1, 2, 3):
        add_scores(bleu(lexical_tree, n=n, idf_enabled=True,
                        lemma=True, synonyms=True))

    # 2-gram without synonyms
    add_scores(bleu(lexical_tree, n=2, idf_enabled=True,
                    lemma=True, synonyms=False))

    # Append the reference attributes (task and entailment, per the
    # original comment) to every feature list.  Only the lists are
    # mutated, never the key set, so iterating the dict here is safe.
    for key, values in features.items():
        values.extend(ref[str(key)])

    return features
Exemplo n.º 2
0
def get_features(in_file, idf_enabled=False):
    """Collect the per-pair feature lists for ``in_file``.

    The file is loaded through ``load_xml.get_pairs`` and
    ``create_tree.generate_syntax_tree``; ``generate_idf_score`` is run on
    the loaded pairs when ``idf_enabled`` is true.  Each scorer yields
    ``(key, value)`` tuples whose values are appended, in call order, to
    the list kept under that key.  The reference attributes
    ``ref[str(key)]`` are appended to every list at the end.

    Returns a ``defaultdict(list)`` mapping key -> feature list.
    """
    print("loading xml...")
    lexical_tree = load_xml.get_pairs(in_file)
    syntax_tree = create_tree.generate_syntax_tree(in_file)
    print("done loading")

    if idf_enabled:
        generate_idf_score(lexical_tree)

    print("parsing reference")
    ref = get_attributes_pair(in_file)

    print("extracting features")
    features = defaultdict(list)

    def collect(scores):
        # Append each scorer value to the feature list of its key.
        for pair_key, pair_value in scores:
            features[pair_key].append(pair_value)

    # word matching
    collect(lexical.word_match(lexical_tree, idf_enabled=idf_enabled))

    # lemma matching
    collect(lexical.lemma_match(lexical_tree))

    # bigram matching (lemma)
    collect(lexical.bleu(lexical_tree, n=2, return_only_n=2,
                         idf_enabled=idf_enabled, lemma=True))

    # lemma/POS matching
    # NOTE(review): this step calls lexical.lemma_match again, exactly like
    # the lemma matching step above, so the same score is appended twice --
    # confirm whether a different scorer was intended.
    collect(lexical.lemma_match(lexical_tree))

    # simple negation
    collect(lexical.get_simple_negations(lexical_tree))

    # tree edit distance
    collect(syntactic.tree_edit_distance(syntax_tree))

    # Append the reference attributes (task and entailment, per the
    # original comment) to every feature list; only the lists change,
    # the key set does not.
    for pair_key in features:
        features[pair_key].extend(ref[str(pair_key)])

    return features
Exemplo n.º 3
0
Arquivo: rte.py Projeto: laat/ex3
def main(tree, output, method, threshold, find_best, n=4, idf_enabled=False):
    """Run one method end to end: load, score, classify, write, evaluate.

    tree        -- input handed to the loaders (presumably the path of the
                   XML file, judging by the progress messages -- confirm).
    output      -- destination passed to the writers and to ``evaluate``.
    method      -- key into ``METHODS``; also selects how ``tree`` is loaded.
    threshold   -- forwarded to ``classify_results``.
    find_best   -- when true, only ``find_best_threshold`` is run.
    n           -- forwarded to the selected method.
    idf_enabled -- when true, IDF scores are generated from the loaded
                   pairs and the flag is forwarded to the method.

    After loading, ``tree`` is rebound to a ``(data, original)`` tuple for
    the methods handled below; any other method leaves it untouched.
    """
    uses_knn = method in ("knn", "knn-xv")

    # "features" only dumps the feature table and stops.
    if method == "features":
        write_features(output, get_features(tree, idf_enabled))
        return

    # load xml and idf
    if method in ("word", "lemma", "bleu"):
        print("Loading xmlfile")
        tree = (load_xml.get_pairs(tree), tree)
        print("done.")

        if idf_enabled:
            generate_idf_score(tree[0])
    elif method in ("print_ted", "ted"):
        print("Loading xmlfile")
        tree = (create_tree.generate_syntax_tree(tree), tree)
        print("done.")

        if idf_enabled:
            # IDF needs the lexical pairs, so they are loaded from the
            # original input kept in tree[1].
            generate_idf_score(load_xml.get_pairs(tree[1]))
    elif uses_knn:
        tree = (tree, tree)

    # run methods
    if find_best:
        find_best_threshold(tree[0], METHODS[method], tree[1],
                            output, n=n, idf_enabled=idf_enabled)
        return

    if uses_knn:
        # The knn methods read their input from a feature file on disk.
        write_features("features.tab",
                       get_features(tree[0], idf_enabled=idf_enabled))
        results = METHODS[method](None, outfile="features.tab")
    else:
        results = METHODS[method](tree[0], n=n, idf_enabled=idf_enabled,
                                  output=output)

    # "print_ted" is run but its results are never classified.
    if method == "print_ted":
        return

    classification = classify_results(results, threshold)

    print("writing output")
    write(classification, output)
    print("Accuracy = %.4f" % evaluate(tree[1], output))