Exemplo n.º 1
0
 def test_model_upper_score_de_en(self):
     """Check ModelUpperScorer output on the de-en sample graphs."""
     # Binary mode is required for pickled data; with-statement closes the
     # file handle, which the original left open.
     with open(config["test_data_dir"] + "/graphs_sample_out_de-en.pkl",
               "rb") as pkl_file:
         graphs = cPickle.load(pkl_file)
     self.clear_scores(graphs)
     ref_fname = config["test_data_dir"] + "/lemma_sample_out_de-en.ref"
     ambig_fname = config["sample"]["de-en"]["ambig_fname"]
     # renamed from 'filter' to avoid shadowing the builtin
     filter_func = filter_functions("de")
     scorer = ModelUpperScorer(ref_fname, ambig_fname, filter_func)
     scorer(graphs)
     self.check_scores(graphs)
Exemplo n.º 2
0
def compute_classifier_score(ns):
    """Score the translation graphs in namespace *ns* with a classifier.

    Fills in defaults on *ns* when absent (score_attr, vectorizer), then
    builds a ClassifierScore over the trained models and applies it to
    ns.graphs in place.
    """
    classifier_models = ns.TranslationClassifier(ns.models_fname)
    # default the score attribute name to "<name>_score" when unset/empty
    if not ns.score_attr:
        ns.score_attr = ns.name + "_score"
    # EAFP: provide a default vectorizer only if none was configured
    try:
        ns.vectorizer
    except AttributeError:
        ns.vectorizer = Vectorizer()
    graph_scorer = ns.ClassifierScore(
        classifier_models,
        score_attr=ns.score_attr,
        filter=filter_functions(ns.source_lang),
        vectorizer=ns.vectorizer,
    )
    graph_scorer(ns.graphs)
Exemplo n.º 3
0
def preprocess(data_set, lang_pair):
    """Annotate, look up and score translation graphs for an evaluation set.

    Reads the source text configured for (data_set, lang_pair), builds
    annotated translation graphs, adds dictionary lookups plus frequency,
    dictionary-upper and model-upper scores, and pickles the result to the
    configured graphs_fname.
    """
    source_lang, target_lang = lang_pair.split("-")
    graphs_fname = config["eval"][data_set][lang_pair]["graphs_fname"]
    out_dir = os.path.dirname(graphs_fname)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # annotate source text
    annotator = get_annotator(source_lang)
    graph_list = annotator.annot_xml_file(
        config["eval"][data_set][lang_pair]["src_fname"])

    # lookup translations; renamed from 'dict_fname' because this is the
    # loaded TransDict object, not a filename
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # score most frequent translation
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # dict upper scores
    lemma_ref_fname = \
        config["eval"][data_set][lang_pair]["lemma_ref_fname"]
    scorer = DictUpperScorer(lemma_ref_fname)
    scorer(graph_list)

    # model upper scores; renamed from 'filter' to avoid shadowing builtin
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    filter_func = filter_functions(source_lang)
    scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_func)
    scorer(graph_list)

    # save graphs; with-statement guarantees the handle is closed
    log.info("saving preprocessed graphs to " + graphs_fname)
    with open(graphs_fname, "wb") as pkl_file:
        cPickle.dump(graph_list, pkl_file)
    
Exemplo n.º 4
0
def nb_exp(data_sets=config["eval"]["data_sets"],
                lang_pairs=(),
                text=False,
                draw=False,
                diff=False,
                trash_models=False):
    """Run Naive Bayes classification experiments per data set and language pair.

    For every (data set, language pair) combination: trains an
    NMF + MultinomialNB pipeline on translation samples, scores the
    evaluation graphs with it, combines with the frequency baseline via
    BestScorer, and records NIST/BLEU scores.

    Returns a structured numpy array with one row per experiment.
    """
    n_components = 10

    # NOTE: field "blue" is a historical typo for "bleu"; kept unchanged for
    # compatibility with saved .npy result files and the sort order below.
    descriptor = [("data", "S16"),
                  ("lang", "S8"),
                  ("nist", "f"),
                  ("blue", "f"),
                  ("name", "S256")]
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")

    for data in data_sets:
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            # prefer filtered samples; fall back to the unfiltered ones
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " +
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]

            name = "{}_{}_{}".format(
                script_fname, data, lang)
            exp_dir = "_" + name
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            models_fname = exp_dir + "/" + name + ".hdf5"
            classifier = Pipeline([("MCF", MinCountFilter(5)),
                                   ("MFF", MaxFreqFilter(0.05)),
                                   ("CHI2", SelectFpr(chi2, alpha=0.001)),
                                   ("NMF", NMF(n_components=n_components)),
                                   ("MNB", MultinomialNB()),
                                   ])

            # ambiguity map restricted to lemmas occurring in the graphs
            ambig_map = AmbiguityMap(ambig_fname, graphs_fname=graphs_fname)

            # train classifier
            model_builder = ModelBuilder(ambig_map, samples_fname,
                                         models_fname, classifier)
            model_builder.run()

            # apply classifier
            model = TranslationClassifier(models_fname)
            score_attr = "nb_score"
            source_lang = lang.split("-")[0]
            scorer = ClassifierScore(model,
                                     score_attr=score_attr,
                                     filter=filter_functions(source_lang),
                                     vectorizer="mft")
            # "rb" (was text mode): pickle files must be read in binary mode
            with open(graphs_fname, "rb") as graphs_inf:
                graph_list = cPickle.load(graphs_inf)
            scorer(graph_list)

            best_scorer = BestScorer(["nb_score", "freq_score"])
            best_scorer(graph_list)

            scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
            log.info("saving scored graphs to " + scored_graphs_fname)
            # "wb" (was "w"): pickles must be written in binary mode
            with open(scored_graphs_fname, "wb") as graphs_outf:
                cPickle.dump(graph_list, graphs_outf)

            nist_score, bleu_score = postprocess(
                name, data, lang, graph_list,
                best_score_attr="best_score",
                base_score_attrs=["nb_score", "freq_score"],
                out_dir=exp_dir,
                base_fname=name,
                text=text,
                draw=draw,
                diff=diff
            )

            results[exp_count] = (data, lang, nist_score, bleu_score, name)
            # per-experiment result file; distinct name so the run-level
            # results_fname opened above is not shadowed
            exp_results_fname = exp_dir + "/" + name + ".npy"
            log.info("saving result to " + exp_results_fname)
            np.save(exp_results_fname, results[exp_count])
            exp_count += 1

            if trash_models:
                log.info("Trashing models file " + models_fname)
                os.remove(models_fname)

            # add to table of results per data set & language pair
            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results,
                                  axis=0,
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")

    results_outf.close()
    # trim the pre-allocated array to the experiments actually run
    results = results[:exp_count]
    results_fname = "_" + script_fname + "_results.npy"
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)
    text_table(results)

    return results
Exemplo n.º 5
0
def centroid_exp(data_sets=config["eval"]["data_sets"],
                lang_pairs=(),
                text=False,
                draw=False,
                diff=False,
                trash_models=False,
                dump_centroids=False):
    """Run nearest-centroid classification experiments per data set and
    language pair.

    For each (min_count, max_freq) setting: trains a cosine nearest-centroid
    pipeline on translation samples, scores the evaluation graphs, combines
    with the frequency baseline via BestScorer, and records NIST/BLEU scores.

    Returns a structured numpy array with one row per experiment.
    """
    # NOTE: field "blue" is a historical typo for "bleu"; kept unchanged for
    # compatibility with saved .npy result files and the sort order below.
    descriptor = [("data", "S16"),
                  ("lang", "S8"),
                  ("min_count", "f"),
                  ("max_freq", "f"),
                  ("nist", "f"),
                  ("blue", "f"),
                  ("name", "S256")]
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")

    for data in data_sets:
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            # prefer filtered samples; fall back to the unfiltered ones
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " +
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]

            # single setting; widen these tuples to sweep a parameter grid
            for min_count in (5,):
                for max_freq in (0.01,):
                    name = "{}_{}_{}_min_count={:d}_max_freq={:f}".format(
                        script_fname, data, lang, min_count, max_freq)
                    exp_dir = "_" + name
                    if not os.path.exists(exp_dir):
                        os.makedirs(exp_dir)
                    models_fname = exp_dir + "/" + name + ".hdf5"
                    classifier = Pipeline([("MCF", MinCountFilter(min_count)),
                                           ("MFF", MaxFreqFilter(max_freq)),
                                           ("CHI2", SelectFpr()),
                                           ("CNC", CosNearestCentroid())
                                           ])

                    # train classifier
                    model_builder = ModelBuilder(
                        ambig_fname, samples_fname, models_fname, classifier,
                        graphs_fname, with_vocab_mask=True)
                    model_builder.run()

                    # print the centroids to a file, only the 50 best features
                    if dump_centroids:
                        print_fname = exp_dir + "/" + name + "_centroids.txt"
                        print_centroids(models_fname,
                                        n=50,
                                        outf=print_fname)

                    # apply classifier
                    model = TranslationClassifier(models_fname)
                    score_attr = "centroid_score"
                    source_lang = lang.split("-")[0]
                    scorer = ClassifierScore(model,
                                             score_attr=score_attr,
                                             filter=filter_functions(source_lang))
                    # "rb" (was text mode): pickles must be read in binary mode
                    with open(graphs_fname, "rb") as graphs_inf:
                        graph_list = cPickle.load(graphs_inf)
                    scorer(graph_list)

                    best_scorer = BestScorer(["centroid_score", "freq_score"])
                    best_scorer(graph_list)

                    scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
                    log.info("saving scored graphs to " + scored_graphs_fname)
                    # "wb" (was "w"): pickles must be written in binary mode
                    with open(scored_graphs_fname, "wb") as graphs_outf:
                        cPickle.dump(graph_list, graphs_outf)

                    nist_score, bleu_score = postprocess(
                        name, data, lang, graph_list,
                        best_score_attr="best_score",
                        base_score_attrs=["centroid_score", "freq_score"],
                        out_dir=exp_dir,
                        base_fname=name,
                        text=text,
                        draw=draw,
                        diff=diff
                    )

                    results[exp_count] = (data, lang, min_count, max_freq,
                                          nist_score, bleu_score, name)
                    # per-experiment result file; distinct name so the
                    # run-level results_fname opened above is not shadowed
                    exp_results_fname = exp_dir + "/" + name + ".npy"
                    log.info("saving result to " + exp_results_fname)
                    np.save(exp_results_fname, results[exp_count])
                    exp_count += 1

                    if trash_models:
                        log.info("Trashing models file " + models_fname)
                        os.remove(models_fname)

            # add to table of results per data set & language pair
            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results,
                                  axis=0,
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")

    results_outf.close()
    # trim the pre-allocated array to the experiments actually run
    results = results[:exp_count]
    results_fname = "_" + script_fname + "_results.npy"
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)

    text_table(results)

    return results
Exemplo n.º 6
0
def make_graphs():
    """
    Create annotated translations graphs with scores for random translation,
    most frequent translation and approximated maximum. Also create minimal
    translation dictionaries for these graphs and drawings.
    """
    for lang_pair, src_fname, lemma_ref_fname in [
        ("en-de",
         "sample_newstest2011-src.en.sgm",
         "lemma_sample_newstest2011-ref.de.sgm"),
        ("de-en",
         "sample_out_de-en.src",
         "lemma_sample_out_de-en.ref")]:
        source_lang, target_lang = lang_pair.split("-")
        root_fname = splitext(src_fname)[0]

        # annotate source text
        annotator = get_annotator(source_lang)
        graphs = annotator.annot_xml_file(src_fname)

        # lookup translations, keeping the dictionary keys that were used
        dict_fname = config["dict"][lang_pair]["pkl_fname"]
        trans_dict = TransDict.load(dict_fname)
        lookup = LookupKeepKeys(trans_dict)
        lookup(graphs)

        # write pickle of minimal translation dict; with-statement closes
        # the handle, which the original left open
        min_dict = lookup.get_minimal_trans_dict()
        min_dict_fname = "dict_" + root_fname + ".pkl"
        with open(min_dict_fname, "wb") as min_dict_file:
            dump(min_dict, min_dict_file)

        # score most frequent translation
        counts_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        freq_score = FreqScorer(counts_fname)
        freq_score(graphs)

        # score random translation (RandScorer takes no counts; the original
        # re-fetched counts_fname here, which was dead code)
        rand_score = RandScorer()
        rand_score(graphs)

        # dict upper score
        maxscore = DictUpperScorer(lemma_ref_fname)
        maxscore(graphs)

        # model upper scores; renamed from 'filter' to avoid shadowing builtin
        ambig_fname = config["sample"][lang_pair]["ambig_fname"]
        filter_func = filter_functions(source_lang)
        scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter_func)
        scorer(graphs)

        # draw graphs
        draw = Draw()
        draw(graphs, out_format="pdf",
             base_score_attrs=["dup_score", "mup_score", "freq_score",
                               "rand_score"],
             out_dir="_draw_" + lang_pair)

        # save graphs
        graphs_fname = "graphs_" + root_fname + ".pkl"
        with open(graphs_fname, "wb") as graphs_file:
            dump(graphs, graphs_file)