Example #1
def max_scores():
    # Select the results for one fixed hyperparameter setting.
    subset = r[(r["alpha"] == 0.001) &
               (r["loss"] == "log") &
               (r["n_iter"] == 5) &
               (r["penalty"] == "l2")]
    # Sort by f-score and print the rows in descending order.
    subset.sort(axis=0, order=["f-score"])
    text_table(subset[::-1])
    print
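For context, `r` is a module-level NumPy structured array of cross-validation results that is not shown in this example. A minimal sketch of an array that would satisfy the filter and sort keys above (field names follow the code; the dtypes and values are made up):

import numpy as np

# Hypothetical results array with the fields max_scores() expects.
r = np.array(
    [(0.001, "log", 5, "l2", 0.71, 0.68, 0.69, 0.74),
     (0.001, "log", 5, "l1", 0.65, 0.66, 0.65, 0.70)],
    dtype=[("alpha", "f"), ("loss", "S16"), ("n_iter", "i"),
           ("penalty", "S16"), ("prec", "f"), ("rec", "f"),
           ("f-score", "f"), ("accuracy", "f")])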
Example #2
def ambig_dist_report(lang_pairs=config["dict"].keys(), 
                      entry="lempos",
                      with_single_word=True, 
                      with_multi_word=False,
                      max_trans=1000,
                      outf=sys.stdout):
    """
    Report statistics on translation ambiguity in dictionary
    
    Parameters
    ----------
    trans_dict: TransDict object 
    entry: 'lempos' or 'lemma'
        count lemma and POS tag combinations or lemmas only (sums ambiguity of 
        lemmas with different POS tag) 
    with_single_word: bool
        count single words
    with_multi_word: bool
        count multi-word expressions
    max_trans: int
        maximum number of translations considered
    outf: file
        output file (defaults to stdout)
    """
    for lang_pair in lang_pairs:
        pkl_fname = config["dict"][lang_pair]["pkl_fname"]
        outf.write("dictionary file: {}\n".format(pkl_fname))
        outf.write("language pair: {}\n".format(lang_pair))
        outf.write("entries: {}\n".format(entry))
        outf.write("count single word entries: {}\n".format(with_single_word))
        outf.write("count multi-word entries: {}\n".format(with_multi_word))
        outf.write("maximum number of translations: {}\n".format(max_trans))
        trans_dict = cPickle.load(open(pkl_fname)) 
            
        dist = ambig_dist(trans_dict, entry=entry,
                          with_single_word=with_single_word, 
                          with_multi_word=with_multi_word,
                          max_trans=max_trans)
        
        outf.write("total number of entries: {0}\n".format(dist["count"].sum()))
        outf.write("total number of ambiguous entries: {0} ({1:.2f}%)\n".format(
            dist[2:]["count"].sum(),
            dist[2:]["percent"].sum()))
        outf.write("total number of non-ambiguous entries: {0} ({1:.2f}%)\n".format(
            dist[:2]["count"].sum(),
            dist[:2]["percent"].sum()))
          
        av_ambig = ( (dist[2:]["count"] * dist[2:]["#trans"]).sum() / 
                     dist[2:]["count"].sum().astype("float"))
        outf.write("average ambiguity (over ambiguous entries only): "
                   "{0:.2f} translations\n\n".format(av_ambig))
   
        text_table(dist, outf)
        outf.write("\n\n")
        print "\n"
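A hypothetical invocation, assuming the config maps a language pair such as "en-de" to a pickled dictionary file:

# Lemma-level ambiguity report for one language pair, written to a
# file instead of stdout (the language pair name is made up).
with open("ambig_report.txt", "w") as outf:
    ambig_dist_report(lang_pairs=["en-de"], entry="lemma", outf=outf)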
Example #3
def summary():
    params = ["alpha", "loss", "n_iter", "penalty"]
    scores = ["prec", "rec", "f-score", "accuracy"]

    # All distinct hyperparameter combinations in the results.
    keys = np.unique(r[params])

    summary = np.zeros(len(keys), r.dtype.descr[3:])

    for i, k in enumerate(keys):
        # Average the score columns over all rows with this combination.
        subset = r[r[params] == k]
        subset_scores = subset[scores]
        view = subset_scores.view(("f", len(subset_scores.dtype.names)))
        means = view.mean(axis=0)
        summary[i] = tuple(k) + tuple(means)

    # Print the combinations in order of descending mean f-score.
    summary.sort(axis=0, order=["f-score"])
    text_table(summary[::-1])
    print
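The view() call above reinterprets the homogeneous float score fields as a plain 2-D array so that mean(axis=0) can average over rows. A toy illustration of the same trick:

import numpy as np

# Two records with two float32 fields each ...
scores = np.array([(0.7, 0.6), (0.9, 0.8)],
                  dtype=[("prec", "f"), ("rec", "f")])
# ... viewed as a plain (2, 2) float array, so columns can be averaged.
view = scores.view(("f", len(scores.dtype.names)))
print view.mean(axis=0)  # -> [ 0.8  0.7]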
Example #4
def nb_exp(data_sets=config["eval"]["data_sets"],
           lang_pairs=(),
           text=False,
           draw=False,
           diff=False,
           trash_models=False):
    
    n_components = 10
    
    descriptor = [ ("data", "S16"),
                   ("lang", "S8"),
                   ("nist", "f"),
                   ("blue", "f"),
                   ("name", "S256") ] 
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0    
    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")    
    
    for data in data_sets: 
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " + 
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]
            
            name = "{}_{}_{}".format(
                script_fname, data, lang)
            exp_dir = "_" + name     
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            models_fname = exp_dir + "/" + name + ".hdf5"
            classifier = Pipeline([("MCF", MinCountFilter(5)),
                                   ("MFF", MaxFreqFilter(0.05)),
                                   ("CHI2", SelectFpr(chi2, alpha=0.001)),
                                   ("NMF", NMF(n_components=n_components)),
                                   ("MNB", MultinomialNB())])
            
            # get ambiguity map
            ambig_map = AmbiguityMap(ambig_fname, graphs_fname=graphs_fname)
            #ambig_map = AmbiguityMap(ambig_fname, subset={"klar/adj"})

            # train classifier
            model_builder = ModelBuilder( ambig_map, samples_fname,
                                          models_fname, classifier) #,with_vocab_mask=True)
            model_builder.run()
            
            # apply classifier
            model = TranslationClassifier(models_fname)
            score_attr = "nb_score"
            source_lang = lang.split("-")[0]
            scorer = ClassifierScore(model,
                                     score_attr=score_attr,
                                     filter=filter_functions(source_lang),
                                     vectorizer="mft")
            graph_list = cPickle.load(open(graphs_fname))
            scorer(graph_list)
            
            best_scorer = BestScorer(["nb_score", "freq_score"])
            best_scorer(graph_list)
            
            scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
            log.info("saving scored graphs to " + scored_graphs_fname)
            cPickle.dump(graph_list, open(scored_graphs_fname, "w"))
            #graph_list = cPickle.load(open(scored_graphs_fname))
            
            nist_score, bleu_score = postprocess(
                name, data, lang, graph_list, 
                best_score_attr="best_score",
                base_score_attrs=["nb_score", "freq_score"],
                out_dir=exp_dir,
                base_fname=name,
                text=text,
                draw=draw,
                diff=diff
            ) 
            
            results[exp_count] = (data, lang, nist_score, bleu_score, name)
            results_fname = exp_dir + "/" + name + ".npy"
            log.info("saving result to " + results_fname)
            np.save(results_fname, results[exp_count])
            exp_count += 1
            
            if trash_models:
                log.info("Trashing models file " + models_fname)
                os.remove(models_fname)
            
            # add to table of results per data set & language pair
            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results, 
                                  axis=0, 
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")
            
    results_outf.close()
    results = results[:exp_count]       
    results_fname = "_" + script_fname + "_results.npy"    
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)
    text_table(results)
    
    return results
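A hypothetical run, with the data set and language pair names made up for illustration:

# Evaluate the naive Bayes pipeline on one data set and language pair,
# also writing the translations as text; models are kept on disk.
results = nb_exp(data_sets=["dev"], lang_pairs=["de-en"], text=True)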
Example #5
def run_cv1(lang_pair, results_fname, subset=None):
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    ambig_map = AmbiguityMap(ambig_fname, subset=subset)
    
    samples_fname = config["sample"][lang_pair]["samples_filt_fname"]
    sample_hdfile = h5py.File(samples_fname, "r")
    
    data_gen = DataSetGenerator(ambig_map, sample_hdfile)
    
    classifiers = list(sgd_classifier(
        _alpha = (0.00001, 0.0001, 0.001),
        _loss = ("hinge", "log"),
        _n_iter = (5, 10),
        _penalty = ("l1", "l2"),
        shuffle = True,              # shuffling always seems beneficial,
        random_state = 73761232569,  # but it needs to be repeatable
        n_jobs = 10,
    ))
    
    descriptor = [ ("lemma", "U32"),
                   ("pos", "U32"),
                   ("#cand", "i"),
                   ("alpha", "f"),
                   ("loss", "S16"),
                   ("n_iter", "i"),
                   ("penalty", "S16"),
                   ("prec", "f"),
                   ("rec", "f"),
                   ("f-score", "f"),
                   ("accuracy", "f")] 
    results = np.zeros(9999, dtype=descriptor)

    i = 0
    
    for n, data in enumerate(data_gen):
        if not data.target_lempos:
            log.error(data.source_lempos + u": no samples")
            continue
        log.info(u"{}/{} {}".format(n+1, len(ambig_map), data.source_lempos))
        lemma, pos = data.source_lempos.rsplit("/", 1)
        n_cand = len(data.target_lempos)
        # *** shuffling is essential for SGD! *** 
        samples, targets = shuffle(data.samples, data.targets)
        
        for classifier in classifiers:
            scorer = Scorer()
            cross_val_score(classifier, 
                            samples, 
                            targets,
                            scoring=scorer)  
            params = (lemma,
                      pos,  
                      n_cand,
                      classifier.alpha,
                      classifier.loss,
                      classifier.n_iter,
                      classifier.penalty)
            results[i] = params + tuple(scorer.mean_scores())
            i += 1
            np.save(results_fname, results[:i])
            text_table(results[:i], 
                       results_fname.replace(".npy", ".txt"))
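sgd_classifier is a project-specific helper that is not shown here. Judging by the call above, it presumably expands the underscore-prefixed keyword tuples into a grid of classifier instances; a sketch of that assumed behavior, using scikit-learn's SGDClassifier (old API, where n_iter was still a constructor argument):

from itertools import product
from sklearn.linear_model import SGDClassifier

def sgd_classifier(_alpha, _loss, _n_iter, _penalty, **fixed):
    # Assumed behavior: yield one classifier per point in the grid
    # spanned by the underscore-prefixed value tuples; the remaining
    # keyword arguments are passed through unchanged.
    for alpha, loss, n_iter, penalty in product(_alpha, _loss,
                                                _n_iter, _penalty):
        yield SGDClassifier(alpha=alpha, loss=loss, n_iter=n_iter,
                            penalty=penalty, **fixed)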
Example #6
def centroid_exp(data_sets=config["eval"]["data_sets"],
                 lang_pairs=(),
                 text=False,
                 draw=False,
                 diff=False,
                 trash_models=False,
                 dump_centroids=False):
    
    descriptor = [ ("data", "S16"),
                   ("lang", "S8"),
                   ("min_count", "f"),
                   ("max_freq", "f"),
                   ("nist", "f"),
                   ("blue", "f"),
                   ("name", "S256") ] 
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0    
    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")    
    
    for data in data_sets: 
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " + 
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]
            
            #for min_count in (1, 5, 10, 25, 50, 100, 250, 1000, 2500, 5000):
            #    for max_freq in (0.0001, 0.001, 0.005, 0.01, 0.05, 0.10, 0.25, 0.5, 1.0):
            for min_count in (5,):
                for max_freq in (0.01,):
                    name = "{}_{}_{}_min_count={:d}_max_freq={:f}".format(
                        script_fname, data, lang, min_count, max_freq)
                    exp_dir = "_" + name     
                    if not os.path.exists(exp_dir):
                        os.makedirs(exp_dir)
                    models_fname = exp_dir + "/" + name + ".hdf5"
                    classifier = Pipeline([("MCF", MinCountFilter(min_count)),
                                           ("MFF", MaxFreqFilter(max_freq)),
                                           ("CHI2", SelectFpr()),
                                           #("TFIDF", TfidfTransformer()),
                                           ("CNC", CosNearestCentroid())
                                           #("NC", NearestCentroidProb())
                                           ])
        
                    # train classifier
                    model_builder = ModelBuilder( 
                        ambig_fname, samples_fname, models_fname, classifier,
                        graphs_fname, with_vocab_mask=True)
                    model_builder.run()
                    
                    # print the centroids to a file, only the 50 best features
                    if dump_centroids:
                        print_fname = exp_dir + "/" + name + "_centroids.txt"
                        print_centroids(models_fname, 
                                        n=50, 
                                        outf=print_fname)
                
                    # apply classifier
                    model = TranslationClassifier(models_fname)
                    score_attr = "centroid_score"
                    source_lang = lang.split("-")[0]
                    scorer = ClassifierScore(model,
                                             score_attr=score_attr,
                                             filter=filter_functions(source_lang))
                    graph_list = cPickle.load(open(graphs_fname))
                    scorer(graph_list)
                    
                    best_scorer = BestScorer(["centroid_score", "freq_score"])
                    best_scorer(graph_list)
                    
                    scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
                    log.info("saving scored graphs to " + scored_graphs_fname)
                    cPickle.dump(graph_list, open(scored_graphs_fname, "w"))
                    #graph_list = cPickle.load(open(scored_graphs_fname))
                    
                    nist_score, bleu_score = postprocess(
                        name, data, lang, graph_list, 
                        best_score_attr="best_score",
                        base_score_attrs=["centroid_score", "freq_score"],
                        out_dir=exp_dir,
                        base_fname=name,
                        text=text,
                        draw=draw,
                        diff=diff
                    ) 
                    
                    results[exp_count] = (data, lang, min_count, max_freq,
                                          nist_score, bleu_score, name)
                    results_fname = exp_dir + "/" + name + ".npy"
                    log.info("saving result to " + results_fname)
                    np.save(results_fname, results[exp_count])
                    exp_count += 1
                    
                    if trash_models:
                        log.info("Trashing models file " + models_fname)
                        os.remove(models_fname)

            # add to table of results per data set & language pair
            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results, 
                                  axis=0, 
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")
            
    results_outf.close()
    results = results[:exp_count]       
    results_fname = "_" + script_fname + "_results.npy"    
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)
    
    text_table(results)
    
    return results
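A hypothetical invocation; to sweep the filter thresholds, re-enable the commented-out min_count and max_freq grids above:

# Run the centroid experiment for one (made-up) data set and language
# pair, deleting each model file after it has been evaluated.
results = centroid_exp(data_sets=["dev"], lang_pairs=["de-en"],
                       trash_models=True)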
Example #7
        ("bleu", "f"),
        ("exp_name", "S128"),   
    ] 

new_results = np.zeros(len(old_results), descriptor)

for i, exp in enumerate(old_results):
    ref_fname = config["eval"][exp["data"]][exp["source"] + "-" + exp["target"]]["lemma_ref_fname"]
    graphs_fname = "_{}/{}_graphs.pkl".format(name, exp["exp_name"])
    graphs = cPickle.load(open(graphs_fname))
    accuracy = accuracy_score(graphs, ref_fname, name + "_score")
    new_results[i]["graphs"] = len(graphs)
    new_results[i]["data"] = exp["data"]
    new_results[i]["source"] = exp["source"]
    new_results[i]["target"] = exp["target"]
    new_results[i]["min_count"] = exp["min_count"]
    new_results[i]["max_freq"] = exp["max_freq"]
    new_results[i]["correct"] = accuracy.correct
    new_results[i]["incorrect"] = accuracy.incorrect
    new_results[i]["accuracy"] = accuracy.score
    new_results[i]["ignored"] = accuracy.ignored
    new_results[i]["nist"] = exp["nist"]
    new_results[i]["bleu"] = exp["bleu"]
    new_results[i]["exp_name"] = exp["exp_name"]

np.save("_" + name + "-acc.npy", new_results)

text_table(new_results, "_" + name + "-acc.txt") 

Example #8
def print_results(results_fname, out_fname=None):
    # Load a saved results array and print it as a text table.
    table = numpy.load(results_fname)
    text_table(table, out_fname)
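Typical usage, with file names made up to follow the "_<script>_results" convention of the experiments above:

# Print a saved results array to stdout ...
print_results("_nb_exp_results.npy")
# ... or write it to a text file.
print_results("_nb_exp_results.npy", "_nb_exp_results.txt")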