Example #1
def test_grow(self):
    descriptor = [("a", "f"), ("b", "S8")]
    fname_prefix = NamedTemporaryFile().name
    buf_size = 5
    store = ResultsStore(descriptor, fname_prefix, buf_size=buf_size)
    ns = Namespace(a=1, b="x")
    for _ in range(buf_size):
        store.append(ns)
    assert store.results.shape[0] == buf_size
    # results is filled - appending one more must grow results
    store.append(ns)
    assert store.results.shape[0] == 2 * buf_size
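The test above pins down the growth contract of ResultsStore: the backing array starts at buf_size rows and doubles once it is full. Below is a minimal sketch of a store with that behavior, assuming numpy structured arrays; the real ResultsStore also persists results under fname_prefix and resolves dotted attribute paths, both of which this hypothetical MiniResultsStore omits.

import numpy as np

class MiniResultsStore:
    """Sketch only: buffered structured-array store that doubles when full."""

    def __init__(self, descriptor, buf_size=1000):
        # the first two elements of each descriptor tuple are (field, dtype)
        self.dtype = np.dtype([d[:2] for d in descriptor])
        self.results = np.zeros(buf_size, dtype=self.dtype)
        self.count = 0

    def append(self, ns):
        if self.count == len(self.results):
            # buffer is full: grow by doubling the backing array
            grown = np.zeros(2 * len(self.results), dtype=self.dtype)
            grown[:self.count] = self.results
            self.results = grown
        for field in self.dtype.names:
            self.results[field][self.count] = getattr(ns, field)
        self.count += 1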
Example #2
def sgd_1(name="sgd-1",
          data_sets=("metis", "presemt-dev"),
          lang=None,
          n_graphs=None,
          n_jobs=1):
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("class_weighting", "b"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    # best setting found in sgd-cv exps
    classifier = SGDClassifier(loss="log",
                               penalty="l2",
                               alpha=0.001,
                               n_iter=5,
                               shuffle=True,
                               n_jobs=n_jobs)

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=classifier,
            data=data,
            _lang=lang or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
            # *** input to SGDClassifier must be shuffled! ***
            shuffle=True,
            _class_weighting=(True, False),
        )

        for ns in exps:
            result_store.append(ns)
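In the call above, the underscore-prefixed parameters (_lang, _class_weighting) are evidently the ones that ex.single_exp expands by grid search, while plain parameters are passed through unchanged. A rough sketch of that expansion, using a hypothetical grid_expand helper that only illustrates the convention and does not claim to match single_exp's internals:

from itertools import product

def grid_expand(**params):
    # sketch: yield one settings dict per combination of '_'-prefixed values
    fixed = {k: v for k, v in params.items() if not k.startswith("_")}
    grid = {k[1:]: v for k, v in params.items() if k.startswith("_")}
    for values in product(*grid.values()):
        yield dict(fixed, **dict(zip(grid, values)))

# 2 language pairs x 2 weighting settings -> 4 experiment configurations
for setting in grid_expand(data="metis",
                           _lang=("de-en", "fr-en"),
                           _class_weighting=(True, False)):
    print(setting)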
Example #3
def fs_2(data_sets=("metis", "presemt-dev"),
         n_graphs=None):
    name = "fs-2"
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("min_count", "f", "MCF__min_count"),
        ("max_freq", "f", "MFF__max_freq"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("exp_name", "S128"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)

    # tricky: 'classifiers' cannot be an iterator,
    # because it is iterated over many times during grid_search
    classifiers = list(nb_classifier(
        _min_count=[1, 5, 10, 25, 50, 100, 250, 500],
        _max_freq=[1.0, 0.5, 0.1, 0.075, 0.05, 0.025, 0.01, 0.005],
        chi2_alpha=None))

    vectorizer = Vectorizer(score_attr="freq_score")

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=config["eval"][data].keys(),
            #_lang=("de-en",),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            build_models=nb_build_model,
            vectorizer=vectorizer,
            thrash_models=ex.thrash_models,
            n_graphs=n_graphs,
        )

        for ns in exps:
            result_store.append(ns)
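The "cannot be an iterator" comment above deserves a concrete illustration: a generator is exhausted after a single pass, so if the grid search iterated over _classifier once per language pair, every pass after the first would see an empty sequence. Materializing the classifiers with list(...) sidesteps this:

gen = (x * x for x in range(3))
print(list(gen))  # [0, 1, 4]
print(list(gen))  # [] -- exhausted after the first pass

lst = [x * x for x in range(3)]
print(list(lst))  # [0, 1, 4]
print(list(lst))  # [0, 1, 4] -- a list can be iterated repeatedly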
Example #4
def bounds(data_sets=config["eval"]["data_sets"], lang_pairs=()):
    """
    Compute upper and lower bounds on scores.

    The baseline that serves as the lower bound is the Most Frequent
    Translation (MFT) score, which is obtained by choosing the translation
    with the highest frequency in the target language corpus.

    The upper bound is the Approximated Maximum (AM) score, which is obtained
    by choosing the translation that occurs most often in the reference
    translation(s) of the sentence.

    Probability scores for lempos translations are already in the preprocessed
    graphs. This function just computes the resulting NIST and BLEU scores.
    """
    name = "bounds"
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("score_attr", "S16", "score_attr"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("exp_name", "S128"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=None,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            _score_attr=("freq_score", "dup_score", "mup_score"),
            build=ex.SKIP,
            compute_classifier_score=ex.SKIP,
            write_text=ex.SKIP,
            write_diff=ex.SKIP,
            draw_graphs=ex.SKIP)
        
        for ns in exps: 
            result_store.append(ns)
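Given fname_prefix="_" + name, the collected scores presumably end up in a structured array saved as _bounds.npy; the later examples load _nc-1.npy and _nb-1.npy by the same naming scheme. A sketch of inspecting the output, assuming that file name:

import numpy as np

results = np.load("_bounds.npy")  # assumption: store persists <fname_prefix>.npy
print(results.dtype.names)        # ('data', 'source', 'target', 'score_attr', ...)

# e.g. list the NIST/BLEU bounds per data set, language pair and score_attr
for rec in results:
    print(rec["data"], rec["source"], rec["target"],
          rec["score_attr"], rec["nist"], rec["bleu"])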
Example #5
def lr_1(name="lr-1",
         data_sets=("presemt-dev",),
         n_graphs=None):
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("loss", "S16", "classifier.loss"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    # tricky: 'classifiers' cannot be an iterator,
    # because it is iterated over many times during grid_search
    classifiers = list(lr_classifier())

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=config["eval"][data].keys(),
            #_lang=("de-en",),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            #build_models=lr_build_models,
            n_graphs=n_graphs,
        )

        for ns in exps:
            result_store.append(ns)
Example #6
def nc_2(name="nc-2", n_graphs=None):
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("metric", "S16", "NCC__metric"),
        ("vect_score_attr", "S16", "vectorizer.score_attr"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    vectorizers = [Vectorizer(score_attr=score_attr)
                   for score_attr in (None, "freq_score", "dup_score")]
    nc_1_results = np.load("_nc-1.npy")

    for record in nc_1_results:
        exps = ex.single_exp(
            name=name,
            data=record["data"],
            lang=record["source"] + "-" + record["target"],
            classifier=None,
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            build_models=ex.SKIP,
            trash_models=ex.SKIP,
            models_fname=record["models_fname"],
            _vectorizer=vectorizers,
            n_graphs=n_graphs,
        )

        for ns in exps:
            # hack, because there is no classifier in exps
            ns.NCC__metric = record["metric"]
            result_store.append(ns)
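The ns.NCC__metric assignment above works because the optional third element of a descriptor tuple is evidently an attribute path evaluated on each result namespace (dotted, as in "scores.NIST", or flat, as in "NCC__metric"); with classifier=None nothing in the experiment sets NCC__metric, so it is copied from the nc-1 record by hand. A sketch of how such a path could be resolved, with resolve_attr as a hypothetical helper:

from functools import reduce

def resolve_attr(obj, path):
    # sketch: follow a dotted attribute path, e.g. 'scores.NIST' -> obj.scores.NIST
    return reduce(getattr, path.split("."), obj)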
Example #7
def nb_2(name="nb-2", n_graphs=None):
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("class_weighting", "b"),
        ("vect_score_attr", "S16", "vectorizer.score_attr"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    vectorizers = list(vectorizer(
        _score_attr=(None, "freq_score", "dup_score")))
    nb_1_results = np.load("_nb-1.npy")

    for record in nb_1_results:
        exps = ex.single_exp(
            name=name,
            data=record["data"],
            lang=record["source"] + "-" + record["target"],
            classifier=None,
            class_weighting=record["class_weighting"],
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            build_models=ex.SKIP,
            trash_models=ex.SKIP,
            models_fname=record["models_fname"],
            _vectorizer=vectorizers,
            n_graphs=n_graphs,
        )

        for ns in exps:
            result_store.append(ns)
Example #8
def nc_1(data_sets=config["eval"]["data_sets"],
         lang_pairs=(), n_graphs=None,
         name="nc-1"):
    remove_exp_dir(name)
    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("metric", "S16", "NCC__metric"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor,
                                fname_prefix="_" + name)
    classifiers = list(nc_classifier(
        # contrary to the docs, l1 distance (manhattan) does NOT support
        # sparse input
        _metric=("cosine", "euclidean")))

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
        )

        for ns in exps:
            result_store.append(ns)