Example #1
0
def run_all():
    language_pairs =  "de-en", "en-de"
    tab_fnames = ( join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab"),
                   join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"))
    
    descriptor = {'names': ('lang_pair', 'k', 'NIST', 'BLUE'), 
                  'formats': ('S8', 'i4', 'f4', 'f4')}
    #percentiles = [0.25, 0.5, 1, 2.5, 5, 10, 20, 25, 30, 40, 50, 60, 70, 80, 90]
    #p_values = [0.1, 0.05, 0.01, 0.005, 0.001]
    #p_values = [0.0005, 0.0001, 0.00005, 0.00001]
    #p_values = [0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001]
    #p_values = [0.00000005, 0.00000001, 0.000000005, 0.000000001, 0.0000000005, 0.000000001]
    #k_values = [1,2,3,4,5,10,25,50,100,250,500,1000]
    k_values = [150,200,300,350,400,450]
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    
    
    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        target_lang = lang_pair.split("-")[1]
        samp_fname = target_lang + "_samples_subset_filtered.hdf5"
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        target_lang = lang_pair.split("-")[1]
        counts_pkl_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        
        for k in k_values:
            results[exp_count] = ( lang_pair, 
                                   k,
                                   0,
                                   0
                                   )
            exp_dir = "exp_" + "_".join("{}={}".format(var, value) 
                                        for var, value in zip(results.dtype.names, 
                                                              results[exp_count])[:-2])
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            #else:
            #    continue
            
            classifier = MultinomialNB()
            models_fname = join(exp_dir, "nb_models.hdf5")
            
            builder = NBModelBuilder(tab_fname, samp_fname, models_fname,
                                     classifier, graphs_pkl_fname=graphs_pkl_fname,
                                     counts_pkl_fname=counts_pkl_fname,
                                     feat_selector=SelectKBest(chi2, k))
            builder.run()
            nist, blue = score_model(lang_pair, exp_dir, draw=False)
            results[exp_count]["NIST"] = nist 
            results[exp_count]["BLUE"] = blue
            
            exp_count += 1
            
    results = results[:exp_count]
    print results
    results.dump("nb_feat_select_bst_results_2.pkl")
Example #2
0
def run_all():
    language_pairs = "en-de", "de-en"
    tab_fnames = ( join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
                   join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab") )
    
    descriptor = {'names': ('lang_pair', 'vocab_i', 'vocab_j', 'NIST', 'BLUE'), 
                  'formats': ('S8','i4', 'i4', 'f4', 'f4')}
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    
    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        target_lang = lang_pair.split("-")[1]
        samp_fname = target_lang + "_samples_subset_filtered.hdf5"
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)
        target_lang = lang_pair.split("-")[1]
        counts_pkl_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        
        for vocab_i, vocab_j in cut_vocab(samp_fname, counts_pkl_fname): 
            results[exp_count] = ( lang_pair, 
                                   vocab_i,
                                   vocab_j,
                                   0,
                                   0
                                   )
            exp_dir = "exp_" + "_".join("{}={}".format(var, value) 
                                        for var, value in zip(results.dtype.names, 
                                                              results[exp_count])[:-2])
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            classifier = MultinomialNB()
            models_fname = join(exp_dir, "nb_models.hdf5")
            make_models(tab_fname, samp_fname, models_fname, classifier,
                        source_lempos_subset=lempos_subset, 
                        counts_pkl_fname=counts_pkl_fname,
                        vocab_i=vocab_i, vocab_j=vocab_j) 
            nist, blue = score_model(lang_pair, exp_dir, draw=False)
            results[exp_count]["NIST"] = nist 
            results[exp_count]["BLUE"] = blue
            exp_count += 1
            
    results = results[:exp_count]
    print results
    results.dump("nb_cut_vocab_results.pkl")
Example #3
0
def run_all():
    language_pairs = "en-de", "de-en"
    tab_fnames = ( join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
                   join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab") )
    extended_vectors = True, #False
    classifier_types = MultinomialNB, BernoulliNB
    alpha_values = 1.0, 0.1, 0.01, np.finfo(np.double).eps
    corpus_prior_values = False, True
    # the fit_prior parameters seems to make absolutely no difference
    # fit_prior_values = True, False
    
    descriptor = {'names': ('lang_pair', 'classifier', 'alpha', 
                            'corpus_priors', 'extended', 'NIST', 'BLUE'), 
                  'formats': ('S8', 'S64', 'f4', 'b', 'b', 'f4', 'f4')}
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    
    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)
        target_lang = lang_pair.split("-")[1]
        
        for extended in extended_vectors:
            if extended:
                samp_fname = target_lang + "_samples_subset_filtered_extended.hdf5"
            else:
                samp_fname = target_lang + "_samples_subset_filtered.hdf5"
                
            for corpus_prior in corpus_prior_values:
                if corpus_prior:
                    counts_pkl_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
                else:
                    counts_pkl_fname = None
            
                for classifier_class in classifier_types:
                    for alpha in alpha_values:
                        results[exp_count] = ( lang_pair, 
                                               classifier_class.__name__,
                                               alpha,
                                               corpus_prior,
                                               extended,
                                               0,
                                               0
                                               )
                        exp_dir = "exp_" + "_".join("{}={}".format(var, value) 
                                                    for var, value in zip(results.dtype.names, 
                                                                          results[exp_count])[:-2])
                        if not os.path.exists(exp_dir):
                            os.makedirs(exp_dir)
                        classifier = classifier_class(alpha=alpha)
                        models_fname = join(exp_dir, "nb_models.hdf5")
                        make_models(tab_fname, samp_fname, models_fname, classifier,
                                    source_lempos_subset=lempos_subset, 
                                    counts_pkl_fname=counts_pkl_fname) 
                        nist, blue = score_model(lang_pair, exp_dir, draw=False)
                        results[exp_count]["NIST"] = nist 
                        results[exp_count]["BLUE"] = blue
                        exp_count += 1
            
    results = results[:exp_count]
    print results
    results.dump("results.pkl")