Example #1
0
def run_all():
    """Run the cut-vocabulary Naive Bayes experiments for both translation
    directions and dump the NIST/BLEU scores to a pickled numpy record array.

    For each language pair, iterates over the (vocab_i, vocab_j) cuts yielded
    by cut_vocab(), trains MultinomialNB models into a per-experiment
    directory, scores them, and records the results.

    Side effects: creates exp_* directories, writes nb_models.hdf5 files,
    and dumps "nb_cut_vocab_results.pkl" in the working directory.
    """
    language_pairs = "en-de", "de-en"
    tab_fnames = ( join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
                   join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab") )

    # NOTE(review): 'BLUE' presumably means BLEU, but the field name is baked
    # into the dumped pickle's dtype — renaming would break existing readers.
    descriptor = {'names': ('lang_pair', 'vocab_i', 'vocab_j', 'NIST', 'BLUE'), 
                  'formats': ('S8','i4', 'i4', 'f4', 'f4')}
    # Oversized scratch array; truncated to exp_count before dumping.
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0
    
    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        # Fix: target_lang was computed twice (identical duplicate); once is enough.
        target_lang = lang_pair.split("-")[1]
        samp_fname = target_lang + "_samples_subset_filtered.hdf5"
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)
        counts_pkl_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        
        for vocab_i, vocab_j in cut_vocab(samp_fname, counts_pkl_fname): 
            # NIST and BLUE start at 0 and are filled in after scoring.
            results[exp_count] = ( lang_pair, 
                                   vocab_i,
                                   vocab_j,
                                   0,
                                   0
                                   )
            # zip() returns a list under Python 2, so it can be sliced;
            # the last two fields (NIST, BLUE) are excluded from the dir name.
            fields = zip(results.dtype.names, results[exp_count])[:-2]
            exp_dir = "exp_" + "_".join("{}={}".format(var, value)
                                        for var, value in fields)
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            classifier = MultinomialNB()
            models_fname = join(exp_dir, "nb_models.hdf5")
            make_models(tab_fname, samp_fname, models_fname, classifier,
                        source_lempos_subset=lempos_subset, 
                        counts_pkl_fname=counts_pkl_fname,
                        vocab_i=vocab_i, vocab_j=vocab_j) 
            nist, blue = score_model(lang_pair, exp_dir, draw=False)
            results[exp_count]["NIST"] = nist 
            results[exp_count]["BLUE"] = blue
            exp_count += 1
            
    # Drop the unused tail of the scratch array before reporting/dumping.
    results = results[:exp_count]
    print(results)
    results.dump("nb_cut_vocab_results.pkl")
Example #2
0
def run_all():
    """Grid-search Naive Bayes hyper-parameters for both translation
    directions and dump the NIST/BLEU scores to a pickled record array.

    The sweep covers vector extension, corpus priors, classifier class
    (MultinomialNB vs BernoulliNB), and smoothing alpha. Each configuration
    gets its own exp_* directory with trained models, and its scores are
    appended to the results table, which is finally dumped to "results.pkl".
    """
    language_pairs = "en-de", "de-en"
    tab_fnames = ( join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
                   join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab") )
    extended_vectors = (True,)  # False variant currently disabled
    classifier_types = MultinomialNB, BernoulliNB
    alpha_values = 1.0, 0.1, 0.01, np.finfo(np.double).eps
    corpus_prior_values = False, True
    # the fit_prior parameter appears to make no difference at all,
    # so it is not part of the sweep
    # fit_prior_values = True, False

    descriptor = {'names': ('lang_pair', 'classifier', 'alpha', 
                            'corpus_priors', 'extended', 'NIST', 'BLUE'), 
                  'formats': ('S8', 'S64', 'f4', 'b', 'b', 'f4', 'f4')}
    # Oversized scratch table; trimmed to the real experiment count at the end.
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0

    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        target_lang = lang_pair.split("-")[1]
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)

        for extended in extended_vectors:
            samp_fname = target_lang + (
                "_samples_subset_filtered_extended.hdf5" if extended
                else "_samples_subset_filtered.hdf5")

            for corpus_prior in corpus_prior_values:
                # Corpus-derived lemma counts are only used as priors on demand.
                counts_pkl_fname = (
                    config["count"]["lemma"][target_lang]["pkl_fname"]
                    if corpus_prior else None)

                for classifier_class in classifier_types:
                    for alpha in alpha_values:
                        # Scores start at 0; filled in after model evaluation.
                        row = ( lang_pair,
                                classifier_class.__name__,
                                alpha,
                                corpus_prior,
                                extended,
                                0,
                                0 )
                        results[exp_count] = row
                        # zip() yields a sliceable list under Python 2; the
                        # trailing score fields are kept out of the dir name.
                        named = zip(results.dtype.names,
                                    results[exp_count])[:-2]
                        exp_dir = "exp_" + "_".join(
                            "{}={}".format(name, val) for name, val in named)
                        if not os.path.exists(exp_dir):
                            os.makedirs(exp_dir)
                        models_fname = join(exp_dir, "nb_models.hdf5")
                        classifier = classifier_class(alpha=alpha)
                        make_models(tab_fname, samp_fname, models_fname,
                                    classifier,
                                    source_lempos_subset=lempos_subset,
                                    counts_pkl_fname=counts_pkl_fname)
                        nist, blue = score_model(lang_pair, exp_dir, draw=False)
                        results[exp_count]["NIST"] = nist
                        results[exp_count]["BLUE"] = blue
                        exp_count += 1

    # Keep only the rows that were actually filled in.
    results = results[:exp_count]
    print(results)
    results.dump("results.pkl")