def _collect_k_best(sentence, tagset, hmm, k, conv_counts):
    """Decode up to ``k`` tag sequences for one sentence, best first.

    ``conv_counts[r]`` is incremented for every rank ``r + 1`` that is
    obtained for this sentence. Decoding stops at the first rank that does
    not converge (``dd_k_best.run`` reports that with ``num_iter == -1``).

    Returns the list of tag sequences that were obtained.
    """
    best_tags, _, second_best, _ = dd_tagger_fst.run(sentence, tagset, hmm)
    conv_counts[0] += 1
    k_best = [best_tags]
    if k < 2:
        # Only the single best sequence was requested; there is no slot in
        # conv_counts for a second rank, so do not attempt it.
        return k_best

    # TODO remove redundancy: this call is used only for its convergence
    # flag; the sequence stored as second best is the one returned by
    # dd_tagger_fst.run above.
    _, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
    if num_iter == -1:
        sys.stderr.write("2nd best does not converge :( \n")
        return k_best
    conv_counts[1] += 1
    sys.stderr.write("2 best converges in " + str(num_iter) + " iterations \n")
    k_best.append(second_best)

    for rank in range(3, k + 1):
        next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
        if num_iter == -1:
            sys.stderr.write(str(rank) + "th best does not converge\n")
            break
        conv_counts[rank - 1] += 1
        k_best.append(next_best)
        sys.stderr.write(str(rank) + " best converges in " + str(num_iter) + " iterations \n")
    return k_best


def execute(dataset, hmm_file, tag_file, k):
    """Print up to ``k`` best tag sequences per test sentence and report convergence.

    For every sentence the sentence itself is written to stdout, followed by
    one line per tag sequence obtained (best first) and a blank line.
    Progress and per-rank convergence messages go to stderr, followed by a
    final summary of how often each rank converged relative to the number of
    sentences decoded.

    Args:
        dataset: passed to ``data_reader.read_tagging_data``.
        hmm_file: passed to ``hmm_utils.get_param_tagset``.
        tag_file: passed to ``hmm_utils.get_param_tagset``.
        k: number of best sequences to look for per sentence; must be >= 1.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))

    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, _ = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    # conv_rates[r] counts the sentences for which rank r + 1 was obtained;
    # floats keep the percentage below a true division on Python 2 as well.
    conv_rates = [0.0] * k

    for i, sentence in enumerate(test_sentences, 1):
        sys.stderr.write('\n' + str(i) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        # Single-argument parenthesised print behaves the same as a
        # Python 2 statement and as a Python 3 function call.
        print(' '.join(sentence))

        for best in _collect_k_best(sentence, tagset, hmm, k, conv_rates):
            print(' '.join(best))
        print('')

    total = conv_rates[0]
    if not total:
        # Nothing was decoded, so the rates are undefined (avoids dividing by zero).
        sys.stderr.write("no sentences decoded, convergence rates undefined\n")
        return
    for j in range(k):
        # Ranks are reported 1-based, matching the per-sentence messages.
        sys.stderr.write("convergence rate of " + str(j + 1) + " best = " + str(conv_rates[j] * 100 / total) + "% \n")
# Example #2
# 0
def execute(dataset, hmm_file, tag_file):
    """Print the four best tag sequences per test sentence and report convergence.

    Each sentence is written to stdout. When both the 2nd and the 3rd best
    sequences converge, the tag sequences are printed below it (the 4th only
    if it converged too), followed by a blank line. When the 3rd best does
    not converge only the blank line is printed; when the 2nd best does not
    converge nothing further is printed for that sentence. Progress and
    convergence messages go to stderr, ending with the percentage of
    sentences whose 2nd best converged and, of those, the percentage whose
    3rd best converged.

    A value of ``-1`` for an iteration count is treated as "did not converge".

    Args:
        dataset: passed to ``data_reader.read_tagging_data``.
        hmm_file: passed to ``hmm_utils.get_param_tagset``.
        tag_file: passed to ``hmm_utils.get_param_tagset``.
    """
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, _ = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    def percent(part, whole):
        # True division on both Python 2 and 3; "n/a" instead of dividing by zero.
        return str(part * 100.0 / whole) if whole else "n/a"

    total = 0
    conv = 0      # sentences whose 3rd best converged
    sec_conv = 0  # sentences whose 2nd best converged
    for sentence in test_sentences:
        total += 1

        sys.stderr.write('\n' + str(total) + '\n')
        sys.stderr.write(' '.join(sentence) + "\n")
        # Single-argument parenthesised print behaves the same as a
        # Python 2 statement and as a Python 3 function call.
        print(' '.join(sentence))

        best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
        if tags2 == best_tags:
            sys.stderr.write("YOU ARE WRONG!\n")
        if num_iter == -1:
            sys.stderr.write("2nd best does not converge :(\n")
            continue

        sec_conv += 1
        sys.stderr.write("2nd best converges in " + str(num_iter) + "\n")
        # NOTE(review): tags1 is what constrains the later searches, while
        # tags2 is what gets printed as the second best below - confirm
        # which of the two is intended.
        k_best = [best_tags, tags1]

        third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best)
        if num_iter2 == -1:
            sys.stderr.write("3rd best does not converge :(\n")
        else:
            sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n")
            conv += 1
            k_best.append(third_best)

            fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best)
            fourth_converged = num_iter3 != -1
            if fourth_converged:
                sys.stderr.write("4th best converges in " + str(num_iter3) + "\n")
            else:
                sys.stderr.write("4th best does not converge :(\n")

            print(' '.join(best_tags))
            print(' '.join(tags2))
            print(' '.join(third_best))
            if fourth_converged:
                # A non-converged result is not a valid 4th best, so skip it.
                print(' '.join(fourth_best))
        print('')

    sys.stderr.write("% convergence of 2nd best =" + percent(sec_conv, total) + "\n")
    sys.stderr.write("% convergence of 3rd best =" + percent(conv, sec_conv) + "\n")