Exemplo n.º 1
0
def main():
    hmm_file = sys.argv[1]
    tag_file = sys.argv[2]
    hmm, tagset = get_param_tagset(hmm_file, tag_file)

    ts, test_tags = data_reader.read_tagging_data(sys.argv[3])
    test_sentences = replace_test(ts, hmm, tagset)

    index = 0
    for sentence in test_sentences:
        index += 1
        print ' '.join(ts[index-1])
	#sentence = "`` We would have to wait until we have collected on those assets before we can move forward , '' he said ."
	#sentence = replace_test([sentence.split(' ')], hmm, tagset)[0]
	#sys.stderr.write(str(index) + " " + ' '.join(sentence) + "\n")

	tagmap = filter_tagset(sentence, tagset, hmm)
	tagseqs = find_all_tagseqs(len(sentence), tagmap, "", [])

	resultmap = {}
	for tagline in tagseqs:
	    tagseq = tagline.strip().split(' ')
	    score = find_log_prob(hmm, tagset, sentence, tagseq)
	    if score != '':
		key = ' '.join(tagseq)
		resultmap[key] = score
	sorted_x = sorted(resultmap.iteritems(), key=operator.itemgetter(1))[-10:]
	for k, v in sorted_x:
	    print k, '\t', "{0:.2f}".format(v)
        print
        if index > 100:
           break 
Exemplo n.º 2
0
def execute(dataset, hmm_file, tag_file, k):
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    conv_rates = [] # to keep track of the convergence rates varying with k
    for j in range(k):
        conv_rates.append(0.0)

    for sentence in test_sentences:
        k_best = []
        if True: 
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]

            sys.stderr.write('\n' + str(i)+ '\n')
            sys.stderr.write(' '.join(sentence) + "\n")
            print ' '.join(sentence)
            #TODO remove redundancy
            best_tags, num_iter, second_best, sb2 = dd_tagger_fst.run(sentence, tagset, hmm)
            conv_rates[0] += 1
            k_best.append(best_tags)
            #next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
            if num_iter == -1:
                sys.stderr.write("2nd best does not converge :( \n")
                #print ' '.join(best_tags)
                #print
                #continue
            j = 2 # we have the best, and the second best now
            conv_rates[j-1] += 1
            sys.stderr.write(str(j) + " best converges in " + str(num_iter) + " iterations \n")
            k_best.append(second_best)
         
            while j < k:
                next_best, num_iter = dd_k_best.run(sentence, tagset, hmm, k_best)
                k_best.append(next_best)
                if num_iter != -1:
                    conv_rates[j] += 1
                    #k_best.append(next_best)
                    sys.stderr.write(str(j+1) + " best converges in " + str(num_iter) + " iterations \n")
                else:
                    sys.stderr.write(str(j+1) + "th best does not converge\n")
                    #break
                j += 1

        for best in k_best:
            print ' '.join(best)                
        print

    for j in range(k):
        sys.stderr.write("convergence rate of " + str(j) + " best = " + str(conv_rates[j]*100/conv_rates[0]) + "% \n")
Exemplo n.º 3
0
def execute(dataset, hmm_file, tag_file):
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    conv = 0
    sec_conv = 0
    for sentence in test_sentences:
       
        if True: #len(sentence) < 15:
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]
            
            sys.stderr.write('\n' + str(i)+ '\n')
            sys.stderr.write(' '.join(sentence) + "\n")
            print ' '.join(sentence)
            
            k_best = []
            best_tags, num_iter, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
            if tags2 == best_tags:
                sys.stderr.write("YOU ARE WRONG!\n")
            if num_iter != -1:
                sec_conv += 1
                sys.stderr.write("2nd best converges in " + str(num_iter) + "\n")
                k_best.append(best_tags)
                k_best.append(tags1)
                
                third_best, num_iter2 = dd_k_best.run(sentence, tagset, hmm, k_best)
                if num_iter2 != -1:
                    sys.stderr.write("3rd best converges in " + str(num_iter2) + "\n")
                    conv += 1
                    k_best.append(third_best)
                    fourth_best, num_iter3 = dd_k_best.run(sentence, tagset, hmm, k_best)
                    sys.stderr.write("4th best converges in " + str(num_iter3) + "\n")
                    print ' '.join(best_tags)
                    print ' '.join(tags2)
                    print ' '.join(third_best)
                    print ' '.join(fourth_best)
                else:
                    sys.stderr.write( "3rd best does not converge :(\n")
            else:
                sys.stderr.write("2nd best does not converge :(\n")
                continue
            print
    
    sys.stderr.write("% convergence of 2nd best =" + str(sec_conv*100/i) + "\n")
    sys.stderr.write("% convergence of 3rd best =" + str(conv*100/sec_conv) + "\n")
Exemplo n.º 4
0
def execute(dataset, hmm_file, tag_file):
    """Run the dual-decomposition FST tagger over a dev set and report
    per-sentence accuracy plus aggregate convergence rate and timing."""
    sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    sys.stderr.write("reading dev data...\n")
    test_sentences, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(test_sentences, hmm, tagset)

    i = 0
    converges = 0
    avg_iterations = 0
    start_time = time.time()

    # BUGFIX: the original looked tags up with test_sentences.index(sentence),
    # which returns the FIRST match and mis-pairs duplicate sentences;
    # enumerate keeps sentence and gold tags aligned by position.
    for idx, sentence in enumerate(test_sentences):
        if not sentence:  # skip empty sentences
            continue
        i += 1
        truetags = test_tags[idx]

        sys.stderr.write('\n' + str(i) + '\n')
        # NOTE(review): test_sents_not_rare is not defined in this function;
        # presumably a module-level global holding the un-replaced sentences
        # -- confirm against the rest of the module.
        tagprint(test_sents_not_rare[idx])
        best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)

        tagprint(best_tags)
        tagprint(tags2)
        if num_iterations != -1:
            sys.stderr.write("fst tagger accuracy = " + str(evaluate.accuracy(truetags, tags2)) + "\n")
            sys.stderr.write("best tags accuracy = " + str(evaluate.accuracy(truetags, best_tags)) + "\n")
            sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
            converges += 1
            avg_iterations += num_iterations
        else:
            print
            sys.stderr.write("does not converge :(\n")
        tagprint(truetags)
        print

    # BUGFIX: guard against ZeroDivisionError when nothing converged or the
    # dataset was empty.
    if converges:
        sys.stderr.write("\n" + str(avg_iterations / converges) + " iterations on average\n")
    if i:
        sys.stderr.write(str(converges * 100 / i) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
def execute(dataset, hmm_file, tag_file):
    # sys.stderr.write("loading learnt parameters...\n")
    hmm, tagset = hmm_utils.get_param_tagset(hmm_file, tag_file)

    # sys.stderr.write("reading dev data...\n")
    ts, test_tags = data_reader.read_tagging_data(dataset)
    test_sentences = replace_test(ts, hmm, tagset)

    i = 0
    converges = 0
    avg_iterations = 0
    start_time = time.time()

    fst_acc = 0
    best_acc = 0
    wrong = 0
    for sentence in test_sentences:

        if True:
            i += 1
            truetags = test_tags[test_sentences.index(sentence)]

            sys.stderr.write("\n" + str(i) + "\n")
            # sys.stderr.write(' '.join(ts[i-1]) + "\n")
            print " ".join(sentence)
            # print ' '.join(ts[i-1])

            best_tags, num_iterations, tags1, tags2 = dd_tagger_fst.run(sentence, tagset, hmm)
            if tags2 == best_tags:
                sys.stderr.write("YOU ARE WRONG!\n")
                wrong += 1

            if num_iterations != -1:
                facc = evaluate.accuracy(truetags, tags2)
                # sys.stderr.write("fst tagger accuracy = " + str(facc) + "\n")
                fst_acc += facc

                bacc = evaluate.accuracy(truetags, best_tags)
                # sys.stderr.write("best tags accuracy = " + str(bacc) + "\n")
                best_acc += bacc

                sys.stderr.write("converges in " + str(num_iterations) + " iterations \n")
                converges += 1
                avg_iterations += num_iterations
            else:
                sys.stderr.write("does not converge :(\n")
            print " ".join(best_tags)
            print " ".join(tags2)
            # print "gold  : ", ' '.join(truetags)
            print

            # if i == 100:
            # break
    sys.stderr.write("\nsystem performance\n--------------------\n")
    sys.stderr.write("\n" + str(wrong * 100 / converges) + "% sequences are wrong:\n")
    sys.stderr.write("\naverage accuracy of best: " + str(best_acc / converges) + "\n")
    sys.stderr.write("average accuracy of 2nd best: " + str(fst_acc / converges) + "\n")

    sys.stderr.write("\nsystem efficiency\n---------------------\n")
    sys.stderr.write("\n" + str(avg_iterations / converges) + " iterations on average\n")
    sys.stderr.write(str(converges * 100 / i) + " % convergence\n")
    sys.stderr.write("time_taken = " + str(time.time() - start_time) + "\n")
Exemplo n.º 6
0
Replaces all emissions with frequency <= 5 with the word
-RARE-
'''
def smooth_emission(emission_counts, threshold=5):
    """Merge low-frequency emissions into the -RARE- pseudo-word.

    Every 'tag~>word' emission whose count is <= threshold is folded into
    'tag~>-RARE-' (counts for the same tag accumulate); all other entries
    are copied unchanged.

    emission_counts -- mapping from 'tag~>word' keys to integer counts
    threshold       -- counts at or below this are treated as rare (default 5)

    Returns a new mapping; the input is not modified.
    """
    e_counts = defaultdict()
    # items() instead of iteritems() keeps this working on Python 2 and 3.
    for key, val in emission_counts.items():
        if val <= threshold:
            tag, word = key.split('~>')
            new_key = tag + '~>-RARE-'
            # Accumulate all rare words of this tag under one key.
            e_counts[new_key] = e_counts.get(new_key, 0) + val
        else:
            e_counts[key] = val

    return e_counts

if __name__=='__main__':
    #replace(sys.argv[1])
    test_sent_tags = sys.argv[1]
    hmm, tagset = hmm_utils.get_param_tagset(sys.argv[2], sys.argv[3])
    ts, test_tags = data_reader.read_tagging_data(test_sent_tags)
    test_sentences = replace_test(ts, hmm, tagset)

    for ts in test_sentences:
        for word in ts:
            print word
        print 

Exemplo n.º 7
0
    for i in range(len(seq)):
        tag = seq[i]
        word = sent[i]
        score += get_local_score(word, prev, tag, hmm)
        prev = tag
    score += get_local_score("", prev, "STOP", hmm)
    return score

def get_aug_hmm(seq, sent, hmm, dd_u):
    """Return the HMM score of tag sequence `seq` on sentence `sent`,
    reduced by the dual-decomposition dual values in `dd_u` (one
    per-tag table per position)."""
    score = get_hmm_only_score(seq, sent, hmm)
    for pos, tag in enumerate(seq):
        score -= dd_u[pos][tag]
    return score
 
if __name__ == "__main__":
    sentences, truetags = data_reader.read_tagging_data(sys.argv[1])
    hmm, tagset = hmm_utils.get_param_tagset(sys.argv[2], sys.argv[3])
    sentences = replace_test(sentences, hmm, tagset)
    print sentences

    i = 0     
    tot_acc = 0.0
    for sentence in sentences:
#        for tag in tagset:
#            print tag,"\t",
#        print
        tags = run(sentence, tagset, hmm, None) 
        #tot_acc += evaluate.accuracy(truetags, tags)
        print sentence
        print tags, evaluate.accuracy(truetags[i], tags)
        print truetags[i], " :gold"
Exemplo n.º 8
0
    counts = open("pos.counts", "w")
    #emission_counts = smooth_emission(emission_counts)
    for em, count in emission_counts.iteritems():
        tag, terminal = em.split('~>')
        counts.write(str(count)+ " WORDTAG "+ tag+ " "+ terminal+ "\n")
    for trans, count in transition_counts.iteritems():
        prev_tag, current_tag = trans.split("~>")
        counts.write(str(count)+ " 2-GRAM "+ prev_tag+ " "+ current_tag+ "\n")
    counts.close() 

def learn(sentences, tagseqs):
    """Count HMM parameters from a tagged corpus.

    sentences -- list of token lists
    tagseqs   -- parallel list of tag lists, one per sentence

    Accumulates emission/transition/tag counts, writes them out in the
    Java-tool format, and returns (emission_counts, transition_counts).
    """
    em_counts = defaultdict()
    trans_counts = defaultdict()
    tag_counts = defaultdict()

    # zip keeps each sentence aligned with its tag sequence without
    # manual index bookkeeping.
    for sentence, tagseq in zip(sentences, tagseqs):
        update_counts(sentence, tagseq, em_counts, trans_counts, tag_counts)

    write_for_java(em_counts, trans_counts)
    return em_counts, trans_counts

if __name__ == "__main__":
    # Learn HMM counts from the rare-word-replaced training file given on
    # the command line.
    rare_sent_tags = sys.argv[1]
    train_sents, train_tags = data_reader.read_tagging_data(rare_sent_tags)
    emission, transition = learn(train_sents, train_tags)