Example #1
File: hmm_tagger.py Project: TahaK/NLP
import pickle

import viterbi


def main(argv):
    input_file = open(argv[0], 'r')
    output_file = open(argv[1], 'w')
    # Sentences are separated by blank lines; one token per line.
    sentences = input_file.read().split("\n\n")
    v = viterbi.Viterbi()
    oowv_count = 0
    tag_names = []
    for sentence in sentences:
        word_infos = sentence.split("\n")
        words = []

        # Field 1 of each tab-separated line is the word form;
        # '_' marks an empty field and is skipped.
        for word_info in word_infos:
            elements = word_info.split("\t")
            if len(elements) > 1 and elements[1] != '_':
                words.append(elements[1])

        if words:
            tags, oowv_counts, tag_names = v.decode(words)

            for word, tag in zip(words, tags):
                output_file.write(word + "|" + tag + '\n')

            output_file.write('\n')
            oowv_count += oowv_counts

    # pickle requires binary mode; 'with' guarantees the files are closed.
    with open("oowv_count.pkl", "wb") as fp_1:
        pickle.dump(oowv_count, fp_1)

    with open("tag_names.pkl", "wb") as fp_2:
        pickle.dump(tag_names, fp_2)

    input_file.close()
    output_file.close()
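
The parser above expects CoNLL-style input: sentences separated by blank lines, one token per line, tab-separated fields with the word form in the second column ('_' marks an empty field). A minimal hand-made input (the words are invented purely for illustration) would look like:

1	The	DET
2	cat	NOUN
3	sleeps	VERB

1	It	PRON
2	purrs	VERB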
Example #2
def tag(tags, vocab, A, B, split="dev"):
    """
    Tag development/test data; `split` selects which data split
    to tag ("dev" or "test").
    """
    tagged = []

    # Preprocess data
    data_fp = DEV_WORDS
    if split == "test":
        data_fp = TEST_WORDS
    orig, prep = preprocess(vocab, data_fp)

    # Decode
    decoder = viterbi.Viterbi(vocab, tags, prep, A, B)
    pred = decoder.decode()

    for word, tag in zip(orig, pred):
        tagged.append((word, tag))

    # Write output file
    out_fp = DEV_OUT
    if split == "test":
        out_fp = TEST_OUT

    with open(out_fp, "w") as out:
        for word, tag in tagged:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))

Example #3
def main(string, option):
    train = '../dataset/CoNLL-2003/eng.train'

    allData = load_sentences(train)
    # Flatten the list of sentences into a single list of
    # (word, tag, chunk, ne) tuples.
    globalData = [tuple(token) for sentence in allData for token in sentence]

    freq_word = nltk.FreqDist(word for (word, tag, chunk, ne) in globalData)
    freq_tag = nltk.FreqDist(tag for (word, tag, chunk, ne) in globalData)
    freq_chunk = nltk.FreqDist(chunk for (word, tag, chunk, ne) in globalData)
    freq_ne = nltk.FreqDist(ne for (word, tag, chunk, ne) in globalData)

    allbigrams = list(nltk.bigrams(globalData))

    # All unique words, tags, chunks and named entities.
    allWords = list(freq_word)
    allTags = list(freq_tag)
    allChunk = list(freq_chunk)
    allNe = list(freq_ne)
    # Bayes: P(NE | context) is proportional to P(context | NE) * P(NE).

    # P(NE_i | NE_{i-1}): transition probabilities between NE tags,
    # estimated from tag bigrams and normalized per previous tag.
    dict1 = {}
    for i in allNe:
        dict1[i] = [b[3] for (a, b) in allbigrams if a[3] == i]
    for i in dict1:
        dict1[i] = dict(Counter(dict1[i]))
        totalCount = sum(dict1[i].values())
        for k in dict1[i]:
            dict1[i][k] /= float(totalCount)

    print('--------aa---------')
    # P(word | NE): emission probabilities.
    dict2 = {}
    for i in allNe:
        dict2[i] = [a for (a, x, y, j) in globalData if j == i]
    for i in dict2:
        dict2[i] = dict(Counter(dict2[i]))
        totalCount = sum(dict2[i].values())
        for k in dict2[i]:
            dict2[i][k] /= float(totalCount)

    print('-------bb----------')
    # P(POS_i | POS_{i-1}): POS transition probabilities.
    dict3 = {}
    for i in allTags:
        dict3[i] = [b[1] for (a, b) in allbigrams if a[1] == i]
    for i in dict3:
        dict3[i] = dict(Counter(dict3[i]))
        totalCount = sum(dict3[i].values())
        for k in dict3[i]:
            dict3[i][k] /= float(totalCount)
    print('--------cc---------')
    # P(POS | NE)
    dict5 = {}
    for i in allNe:
        dict5[i] = [x for (a, x, y, j) in globalData if j == i]
    for i in dict5:
        dict5[i] = dict(Counter(dict5[i]))
        totalCount = sum(dict5[i].values())
        for k in dict5[i]:
            dict5[i][k] /= float(totalCount)
    print('--------dd---------')
    # P(chunk | NE)
    dict6 = {}
    for i in allNe:
        dict6[i] = [y for (a, x, y, j) in globalData if j == i]
    for i in dict6:
        dict6[i] = dict(Counter(dict6[i]))
        totalCount = sum(dict6[i].values())
        for k in dict6[i]:
            dict6[i][k] /= float(totalCount)

    print('-----------------')
    # P((word, POS) | NE): joint word/POS emission.
    dict4 = {}
    for i in allNe:
        dict4[i] = [(a, x) for (a, x, y, j) in globalData if j == i]
    for i in dict4:
        dict4[i] = dict(Counter(dict4[i]))
        totalCount = sum(dict4[i].values())
        for k in dict4[i]:
            dict4[i][k] /= float(totalCount)

    # Other candidate models: P(POS | W) via Bayes' rule, P(feature | NE).

    # Probability of each NE tag starting a sentence (add-one smoothing).
    sentencestart = dict.fromkeys(allNe, 1)
    for i in allData:
        if i[0][3] in sentencestart:
            sentencestart[i[0][3]] += 1
        else:
            sentencestart[i[0][3]] = 1
    totalCount = sum(sentencestart.values())
    for i in sentencestart:
        sentencestart[i] /= float(totalCount)
    print('-----------------')
    if option == '0':
        predicted = []
        list_ne = []
        text = sent_tokenize(string)
        for sentence in text:
            sent = word_tokenize(sentence)
            param = {}
            param['states'] = tuple(allNe)             # named entities
            param['observations'] = tuple(sent)        # words
            param['start_probability'] = sentencestart
            param['transition_probability'] = dict1
            param['emission_probability'] = dict2
            obj = viterbi.Viterbi(param)
            best_path = obj.viterbi()[1]
            predicted.append(best_path)
            print(sent)
            print(best_path)
        # Interleave each word with its predicted NE label.
        for sentence, path in zip(text, predicted):
            for word, ne in zip(word_tokenize(sentence), path):
                list_ne.append(word)
                list_ne.append('[' + ne + ']')
        return ' '.join(list_ne)
    if option == '1':
        print("-----testing-------")
        test = '../dataset/CoNLL-2003/eng.testb'
        allTestData = load_sentences(test)

        predicted = []
        for i in allTestData:
            sent = [j[0] for j in i]
            tags = nltk.pos_tag(sent)
            pos = [b for (a, b) in tags]
            param = {}

            param['states'] = tuple(allNe)             # named entities
            param['observations'] = tuple(sent)        # words
            param['start_probability'] = sentencestart
            param['transition_probability'] = dict1
            param['emission_probability'] = dict2

            # Alternative configurations that were tried:
            #    param['observations'] = tuple(tags)   # (word, POS) pairs
            #    param['emission_probability'] = dict4
            #
            #    param['observations'] = tuple(pos)    # POS tags only
            #    param['emission_probability'] = dict5

            obj = viterbi.Viterbi(param)
            predicted.append(obj.viterbi()[1])

        # Gold NE labels, one list per sentence.
        actual = [[j[3] for j in i] for i in allTestData]

        correct = 0
        total = 0
        for gold_sent, pred_sent in zip(actual, predicted):
            for gold, pred in zip(gold_sent, pred_sent):
                if gold == pred:
                    correct += 1
                total += 1
        accuracy = 100.0 * correct / total

        predicted_label = [item for sublist in predicted for item in sublist]
        actual_label = [item for sublist in actual for item in sublist]
        cn = confusion_matrix(actual_label, predicted_label, labels=allNe)
        print(cn)
        print(accuracy)
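
Every example on this page delegates decoding to an external viterbi module whose source is not shown. For reference, here is a minimal sketch of a decoder matching the param-dict interface used above (states, observations, start/transition/emission probabilities, and a viterbi() method whose second return value is the best tag sequence). This is an assumption about the interface, not the projects' actual implementation; unseen transitions and emissions fall back to a small floor probability unk, a hypothetical smoothing choice.

class Viterbi(object):
    def __init__(self, param):
        self.states = param['states']
        self.obs = param['observations']
        self.start_p = param['start_probability']
        self.trans_p = param['transition_probability']
        self.emit_p = param['emission_probability']

    def viterbi(self, unk=1e-8):
        # V[t][s]: probability of the best path ending in state s at step t.
        V = [{}]
        path = {}
        for s in self.states:
            V[0][s] = (self.start_p.get(s, unk)
                       * self.emit_p[s].get(self.obs[0], unk))
            path[s] = [s]
        for t in range(1, len(self.obs)):
            V.append({})
            new_path = {}
            for s in self.states:
                prob, prev = max(
                    (V[t - 1][p] * self.trans_p[p].get(s, unk)
                     * self.emit_p[s].get(self.obs[t], unk), p)
                    for p in self.states)
                V[t][s] = prob
                new_path[s] = path[prev] + [s]
            path = new_path
        prob, best = max((V[-1][s], s) for s in self.states)
        # Callers above take index [1]: the decoded state sequence.
        return prob, path[best]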
Example #4
predicted_tag = []
predicted_chunk = []
count = 0
for i in allTestData:
    sent = [j[0] for j in i]

    # First stage: decode POS tags from the words.
    param = {}
    param['states'] = tuple(allTags)
    param['observations'] = tuple(sent)
    param['start_probability'] = sentencestart
    param['transition_probability'] = dict2
    param['emission_probability'] = dict3
    obj = viterbi.Viterbi(param)
    x = obj.viterbi()[1]
    predicted_tag.append(x)

    # Second stage: decode chunk labels from the predicted POS tags.
    param1 = {}
    param1['states'] = tuple(allChunk)
    param1['observations'] = tuple(x)
    param1['start_probability'] = startchunk
    param1['transition_probability'] = dict1
    param1['emission_probability'] = dict4

    obj = viterbi.Viterbi(param1)
    predicted_chunk.append(obj.viterbi()[1])

    count += 1
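
The probability tables (dict1 through dict6) in the examples above all repeat the same count-and-normalize pattern. A small hypothetical helper, not present in the original code, that captures it:

from collections import Counter

def cond_prob_table(pairs):
    # Build {condition: {outcome: P(outcome | condition)}}
    # from an iterable of (condition, outcome) pairs.
    counts = {}
    for cond, out in pairs:
        counts.setdefault(cond, Counter())[out] += 1
    return {cond: {out: n / float(sum(cnt.values()))
                   for out, n in cnt.items()}
            for cond, cnt in counts.items()}

# For example, the NE transition table would be:
# dict1 = cond_prob_table((a[3], b[3]) for (a, b) in allbigrams)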
Example #5
import sys

import hmm
import viterbi

if __name__ == '__main__':
    # Avoid shadowing the built-in input().
    input_path = sys.argv[1]
    output_path = sys.argv[2]

    model = hmm.HiddenMarkovModel(input_path)
    model.train()

    vito = viterbi.Viterbi(input_path, model, output_path)
    vito.calculateAccuracy()

Example #6
# Exclude the sentence-start ('S') and period ('.') tags from the tag set.
tags_without_start_period = [tag for tag in tags_type
                             if tag != '.' and tag != 'S']

b = [{} for n in range(len(tags_without_start_period))]
for index, tag in enumerate(tags_without_start_period):
    for wordtag in dict_of_wordGivenTag_percentage:
        if wordtag[1] == tag:
            b[index][wordtag[0]] = dict_of_wordGivenTag_percentage[wordtag]

# Re-key the emission table by tag: b_matrix[tag][word] = P(word | tag).
b_matrix = {}
for index, tag in enumerate(tags_without_start_period):
    b_matrix[tag] = b[index]
print('b_matrix', b_matrix)

observation = ['i', "'d", 'like', 'to', 'go', 'to', 'a', 'fancy', 'restaurant']
print(observation)

print('$' * 70)
print(viterbi.Viterbi(len(tags_without_start_period), len(observation),
                      a_matrix, b_matrix, observation,
                      tags_without_start_period))

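Example #6 calls viterbi.Viterbi with an a_matrix that is built elsewhere in that script. Assuming it mirrors b_matrix as a nested dict of tag-transition probabilities, a sketch of building it from a tagged corpus follows; tagged_sentences is a hypothetical variable holding one list of (word, tag) pairs per sentence.

from collections import Counter

transition_counts = {}
for sentence in tagged_sentences:
    tags = [tag for (word, tag) in sentence]
    for prev, nxt in zip(tags, tags[1:]):
        transition_counts.setdefault(prev, Counter())[nxt] += 1

# a_matrix[prev][nxt] = P(nxt | prev)
a_matrix = {prev: {nxt: n / float(sum(cnt.values()))
                   for nxt, n in cnt.items()}
            for prev, cnt in transition_counts.items()}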