import pickle

import viterbi


def main(argv):
    with open(argv[0], 'r') as input_file:
        sentences = input_file.read().split("\n\n")
    v = viterbi.Viterbi()
    oowv_count = 0.0
    tag_names = []
    with open(argv[1], 'w') as output_file:
        for sentence in sentences:
            # Column 1 of each tab-separated line holds the word form;
            # '_' marks an empty field.
            words = []
            for word_info in sentence.split("\n"):
                elements = word_info.split("\t")
                if len(elements) > 1 and elements[1] != '_':
                    words.append(elements[1])
            if len(words) > 0:
                tags, oowv_counts, tag_names = v.decode(words)
                for word, tag in zip(words, tags):
                    output_file.write(word + "|" + tag + '\n')
                output_file.write('\n')
                oowv_count += oowv_counts
    # Persist the out-of-vocabulary word count and the tag inventory.
    with open("oowv_count.pkl", "wb") as fp_1:
        pickle.dump(oowv_count, fp_1)
    with open("tag_names.pkl", "wb") as fp_2:
        pickle.dump(tag_names, fp_2)
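# Example invocation, assuming this lives in a script run as
# `python tagger.py input.conll output.txt` (the script name is hypothetical);
# argv[0] and argv[1] above are then the input and output paths:
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])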
def tag(tags, vocab, A, B, split="dev"):
    """Tag development (default) or test data with a Viterbi decoder."""
    # Preprocess data
    data_fp = DEV_WORDS
    if split == "test":
        data_fp = TEST_WORDS
    orig, prep = preprocess(vocab, data_fp)
    # Decode
    decoder = viterbi.Viterbi(vocab, tags, prep, A, B)
    pred = decoder.decode()
    tagged = list(zip(orig, pred))
    # Write output file; an empty word marks a sentence boundary.
    out_fp = DEV_OUT
    if split == "test":
        out_fp = TEST_OUT
    with open(out_fp, "w") as out:
        for word, pred_tag in tagged:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, pred_tag))
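# preprocess is defined elsewhere in this project. A plausible sketch, assuming
# a word-per-line input file with blank lines separating sentences and an
# "<unk>" placeholder for out-of-vocabulary words (both assumptions):
def preprocess(vocab, data_fp):
    """Return (original words, preprocessed words), aligned line by line."""
    orig, prep = [], []
    with open(data_fp, "r") as f:
        for line in f:
            word = line.strip()
            orig.append(word)  # the empty string marks a sentence boundary
            if word and word not in vocab:
                prep.append("<unk>")
            else:
                prep.append(word)
    return orig, prep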
import nltk
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import confusion_matrix

import viterbi
# load_sentences (a CoNLL-2003 reader) is defined elsewhere in this project.


def normalize(groups):
    """Turn {key: [observed values]} into {key: {value: P(value | key)}}."""
    probs = {}
    for key, values in groups.items():
        counts = Counter(values)
        total = float(sum(counts.values()))
        probs[key] = {k: v / total for k, v in counts.items()}
    return probs


def main(string, option):
    train = '../dataset/CoNLL-2003/eng.train'
    allData = load_sentences(train)
    # Flatten sentences into one list of (word, tag, chunk, ne) tuples.
    globalData = [tuple(token) for sentence in allData for token in sentence]

    freq_word = nltk.FreqDist(word for (word, tag, chunk, ne) in globalData)
    freq_tag = nltk.FreqDist(tag for (word, tag, chunk, ne) in globalData)
    freq_chunk = nltk.FreqDist(chunk for (word, tag, chunk, ne) in globalData)
    freq_ne = nltk.FreqDist(ne for (word, tag, chunk, ne) in globalData)
    allbigrams = list(nltk.bigrams(globalData))

    # All unique words, tags, chunks, and named-entity labels.
    allWords = list(freq_word)
    allTags = list(freq_tag)
    allChunk = list(freq_chunk)
    allNe = list(freq_ne)

    # P(NE|sentence/context) = P(context|NE) * P(NE)
    # P(NE_i | NE_{i-1}): named-entity transitions.
    dict1 = normalize({i: [b[3] for (a, b) in allbigrams if a[3] == i]
                       for i in allNe})
    # P(W | NE): word emissions.
    dict2 = normalize({i: [w for (w, t, c, n) in globalData if n == i]
                       for i in allNe})
    # P(POS_i | POS_{i-1}): POS transitions.
    dict3 = normalize({i: [b[1] for (a, b) in allbigrams if a[1] == i]
                       for i in allTags})
    # P(POS | NE): POS emissions.
    dict5 = normalize({i: [t for (w, t, c, n) in globalData if n == i]
                       for i in allNe})
    # P(Chunk | NE): chunk emissions.
    dict6 = normalize({i: [c for (w, t, c, n) in globalData if n == i]
                       for i in allNe})
    # P(POS, W | NE): joint word/POS emissions.
    dict4 = normalize({i: [(w, t) for (w, t, c, n) in globalData if n == i]
                       for i in allNe})

    # Probability of each named-entity label starting a sentence
    # (counts initialized to 1, i.e. add-one smoothing).
    sentencestart = dict.fromkeys(allNe, 1)
    for sentence in allData:
        first_ne = sentence[0][3]
        sentencestart[first_ne] = sentencestart.get(first_ne, 1) + 1
    totalCount = float(sum(sentencestart.values()))
    for i in sentencestart:
        sentencestart[i] /= totalCount

    if option == '0':
        # Tag free text: run one Viterbi decode per tokenized sentence.
        predicted = []
        list_ne = []
        text = sent_tokenize(string)
        for sentence in text:
            sent = word_tokenize(sentence)
            param = {
                'states': tuple(allNe),        # named-entity labels
                'observations': tuple(sent),   # words
                'start_probability': sentencestart,
                'transition_probability': dict1,
                'emission_probability': dict2,
            }
            obj = viterbi.Viterbi(param)
            pred = obj.viterbi()[1]
            predicted.append(pred)
            print(sent)
            print(pred)
        # Interleave each word with its predicted label, e.g. "Paris [I-LOC]".
        for i, sentence in enumerate(text):
            for j, word in enumerate(word_tokenize(sentence)):
                list_ne.append(word)
                list_ne.append('[' + predicted[i][j] + ']')
        return ' '.join(list_ne)

    if option == '1':
        print("-----testing-------")
        test = '../dataset/CoNLL-2003/eng.testb'
        allTestData = load_sentences(test)
        count = 0
        predicted = []
        for i in allTestData:
            sent = [j[0] for j in i]
            tags = nltk.pos_tag(sent)
            pos = [b for (a, b) in tags]
            # Decode NE labels from words. Alternative feature sets: emit
            # (word, POS) pairs with dict4, or POS tags alone with dict5,
            # instead of bare words with dict2.
            param = {
                'states': tuple(allNe),        # named-entity labels
                'observations': tuple(sent),   # words
                'start_probability': sentencestart,
                'transition_probability': dict1,
                'emission_probability': dict2,
            }
            obj = viterbi.Viterbi(param)
            predicted.append(obj.viterbi()[1])
            count += 1
        # Gold named-entity labels (column 3 of each token).
        actual = [[j[3] for j in i] for i in allTestData]
        correct = 0
        total = 0
        for x, gold in enumerate(actual):
            for y, label in enumerate(gold):
                if label == predicted[x][y]:
                    correct += 1
                total += 1
        accuracy = (float(correct) / total) * 100
        predicted_label = [item for sublist in predicted for item in sublist]
        actual_label = [item for sublist in actual for item in sublist]
        cn = confusion_matrix(actual_label, predicted_label, labels=allNe)
        print(cn)
        print(accuracy)
        # Second pass: predict POS tags from words, then chunk tags from the
        # predicted POS sequence. The tables below mirror the NE tables above.
        word_given_tag = normalize(
            {t: [w for (w, tg, c, n) in globalData if tg == t]
             for t in allTags})                              # P(W | POS)
        chunk_trans = normalize(
            {c: [b[2] for (a, b) in allbigrams if a[2] == c]
             for c in allChunk})                             # P(Chunk_i | Chunk_{i-1})
        pos_given_chunk = normalize(
            {c: [t for (w, t, ch, n) in globalData if ch == c]
             for c in allChunk})                             # P(POS | Chunk)
        # Start probabilities over POS and chunk labels, built like sentencestart.
        starttag = dict.fromkeys(allTags, 1)
        startchunk = dict.fromkeys(allChunk, 1)
        for sentence in allData:
            starttag[sentence[0][1]] = starttag.get(sentence[0][1], 1) + 1
            startchunk[sentence[0][2]] = startchunk.get(sentence[0][2], 1) + 1
        for table in (starttag, startchunk):
            totalCount = float(sum(table.values()))
            for k in table:
                table[k] /= totalCount

        predicted_tag = []
        predicted_chunk = []
        for i in allTestData:
            sent = [j[0] for j in i]
            param = {
                'states': tuple(allTags),
                'observations': tuple(sent),
                'start_probability': starttag,
                'transition_probability': dict3,         # P(POS_i | POS_{i-1})
                'emission_probability': word_given_tag,  # P(W | POS)
            }
            obj = viterbi.Viterbi(param)
            x = obj.viterbi()[1]
            predicted_tag.append(x)
            param1 = {
                'states': tuple(allChunk),
                'observations': tuple(x),                # predicted POS tags
                'start_probability': startchunk,
                'transition_probability': chunk_trans,
                'emission_probability': pos_given_chunk,
            }
            obj = viterbi.Viterbi(param1)
            predicted_chunk.append(obj.viterbi()[1])
            count += 1
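# The viterbi.Viterbi class imported above is not included in these snippets.
# A minimal sketch consistent with its param-dict interface and with the
# obj.viterbi()[1] call sites, which expect a (probability, state path) return
# value; the 1e-12 floor for unseen transitions and emissions is an assumption,
# not something the snippets specify:
class Viterbi(object):
    def __init__(self, param):
        self.states = param['states']
        self.obs = param['observations']
        self.start_p = param['start_probability']
        self.trans_p = param['transition_probability']
        self.emit_p = param['emission_probability']

    def viterbi(self, floor=1e-12):
        """Return (best path probability, best state sequence)."""
        V = [{}]     # V[t][s] = probability of the best path ending in s at t
        back = [{}]  # back[t][s] = predecessor of s on that best path
        for s in self.states:
            V[0][s] = (self.start_p.get(s, floor)
                       * self.emit_p[s].get(self.obs[0], floor))
            back[0][s] = None
        for t in range(1, len(self.obs)):
            V.append({})
            back.append({})
            for s in self.states:
                prob, prev = max(
                    (V[t - 1][p] * self.trans_p[p].get(s, floor)
                     * self.emit_p[s].get(self.obs[t], floor), p)
                    for p in self.states)
                V[t][s] = prob
                back[t][s] = prev
        # Follow back-pointers from the most probable final state.
        prob, last = max((V[-1][s], s) for s in self.states)
        path = [last]
        for t in range(len(self.obs) - 1, 0, -1):
            path.append(back[t][path[-1]])
        return prob, list(reversed(path))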
import sys

import hmm
import viterbi

if __name__ == '__main__':
    # Usage: python main.py <training/input file> <output file>
    input_path = sys.argv[1]
    output_path = sys.argv[2]
    # Train the HMM on the input corpus, then decode it and score the result.
    model = hmm.HiddenMarkovModel(input_path)
    model.train()
    vito = viterbi.Viterbi(input_path, model, output_path)
    vito.calculateAccuracy()
# Emission matrix: drop the sentence-start ('S') and period tags.
tags_without_start_period = [tag for tag in tags_type
                             if tag != '.' and tag != 'S']

# b_matrix[tag][word] = P(word | tag)
b_matrix = {tag: {} for tag in tags_without_start_period}
for (word, tag), prob in dict_of_wordGivenTag_percentage.items():
    if tag in b_matrix:
        b_matrix[tag][word] = prob
print('b_matrix', b_matrix)

observation = ['i', "'d", 'like', 'to', 'go', 'to', 'a', 'fancy', 'restaurant']
print(observation)
print('$' * 70)
print(viterbi.Viterbi(len(tags_without_start_period), len(observation),
                      a_matrix, b_matrix, observation,
                      tags_without_start_period))
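# a_matrix is built before this excerpt. A plausible construction, assuming a
# dict of bigram tag probabilities keyed by (previous tag, tag) pairs with
# values P(tag | previous tag); the name dict_of_tagGivenTag_percentage is
# hypothetical:
a_matrix = {prev: {} for prev in tags_without_start_period}
for (prev, tag), prob in dict_of_tagGivenTag_percentage.items():
    if prev in a_matrix:
        a_matrix[prev][tag] = prob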