import re

from nltk.tag import CRFTagger


def train_pos_tag(dataset_dir, output_path):
    jumSample = 500000
    namaFile = dataset_dir
    with open(namaFile, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    pasangan = []
    allPasangan = []
    for line in lines[:min(jumSample, len(lines))]:
        # Remove wiki tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if len(pasangan) != 0:
                allPasangan.append(pasangan)
                pasangan = []
        else:
            kata, tag = line.split('\t')
            p = (kata, tag)
            pasangan.append(p)
    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(allPasangan, output_path)
    print("Training Complete")

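# Hedged usage sketch for train_pos_tag above; the corpus path, model path, and
# sample sentence are assumptions, not part of the original code.
if __name__ == '__main__':
    train_pos_tag('id_tagged_corpus.tsv', 'pos_tag.crf.model')
    tagger = CRFTagger()
    tagger.set_model_file('pos_tag.crf.model')
    print(tagger.tag(['Saya', 'tinggal', 'di', 'Jakarta']))
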
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()
    print('Loop #' + str(counter))
    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    sys.stdout = stdout_old

class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
        self.feature_detector = features
        self.tagger = CRFTagger(feature_func=features)
        self.tagger.train(train_sents, 'model.crf.tagger')
        # self.tagger = ClassifierBasedTagger(
        #     train=train_sents,
        #     feature_detector=features,
        #     **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # iob_triplets = [(w, t, 'O') for ((w, t), c) in chunks]
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

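# The chunker above depends on an external features(tokens, index) function. A
# minimal stand-in sketch (an assumption, not the original feature set): tokens
# arrive as (word, pos) pairs, and CRFTagger expects a list of feature strings.
def features(tokens, index):
    word, pos = tokens[index]
    return ['word=' + word.lower(), 'pos=' + pos]
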
def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model.crf.tagger')
    print(tagger.evaluate(test_sentences))

def run_crf(trainfile, testfile, model_file=None):
    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_word_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen,
                                  allowedtags=unique_tags_train)

    train_data = []
    for n, st in enumerate(sents_train):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_train[n][m], "utf-8"),
                      unicode(tags_train[n][m], "utf-8")))
        train_data.append(s)

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for n, st in enumerate(sents_test):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_test[n][m], "utf-8"),
                      unicode(tags_test[n][m], "utf-8")))
        test_data.append(s)

    print(crf.evaluate(test_data))

def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences

def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # limit the number of iterations, as training takes too long otherwise
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')

class CRF:
    def __init__(self):
        self.__model = None

    def train(self, X_training_data):
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')

    def test(self, X_test_data):
        total = 0
        correct = 0
        for kalimat in X_test_data:
            temp = [word[0] for word in kalimat]
            if len(temp) != 0:
                predicted_y = self.__model.tag(temp)
                for i in range(len(predicted_y)):
                    total += 1
                    if predicted_y[i][1] == kalimat[i][1]:
                        correct += 1
        print(correct, total)
        print(correct / total)

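# Toy usage sketch for the CRF wrapper above; the two-sentence corpus is an
# assumption, meant only to show the expected [(word, tag), ...] sentence format.
toy_train = [[('saya', 'PRP'), ('makan', 'VB')], [('dia', 'PRP'), ('tidur', 'VB')]]
toy_test = [[('saya', 'PRP'), ('tidur', 'VB')]]
model = CRF()
model.train(toy_train)
model.test(toy_test)
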
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        '''convert textual utter and user tags into a list of lists
        that contain lists of (w, t) pairs
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip())
                        for word, tag in zip(words.decode('utf-8').strip().split(),
                                             tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(
            instance_list, results, test_data.userTag2id,
            test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(
            precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)

def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')
    #tagger = CRFTagger(feature_func=feature_func)
    #tagger.set_model_file('model_windows_size_1.crf.tagger')
    print(tagger.evaluate(test_sentences))

def main(no_stopwords, use_manual_train_set):
    print "MAINTAIN COMMON WORDS: " + str(not no_stopwords)
    print "USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set)
    full_set = get_domain_set(no_stopwords)
    if not no_stopwords:
        full_set.extend(get_other_set())
    train_set, test_set_auto = divide_sets(full_set, 0.75)

    set_manual = get_manual_set(no_stopwords)
    train_set_manual = []
    test_set_manual = []
    if use_manual_train_set:
        train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
        train_set.extend(train_set_manual)
    else:
        test_set_manual = set_manual

    tagger = CRFTagger(feature_func=feature_extraction)
    try:
        tagger.train(train_set, 'laptop.crf.tagger')
    except ValueError:
        fi = open('DEBUG', 'w')
        for li in DEBUG:
            fi.write(str(li.encode('utf-8')) + '\n')
        fi.close()

    print "AUTOMATIC LABELED TEST"
    tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
    predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
    golden_auto = create_vector_of_predicted_labels(test_set_auto)
    print calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords)

    print "MANUAL LABELED TEST"
    tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
    predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
    golden_manual = create_vector_of_predicted_labels(test_set_manual)
    print calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords)
    print ""

def main():
    import pickle
    from nltk.tag import CRFTagger
    infolist = pickle.load(open('infolist.pickle', 'rb'))
    ct = CRFTagger()
    train_data = [[(x, z) for [x, y, z] in infolist[:round(0.9 * len(infolist))]]]
    ct.train(train_data, 'model.crf.tagger')
    ners = ct.tag_sents(
        [[x for [x, y, z] in infolist[round(0.9 * len(infolist)):]]])
    print(ners)
    gold_sentences = [[(x, z) for [x, y, z] in infolist[round(0.9 * len(infolist)):]]]
    print(ct.evaluate(gold_sentences))

def load(training, testing):
    ct = CRFTagger()
    # split the training into sentences
    t = "\n".join(training)
    sents = t.split("###/###")
    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)
    # remove any blank sentences that have been added
    # (filter instead of removing while iterating, which skips items)
    train = [t for t in train if t]
    ct.train(train, 'model.crf.tagger')

    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)
    tags = ct.tag_sents(test)
    return tags, sent_tags

def fit(self, data):
    """
    Fits a tagging model to object's data based on object's tagger name
    :return: a tagger object
    """
    tagger = None
    self.X = data
    if self.tagger_type == 'hmm':
        # Set up a trainer with default (None) values and train with the data
        trainer = hmm.HiddenMarkovModelTrainer()
        tagger = trainer.train_supervised(data)
    elif self.tagger_type == 'crf':
        trainer = CRFTagger()
        # train on the data passed in, matching the hmm branch
        trainer.train(data, 'model.crf.tagger')
        tagger = trainer
    self.tagger = tagger
    return tagger

def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')  # train() saves the model and returns None
    test = ct.evaluate(test_sents)
    print test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode('utf-8')
    sent_w = sent3.lower().split()
    print sent_w
    tag = ct.tag(sent_w)
    print "The Tag Is:", tag

def ontweetdata():
    tweetinfolist = pickle.load(open('tweetinfolist.pickle', 'rb'))  # data from tweets
    counter = 0
    for item in tweetinfolist:
        if item[1] == "O":
            counter = counter + 1
    print("BASELINE: ", counter / len(tweetinfolist))
    ct = CRFTagger()
    train_data = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[round(0.9 * len(tweetinfolist)):]
    ]]
    # train() writes the model file itself, so no prior set_model_file is needed
    ct.train(train_data, 'model.crf.tagger')
    gold_sentences = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[:round(0.9 * len(tweetinfolist))]
    ]]
    print(ct.evaluate(gold_sentences))

entrenar_bill(tagger, "UnigramTagger")


# In[ ]:

tagger = BigramTagger(train_reducido[:1000])
tagger.evaluate(test_reducido[:1000])
entrenar_bill(tagger, "BigramTagger")


# In[ ]:

ct = CRFTagger()
ct.train(train_reducido[:1000], 'model.crf.tagger')
evaluacion = ct.evaluate(test_reducido[:1000])
xlabels.append("CRF Tagger")
accuracys.append(evaluacion)


# In[ ]:

tagger = PerceptronTagger(load=False)
tagger.train(train_reducido[:1000])
evaluacion = tagger.evaluate(test_reducido[:1000])
xlabels.append("Perceptron Tagger")
accuracys.append(evaluacion)

def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter > 0: break

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict

from nltk.tag import CRFTagger

_model_file = 'model.crf.tagger_2'


def get_data(ifile='vtb.txt'):
    data = []
    with open(ifile, encoding='utf-8') as vtb:
        for _ in range(10383):
            temp = []
            a = vtb.readline().split()
            for i in a:
                if i == '/':
                    i = ['/', '/']
                    temp.append(tuple(i))
                else:
                    temp.append(tuple(i.split('/')))
            data.append(temp)
    return data


data = get_data()
# with open('vtb3.txt', 'w', encoding='utf-8') as vtb3:
#     vtb3.write(str(data))
ct = CRFTagger()
ct.train(data, _model_file)

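# Hypothetical follow-up sketch: reload the model just trained and tag an
# assumed whitespace-tokenized Vietnamese sentence.
ct2 = CRFTagger()
ct2.set_model_file(_model_file)
print(ct2.tag('Tôi là sinh_viên'.split()))
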
import os

from nltk.tag import CRFTagger


def create_trainingModel(train_data, ModelPath):
    if os.path.isfile(ModelPath):
        os.remove(ModelPath)
    ct = CRFTagger()
    ct.train(train_data, ModelPath)

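# Hedged usage sketch: train_data must be a list of tagged sentences in the
# [[(word, tag), ...], ...] shape; the toy sentence and path are assumptions.
create_trainingModel([[('hello', 'UH'), ('world', 'NN')]], 'model.crf.tagger')
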
#print "\nrequiredFormat = ",requiredFormat return requiredFormat print "\nReading training corpus...." ListOfSentences_Training = corpusRead(Training_Data) print "Reading test corpus...." ListOfSentences_Test = corpusRead(Test_Data) #CRF Training ct = CRFTagger() print "CRF Training starts..." ct.train(ListOfSentences_Training,'model.crf.tagger') print "CRF Training is done." print "Testing starts" print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100 #Tagging by CRF Tagger ch = 'y' while (ch != 'n'): text = raw_input("Enter the text to be tagged : \n") text = converter(text) print ct.tag_sents(text) print "\nDo you want to continue ?" ch = raw_input()
def train_crf_tagger(train_corpus: Corpus, model_file='crfmodel'):
    train_sents = gen_tagged_sents(train_corpus)
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, model_file)
    return crf_tagger

from nltk.tag import CRFTagger
from nltk.corpus import brown

ct = CRFTagger()
brown_tagged_sents = brown.tagged_sents(tagset='universal')
size = int(len(brown_tagged_sents) * 0.7)
train_sents = brown_tagged_sents[:size]
ct.train(train_sents, 'model.crf.tagger')
# brown_sents = brown.sents()
# test_sents = brown_sents[size:]
# print(ct.tag_sents(test_sents))  # tag_sents for a list of sentences; tag for one

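# A hedged follow-up sketch: evaluate the tagger trained above on the held-out
# 30% of the tagged Brown sentences, then tag two raw sentences via tag_sents.
held_out = brown_tagged_sents[size:]
print(ct.evaluate(held_out))
print(ct.tag_sents([[w for w, _ in sent] for sent in held_out[:2]]))
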
from nltk.tag import CRFTagger
from nltk.corpus import brown
import pickle
#from tag_utils import *

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]

crf = CRFTagger()

'''
############# Train #############
crf.train(train, 'crf_brown.tagger')
print crf.evaluate(test)
# 0.954383534534
'''

############# Test #############
crf.set_model_file('crf_brown.tagger')
tokens = []
for i in test:
    for j in i:
        tokens.append(j[0])
test_tagged = crf.tag(tokens)

'''
f = open("test_tagged_obj.pickle", 'w')
pickle.dump(test_tagged, f)
f.close()
'''

class DataAdapter(object):
    def __init__(self, data=[]):
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')
        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
        return array_tagging, array_testing

    def for_testing(self, data):
        array = []
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
        return array

    def for_tagging(self, data):
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        else:
            return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        else:
            return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))

comb_results = np.zeros((5, 4))
ind_results = np.zeros((5, 4))

for ki in range(data_batch):
    from nltk import TnT
    from nltk.tag import hmm
    from nltk.tag.perceptron import PerceptronTagger
    from nltk.tag import CRFTagger

    perc_tagger = PerceptronTagger(load=False)
    tnt_tagger = TnT()
    crf_tagger = CRFTagger()

    tnt_tagger.train(training_data[ki])
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
    perc_tagger.train(training_data[ki])
    crf_tagger.train(training_data[ki], 'model.crf.tagger')
    # t.tagdata(test_data[800:])

    perc_pred = []
    hmm_pred = []
    for i in testing_data[ki]:
        perc_pred.append(perc_tagger.tag(i))
        hmm_pred.append(hmm_tagger.tag(i))
    crf_pred = crf_tagger.tag_sents(testing_data[ki])
    tnt_pred = tnt_tagger.tagdata(testing_data[ki])

    pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}


def most_frequent(List):
    return max(set(List), key=List.count)

line_list = []
while line:
    #print(line)
    words = line.replace("\r", "").replace("\n", "").split("\t")
    #print(words)
    if len(words) < 2:
        train_data.append(line_list)
        line_list = []
    else:
        tup1 = (words[0], words[1])
        line_list.append(tup1)
    line = f.readline()
f.close()

ct = CRFTagger()
ct.train(train_data, 'model.crf.tagger')

test_actual = []
test_sentences = []
#with codecs.open("nepali-english-demo-20%training-data.txt", "r", "utf-8") as f:
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-testing-answers.txt", "r", "utf-8") as f:
with codecs.open("/Users/Preethi/nlp_project/EMNLP/spanish_english/training/spanish-english-training-20%.txt", "r", "utf-8") as f:
    line = f.readline()
    test = []
    sentence = []
    while line:
        words = line.replace("\r", "").replace("\n", "").split("\t")
        #print(words)
        if len(words) < 2:
            test_actual.append(test)

class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []
        self.__semantic_model = None
        self.__speech_act_model = None
        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)
        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels
        sa_label_list = sorted(set(sa_label_list))
        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))
        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)
        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list,
                                    '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        pred_act = self.__speech_act_lb.inverse_transform(
            self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag(
            [word.lower() for word, _ in tokenized])
        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]
                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()
                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]
        return result

from nltk.tag import CRFTagger
from nltk.corpus import brown

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]

crf = CRFTagger()
crf.train(train, 'crf_tagger.model')
a = crf.evaluate(test)
print a

def main():
    # start timer
    for item in [
            "UD_Ukrainian",
            "Brown",
    ]:
        print("in process " + item)
        # open Brown training data
        infile = open(DATA_PATH + item + "_tagged_train.txt", "r", encoding="utf-8")
        brown_train = infile.readlines()
        infile.close()

        # split words and tags, and add start and stop symbols (question 1)
        brown_words, brown_tags = split_wordtags(brown_train)

        # calculate tag trigram probabilities (question 2)
        q_values = calc_trigrams(brown_tags)

        # question 2 output
        q2_output(q_values, OUTPUT_PATH + item + '_B2.txt')

        # calculate list of words with count > 5 (question 3)
        known_words = calc_known(brown_words)

        # get a version of brown_words with rare words replaced with '_RARE_' (question 3)
        brown_words_rare = replace_rare(brown_words, known_words)

        # question 3 output
        q3_output(brown_words_rare, OUTPUT_PATH + item + "_B3.txt")

        # calculate emission probabilities (question 4)
        e_values, taglist = calc_emission(brown_words_rare, brown_tags)

        # question 4 output
        q4_output(e_values, OUTPUT_PATH + item + "_B4.txt")

        # delete unnecessary data
        del brown_train
        del brown_words_rare

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        brown_dev_words = []
        for sentence in brown_dev:
            brown_dev_words.append(sentence.split(" ")[:-1])

        # do viterbi on brown_dev_words (question 5)
        viterbi_tagged = viterbi(brown_dev_words, taglist, known_words,
                                 q_values, e_values)

        # question 5 output
        q5_output(viterbi_tagged, OUTPUT_PATH + item + "_B5.txt")

        # # do nltk tagging here
        # nltk_tagged = nltk_tagger(brown_words, brown_tags, brown_dev_words)
        #
        # # question 6 output
        # q6_output(nltk_tagged, OUTPUT_PATH + item + "_B6.txt")

    for item in ["Brown", "UD_Ukrainian"]:
        print("in crf process " + item)
        # open Brown training data; binary mode cannot take an encoding argument,
        # and the tokens are decoded explicitly below
        infile = open(DATA_PATH + item + "_tagged_train.txt", "rb")
        brown_train = infile.readlines()
        infile.close()
        brown_words, brown_tags = split_wordtags(brown_train)

        train_words_tags = []
        ct = CRFTagger()
        for i in range(len(brown_words)):
            tmp = []
            for j in range(len(brown_words[i])):
                tmp.append((brown_words[i][j].decode('utf-8'),
                            brown_tags[i][j].decode('utf-8')))
            train_words_tags.append(tmp)
        ct.train(train_words_tags, u'model.crf.tagger')

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        tests_words = []
        for sentence in brown_dev:
            tests_words.append([i for i in sentence.split(" ")[:-1]])

        result_cfg = ct.tag_sents(tests_words)
        with open(OUTPUT_PATH + item + "_CFG.txt", "w") as file:
            for line in result_cfg:
                for word in line:
                    file.write(word[0] + "/" + word[1] + " ")
                file.write("\n")

    # print total time to run Part B
    print("Part B time: ", str(time.clock()), ' sec')

from nltk.tag import CRFTagger

jumSample = 500000
namaFile = "Indonesian_Manually_Tagged_Corpus.tsv"
with open(namaFile, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

pasangan = []
allPasangan = []
for line in lines[:min(jumSample, len(lines))]:
    if line == '':
        allPasangan.append(pasangan)
        pasangan = []
    else:
        kata, tag = line.split('\t')
        p = (kata, tag)
        pasangan.append(p)

ct = CRFTagger()
ct.train(allPasangan, 'all_indo_man_tag_corpus_model.crf.tagger')

# test
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung'],
                      ['Nama', 'saya', 'Yudi']])
print(hasil)

class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []
        self.__semantic_model = None
        self.__speech_act_model = None
        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)
        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels
        sa_label_list = sorted(set(sa_label_list))
        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))
        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.LabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)
        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list,
                                    '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        pred_act = self.__speech_act_lb.inverse_transform(
            self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag(
            [word.lower() for word, _ in tokenized])
        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]
                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()
                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]
        return result

class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents=None, tagger="ClassifierBasedTagger",
                 model=None, model_name="../results/modelCRF_featured",
                 entities=None, language="english", **kwargs):
        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)
        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(train_data=train_sents,
                                  model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        return chunks

    def get_position(self, w):
        positions = []
        for e in self.all_entities:
            if w in e:
                positions.append(e.index(w))
        return positions

    def get_positions(self, tokens, index):
        w = tokens[index][0]
        prev = tokens[index - 1][0]
        next = tokens[index + 1][0]
        positions = []
        for e in self.all_entities:
            if w in e and prev in e and next in e:
                positions.append(e.index(w))
        return list(set(positions))

    def set_entities(self, entities):
        if entities:
            entities = [l.split() for l in entities]
            for l in entities:
                if len(l) == 1 and is_all_caps(l[0]):
                    self.acronyms.append(l[0].lower())
                else:
                    self.all_entities.append([w.lower() for w in l])
            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))
            with open('../data/entities_{}.txt'.format(self.language), 'w') as f:
                f.write("\n".join(
                    [" ".join(line) for line in self.all_entities]))
            with open('../data/acronyms_{}.txt'.format(self.language), 'w') as f:
                f.write("\n".join(
                    [" ".join(line) for line in self.all_entities]))
        else:
            with open('../data/entities_{}.txt'.format(self.language), 'r') as f:
                for line in f:
                    self.all_entities.append(line.strip().split())
            with open('../data/acronyms_{}.txt'.format(self.language), 'r') as f:
                for line in f:
                    self.acronyms.append(line.strip())
            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))

    def crf_features(self, tokens, index):
        """
        `tokens` = a POS-tagged sentence [(w1, t1), ...]
        `index` = the index of the token we want to extract features for
        """
        # init the stemmer
        stemmer = SnowballStemmer(self.language)

        # Pad the sequence with start/end placeholder tokens
        num_of_previous = 3
        num_of_posterior = 2
        tk = []
        for i in range(0, num_of_previous):
            tk.append(('[START{}]'.format(num_of_previous - i),
                       '[START{}]'.format(num_of_previous - i)))
        tk = tk + list(tokens)
        for i in range(1, num_of_posterior + 1):
            tk.append(('[END{}]'.format(i), '[END{}]'.format(i)))
        tokens = tk
        index += num_of_previous

        word, pos = tokens[index]
        contains_dash = ('–' in word or '-' in word or '_' in word)
        contains_dot = '.' in word

        prev2_words = tokens[index - 2][0] + "_._" + tokens[index - 1][0]
        prev2_pos = tokens[index - 2][1] + "_._" + tokens[index - 1][1]
        prev1_words = tokens[index - 1][0] + "_._" + tokens[index][0]
        prev1_pos = tokens[index - 1][1] + "_._" + tokens[index][1]
        prev1_lemma = stemmer.stem(
            tokens[index - 1][0]) + "_._" + stemmer.stem(tokens[index][0])
        next1_words = tokens[index][0] + "_._" + tokens[index + 1][0]
        next1_pos = tokens[index][1] + "_._" + tokens[index + 1][1]
        next2_words = tokens[index + 1][0] + "_._" + tokens[index + 2][0]
        next2_pos = tokens[index + 1][1] + "_._" + tokens[index + 2][1]

        allcaps = is_all_caps(word)
        strange_cap = word[0] not in string.ascii_uppercase and word != word.lower()
        inside_ent = word.lower() in self.all_entities
        is_acronym = word.lower() in self.acronyms

        features = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-caps': allcaps,
            'strange-cap': strange_cap,
            'prev2-pos': prev2_pos,
            'prev2-word': prev2_words,
            'next2-pos': next2_pos,
            'next2-word': next2_words,
            'prev1-pos': prev1_pos,
            'prev1-word': prev1_words,
            'prev1-lemma': prev1_lemma,
            'next1-pos': next1_pos,
            'next1-word': next1_words,
        }
        features['inside-entities'] = inside_ent
        if is_acronym:
            features['is-acronym'] = is_acronym

        positions = self.get_position(word.lower())
        for p in positions:
            features['position-{}'.format(p)] = True
        features['total-position-{}'.format(len(positions))] = True

        if contains_dash:
            features['contains-dash'] = contains_dash
        if contains_dot:
            features['contains-dot'] = contains_dot

        for i in range(1, num_of_previous + 1):
            word, pos = tokens[index - i]
            lemma = stemmer.stem(word)
            features['prev-{}-word'.format(i)] = word
            features['prev-{}-pos'.format(i)] = pos
            features['prev-{}-lemma'.format(i)] = lemma

        for i in range(1, num_of_posterior + 1):
            word, pos = tokens[index + i]
            inside_ent = word.lower() in self.all_entities
            features['next-{}-word'.format(i)] = word
            features['next-{}-pos'.format(i)] = pos
            features['next-{}-inside-ent'.format(i)] = inside_ent

        return features

# Divide data into train and test sets (a 90/10 split, despite the variable name)
eightyPercent = count * 0.9
training_set = data[0:int(eightyPercent)]
test_set = data[int(eightyPercent):]
# print training_set

# Train
ct = CRFTagger()
train_data = training_set
train_data_new = []
for i in range(len(train_data)):
    if len(train_data[i]) != 0:
        train_data_new.append(train_data[i])
ct.train(train_data_new, 'model.crf.tagger')

# Accuracy
test_data_new = []
test_data_tags = []
for i in range(len(test_set)):
    if len(test_set[i]) != 0:
        for j in range(len(test_set[i])):
            test_data_new.append(test_set[i][j][0])
            test_data_tags.append(test_set[i][j][1])
gold_sentences = test_data_new
# print ct.evaluate(gold_sentences)
# print test_data_new

pred_tags = []
refsets = collections.defaultdict(set)

from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger

# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a trigram N tagger (TnT)
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print tnt_pos_tagger.evaluate(test_data)

# train a CRF tagger
crf_tagger = CRFTagger()
crf_tagger.train(train_data, '~/Documents/NLP/NLP/crf_model.txt')
print crf_tagger.evaluate(test_data)