class CRF:
    """Small wrapper that trains a ``CRFTagger`` and reports token accuracy."""

    def __init__(self):
        # Placeholder object; train() replaces it with a real tagger.
        self.__model = type('test', (object,), {})()

    def train(self, X_training_data):
        """Train a fresh ``CRFTagger`` on ``X_training_data``.

        The training call is given ``'crf.model'`` as the model file name.
        """
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')

    def test(self, X_test_data):
        """Tag every sentence of ``X_test_data`` and print the accuracy.

        Each sentence is a sequence of items whose element ``[0]`` is the
        token and element ``[1]`` is the gold label. Empty sentences are
        skipped. Prints ``correct total`` followed by the accuracy ratio;
        when no token was evaluated the ratio is reported as ``0.0`` rather
        than raising ``ZeroDivisionError``.
        """
        total = 0
        correct = 0
        for sentence in X_test_data:
            words = [item[0] for item in sentence]
            if not words:
                continue
            predictions = self.__model.tag(words)
            for prediction, item in zip(predictions, sentence):
                total += 1
                if prediction[1] == item[1]:
                    correct += 1
        print(correct, total)
        # Guard the empty case: nothing evaluated means no meaningful ratio.
        print(correct / total if total else 0.0)
class NamedEntityChunker(ChunkParserI):
    """Chunk parser whose IOB labels come from a CRF tagger."""

    def __init__(self, train_sents, **kwargs):
        """Train the underlying CRF tagger on ``train_sents``.

        ``train_sents`` must be iterable. The tagger uses the module-level
        ``features`` function and is trained with the model file name
        ``'model.crf.tagger'``. ``kwargs`` is accepted but not used.
        """
        assert isinstance(train_sents, Iterable)
        self.feature_detector = features
        crf_tagger = CRFTagger(feature_func=features)
        crf_tagger.train(train_sents, 'model.crf.tagger')
        self.tagger = crf_tagger

    def parse(self, tagged_sent):
        """Tag ``tagged_sent`` and return the result as a tree.

        The tagger output ``[((w1, t1), iob1), ...]`` is flattened to the
        triplet form ``[(w1, t1, iob1), ...]`` and handed to
        ``conlltags2tree``.
        """
        iob_triplets = []
        for (word, pos), iob in self.tagger.tag(tagged_sent):
            iob_triplets.append((word, pos, iob))
        return conlltags2tree(iob_triplets)
def tag_crf(self, untagged_string: str):
    """Tag POS with the CRF tagger.

    :param untagged_string: an untagged, untokenized string of text.
    :return: whatever ``CRFTagger.tag`` returns for the tokens produced by
        ``wordpunct_tokenize`` (presumably a list of (token, tag) pairs --
        confirm against the NLTK version in use).
    """
    tokens = wordpunct_tokenize(untagged_string)
    model_path = self.available_taggers['crf']
    crf_tagger = CRFTagger()
    crf_tagger.set_model_file(model_path)
    return crf_tagger.tag(tokens)
def crf_tag(): news_text = brown.tagged_sents(categories='news') train_sents = news_text[:3230] test_sents = news_text[3230:4600] ct = CRFTagger() tagger = ct.train(train_sents, 'model.crf.tagger') test = ct.evaluate(test_sents) print test sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode( 'utf-8') sent_w = sent3.lower().split() print sent_w tag = ct.tag(sent_w) print "The Tag Is:", tag
def train_and_save_tagger(language, model_type, feature, untagged_text, verbose = False):
    """Train a tagger, persist it, reload it and tag a sample as a check.

    The training corpus path and the model path are derived from
    ``language``, ``feature`` and ``model_type``. Non-CRF taggers are
    pickled to the model path and read back; for ``'crf'`` nothing is
    written here (the CRF model saves itself) and the model file is loaded
    into a new ``CRFTagger``. The reloaded tagger is then run on
    ``untagged_text``; with ``verbose`` a summary is printed.
    """
    training_file = 'corpora/{0}/{0}_train.{1}'.format(language, feature)
    tagger, acc, _, _ = make_morpho_model(language, model_type, feature, training_file)
    model_file = 'taggers/{0}/{1}/{2}.pickle'.format(language, feature, model_type)

    if model_type == 'crf':
        # The CRF model saves itself during training, so only reload it.
        reloaded = CRFTagger()
        reloaded.set_model_file(model_file)
    else:
        with open(model_file, 'wb') as out_file:
            pickle.dump(tagger, out_file)
        # Round-trip through the file to verify the pickle is usable.
        with open(model_file, 'rb') as in_file:
            reloaded = pickle.load(in_file)

    tagged_text = reloaded.tag(untagged_text)

    if verbose == True:
        summary = "Model {0} for language {4}, feature {1} saved at {2}. Training accuracy = {3:.3f}"
        print(summary.format(model_type, feature, model_file, acc, language))
        print("Sample tagging output: {0}".format(tagged_text[:10]))
# In[26]:

# Model file of the pre-trained POS tagger.
TAGGER_PATH = "crfpostagger"

# In[27]:

# Create the tagger and point it at the pre-trained model.
tagger = CRFTagger()
tagger.set_model_file(TAGGER_PATH)

# In[30]:

# Try a few sentences: tokens must be unicode strings, and the tagger was
# trained on lower-cased text.
print(tagger.tag([u"i", u"like", u"revision"]))
print(tagger.tag([u"i", u"like", u"natural", u"language", u"processing"]))

# In[31]:

# Scaling up to raw text: lower-case and convert every word to unicode
# before tagging all sentences in one call.
sentences = [
    "I like revision",
    "I like Natural Language Processing",
]
print(tagger.tag_sents(
    [unicode(word.lower()) for word in s.split()] for s in sentences
))

# In[ ]:
crf = CRFTagger()

############# Train #############
# Disabled training step, kept for reference:
#   crf.train(train, 'crf_brown.tagger')
#   print crf.evaluate(test)  # 0.954383534534

############# Test #############
crf.set_model_file('crf_brown.tagger')

# Flatten every test sentence into a single token sequence (element [0] of
# each item is the token), then tag the whole sequence in one call.
tokens = [item[0] for sentence in test for item in sentence]
test_tagged = crf.tag(tokens)

# Disabled: dumping the tagged output as a pickle.
#   f = open("test_tagged_obj.pickle", 'w')
#   pickle.dump(test_tagged, f)
#   f.close()

# One "token<TAB><TAB>tag" line per tagged token; join avoids rebuilding the
# string on every iteration.
temp = "".join(
    str(pair[0]) + '\t\t' + str(pair[1]) + '\n' for pair in test_tagged
)

# The context manager closes the file even if the write fails.
with open('crf_brown_tagged.txt', 'w') as f:
    f.write(temp)
class DataAdapter(object):
    """Adapts records with 'text' / 'label' sequences for a ``CRFTagger``.

    Each record is indexed with ``d['text']`` (tokens) and ``d['label']``
    (one label per token).
    """

    # Prepared inputs; they stay None until __init__ receives non-empty data,
    # so tag_sents() / evaluate() can report 'NoData' instead of failing with
    # AttributeError.
    data_tagging = None
    data_testing = None

    def __init__(self, data=None):
        """Load the saved tagger model and optionally prepare ``data``.

        :param data: optional record collection. It must provide
            ``count(True)`` and be iterable (presumably a database cursor --
            verify against the caller). It is prepared only when
            ``data.count(True) > 0``.
        """
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')
        if data is not None and data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)

    def tokenize_tag(self, text):
        """Tokenize ``text`` and tag it; return ``(tokens, labels)``.

        Carriage returns and newlines are replaced by ``' | '`` before
        tokenizing.
        """
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = [tagged[1] for tagged in self.tag(tokens)]
        return tokens, labels

    def for_tagging_testing(self, data):
        """Build both the tagging and the testing view in a single pass.

        A single pass is used on purpose so that ``data`` is iterated only
        once.

        :return: ``(array_tagging, array_testing)`` where the first holds a
            token list per record and the second a list of
            ``[token, label]`` pairs per record.
        """
        array_tagging = []
        array_testing = []
        for d in data:
            array_testing.append(
                [[t, d['label'][index]] for index, t in enumerate(d['text'])])
            array_tagging.append(list(d['text']))
        return array_tagging, array_testing

    def for_testing(self, data):
        """Return a list of ``[token, label]`` pairs for every record."""
        return [
            [[t, d['label'][index]] for index, t in enumerate(d['text'])]
            for d in data
        ]

    def for_tagging(self, data):
        """Return the token list of every record."""
        return [list(d['text']) for d in data]

    def tag_sents(self):
        """Tag the prepared sentences, or return ``'NoData'`` if none."""
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        return 'NoData'

    def tag(self, data):
        """Tag a single token sequence with the underlying tagger."""
        return self.tagger.tag(data)

    def evaluate(self):
        """Evaluate on the prepared test data, or return ``'NoData'``."""
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        return 'NoData'

    def train(self, data):
        """Train on ``data`` and print the accuracy on that same data.

        The training call is given the model file name
        ``'model.crf.tagger'``.
        """
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
class DeepDisfluencyTagger(IncrementalTagger): """A deep-learning driven incremental disfluency tagger (and optionally utterance-segmenter). Tags each word with the following: <f/> - a fluent word <e/> - an edit term word, not necessarily inside a repair structure <rms id="N"/> - reparandum start word for repair with ID number N <rm id="N"/> - mid-reparandum word for repair N <i id="N"/> - interregnum word for repair N <rps id="N"/> - repair onset word for repair N <rp id="N"/> - mid-repair word for repair N <rpn id="N"/> - repair end word for substitution or repetition repair N <rpnDel id="N"/> - repair end word for a delete repair N If in joint utterance segmentation mode according to the config file, the following utterance segmentation tags are used: <cc/> - a word which continues the current utterance and whose following word will continue it <ct/> - a word which continues the current utterance and is the last word of it <tc/> - a word which is the beginning of an utterance and whose following word will continue it <tt/> - a word constituting an entire utterance """ def __init__(self, config_file=None, config_number=None, saved_model_dir=None, pos_tagger=None, language_model=None, pos_language_model=None, edit_language_model=None, timer=None, timer_scaler=None, use_timing_data=False): if not config_file: config_file = os.path.dirname(os.path.realpath(__file__)) +\ "/../experiments/experiment_configs.csv" config_number = 35 print "No config file, using default", config_file, config_number super(DeepDisfluencyTagger, self).__init__(config_file, config_number, saved_model_dir) print "Processing args from config number {} ...".format(config_number) self.args = process_arguments(config_file, config_number, use_saved=False, hmm=True) # separate manual setting setattr(self.args, "use_timing_data", use_timing_data) print "Intializing model from args..." 
self.model = self.init_model_from_config(self.args) # load a model from a folder if specified if saved_model_dir: print "Loading saved weights from", saved_model_dir self.load_model_params_from_folder(saved_model_dir, self.args.model_type) else: print "WARNING no saved model params, needs training." print "Loading original embeddings" self.load_embeddings(self.args.embeddings) if pos_tagger: print "Loading POS tagger..." self.pos_tagger = pos_tagger elif self.args.pos: print "No POS tagger specified,loading default CRF switchboard one" self.pos_tagger = CRFTagger() tagger_path = os.path.dirname(os.path.realpath(__file__)) +\ "/../feature_extraction/crfpostagger" self.pos_tagger.set_model_file(tagger_path) if self.args.n_language_model_features > 0 or \ 'noisy_channel' in self.args.decoder_type: print "training language model..." self.init_language_models(language_model, pos_language_model, edit_language_model) if timer: print "loading timer..." self.timing_model = timer self.timing_model_scaler = timer_scaler else: # self.timing_model = None # self.timing_model_scaler = None print "No timer specified, using default switchboard one" timer_path = os.path.dirname(os.path.realpath(__file__)) +\ '/../decoder/timing_models/' + \ 'LogReg_balanced_timing_classifier.pkl' with open(timer_path, 'rb') as fid: self.timing_model = cPickle.load(fid) timer_scaler_path = os.path.dirname(os.path.realpath(__file__)) +\ '/../decoder/timing_models/' + \ 'LogReg_balanced_timing_scaler.pkl' with open(timer_scaler_path, 'rb') as fid: self.timing_model_scaler = cPickle.load(fid) # TODO a hack # self.timing_model_scaler.scale_ = \ # self.timing_model_scaler.std_.copy() print "Loading decoder..." 
hmm_dict = deepcopy(self.tag_to_index_map) # add the interegnum tag if "disf" in self.args.tags: intereg_ind = len(hmm_dict.keys()) interreg_tag = \ "<i/><cc/>" if "uttseg" in self.args.tags else "<i/>" hmm_dict[interreg_tag] = intereg_ind # add the interregnum tag # decoder_file = os.path.dirname(os.path.realpath(__file__)) + \ # "/../decoder/model/{}_tags".format(self.args.tags) noisy_channel = None if 'noisy_channel' in self.args.decoder_type: noisy_channel = SourceModel(self.lm, self.pos_lm, uttseg=self.args.do_utt_segmentation) self.decoder = FirstOrderHMM( hmm_dict, markov_model_file=self.args.tags, timing_model=self.timing_model, timing_model_scaler=self.timing_model_scaler, constraint_only=True, noisy_channel=noisy_channel) # getting the states in the right shape self.state_history = [] self.softmax_history = [] # self.convert_to_output_tags = get_conversion_method(self.args.tags) self.reset() def init_language_models(self, language_model=None, pos_language_model=None, edit_language_model=None): clean_model_dir = os.path.dirname(os.path.realpath(__file__)) +\ "/../data/lm_corpora" if language_model: self.lm = language_model else: print "No language model specified, using default switchboard one" lm_corpus_file = open(clean_model_dir + "/swbd_disf_train_1_clean.text") lines = [ line.strip("\n").split(",")[1] for line in lm_corpus_file if "POS," not in line and not line.strip("\n") == "" ] split = int(0.9 * len(lines)) lm_corpus = "\n".join(lines[:split]) heldout_lm_corpus = "\n".join(lines[split:]) lm_corpus_file.close() self.lm = KneserNeySmoothingModel( order=3, discount=0.7, partial_words=self.args.partial_words, train_corpus=lm_corpus, heldout_corpus=heldout_lm_corpus, second_corpus=None) if pos_language_model: self.pos_lm = pos_language_model elif self.args.pos: print "No pos language model specified, \ using default switchboard one" lm_corpus_file = open(clean_model_dir + "/swbd_disf_train_1_clean.text") lines = [ line.strip("\n").split(",")[1] for 
line in lm_corpus_file if "POS," in line and not line.strip("\n") == "" ] split = int(0.9 * len(lines)) lm_corpus = "\n".join(lines[:split]) heldout_lm_corpus = "\n".join(lines[split:]) lm_corpus_file.close() self.pos_lm = KneserNeySmoothingModel( order=3, discount=0.7, partial_words=self.args.partial_words, train_corpus=lm_corpus, heldout_corpus=heldout_lm_corpus, second_corpus=None) if edit_language_model: self.edit_lm = edit_language_model else: edit_lm_corpus_file = open(clean_model_dir + "/swbd_disf_train_1_edit.text") edit_lines = [ line.strip("\n").split(",")[1] for line in edit_lm_corpus_file if "POS," not in line and not line.strip("\n") == "" ] edit_split = int(0.9 * len(edit_lines)) edit_lm_corpus = "\n".join(edit_lines[:edit_split]) heldout_edit_lm_corpus = "\n".join(edit_lines[edit_split:]) edit_lm_corpus_file.close() self.edit_lm = KneserNeySmoothingModel( train_corpus=edit_lm_corpus, heldout_corpus=heldout_edit_lm_corpus, order=2, discount=0.7) # TODO an object for getting the lm features incrementally # in the language model def init_model_from_config(self, args): # for feat, val in args._get_kwargs(): # print feat, val, type(val) if not test_if_using_GPU(): print "Warning: not using GPU, might be a bit slow" print "\tAdjust Theano config file ($HOME/.theanorc)" print "loading tag to index maps..." 
label_path = os.path.dirname(os.path.realpath(__file__)) +\ "/../data/tag_representations/{}_tags.csv".format(args.tags) word_path = os.path.dirname(os.path.realpath(__file__)) +\ "/../data/tag_representations/{}.csv".format(args.word_rep) pos_path = os.path.dirname(os.path.realpath(__file__)) +\ "/../data/tag_representations/{}.csv".format(args.pos_rep) self.tag_to_index_map = load_tags(label_path) self.word_to_index_map = load_tags(word_path) self.pos_to_index_map = load_tags(pos_path) self.model_type = args.model_type vocab_size = len(self.word_to_index_map.keys()) emb_dimension = args.emb_dimension n_hidden = args.n_hidden n_extra = args.n_language_model_features + args.n_acoustic_features n_classes = len(self.tag_to_index_map.keys()) self.window_size = args.window n_pos = len(self.pos_to_index_map.keys()) update_embeddings = args.update_embeddings lr = args.lr print "Initializing model of type", self.model_type, "..." if self.model_type == 'elman': model = Elman(ne=vocab_size, de=emb_dimension, nh=n_hidden, na=n_extra, n_out=n_classes, cs=self.window_size, npos=n_pos, update_embeddings=update_embeddings) self.initial_h0_state = model.h0.get_value() self.initial_c0_state = None elif self.model_type == 'lstm': model = LSTM(ne=vocab_size, de=emb_dimension, n_lstm=n_hidden, na=n_extra, n_out=n_classes, cs=self.window_size, npos=n_pos, lr=lr, single_output=True, cost_function='nll') self.initial_h0_state = model.h0.get_value() self.initial_c0_state = model.c0.get_value() else: raise NotImplementedError('No model init for {0}'.format( self.model_type)) return model def load_model_params_from_folder(self, model_folder, model_type): if model_type in ["lstm", "elman"]: self.model.load_weights_from_folder(model_folder) self.initial_h0_state = self.model.h0.get_value() if model_type == "lstm": self.initial_c0_state = self.model.c0.get_value() else: raise NotImplementedError( 'No weight loading for {0}'.format(model_type)) def load_embeddings(self, embeddings_name): # 
load pre-trained embeddings embeddings_dir = os.path.dirname(os.path.realpath(__file__)) +\ "/../embeddings/" pretrained = gensim.models.Word2Vec.load(embeddings_dir + embeddings_name) print "emb shape", pretrained[pretrained.index2word[0]].shape # print pretrained[0].shape # assign and fill in the gaps emb = populate_embeddings(self.args.emb_dimension, len(self.word_to_index_map.items()), self.word_to_index_map, pretrained) self.model.load_weights(emb=emb) def standardize_word_and_pos( self, word, pos=None, proper_name_pos_tags=["NNP", "NNPS", "CD", "LS", "SYM", "FW"]): word = word.lower() if not pos and self.pos_tagger: pos = self.pos_tagger.tag([]) # TODO if pos: pos = pos.upper() if pos in proper_name_pos_tags and "$unc$" not in word: word = "$unc$" + word if self.pos_to_index_map.get(pos) is None: # print "unknown pos", pos pos = "<unk>" if self.word_to_index_map.get(word) is None: # print "unknown word", word word = "<unk>" return word, pos def tag_new_word(self, word, pos=None, timing=None, extra=None, diff_only=True, rollback=0): """Tag new incoming word and update the word and tag graphs. :param word: the word to consume/tag :param pos: the POS tag to consume/tag (optional) :param timing: the duration of the word (optional) :param diff_only: whether to output only the diffed suffix, if False, outputs entire output tags :param rollback: the number of words to rollback in the case of changed word hypotheses from an ASR """ self.rollback(rollback) if pos is None and self.args.pos: # if no pos tag provided but there is a pos-tagger, tag word test_words = [ unicode(x) for x in get_last_n_features( "words", self.word_graph, len(self.word_graph) - 1, n=4) ] + [unicode(word.lower())] pos = self.pos_tagger.tag(test_words)[-1][1] # print "tagging", word, "as", pos # 0. Add new word to word graph word, pos = self.standardize_word_and_pos(word, pos) # print "New word:", word, pos self.word_graph.append((word, pos, timing)) # 1. 
load the saved internal rnn state # TODO these nets aren't (necessarily) trained statefully # The internal state in training self.args.bs words back # are the inital ones in training, however here # They are the actual state reached. if self.state_history == []: c0_state = self.initial_c0_state h0_state = self.initial_h0_state else: if self.model_type == "lstm": c0_state = self.state_history[-1][0][-1] h0_state = self.state_history[-1][1][-1] elif self.model_type == "elman": h0_state = self.state_history[-1][-1] if self.model_type == "lstm": self.model.load_weights(c0=c0_state, h0=h0_state) elif self.model_type == "elman": self.model.load_weights(h0=h0_state) else: raise NotImplementedError("no history loading for\ {0} model".format(self.model_type)) # 2. do the softmax output with converted inputs word_window = [ self.word_to_index_map[x] for x in get_last_n_features("words", self.word_graph, len(self.word_graph) - 1, n=self.window_size) ] pos_window = [ self.pos_to_index_map[x] for x in get_last_n_features("POS", self.word_graph, len(self.word_graph) - 1, n=self.window_size) ] # print "word_window, pos_window", word_window, pos_window if self.model_type == "lstm": h_t, c_t, s_t = self.model.\ soft_max_return_hidden_layer([word_window], [pos_window]) self.softmax_history.append(s_t) if len(self.state_history) == 20: # just saving history self.state_history.pop(0) # pop first one self.state_history.append((c_t, h_t)) elif self.model_type == "elman": h_t, s_t = self.model.soft_max_return_hidden_layer([word_window], [pos_window]) self.softmax_history.append(s_t) if len(self.state_history) == 20: self.state_history.pop(0) # pop first one self.state_history.append(h_t) else: raise NotImplementedError("no softmax implemented for\ {0} model".format(self.model_type)) softmax = np.concatenate(self.softmax_history) # 3. 
do the decoding on the softmax if "disf" in self.args.tags: edit_tag = "<e/><cc>" if "uttseg" in self.args.tags else "<e/>" # print self.tag_to_index_map[edit_tag] adjustsoftmax = np.concatenate( (softmax, softmax[:, self.tag_to_index_map[edit_tag]].reshape( softmax.shape[0], 1)), 1) else: adjustsoftmax = softmax last_n_timings = None if ((not self.args.use_timing_data) or not timing) \ else get_last_n_features("timings", self.word_graph, len(self.word_graph)-1, n=3) new_tags = self.decoder.viterbi_incremental( adjustsoftmax, a_range=(len(adjustsoftmax) - 1, len(adjustsoftmax)), changed_suffix_only=True, timing_data=last_n_timings, words=[word]) # print "new tags", new_tags prev_output_tags = deepcopy(self.output_tags) self.output_tags = self.output_tags[:len(self.output_tags) - (len(new_tags) - 1)] + new_tags # 4. convert to standardized output format if "simple" in self.args.tags: for p in range( len(self.output_tags) - (len(new_tags) + 1), len(self.output_tags)): rps = self.output_tags[p] self.output_tags[p] = rps.replace('rm-0', 'rps id="{}"'.format(p)) if "<i" in self.output_tags[p]: self.output_tags[p] = self.output_tags[p].\ replace("<e/>", "").replace("<i", "<e/><i") else: # new_words = [word] words = get_last_n_features("words", self.word_graph, len(self.word_graph) - 1, n=len(self.word_graph) - (self.window_size - 1)) self.output_tags = convert_from_inc_disfluency_tags_to_eval_tags( self.output_tags, words, start=len(self.output_tags) - (len(new_tags)), representation=self.args.tags) if diff_only: for i, old_new in enumerate(zip(prev_output_tags, self.output_tags)): old, new = old_new if old != new: return self.output_tags[i:] return self.output_tags[len(prev_output_tags):] return self.output_tags def tag_utterance(self, utterance): """Tags entire utterance, only possible on models trained on unsegmented data. 
""" if not self.args.utts_presegmented: raise NotImplementedError("Tagger trained on unsegmented data,\ please call tag_prefix(words) instead.") # non segmenting self.reset() # always starts in initial state if not self.args.pos: # no pos tag model utterance = [(w, None, t) for w, p, t in utterance] # print "Warning: not using pos tags as not pos tag model" if not self.args.use_timing_data: utterance = [(w, p, None) for w, p, t in utterance] # print "Warning: not using timing durations as no timing model" for w, p, t in utterance: if self.args.pos: self.tag_new_word(w, pos=p, timing=t) return self.output_tags def rollback(self, backwards): super(DeepDisfluencyTagger, self).rollback(backwards) self.state_history = self.state_history[:len(self.state_history) - backwards] self.softmax_history = self.softmax_history[:len(self.softmax_history ) - backwards] self.decoder.rollback(backwards) def init_deep_model_internal_state(self): if self.model_type == "lstm": self.model.load_weights(c0=self.initial_c0_state, h0=self.initial_h0_state) elif self.model_type == "elman": self.model.load_weights(h0=self.initial_h0_state) def reset(self): super(DeepDisfluencyTagger, self).reset() self.word_graph = [("<s>", "<s>", 0)] * \ (self.window_size - 1) self.state_history = [] self.softmax_history = [] self.decoder.viterbi_init() self.init_deep_model_internal_state() def evaluate_fast_from_matrices(self, validation_matrices, tag_file, idx_to_label_dict): output = [] true_y = [] for v in validation_matrices: words_idx, pos_idx, extra, y, indices = v if extra: output.extend( self.model.classify_by_index(words_idx, indices, pos_idx, extra)) else: output.extend( self.model.classify_by_index(words_idx, indices, pos_idx)) true_y.extend(y) p_r_f_tags = precision_recall_fscore_support(true_y, output, average='macro') tag_summary = classification_report( true_y, output, labels=[i for i in xrange(len(idx_to_label_dict.items()))], target_names=[ idx_to_label_dict[i] for i in 
xrange(len(idx_to_label_dict.items())) ]) print tag_summary results = { "f1_rmtto": p_r_f_tags[2], "f1_rm": p_r_f_tags[2], "f1_tto1": p_r_f_tags[2], "f1_tto2": p_r_f_tags[2] } results.update({'f1_tags': p_r_f_tags[2], 'tag_summary': tag_summary}) return results def train_net(self, train_dialogues_filepath=None, validation_dialogues_filepath=None, model_dir=None, tag_accuracy_file_path=None): """Train the internal deep learning model from a list of dialogue matrices. """ tag_accuracy_file = open(tag_accuracy_file_path, "a") print "Verifying files..." for filepath in [ train_dialogues_filepath, validation_dialogues_filepath ]: if not verify_dialogue_data_matrices_from_folder( filepath, word_dict=self.word_to_index_map, pos_dict=self.pos_to_index_map, tag_dict=self.tag_to_index_map, n_lm=self.args.n_language_model_features, n_acoustic=self.args.n_acoustic_features): raise Exception("Dialogue vectors in wrong format!\ See README.md.") lr = self.args.lr # even if decay, start with specific lr n_extra = self.args.n_language_model_features + \ self.args.n_acoustic_features # validation matrices filepath much smaller so can store these # and preprocess them all: validation_matrices = [ np.load(validation_dialogues_filepath + "/" + fp) for fp in os.listdir(validation_dialogues_filepath) ] validation_matrices = [ dialogue_data_and_indices_from_matrix( d_matrix, n_extra, pre_seg=self.args.utts_presegmented, window_size=self.window_size, bs=self.args.bs, tag_rep=self.args.tags, tag_to_idx_map=self.tag_to_index_map, in_utterances=self.args.utts_presegmented) for d_matrix in validation_matrices ] idx_2_label_dict = {v: k for k, v in self.tag_to_index_map.items()} if not os.path.exists(model_dir): os.mkdir(model_dir) start = 1 # by default start from the first epoch best_score = 0 best_epoch = 0 print "Net training started..." 
for e in range(start, self.args.n_epochs + 1): tic = time.time() epoch_folder = model_dir + "/epoch_{}".format(e) if not os.path.exists(epoch_folder): os.mkdir(epoch_folder) train_loss = 0 # TODO IO is slow, where the memory allows do in one load_separately = True test = False if load_separately: for i, dialogue_f in enumerate( os.listdir(train_dialogues_filepath)): if test and i > 3: break print dialogue_f d_matrix = np.load(train_dialogues_filepath + "/" + dialogue_f) word_idx, pos_idx, extra, y, indices = \ dialogue_data_and_indices_from_matrix( d_matrix, n_extra, window_size=self.window_size, bs=self.args.bs, pre_seg=self.args.utts_presegmented ) # for i in range(len(indices)): # print i, word_idx[i], pos_idx[i], \ # y[i], indices[i] train_loss += self.model.fit(word_idx, y, lr, indices, pos_idx=pos_idx, extra_features=extra) print '[learning] file %i >>' % (i+1),\ 'completed in %.2f (sec) <<\r' % (time.time() - tic) # save the initial states we've learned to override the random self.initial_h0_state = self.model.h0.get_value() if self.args.model_type == "lstm": self.initial_c0_state = self.model.c0.get_value() # reset and evaluate simply self.reset() results = self.evaluate_fast_from_matrices( validation_matrices, tag_accuracy_file, idx_to_label_dict=idx_2_label_dict) val_score = results['f1_tags'] #TODO get best score type print "epoch training loss", train_loss print '[learning] epoch %i >>' % (e),\ 'completed in %.2f (sec) <<\r' % (time.time() - tic) print "validation score", val_score tag_accuracy_file.write( str(e) + "\n" + results['tag_summary'] + "\n%%%%%%%%%%\n") tag_accuracy_file.flush() print "saving model..." 
self.model.save(epoch_folder) # Epoch file dump # checking patience and decay, if applicable # stopping criterion if val_score > best_score: self.model.save(model_dir) best_score = val_score print 'NEW BEST raw labels at epoch ', e, 'best valid',\ best_score best_epoch = e # stopping criteria = if no improvement in 10 epochs if e - best_epoch >= 10: print "stopping, no improvement in 10 epochs" break if self.args.decay and (e - best_epoch) > 1: # just a steady decay if things aren't improving for 2 epochs # a hidden hyperparameter decay_rate = 0.85 lr *= decay_rate print "learning rate decayed, now ", lr if lr < 1e-5: print "stopping, below learning rate threshold" break print '[learning and testing] epoch %i >>' % (e),\ 'completed in %.2f (sec) <<\r' % (time.time()-tic) print 'BEST RESULT: epoch', best_epoch, 'valid score', best_score tag_accuracy_file.close() return best_epoch def incremental_output_from_file(self, source_file_path, target_file_path=None, is_asr_results_file=False): """Return the incremental output in an increco style given the incoming words + POS. 
E.g.: Speaker: KB3_1 Time: 1.50 KB3_1:1 0.00 1.12 $unc$yes NNP <f/><tc/> Time: 2.10 KB3_1:1 0.00 1.12 $unc$yes NNP <rms id="1"/><tc/> KB3_1:2 1.12 2.00 because IN <rps id="1"/><cc/> Time: 2.5 KB3_1:2 1.12 2.00 because IN <rps id="1"/><rpndel id="1"/><cc/> from an ASR increco style input without the POStags: or a normal style disfluency dectection ground truth corpus: Speaker: KB3_1 KB3_1:1 0.00 1.12 $unc$yes NNP <rms id="1"/><tc/> KB3_1:2 1.12 2.00 $because IN <rps id="1"/><cc/> KB3_1:3 2.00 3.00 because IN <f/><cc/> KB3_1:4 3.00 4.00 theres EXVBZ <f/><cc/> KB3_1:6 4.00 5.00 a DT <f/><cc/> KB3_1:7 6.00 7.10 pause NN <f/><cc/> :param source_file_path: str, file path to the input file :param target_file_path: str, file path to output in the above format :param is_asr_results_file: bool, whether the input is increco style """ if target_file_path: target_file = open(target_file_path, "w") if not self.args.do_utt_segmentation: print "not doing utt seg, using pre-segmented file" if is_asr_results_file: return NotImplementedError if 'timings' in source_file_path: print "input file has timings" if not is_asr_results_file: dialogues = [] IDs, timings, words, pos_tags, labels = \ get_tag_data_from_corpus_file(source_file_path) for dialogue, a, b, c, d in zip(IDs, timings, words, pos_tags, labels): dialogues.append((dialogue, (a, b, c, d))) else: print "no timings in input file, creating fake timings" raise NotImplementedError for speaker, speaker_data in dialogues: # if "4565" in speaker: quit() print speaker self.reset() # reset at the beginning of each dialogue if target_file_path: target_file.write("Speaker: " + str(speaker) + "\n\n") timing_data, lex_data, pos_data, labels = speaker_data # iterate through the utterances # utt_idx = -1 current_time = 0 for i in range(0, len(timing_data)): # print i, timing_data[i] _, end = timing_data[i] if (not self.args.do_utt_segmentation) \ and "<t" in labels[i]: self.reset() # reset after each utt if non pre-seg # utt_idx = frames[i] 
timing = None if 'timings' in source_file_path and self.args.use_timing_data: timing = end - current_time word = lex_data[i] pos = pos_data[i] diff = self.tag_new_word(word, pos, timing, diff_only=True, rollback=0) current_time = end if target_file_path: target_file.write("Time: " + str(current_time) + "\n") new_words = lex_data[i - (len(diff) - 1):i + 1] new_pos = pos_data[i - (len(diff) - 1):i + 1] new_timings = timing_data[i - (len(diff) - 1):i + 1] for t, w, p, tag in zip(new_timings, new_words, new_pos, diff): target_file.write("\t".join( [str(t[0]), str(t[1]), w, p, tag])) target_file.write("\n") target_file.write("\n") target_file.write("\n") def train_decoder(self, tag_file): raise NotImplementedError def save_decoder_model(self, dir_path): raise NotImplementedError
class SimpleSLU:
    """Speech-act classifier plus semantic (BIO) tagger for utterances.

    Training instances are collected with :meth:`add_instance`, both models
    are fitted and persisted by :meth:`train`, restored by :meth:`load_model`
    and applied to new utterances by :meth:`pred`.

    NOTE(review): the code calls ``unicode(...)``, so it presumably targets
    Python 2 -- confirm before running it under Python 3.
    """

    def __init__(self):
        # One list of (word, label) pairs per utterance, fed to the CRF tagger.
        self.__semantic_instance_list = []
        # One (space-joined lower-cased words, sorted label list) per utterance.
        self.__speech_act_instance_list = []
        self.__semantic_model = None
        self.__speech_act_model = None
        self.__speech_act_lb = None

    def load_model(self, modelfile):
        """Restore the models written by :meth:`train`.

        :param modelfile: path prefix; ``<prefix>.act.model`` and
            ``<prefix>.semantic.model`` are read.
        :return: always ``True``.
        """
        # train() writes this pickle with 'wb', so it has to be read back in
        # binary mode; text mode corrupts or rejects the byte stream.
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file -- only load model files from a trusted source.
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        """Add one annotated utterance to the training data.

        :param utter: raw utterance text.
        :param speech_act: iterable of dicts with keys ``'act'`` and
            ``'attributes'``.
        :param semantic_tagged: iterable of tagged text pieces; they are
            joined with spaces before parsing.
        :return: ``False`` when the raw and tagged texts cannot be aligned
            (the instance is dropped), ``True`` otherwise.
        """
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                # The last 'cat' attribute wins; it stays None when absent.
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append(
                (unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        # Unique "<act>_<attribute>" labels, sorted for a stable order.
        sa_label_list = sorted({
            '%s_%s' % (sa['act'], attr)
            for sa in speech_act
            for attr in sa['attributes']})

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))
        return True

    def train(self, modelfile):
        """Fit both models on the collected instances and save them.

        :param modelfile: path prefix; ``<prefix>.act.model`` (pickle) and
            ``<prefix>.semantic.model`` (CRF model) are written.
        """
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(
            self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        """Predict speech acts and semantic tags for one utterance.

        :param utter: raw utterance text.
        :return: tuple ``(pred_act, pred_semantic)`` -- the inverse-transformed
            speech-act prediction and the CRF tagger output for the
            lower-cased tokens.
        """
        tokenized = self.__tokenize(utter)
        lowered = [word.lower() for word, _ in tokenized]
        word_feats = ' '.join(lowered)

        pred_act = self.__speech_act_lb.inverse_transform(
            self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag(lowered)
        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        """Tokenize ``utter``, optionally aligning it with its tagged form.

        :return: list of ``(word, tag)`` pairs; ``tag`` is ``None`` for every
            word when ``semantic_tagged`` is omitted.  Returns ``None`` when
            the character sequences of the raw and tagged texts differ.
        """
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                # A word boundary exists wherever either text has one.
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
# Clean, tokenize, POS-tag and lemmatize every training sentence; the result
# is one list of lemmas per input sentence.
X_train_preprocessed = []
print('\n\nPreprocessing The Data .......\n\n')
for sent in X_train:
    # Text-level cleanup: emojis to text, lower-case, then drop links,
    # @-mentions and the literal token 'url'.
    text = emoji.demojize(sent).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\S+", "", text)
    text = text.replace('url', '')

    words = Convert_Short_Hands(tokenizer.tokenize(text))

    sent_processed = []
    for token, pos in POS_tagger.tag(words):
        # Replace the token through the apostrophe dictionary when listed.
        token = apost_Dict.get(token, token)
        token = remove_PunctuationAndNum(token).lower()
        if token == "":
            # Nothing left after stripping punctuation and digits.
            continue
        # The first character of the POS tag selects the lemmatizer category.
        sent_processed.append(lemmatizer.lemmatize(token, tag_map[pos[0]]))
    X_train_preprocessed.append(sent_processed)

t = keras.preprocessing.text.Tokenizer()
# Load the saved CRF model and collect predicted / gold tags for the last
# 30% of the Brown corpus.
ct = CRFTagger()
ct.set_model_file("model.crf.tagger")

brown_sents = brown.sents()
size = int(len(brown_sents) * 0.7)
test_sents = brown_sents[size:]

# NOTE(review): every test sentence is concatenated and tagged as a single
# sequence, so sentence boundaries are not visible to the tagger -- confirm
# this is intended.
flat_list = [item for sublist in test_sents for item in sublist]
l = ct.tag(flat_list)
y_pred = [pair[1] for pair in l]

# Gold tags for the same slice, flattened in the same order.
tagged_sents = brown.tagged_sents(tagset="universal")[size:]
y_true = [pair[1] for sentence in tagged_sents for pair in sentence]
from nltk.tag import CRFTagger

# Two pre-trained CRF models are loaded and applied to the same input.
crflan = CRFTagger()
crf = CRFTagger()
crflan.set_model_file('model.crf.tagger')
crf.set_model_file('model1.crf.tagger')

# Single-argument print(...) behaves the same as the print statement here.
print("Give a sentence...")

# Read one sentence from stdin (Python 2: raw_input returns a byte string).
test_sent = raw_input()
# Decode the bytes once.  Calling .encode() on a byte string first performs
# an implicit ASCII decode and therefore crashes on any non-ASCII input.
# NOTE(review): assumes the terminal delivers UTF-8 -- confirm.
test_sent = test_sent.decode('utf-8').split(' ')
print(test_sent)

half_ans = crflan.tag(test_sent)
print(half_ans)

print(crf.tag(test_sent))
def onsentencelist():
    """Score the saved tweet NER CRF model and print the results.

    Reads ``sentencelist.pickle`` (NER-tagged sentences) and
    ``wordsentencelist.pickle`` (the same sentences without tags), loads the
    model ``tweetmodel.crf.tagger`` and prints the tagger accuracy, several
    F-scores and a confusion matrix.  Returns ``None``.
    """
    ner_labels = ["o", "b-per", "i-per", "b-loc", "i-loc",
                  "b-org", "i-org", "b-misc", "i-misc"]

    ct = CRFTagger()

    # NOTE(security): pickle.load can execute code from the file; these
    # pickles are assumed to be locally produced and trusted.
    # sentencelist contains NER-tagged sentences.
    with open('sentencelist.pickle', 'rb') as f:
        sentencelist = pickle.load(f)

    # Training size as a fraction; `limit` is the index where the data is
    # split.  int() keeps it usable as a slice bound when round() yields a
    # float.
    trainingsize = 0.9
    limit = int(round(trainingsize * len(sentencelist)))

    # wordsentencelist contains the same sentences, not NER-tagged.
    with open("wordsentencelist.pickle", "rb") as f:
        wordsentencelist = pickle.load(f)

    # Either train on the first part or load a saved model (choose one):
    # train_data = sentencelist[:limit]
    # ct.train(train_data, 'model.crf.tagger')
    ct.set_model_file('tweetmodel.crf.tagger')

    # Accuracy on the held-out part.  evaluate() strips the gold tags and
    # re-tags the sentences itself, so no separate tagging pass is needed.
    gold_sentences = sentencelist[limit:]
    print("\nAccuracy:", ct.evaluate(gold_sentences))

    # Flatten predicted and true tags (lower-cased) into parallel lists.
    # NOTE(review): these lists are built from the [:limit] slice while the
    # accuracy above uses [limit:] -- confirm which split the F-scores and
    # the confusion matrix are meant to describe.
    pred_nerlist = [
        nertag.lower()
        for tagged_sentence in ct.tag_sents(wordsentencelist[:limit])
        for (word, nertag) in tagged_sentence]
    true_nerlist = [
        nertag.lower()
        for sentence in sentencelist[:limit]
        for (word, nertag) in sentence]

    # Print F-scores and the confusion matrix.
    print("\nF-score (micro):",
          f1_score(true_nerlist, pred_nerlist, average='micro'))
    print("\nF-score (macro):",
          f1_score(true_nerlist, pred_nerlist, average='macro'))
    print("\nF-score (weigthed):",
          f1_score(true_nerlist, pred_nerlist, average='weighted'))
    print("\nF-score (None):",
          f1_score(true_nerlist, pred_nerlist, average=None,
                   labels=ner_labels))

    print("\nConfusion matrix:\n")
    for item in ["O", "B-per", "I-per", "B-loc", "I-loc",
                 "B-org", "I-org", "B-misc", "I-misc"]:
        print(" ", item, end="")
    print("\n", confusion_matrix(true_nerlist, pred_nerlist,
                                 labels=ner_labels))
class SimpleSLU:
    """Speech-act classifier plus semantic (BIO) tagger for utterances.

    Instances are gathered through :meth:`add_instance`; :meth:`train` fits
    and saves both models, :meth:`load_model` restores them and :meth:`pred`
    applies them to a new utterance.

    NOTE(review): ``unicode(...)`` is called below, so this presumably
    targets Python 2 -- confirm before running it under Python 3.
    """

    def __init__(self):
        # Per-utterance lists of (word, label) pairs for the CRF tagger.
        self.__semantic_instance_list = []
        # Per-utterance (joined lower-cased words, sorted label list) pairs.
        self.__speech_act_instance_list = []
        self.__semantic_model = None
        self.__speech_act_model = None
        self.__speech_act_lb = None

    def load_model(self, modelfile):
        """Restore the models written by :meth:`train`.

        :param modelfile: path prefix; ``<prefix>.act.model`` and
            ``<prefix>.semantic.model`` are read.
        :return: always ``True``.
        """
        # The pickle is written with 'wb' in train(); reading it in text
        # mode corrupts or rejects the byte stream, so open it as binary.
        # NOTE(security): pickle.load can run arbitrary code stored in the
        # file -- only load model files you trust.
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        """Add one annotated utterance to the training data.

        :param utter: raw utterance text.
        :param speech_act: iterable of dicts with keys ``'act'`` and
            ``'attributes'``.
        :param semantic_tagged: iterable of tagged text pieces, joined with
            spaces before parsing.
        :return: ``False`` if raw and tagged text cannot be aligned (nothing
            is stored), otherwise ``True``.
        """
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                # Keep the last 'cat' attribute; None when there is none.
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append(
                (unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        # Unique "<act>_<attribute>" labels in sorted order.
        sa_label_list = sorted({
            '%s_%s' % (sa['act'], attr)
            for sa in speech_act
            for attr in sa['attributes']})

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))
        return True

    def train(self, modelfile):
        """Fit both models on the collected instances and save them.

        :param modelfile: path prefix; ``<prefix>.act.model`` (pickle) and
            ``<prefix>.semantic.model`` (CRF model) are written.
        """
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        # NOTE(review): each element of sa_labels is a list of labels;
        # presumably this relies on a scikit-learn version whose
        # LabelBinarizer accepts such input -- confirm against the pinned
        # version.
        self.__speech_act_lb = preprocessing.LabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(
            self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        """Predict speech acts and semantic tags for one utterance.

        :param utter: raw utterance text.
        :return: tuple ``(pred_act, pred_semantic)`` -- the inverse-transformed
            speech-act prediction and the CRF tagger output for the
            lower-cased tokens.
        """
        tokenized = self.__tokenize(utter)
        lowered = [word.lower() for word, _ in tokenized]
        word_feats = ' '.join(lowered)

        pred_act = self.__speech_act_lb.inverse_transform(
            self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag(lowered)
        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        """Tokenize ``utter``, optionally aligning it with its tagged form.

        :return: list of ``(word, tag)`` pairs (``tag`` is ``None`` for all
            words when ``semantic_tagged`` is omitted), or ``None`` when the
            character sequences of raw and tagged text differ.
        """
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                # A boundary is kept wherever either text has one.
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
class NamedEntityChunker(ChunkParserI):
    """Chunker that tags POS-tagged sentences with a trained tagger.

    Two back ends are supported: ``"ClassifierBasedTagger"`` (using
    ``iob_features``) and ``"CRFTagger"`` (using :meth:`crf_features`, which
    additionally consults entity and acronym lists).
    """

    def __init__(self, train_sents=None, tagger="ClassifierBasedTagger",
                 model=None, model_name="../results/modelCRF_featured",
                 entities=None, language="english", **kwargs):
        """Train a new tagger or load a saved CRF model.

        :param train_sents: iterable of training sentences; required when
            ``model`` is not given.
        :param tagger: ``"ClassifierBasedTagger"`` or ``"CRFTagger"``.
        :param model: path of a saved CRF model to load instead of training.
        :param model_name: file name of the trained CRF model; it is appended
            to ``"../results/"``.
        :param entities: optional iterable of entity strings, passed to
            :meth:`set_entities` (CRF back end only).
        :param language: language for the stemmer and the data file names.
        :param kwargs: forwarded to ``ClassifierBasedTagger``.
        :raises Exception: when ``tagger`` is not a known back end.
        """
        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(
                train=train_sents,
                feature_detector=iob_features,
                **kwargs)
        elif tagger == "CRFTagger":
            self.set_entities(entities)
            self.tagger = CRFTagger(feature_func=self.crf_features)
            if not model:
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')

    def parse(self, tagged_sent):
        """Return the tagger output for one POS-tagged sentence."""
        chunks = self.tagger.tag(tagged_sent)
        return chunks

    def get_position(self, w):
        """Return the index of ``w`` within every known entity containing it."""
        return [e.index(w) for e in self.all_entities if w in e]

    def get_positions(self, tokens, index):
        """Return the distinct positions of a token within matching entities.

        An entity matches when it contains the token at ``index`` together
        with both of its neighbours.  ``index - 1`` wraps around to the last
        token when ``index`` is 0, and ``index + 1`` raises ``IndexError``
        for the last token, so callers should pass interior indices.
        """
        w = tokens[index][0]
        prev = tokens[index - 1][0]
        following = tokens[index + 1][0]
        positions = [
            e.index(w) for e in self.all_entities
            if w in e and prev in e and following in e]
        return list(set(positions))

    def set_entities(self, entities):
        """Populate ``all_entities`` / ``acronyms`` and sync them with disk.

        With ``entities`` given, each string is split on whitespace:
        single-word all-caps entries become (lower-cased) acronyms, everything
        else a lower-cased word tuple.  Both lists are de-duplicated and
        written to ``../data/entities_<language>.txt`` and
        ``../data/acronyms_<language>.txt``.  Without ``entities`` the lists
        are read back from those files.
        """
        entities_path = '../data/entities_{}.txt'.format(self.language)
        acronyms_path = '../data/acronyms_{}.txt'.format(self.language)

        if entities:
            for words in (entry.split() for entry in entities):
                if len(words) == 1 and is_all_caps(words[0]):
                    self.acronyms.append(words[0].lower())
                else:
                    self.all_entities.append([w.lower() for w in words])
            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))

            with open(entities_path, 'w') as f:
                f.write("\n".join(
                    [" ".join(line) for line in self.all_entities]))
            # One acronym per line: the format the loader below reads back.
            # Writing the entity list here would make the acronym file a
            # copy of the entity file.
            with open(acronyms_path, 'w') as f:
                f.write("\n".join(self.acronyms))
        else:
            with open(entities_path, 'r') as f:
                for line in f:
                    self.all_entities.append(line.strip().split())
            with open(acronyms_path, 'r') as f:
                for line in f:
                    self.acronyms.append(line.strip())
            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))

    def crf_features(self, tokens, index):
        """Build the CRF feature dict for one token.

        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for

        :return: dict mapping feature names to values.
        """
        # init the stemmer
        stemmer = SnowballStemmer(self.language)

        # Pad the sequence so that the context windows below never leave it.
        num_of_previous = 3
        num_of_posterior = 2
        padded = [
            ('[START{}]'.format(num_of_previous - i),
             '[START{}]'.format(num_of_previous - i))
            for i in range(num_of_previous)]
        padded += list(tokens)
        padded += [
            ('[END{}]'.format(i), '[END{}]'.format(i))
            for i in range(1, num_of_posterior + 1)]
        tokens = padded
        index += num_of_previous

        word, pos = tokens[index]

        contains_dash = ('–' in word or '-' in word or '_' in word)
        contains_dot = '.' in word

        # Bigram features joined with "_._".
        prev2_words = tokens[index - 2][0] + "_._" + tokens[index - 1][0]
        prev2_pos = tokens[index - 2][1] + "_._" + tokens[index - 1][1]

        prev1_words = tokens[index - 1][0] + "_._" + tokens[index][0]
        prev1_pos = tokens[index - 1][1] + "_._" + tokens[index][1]
        prev1_lemma = stemmer.stem(
            tokens[index - 1][0]) + "_._" + stemmer.stem(tokens[index][0])

        next1_words = tokens[index][0] + "_._" + tokens[index + 1][0]
        next1_pos = tokens[index][1] + "_._" + tokens[index + 1][1]

        next2_words = tokens[index + 1][0] + "_._" + tokens[index + 2][0]
        next2_pos = tokens[index + 1][1] + "_._" + tokens[index + 2][1]

        allcaps = is_all_caps(word)
        # word[:1] instead of word[0]: an empty token yields False rather
        # than raising IndexError.
        strange_cap = (word[:1] not in string.ascii_uppercase
                       and word != word.lower())

        # NOTE(review): all_entities holds tuples of words, so a plain string
        # is never a member and this looks like it is always False.  Left
        # unchanged because altering it would change the features of
        # already-trained models -- confirm the intent.
        inside_ent = word.lower() in self.all_entities
        is_acronym = word.lower() in self.acronyms

        features = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-caps': allcaps,
            'strange-cap': strange_cap,

            'prev2-pos': prev2_pos,
            'prev2-word': prev2_words,

            'next2-pos': next2_pos,
            'next2-word': next2_words,

            'prev1-pos': prev1_pos,
            'prev1-word': prev1_words,
            'prev1-lemma': prev1_lemma,

            'next1-pos': next1_pos,
            'next1-word': next1_words,
        }

        features['inside-entities'] = inside_ent

        if is_acronym:
            features['is-acronym'] = is_acronym

        positions = self.get_position(word.lower())
        for p in positions:
            features['position-{}'.format(p)] = True
        features['total-position-{}'.format(len(positions))] = True

        if contains_dash:
            features['contains-dash'] = contains_dash
        if contains_dot:
            features['contains-dot'] = contains_dot

        # Unigram context: the preceding tokens ...
        for i in range(1, num_of_previous + 1):
            ctx_word, ctx_pos = tokens[index - i]
            features['prev-{}-word'.format(i)] = ctx_word
            features['prev-{}-pos'.format(i)] = ctx_pos
            features['prev-{}-lemma'.format(i)] = stemmer.stem(ctx_word)

        # ... and the following tokens.
        for i in range(1, num_of_posterior + 1):
            ctx_word, ctx_pos = tokens[index + i]
            features['next-{}-word'.format(i)] = ctx_word
            features['next-{}-pos'.format(i)] = ctx_pos
            features['next-{}-inside-ent'.format(i)] = (
                ctx_word.lower() in self.all_entities)

        return features
test_data_new = [] test_data_tags = [] for i in range(len(test_set)): if len(test_set[i]) != 0: for j in range(len(test_set[i])): test_data_new.append(test_set[i][j][0]) test_data_tags.append(test_set[i][j][1]) gold_sentences = test_data_new # print ct.evaluate(gold_sentences) # print test_data_new pred_tags = [] refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) pred = ct.tag(gold_sentences) for i in range(len(pred)): pred_tags.append(pred[i][1]) for i in range(len(test_data_tags)): refsets[test_data_tags[i]].add(i) testsets[pred_tags[i]].add(i) print "CRF language model" print 'Accuracy:', accuracy(pred_tags, test_data_tags) print "\n" print 'Precision of en:', precision(refsets['en'], testsets['en']) print 'Precision of hi:', precision(refsets['hi'], testsets['hi']) print "\n" print 'Recall of en:', recall(refsets['en'], testsets['en']) print 'Recall of hi:', recall(refsets['hi'], testsets['hi'])
ct = CRFTagger() # initialize tagger ct.set_model_file(TAGGER_PATH) dialogue_speakers = [] for disf_file in DISFLUENCY_TEST_FILES: IDs, mappings, utts, pos_tags, labels = \ load_data_from_disfluency_corpus_file(disf_file) dialogue_speakers.extend(sort_into_dialogue_speakers(IDs, mappings, utts, pos_tags, labels)) word_pos_data = {} # map from the file name to the data for data in dialogue_speakers: dialogue, a, b, c, d = data word_pos_data[dialogue] = (a, b, c, d) ct.tag([unicode(w) for w in "uh my name is john".split()]) # either gather training data or test data training_data = [] for speaker in word_pos_data.keys(): # print speaker sp_data = [] prefix = [] predictions = [] for word, pos in zip(word_pos_data[speaker][1], word_pos_data[speaker][2]): prefix.append(unicode(word.replace("$unc$", "") .encode("utf8"))) prediction = ct.tag(prefix[-5:])[-1][1] sp_data.append((unicode(word.replace("$unc$", "") .encode("utf8")), unicode(pos.encode("utf8"))))