def test():
    """Evaluate the saved CRF model on the held-out test file and print a BIO report."""
    tagger = pycrfsuite.Tagger()
    tagger.open('crfsuite.model')
    test_file = os.path.join(os.path.dirname(__file__), '../data/test.txt')
    test_sents, test_tags = NnPreprocessor.read_tagged_file(test_file)
    feature_seqs = [sent2features(sent) for sent in test_sents]
    gold_labels = test_tags
    predicted = [tagger.tag(seq) for seq in feature_seqs]
    print(bio_classification_report(gold_labels, predicted))
def main(training_file, testing_file, model_file, ft):
    """Train a CRF with the feature template `ft`, evaluate it, and print a report.

    ft == "honnibal" short-circuits to a rule-based evaluation with no
    CRF training (Honnibal et al. special case).
    """
    start = time.time()
    # Get training and testing data. Entries where RESTR/NON-RESTR is not
    # defined (written as "_") are ignored by get_input.
    training_set = get_input(training_file)
    testing_set = get_input(testing_file)
    # Special training case for Honnibal et al.
    if ft == "honnibal":
        y_test = [MyClassifiers.get_labels(s) for s in testing_set]
        y_pred = [MyClassifiers.get_features(s, ft) for s in testing_set]
        # NOTE(review): argument order here (pred, test) differs from the
        # (test, pred) call at the end of this function — confirm which
        # order printReport actually expects.
        printReport(y_pred, y_test, ft)
        return
    # Feature/label sequences for the training set (comprehension replaces
    # the original append loop and its unused `count` counter).
    X_train = [MyClassifiers.get_features(s, ft) for s in training_set]
    y_train = [MyClassifiers.get_labels(s) for s in training_set]
    # Feature/label sequences for the testing set
    X_test = [MyClassifiers.get_features(s, ft) for s in testing_set]
    y_test = [MyClassifiers.get_labels(s) for s in testing_set]
    # Create and configure the CRF trainer
    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params(getTrainerFeatures(ft))
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
    # NOTE(review): the model_file parameter is unused; the model is saved
    # under the module-level model_file_loc instead — confirm intended.
    trainer.train(model_file_loc + ft + "Model.crfsuite")
    # Load the trained model for the prediction task
    trained_model = pycrfsuite.Tagger()
    trained_model.open(model_file_loc + ft + "Model.crfsuite")
    # Predict tags for the test set
    y_pred = [trained_model.tag(xseq) for xseq in X_test]
    # Print precision, recall and F1
    end = time.time()
    print('Runtime:', end - start)
    printReport(y_test, y_pred, ft)
def nerc(input_dir, model_file, output_file):
    """Tag every XML file in input_dir with the CRF at model_file and write
    the recognized entities to output_file."""
    input_files = os.listdir(input_dir)
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(model_file)
    # Fix: use a context manager — the original open() handle was never
    # closed, so output could be lost on an early error.
    with open(output_file, 'w') as out:
        for file in input_files:
            tree = parseXML(input_dir + '/' + file)
            for sentence in tree:
                (id, text, _) = get_sentence_info(sentence)
                tokens = tokenize(text)
                features = extract_features(tokens)
                classes = crf_tagger.tag(features)
                output_entities(id, tokens, classes, out)
def test_crf(file, tagbot=None):
    """Classify the sentences of an HTML file with a pre-trained CRF.

    Returns (text, X_test, Y_pred): the raw sentences, their feature
    sequences, and the predicted label sequences. When tagbot is None the
    model is loaded from 'crf_alltrain.model'.
    """
    text, indices = parse_html(file)
    length = len(indices)
    test_sentences = []
    for line in text:
        tokens = nltk.tokenize.word_tokenize(line)
        # Gold labels are unknown at test time; use '-' placeholders so the
        # sentences fit the (token, label) interface expected downstream.
        labels = [(token, '-') for token in tokens]
        test_sentences.append(labels)
    test_postag = crf_train.add_postag(test_sentences)
    # Each sentence's features include its relative position in the document.
    X_test = [
        crf_train.sentenceToFeatures(test_postag[i], indices[i] * 1.0 / length)
        for i in range(len(test_postag))
    ]
    tagger = tagbot
    if tagger is None:  # fix: identity comparison with None (was `== None`)
        tagger = pycrfsuite.Tagger()
        tagger.open('crf_alltrain.model')
    Y_pred = [tagger.tag(xseq) for xseq in X_test]
    # (removed a large commented-out evaluation section: gold labels are not
    # available here, so it could never have run)
    return text, X_test, Y_pred
def from_disk(self, model_path, tokenizer_list, *args, **kwargs):
    """Load the CRF model and pickled feature functions from model_path,
    then wire both into every tokenizer in tokenizer_list."""
    self.model_file = self.get_model_file(model_path)
    self.crf_tagger = pycrfsuite.Tagger()
    self.crf_tagger.open(self.model_file)
    char2feature_path = self.get_char2feature_file(model_path)
    with open(char2feature_path, 'rb') as pickled:
        self.feature_func_list = pickle.load(pickled)
    for tok in tokenizer_list:
        tok.assign_from_loader(crf_tagger=self.crf_tagger,
                               feature_func_list=self.feature_func_list)
def predict():
    """Tag the module-level testX sequences and return predictions with
    per-token marginal confidences.

    Returns (predictedY, testY, confidences, confidences_beam, model_info):
    confidences holds the marginal of the chosen tag at each position;
    confidences_beam holds the marginal of every tag in train.int2tags.
    """
    tagger = crf.Tagger()
    tagger.open(trained_model)
    predictedY = []
    confidences = []
    confidences_beam = []
    for xseq in testX:
        yseq = tagger.tag(xseq)
        predictedY.append(yseq)
        # Idiom: enumerate instead of range(len(...)) indexing.
        confidences.append([tagger.marginal(tag, i) for i, tag in enumerate(yseq)])
        confidences_beam.append([[tagger.marginal(tag, i) for tag in train.int2tags]
                                 for i in range(len(yseq))])
    return predictedY, testY, confidences, confidences_beam, tagger.info()
def prediction():
    """Load the MSR gold test set and print predicted vs. correct labels
    for one example sentence (index 100)."""
    gold_path = 'msr_test_gold.utf8'
    test_set = LoadData(gold_path)
    tagger = pycrfsuite.Tagger()
    tagger.open('msr.crfsuite')
    sample = test_set[100]
    print(sample)
    print(sent2tokens(sample))
    print(' '.join(sent2tokens(sample)), end='\n\n')
    print("Predicted:", ' '.join(tagger.tag(sent2features(sample))))
    print("Correct: ", ' '.join(sent2labels(sample)))
def __init__(self):
    """Load the pre-trained segmentation CRF and build its feature transformer."""
    self.model = pycrfsuite.Tagger()
    model_path = join(dirname(__file__), "model_9.bin")
    self.model.open(model_path)
    # Feature template: context-window lower/digit/title/dictionary checks,
    # word n-grams, and the two previous BI tags.
    template = [
        "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",
        "T[-1].isdigit", "T[0].isdigit", "T[1].isdigit",
        "T[-1].istitle", "T[0].istitle", "T[1].istitle",
        "T[0,1].istitle", "T[0,2].istitle",
        "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict",
        "T[1].is_in_dict", "T[2].is_in_dict",
        "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict",
        "T[1,2].is_in_dict", "T[-2,0].is_in_dict", "T[-1,1].is_in_dict",
        "T[0,2].is_in_dict",
        # word unigram and bigram and trigram
        "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
        "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        "T[-2,0]", "T[-1,1]", "T[0,2]",
        # BI tag
        "T[-2][1]", "T[-1][1]",
    ]
    self.transformer = CustomTransformer(template)
def get_confidence(self, sent):
    """Return [confidence] for the CRF's best tagging of `sent` (a
    whitespace-separated string).

    Confidence is the geometric mean of the sequence probability over its
    length; a fixed 0.2 is returned when no trained model exists yet.
    """
    tokens = sent.split()
    # sent2features expects (token, label) pairs; use a dummy label.
    pairs = [(tok, '') for tok in tokens]
    features = sent2features(pairs)
    # Check for the model before constructing a Tagger (the original built
    # the Tagger first and threw it away on this early return).
    if not os.path.isfile(self.model_file):
        return [0.2]
    tagger = pycrfsuite.Tagger()
    tagger.open(self.model_file)
    tagger.set(features)
    Y_pred = tagger.tag()
    # Robustness: an empty sentence yields an empty tagging; avoid the
    # ZeroDivisionError in the geometric mean below.
    if not Y_pred:
        return [0.2]
    p_y_pred = tagger.probability(Y_pred)
    confidence = pow(p_y_pred, 1. / len(Y_pred))
    return [confidence]
def __init__(self, model_path, model_name, save_path=None, start_iter=0):
    """Set up CRF trainer/tagger state and create the model (and optional
    save) directories.

    :param model_path: directory where trained models are written
    :param model_name: base name of the model
    :param save_path: optional extra output directory
    :param start_iter: iteration counter to resume self-training from
    """
    self.model_path = model_path
    self.model_name = model_name
    self.trainer = pycrfsuite.Trainer(verbose=False)
    self.tagger = pycrfsuite.Tagger()
    self.iter = start_iter
    # Fix: the original commented this assignment out but still read
    # self.save_path below, which raises AttributeError; restore it.
    self.save_path = save_path
    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)
    if self.save_path is not None:
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
    if st.DICTIONARY is True or st.SELF_ITER_N > 1:
        # Accumulators across self-training iterations (added for dictionary).
        self.X_total = []
        self.y_total = []
def predict(filepath='', x_test=None):
    """POS-tag each text in x_test with Twitter and label it with the CRF
    model at filepath; return the predicted label sequences.

    :param filepath: path to the trained crfsuite model
    :param x_test: iterable of raw text strings (defaults to empty)
    """
    # Fix: mutable default argument ([]); None behaves identically.
    if x_test is None:
        x_test = []
    twitter = Twitter()
    predicted_labels = []
    # Validate the model path before opening.
    is_filepath_existed(filepath)
    tagger = pycrfsuite.Tagger()
    tagger.open(filepath)
    for text in x_test:
        pos_pairs = twitter.pos(text, norm=True, stem=True)
        # Keep only (token, tag) regardless of extra tuple fields.
        sent = [(p[0], p[1]) for p in pos_pairs]
        predicted_labels.append(tagger.tag(sent2features(sent)))
    return predicted_labels
def test_sequenceLabeler_predict(self):
    """Smoke-test entity extraction: tag a sample sentence with the trained
    model and print the extracted entities."""
    print("test_sequenceLabeler_predict")
    global id
    global model_file
    sentence = "I want to book a cab from Beijing"
    tokens = word_tokenize(sentence)
    tagged = posTagger(sentence)
    tagger = pycrfsuite.Tagger()
    tagger.open(model_file)
    labels = tagger.tag(sequenceLabeler.sentToFeatures(tagged))
    entities = sequenceLabeler.extractEntities(zip(tokens, labels))
    print("extractedEntities:")
    print(entities)
def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
    """Train a CRF on the sentences of docs, save it to path, and open it
    as self.tagger.

    Each sentence contributes one (features, encoded-labels) pair; the
    sentence's index within its document is passed to the feature extractor.
    """
    trainer = pycrfsuite.Trainer(algorithm, verbose=False)
    trainer.set_params(params)
    encoder = self.encoder()
    for doc in docs:
        for sent_idx, sent in enumerate(doc.sents):
            tokens = list(sent)
            features = self.feature_extractor.extract(
                [str(token) for token in tokens], sent_idx)
            trainer.append(features, encoder.encode(tokens))
    trainer.train(path)
    self.tagger = pycrfsuite.Tagger()
    self.tagger.open(path)
def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
    """Train a CRF over the sentences of docs, save the model to path, and
    load it as self.tagger."""
    trainer = pycrfsuite.Trainer(algorithm, verbose=False)
    trainer.set_params(params)
    for doc in docs:
        for sent in doc.sents:
            sent_tokens = list(sent)
            sent_features = self.feature_extractor.extract(
                [tok.text for tok in sent_tokens])
            trainer.append(sent_features, self._encoder.encode(sent_tokens))
    trainer.train(path)
    self.tagger = pycrfsuite.Tagger()
    self.tagger.open(path)
def main():
    """Train a baseline CRF on all CSVs under sys.argv[1], tag the CSVs
    under sys.argv[2], and write the tags to sys.argv[3]."""
    inputdir = sys.argv[1]
    testdir = sys.argv[2]
    outputfile = sys.argv[3]
    x_list = []
    y_list = []
    for root, dirs, files in os.walk(inputdir):
        for filename in files:
            if filename.endswith(".csv"):
                filepath = os.path.abspath(os.path.join(root, filename))
                utterances = inputtool.get_utterances_from_filename(filepath)
                # extend replaces the original element-by-element append loops
                x_list.extend(sent2features(utterances))
                y_list.extend(sent2labels(utterances))
    trainer = pycrfsuite.Trainer(verbose=False)
    # NOTE(review): the whole corpus is appended as ONE sequence —
    # presumably intentional for this baseline; otherwise append one
    # sequence per dialogue. Confirm.
    trainer.append(x_list, y_list)
    trainer.set_params({
        'c1': 1,
        'c2': 1e-3,
        'max_iterations': 85,
        'feature.possible_states': True,
        'feature.possible_transitions': True
    })
    trainer.train('baseline.crfsuite')
    tagger = pycrfsuite.Tagger()
    tagger.open('baseline.crfsuite')
    # Fix: open in 'w' (the original opened in 'a' then truncated) and use a
    # context manager so the file is closed even on error.
    with open(outputfile, "w") as f:
        for root, dirs, files in os.walk(testdir):
            for filename in files:
                if filename.endswith(".csv"):
                    filepath = os.path.abspath(os.path.join(root, filename))
                    utterances = inputtool.get_utterances_from_filename(filepath)
                    outputlist = tagger.tag(sent2features(utterances))
                    f.write('Filename="')
                    f.write(filename)
                    f.write('"')
                    f.write('\n')
                    for y in outputlist:
                        f.write(y)
                        f.write('\n')
                    f.write('\n')
def get_summary(file):
    """Combine CRF sentence-type predictions with the k-mix-model ranking.

    Sentences are considered in descending k-mix score; each label may be
    picked at most twice. Selection stops once the summary exceeds
    SUMMARY_PERCENT of the document's word count. Returns the summary as a
    {sentence: label} dict.
    """
    with open(file, 'r') as f:
        text = f.readlines()
    # Total word count of the document (sentences in serialized order).
    doc_length = sum(len(line.split(' ')) for line in text)
    tagger = pycrfsuite.Tagger()
    tagger.open('crf_alltrain.model')
    text, X_test, Y_pred = crf_test.test_crf(file, tagger)
    # kmm maps each sentence id to its k-mix score.
    kmm = k_mix_model_test.KMM(file)
    kmix_sorted = sorted(kmm.items(), key=operator.itemgetter(1), reverse=True)
    visited = {}
    summary = {}
    for sentence_id, _score in kmix_sorted:
        label = Y_pred[sentence_id - 1][0]
        if label not in visited:
            summary[text[sentence_id - 1]] = label
            visited[label] = 1
        elif visited[label] == 2:
            continue
        else:
            visited[label] = 2
            summary[text[sentence_id - 1]] = label
        length = sum(len(key.split(' ')) for key in summary.keys())
        if length > SUMMARY_PERCENT * 0.01 * doc_length:
            break
    # Assemble category-ordered text (currently unused; the dict is returned).
    summary_txt = ''
    order = ['F', 'I', 'A', 'LR', 'SS', 'SP', 'SO', 'R']
    for category in order:
        summary_txt += ''.join(
            [key for key in summary if summary[key] == category]) + '\n'
    return summary
def main(training_file, testing_file, model_file):
    """Train a CRF on training_file, save it to model_file, evaluate it on
    testing_file, and print a BIO classification report."""
    start = time.time()
    # Load both data sets.
    training_set = get_input(training_file)
    testing_set = get_input(testing_file)
    # Feature/label sequences for both splits.
    X_train = [get_features(s) for s in training_set]
    y_train = [get_labels(s) for s in training_set]
    X_test = [get_features(s) for s in testing_set]
    y_test = [get_labels(s) for s in testing_set]
    # Configure and train the CRF.
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
    trainer.set_params({
        'c1': 0.5,               # coefficient for L1 penalty
        'c2': 1e-3,              # coefficient for L2 penalty
        'max_iterations': 1000,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train(model_file)
    print("Log of last iteration={}".format(trainer.logparser.iterations[-1]))
    # Load the trained model and tag the test set.
    trained_model = pycrfsuite.Tagger()
    trained_model.open(model_file)
    y_pred = [trained_model.tag(xseq) for xseq in X_test]
    # Report precision, recall and F1.
    print(bio_classification_report(y_test, y_pred))
    end = time.time()
    print('CRF model has been generated.')
    print('runtime:', end - start)
def predict(test_features, test_labels):
    """Tag each conversation in test_features, write the predicted labels to
    super_output.txt, and print overall accuracy against test_labels."""
    # Fix: pycrfsuite.Tagger (unlike Trainer) takes no `verbose` kwarg — the
    # original Tagger(verbose=False) call would raise TypeError.
    predictor = pycrfsuite.Tagger()
    predictor.open("super_advanced_dialog_act_tagger.crfsuite")
    correct_predictions = 0
    total_predictions = 0
    # Fix: context manager so the output file is closed even on error.
    with open("super_output.txt", "w+") as output_file:
        for conversation in range(len(test_features)):
            for label_index, predicted_label in enumerate(
                    predictor.tag(test_features[conversation])):
                if predicted_label == test_labels[conversation][label_index]:
                    correct_predictions += 1
                total_predictions += 1
                output_file.write(predicted_label + "\n")
            # Blank line between conversations.
            output_file.write("\n")
    print("Accuracy is ", (correct_predictions / total_predictions))
def __init__(self, debug=False):
    """Load the RDR tree and the NER CRF model; exit the process if the CRF
    model file is missing."""
    rdr_path = get_data_file("named.rdr", folder="vietnamese")
    self.__root = SCRDRTree()
    self.__root.constructSCRDRtreeFromRDRfile(rdr_path)
    crf_path = get_data_file("ner.crf.bin", folder="models")
    if not path.isfile(crf_path):
        logging.error("Model %s not found " % crf_path)
        print("Model %s not found " % crf_path)
        exit()
    self.__crf: pycrfsuite.Tagger = pycrfsuite.Tagger()
    self.__crf.open(crf_path)
    self.__nlp = None
    self.__debug = debug
    self.__adapter: DocFeatures = DocFeatures()
    logging.info("Labels in model(Semi_Supervised_Doc_Ner) : %s"
                 % str(self.__crf.labels()))
def predict(op_file):
    """Tag every entry of the module-level map_features dict, write the
    predicted tags to op_file, and return (correct, total) counts.

    map_features values are assumed to hold the feature sequence at index 0
    and the gold tags at index 1.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open('train.crfsuite')
    total = 0
    correct = 0
    # Fix: context manager guarantees the output file is closed.
    with open(op_file, 'w') as out:
        for name, entry in map_features.items():
            feats, gold_tags = entry[0], entry[1]
            tags = tagger.tag(feats)
            out.write("Filename=\"" + name + "\"\n")
            # Idiom: iterate predictions and gold labels in lockstep instead
            # of indexing with range(len(...)).
            for predicted, gold in zip(tags, gold_tags):
                total += 1
                if gold == predicted:
                    correct += 1
                out.write(predicted + "\n")
            out.write("\n")
    return correct, total
def Test(test_file):
    """Evaluate the PKU CRF model on the first 3000 pickled test sentences
    and print the evaluation result."""
    with open(test_file, 'rb') as rp:
        sentences = pickle.load(rp)
    sentences = sentences[:3000]
    X_test = [sent2features(s) for s in sentences]
    y_test = [sent2labels(s) for s in sentences]
    tagger = pycrfsuite.Tagger()
    tagger.open('PKU.crfsuite')
    y_pred = [tagger.tag(features) for features in X_test]
    res = Evaluation(y_test, y_pred)
    print(res)
def load_models(lang, dir=None):
    """Load the trie, CRF tagger, and lemmatiser for `lang` into the module
    globals trie/tagger/lemmatiser.

    :param lang: language code used as the model filename prefix
    :param dir: directory containing the model files
    """
    global trie
    global tagger
    global lemmatiser
    # Idiom fix: identity comparison with None (was `dir != None`).
    if dir is not None:
        reldir = dir
    # NOTE(review): when dir is None, reldir is unassigned and the lines
    # below raise UnboundLocalError — this matches the original behavior;
    # confirm callers always pass dir, or add a default directory.
    # Fix: context managers close the pickle file handles (the originals
    # were never closed).
    with open(os.path.join(reldir, lang + '.marisa'), 'rb') as f:
        trie = pickle.load(f)
    tagger = pycrfsuite.Tagger()
    tagger.open(os.path.join(reldir, lang + '.msd.model'))
    with open(os.path.join(reldir, lang + '.lexicon.guesser'), 'rb') as f:
        guesser_model = pickle.load(f)
    with open(os.path.join(reldir, lang + '.lexicon'), 'rb') as f:
        lexicon = pickle.load(f)
    lemmatiser = {'model': guesser_model, 'lexicon': lexicon}
def classifier(features):
    """
    Creates a list with predicted labels for each feature, using a
    previously instanciated learner.

    :param features: A list of feature vectors
    :returns: A list of predicted classes for each feature
    """
    tagger = pycrfsuite.Tagger()
    tagger.open(MODEL)
    # TODO: Placeholder, feature vector might need to be prepared
    return [tagger.tag(feature) for feature in features]
def pred(data):
    """Tag each document in `data` with the trained CRF model.

    :param data: list of documents, each a sequence of token records
    :return: list of predicted label sequences (list[list[str]])
    """
    # (removed unused inner helpers get_labels/get_token — dead code that
    # was never called)
    X_test = [[train_ashford.create_feature(d, i) for i in range(len(d))]
              for d in data]
    tagger = pycrfsuite.Tagger()
    tagger.open(modelM)
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    return y_pred
def load(cls, model_dir, model_name):
    # type: (Text, Text) -> CRFEntityExtractor
    """Restore a CRFEntityExtractor (tagger + config) from model_dir, or
    build an empty extractor when no model location is given."""
    import pycrfsuite

    # Guard clause: no model to load.
    if not (model_dir and model_name):
        return CRFEntityExtractor()
    ent_tagger = pycrfsuite.Tagger()
    ent_tagger.open(os.path.join(model_dir, 'ner', model_name))
    config_path = os.path.join(model_dir, 'ner', 'crf_config.json')
    config = json.load(io.open(config_path, 'r'))
    return CRFEntityExtractor(ent_tagger=ent_tagger,
                              crf_features=config['crf_features'],
                              BILOU_flag=config['BILOU_flag'])
def perform_k_fold(k, x_train, y_train):
    """Run k-fold cross-validation of the CRF and return the per-fold
    accuracy scores.

    :param k: number of folds
    :param x_train: list of feature sequences
    :param y_train: list of label sequences
    :return: list of k accuracy scores
    """
    kf = KFold(n_splits=k, shuffle=True)
    scores = []
    # Fix: the original called next(kf.split(x_train)) INSIDE the loop,
    # creating a fresh shuffled generator each iteration and always taking
    # its first fold — the data was never partitioned into k folds.
    # Iterating one split() generator yields each fold exactly once.
    for train_indices, test_indices in kf.split(x_train):
        x_training_data = [x_train[i] for i in train_indices]
        y_training_data = [y_train[i] for i in train_indices]
        x_testing_data = [x_train[i] for i in test_indices]
        y_testing_data = [y_train[i] for i in test_indices]
        trainer = pycrfsuite.Trainer(verbose=False)
        for xseq, yseq in zip(x_training_data, y_training_data):
            trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
        print("start training")
        trainer.train('advanced_model')
        print("finish training ")
        print(len(trainer.logparser.iterations),
              trainer.logparser.iterations[-1])
        tagger = pycrfsuite.Tagger()
        tagger.open('advanced_model')
        y_pred = [tagger.tag(xseq) for xseq in x_testing_data]
        flat_list_true = [item for sublist in y_testing_data for item in sublist]
        flat_list_pred = [item for sublist in y_pred for item in sublist]
        acc_score = accuracy_score(flat_list_true, flat_list_pred,
                                   normalize=True, sample_weight=None)
        scores.append(acc_score)
    # Fix: scores were computed but never returned.
    return scores
def extract(text):
    """Tag a free-text game request with the CRF and collect entity lists.

    Returns (sentence, entitylist): the tagged token data and a dict mapping
    genre/age/price/rating/characters to the extracted words.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open('model/recommend_game.crfsuite')
    # Treat " and" as a sentence separator, then split on periods.
    parts = text.replace(' and', '.').split('.')
    sentence = input_prep(parts)
    features = [sent2features(s) for s in sentence]
    tagList = [tagger.tag(feat) for feat in features]
    print(tagList)
    # Attach each non-O predicted tag to its token in place.
    for s_idx, tag_seq in enumerate(tagList):
        for w_idx, tag in enumerate(tag_seq):
            if tag != 'O':
                token = sentence[s_idx][w_idx]
                sentence[s_idx][w_idx] = (token[0], token[2], tag)
    ratingList = []
    genreList = []
    priceList = []
    ageList = []
    characterList = []
    # Bucket tagged tokens into entity lists.
    for sent in sentence:
        for word in sent:
            kind = word[2]
            if 'genre' in kind:
                genreList.append(word[0])
            elif 'age' in kind:
                if word[0].isdigit():
                    ageList.append(word[0])
            elif 'price' in kind:
                if 'free' in word[0]:
                    priceList.append('0')
                elif word[0].replace('$', '').isdigit():
                    priceList.append(word[0].replace('$', ''))
            elif 'rating' in kind:
                ratingList.append(word[0])
            elif 'character' in kind:
                characterList.append(word[0])
    entitylist = {
        'genre': genreList,
        'age': ageList,
        'price': priceList,
        'rating': ratingList,
        'characters': characterList
    }
    return sentence, entitylist
def do_test(reader):
    """Print predicted vs. gold labels for every test sentence and return
    (x_test, y_test) feature/label sequences."""
    test_sents = reader.iob_sents('test')
    x_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    tagger = pycrfsuite.Tagger()
    tagger.open('model.crfsuite')
    for sent in test_sents:
        sys.stdout.write('\t')
        print('\t'.join(sent2tokens(sent)))
        print("Predicted:\t", '\t'.join(tagger.tag(sent2features(sent))))
        print("Correct:\t", '\t'.join(sent2labels(sent)))
    return (x_test, y_test)
def tag_evaluate(test_sents, crfsuite_model, templates, k):
    """Tag test_sents with the CRF model at crfsuite_model, using the given
    attribute templates, and return the predicted label sequences.

    (A commented-out block here once wrote fold k's results to disk and
    scored them with conlleval.pl.)
    """
    x_test = [ccks.sent2attributes(s, templates) for s in test_sents]
    tagger = pycrfsuite.Tagger()
    tagger.open(crfsuite_model)
    y_pred = [tagger.tag(attrs) for attrs in x_test]
    return y_pred
def predict(self, model_name, sentence):
    """
    Predict NER labels for given model and query

    :param model_name:
    :param sentence:
    :return:
    """
    from app.nlu.tasks import pos_tagger

    tokens = word_tokenize(sentence)
    tagged = pos_tagger(sentence)
    tagger = pycrfsuite.Tagger()
    tagger.open("{}/{}.model".format(app.config["MODELS_DIR"], model_name))
    labels = tagger.tag(self.sent_to_features(tagged))
    return self.crf2json(zip(tokens, labels))