def process_X(self, data, word2idx, max_sentence_length):
    # Convert each sentence into a sequence of word indices and pad to a fixed length
    sentence_getter = SentenceGetter(data, label_adapter=get_label)
    X = [[word2idx[w[0]] for w in s] for s in sentence_getter.sentences]
    X = pad_sequences(maxlen=max_sentence_length, sequences=X, padding="post", value=word2idx["PAD"])
    return X
def process_Y(self, data, tag2idx, max_sentence_length, n_tags):
    # Convert each sentence's tags into index sequences, keep an unpadded copy,
    # then pad and one-hot encode (n_tags + 1 classes to account for PAD)
    sentence_getter = SentenceGetter(data, label_adapter=get_label)
    Y = [[tag2idx[w[1]] for w in s] for s in sentence_getter.sentences]
    Y_str = copy.deepcopy(Y)
    Y = pad_sequences(maxlen=max_sentence_length, sequences=Y, padding="post", value=tag2idx["PAD"])
    Y = np.array([to_categorical(i, num_classes=n_tags + 1) for i in Y])  # n_tags + 1 (PAD)
    return Y, Y_str
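For context, a minimal sketch of how these two methods might be driven. The class name ModelData, the train_data frame, its Word/Tag columns, and the chosen max_sentence_length are assumptions for illustration, not part of the original code.

# Hypothetical usage sketch (names and column layout are assumptions)
words = list(set(train_data["Word"].values)) + ["PAD", "UNK"]
tags = list(set(train_data["Tag"].values)) + ["PAD"]
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

model_data = ModelData()  # assumed class that defines process_X / process_Y
X = model_data.process_X(train_data, word2idx, max_sentence_length=75)
Y, Y_str = model_data.process_Y(train_data, tag2idx, max_sentence_length=75, n_tags=len(tags) - 1)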
import pandas as pd
import numpy as np
from utils import SentenceGetter
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

data = pd.read_csv("./data/ner_dataset.csv", encoding="utf8")
data = data.fillna(method="ffill")
sentences = SentenceGetter(data).sentences

# Features:
# 1. word
# 2. part-of-speech tag
# 3. comparison-candidate word (0 or 1)
# 4. heuristic position (0 or 1)
# 5. shallow syntax (chunk)
# Templates:
# 1. all features of the three words before and after the current word (5 * 7 = 35 total)
# 2. pairwise combinations of the current word's own features (7 total)
# 3. same-feature combinations of adjacent word pairs (10 * 3 = 30 total)
# 4. same-feature combinations within a window of 1 around the center word (4 * 3 = 12 total)
# Total features: 84
# Whether the word lies between a preposition and a comparison word
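The feature functions themselves live in utils and are not shown. A minimal sketch of what sent2features / sent2labels could look like for the word and POS part of the scheme above, in the usual sklearn_crfsuite dict-of-features style; the exact feature names and the omitted comparison/position/chunk features are assumptions, not the original implementation.

def word2features(sent, i):
    # sent is a list of (word, pos, label) tuples; build a feature dict for token i
    word, pos = sent[i][0], sent[i][1]
    features = {
        "bias": 1.0,
        "word.lower()": word.lower(),
        "word.istitle()": word.istitle(),
        "word.isdigit()": word.isdigit(),
        "postag": pos,
    }
    if i > 0:
        prev_word, prev_pos = sent[i - 1][0], sent[i - 1][1]
        features.update({"-1:word.lower()": prev_word.lower(), "-1:postag": prev_pos})
    else:
        features["BOS"] = True  # beginning of sentence
    if i < len(sent) - 1:
        next_word, next_pos = sent[i + 1][0], sent[i + 1][1]
        features.update({"+1:word.lower()": next_word.lower(), "+1:postag": next_pos})
    else:
        features["EOS"] = True  # end of sentence
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, _, label in sent]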
# CRF
import argparse
import joblib
from utils import load_data, SentenceGetter, sent2features, sent2labels
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

# 1. Load the data
ner_dataset_dir = '../data/ner_dataset.csv'
data = load_data(ner_dataset_dir)

# 2. Build the dataset
getter = SentenceGetter(data)
sentences = getter.sentences
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

# 3. Train the CRF and evaluate with 5-fold cross-validation
def train():
    crf = CRF(algorithm='lbfgs', c1=10, c2=0.1, max_iterations=100, all_possible_transitions=True)
    pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
    report = flat_classification_report(y_pred=pred, y_true=y)
    print(report)
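joblib is imported but not used in the excerpt above, so the trained model is presumably persisted after fitting. A hedged sketch of what that step might look like; the function name and the crf_model.joblib path are assumptions.

def train_and_save(model_path='crf_model.joblib'):
    # Fit on the full dataset and persist the model (path is illustrative)
    crf = CRF(algorithm='lbfgs', c1=10, c2=0.1, max_iterations=100, all_possible_transitions=True)
    crf.fit(X, y)
    joblib.dump(crf, model_path)
    return crf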
model = tf.keras.models.load_model(full_model_name)
model.summary()

file_directory = '../../sample_texts/inputs/'
for file_name in os.listdir(file_directory):
    file_path = f'{file_directory}/{file_name}'
    with open(file_path, mode='r') as file:
        test_file_content = file.read()
    # print(test_file_content)
    # test_file_content = re.sub(r"[^\w\s.\d]", "", test_file_content)

    # Tokenize the raw text and build word- and character-level model inputs
    test_words = word_tokenize(test_file_content)
    test_data = pd.DataFrame(test_words, index=test_words)
    X_te = model_instance.process_X(test_data, word2idx, max_sentence_length)
    X_char_te = get_char_indices(test_data, max_word_length, max_sentence_length, char2idx)

    # Record the unpadded length of each sentence so predictions on padding can be dropped
    sentence_getter = SentenceGetter(test_data, label_adapter=get_label)
    padding_start = [len(s) for s in sentence_getter.sentences]

    X_te = np.asarray(X_te).astype(np.float32)
    sentence_preds = model.predict([X_te, X_char_te])
    sentence_preds = [s[:padding_start_index] for s, padding_start_index in zip(sentence_preds, padding_start)]
    predicted_labels = [idx2tag[np.argmax(prob)] for sentence in sentence_preds for prob in sentence]

    # Collect consecutive B-FOOD / I-FOOD tokens into food entities
    food_entities = []
    all_sentences = list(test_data.index)
    food_pieces = []
    for index, label in enumerate(predicted_labels):
        if label == 'B-FOOD':
            food_pieces = [all_sentences[index]]
        elif label == 'I-FOOD' and len(food_pieces) > 0: