def process(self, utt):
    """Classify the dialogue act of *utt* and extract its concepts.

    Returns a tuple ``(da, conceptdic)`` where *da* is the predicted
    dialogue-act label and *conceptdic* maps concept types to surface
    strings decoded from the CRF's BIO tag sequence.
    """
    # Morphological analysis: one [surface, coarse POS, "O"] triple per token.
    tokens = []
    for parsed_line in self.mecab.parse(utt).splitlines():
        if parsed_line == "EOS":
            break
        fields = parsed_line.split("\t")
        surface = fields[0]
        # Coarse POS: first dash-separated part of the 4th feature field.
        coarse_pos = fields[4].split("-")[0]
        tokens.append([surface, coarse_pos, "O"])
    surfaces = [t[0] for t in tokens]

    # Dialogue-act classification: vectorize the space-joined tokens,
    # predict with the SVC, then map the numeric class back to its label.
    vec = self.vectorizer.transform([" ".join(surfaces)])
    pred = self.svc.predict(vec)
    da = self.label_encoder.inverse_transform(pred)[0]

    # Concept extraction: the CRF expects a batch of sentences, hence the
    # single-element list wrapped around `tokens`.
    crf_input = [sent2features(s) for s in [tokens]]
    labels = self.crf.predict(crf_input)[0]

    # Decode the BIO sequence into {concept_type: surface_string}.
    conceptdic = {}
    buf = ""
    last_label = ""
    for surface, label in zip(surfaces, labels):
        if label.startswith("B-"):
            # A new chunk starts: flush any chunk in progress first.
            if buf != "":
                key = last_label.replace('B-', '').replace('I-', '')
                conceptdic[key] = buf
            buf = surface
        elif label.startswith("I-"):
            buf += surface
        elif label == "O":
            if buf != "":
                key = last_label.replace('B-', '').replace('I-', '')
                conceptdic[key] = buf
            buf = ""
        last_label = label
    # Flush a chunk that runs to the end of the utterance.
    if buf != "":
        key = last_label.replace('B-', '').replace('I-', '')
        conceptdic[key] = buf
    return da, conceptdic
def extract_concept(utt):
    """Extract a {concept_type: surface_string} dict from *utt* via the CRF."""
    # Tokenize with MeCab; each entry is [surface, POS, placeholder label].
    tokens = []
    for parsed_line in mecab.parse(utt).splitlines():
        if parsed_line == "EOS":
            break
        surface, feature_str = parsed_line.split("\t")
        # POS is the first comma-separated feature field.
        postag = feature_str.split(',')[0]
        tokens.append([surface, postag, "O"])
    surfaces = [t[0] for t in tokens]

    # The CRF expects a batch, hence the single-element list around `tokens`.
    crf_input = [sent2features(s) for s in [tokens]]
    labels = crf.predict(crf_input)[0]

    # Decode the BIO tag sequence into a concept dictionary.
    conceptdic = {}
    buf = ""
    last_label = ""
    for surface, label in zip(surfaces, labels):
        if label.startswith("B-"):
            # New chunk: flush the one in progress, then start fresh.
            if buf != "":
                key = last_label.replace('B-', '').replace('I-', '')
                conceptdic[key] = buf
            buf = surface
        elif label.startswith("I-"):
            buf += surface
        elif label == "O":
            if buf != "":
                key = last_label.replace('B-', '').replace('I-', '')
                conceptdic[key] = buf
            buf = ""
        last_label = label
    # Flush a chunk that runs to the end of the utterance.
    if buf != "":
        key = last_label.replace('B-', '').replace('I-', '')
        conceptdic[key] = buf
    return conceptdic
def extract_concept(utt):
    """Extract a {concept_type: surface_string} dict from *utt* via the CRF.

    NOTE(review): this redefines `extract_concept` with a different MeCab
    feature layout (tab-separated fields instead of comma-separated) —
    confirm which dictionary/output format is actually in use.
    """
    # Tokenize with MeCab; each entry is [surface, coarse POS, placeholder].
    tokens = []
    for parsed_line in mecab.parse(utt).splitlines():
        if parsed_line == "EOS":
            break
        fields = parsed_line.split("\t")
        surface = fields[0]
        # Coarse POS: first dash-separated part of the 4th feature field.
        coarse_pos = fields[4].split("-")[0]
        tokens.append([surface, coarse_pos, "O"])
    surfaces = [t[0] for t in tokens]

    # The CRF expects a batch, hence the single-element list around `tokens`.
    crf_input = [sent2features(s) for s in [tokens]]
    labels = crf.predict(crf_input)[0]

    # Decode the BIO tag sequence into a concept dictionary.
    conceptdic = {}
    buf = ""
    last_label = ""
    for surface, label in zip(surfaces, labels):
        if label.startswith("B-"):
            # New chunk: flush the one in progress, then start fresh.
            if buf != "":
                key = last_label.replace("B-", "").replace("I-", "")
                conceptdic[key] = buf
            buf = surface
        elif label.startswith("I-"):
            buf += surface
        elif label == "O":
            if buf != "":
                key = last_label.replace("B-", "").replace("I-", "")
                conceptdic[key] = buf
            buf = ""
        last_label = label
    # Flush a chunk that runs to the end of the utterance.
    if buf != "":
        key = last_label.replace("B-", "").replace("I-", "")
        conceptdic[key] = buf
    return conceptdic
def ner_crf(sent, tagger):
    """Run the CRF *tagger* over *sent* and return its tag sequence."""
    # Convert the raw sentence to the tagger's feature representation first.
    features = sent2features(to_sent(sent))
    return tagger.tag(features)
# Train a CRF concept extractor from concept_samples.dat and save it.
#
# File format: one token per line as "word<TAB>postag<TAB>label"; a blank
# line terminates one training example.
sents = []  # fix: was appended to below without ever being initialized
lis = []
# fix: use a context manager so the file handle is always closed
with open("concept_samples.dat", "r") as fin:
    for line in fin:
        line = line.rstrip()
        if line == "":
            # A blank line completes one example; skip empty examples
            # produced by consecutive blank lines.
            if lis:
                sents.append(lis)
            lis = []
        else:
            word, postag, label = line.split('\t')
            lis.append([word, postag, label])
# fix: keep the final example when the file lacks a trailing blank line
if lis:
    sents.append(lis)

# Convert each token of each sentence into CRF features / labels.
X = [sent2features(s) for s in sents]
Y = [sent2labels(s) for s in sents]

# Train the CRF with L-BFGS and both L1/L2 regularization.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)
crf.fit(X, Y)

# Persist the trained model.
with open("crf.model", "wb") as f:
    dill.dump(crf, f)