Пример #1
0
    def process(self, utt):
        """Estimate the dialogue act of an utterance and extract its concepts.

        The utterance is tokenized with ``self.mecab``. The space-joined
        surface forms are vectorized and classified with ``self.svc`` to get
        the dialogue act, and ``self.crf`` labels every token so that spans
        tagged ``B-``/``I-`` can be collected into a dictionary.

        Args:
            utt: Utterance text handed to ``self.mecab.parse``.

        Returns:
            A ``(da, conceptdic)`` tuple. ``da`` is the first label returned
            by ``self.label_encoder.inverse_transform``. ``conceptdic`` maps a
            concept name (the label without its leading ``B-``/``I-``) to the
            concatenated words of the span; a later span with the same name
            overwrites an earlier one.
        """
        def strip_bio(label):
            # Remove only the leading "B-"/"I-" marker. Chained str.replace()
            # calls would also delete those characters from inside the
            # concept name itself (e.g. "B-SUB-x" -> "SUx").
            if label.startswith(("B-", "I-")):
                return label[2:]
            return label

        tokens = []
        for line in self.mecab.parse(utt).splitlines():
            if line == "EOS":
                break
            # Split once: column 0 is the surface form, the rest are features.
            word, *features = line.split("\t")
            # The part of features[3] before the first "-" is used as POS tag.
            postag = features[3].split("-")[0]
            tokens.append([word, postag, "O"])

        words = [tok[0] for tok in tokens]

        # Dialogue-act estimation on the space-joined tokens.
        X = self.vectorizer.transform([" ".join(words)])
        Y = self.svc.predict(X)
        # Convert the numeric prediction back to its label.
        da = self.label_encoder.inverse_transform(Y)[0]

        # predict() is given a one-sentence batch, hence the wrapping list and
        # the [0] on the result: one label per word.
        labels = self.crf.predict([sent2features(tokens)])[0]

        # Pair words with labels and fold each tagged span into the dict.
        conceptdic = {}
        buf = ""
        last_label = ""
        for word, label in zip(words, labels):
            if label.startswith("B-"):
                # A new span begins; store the one that was being built.
                if buf:
                    conceptdic[strip_bio(last_label)] = buf
                buf = word
            elif label.startswith("I-"):
                buf += word
            elif label == "O":
                if buf:
                    conceptdic[strip_bio(last_label)] = buf
                    buf = ""
            last_label = label

        # Store a span that runs up to the end of the utterance.
        if buf:
            conceptdic[strip_bio(last_label)] = buf

        return da, conceptdic
Пример #2
0
def extract_concept(utt):
    """Extract concepts from an utterance with MeCab and the CRF model.

    Each line of ``mecab.parse(utt)`` up to ``EOS`` is split into a surface
    form and a comma-separated feature string whose first field is used as
    the POS tag. ``crf`` labels every token, and spans tagged ``B-``/``I-``
    are collected into a dictionary.

    Args:
        utt: Utterance text handed to ``mecab.parse``.

    Returns:
        A dict mapping a concept name (the label without its leading
        ``B-``/``I-``) to the concatenated words of the span; a later span
        with the same name overwrites an earlier one.
    """
    def strip_bio(label):
        # Remove only the leading "B-"/"I-" marker. Chained str.replace()
        # calls would also delete those characters from inside the concept
        # name itself (e.g. "B-SUB-x" -> "SUx").
        if label.startswith(("B-", "I-")):
            return label[2:]
        return label

    tokens = []
    for line in mecab.parse(utt).splitlines():
        if line == "EOS":
            break
        word, feature_str = line.split("\t")
        postag = feature_str.split(',')[0]
        tokens.append([word, postag, "O"])

    words = [tok[0] for tok in tokens]

    # predict() is given a one-sentence batch, hence the wrapping list and
    # the [0] on the result: one label per word.
    labels = crf.predict([sent2features(tokens)])[0]

    # Pair words with labels and fold each tagged span into the dict.
    conceptdic = {}
    buf = ""
    last_label = ""
    for word, label in zip(words, labels):
        if label.startswith("B-"):
            # A new span begins; store the one that was being built.
            if buf:
                conceptdic[strip_bio(last_label)] = buf
            buf = word
        elif label.startswith("I-"):
            buf += word
        elif label == "O":
            if buf:
                conceptdic[strip_bio(last_label)] = buf
                buf = ""
        last_label = label

    # Store a span that runs up to the end of the utterance.
    if buf:
        conceptdic[strip_bio(last_label)] = buf

    return conceptdic
Пример #3
0
def extract_concept(utt):
    """Extract concepts from an utterance with MeCab and the CRF model.

    Each line of ``mecab.parse(utt)`` up to ``EOS`` is split on tabs: the
    first column is the surface form, and the part of the fourth feature
    column before its first ``-`` is used as the POS tag. ``crf`` labels
    every token, and spans tagged ``B-``/``I-`` are collected into a
    dictionary.

    Args:
        utt: Utterance text handed to ``mecab.parse``.

    Returns:
        A dict mapping a concept name (the label without its leading
        ``B-``/``I-``) to the concatenated words of the span; a later span
        with the same name overwrites an earlier one.
    """
    def strip_bio(label):
        # Remove only the leading "B-"/"I-" marker. Chained str.replace()
        # calls would also delete those characters from inside the concept
        # name itself (e.g. "B-SUB-x" -> "SUx").
        if label.startswith(("B-", "I-")):
            return label[2:]
        return label

    tokens = []
    for line in mecab.parse(utt).splitlines():
        if line == "EOS":
            break
        # Split once: column 0 is the surface form, the rest are features.
        word, *features = line.split("\t")
        postag = features[3].split("-")[0]
        tokens.append([word, postag, "O"])

    words = [tok[0] for tok in tokens]

    # predict() is given a one-sentence batch, hence the wrapping list and
    # the [0] on the result: one label per word.
    labels = crf.predict([sent2features(tokens)])[0]

    # Pair words with labels and fold each tagged span into the dict.
    conceptdic = {}
    buf = ""
    last_label = ""
    for word, label in zip(words, labels):
        if label.startswith("B-"):
            # A new span begins; store the one that was being built.
            if buf:
                conceptdic[strip_bio(last_label)] = buf
            buf = word
        elif label.startswith("I-"):
            buf += word
        elif label == "O":
            if buf:
                conceptdic[strip_bio(last_label)] = buf
                buf = ""
        last_label = label

    # Store a span that runs up to the end of the utterance.
    if buf:
        conceptdic[strip_bio(last_label)] = buf

    return conceptdic
Пример #4
0
def ner_crf(sent, tagger):
    """Tag ``sent`` with ``tagger``.

    ``sent`` is converted with ``to_sent``, turned into features with
    ``sent2features``, and the result of ``tagger.tag`` on those features is
    returned.
    """
    sentence = to_sent(sent)
    features = sent2features(sentence)
    return tagger.tag(features)
Пример #5
0
# Train a CRF concept tagger from concept_samples.dat and save it to crf.model.
# NOTE(review): `sents` is not defined in this excerpt; it is presumably
# initialised to an empty list earlier in the file - confirm.
lis = []

# Read concept_samples.dat; the context manager closes the file when done.
with open("concept_samples.dat", "r") as f:
    for line in f:
        line = line.rstrip()
        # A blank line ends one sample.
        if line == "":
            sents.append(lis)
            lis = []
        else:
            # Each line holds word, POS tag and label separated by tabs.
            word, postag, label = line.split('\t')
            lis.append([word, postag, label])

# Keep the final sample when the file does not end with a blank line;
# without this it would be silently dropped from the training data.
if lis:
    sents.append(lis)

# Convert the information of each word into features.
X = [sent2features(s) for s in sents]

# Label sequence of each sample.
Y = [sent2labels(s) for s in sents]

# Train the CRF.
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=100,
                           all_possible_transitions=False)
crf.fit(X, Y)

# Save the CRF model.
with open("crf.model", "wb") as f:
    dill.dump(crf, f)