Пример #1
0
import os

import re
import jieba

from random import shuffle

from util import load_word_re, load_type_re, load_pair, word_replace

# Relative locations of the dictionary resources loaded below.
path_stop_word = 'dict/stop_word.txt'
path_type_dir = 'dict/word_type'
# NOTE(review): the '**' in this filename looks like masking introduced when
# the snippet was extracted, not a real path -- confirm the actual file name.
path_homo = 'dict/h**o.csv'
path_syno = 'dict/syno.csv'
# Everything below is loaded once, at import time, through the project's
# `util` helpers; the exact return types are defined there (not visible here).
stop_word_re = load_word_re(path_stop_word)
word_type_re = load_type_re(path_type_dir)
homo_dict = load_pair(path_homo)
syno_dict = load_pair(path_syno)

# Register the project's custom vocabulary with jieba before any text is cut.
path_cut_word = 'dict/cut_word.txt'
jieba.load_userdict(path_cut_word)


def save_train(path, texts, labels):
    label_texts = dict()
    for text, label in zip(texts, labels):
        if label not in label_texts:
            label_texts[label] = list()
        cut_text = ' '.join(jieba.cut(text))
        label_texts[label].append(cut_text)
    head = 'label,cut_doc'
    with open(path, 'w') as f:
import json
import pickle as pk

import re

from util import load_word_re

# Relative locations of the word lists used by the feature predicates below.
path_pre_name = 'dict/pre_name.txt'
path_digit = 'dict/digit.txt'
# Patterns are built once at import time by `util.load_word_re`; they are
# later passed as the pattern argument to `re.findall`.
pre_name_re = load_word_re(path_pre_name)
digit_re = load_word_re(path_digit)

# Path of a pickle file; presumably a label-to-index mapping -- its use is
# not visible in this part of the file, TODO confirm.
path_label_ind = 'feat/label_ind.pkl'


def include_pre_name(word):
    """Return whether ``word`` contains a match for ``pre_name_re``.

    Args:
        word: The string to scan.

    Returns:
        True if the module-level ``pre_name_re`` pattern matches anywhere in
        ``word``, otherwise False.
    """
    # Only existence matters, so stop at the first match instead of
    # collecting every match with findall.
    return re.search(pre_name_re, word) is not None


def include_digit(word):
    """Return whether ``word`` contains a match for ``digit_re``.

    Args:
        word: The string to scan.

    Returns:
        True if the module-level ``digit_re`` pattern matches anywhere in
        ``word``, otherwise False.
    """
    # Only existence matters, so stop at the first match instead of
    # collecting every match with findall.
    return re.search(digit_re, word) is not None


def sent2feat(triples):
    sent_feat = list()