示例#1
0
def model_xunlian():
    """Train and persist the four-category medical text classifier.

    Reads the raw data for each category (pathogeny / diagnosis / symptom /
    treatment), removes null entries, assembles a shuffled labelled corpus,
    fits a TextClassifier on the training split and dumps the model to
    'text_classifier.pkl'.

    Returns:
        Tuple of the four per-category line lists followed by
        x_train, x_test, y_train, y_test.
    """
    # Load the raw dataset for every category, in a fixed order.
    raw_frames = [load_dataset(name) for name in ('病因', '诊断', '症状', '治疗')]

    # Drop null/empty markers from each category.
    cleaned = [processing_null(frame) for frame in raw_frames]

    # Flatten each cleaned frame into a plain Python list.
    bingyin, zhenduan, zhengzhuang, zhiliao = [
        frame.values.tolist() for frame in cleaned
    ]

    # Merge every category into one labelled sentence corpus.
    sentences = []
    text_prep = preprocess(sentences, bingyin, zhenduan, zhengzhuang, zhiliao)
    for data, label in ((bingyin, 'pathogeny'),
                        (zhenduan, 'diagnosis'),
                        (zhengzhuang, 'symptom'),
                        (zhiliao, 'treatment')):
        text_prep.preprocess_text(data, sentences, label)
    random.shuffle(sentences)

    # Also keep one processed line list per category.
    bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], [], []
    line_prep = preprocess2(bingyin_list, zhenduan_list, zhengzhuang_list,
                            zhiliao_list, bingyin, zhenduan, zhengzhuang, zhiliao)
    for data, target, label in ((bingyin, bingyin_list, 'pathogeny'),
                                (zhenduan, zhenduan_list, 'diagnosis'),
                                (zhengzhuang, zhengzhuang_list, 'symptom'),
                                (zhiliao, zhiliao_list, 'treatment')):
        line_prep.preprocess_lines(data, target, label)

    # Split the shuffled corpus into train/test partitions.
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1234)

    # Fit the classifier and persist it for later prediction runs.
    text_classifier = TextClassifier()
    text_classifier.fit(x_train, y_train)
    joblib.dump(text_classifier, 'text_classifier.pkl')
    # new_text_classifier=joblib.load('text_classifier.pkl')
    # precision=text_classifier.score(x_test, y_test)
    return (bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
            x_train, x_test, y_train, y_test)
示例#2
0
from load_dataset import processing_null
# Load the raw dataset for each category.
# (Category labels are Chinese: 病因=pathogeny, 诊断=diagnosis,
#  症状=symptom, 治疗=treatment.)
df_bingyin_list = load_dataset('病因')
df_zhenduan_list = load_dataset('诊断')
df_zhengzhuang_list = load_dataset('症状')
df_zhiliao_list = load_dataset('治疗')
# Stopword list: one word per line, UTF-8; quoting=3 (QUOTE_NONE) keeps
# quote characters as literal text instead of treating them as delimiters.
stopwords = pd.read_csv('data/stopwords.txt',
                        index_col=False,
                        quoting=3,
                        sep="\t",
                        names=['stopword'],
                        encoding='utf-8')
stopwords = stopwords['stopword'].values  # plain array of stopwords

# Strip null/empty entries from each category, then keep only the first
# 1000 rows (presumably to bound training time — TODO confirm).
df_bingyin_word = processing_null(df_bingyin_list)[0:1000]
# print(len(df_bingyin_word))
df_zhenduan_word = processing_null(df_zhenduan_list)[0:1000]
df_zhengzhuang_word = processing_null(df_zhengzhuang_list)[0:1000]
df_zhiliao_word = processing_null(df_zhiliao_list)[0:1000]


def build_sentence_vector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    contents = jieba.lcut(text)
    for word in contents:
        if word not in stopwords:
            try:
                vec += imdb_w2v[word].reshape((1, size))
                count += 1
示例#3
0
import csv
import functools
import os

from load_dataset import processing_null


@functools.lru_cache(maxsize=1)
def _load_classifier():
    """Load the pickled classifier once and cache it.

    The original code re-read 'text_classifier.pkl' from disk on every
    call to test(); since test() runs once per CSV line below, caching
    the model avoids that repeated deserialization.
    """
    return joblib.load('text_classifier.pkl')


def test(line):
    """Classify one text line with the persisted model.

    Args:
        line: Raw text to classify.

    Returns:
        The predicted category label (first element of predict()).
    """
    text_classifier = _load_classifier()
    content = text_classifier.process_line(line)
    leibie = text_classifier.predict(content)[0]
    return leibie


# Walk every CSV under `path`, classify each cleaned line, and stream the
# (line, category) pairs into new_txt.csv as they are produced.
path = "C:\\Users\\Administrator\\Desktop\\GBDT_predicted\\ziliao"
folder_list = os.listdir(path)
with open('new_txt.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['line', 'leibie'])
    for folder in folder_list:
        folder_path = os.path.join(path, folder)
        # Source files are GBK-encoded Chinese text.
        df = pd.read_csv(folder_path, encoding='gbk')
        df = processing_null(df)
        for line in df:
            # Variable renamed from the misspelled `liebie` to match the
            # `leibie` naming used elsewhere in this file.
            leibie = test(line)
            writer.writerow([line, leibie])
# The `with` block above closes the file; the old trailing f.close()
# was redundant and has been removed.