Exemplo n.º 1
0
def _run_crfsuite_tag(model_path, data_path, result_path):
    """Run ``crfsuite tag -m model_path data_path``, saving stdout to a file.

    The command is passed as an argument list (no shell), so paths that
    contain spaces or shell metacharacters are handled correctly.

    Returns the exit status of crfsuite, or 127 when the result file could
    not be opened or the executable could not be started.  In that case the
    error is reported on stderr and the caller carries on, which is what the
    previous shell-based invocation did as well.
    """
    import subprocess  # local import: only this helper needs it

    try:
        with open(result_path, 'w') as result_file:
            return subprocess.call(
                ['crfsuite', 'tag', '-m', model_path, data_path],
                stdout=result_file)
    except EnvironmentError as err:
        sys.stderr.write('crfsuite tag could not be run: %s\n' % err)
        return 127


def _train_model_blocking(model_path, data_path):
    """Run ``modelTraining(model_path, data_path)`` in a thread and wait for it.

    The thread is joined right after being started, so training is
    effectively sequential; the thread wrapper is kept from the original code.
    """
    worker = threading.Thread(target=modelTraining,
                              args=(model_path, data_path))
    worker.start()
    worker.join()


def main(trainfile, modelfile, prepareTraindata, flag_print=False):
    """Train boundary/class CRF models and evaluate them on the test sets.

    All paths are resolved relative to ``sys.path[0]``: input files are read
    from ``data/``, intermediate files are written to ``data/tempdata/``,
    models go to ``models/`` and the dictionary is ``dic/dic.txt``.

    Args:
        trainfile: Name of the training file inside ``data/``.
        modelfile: Name fragment for the model files, which are
            ``models/b-<modelfile>-<k>`` (boundary) and
            ``models/c-<modelfile>-<k>`` (class).
        prepareTraindata: When equal to ``'prepareTrain'`` the training
            feature files are regenerated; otherwise whatever is already in
            ``data/tempdata/`` is used for training.
        flag_print: When true, progress messages are printed.

    The test feature files are always regenerated.  ``'train end'`` is
    printed unconditionally when the run finishes.
    """
    def report(message):
        # Single-argument parenthesised print behaves the same on Python 2.
        if flag_print:
            print(message)

    root = sys.path[0]
    datafolder = os.path.join(root, 'data')
    tempdatafolder = os.path.join(datafolder, 'tempdata')
    if not os.path.exists(tempdatafolder):
        os.makedirs(tempdatafolder)
    modelfoler = os.path.join(root, 'models')
    ebao_dic = tools.loadDic(os.path.join(root, 'dic', 'dic.txt'))

    def temp_path(prefix, index):
        return os.path.join(tempdatafolder, prefix + str(index))

    # (file name, data-set tag) pairs; 'un' has one extra class, medicine.
    train_filenames = [(trainfile, 'un')]
    train_length = len(train_filenames)
    train_file_list = [os.path.join(datafolder, entry[0])
                       for entry in train_filenames]
    boundary4training_list = [temp_path('boundary4training_', i)
                              for i in range(train_length)]
    class4training_list = [temp_path('class4training_', i)
                           for i in range(train_length)]

    test_filenames = [('jsd-test.txt', 'jsd'), ('ct-test.txt', 'un')]
    test_length = len(test_filenames)
    test_file_list = [os.path.join(datafolder, entry[0])
                      for entry in test_filenames]
    boundary4testing_list = [temp_path('boundary4testing_', i)
                             for i in range(test_length)]
    class4testing_list = [temp_path('class4testing_', i)
                          for i in range(test_length)]
    sen_ent4testing_list = [temp_path('sen_ent4testing_', i)
                            for i in range(test_length)]

    if prepareTraindata == 'prepareTrain':
        for i in range(train_length):
            processing.generateFullTagFile(
                train_file_list[i], boundary4training_list[i],
                class4training_list[i], '', '0', ebao_dic, 'train', '0',
                train_filenames[i][1])
        report('Train data generated!')

    # Sentences and gold tags of every test set, kept for the combined
    # boundary -> class evaluation further down.
    sentence_list_test = []
    sen_tags_list_test = []
    for i in range(test_length):
        # NOTE(review): the mode argument is 'train' for test data too;
        # kept as in the original -- confirm this is intended.
        sentence_list, sen_tags_list = processing.generateFullTagFile(
            test_file_list[i], boundary4testing_list[i],
            class4testing_list[i], sen_ent4testing_list[i], '1', ebao_dic,
            'train', '0', test_filenames[i][1])
        sentence_list_test.append(sentence_list)
        sen_tags_list_test.append(sen_tags_list)
    report('Test data generated!')

    for k in range(train_length):
        train_name = train_filenames[k][0]
        report(train_name)
        b_model = os.path.join(modelfoler, 'b-' + modelfile + '-' + str(k))
        c_model = os.path.join(modelfoler, 'c-' + modelfile + '-' + str(k))

        _train_model_blocking(b_model, boundary4training_list[k])
        report(train_name + b_model + ' generated!')
        _train_model_blocking(c_model, class4training_list[k])
        report(train_name + c_model + ' generated!')

        for j in range(test_length):
            test_name, test_tag = test_filenames[j]
            report('\nTraindata: ' + train_name + '\n')
            report('\nTestdata: ' + test_name + '\n')

            boundary_result = boundary4testing_list[j] + '.result'
            _run_crfsuite_tag(b_model, boundary4testing_list[j],
                              boundary_result)
            report('boundary test result generated!')
            evaluation.eval(boundary4testing_list[j], boundary_result,
                            'boundary', sen_ent4testing_list[j], ebao_dic)
            report(test_name + 'boundary test evaluated!')

            class_result = class4testing_list[j] + '.result'
            _run_crfsuite_tag(c_model, class4testing_list[j], class_result)
            report('class test result generated!')
            evaluation.eval(class4testing_list[j], class_result, 'class',
                            sen_ent4testing_list[j], ebao_dic)
            report(test_name + 'class test evaluated!')

            # Build the class vectors for the boundary model's output and
            # predict the class of every entity in each sentence.  The call
            # is made twice, with '0' and then '1' (per the original comment
            # this argument is post_processing).
            for post_processing in ('0', '1'):
                processing.predictClassAfterBoundaryAndEval(
                    boundary_result, sentence_list_test[j],
                    sen_tags_list_test[j], c_model, ebao_dic,
                    post_processing, test_tag)
            report(test_name + 'combine develop evaluated!')
    print('train end')
Exemplo n.º 2
0
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options
# Register the "port" command-line option (integer, defaults to 10010).
define("port", default=10010, help="run on the given port", type=int)

import crfsuite

import generateFeature
import postProcessing
import evaluation
import tools

# Resources are located relative to sys.path[0] and loaded once at import
# time: two dictionaries from <root>/dic plus the path of the new-term file.
root = sys.path[0]
dicfolder = os.path.join(root, 'dic')
ebao_m_dic = tools.loadDic(os.path.join(dicfolder, 'dic.txt'))
ebao_si_dic = tools.loadDic(os.path.join(dicfolder, 'dic_si.txt'))
newtermfile = os.path.join(dicfolder, 'new_term.txt')
# Make sure the new-term file exists.  Append mode creates a missing file
# but never truncates one that appeared between the check and the open, and
# the with-block closes the handle even if an error occurs.
if not os.path.exists(newtermfile):
    with open(newtermfile, 'a'):
        pass

colordic = {\
    'disease': 'ff0000',\
    'symptom': '0000ff',\
    'diagnosis_treatment': '1fb21c',\
    'diagnosis_name': '1fb21c',\
    'treatment': '1fb21c',\
    'other_diagnosis': '1fb21c',\
    'instrument': '0099ff',\
    'medicine': 'fc9037',\
Exemplo n.º 3
0
import datetime

import random
import HTMLParser
import xlrd
import xlwt

import tools
import generateFeature
import postProcessing
import evaluation
import crfsuite

# sys.path[0] is used as the project root; the dictionary is loaded once,
# at import time, from <root>/dic/dic.txt.
root = sys.path[0]
dicfolder = os.path.join(root, 'dic')
ebao_dic = tools.loadDic(os.path.join(dicfolder, 'dic.txt'))


class Entity(object):
    """Plain record describing one entity.

    The constructor arguments are stored unchanged:

    * ``con``   -> ``content``
    * ``start`` -> ``start_pos``
    * ``end``   -> ``end_pos``
    * ``t``     -> ``type``

    No validation or conversion is performed.
    """

    def __init__(self, con, start, end, t):
        (self.content,
         self.start_pos,
         self.end_pos,
         self.type) = (con, start, end, t)


en_cn_list = [('disease', '疾病'), ('symptom', '症状'), ('diagnosis_name', '辅助检查'),
              ('treatment', '治疗项目'), ('other_diagnosis', '其他诊疗项目'),
              ('medicine_cn', '药品-通用名'), ('medicine_pn', '药品-产品名'),
              ('medicine_mn', '药品-商品名'), ('medicine', '药品'),
              ('dosage_form', '剂型'), ('specifications', '规格'),
Exemplo n.º 4
0
    else:
        return {"code": -1, "info": 'The operation is illegal.'}


class IndexHandler(tornado.web.RequestHandler):
    """Request handler that forwards POSTed form fields to excuteOperation."""

    def post(self):
        """Read the request arguments, run the operation, reply with JSON text.

        The arguments ``operation``, ``entity``, ``enttype`` and
        ``replaceFlag`` each default to an empty string when absent.  They
        are passed, in that order, to ``excuteOperation`` and its result is
        written to the response serialised with ``json.dumps``.
        """
        field_names = ('operation', 'entity', 'enttype', 'replaceFlag')
        values = [self.get_argument(field, '') for field in field_names]
        payload = excuteOperation(*values)
        self.write(json.dumps(payload))


if __name__ == '__main__':
    # Script entry point: load the dictionaries, then start the Tornado
    # HTTP server on the configured port.
    #
    # Everything assigned below is at module scope, so dicfile, medi_dic and
    # nomedi_dic become module globals without a `global` statement.
    # Presumably the request-handling code reads them -- verify.
    root = sys.path[0]
    dicfolder = os.path.join(root, 'dic')
    dicfile = os.path.join(dicfolder, 'dic.txt')
    nomedi_dicfile = os.path.join(dicfolder, 'nonmedicine.txt')
    medi_dic = tools.loadDic(dicfile)
    nomedi_dic = tools.loadList(nomedi_dicfile)

    # Parse the command line before reading options.port below.
    tornado.options.parse_command_line()
    # debug=True would make the service restart automatically.
    app = tornado.web.Application([(r'/', IndexHandler)],
                                  debug=False)
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    # Runs the I/O loop; this call does not return while the server is up.
    tornado.ioloop.IOLoop.instance().start()