Пример #1
0
 def get_token(self, filename):
     """Rebuild ``self.features`` with one entry per item of every line.

     ``self.features`` is reset to an empty list, the file is read with
     ``fio.ReadFileUTF8`` and every item obtained by iterating each
     returned line is stored as a two-element list ``[item, "N"]``.
     """
     self.features = []
     lines = fio.ReadFileUTF8(filename)
     self.features.extend(
         [char, "N"] for line in lines for char in line)
Пример #2
0
 def get_token(self, filename):
     """Rebuild ``self.features`` from the tagged words of every line.

     Each line read by ``fio.ReadFileUTF8`` is handed to
     ``self.get_posTag``; for every returned object, each item of its
     ``word`` attribute is stored as ``[item, <flag attribute>, "N"]``.
     """
     self.features = []
     for line in fio.ReadFileUTF8(filename):
         for tagged in self.get_posTag(line):
             # Every item of the word carries the flag of the whole word.
             self.features.extend(
                 [char, tagged.flag, "N"] for char in tagged.word)
Пример #3
0
def getWord(filename):
    """Print every line of *filename* followed by each of its word tokens.

    Each line read by ``fio.ReadFileUTF8`` is UTF-8 encoded and echoed, then
    split into sentences with ``nltk.sent_tokenize``; every sentence is split
    with ``nltk.word_tokenize`` and the tokens are printed one per line.

    Returns nothing; the function only prints.
    """
    lines = fio.ReadFileUTF8(filename)

    for line in lines:
        encoded = line.encode('utf-8')
        print(encoded)
        for sentence in nltk.sent_tokenize(encoded):
            # The tokenizer lives on the nltk module, not on the string.
            for token in nltk.word_tokenize(sentence):
                print(token)
Пример #4
0
def WriteDocsent(excelfile, folder, phrasedir, np=None):
    """Write one ``.docsent`` XML file per (week, question) key-phrase file.

    For every week ``1..maxWeek`` and each of ``q1``-``q4`` the file
    ``<phrasedir>/<week>/<question>.<method>.key`` is looked up; a missing
    file skips that pair.  Otherwise every line of the file, right-stripped,
    becomes one ``<S>`` child of a ``<DOCSENT>`` root and the tree is written
    to ``<folder><week>/<question>/docsent/<week>_<question>.docsent``.

    ``excelfile`` and ``np`` are accepted but not used here; ``maxWeek`` and
    ``method`` are read from module scope.
    """
    for week in range(1, maxWeek + 1):
        week_name = str(week)

        for question in ('q1', 'q2', 'q3', 'q4'):
            key_file = os.path.join(phrasedir, week_name,
                                    question + '.' + method + '.key')
            if not fio.IsExist(key_file):
                continue

            print(key_file)

            doc_id = week_name + '_' + question

            # Create the output directory one level at a time.
            out_dir = folder + week_name + '/'
            fio.NewPath(out_dir)
            for level in (question, 'docsent'):
                out_dir = out_dir + level + '/'
                fio.NewPath(out_dir)
            out_file = out_dir + doc_id + '.docsent'

            # Build the XML document.
            docsent = ET.Element(tag='DOCSENT',
                                 attrib={'DID': doc_id, 'LANG': "ENG"})
            docsent.tail = '\n'
            document = ET.ElementTree(docsent)

            # Every line is its own paragraph holding exactly one sentence,
            # so PAR and SNO share the running number and RSNT is always 1.
            for number, raw in enumerate(fio.ReadFileUTF8(key_file), 1):
                sentence = ET.Element(tag='S',
                                      attrib={
                                          'PAR': str(number),
                                          'RSNT': str(1),
                                          'SNO': str(number)
                                      })
                sentence.text = raw.rstrip()
                sentence.tail = '\n'
                docsent.append(sentence)

            document.write(out_file)
Пример #5
0
    def get_type(self, filename):
        """Overwrite element 1 of ``self.features`` entries from an annotation file.

        Each line of *filename* is split on whitespace.  Fields 1 and 2 are
        the inclusive first and last indices into ``self.features`` and
        field 3 is passed to ``self.read_type`` to obtain the label.  The
        first entry of the span gets ``"B-" + label``; every following entry
        up to and including the last index gets ``"I-" + label``.

        Lines with fewer than four fields (for example blank lines) are
        skipped instead of raising ``IndexError``.
        """
        for sentence in fio.ReadFileUTF8(filename):
            words = sentence.split()
            if len(words) < 4:
                # Blank or truncated line: nothing to annotate.
                continue

            first, last = int(words[1]), int(words[2])
            itype = self.read_type(words[3])

            self.features[first][1] = "B-" + itype
            for j in range(first + 1, last + 1):
                self.features[j][1] = "I-" + itype
Пример #6
0
def get_together(num, itype, total):
    """Collect the B-/I- tagged spans of one result file into *total*.

    The file ``<datadir>/result_<itype>-<num>.txt`` is read; every line is
    split on whitespace into a token (field 0) and a tag (field 1).  A line
    whose tag starts with ``B`` opens a span, and the directly following
    lines whose tag starts with ``I`` extend it.

    Args:
        num: document number used to build the file name.
        itype: name fragment used to build the file name.
        total: list that receives one
            ``[first_line, last_line, joined_tokens, type]`` entry per span,
            where ``type`` is ``trans_type`` applied to the tag without its
            two-character prefix.

    Returns:
        The same *total* list, for convenience.
    """
    # Use the ``num`` argument, not whatever global ``i`` happens to exist.
    filename = datadir + '/result_' + itype + '-' + str(num) + '.txt'
    lines = fio.ReadFileUTF8(filename)
    line_count = len(lines)

    for j in range(line_count):
        words = lines[j].split()
        # Skip empty/short lines and everything that does not open a span.
        if len(words) < 2 or not words[1].startswith('B'):
            continue

        cur_type = trans_type(words[1][2:])
        name_parts = [words[0]]
        cur_right = j

        # Extend the span while the next line exists and continues it; the
        # bounds check keeps a span that reaches the last line from raising.
        while cur_right + 1 < line_count:
            following = lines[cur_right + 1].split()
            if len(following) < 2 or not following[1].startswith('I'):
                break
            name_parts.append(following[0])
            cur_right += 1

        total.append([j, cur_right, "".join(name_parts), cur_type])
    return total
Пример #7
0
# coding: utf-8

import fio
import codecs
import sys
import os

# Data-set directories, given relative to the current working directory.
traindir = "../dataset/train/"
# Directory the main loop reads its '.txtoriginal.txt' input files from.
testdir = "../dataset/test/"
# Directory the main loop writes its '.segment.txt' output files to.
resultdir = "../dataset/result/"
# Section names; each one is used both as a sub-directory name and as the
# file-name prefix when the main loop builds its paths.
area = ["病史特点", "出院情况", "一般项目", "诊疗经过"]

if __name__ == '__main__':
    # For every document number and section, pass the text of the
    # original file to fio.WriteFileUTF8 one character at a time, with
    # spaces replaced by '#' and carriage returns dropped.
    for i in range(301, 401):
        for section in area:
            stem = section + '/' + section + '-' + str(i)
            filename = testdir + stem + '.txtoriginal.txt'
            if not os.path.exists(filename):
                continue
            sentences = fio.ReadFileUTF8(filename)
            printfile = resultdir + stem + '.segment.txt'
            for sentence in sentences:
                # A line holding only a carriage return carries no text.
                if sentence == u'\r':
                    continue
                for word in sentence.replace(' ', '#'):
                    if word != u'\r':
                        fio.WriteFileUTF8(word, printfile)
Пример #8
0
    if itype == "治疗":
        return "TREATMENT"
    if itype == "身体部位":
        return "BODY"


if __name__ == '__main__':
    # For every training document and section, derive a B-/I- tagged string
    # for each character that falls inside the current annotation span.
    for i in range(1, 301):
        for j in range(4):
            original_file = traindir + area[j] + '/' + area[j] + '-' + str(
                i) + '.txtoriginal.txt'
            tag_file = traindir + area[j] + '/' + area[j] + '-' + str(
                i) + '.txt'
            printfile = biotagdir + area[j] + '/' + area[j] + '-' + str(
                i) + '.biotag.txt'
            lines = fio.ReadFileUTF8(original_file)
            ners = fio.ReadFileUTF8(tag_file)

            # Current annotation: fields are (text, start, end, type).
            tmp = 0
            fields = ners[tmp].split() if tmp < len(ners) else []
            if len(fields) >= 4:
                cur_ner = fields
                # Offsets are parsed once so they compare numerically;
                # comparing them as strings puts "10" before "9".
                ner_start, ner_end = int(fields[1]), int(fields[2])
            else:
                # No usable annotation: a span that matches no index.
                cur_ner = ['', -1, -1]
                ner_start, ner_end = -1, -1

            for words in lines:
                for k in range(len(words)):
                    if ner_start <= k <= ner_end:
                        ner_type = judge_type(cur_ner[3])
                        if k == ner_start:
                            ans = words[k] + " B-" + ner_type
                        else:
                            ans = words[k] + " I-" + ner_type
                        # NOTE(review): ``ans``, ``printfile`` and ``tmp`` are
                        # not used further in the visible code - confirm the
                        # remainder of the script against the full file.
Пример #9
0
 def test_into_aline(self, filename):
     """Flatten the lines of *filename* into ``self.features``.

     ``self.features`` is reset to an empty list and then receives, in
     order, every item obtained by iterating each line returned by
     ``fio.ReadFileUTF8``.
     """
     self.features = []
     for line in fio.ReadFileUTF8(filename):
         self.features.extend(line)
Пример #10
0
import codecs
import sys
import os
import argparse, time, random

# Data-set directories, given relative to the current working directory.
traindir = "../dataset/train/"
# Directory the main loop reads its '.biotag.txt' files from.
biotagdir = "../dataset/biotag/"
# Section names; each one is used both as a sub-directory name and as the
# file-name prefix when the main loop builds its paths.
area = ["一般项目", "病史特点", "出院情况", "诊疗经过"]

def judge_type(itype):
    """Map a Chinese entity-category name to its English tag.

    Args:
        itype: the category name, either as text or as UTF-8 encoded bytes.

    Returns:
        ``"SIGNS"``, ``"CHECK"``, ``"DISEASE"``, ``"TREATMENT"`` or
        ``"BODY"``; ``None`` when the category is not one of the five
        known names.
    """
    # Compare as text.  Encoding to bytes and comparing against native
    # string literals only matches on Python 2; on Python 3 bytes never
    # equal str, so every lookup would come back as None.
    if isinstance(itype, bytes):
        itype = itype.decode('utf-8')
    tags = {
        u"症状和体征": "SIGNS",
        u"检查和检验": "CHECK",
        u"疾病和诊断": "DISEASE",
        u"治疗": "TREATMENT",
        u"身体部位": "BODY",
    }
    return tags.get(itype)

if __name__ == '__main__':
    # Every per-document file is handed to the same output file.
    result_file = '../dataset/train_data'
    for i in range(1, 301):
        for section in area:
            train_file = (biotagdir + section + '/' + section + '-' +
                          str(i) + '.biotag.txt')
            lines = fio.ReadFileUTF8(train_file)
            fio.AddTest(lines, result_file)
            # An empty string is written after each document - presumably
            # a separator between documents; confirm against fio.
            fio.WriteFileUTF8('', result_file)