def get_token(self, filename):
    """Reset self.features to one [character, "N"] pair per character of the file.

    "N" is the initial (no-entity) tag; later passes overwrite it in place.
    """
    self.features = []
    for sentence in fio.ReadFileUTF8(filename):
        # Each character of the sentence becomes its own feature row.
        self.features.extend([ch, "N"] for ch in sentence)
def get_token(self, filename):
    """Reset self.features to [character, pos_flag, "N"] triples for the file.

    Sentences are POS-tagged via self.get_posTag; every character of a tagged
    word inherits that word's flag. "N" is the initial (no-entity) tag.
    """
    self.features = []
    for sentence in fio.ReadFileUTF8(filename):
        for tagged in self.get_posTag(sentence):
            # One feature row per character, carrying the word-level POS flag.
            self.features.extend([ch, tagged.flag, "N"] for ch in tagged.word)
def getWord(filename):
    """Print each line of *filename*, then every word token of each sentence.

    Debug helper; relies on module-level `fio` and `nltk`.
    """
    lines = fio.ReadFileUTF8(filename)
    for line in lines:
        print(line.encode('utf-8'))
        # NOTE(review): feeding UTF-8 bytes of (apparently) Chinese text to
        # NLTK's default tokenizers is dubious -- confirm intent.
        sentences = nltk.sent_tokenize(line.encode('utf-8'))
        for sentence in sentences:
            # BUG FIX: original called word.word_tokenize(word) -- a str has no
            # word_tokenize attribute; nltk.word_tokenize was clearly intended.
            tokens = nltk.word_tokenize(sentence)
            for token in tokens:
                # BUG FIX: original printed the undefined, misspelled name
                # `worddd` (loop variable was `wordddd`) -- NameError.
                print(token)
def WriteDocsent(excelfile, folder, phrasedir, np=None):
    """Emit one MEAD-style .docsent XML file per (week, question) phrase file.

    For every week 1..maxWeek and question q1..q4, reads
    <phrasedir>/<week>/<q>.<method>.key (skipped when absent) and writes
    <folder>/<week>/<q>/docsent/<week>_<q>.docsent, containing one <S> element
    per phrase line.

    excelfile and np are accepted but unused (kept for caller compatibility).
    NOTE(review): relies on module-level globals `maxWeek` and `method` --
    confirm they are defined at module scope.
    """
    for i in range(maxWeek):  # original iterated enumerate(range(0, maxWeek)); the sheet value was unused
        week = i + 1
        for qtype in ['q1', 'q2', 'q3', 'q4']:  # renamed from `type` (shadowed the builtin)
            phrasefile = os.path.join(phrasedir, str(week), qtype + '.' + method + '.key')
            if not fio.IsExist(phrasefile):
                continue
            print(phrasefile)

            DID = str(week) + '_' + qtype

            # Build <folder>/<week>/<qtype>/docsent/ level by level;
            # fio.NewPath presumably creates the directory if missing.
            path = folder + str(week) + '/'
            fio.NewPath(path)
            path = path + qtype + '/'
            fio.NewPath(path)
            path = path + 'docsent/'
            fio.NewPath(path)
            filename = path + DID + '.docsent'

            # XML skeleton: <DOCSENT DID=... LANG="ENG"> with one <S> per phrase.
            root = ET.Element(tag='DOCSENT', attrib={'DID': DID, 'LANG': "ENG"})
            root.tail = '\n'
            tree = ET.ElementTree(root)

            phrases = fio.ReadFileUTF8(phrasefile)
            sno_id = 1
            for par, phrase in enumerate(phrases):
                phrase = phrase.rstrip()
                # Each phrase is its own paragraph containing a single sentence
                # (the original looped enumerate([phrase]), so RSNT was always 1).
                node = ET.Element(tag='S', attrib={
                    'PAR': str(par + 1),
                    'RSNT': '1',
                    'SNO': str(sno_id),
                })
                node.text = phrase
                node.tail = '\n'
                root.append(node)
                sno_id += 1
            tree.write(filename)
def get_type(self, filename):
    """Apply B-/I- entity labels to self.features from a span file.

    Each line of *filename* is expected to contain at least
    "<name> <start> <end> <type>", where start/end are inclusive character
    offsets into self.features; the type string is mapped via self.read_type.
    """
    for line in fio.ReadFileUTF8(filename):
        fields = line.split()
        start = int(fields[1])
        end = int(fields[2])
        label = self.read_type(fields[3])
        # First character of the span opens the entity; the rest continue it.
        self.features[start][1] = "B-" + label
        for pos in range(start + 1, end + 1):
            self.features[pos][1] = "I-" + label
def get_together(num, itype, total):
    """Append [left, right, name, type] entity spans found in one result file.

    Reads datadir/result_<itype>-<num>.txt, a BIO-tagged file whose non-empty
    lines look like "<char> <tag>", and merges each B-.../I-... run into one
    span. Returns *total* (which is also mutated in place).
    """
    # BUG FIX: the original built the filename from an undefined name `i`
    # instead of the `num` parameter (NameError at call time).
    filename = datadir + '/result_' + itype + '-' + str(num) + '.txt'
    lines = fio.ReadFileUTF8(filename)
    for j in range(len(lines)):
        if len(lines[j]) == 0:
            continue
        words = lines[j].split()
        if not words[1].startswith('B'):
            continue
        cur_type = trans_type(words[1][2:])
        cur_name = words[0]
        cur_left = j
        cur_right = j
        # Extend the span across the following I- lines.
        # BUG FIX: the original indexed lines[j + 1] unconditionally, raising
        # IndexError when an entity ends on the last line (and split() of an
        # empty follow-up line would also fail on words[1]).
        k = j + 1
        while k < len(lines) and len(lines[k]) > 0:
            follow = lines[k].split()
            if not follow[1].startswith('I'):
                break
            cur_right += 1
            cur_name += follow[0]
            k += 1
        total.append([cur_left, cur_right, cur_name, cur_type])
    return total
# coding: utf-8
import fio
import codecs
import sys
import os

traindir = "../dataset/train/"
testdir = "../dataset/test/"
resultdir = "../dataset/result/"

# Clinical-record section names (in Chinese).
area = ["病史特点", "出院情况", "一般项目", "诊疗经过"]

if __name__ == '__main__':
    # Segment test records 301-400: emit every character of every sentence to
    # a .segment.txt file, mapping spaces to '#' and dropping bare '\r' marks.
    for record_id in range(301, 401):
        for section in area:
            src = testdir + section + '/' + section + '-' + str(record_id) + '.txtoriginal.txt'
            if not os.path.exists(src):
                continue
            dst = resultdir + section + '/' + section + '-' + str(record_id) + '.segment.txt'
            for sentence in fio.ReadFileUTF8(src):
                if sentence == u'\r':
                    continue
                for ch in sentence.replace(' ', '#'):
                    if ch != u'\r':
                        fio.WriteFileUTF8(ch, dst)
# NOTE(review): this chunk appears to be the tail of a larger file -- the two
# `return` statements below are orphaned (outside any `def`) and look like the
# truncated end of a judge_type(itype)-style mapping function (cf. the full
# version elsewhere in this dataset); kept byte-for-byte as found.
if itype == "治疗": return "TREATMENT"
if itype == "身体部位": return "BODY"

if __name__ == '__main__':
    # Build BIO-tag data for training records 1-300 across the four sections.
    for i in range(1, 301):
        for j in range(4):
            # Paths: raw record text, entity annotations, and the BIO output
            # file (NOTE(review): `printfile` is computed but never written to
            # in this chunk -- the write presumably followed in truncated code).
            original_file = traindir + area[j] + '/' + area[j] + '-' + str(i) + '.txtoriginal.txt'
            tag_file = traindir + area[j] + '/' + area[j] + '-' + str(i) + '.txt'
            printfile = biotagdir + area[j] + '/' + area[j] + '-' + str(i) + '.biotag.txt'
            lines = fio.ReadFileUTF8(original_file)
            ners = fio.ReadFileUTF8(tag_file)
            # NOTE(review): `tmp` indexes into `ners` but is never advanced, so
            # only the first annotation is ever considered.
            tmp = 0
            if tmp < len(ners):
                cur_ner = ners[tmp].split()
            else:
                # Sentinel "no annotation" span.
                cur_ner = ['', -1, -1]
            for words in lines:
                for k in range(len(words)):
                    # NOTE(review): str(k) compares LEXICOGRAPHICALLY against
                    # the span bounds (e.g. "10" < "2"), so these range checks
                    # are wrong for multi-digit offsets; integer comparison was
                    # almost certainly intended.
                    if str(k) >= cur_ner[1] and str(k) <= cur_ner[2]:
                        ner_type = judge_type(cur_ner[3])
                        if str(k) == cur_ner[1]:
                            ans = words[k] + " B-" + ner_type
                        else:
                            ans = words[k] + " I-" + ner_type
                        # NOTE(review): `ans` is never used -- chunk looks
                        # truncated here.
def test_into_aline(self, filename):
    """Reset self.features to a flat list of every character in the file."""
    self.features = []
    for sentence in fio.ReadFileUTF8(filename):
        # Appending each character in turn == extending with the sentence.
        self.features.extend(sentence)
import codecs
import sys
import os
import argparse, time, random

traindir = "../dataset/train/"
biotagdir = "../dataset/biotag/"

# Clinical-record section names (in Chinese).
area = ["一般项目", "病史特点", "出院情况", "诊疗经过"]

# Chinese entity-type name -> BIO label suffix. Keys are unicode literals so
# the lookup works on both Python 2 (callers pass unicode read via
# fio.ReadFileUTF8, as the original's .encode('utf-8') implied) and Python 3.
_TYPE_MAP = {
    u"症状和体征": "SIGNS",
    u"检查和检验": "CHECK",
    u"疾病和诊断": "DISEASE",
    u"治疗": "TREATMENT",
    u"身体部位": "BODY",
}


def judge_type(itype):
    """Map a Chinese entity-type name to its BIO label (None if unknown).

    Behaviorally matches the original if/return chain, which also fell
    through to an implicit None for unrecognized types.
    """
    return _TYPE_MAP.get(itype)


if __name__ == '__main__':
    # BUG FIX: the original used fio.ReadFileUTF8 without ever importing fio
    # (NameError at runtime). Imported inside the main guard so the module
    # stays importable even where the project-local fio package is absent.
    import fio
    # Concatenate the per-record BIO-tag files into one training-data file.
    for i in range(1, 301):
        for j in range(4):
            train_file = biotagdir + area[j] + '/' + area[j] + '-' + str(i) + '.biotag.txt'
            result_file = '../dataset/train_data'
            # Skip missing records, consistent with the companion scripts.
            if not os.path.exists(train_file):
                continue
            lines = fio.ReadFileUTF8(train_file)
            fio.AddTest(lines, result_file)
            # Blank separator between records -- assumes WriteFileUTF8
            # appends; TODO confirm against fio's implementation.
            fio.WriteFileUTF8('', result_file)