import time import math import os import sys import os, os.path, shutil import codecs import EMRdef import re import pandas as pd emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRzhzd4') #txt目录提取 ryzd = [] for emrtxt in emrtxts: f = open(emrtxt, 'r', errors="ignore") #中文加入errors emrpath = os.path.basename(emrtxt) emrpath = os.path.splitext(emrpath)[0] #提取目录 lines = f.readlines() if len(lines) > 1: #对疾病数量判断 lines = ''.join(lines) output = re.sub('\n', ' ', lines) out EMRdef.text_create(r'D:\DeepLearning ER\EHRzhzd6', '.txt', emrpath, output) ''' out = re.split('',output) ryzd.append(out) ''' #导入关联规则 import orangecontrib.associate.fpgrowth as oaf def dealRules(rules):
# -*- coding: UTF-8 -*-
# Scan every txt record in the source directory, pick the lines that carry the
# admission-diagnosis keyword, and export each of them to a file named after
# the record ID.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHR')  # list the txt records
zljhs = []
# Punctuation alternation (not used below). The '.' is escaped so that it
# matches a literal dot instead of every character.
pattern = r',|\.|,|。|;|;'
# The record ID is the part of the file name before the first underscore.
id_prefix = re.compile(r'(^.+?)\_')

for emrtxt in emrtxts:
    record_id = "".join(id_prefix.findall(os.path.basename(emrtxt)))
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line = line.replace(' ', '')  # remove spaces
            # The keyword has to sit within the first six characters.
            if '入院诊断:' not in line[:6]:
                continue
            line = re.sub(r'h|H', '小时', line)
            line = line.replace('入院诊断:', '')
            # The original also executed `line_out = re.split()` here, which
            # raises TypeError (no arguments) and whose result was never used.
            EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd', '.txt', record_id, line)
# -*- coding: UTF-8 -*-
# Tokenise treatment records by administration route and dose/dosage form.
import os
import EMRdef
import string
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')  # list the txt records
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capturing group, re.split() then returns None for it whenever another
# alternative matches, and "".join() fails with TypeError.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|>|,|。|:|<|;|‘|’|【|】|\(|\)|·|!|\*|\/|…'
strip_punct = re.compile(pattern)

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    f_end = []
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            c = line  # untouched copy, passed to EMRdef.rre below
            # remove spaces, dots and multiplication signs
            line = line.replace(' ', '').replace('.', '').replace('×', '')
            a = EMRdef.tq_bnum(line)
            # Join the pieces returned by tq_bnum, then drop punctuation and spaces.
            a_end = strip_punct.sub('', "".join(a)).replace(' ', '')
            if a_end:
                acb = EMRdef.rre(c, a_end, a_end + ':', 1)
# -*- coding: UTF-8 -*-
# Export, unchanged, every record that mentions one of the drug keywords.
import re
import EMRdef
import os
import os.path
import shutil

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHR1')  # list the txt records
# Punctuation alternation kept from the original script (not used below).
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|\^|&|=|,|。|:|;|‘|’|【|】|·|!|、|…'
keywords = ('都保', '舒利迭')

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        f_out = f.readlines()
    # Export once per record. The original rebound `f_out` to a string while
    # still iterating over it and called text_create for every matching line.
    if any(word in line for line in f_out for word in keywords):
        EMRdef.text_create(r'D:\DeepLearning ER\EHRxiaochuan', '.txt', emrpath, ''.join(f_out))
import time import math import os import sys import os, os.path,shutil import codecs import EMRdef import re emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR-all')#txt目录提取 for emrtxt in emrtxts: f = open(emrtxt,'r',errors="ignore")#中文加入errors emrtxt = os.path.basename(emrtxt) emrtxt_str = re.findall(r'(^.+?)\_',emrtxt)#提取ID emrtxt = "".join(emrtxt_str)#转成str out = [] for line in f.readlines(): if line.find('男')>-1: out.append('男') elif line.find('女')>-1: out.append('女') if line.find(‘岁')
# Append the content of the same-named file from the second directory to each
# diagnosis record and export the merged text.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd5')  # diagnosis records
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRsex')  # files to append
ryzd = []

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        # strip spaces, then split into lines
        lines = f.read().replace(' ', '').split('\n')
    for emrtxt2 in emrtxt2s:
        emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
        if emrpath != emrpath2:
            continue
        # Only the matching file is opened; the original read every file in
        # the second directory before comparing names and never closed them.
        with open(emrtxt2, 'r', errors="ignore") as f2:
            lines2 = f2.read()
        lines.append(lines2)
        # NOTE(review): the source lost its indentation; the export is assumed
        # to belong to the match branch -- confirm against the original file.
        out = '\n'.join(lines)
        EMRdef.text_create(r'D:\DeepLearning ER\EHRzhzd6', '.txt', emrpath, out)
# Append the content of the same-named file from the second directory to each
# diagnosis record and collect the merged line lists in `ryzd`.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd5')  # diagnosis records
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRsex')  # files to append
ryzd = []

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        # Strip spaces, then split into lines. The original split the
        # undefined name `output` here, which raises NameError.
        lines = f.read().replace(' ', '').split('\n')
    for emrtxt2 in emrtxt2s:
        emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
        if emrpath != emrpath2:
            continue
        # Only the matching file is opened and it is closed right after reading.
        with open(emrtxt2, 'r', errors="ignore") as f2:
            lines2 = f2.read()
        lines.append(lines2)
        # NOTE(review): the source lost its indentation; collecting into
        # `ryzd` is assumed to belong to the match branch -- confirm.
        ryzd.append(lines)
# -*- coding: UTF-8 -*-
# Walk every txt record in the treatment-course directory and reduce each line
# to the text returned by EMRdef.tq_bnum with punctuation and spaces removed.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')  # list the txt records
a_out = []
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capturing group, re.split() then returns None for it whenever another
# alternative matches, and "".join() fails with TypeError.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|>|,|。|:|<|;|‘|’|【|】|\(|\)|·|!|\*|\/|…'
strip_punct = re.compile(pattern)
# The record ID is the part of the file name before the first underscore.
id_prefix = re.compile(r'(^.+?)\_')

for emrtxt in emrtxts:
    record_id = "".join(id_prefix.findall(os.path.basename(emrtxt)))
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            # remove spaces, dots and multiplication signs
            line = line.replace(' ', '').replace('.', '').replace('×', '')
            a = EMRdef.tq_bnum(line)
            # Join the pieces returned by tq_bnum, then drop punctuation and spaces.
            a_end = strip_punct.sub('', "".join(a)).replace(' ', '')
# -*- coding: UTF-8 -*-
# Dictionary-based extraction.
# Step 1 rewrites every treatment record with one punctuation-delimited clause
# per line; step 2 prepares the lab-indicator extraction over those files.
import os
import EMRdef
import re

# ---- Step 1: split each record on punctuation ------------------------------
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc')  # list the txt records
pattern2 = r'、|;|:|、|:|,'  # clause delimiters
clause_split = re.compile(pattern2)

for emrtxt2 in emrtxt2s:
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt2, 'r', errors="ignore") as f2:
        f2_end = clause_split.split(f2.read())
    f2_out = "\n".join(f2_end)  # one clause per line
    emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]  # file stem
    EMRdef.text_create(r'D:\DeepLearning ER\EHRzlgc3', '.txt', emrpath2, f2_out)

# ---- Step 2: extract paragraphs by lab indicator ---------------------------
with open(r'D:\python\EMR\hyzb.txt', 'r', errors="ignore") as b:
    brl = b.readlines()  # one indicator entry per line
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc3')  # files written by step 1
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capturing group that matches the empty string and makes re.split() emit None.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|,|。|:|;|‘|’|【|】|\(|\)|·|!|、|…'

for emrtxt in emrtxts:
    # NOTE(review): the code that reads `f` is not part of this excerpt; close
    # the handle (or switch to `with`) where it is consumed.
    f = open(emrtxt, 'r', errors="ignore")
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    test_out = []
C.append(i) return C import re f = open('D:\DeepLearning ER\Z1006014.txt', 'r', errors='ignore') g = open(r'C:\Users\Administrator\Desktop\ICD-10.txt', 'r', errors='ignore') line_re = [] lines = f.readlines() dics = g.readlines() out = [] for line in lines: line = re.sub('\n', '', line) line = re.sub(' ', '', line) line = re.sub(r'\?|?', '', line) line = re.sub(r'\,|\.|;', '', line) line_re.append(line) while '' in line_re: line_re.remove('') for line in line_re: for dic in dics: dic = re.sub('\n', '', dic) if set(line) == set(dic): out.append(dic) elif SBS(line, dic) > 0.8 and SBS(line, dic) < 1: out.append(dic) import EMRdef out = EMRdef.delre(out) EMRdef.dic_save(r'D:\DeepLearning ER\JBML.txt', out)
# Pair each admission-diagnosis file with the same-named treatment file and
# concatenate their lines (treatment lines first).
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')  # diagnosis files
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')  # treatment files
out = []

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    for emrtxt2 in emrtxt2s:
        emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
        if emrpath != emrpath2:
            continue
        # Both handles are closed as soon as the lines are read;
        # errors="ignore" drops bytes that cannot be decoded.
        with open(emrtxt, 'r', errors="ignore") as f, \
                open(emrtxt2, 'r', errors="ignore") as f2:
            a = f.readlines()
            b = f2.readlines()
        c = b + a
# -*- coding: UTF-8 -*-
# Scan every txt record in the source directory, pick the lines containing the
# treatment-plan keyword, split them on punctuation (one clause per line) and
# export the result to a file named after the record ID.
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHR')  # list the txt records
pattern2 = r'。|:|、|,'  # clause delimiters
clause_split = re.compile(pattern2)
# The record ID is the part of the file name before the first underscore.
id_prefix = re.compile(r'(^.+?)\_')
zljhs = []

for emrtxt in emrtxts:
    record_id = "".join(id_prefix.findall(os.path.basename(emrtxt)))
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line = line.replace(' ', '')  # remove spaces
            if '诊疗计划' not in line:
                continue
            f2_out = "\n".join(clause_split.split(line))
            EMRdef.text_create(r'D:\DeepLearning ER\EHRzhusu', '.txt', record_id, f2_out)
# -*- coding: UTF-8 -*-
# Tokenise treatment records by administration route and dose/dosage form.
import os
import EMRdef
import string
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')  # list the txt records
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capturing group, re.split() then returns None for it whenever another
# alternative matches, and "".join() fails with TypeError.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|>|,|。|:|<|;|‘|’|【|】|\(|\)|·|!|\*|\/|…'
strip_punct = re.compile(pattern)
hyjg = []

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    f_end = []
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            c = line  # untouched copy, passed to EMRdef.rre below
            # remove spaces, dots and multiplication signs
            line = line.replace(' ', '').replace('.', '').replace('×', '')
            a = EMRdef.tq_bnum(line)
            # Join the pieces returned by tq_bnum, then drop punctuation and spaces.
            a_end = strip_punct.sub('', "".join(a)).replace(' ', '')
            if a_end:
                acb = EMRdef.rre(c, a_end, a_end + ':', 1)
# -*- coding: UTF-8 -*-
# Read the drug list, strip the line breaks, pass the entries through
# EMRdef.delre and save the result.
import os
import EMRdef
import re

# Punctuation alternation kept from the original script (not used below).
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|\^|&|=|,|。|:|;|‘|’|【|】|·|!|、|…'

# errors="ignore" drops bytes that cannot be decoded.
with open(r'D:\python\EMR\967ywml.txt', 'r', errors="ignore") as b:
    brl = b.readlines()

# The original also ran re.sub('', '', bl), which replaces nothing.
# NOTE(review): that pattern may once have held a character that was lost
# (e.g. an invisible one); restore it here if so.
adult = [bl.replace('\n', '') for bl in brl]
adult_c = EMRdef.delre(adult)
EMRdef.text_save(r'D:\python\EMR\967yw.txt', adult_c)
# -*- coding: UTF-8 -*-
# Keep only the part of each record that follows the first diagnosis heading,
# ready for the next processing step.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHR-all')  # list the txt records
# Punctuation alternation (not used below). The '.' is escaped so that it
# matches a literal dot instead of every character.
pattern = r',|\.|,|。|;|;'
# The record ID is the part of the file name before the first underscore.
id_prefix = re.compile(r'(^.+?)\_')
headings = ('初步诊断', '最后诊断')

for emrtxt in emrtxts:
    record_id = "".join(id_prefix.findall(os.path.basename(emrtxt)))
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        temp = f.readlines()
    # Find the first line holding a diagnosis heading. That line and everything
    # before it is discarded; with no heading, the whole record is discarded.
    cut = len(temp)
    for idx, line in enumerate(temp):
        if any(heading in line for heading in headings):
            cut = idx + 1
            break
    tem_del = temp[:cut]
    # Slice instead of list(set(temp) - set(tem_del)): the set difference
    # scrambled the line order and dropped repeated lines.
    temp = temp[cut:]
# Load the ICD-10 name list and pull the text of each numbered diagnosis
# entry out of the diagnosis records.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd')  # list the txt records
# errors='ignore' drops bytes that cannot be decoded.
with open(r'C:\Users\Administrator\Desktop\ICD-10.txt', errors='ignore') as dis:
    ds = dis.readlines()
ds_cs = [line.replace('\n', '') for line in ds]  # entries without line breaks
ryzd = []
output = []
# Captures the text that follows a leading "<number>、" marker.
pattern = r'\s*\d+、+\s?(.*)'
c = re.compile(pattern)  # compiled once instead of once per file

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line2 = line.strip()
            # Captured entry text, or '' when the line has no numbered marker.
            line4 = ''.join(c.findall(line2))
# Load the JBML dictionary and pull the text of each numbered diagnosis entry
# out of the diagnosis records.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd')  # list the txt records
# errors='ignore' drops bytes that cannot be decoded.
with open(r'C:\Users\Administrator\Desktop\JBML.txt', errors='ignore') as g:
    dics = g.readlines()
ryzd = []
output = []
line_re = []
# Captures the text that follows a leading "<number>、" marker.
pattern = r'\s*\d+、+\s?(.*)'
c = re.compile(pattern)  # compiled once instead of once per file

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line2 = line.strip()
            # Captured entry text, or '' when the line has no numbered marker.
            line4 = ''.join(c.findall(line2))
            line = line4.replace('\n', '')
import time import math import os import sys import os, os.path, shutil import codecs import EMRdef import re emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRzhzd') #txt目录提取 for emrtxt in emrtxts: f = open(emrtxt, 'r', errors="ignore") #中文加入errors emrpath = os.path.basename(emrtxt) emrpath = os.path.splitext(emrpath)[0] #提取目录 pattern = r'\s*\d+、+\s?(.*)' c = re.compile(pattern) output = [] for line in f.readlines(): line1 = line.strip('\n') line2 = ''.join(line1) line2 = line2.strip() line3 = c.findall(line2) line3 = ''.join(line3) line4 = str(line3) out = line4 if out.find('肺')>-1 or out.find('呼吸')>-1 or out.find('气管')>-1 or out.find('呼吸')>-1 \ or out.find('筛窦')>-1 or out.find('上额窦')>-1 or out.find('胸腔')>-1 or out.find('鼻')>-1 \ or out.find('蝶窦')>-1 or out.find('蝶窦')>-1 : output.append(out) output = EMRdef.delre(output) output1 = '\n'.join(output) EMRdef.text_create(r'D:\DeepLearning ER\EHRzhzd2', '.txt', emrpath,
C.append(i) return C import re f = open('D:\DeepLearning ER\Z1006014.txt', 'r', errors='ignore') g = open(r'C:\Users\Administrator\Desktop\ICD-10.txt', 'r', errors='ignore') line_re = [] lines = f.readlines() dics = g.readlines() out = [] for line in lines: line = re.sub('\n', '', line) line = re.sub(' ', '', line) line = re.sub(r'\?|?', '', line) line = re.sub(r'\,|\.|;', '', line) line_re.append(line) while '' in line_re: line_re.remove('') for line in line_re: for dic in dics: dic = re.sub('\n', '', dic) if set(line) == set(dic): out.append(dic) elif SBS(line, dic) > 0.9: out.append(dic) import EMRdef out = EMRdef.delre(out) print(out)
# -*- coding:utf-8 -*- import time import math import os import sys import os, os.path,shutil import codecs import EMRdef import re emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR-all')#txt目录提取 for emrtxt in emrtxts: f = open(emrtxt,'r',errors="ignore")#中文加入errors emrtxt = os.path.basename(emrtxt) emrtxt_str = re.findall(r'(^.+?)\_',emrtxt)#提取ID emrtxt = "".join(emrtxt_str)#转成str out = [] for line in f.readlines(): if line.find(r'男')>-1: out.append('M') elif line.find(r'女')>-1: out.append('W') if line.find('岁')>-1: line = re.sub('岁','',line) line = ''.join(line) output = ' '.join(out) EMRdef.text_create(r'D:\DeepLearning ER\EHRbase','.txt' ,emrtxt,output) se = int(line) if se <=20: a = 'Child' elif se <=40:
# Data cleaning: collect the admission-diagnosis lines that mention a term
# from the respiratory-disease dictionary.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd
import numpy as np

# NumPy print option: never abbreviate arrays when printing.
np.set_printoptions(threshold=np.inf)

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')  # list the txt records
# errors="ignore" drops bytes that cannot be decoded.
with open(r'D:\python\EMR\hxjbml.txt', errors="ignore") as hxjb:
    hxjbdic = hxjb.readlines()  # respiratory-disease dictionary, one term per line
# Strip the line breaks once, outside the loops. Blank entries are skipped:
# an empty term is found in every string and would select every line.
terms = [term for term in (raw.replace('\n', '') for raw in hxjbdic) if term]
pneumonia_prefix = re.compile(r'(.+?)肺炎')
line_out = []

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line = line.replace('\n', '')
            # Collapse whatever precedes the pneumonia keyword into the keyword.
            line = pneumonia_prefix.sub('肺炎', line)
            # One entry per matching term, as in the original.
            for hxjbc in terms:
                if hxjbc in line:
                    line_out.append(line)

line_output = EMRdef.delre(line_out)
# Load the ICD-10 name list and pull the text of each numbered diagnosis
# entry out of the diagnosis records.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd')  # list the txt records
# errors='ignore' drops bytes that cannot be decoded.
with open(r'C:\Users\Administrator\Desktop\ICD-10.txt', errors='ignore') as dis:
    ds = dis.readlines()
ds_cs = [line.replace('\n', '') for line in ds]  # entries without line breaks
ryzd = []
output = []
# Captures the text that follows a leading "<number>、" marker.
pattern = r'\s*\d+、+\s?(.*)'
c = re.compile(pattern)  # compiled once instead of once per file

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line2 = line.strip()
            # Captured entry text, or '' when the line has no numbered marker.
            line4 = ''.join(c.findall(line2))
# Load the JBML dictionary and pull the text of each numbered diagnosis entry
# out of the diagnosis records, with spaces removed.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd')  # list the txt records
# errors='ignore' drops bytes that cannot be decoded.
with open(r'C:\Users\Administrator\Desktop\JBML.txt', errors='ignore') as g:
    dics = g.readlines()
ryzd = []
output = []
line_re = []
# Captures the text that follows a leading "<number>、" marker.
pattern = r'\s*\d+、+\s?(.*)'
c = re.compile(pattern)  # compiled once instead of once per file

for emrtxt in emrtxts:
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line2 = line.strip()
            # Captured entry text, or '' when the line has no numbered marker.
            line4 = ''.join(c.findall(line2))
            line = line4.replace('\n', '').replace(' ', '')
import time import math import os import sys import os, os.path, shutil import codecs import EMRdef import re import pandas as pd emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRzhzd2') #txt目录提取 dis = open(r'C:\Users\Administrator\Desktop\ICD-10.txt', errors='ignore') ds = dis.readlines() ds_cs = [] ryzd = [] for line in ds: line = re.sub('\n', '', line) ds_cs.append(line) for emrtxt in emrtxts: out = [] f = open(emrtxt, 'r', errors="ignore") #中文加入errors emrpath = os.path.basename(emrtxt) emrpath = os.path.splitext(emrpath)[0] #提取目录 lines = f.readlines() for line in lines: line = re.sub('\n', '', line) for ds_c in ds_cs: if line.find(ds_c) > -1: out.append(d) if set(line) == set(ds_c): out.append(ds_c) elif EMRdef.SBS(line, ds_c) > 0.6 and EMRdef.SBS(line, ds_c) < 1:
import time import math import os import sys import os, os.path,shutil import codecs import EMRdef import re import pandas as pd emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRzhzd2')#txt目录提取 dis = open(r'C:\Users\Administrator\Desktop\JBML.txt',errors='ignore') ds=dis.readlines() ds_cs = [] for line in ds: line = re.sub('\n','',line) ds_cs.append(line) for emrtxt in emrtxts: out = [] f = open(emrtxt,'r',errors="ignore")#中文加入errors emrpath = os.path.basename(emrtxt) emrpath = os.path.splitext(emrpath)[0]#提取目录 lines = f.readlines() for line in lines: line = re.sub('\n','',line) for ds_c in ds_cs: if set(line) == set(ds_c): out.append(ds_c) elif EMRdef.SBS(line,dic)>0.8 and SBS(line,dic) <1:
# Iterate over the diagnosis records in the EHRzhzd4 directory.
import time
import math
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re
import pandas as pd

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd4')  # list the txt records

for emrtxt in emrtxts:
    out = []
    # errors="ignore" drops bytes that cannot be decoded.
    # NOTE(review): the code that reads `f` is not part of this excerpt; close
    # the handle (or switch to `with`) where it is consumed.
    f = open(emrtxt, 'r', errors="ignore")
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]  # file stem
# -*- coding: UTF-8 -*-
# Export, unchanged, every record in which some line (ignoring spaces)
# contains the inhaler keyword. The export file is named after the record ID.
import os
import sys
import os.path
import shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHR1')  # list the txt records
pattern2 = r'。|:|、|,'  # clause delimiters (not used below)
# The record ID is the part of the file name before the first underscore.
id_prefix = re.compile(r'(^.+?)\_')
zljhs = []

for emrtxt in emrtxts:
    record_id = "".join(id_prefix.findall(os.path.basename(emrtxt)))
    # errors="ignore" drops bytes that cannot be decoded.
    with open(emrtxt, 'r', errors="ignore") as f:
        a_out = f.readlines()
    # Export once per record; the original re-exported the whole record for
    # every matching line.
    if any('吸入剂' in line.replace(' ', '') for line in a_out):
        f2_out = "".join(a_out)
        EMRdef.text_create(r'D:\DeepLearning ER\EHRxiaochuan', '.txt', record_id, f2_out)