def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections': note_sections})
    # process_section (defined elsewhere) cleans each row and appends to processed_sections
    section_frame.apply(process_section, args=(note, processed_sections,), axis=1)
    return processed_sections
def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    for c, i in enumerate(note_sections):
        note_sections[c] = re.sub(r'[0-9]+\.', '', note_sections[c])  # remove '1.', '2.'
        note_sections[c] = re.sub(r'(-){2,}|_{2,}|={2,}', '', note_sections[c])  # remove ----, ____, ====
        note_sections[c] = re.sub(r'dr\.', 'doctor', note_sections[c])
        note_sections[c] = re.sub(r'm\.d\.', 'md', note_sections[c])
    # de-identified PHI placeholders look like [** ... **]; repl is defined elsewhere
    regex = r'(\[\*\*[^*]*\*\*\])'
    processed_sections = [re.sub(regex, repl, i) for i in note_sections]
    processed_sections = [nlp(i.strip()) for i in processed_sections
                          if i is not None and len(i.strip()) > 0]
    return processed_sections  # list of spaCy Doc objects
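# Minimal usage sketch for the second process_note_helper variant above.
# Assumptions not in the original code: `repl` collapses each [** ... **]
# placeholder to 'unk', `nlp` is a stock spaCy pipeline, and
# sent_tokenize_rules is importable from this repo; the sample note is made up.
import re
import spacy

repl = lambda match: 'unk'          # hypothetical placeholder handler
nlp = spacy.load('en_core_web_sm')  # assumed spaCy model

sample_note = "1. seen by dr. jones, m.d.\n----\nthe patient tolerated [**Hospital1**] care well."
for doc in process_note_helper(sample_note):
    print([token.text for token in doc])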
def extract_description(subject_id, episode_id):
    """
    Extract the input to the summariser for a hospital course summary.
    * currently focuses on episodes covered by one and only one discharge summary
    """
    date = summaries[(summaries.SUBJECT_ID == subject_id)
                     & (summaries.HADM_ID == episode_id)].CHARTDATE.iloc[0]
    # extract all non-discharge notes charted on or before the summary date
    relevant_rows = notes[(notes.SUBJECT_ID == subject_id)
                          & (notes.HADM_ID == episode_id)
                          & (notes.CHARTDATE <= date)
                          & (notes.CATEGORY != 'Discharge summary')]
    text = relevant_rows.TEXT.str.cat(sep=' ')
    # tokenisation
    sents = sent_tokenize_rules(text)
    output = ""
    for sent in sents:
        # convert to lower case
        sent = sent.lower()
        # replace confidential (de-identified) tokens
        sent = re.sub(r"\[\*+.+\*+\]", "unk", sent)
        # remove patterns like "**** CPT codes ****"
        sent = re.sub(r'^\*+.+\*+$', "", sent)
        # replace new line characters
        sent = sent.replace('\n', ' ')
        sent = sent.replace('/', ' ')
        doc = nlp(sent)
        output += " ".join([token.text for token in doc if token.text.strip()]) + " "
    return '<sec> ' + output.strip() + '\n' if output else None
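# Hedged example of calling extract_description for one admission. The
# SUBJECT_ID/HADM_ID values and the output path are placeholders; `summaries`
# and `notes` are the DataFrames assumed above.
desc = extract_description(subject_id=123, episode_id=456)
if desc is not None:
    with open('data/descriptions.txt', 'a') as out:  # hypothetical output file
        out.write(desc)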
def extract_summary(file_name):
    """
    Generate hospital course summary in the required format for LeafNATS
    ----------------
    Args:
        file_name: name of the file for the raw summary
    Returns:
        summary: processed hospital course summary
    """
    with open(summary_path + file_name, 'r') as f:
        summary = f.read()
    sections = sent_tokenize_rules(summary)
    output = ""
    for sec in sections:
        # convert to lower case
        sec = sec.lower()
        # replace confidential (de-identified) tokens
        sec = re.sub(r"\[\*+.+\*+\]", "unk", sec)
        # remove patterns like "**** CPT codes ****"
        sec = re.sub(r'^\*+.+\*+$', "", sec)
        # replace new line characters
        sec = sec.replace('\n', ' ')
        sec = sec.replace('/', ' ')
        for sent in nltk.sent_tokenize(sec):
            output += '<s> ' + ' '.join([
                token for token in nltk.word_tokenize(sent) if token.strip()
            ]) + ' </s> '
    return output.strip()
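# Sketch of running extract_summary over every raw summary file. `summary_path`
# is assumed to be set as in the function above; the output file name is
# illustrative, not from the original code.
import os

with open('data/summaries_processed.txt', 'w') as out:  # hypothetical output file
    for file_name in os.listdir(summary_path):
        out.write(extract_summary(file_name) + '\n')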
notes = notes[notes.CATEGORY == category]

# for other notes
if len(sys.argv) < 2:
    print('Please specify the batch number.')
    sys.exit()
batch = sys.argv[1]
other_notes = pd.read_csv('data/notes_batch_{}.csv'.format(batch))

print("start processing: batch {}".format(batch))
to_process = notes if discharge else other_notes  # `discharge` flag set earlier
for text in tqdm(to_process.TEXT):
    sents = sent_tokenize_rules(text)
    for sent in sents:
        # replace de-identified [** ... **] placeholders with "unk"
        sent = re.sub(r"\[\*\*.{0,15}.*?\*\*\]", "unk", sent)
        if not sent or sent.strip() == '\n':
            continue
        sent = sent.replace('\n', ' ')
        sent = sent.replace('/', ' ')
        tokens = nlp(sent)
        for token in tokens:
            word = token.text.strip().lower()
            if not word:
                continue
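# Quick illustration of the de-identification regex above on a made-up
# sentence; the bracketed spans mimic MIMIC-style [** ... **] PHI placeholders.
import re

example = "Seen by [**First Name 123**] on [**2101-1-1**].\nFollow-up/labs pending."
cleaned = re.sub(r"\[\*\*.{0,15}.*?\*\*\]", "unk", example)
cleaned = cleaned.replace('\n', ' ').replace('/', ' ')
print(cleaned)  # -> Seen by unk on unk. Follow-up labs pending.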