import os
import re

# clean_text, normalize_tokens, classification_cmp, and DocumentException are
# assumed to be defined elsewhere in the surrounding module.


def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        try:
            # Original text file
            text = f.read().strip('\n')
            # text = f.read()
        except UnicodeDecodeError:
            # fall back to an explicit utf-8 read if the default encoding fails
            f = open(txt, encoding="utf8")
            text = f.read().strip('\n')
            f.close()

    # tokenize
    sentences = sent_tokenize(text)
    # print(sentences)
    '''
    ['DATE OF ADMISSION : MM/DD/YYYY', 'DATE OF DISCHARGE : MM/DD/YYYY',
     'DISCHARGE DIAGNOSES :', '1 . Vasovagal syncope , status post fall .',
     '2 . Traumatic arthritis , right knee .', '3 . Hypertension .', '4 ]
    '''
    for sentence in sentences:
        sent = clean_text(sentence.rstrip())

        # lowercase
        sent = sent.lower()

        toks = word_tokenize(sent)

        # normalize tokens
        normed_toks = normalize_tokens(toks)
        #for w in normed_toks:
        #    print(w)
        #print()

        tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

    # Safe guard against concept file having duplicate entries
    tok_concepts = list(set(tok_concepts))

    # Concept file does not guarantee ordering by line number
    tok_concepts = sorted(tok_concepts, key=lambda t: t[1:])

    # Ensure no overlapping concepts (that would be bad)
    for i in range(len(tok_concepts) - 1):
        c1 = tok_concepts[i]
        c2 = tok_concepts[i + 1]
        if c1[1] == c2[1]:
            if c1[2] <= c2[2] and c2[2] <= c1[3]:
                fname = os.path.basename(con)
                error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                error2 = "It can't be processed until you remove one"
                error3 = 'Please modify this file: %s' % con
                error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                    ' '.join(tokenized_sents[c1[1] - 1][c1[2]:c1[3] + 1]),
                    c1[1], c1[2], c1[1], c1[3], c1[0])
                error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                    ' '.join(tokenized_sents[c2[1] - 1][c2[2]:c2[3] + 1]),
                    c2[1], c2[2], c2[1], c2[3], c2[0])
                error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                    error1, error2, error3, error4, error5)
                raise DocumentException(error_msg)
    # print(tok_concepts)
    # ('treatment', 48, 2, 2), ('treatment', 49, 5, 5)]

    return tokenized_sents, tok_concepts
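# Illustrative usage sketch (not part of the original code): 'report.txt' and
# 'report.con' are hypothetical placeholder paths for a tokenized record and
# its i2b2 concept annotations.
def _example_read_i2b2_usage():
    tokenized_sents, tok_concepts = read_i2b2('report.txt', 'report.con')

    # tokenized_sents is a list of token lists, one per line of the record;
    # tok_concepts is a list of (label, lineno, start_tok, end_tok) tuples,
    # e.g. ('treatment', 48, 2, 2), with 1-based line numbers and 0-based,
    # inclusive token offsets.
    for label, lineno, start, end in tok_concepts:
        span = ' '.join(tokenized_sents[lineno - 1][start:end + 1])
        print('%s -> %s' % (label, span))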
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

    # tokenize
    sentences = sent_tokenize(text)
    for sentence in sentences:
        sent = clean_text(sentence.rstrip())

        # lowercase
        sent = sent.lower()

        toks = word_tokenize(sent)

        # normalize tokens
        normed_toks = normalize_tokens(toks)
        #for w in normed_toks:
        #    print w
        #print

        tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

    # Safe guard against concept file having duplicate entries
    tok_concepts = list(set(tok_concepts))

    # Concept file does not guarantee ordering by line number
    tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

    # Ensure no overlapping concepts (that would be bad)
    for i in range(len(tok_concepts) - 1):
        c1 = tok_concepts[i]
        c2 = tok_concepts[i + 1]
        if c1[1] == c2[1]:
            if c1[2] <= c2[2] and c2[2] <= c1[3]:
                fname = os.path.basename(con)
                error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                error2 = "It can't be processed until you remove one"
                error_msg = '%s\n%s' % (error1, error2)
                raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
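# Standalone sketch of how the concept regex above parses one i2b2 annotation
# line; the sample line and its 'problem' label are illustrative, not taken
# from a real concept file.
def _example_parse_concept_line():
    line = 'c="vasovagal syncope" 4:2 4:3||t="problem"'
    concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
    groups = re.search(concept_regex, line).groups()
    # groups == ('vasovagal syncope', '4', '2', '4', '3', 'problem'):
    # concept text, start line:token, end line:token, concept label
    label = groups[5]
    start_lineno, start_tok = int(groups[1]), int(groups[2])
    end_lineno, end_tok = int(groups[3]), int(groups[4])
    assert start_lineno == end_lineno, 'concept must span single line'
    return (label, start_lineno, start_tok, end_tok)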
def readDocs(txt, concept):
    tokenizedSentences = []

    sentTokenize = lambda text: text.split('\n')
    wordTokenize = lambda text: text.split(' ')

    with open(txt) as foo:
        text = foo.read().strip('\n')

    sentences = sentTokenize(text)
    for s in sentences:
        sent = clean_text(s.rstrip())
        sent = sent.lower()
        tokens = wordTokenize(sent)
        normedTokens = normalize_tokens(tokens)
        tokenizedSentences.append(normedTokens)

    tokenizedConcepts = []
    if concept:
        with open(concept) as foo:
            for l in foo.readlines():
                if not l.strip():
                    continue

                conceptRegex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(conceptRegex, l.strip())
                groups = match.groups()

                concept_text = groups[0]
                beginLineNum = int(groups[1])
                beginTokenIndex = int(groups[2])
                lastLineNum = int(groups[3])
                lastTokenIndex = int(groups[4])
                conceptLabel = groups[5]

                assert beginLineNum == lastLineNum, 'concept must span single line'

                tup = (conceptLabel, beginLineNum, beginTokenIndex, lastTokenIndex)
                tokenizedConcepts.append(tup)

    tokenizedConcepts = list(set(tokenizedConcepts))
    tokenizedConcepts = sorted(tokenizedConcepts, key=lambda t: t[1:])

    # Ensure no overlapping concepts (that would be bad)
    for i in range(len(tokenizedConcepts) - 1):
        c1 = tokenizedConcepts[i]
        c2 = tokenizedConcepts[i + 1]
        if c1[1] == c2[1]:
            if c1[2] <= c2[2] and c2[2] <= c1[3]:
                fname = os.path.basename(concept)
                error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                error2 = "It can't be processed until you remove one"
                error3 = 'Please modify this file: %s' % concept
                error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                    ' '.join(tokenizedSentences[c1[1] - 1][c1[2]:c1[3] + 1]),
                    c1[1], c1[2], c1[1], c1[3], c1[0])
                error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                    ' '.join(tokenizedSentences[c2[1] - 1][c2[2]:c2[3] + 1]),
                    c2[1], c2[2], c2[1], c2[3], c2[0])
                error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                    error1, error2, error3, error4, error5)
                raise DocumentException(error_msg)

    return tokenizedSentences, tokenizedConcepts
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

    # tokenize
    sentences = sent_tokenize(text)
    for sentence in sentences:
        sent = clean_text(sentence.rstrip())

        # lowercase (like word2vec preprocessing)
        sent = sent.lower()

        toks = word_tokenize(sent)

        # normalize tokens
        normed_toks = normalize_tokens(toks)
        #for w in normed_toks:
        #    print w
        #print

        tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

    # Safe guard against concept file having duplicate entries
    tok_concepts = list(set(tok_concepts))

    # Concept file does not guarantee ordering by line number
    tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

    # Ensure no overlapping concepts (that would be bad)
    for i in range(len(tok_concepts) - 1):
        c1 = tok_concepts[i]
        c2 = tok_concepts[i + 1]
        if c1[1] == c2[1]:
            if c1[2] <= c2[2] and c2[2] <= c1[3]:
                fname = os.path.basename(con)
                error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                error2 = "It can't be processed until you remove one"
                error_msg = '%s\n%s' % (error1, error2)
                raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
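# Standalone sketch of the duplicate-removal / ordering / overlap invariant
# enforced above, on hand-made (label, lineno, start_tok, end_tok) tuples.
# The tuples themselves are illustrative, not taken from a real concept file.
def _example_overlap_check():
    concepts = [('problem', 4, 2, 3), ('problem', 4, 2, 3), ('test', 4, 3, 5)]

    # de-duplicate, then order by (line number, start token, end token)
    concepts = sorted(set(concepts), key=lambda t: t[1:])

    # adjacent pairs on the same line must not share any token positions
    for c1, c2 in zip(concepts, concepts[1:]):
        if c1[1] == c2[1] and c1[2] <= c2[2] <= c1[3]:
            return (c1, c2)   # overlapping pair (here they share token 3 of line 4)
    return None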