def get_labels(self):
    """Return per-sentence IOB labels for every token in the note.

    Lazily computes ``self.iob_labels`` (a list of sentences, each a list of
    ``{'entity_label', 'entity_type', 'entity_id'}`` dicts) by aligning the
    raw note text against the annotated (tagged) TimeML text character by
    character to recover the character offset span of each tagged entity,
    then assigning an IOB label to every pre-processed token via
    ``TimeNote.get_label``.

    Returns the cached ``self.iob_labels`` unchanged when there is no
    annotated note, or when labels were already computed.
    """
    if self.annotated_note_path is not None and self.iob_labels == []:

        # don't want to modify original
        pre_processed_text = copy.deepcopy(self.pre_processed_text)

        # need to create a list of tokens
        iob_labels = []

        tagged_entities = get_tagged_entities(self.annotated_note_path)
        _tagged_entities = copy.deepcopy(tagged_entities)

        raw_text = get_text(self.note_path)
        labeled_text = get_text_with_taggings(self.annotated_note_path)

        # lots of checks!
        # NOTE(review): sequential single-char strips are order dependent and
        # not equivalent to str.strip(); preserved as-is to keep behavior.
        for char in ['\n'] + list(whitespace):
            raw_text = raw_text.strip(char)
            labeled_text = labeled_text.strip(char)

        # normalize quote styles so the two texts align character-for-character
        raw_text = re.sub(r"``", r"''", raw_text)
        labeled_text = re.sub(r'"', r"'", labeled_text)

        raw_text = re.sub("<TEXT>\n+", "", raw_text)
        raw_text = re.sub("\n+</TEXT>", "", raw_text)
        labeled_text = re.sub("<TEXT>\n+", "", labeled_text)
        labeled_text = re.sub("\n+</TEXT>", "", labeled_text)

        raw_index = 0
        labeled_index = 0

        raw_char_offset = 0
        labeled_char_offset = 0

        # should we count?
        count_raw = True
        count_labeled = True

        text1 = ""
        text2 = ""

        start_count = 0
        end_count = 0

        offsets = {}

        tagged_element = None

        # need to get char based offset for each tagging within annotated timeml doc.
        while raw_index < len(raw_text) or labeled_index < len(labeled_text):

            if raw_index < len(raw_text):
                if count_raw is True:
                    raw_char_offset += 1
                    text1 += raw_text[raw_index]
                raw_index += 1

            if labeled_index < len(labeled_text):

                # TODO: change this to be an re match.
                # '<' opening a tag (but not '</'): start of a tagged entity.
                if labeled_text[labeled_index:labeled_index+1] == '<' and labeled_text[labeled_index:labeled_index+2] != '</':
                    tagged_element = tagged_entities.pop(0)
                    count_labeled = False
                    start_count += 1

                elif labeled_text[labeled_index:labeled_index+2] == '</':
                    count_labeled = False
                    start_count += 1

                if labeled_text[labeled_index:labeled_index+1] == ">":
                    # fix: identity comparison with None -> 'is not None'
                    if tagged_element is not None:
                        start = labeled_char_offset
                        end = labeled_char_offset + len(tagged_element.text) - 1

                        # spans should be unique?
                        offsets[(start, end)] = {"tagged_xml_element": tagged_element,
                                                 "text": tagged_element.text}

                        # ensure the text at the offset is correct
                        assert raw_text[start:end + 1] == tagged_element.text, "\'{}\' != \'{}\'".format(
                            raw_text[start:end + 1], tagged_element.text)

                        tagged_element = None

                    end_count += 1
                    count_labeled = True

                    labeled_index += 1
                    continue

                if count_labeled is True:
                    labeled_char_offset += 1
                    text2 += labeled_text[labeled_index]

                labeled_index += 1

        # sanity checks: both texts consumed, tags balanced, offsets complete
        assert text1 == text2, "{} != {}".format(text1, text2)
        assert start_count == end_count, "{} != {}".format(start_count, end_count)
        assert raw_index == len(raw_text) and labeled_index == len(labeled_text)
        assert raw_char_offset == labeled_char_offset
        assert len(tagged_entities) == 0
        assert tagged_element is None
        assert len(offsets) == len(_tagged_entities)

        for sentence_num in sorted(pre_processed_text.keys()):

            # list of dicts
            sentence = pre_processed_text[sentence_num]

            # iobs in a sentence
            iobs_sentence = []

            # need to assign the iob labels by token index
            for token in sentence:

                # set proper iob label to token
                iob_label, entity_type, entity_id = TimeNote.get_label(token, offsets)

                # fix: was "iob_label is not 'O'" (string identity comparison,
                # only correct by interning accident) -> value comparison.
                if iob_label != 'O':
                    assert entity_id is not None
                    assert entity_type in ['EVENT', 'TIMEX3']
                else:
                    assert entity_id is None
                    assert entity_type is None

                iobs_sentence.append({'entity_label': iob_label,
                                      'entity_type': entity_type,
                                      'entity_id': entity_id})

            iob_labels.append(iobs_sentence)

        self.iob_labels = iob_labels

    return self.iob_labels
def __init__(self, timeml_note_path, annotated_timeml_path=None, verbose=False):
    """Construct a TimeNote from a TimeML document.

    Runs the base ``Note`` initialization, reads the document body, sends it
    through the NewsReader pre-processing pipeline, and caches the tokenized
    text, offsets, sentence features, dependency paths and token-id map.
    When an annotated TimeML path is supplied, TLINKs and IOB labels are
    extracted eagerly.

    timeml_note_path       -- path to the raw TimeML document.
    annotated_timeml_path  -- optional path to the annotated (gold) TimeML.
    verbose                -- print a trace message when True.
    """
    if verbose:
        # parenthesized: identical output under Python 2, valid in Python 3
        print("called TimeNote constructor")

    # fix: Note.__init__ returns None; binding it to an unused local
    # (`_Note = ...`) was misleading, so call it purely for its side effects.
    Note.__init__(self, timeml_note_path, annotated_timeml_path)

    # get body of document
    data = get_text(timeml_note_path)

    # original text body of timeml doc
    self.original_text = data

    # send body of document to NewsReader pipeline.
    tokenized_text, token_to_offset, sentence_features, dependency_paths, id_to_tok = pre_processing.pre_process(data, timeml_note_path)

    # {sentence_num: [{token},...], ...}
    self.pre_processed_text = tokenized_text

    # contains the char based offsets generated by tokenizer, used for
    # asserting char offsets are correct: {'token':[(start, end),...],...}
    self.token_to_offset = token_to_offset

    # contains sentence level information extracted by newsreader
    self.sentence_features = sentence_features

    # dependency paths for sentences in the document
    self.dependency_paths = dependency_paths

    # map token ids to tokens within self.pre_processed_text: {'wid':'token'}
    self.id_to_tok = id_to_tok

    self.discourse_connectives = {}
    self.iob_labels = []

    self.tlinks = []

    if self.annotated_note_path is not None:
        self.get_tlinked_entities()

        # will store labels in self.iob_labels
        self.get_labels()