import copy
import csv
import os
import re
import sys

# EstNLTK v1.6 imports; exact module paths may vary across versions.
# Module-local helpers used below (import_brat_annotations, _postprocess_root,
# convert_timex_attributes, ...) are assumed to be defined elsewhere in this repo.
from estnltk import Text, Layer
from estnltk.layer.span import Span
from estnltk.layer.annotation import Annotation
from estnltk.taggers import WhiteSpaceTokensTagger, WordTagger
from estnltk.taggers import PretokenizedTextCompoundTokensTagger


def make_new_ot_morph_layer(old_text_obj, new_text_obj,
                            new_layer='original_words_morph_analysis',
                            old_layer='ot_morph_analysis',
                            new_parent_layer='original_words'):
    '''Creates a new 'original_words_morph_analysis' layer based on the old morph analysis layer.'''
    assert old_layer in old_text_obj.layers
    assert new_parent_layer in new_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    assert 'normalized_text' not in original_layer.attributes
    parent_layer = new_text_obj[new_parent_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=('normalized_text',) + original_layer.attributes,
                  parent=new_parent_layer,
                  ambiguous=True)
    assert len(parent_layer) == len(original_layer)
    for wid, parent_span in enumerate(parent_layer):
        old_morph_span = original_layer[wid]
        new_span = Span(base_span=parent_span.base_span, layer=layer)
        for ann in old_morph_span.annotations:
            new_annotation = {'normalized_text': parent_span.text}
            for a in layer.attributes:
                if a in ['start', 'end', 'text', 'normalized_text']:
                    continue
                new_annotation[a] = ann[a]
            new_span.add_annotation(Annotation(new_span, **new_annotation))
        layer.add_span(new_span)
    return layer
def make_new_sentences_layer(old_text_obj, new_text_obj,
                             new_layer='original_sentences_flat',
                             old_layer='sentences'):
    '''Creates a new 'original_sentences_flat' layer based on the old sentences layer.'''
    assert old_layer in old_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=original_layer.attributes,
                  parent=None,
                  ambiguous=False)
    layer.meta['desc'] = 'Original sentence tokenization from koondkorpus XML file.'
    for span in original_layer:
        layer.add_annotation((span.start, span.end))
    return layer
def test_intersection_more_elements(self):
    i = Intersection(self.adjectives(),
                     self.ed_suffix_regex(),
                     Layer('words'),
                     self.ed_suffix())
    matches = i.get_matches(self.text())
    self.assertListEqual(self.expected(), matches)
def make_new_words_layer(old_text_obj, new_text_obj,
                         new_layer='original_words', old_layer='words'):
    '''Creates a new 'original_words' layer based on the old words layer.'''
    assert old_layer in old_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=original_layer.attributes,
                  parent=None,
                  ambiguous=True)
    layer.meta['desc'] = 'Original word tokenization from koondkorpus XML file.'
    for span in original_layer:
        layer.add_annotation((span.start, span.end), normalized_form=None)
    return layer
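
# --- Usage sketch ----------------------------------------------------------
# A minimal illustration (not part of the original module) of how the three
# converters above chain together; the helper name `convert_koondkorpus_text`
# is hypothetical. Assumes `old_text` carries 'words', 'sentences' and
# 'ot_morph_analysis' layers. Note the ordering: make_new_ot_morph_layer needs
# the new 'original_words' layer, so the words layer must be added first.
def convert_koondkorpus_text(old_text):
    new_text = Text(old_text.text)
    new_text.add_layer(make_new_words_layer(old_text, new_text))
    new_text.add_layer(make_new_sentences_layer(old_text, new_text))
    new_text.add_layer(make_new_ot_morph_layer(old_text, new_text))
    return new_text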
def remove_attribs_from_layer(text, layer_name, new_layer_name, remove_attribs):
    '''Rewrites the given layer so that the attributes listed in remove_attribs
       are completely removed. Returns the new layer.'''
    new_attribs = [a for a in text[layer_name].attributes
                   if a not in remove_attribs]
    new_layer = Layer(name=new_layer_name,
                      text_object=text,
                      attributes=new_attribs,
                      parent=text[layer_name].parent,
                      ambiguous=text[layer_name].ambiguous)
    for span in text[layer_name]:
        for annotation in span.annotations:
            analysis = {attrib: annotation[attrib] for attrib in new_attribs}
            new_layer.add_annotation((span.start, span.end), **analysis)
    return new_layer
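
# --- Usage sketch ----------------------------------------------------------
# Illustrative only (not from the repo): strip 'root_tokens' and 'clitic' from
# a default-pipeline morph analysis layer; the slim layer name is hypothetical.
def _demo_remove_attribs():
    demo_text = Text('Tere!').tag_layer(['morph_analysis'])
    slim = remove_attribs_from_layer(demo_text, 'morph_analysis',
                                     'morph_analysis_slim',
                                     remove_attribs=['root_tokens', 'clitic'])
    demo_text.add_layer(slim)
    return demo_text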
def import_from_brat_folder(folder):
    assert os.path.isdir(folder), \
        "(!) Invalid folder name {!r}.".format(folder)
    annotation_files = dict()
    for fname in os.listdir(folder):
        if fname.endswith(('.ann', '.txt')):
            name, ext = os.path.splitext(fname)
            fpath = os.path.join(folder, fname)
            if name not in annotation_files:
                annotation_files[name] = []
            annotation_files[name].append(fpath)
    # Check that both .ann and .txt exist
    for name in annotation_files.keys():
        if len(annotation_files[name]) != 2:
            has_ann = any(fname for fname in annotation_files[name]
                          if fname.endswith('.ann'))
            has_txt = any(fname for fname in annotation_files[name]
                          if fname.endswith('.txt'))
            if not has_txt:
                raise ValueError('(!) Annotations file {!r} is missing .txt part.'.format(name))
            if not has_ann:
                raise ValueError('(!) Annotations file {!r} is missing .ann part.'.format(name))
    text_objects = []
    for name in annotation_files.keys():
        ann_file = [fname for fname in annotation_files[name]
                    if fname.endswith('.ann')][0]
        entity_annotations, rel_annotations = import_brat_annotations(ann_file)
        txt_file = [fname for fname in annotation_files[name]
                    if fname.endswith('.txt')][0]
        content = import_brat_text(txt_file)
        #
        # Create text object and entity annotations
        #
        text_obj = Text(content)
        text_obj.meta['file'] = name
        brat_entities = Layer('brat_entities',
                              attributes=('brat_id',),
                              text_object=text_obj)
        event_layer = Layer('events',
                            attributes=('brat_id', 'class', 'class_confidence',
                                        'duration', 'duration_confidence', 'comment'),
                            text_object=text_obj,
                            enveloping='brat_entities')
        timex_layer = Layer('timexes',
                            attributes=('brat_id', 'tid', 'type', 'value', 'mod',
                                        'anchor_time_id', 'comment'),
                            text_object=text_obj,
                            enveloping='brat_entities')
        entity_layer = Layer('entities',
                             attributes=('brat_id',),
                             text_object=text_obj,
                             enveloping='brat_entities')
        entity_id_to_loc_map = dict()
        for (entity_id, ent_type, start, end, attribs) in entity_annotations:
            # Check that location strings are the expected ones;
            # collect corrected locations
            corrected_locs = []
            if isinstance(start, int):
                corrected_start, delta = _calculate_corrected_start_and_delta(content, start)
                snippet = content[corrected_start:end + delta]
                assert snippet == attribs['text'], \
                    f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                corrected_locs.append((corrected_start, end + delta))
            elif isinstance(start, list):
                if len(start) == len(attribs['text']):
                    for s_start, s_end, s_text in zip(start, end, attribs['text']):
                        corrected_start, delta = _calculate_corrected_start_and_delta(content, s_start)
                        snippet = content[corrected_start:s_end + delta]
                        assert snippet == s_text, \
                            f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                        corrected_locs.append((corrected_start, s_end + delta))
                elif len(start) <= len(attribs['text']):
                    # Tricky case: there can be fewer entity locations than entity
                    # text strings, e.g. entity texts ['oli', 'kõige', 'parem']
                    # vs start locs [1904, 1908]
                    assert len(start) == len(end)
                    for s_start, s_end in zip(start, end):
                        corrected_start, delta = _calculate_corrected_start_and_delta(content, s_start)
                        snippet = content[corrected_start:s_end + delta]
                        assert any(s in snippet for s in attribs['text']), \
                            f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                        corrected_locs.append((corrected_start, s_end + delta))
                else:
                    raise Exception('(!) Mismatching number of locations and texts in {!r}'.format(
                        (entity_id, ent_type, start, end, attribs)))
            # add base layer: brat entities
            for s_start, s_end in corrected_locs:
                brat_entities.add_annotation((s_start, s_end), brat_id=entity_id)
            entity_id_to_loc_map[entity_id] = corrected_locs
            # add enveloping layers
            if ent_type == 'Event':
                attribs['brat_id'] = entity_id
                event_layer.add_annotation(corrected_locs, **attribs)
            elif ent_type == 'Timex':
                attribs['brat_id'] = entity_id
                timex_layer.add_annotation(corrected_locs, **attribs)
            elif ent_type == 'Entity':
                attribs['brat_id'] = entity_id
                entity_layer.add_annotation(corrected_locs, **attribs)
        text_obj.add_layer(brat_entities)
        text_obj.add_layer(event_layer)
        text_obj.add_layer(timex_layer)
        text_obj.add_layer(entity_layer)
        #
        # Add tlink relation annotations
        #
        relations_layer = Layer('tlinks',
                                attributes=('brat_id', 'a_text', 'rel_type', 'b_text', 'b_index'),
                                text_object=text_obj,
                                enveloping='brat_entities',
                                ambiguous=True)
        for (rel_arg1, rel_type, rel_arg2, rel_id) in rel_annotations:
            if rel_type == 'has_Argument':
                continue
            assert rel_arg1 in entity_id_to_loc_map
            assert rel_arg2 in entity_id_to_loc_map
            arg1_loc = entity_id_to_loc_map[rel_arg1]
            arg2_loc = entity_id_to_loc_map[rel_arg2]
            # check if the relation needs to be reversed
            if arg1_loc[0] > arg2_loc[0]:
                # reverse relation
                arg1_loc, arg2_loc = arg2_loc, arg1_loc
                # change rel_type accordingly
                if rel_type == 'AFTER':
                    rel_type = 'BEFORE'
                elif rel_type == 'BEFORE':
                    rel_type = 'AFTER'
                elif rel_type == 'INCLUDES':
                    rel_type = 'IS_INCLUDED'
                elif rel_type == 'IS_INCLUDED':
                    rel_type = 'INCLUDES'
            attribs = {'brat_id': rel_id,
                       'rel_type': rel_type,
                       'a_text': ' '.join([content[s:e] for s, e in arg1_loc]),
                       'b_text': ' '.join([content[s:e] for s, e in arg2_loc]),
                       'b_index': len(arg1_loc)}
            relations_layer.add_annotation(arg1_loc + arg2_loc, **attribs)
        text_obj.add_layer(relations_layer)
        #
        # Add has_Argument relations
        #
        arguments_layer = Layer('event_arguments',
                                attributes=('brat_id', 'a_text', 'rel_type', 'b_text', 'b_index'),
                                text_object=text_obj,
                                enveloping='brat_entities',
                                ambiguous=True)
        for (rel_arg1, rel_type, rel_arg2, rel_id) in rel_annotations:
            if rel_type != 'has_Argument':
                continue
            assert rel_arg1 in entity_id_to_loc_map
            assert rel_arg2 in entity_id_to_loc_map
            arg1_loc = entity_id_to_loc_map[rel_arg1]
            arg2_loc = entity_id_to_loc_map[rel_arg2]
            # check if the relation needs to be reversed
            if arg1_loc[0] > arg2_loc[0]:
                # reverse relation
                arg1_loc, arg2_loc = arg2_loc, arg1_loc
                # change rel_type accordingly
                rel_type = 'is_Argument_of'
            attribs = {'brat_id': rel_id,
                       'rel_type': rel_type,
                       'a_text': ' '.join([content[s:e] for s, e in arg1_loc]),
                       'b_text': ' '.join([content[s:e] for s, e in arg2_loc]),
                       'b_index': len(arg1_loc)}
            arguments_layer.add_annotation(arg1_loc + arg2_loc, **attribs)
        text_obj.add_layer(arguments_layer)
        text_objects.append(text_obj)
    return text_objects
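
# --- Usage sketch ----------------------------------------------------------
# Illustrative only; the folder name is hypothetical. Each imported Text gets
# 'brat_entities', 'events', 'timexes', 'entities', 'tlinks' and
# 'event_arguments' layers, plus the source file name in text.meta['file'].
#
#   texts = import_from_brat_folder('brat_corpus')
#   for text in texts:
#       print(text.meta['file'], len(text['events']), len(text['tlinks']))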
def read_from_tsv(path):
    texts = []
    tokens_tagger = WhiteSpaceTokensTagger()
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for file in files:
                if not file.endswith(".tsv"):
                    continue
                with open(os.path.join(root, file), encoding="utf-8") as fin:
                    reader = csv.reader(fin, delimiter='\t')
                    # Lines containing a word and its analyses
                    words = []
                    word = []
                    # Morphological analysis of the whole text
                    morph_analysis = []
                    raw_text = ""
                    multiword_expressions = []
                    for index, row in enumerate(reader):
                        row[0] = row[0].strip()
                        # Check that the row has the correct number of elements.
                        # If there are fewer than 6, it is probably an adverb,
                        # abbreviation etc. But if there are more, something is
                        # wrong and the user has to be notified.
                        if len(row) > 6:
                            # If the elements after the 6th one are all empty,
                            # we can continue.
                            for x in row[6:]:
                                if x.strip() != "":
                                    sys.stderr.write(
                                        "Something is wrong with the following file: " + file +
                                        " In the following line: " + str(index + 1) + "\n" +
                                        "\t".join(row) + "\n")
                                    sys.exit(1)
                        # If the first element of a row is empty, then it is an
                        # alternative analysis of the previous word.
                        if row[0] == "" and word:
                            word.append(row)
                        else:
                            if len(word) != 0:
                                words.append(word)
                            # After appending the word to the words list,
                            # initialize a new word.
                            word = [row]
                    # As the loop terminates before adding the last word to the
                    # list, do it now.
                    words.append(word)
                    for word in words:
                        # Iterate over the analyses and check for manual fixes;
                        # remove all other analyses if a fix exists.
                        type_of_fix = ""
                        for analysis in word:
                            # As it may sometimes be necessary to look at the whole
                            # line, join the elements of the row back together.
                            line = "\t".join(analysis)
                            if "¤" in line:
                                word[0][1:] = [None, None, None, None, None]
                                word = [word[0]]
                                type_of_fix = "No_correct_analysis_available"
                                break
                            elif analysis[1].startswith("@"):
                                word[0][1:] = analysis[1:]
                                word = [word[0]]
                                word[0][1] = word[0][1].strip("@")
                                type_of_fix = "correct_analysis_provided"
                                break
                            elif analysis[1].startswith("£"):
                                word[0][1:] = analysis[1:]
                                word = [word[0]]
                                word[0][1] = word[0][1].strip("£")
                                type_of_fix = "correct_analysis_not_provided"
                                break
                            elif re.match("#[A-Üa-ü0-9]", analysis[1]):
                                word[0][1:] = analysis[1:]
                                word = [word[0]]
                                word[0][1] = word[0][1].strip("#")
                                type_of_fix = "correct_analysis_manually_added"
                                break
                        analyses = []
                        for a in word:
                            analysis = {}
                            analysis['root'] = a[1]
                            # If it is an abbreviation, some fields may be missing.
                            # Sometimes there are also missing tabs at the end of a
                            # line, so the last element has to be checked.
                            if a[-1] == "Y" or a[-1] == 'D' or a[-1] == 'K':
                                analysis['partofspeech'] = a[-1]
                                analysis['ending'] = ""
                                analysis['form'] = ""
                                analysis['clitic'] = ""
                            else:
                                analysis['ending'] = a[2]
                                analysis['clitic'] = a[3]
                                analysis['partofspeech'] = a[4]
                                analysis['form'] = a[5] if len(a) == 6 else ""
                            if analysis['root'] is not None:
                                analysis['root'], analysis['root_tokens'], analysis['lemma'] = \
                                    _postprocess_root(analysis['root'], analysis['partofspeech'])
                            else:
                                analysis['root_tokens'] = None
                                analysis['lemma'] = None
                            analysis['type_of_fix'] = type_of_fix
                            # If not otherwise specified, the normalized_text
                            # remains the same as the word form.
                            analysis['normalized_text'] = word[0][0]
                            analyses.append(analysis)
                        word_tuple = (word[0][0], analyses)
                        morph_analysis.append(word_tuple)
                        raw_text += word[0][0] + " "
                        if ' ' in word[0][0]:
                            multiword_expressions.append(word[0][0])
                text = Text(raw_text)
                tokens_layer = tokens_tagger.make_layer(text)
                multiword_expressions = [mw.split() for mw in multiword_expressions]
                compound_tokens_tagger = PretokenizedTextCompoundTokensTagger(
                    multiword_units=multiword_expressions)
                compound_tokens_layer = compound_tokens_tagger.make_layer(
                    text, layers={'tokens': tokens_layer})
                word_tagger = WordTagger()
                words_layer = word_tagger.make_layer(
                    text, layers={'compound_tokens': compound_tokens_layer,
                                  'tokens': tokens_layer})
                layer_morph = Layer(name='manual_morph',
                                    text_object=text,
                                    attributes=['root', 'lemma', 'root_tokens',
                                                'ending', 'clitic',
                                                'partofspeech', 'form'],
                                    ambiguous=True)
                layer_fix = Layer(name='type_of_fix',
                                  text_object=text,
                                  attributes=['type_of_fix'],
                                  parent='manual_morph')
                for ind, word in enumerate(words_layer):
                    # Take the fix type of the current word; all analyses of a
                    # word share the same type_of_fix. (The original code read a
                    # stale `analysis` variable here, which always yielded the
                    # fix type of the last word in the file.)
                    layer_fix.add_annotation(
                        (word.start, word.end),
                        type_of_fix=morph_analysis[ind][1][0]['type_of_fix'])
                    for analysis in morph_analysis[ind][1]:
                        layer_morph.add_annotation((word.start, word.end), **analysis)
                text.add_layer(layer_morph)
                text.add_layer(layer_fix)
                text.meta['id'] = file.split(".")[0]
                text.meta['location'] = root.split(os.sep)[-1].lower()
                texts.append(text)
    return texts
def create_new_text_obj(fname, metadata, cur_text_len, cur_tokens, cur_tok_id,
                        raw_timexes, timexes_layer_name='gold_timexes'):
    '''Based on the snapshot of data collected from the file, creates a new
       EstNLTK v1.6 Text object and populates it with metadata and a gold
       standard timexes layer. Returns the Text object.'''
    # Construct new text object
    text_str = ''.join(cur_tokens)
    assert len(text_str) == cur_text_len
    text_obj = Text(text_str)
    # Add metadata
    text_obj.meta['source_file'] = fname
    assert len(metadata) >= 1
    if len(metadata) > 1:
        print('Warning! Unexpected number of metadata items {!r}. Using only the first.'.format(metadata))
    for (k, v) in metadata[0].items():
        text_obj.meta[k] = v
    text_obj.meta['_original_token_count'] = cur_tok_id
    # Add document creation date
    for timex in raw_timexes:
        if 'functionInDocument' in timex and \
           timex['functionInDocument'] == 'CREATION_TIME':
            assert 'value' in timex
            text_obj.meta['document_creation_time'] = timex['value']
            if 'comment' in timex:
                text_obj.meta['dct_comment'] = timex['comment']
            break
    # Add TIMEX-es layer
    timexes_layer = Layer(name=timexes_layer_name,
                          attributes=('tid', 'type', 'value', 'temporal_function',
                                      'anchor_time_id', 'mod', 'quant', 'freq',
                                      'begin_point', 'end_point', 'part_of_interval',
                                      'comment'),
                          text_object=text_obj,
                          ambiguous=False)

    def _add_timex_annotation(loc, timex):
        # Validate converted attributes and add the annotation.
        # (Factored out of the repeated blocks below; behavior unchanged.)
        annotations = convert_timex_attributes(copy.deepcopy(timex))
        for k in annotations.keys():
            if k not in timexes_layer.attributes:
                raise Exception('(!) Unexpected key {!r} in {!r}'.format(k, annotations))
        timexes_layer.add_annotation(loc, **annotations)

    for timex in raw_timexes:
        if '_start' in timex and '_end' in timex:
            # Determine if this TIMEX is part of an interval (without textual content)
            interval_timex, place_in_interval = get_parent_of_interval(timex, raw_timexes)
            if interval_timex:
                if interval_timex.get('type', None) == 'DURATION':
                    # Record interval timex as an implicit timex
                    timex['part_of_interval'] = convert_timex_to_ordered_dict(interval_timex)
                else:
                    raise Exception('(!) Unexpected interval_timex {!r} for timex {!r}'.format(
                        interval_timex, timex))
            # Determine if this TIMEX is an implicit interval that has explicit
            # timepoints in text. If so, skip it to avoid duplicate annotations
            if is_removable_interval_timex(timex, raw_timexes):
                continue
            # Determine if this is an explicit interval with one or more implicit
            # time points. If so, attach the implicit time points as OrderedDict-s
            begin_point_tmx, end_point_tmx = get_child_timepoints(timex, raw_timexes,
                                                                  only_implicit=True)
            if begin_point_tmx:
                timex['beginPoint'] = convert_timex_to_ordered_dict(begin_point_tmx)
            if end_point_tmx:
                timex['endPoint'] = convert_timex_to_ordered_dict(end_point_tmx)
            # Determine the exact position of the timex:
            if 'text' not in timex:
                # Timexes without a pre-specified textual position/substring:
                # _start and _end provide all the information we need
                _add_timex_annotation((timex['_start'], timex['_end']), timex)
            else:
                # Timexes with a pre-specified textual position/substring:
                # we need to detect the exact indexes of the position in text
                loc = (timex['_start'], timex['_end'])
                textual_content = timex['text']
                timex_span = text_obj.text[loc[0]:loc[1]]
                if re.sub(r'\s+', '', textual_content) == timex_span:
                    # A) strings match if spaces are removed from text, e.g.
                    #    text="31. 12. 1997.a." vs token="31.12.1997.a."
                    _add_timex_annotation(loc, timex)
                elif re.sub(r'\s+', '', textual_content) == re.sub(r'\s+', '', timex_span):
                    # B) strings match if spaces are removed from both text and token,
                    #    e.g. text="täna kell 19. 08" vs token="täna kell 19.08"
                    _add_timex_annotation(loc, timex)
                elif textual_content in timex_span:
                    # C) text is a substring of the phrase, e.g.
                    #    text="1899-" vs token="1899-1902"
                    i = text_obj.text.find(textual_content, timex['_start'])
                    if i > -1 and i + len(textual_content) <= loc[1]:
                        new_start = i
                        new_end = i + len(textual_content)
                        assert text_obj.text[new_start:new_end] == textual_content
                        _add_timex_annotation((new_start, new_end), timex)
                    else:
                        raise Exception('(!) Unable to detect location of the timex {!r}'.format(timex))
                else:
                    # D) Tricky situation: text only overlaps the phrase.
                    #    So, we must find out its true indexes in text.
                    i = 0
                    candidate_locs = []
                    while text_obj.text.find(textual_content, i) > -1:
                        i = text_obj.text.find(textual_content, i)
                        j = i + len(textual_content)
                        if locations_overlap(timex['_start'], timex['_end'], i, j):
                            # if the token location and the timex location
                            # overlap, then we have a candidate
                            if [i, j] not in candidate_locs:
                                candidate_locs.append([i, j])
                        i = j
                    if len(candidate_locs) == 0:
                        # Try to search again with spaces removed
                        textual_content = re.sub(r'\s+', '', textual_content)
                        i = 0
                        while text_obj.text.find(textual_content, i) > -1:
                            i = text_obj.text.find(textual_content, i)
                            j = i + len(textual_content)
                            if locations_overlap(timex['_start'], timex['_end'], i, j):
                                if [i, j] not in candidate_locs:
                                    candidate_locs.append([i, j])
                            i = j
                    if len(candidate_locs) == 1:
                        # Exactly one location: all clear!
                        new_start, new_end = candidate_locs[0]
                        assert text_obj.text[new_start:new_end] == textual_content
                        _add_timex_annotation((new_start, new_end), timex)
                    elif len(candidate_locs) > 1:
                        stretch = text_obj.text[candidate_locs[0][0]:candidate_locs[-1][-1]]
                        raise Exception('(!) Multiple possible locations {!r} detected for the timex {!r} in {!r}'.format(
                            candidate_locs, timex, stretch))
                    else:
                        loc = (timex['_start'], timex['_end'])
                        print(text_obj.text[loc[0]:loc[1]])
                        raise Exception('(!) Unable to detect location of the timex {!r}'.format(timex))
    text_obj.add_layer(timexes_layer)
    return text_obj
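
# --- Usage sketch ----------------------------------------------------------
# Illustrative only: the input shapes below are reconstructed from the
# function's assertions, not from the original corpus reader that calls it.
#
#   metadata    = [{'file_id': 'aja_ee_1998'}]   # exactly one dict is expected
#   cur_tokens  = ['Intsident ', 'juhtus ', '31.12.1997 ', '.']
#   raw_timexes = [{'functionInDocument': 'CREATION_TIME', 'value': '1998-01-05'},
#                  {'tid': 't1', 'type': 'DATE', 'value': '1997-12-31',
#                   '_start': 17, '_end': 27, 'text': '31.12.1997'}]
#   text = create_new_text_obj('file.tml', metadata,
#                              sum(len(t) for t in cur_tokens),
#                              cur_tokens, len(cur_tokens), raw_timexes)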