def make_new_ot_morph_layer(old_text_obj,
                            new_text_obj,
                            new_layer='original_words_morph_analysis',
                            old_layer='ot_morph_analysis',
                            new_parent_layer='original_words'):
    '''Creates new 'original_words_morph_analysis' layer based on the old morph analysis layer.'''
    assert old_layer in old_text_obj.layers
    assert new_parent_layer in new_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    assert 'normalized_text' not in original_layer.attributes
    parent_layer = new_text_obj[new_parent_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=('normalized_text', ) + original_layer.attributes,
                  parent=new_parent_layer,
                  ambiguous=True)
    assert len(parent_layer) == len(original_layer)
    for wid, parent_span in enumerate(parent_layer):
        old_morph_span = original_layer[wid]
        new_span = Span(base_span=parent_span.base_span, layer=layer)
        for ann in old_morph_span.annotations:
            new_annotation = {'normalized_text': parent_span.text}
            for a in layer.attributes:
                if a in ['start', 'end', 'text', 'normalized_text']:
                    continue
                new_annotation[a] = ann[a]
            new_span.add_annotation(Annotation(new_span, **new_annotation))
        layer.add_span(new_span)
    return layer
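# A minimal usage sketch (not part of the original snippet; `old_text` and
# `new_text` are assumed to be EstNLTK v1.6 Text objects over the same raw
# text, with the 'ot_morph_analysis' and 'original_words' layers already
# present, respectively):
#
#   morph_layer = make_new_ot_morph_layer(old_text, new_text)
#   new_text.add_layer(morph_layer)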
def make_new_sentences_layer(old_text_obj,
                             new_text_obj,
                             new_layer='original_sentences_flat',
                             old_layer='sentences'):
    '''Creates new 'original_sentences_flat' layer based on the old sentences layer.'''
    assert old_layer in old_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=original_layer.attributes,
                  parent=None,
                  ambiguous=False)
    layer.meta['desc'] = 'Original sentence tokenization from koondkorpus XML file.'
    for span in original_layer:
        attribs = {}
        layer.add_annotation((span.start, span.end), **attribs)
    return layer
Example #3
def test_intersection_more_elements(self):
    i = Intersection(
        self.adjectives(),
        self.ed_suffix_regex(),
        Layer('words'),
        self.ed_suffix()
    )
    matches = i.get_matches(self.text())
    self.assertListEqual(self.expected(), matches)
Example #4
def make_new_words_layer(old_text_obj,
                         new_text_obj,
                         new_layer='original_words',
                         old_layer='words'):
    '''Creates new 'original_words' layer based on the old words layer.'''
    assert old_layer in old_text_obj.layers
    assert new_layer not in new_text_obj.layers
    original_layer = old_text_obj[old_layer]
    layer = Layer(name=new_layer,
                  text_object=new_text_obj,
                  attributes=original_layer.attributes,
                  parent=None,
                  ambiguous=True)
    layer.meta['desc'] = 'Original word tokenization from koondkorpus XML file.'
    for span in original_layer:
        attribs = {'normalized_form': None}
        layer.add_annotation((span.start, span.end), **attribs)
    return layer
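# A minimal usage sketch, assuming EstNLTK v1.6 (the sentence is an
# arbitrary stand-in):
#
#   from estnltk import Text
#   old_text = Text('Tere, maailm!')
#   old_text.tag_layer(['words'])
#   new_text = Text(old_text.text)
#   new_text.add_layer(make_new_words_layer(old_text, new_text))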
def remove_attribs_from_layer(text, layer_name, new_layer_name,
                              remove_attribs):
    ''' Rewrites the given layer so that all attributes listed in remove_attribs
        are completely removed. Returns the new layer.
    '''
    new_attribs = [
        a for a in text[layer_name].attributes if a not in remove_attribs
    ]
    new_layer = Layer(
        name=new_layer_name,
        text_object=text,
        attributes=new_attribs,
        parent=text[layer_name].parent if text[layer_name].parent else None,
        ambiguous=text[layer_name].ambiguous)
    for span in text[layer_name]:
        for annotation in span.annotations:
            analysis = {attrib: annotation[attrib] for attrib in new_attribs}
            new_layer.add_annotation((span.start, span.end), **analysis)
    return new_layer
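# A minimal usage sketch, assuming EstNLTK v1.6 and its default
# 'morph_analysis' layer (the new layer name and the dropped attributes
# below are just an illustration):
#
#   from estnltk import Text
#   text = Text('Tere, maailm!')
#   text.tag_layer(['morph_analysis'])
#   slim = remove_attribs_from_layer(text, 'morph_analysis',
#                                    'morph_analysis_slim',
#                                    remove_attribs=['root_tokens', 'clitic'])
#   text.add_layer(slim)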
Example #6
def import_from_brat_folder(folder):
    assert os.path.isdir( folder ), \
        "(!) Invalid folder name {!r}.".format(folder)
    annotation_files = dict()
    for fname in os.listdir(folder):
        if fname.endswith(('.ann', '.txt')):
            name, ext = os.path.splitext(fname)
            fpath = os.path.join(folder, fname)
            if name not in annotation_files:
                annotation_files[name] = []
            annotation_files[name].append(fpath)
    # Check that both .ann and .txt exist
    for name, files in annotation_files.items():
        if len(files) != 2:
            has_ann = any(fname.endswith('.ann') for fname in files)
            has_txt = any(fname.endswith('.txt') for fname in files)
            if not has_txt:
                raise ValueError(
                    '(!) Annotations file {!r} is missing .txt part.'.format(
                        name))
            if not has_ann:
                raise ValueError(
                    '(!) Annotations file {!r} is missing .ann part.'.format(
                        name))
    text_objects = []
    for name in annotation_files:
        ann_file = [
            fname for fname in annotation_files[name] if fname.endswith('.ann')
        ][0]
        entity_annotations, rel_annotations = import_brat_annotations(ann_file)
        txt_file = [
            fname for fname in annotation_files[name] if fname.endswith('.txt')
        ][0]
        content = import_brat_text(txt_file)
        #
        #  Create text object and entity annotations
        #
        text_obj = Text(content)
        text_obj.meta['file'] = name
        brat_entities = \
            Layer('brat_entities', attributes=('brat_id',), text_object = text_obj)
        event_layer = \
            Layer('events', attributes=('brat_id', 'class', 'class_confidence', 'duration', 'duration_confidence', 'comment'), \
                            text_object = text_obj, enveloping='brat_entities')
        timex_layer = \
            Layer('timexes', attributes=('brat_id', 'tid', 'type', 'value', 'mod', 'anchor_time_id', 'comment'), \
                             text_object = text_obj, enveloping='brat_entities')
        entity_layer = \
            Layer('entities', attributes=('brat_id',), text_object = text_obj, enveloping='brat_entities')
        entity_id_to_loc_map = dict()
        for (entity_id, type, start, end, attribs) in entity_annotations:
            # Check that location strings are expected ones
            # Collect corrected locations
            corrected_locs = []
            if isinstance(start, int):
                corrected_start, delta = _calculate_corrected_start_and_delta(
                    content, start)
                snippet = content[corrected_start:end + delta]
                assert snippet == attribs['text'], \
                    f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                corrected_locs.append((corrected_start, end + delta))
            elif isinstance(start, list):
                if len(start) == len(attribs['text']):
                    for s_start, s_end, s_text in zip(start, end,
                                                      attribs['text']):
                        corrected_start, delta = _calculate_corrected_start_and_delta(
                            content, s_start)
                        snippet = content[corrected_start:s_end + delta]
                        assert snippet == s_text, \
                            f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                        corrected_locs.append((corrected_start, s_end + delta))
                elif len(start) < len(attribs['text']):
                    # Tricky case: there can be less entity locations than entity text strings
                    # (!) different number of entity texts ['oli', 'kõige', 'parem'] and start locs [1904, 1908]
                    assert len(start) == len(end)
                    for s_start, s_end in zip(start, end):
                        corrected_start, delta = _calculate_corrected_start_and_delta(
                            content, s_start)
                        snippet = content[corrected_start:s_end + delta]
                        assert any([s in snippet for s in attribs['text']]), \
                            f"(!) {name!r} has mismatching entity texts {snippet!r} vs {attribs['text']!r}"
                        corrected_locs.append((corrected_start, s_end + delta))
                else:
                    raise Exception(
                        '(!) Mismatching number of locations and texts in {!r}'
                        .format((entity_id, type, start, end, attribs)))
            # add base layer: brat entities
            for s_start, s_end in corrected_locs:
                brat_entities.add_annotation((s_start, s_end),
                                             **{'brat_id': entity_id})
            entity_id_to_loc_map[entity_id] = corrected_locs
            # add enveloping layers
            if type == 'Event':
                attribs['brat_id'] = entity_id
                event_layer.add_annotation(corrected_locs, **attribs)
            elif type == 'Timex':
                attribs['brat_id'] = entity_id
                timex_layer.add_annotation(corrected_locs, **attribs)
            elif type == 'Entity':
                attribs['brat_id'] = entity_id
                entity_layer.add_annotation(corrected_locs, **attribs)
        text_obj.add_layer(brat_entities)
        text_obj.add_layer(event_layer)
        text_obj.add_layer(timex_layer)
        text_obj.add_layer(entity_layer)
        #
        #  Add tlink relation annotations
        #
        relations_layer = \
            Layer('tlinks', attributes=('brat_id', 'a_text', 'rel_type', 'b_text', 'b_index'), \
                            text_object = text_obj, enveloping='brat_entities', ambiguous=True)
        for (rel_arg1, rel_type, rel_arg2, rel_id) in rel_annotations:
            if rel_type == 'has_Argument':
                continue
            assert rel_arg1 in entity_id_to_loc_map
            assert rel_arg2 in entity_id_to_loc_map
            arg1_loc = entity_id_to_loc_map[rel_arg1]
            arg2_loc = entity_id_to_loc_map[rel_arg2]
            # check if the relation needs to be reversed
            if arg1_loc[0] > arg2_loc[0]:
                # reverse the relation arguments
                arg1_loc, arg2_loc = arg2_loc, arg1_loc
                # change reltype
                if rel_type == 'AFTER':
                    rel_type = 'BEFORE'
                elif rel_type == 'BEFORE':
                    rel_type = 'AFTER'
                elif rel_type == 'INCLUDES':
                    rel_type = 'IS_INCLUDED'
                elif rel_type == 'IS_INCLUDED':
                    rel_type = 'INCLUDES'
            attribs = {}
            attribs['brat_id'] = rel_id
            attribs['rel_type'] = rel_type
            attribs['a_text'] = ' '.join([content[s:e] for s, e in arg1_loc])
            attribs['b_text'] = ' '.join([content[s:e] for s, e in arg2_loc])
            attribs['b_index'] = len(arg1_loc)
            relations_layer.add_annotation(arg1_loc + arg2_loc, **attribs)
        text_obj.add_layer(relations_layer)
        #
        #  Add has_Argument relations
        #
        arguments_layer = \
            Layer('event_arguments', attributes=('brat_id', 'a_text', 'rel_type', 'b_text', 'b_index'), \
                               text_object = text_obj, enveloping='brat_entities', ambiguous=True)
        for (rel_arg1, rel_type, rel_arg2, rel_id) in rel_annotations:
            if rel_type != 'has_Argument':
                continue
            assert rel_arg1 in entity_id_to_loc_map
            assert rel_arg2 in entity_id_to_loc_map
            arg1_loc = entity_id_to_loc_map[rel_arg1]
            arg2_loc = entity_id_to_loc_map[rel_arg2]
            # check if the relation needs to be reversed
            if arg1_loc[0] > arg2_loc[0]:
                # reverse the relation arguments
                arg1_loc, arg2_loc = arg2_loc, arg1_loc
                # change reltype
                rel_type = 'is_Argument_of'
            attribs = {}
            attribs['brat_id'] = rel_id
            attribs['rel_type'] = rel_type
            attribs['a_text'] = ' '.join([content[s:e] for s, e in arg1_loc])
            attribs['b_text'] = ' '.join([content[s:e] for s, e in arg2_loc])
            attribs['b_index'] = len(arg1_loc)
            arguments_layer.add_annotation(arg1_loc + arg2_loc, **attribs)
        text_obj.add_layer(arguments_layer)
        text_objects.append(text_obj)
    return text_objects
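# A minimal usage sketch (the folder name is hypothetical; it must contain
# matching NAME.txt / NAME.ann pairs, as validated above):
#
#   texts = import_from_brat_folder('brat_annotations')
#   for t in texts:
#       print(t.meta['file'], len(t['events']), len(t['timexes']))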
def read_from_tsv(path):
    texts = []
    tokens_tagger = WhiteSpaceTokensTagger()
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            (head, tail) = os.path.split(root)
            if len(files) > 0:
                for file in files:
                    if not file.endswith(".tsv"):
                        continue
                    with open(os.path.join(root, file),
                              encoding="utf-8") as fin:
                        reader = csv.reader(fin, delimiter='\t')
                        words = []
                        #Lines containing a word and its analysis
                        word = []
                        #Morphological analysis of the whole text.
                        morph_analysis = []
                        raw_text = ""
                        multiword_expressions = []
                        for index, row in enumerate(reader):
                            row[0] = row[0].strip()
                            #Check if the row has the correct number of elements.
                            #If there are fewer than 6, then it is probably an adverb, abbreviation etc.
                            #But if there are more, then something is wrong and the user has to be notified.
                            if len(row) > 6:
                                #If the elements after the 6th one contain nothing, then we can continue.
                                for x in row[6:]:
                                    x = x.strip()
                                    if x != "":
                                        sys.stderr.write(
                                            "Something is wrong with the following file: "
                                            + file +
                                            " In the following line: " +
                                            str(index + 1) + "\n" +
                                            "\t".join(row) + "\n")
                                        sys.exit(1)
                            #If the first element of a row is empty then it is an alternative analysis of a word.
                            if row[0] == "" and word:
                                word.append(row)
                            else:
                                if len(word) != 0:
                                    words.append(word)
                                #After appending the word into the words list let's initialize a new word.
                                word = [row]
                        #As the loop terminates before adding the last word into the list, let's do it now
                        words.append(word)
                        for word in words:
                            #Iterate over the analyses and check for manual fixes.
                            #Remove all other analyses if they exist.
                            type_of_fix = ""
                            for analysis in word:
                                #As it may be sometimes necessary to look at the whole line, join the elements of a row back together.
                                line = "\t".join(analysis)
                                if "¤" in line:
                                    word[0][1:] = [
                                        None, None, None, None, None
                                    ]
                                    word = [word[0]]

                                    type_of_fix = "No_correct_analysis_available"
                                    break
                                elif analysis[1].startswith("@"):
                                    word[0][1:] = analysis[1:]
                                    word = [word[0]]
                                    word[0][1] = word[0][1].strip("@")
                                    type_of_fix = "correct_analysis_provided"
                                    break
                                elif analysis[1].startswith("£"):
                                    word[0][1:] = analysis[1:]
                                    word = [word[0]]
                                    word[0][1] = word[0][1].strip("£")
                                    type_of_fix = "correct_analysis_not_provided"
                                    break
                                elif re.match("#[A-Üa-ü0-9]", analysis[1]):
                                    word[0][1:] = analysis[1:]
                                    word = [word[0]]
                                    word[0][1] = word[0][1].strip("#")
                                    type_of_fix = "correct_analysis_manually_added"
                                    break
                            analyses = []
                            for a in word:
                                analysis = {}
                                analysis['root'] = a[1]
                                #If it is an abbreviation, some fields may be missing.
                                #Sometimes tabs are also missing at the end of a line,
                                #so the last element has to be checked.
                                if a[-1] in ('Y', 'D', 'K'):
                                    analysis['partofspeech'] = a[-1]
                                    analysis['ending'] = ""
                                    analysis['form'] = ""
                                    analysis['clitic'] = ""
                                else:
                                    analysis['ending'] = a[2]
                                    analysis['clitic'] = a[3]
                                    analysis['partofspeech'] = a[4]
                                    analysis['form'] = a[5] if len(a) == 6 else ""
                                if analysis['root'] is not None:
                                    (analysis['root'],
                                     analysis['root_tokens'],
                                     analysis['lemma']) = _postprocess_root(
                                        analysis['root'],
                                        analysis['partofspeech'])
                                else:
                                    analysis['root_tokens'] = None
                                    analysis['lemma'] = None
                                analysis['type_of_fix'] = type_of_fix
                                #If not otherwise specified, the normalized_text will remain the same as the word form
                                analysis['normalized_text'] = word[0][0]
                                analyses.append(analysis)
                            #if len(analyses) > 1:
                            #	print (analyses)
                            word_tuple = (word[0][0], analyses)
                            morph_analysis.append(word_tuple)
                            raw_text += word[0][0] + " "
                            if ' ' in word[0][0]:
                                multiword_expressions.append(word[0][0])
                        text = Text(raw_text)

                        tokens_layer = tokens_tagger.make_layer(text)
                        multiword_expressions = [
                            mw.split() for mw in multiword_expressions
                        ]
                        compound_tokens_tagger = PretokenizedTextCompoundTokensTagger(
                            multiword_units=multiword_expressions)
                        compound_tokens_layer = compound_tokens_tagger.make_layer(
                            text, layers={'tokens': tokens_layer})
                        word_tagger = WordTagger()
                        words_layer = word_tagger.make_layer(
                            text,
                            layers={
                                'compound_tokens': compound_tokens_layer,
                                'tokens': tokens_layer
                            })
                        #text.tag_layer(['sentences'])
                        layer_morph = Layer(name='manual_morph',
                                            text_object=text,
                                            attributes=[
                                                'root', 'lemma', 'root_tokens',
                                                'ending', 'clitic',
                                                'partofspeech', 'form'
                                            ],
                                            ambiguous=True)
                        layer_fix = Layer(name='type_of_fix',
                                          text_object=text,
                                          attributes=['type_of_fix'],
                                          parent='manual_morph')

                        for ind, word in enumerate(words_layer):
                            #All analyses of one word share the same type_of_fix,
                            #so take it from the first analysis of the current word.
                            layer_fix.add_annotation(
                                (word.start, word.end),
                                type_of_fix=morph_analysis[ind][1][0]['type_of_fix'])
                            for analysis in morph_analysis[ind][1]:
                                layer_morph.add_annotation(
                                    (word.start, word.end), **analysis)
                        text.add_layer(layer_morph)
                        text.add_layer(layer_fix)
                        text.meta['id'] = file.split(".")[0]
                        text.meta['location'] = root.split(os.sep)[-1].lower()
                        texts.append(text)
    return texts
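# A minimal usage sketch (the path is hypothetical; it should point to a
# directory tree of .tsv files with tab-separated morphological analyses,
# one word plus its alternative analyses per block, as parsed above):
#
#   texts = read_from_tsv('manual_annotations')
#   for t in texts:
#       print(t.meta['id'], t.meta['location'], len(t['manual_morph']))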
Example #8
def create_new_text_obj( fname, metadata, cur_text_len, cur_tokens, cur_tok_id, \
                         raw_timexes, timexes_layer_name='gold_timexes' ):
    '''Based on the snapshot of data collected from the file, creates a
       new EstNLTK v1.6 Text object and populates it with metadata and a
       gold standard timexes layer. Returns the Text object.'''
    # Construct new text object
    text_str = ''.join(cur_tokens)
    assert len(text_str) == cur_text_len
    text_obj = Text(text_str)
    # Add metadata
    text_obj.meta['source_file'] = fname
    assert len(metadata) >= 1
    if len(metadata) > 1:
        print(
            'Warning! Unexpected number of metadata items {!r}. Using only the first.'
            .format(metadata))
    for (k, v) in metadata[0].items():
        text_obj.meta[k] = v
    text_obj.meta['_original_token_count'] = cur_tok_id
    # Add document creation date
    for timex in raw_timexes:
        if 'functionInDocument' in timex and \
            timex['functionInDocument'] == 'CREATION_TIME':
            assert 'value' in timex
            text_obj.meta['document_creation_time'] = timex['value']
            if 'comment' in timex:
                text_obj.meta['dct_comment'] = timex['comment']
            break
    # Add TIMEX-es layer
    timexes_layer = Layer(name=timexes_layer_name, \
                          attributes=('tid', 'type', 'value', 'temporal_function', 'anchor_time_id', \
                                      'mod', 'quant', 'freq', 'begin_point', 'end_point', 'part_of_interval', \
                                      'comment' ), \
                          text_object=text_obj,\
                          ambiguous=False)
    for timex in raw_timexes:
        if '_start' in timex and '_end' in timex:
            # Determine if this TIMEX is part of an interval (without textual content)
            interval_timex, place_in_interval = get_parent_of_interval(
                timex, raw_timexes)
            if interval_timex:
                if interval_timex.get('type', None) == 'DURATION':
                    # Record interval timex as an implicit timex
                    interval_timex_odict = convert_timex_to_ordered_dict(
                        interval_timex)
                    timex['part_of_interval'] = interval_timex_odict
                else:
                    raise Exception(
                        '(!) Unexpected interval_timex {!r} for timex {!r}'.
                        format(interval_timex, timex))
            # Determine if this TIMEX is an implicit interval that has explicit timepoints
            # in text. If so, skip it to avoid duplicates in annotations
            if is_removable_interval_timex(timex, raw_timexes):
                continue
            # Determine if this is an explicit interval with one or more implicit time points
            # If so, then attach the implicit time points as OrderedDict-s
            begin_point_tmx, end_point_tmx = get_child_timepoints(
                timex, raw_timexes, only_implicit=True)
            if begin_point_tmx:
                begin_point_odict = convert_timex_to_ordered_dict(
                    begin_point_tmx)
                timex['beginPoint'] = begin_point_odict
            if end_point_tmx:
                end_point_odict = convert_timex_to_ordered_dict(end_point_tmx)
                timex['endPoint'] = end_point_odict
            # Determine exact position of the timex:
            if 'text' not in timex:
                # Timexes without pre-specified textual position/substring:
                #  _start and _end provide all the information we need
                loc = (timex['_start'], timex['_end'])
                annotations = convert_timex_attributes(copy.deepcopy(timex))
                for k in annotations:
                    if k not in timexes_layer.attributes:
                        raise Exception(
                            '(!) Unexpected key {!r} in {!r}'.format(
                                k, annotations))
                timexes_layer.add_annotation(loc, **annotations)
            else:
                # Timexes with pre-specified textual position/substring:
                #  we need to detect exact indexes of position in text
                loc = (timex['_start'], timex['_end'])
                textual_content = timex['text']
                timex_span = text_obj.text[loc[0]:loc[1]]
                if re.sub(r'\s+', '', textual_content) == timex_span:
                    # A) strings match if spaces are removed from text, e.g.
                    #    text="31. 12. 1997.a."  vs token="31.12.1997.a."
                    loc = (timex['_start'], timex['_end'])
                    annotations = convert_timex_attributes(
                        copy.deepcopy(timex))
                    for k in annotations:
                        if k not in timexes_layer.attributes:
                            raise Exception(
                                '(!) Unexpected key {!r} in {!r}'.format(
                                    k, annotations))
                    timexes_layer.add_annotation(loc, **annotations)
                elif re.sub(r'\s+', '',
                            textual_content) == re.sub(r'\s+', '', timex_span):
                    # B) strings match if spaces are removed from both text and token, e.g.
                    #    text="täna kell 19. 08"  vs token="täna kell 19.08"
                    loc = (timex['_start'], timex['_end'])
                    annotations = convert_timex_attributes(
                        copy.deepcopy(timex))
                    for k in annotations:
                        if k not in timexes_layer.attributes:
                            raise Exception(
                                '(!) Unexpected key {!r} in {!r}'.format(
                                    k, annotations))
                    timexes_layer.add_annotation(loc, **annotations)
                elif textual_content in timex_span:
                    # C) text is a substring of the phrase, e.g.
                    #    text="1899-"  vs  token="1899-1902"
                    i = text_obj.text.find(textual_content, timex['_start'])
                    if i > -1 and i + len(textual_content) <= loc[1]:
                        new_start = i
                        new_end = i + len(textual_content)
                        assert text_obj.text[
                            new_start:new_end] == textual_content
                        loc = (new_start, new_end)
                        annotations = convert_timex_attributes(
                            copy.deepcopy(timex))
                        for k in annotations:
                            if k not in timexes_layer.attributes:
                                raise Exception(
                                    '(!) Unexpected key {!r} in {!r}'.format(
                                        k, annotations))
                        timexes_layer.add_annotation(loc, **annotations)
                    else:
                        raise Exception(
                            '(!) Unable to detect location of the timex {!r}'.
                            format(timex))
                else:
                    # D) Tricky situation: text only overlaps the phrase.
                    #    So, we must find out its true indexes in text.
                    i = 0
                    candidate_locs = []
                    while text_obj.text.find(textual_content, i) > -1:
                        i = text_obj.text.find(textual_content, i)
                        j = i + len(textual_content)
                        if locations_overlap(timex['_start'], timex['_end'], i,
                                             j):
                            # if there is an overlap between the token location
                            # and timex location, then we have a candidate
                            if [i, j] not in candidate_locs:
                                candidate_locs.append([i, j])
                        i = j
                    if len(candidate_locs) == 0:
                        # Try searching again with spaces removed
                        textual_content = re.sub(r'\s+', '', textual_content)
                        i = 0
                        while text_obj.text.find(textual_content, i) > -1:
                            i = text_obj.text.find(textual_content, i)
                            j = i + len(textual_content)
                            if locations_overlap(timex['_start'],
                                                 timex['_end'], i, j):
                                # if there is an overlap between the token location
                                # and timex location, then we have a candidate
                                if [i, j] not in candidate_locs:
                                    candidate_locs.append([i, j])
                            i = j
                    if len(candidate_locs) == 1:
                        # Exactly one location: all clear!
                        new_start = candidate_locs[0][0]
                        new_end = candidate_locs[0][1]
                        assert text_obj.text[
                            new_start:new_end] == textual_content
                        loc = (new_start, new_end)
                        annotations = convert_timex_attributes(
                            copy.deepcopy(timex))
                        for k in annotations:
                            if k not in timexes_layer.attributes:
                                raise Exception(
                                    '(!) Unexpected key {!r} in {!r}'.format(
                                        k, annotations))
                        timexes_layer.add_annotation(loc, **annotations)
                    elif len(candidate_locs) > 1:
                        stretch = text_obj.text[
                            candidate_locs[0][0]:candidate_locs[-1][-1]]
                        raise Exception(
                            '(!) Multiple possible locations {!r} detected for the timex {!r} in {!r}'
                            .format(candidate_locs, timex, stretch))
                    else:
                        loc = (timex['_start'], timex['_end'])
                        print(text_obj.text[loc[0]:loc[1]])
                        raise Exception(
                            '(!) Unable to detect location of the timex {!r}'.
                            format(timex))
    text_obj.add_layer(timexes_layer)
    return text_obj
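# A minimal usage sketch (all arguments are hypothetical stand-ins for the
# snapshot collected while reading an annotated corpus file; each raw timex
# dict is expected to carry '_start'/'_end' offsets into the joined token
# string):
#
#   text_obj = create_new_text_obj(fname, metadata, cur_text_len,
#                                  cur_tokens, cur_tok_id, raw_timexes)
#   print(text_obj.meta.get('document_creation_time'))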