Example #1
def parse(self, text: str, mwe: Mwe = None) -> Parsed:
    with self.parser_lock:
        tokens, lemmas = self.lemmatize(text, mwe)
        # print("Language:", self.language)
        # print("Tokens:", tokens)
        # print("Lemmas:", lemmas)
        token_positions = tokenizations.get_original_spans(tokens, text)
        return Parsed(self.language, text, tokens, token_positions, lemmas)
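All of these examples are built around `tokenizations.get_original_spans(tokens, text)`, which maps each token back to its character span (start, end) in the original text and yields `None` for tokens it cannot align. A minimal sketch with made-up inputs, assuming the `pytokenizations` package is installed under the import name `tokenizations`:

import tokenizations

tokens = ["Hello", "world", "!"]
text = "Hello  world!"
spans = tokenizations.get_original_spans(tokens, text)
# whitespace between tokens is skipped, so the expected result here
# would be [(0, 5), (7, 12), (12, 13)]
print(spans)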
Example #2
def txt_to_token_span(tokens: List[str],
                      text: str,
                      txt_spans):
    """
    Transfer text-domain spans to token-domain spans
    :param tokens: tokens
    :param text: text
    :param txt_spans: text spans tuples: (start, end, ...)
    :return: a list of transferred span tuples.
    """
    token_indices = get_original_spans(tokens, text)
    for idx, token_indice in enumerate(token_indices):
        if token_indice is None:
            # token could not be aligned: infer its span from the neighbouring tokens
            if idx == 0:
                token_indices[idx] = (0, token_indices[idx+1][0] - 1)
            elif idx == len(token_indices) - 1:
                token_indices[idx] = (token_indices[idx-1][1]+1, len(text))
            else:
                token_indices[idx] = (token_indices[idx-1][1]+1, token_indices[idx+1][0]-1)
    if isinstance(txt_spans, list):
        tgt_spans = list()
        for txt_span in txt_spans:
            txt_start = txt_span[0]
            txt_end = txt_span[1]
            start = None
            end = None
            for i, (s, e) in enumerate(token_indices):
                if s <= txt_start < e:
                    start = i
                if s <= txt_end <= e:
                    end = i + 1
                if (start is not None) and (end is not None):
                    break
            assert (start is not None) and (end is not None), ValueError("input spans out of scope")
            tgt_spans.append((start, end))
    elif isinstance(txt_spans, dict):
        tgt_spans = dict()
        for txt_span, v in txt_spans.items():
            txt_start = txt_span[0]
            txt_end = txt_span[1]
            start = None
            end = None
            for i, (s, e) in enumerate(token_indices):
                if s <= txt_start < e:
                    start = i
                if txt_start == e:
                    start = i + 1
                if s <= txt_end <= e:
                    end = i + 1
                if (start is not None) and (end is not None):
                    break
            assert (start is not None) and (end is not None), ValueError("input spans out of scope")
            tgt_spans[(start, end)] = v
    else:
        raise NotImplementedError
    return tgt_spans
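A hypothetical call to `txt_to_token_span`, to show how character-level spans map onto token indices (the entity label below is made up for illustration, and the snippet assumes `get_original_spans` is imported from `tokenizations`):

tokens = ["New", "York", "is", "big", "."]
text = "New York is big."
# "New York" occupies characters (0, 8); as a half-open token span this becomes (0, 2)
print(txt_to_token_span(tokens, text, [(0, 8)]))         # expected: [(0, 2)]
print(txt_to_token_span(tokens, text, {(0, 8): "LOC"}))  # expected: {(0, 2): 'LOC'}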
Example #3
def token_to_txt_span(tokens: List[str], text: str, token_spans):
    """
    Transfer text-domain spans to token-domain spans
    :param tokens: tokens
    :param text: text
    :param token_spans: text spans tuples: (start, end, ...)
    :return: a list of transferred span tuples.
    """
    token_indices = get_original_spans(tokens, text)
    tgt_spans = dict()
    for token_span, value in token_spans.items():
        txt_start = token_indices[token_span[0]][0]
        txt_end = token_indices[token_span[1] - 1][1]
        tgt_spans[(txt_start, txt_end)] = value
    return tgt_spans
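Continuing the hypothetical values from the previous sketch, the inverse mapping back to character offsets:

tokens = ["New", "York", "is", "big", "."]
text = "New York is big."
# token span (0, 2) covers "New" and "York", which occupy characters (0, 8) in the text
print(token_to_txt_span(tokens, text, {(0, 2): "LOC"}))  # expected: {(0, 8): 'LOC'}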
Example #4
def test_error_get_original_spans():
    with pytest.raises(ValueError):
        get_original_spans([], "")
Example #5
def test_warn_get_original_spans():
    with pytest.warns(DeprecationWarning):
        get_original_spans([], "")
Example #6
def test_random_get_original_spans(tokens, text, expected):
    ret = tokenizations.get_original_spans(tokens, text)
    assert ret == expected, (tokens, text)
Example #7
def test_random_get_original_spans(tokens, text):
    # calling with arbitrary (tokens, text) must not raise, even if nothing aligns
    tokenizations.get_original_spans(tokens, text)
    # when the text is exactly the concatenation of the tokens, every token aligns
    ret = tokenizations.get_original_spans(tokens, "".join(tokens))
    assert all(x is not None for x in ret)
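Examples 6 and 7 receive their arguments from test parametrization in their original test suites. One hypothetical way to drive the assertion-based variant, with illustrative cases that are not taken from the source:

import pytest
import tokenizations

@pytest.mark.parametrize(
    "tokens, text, expected",
    [
        (["a", "b"], "ab", [(0, 1), (1, 2)]),
        (["a", "b"], "a  b", [(0, 1), (3, 4)]),
        (["He", "llo"], "Hello", [(0, 2), (2, 5)]),
    ],
)
def test_random_get_original_spans(tokens, text, expected):
    ret = tokenizations.get_original_spans(tokens, text)
    assert ret == expected, (tokens, text)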
Example #8
def tokenize_span(text, sentence_tokenizer, word_tokenizer, word_normalizer):
    """
    tokenize a text and return the tokens as well as their span (start,end) in the
    original text

    tokenizes the text into sentences and then each sentence into tokens

    returns a list of list of tokens and a list of list of spans of those tokens and the sentences

    word_tokenizer: tokenizes list of sentences. returns list of list of tokens
    """
    sentences = sentence_tokenizer(text)

    #separation between sentences may be variable, take into account when adding to the span
    so_helper = tokenizations.get_original_spans(sentences, text)
    sentence_offset = np.array([
        so_helper[i + 1][0] - so_helper[i][1]
        for i in range(len(so_helper) - 1)
    ] + [0])
    #print(sentence_offset)

    tokens = word_normalizer(word_tokenizer(sentences))
    safe_tokenized = [safe_tokenize(token_list) for token_list in tokens]
    lengths = np.array(list(map(len, sentences))) + sentence_offset
    lengths = np.cumsum(lengths)
    spans = [
        tokenizations.get_original_spans(t, s)
        for t, s in zip(safe_tokenized, sentences)
    ]
    #check if all tokens have been aligned (not None)
    #if there are two consecutive unaligned tokens, abort;
    #otherwise, infer the missing spans from the neighbouring tokens

    for i in range(len(spans)):
        if spans[i][0] is None:
            if spans[i][1] is None:
                print(sentences[i])
                print(spans[i])
                print(tokens[i])
                raise NotImplementedError(
                    "Cannot perform alignment with two consecutive unaligned tokens"
                )
            # first token unaligned: anchor it to the start of the sentence
            spans[i][0] = (0, spans[i][1][0] - 1)

        if spans[i][-1] is None:
            if spans[i][-2] is None:
                print(sentences[i])
                print(spans[i])
                print(tokens[i])
                raise NotImplementedError(
                    "Cannot perform alignment with two consecutive unaligned tokens"
                )
            # last token unaligned: anchor it to the end of the sentence
            spans[i][-1] = (spans[i][-2][1] + 1, len(sentences[i]))

        try:
            # interior unaligned tokens get a span inferred from their neighbours
            spans[i][1:-1] = [(spans[i][j][0], spans[i][j][1])
                              if spans[i][j] is not None else
                              (spans[i][j - 1][1] + 1, spans[i][j + 1][0] - 1)
                              for j in range(1, len(spans[i]) - 1)]
        except TypeError:
            print(text[spans[i][0][0]:spans[i][-1][1]])
            print(spans[i])
            print(tokens[i])
            raise NotImplementedError(
                "Cannot perform alignment with two consecutive unaligned tokens"
            )

    #each sentence is tokenized separately, with the span being
    #calculated separately for each sentence;
    #for each token in each sentence, add the length of
    #all previous sentences to the span
    #print(tokens)
    #print(spans)
    for i in range(1, len(spans)):
        for w in range(len(spans[i])):
            spans[i][w] = (spans[i][w][0] + lengths[i - 1],
                           spans[i][w][1] + lengths[i - 1])

    return tokens, spans, so_helper
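A hypothetical invocation of `tokenize_span`, using trivial stand-ins for the sentence and word tokenizers (every callable below is made up for illustration, and the sketch assumes the `safe_tokenize` helper referenced above is defined elsewhere in the module and leaves these tokens unchanged):

import numpy as np
import tokenizations

def simple_sentence_tokenizer(text):
    # naive split on ". ", keeping the final period with each sentence
    return [s if s.endswith(".") else s + "." for s in text.split(". ") if s]

def simple_word_tokenizer(sentences):
    return [s.replace(".", " .").split() for s in sentences]

def identity_normalizer(token_lists):
    return token_lists

text = "Cats sleep. Dogs bark."
tokens, spans, sentence_spans = tokenize_span(
    text, simple_sentence_tokenizer, simple_word_tokenizer, identity_normalizer)
# tokens -> [['Cats', 'sleep', '.'], ['Dogs', 'bark', '.']]
# spans  -> per-token character offsets into the original text, e.g. 'Dogs' at (12, 16)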