Пример #1
0
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_sentence):
    """Yield n-grams from table sentences that are bbox-aligned with ``c``.

    Each span produced by ``_to_spans(c)`` is considered only if its sentence
    is both tabular and visual. For such a span, every sentence of the same
    table is scanned and n-grams are yielded in one of two modes.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: object handed to ``_to_spans`` to obtain the spans to inspect.
    :param attrib: name of the sentence attribute whose tokens are passed to
        ``tokens_to_ngrams`` (used only when ``from_sentence`` is true).
    :param n_min: minimum n-gram length forwarded to ``tokens_to_ngrams``
        (used only when ``from_sentence`` is true).
    :param n_max: maximum n-gram length, forwarded to both
        ``tokens_to_ngrams`` and ``Ngrams``.
    :param lower: when true, yielded n-grams are lower-cased.
    :param from_sentence: when true, alignment is tested per sentence and the
        span's own sentence is skipped; otherwise alignment is tested per
        n-gram span produced by ``Ngrams.apply``.
    :return: a generator of n-gram values.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    word_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    for span in _to_spans(c):
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue

        for sentence in span.sentence.table.sentences:
            if from_sentence:
                # Sentence-level mode: the whole sentence must be aligned
                # with the span and must not be the span's own sentence.
                if not is_aligned(bbox_from_sentence(sentence),
                                  bbox_from_span(span)):
                    continue
                if sentence is span.sentence:
                    continue
                for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
                continue

            # N-gram-level mode: test each candidate n-gram span on its own.
            for ts in word_ngrams.apply(sentence):
                if not is_aligned(bbox_from_span(ts), bbox_from_span(span)):
                    continue
                # Drop n-grams of the span's own sentence whose text occurs
                # inside the span's text.
                if (sentence == span.sentence
                        and ts.get_span() in span.get_span()):
                    continue
                text = ts.get_span()
                yield text.lower() if lower else text
Пример #2
0
def test_span_char_start_and_char_end():
    """Check char_start and char_end of the span yielded by Ngrams.apply."""
    extractor = Ngrams()

    # A single-word sentence whose only word starts at offset 0.
    sentence = Sentence()
    sentence.text = "BC548BG"
    sentence.words = ["BC548BG"]
    sentence.char_offsets = [0]
    sentence.abs_char_offsets = [0]

    spans = list(extractor.apply(sentence))

    assert len(spans) == 1
    only_span = spans[0]
    assert only_span.get_span() == "BC548BG"
    # The expected end offset is the index of the word's last character.
    assert (only_span.char_start, only_span.char_end) == (0, 6)
Пример #3
0
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield n-grams from same-page sentences that are bbox-aligned with ``c``.

    Each span produced by ``_to_spans(c)`` is considered only if its sentence
    is visual. For such a span, every sentence of the same document is
    scanned; sentences whose bounding box is on a different page than the
    span's sentence are ignored.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: object handed to ``_to_spans`` to obtain the spans to inspect.
    :param attrib: name of the sentence attribute whose tokens are passed to
        ``tokens_to_ngrams`` (used only when ``from_sentence`` is true).
    :param n_min: minimum n-gram length forwarded to ``tokens_to_ngrams``
        (used only when ``from_sentence`` is true).
    :param n_max: maximum n-gram length, forwarded to both
        ``tokens_to_ngrams`` and ``Ngrams``.
    :param lower: when true, yielded n-grams are lower-cased.
    :param from_sentence: when true, alignment is tested per sentence and the
        span's own sentence is skipped; otherwise alignment is tested per
        n-gram span produced by ``Ngrams.apply``.
    :return: a generator of n-gram strings.
    """
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    word_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    for span in _to_spans(c):
        if not span.sentence.is_visual():
            continue

        for sentence in span.sentence.document.sentences:
            # Skip if not in the same page.
            if span.sentence.get_bbox().page != sentence.get_bbox().page:
                continue

            if from_sentence:
                # Sentence-level mode: aligned with the span, and not the
                # span's own sentence.
                if not is_aligned(sentence.get_bbox(), span.get_bbox()):
                    continue
                if sentence is span.sentence:
                    continue
                yield from tokens_to_ngrams(getattr(sentence, attrib),
                                            n_min=n_min,
                                            n_max=n_max,
                                            lower=lower)
                continue

            # N-gram-level mode: keep n-gram spans that are visually aligned
            # with the span and neither contain it nor are contained by it.
            for ts in word_ngrams.apply(sentence):
                if not is_aligned(ts.get_bbox(), span.get_bbox()):
                    continue
                if ts in span or span in ts:
                    continue
                text = ts.get_span()
                yield text.lower() if lower else text
Пример #4
0
def test_ngram_split(caplog):
    """Check the spans a default Ngrams yields for words with split tokens."""
    caplog.set_level(logging.INFO)
    extractor = Ngrams()

    # One sentence object is reused; only its text and words change per case.
    sentence = Sentence()
    sentence.char_offsets = [0]
    sentence.abs_char_offsets = [0]

    # (input word, spans expected from Ngrams.apply, in order)
    cases = [
        # When a split_token appears in the middle of the text.
        ("New-Text", ["New-Text", "New", "Text"]),
        # When a text ends with a split_token.
        ("New-", ["New-", "New"]),
        # When a text starts with a split_token.
        ("-Text", ["-Text", "Text"]),
        # When more than one split_token appears.
        ("New/Text-Word", ["New/Text-Word", "New", "Text-Word"]),
    ]

    for word, expected_spans in cases:
        sentence.text = word
        sentence.words = [word]
        produced = [ts.get_span() for ts in extractor.apply(sentence)]
        assert produced == expected_spans
Пример #5
0
def test_ngram_split(caplog):
    """Check the spans Ngrams yields for several split_tokens settings."""
    caplog.set_level(logging.INFO)

    def new_sentence():
        # A blank single-word sentence; text and words are filled per case.
        fresh = Sentence()
        fresh.char_offsets = [0]
        fresh.abs_char_offsets = [0]
        return fresh

    def spans_for(extractor, sentence, word):
        # Load ``word`` as the sentence's only word and collect span texts.
        sentence.text = word
        sentence.words = [word]
        return [ts.get_span() for ts in extractor.apply(sentence)]

    def assert_unordered(produced, expected):
        # Same count, and every expected span is present (order ignored).
        assert len(produced) == len(expected)
        for item in expected:
            assert item in produced

    extractor = Ngrams(split_tokens=["-", "/"])
    sentence = new_sentence()

    # When a split_token appears in the middle of the text.
    assert spans_for(extractor, sentence, "New-Text") == [
        "New-Text",
        "New",
        "Text",
    ]

    # When a text ends with a split_token.
    assert spans_for(extractor, sentence, "New-") == ["New-", "New"]

    # When a text starts with a split_token.
    assert spans_for(extractor, sentence, "-Text") == ["-Text", "Text"]

    # When more than one split_token appears.
    assert_unordered(
        spans_for(extractor, sentence, "New/Text-Word"),
        ["New/Text-Word", "New", "New/Text", "Text", "Text-Word", "Word"],
    )

    assert_unordered(
        spans_for(extractor, sentence, "A-B/C-D"),
        [
            "A-B/C-D",
            "A-B/C",
            "B/C-D",
            "A-B",
            "C-D",
            "B/C",
            "A",
            "B",
            "C",
            "D",
        ],
    )

    # Split tokens where one is a prefix of the other.
    extractor = Ngrams(split_tokens=["~", "~~"])
    assert_unordered(
        spans_for(extractor, new_sentence(), "a~b~~c~d"),
        [
            "a~b~~c~d",
            "a",
            "a~b",
            "a~b~~c",
            "b",
            "b~~c",
            "b~~c~d",
            "c",
            "c~d",
            "d",
        ],
    )

    # Multi-character split tokens.
    extractor = Ngrams(split_tokens=["~a", "a~"])
    assert_unordered(
        spans_for(extractor, new_sentence(), "~a~b~~c~d"),
        ["~a~b~~c~d", "~b~~c~d"],
    )

    # Three distinct single-character split tokens.
    extractor = Ngrams(split_tokens=["-", "/", "*"])
    assert_unordered(
        spans_for(extractor, new_sentence(), "A-B/C*D"),
        [
            "A-B/C*D",
            "A",
            "A-B",
            "A-B/C",
            "B",
            "B/C",
            "B/C*D",
            "C",
            "C*D",
            "D",
        ],
    )