Python Ngrams примеры использования

Язык программирования: Python

Пространство имен/Пакет: fonduer.candidates.mentions

Класс/Тип: Ngrams

Примеров на hotexamples.com: 5

Python Ngrams - 5 примеров найдено. Это лучшие примеры Python кода для fonduer.candidates.mentions.Ngrams, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Ngrams(5)

apply(5)

Основные методы

Ngrams (5)

apply (5)

Пример #1

Показать файл

def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_sentence):
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    bbox_direction_aligned = (bbox_vert_aligned
                              if direction == "vert" else bbox_horz_aligned)
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])
    f = (lambda w: w.lower()) if lower else (lambda w: w)
    spans = _to_spans(c)
    for span in spans:
        if not span.sentence.is_tabular() or not span.sentence.is_visual():
            continue
        for sentence in span.sentence.table.sentences:
            if from_sentence:
                if (bbox_direction_aligned(bbox_from_sentence(sentence),
                                           bbox_from_span(span))
                        and sentence is not span.sentence):
                    for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                                  n_min=n_min,
                                                  n_max=n_max,
                                                  lower=lower):
                        yield ngram
            else:
                for ts in ngrams_space.apply(sentence):
                    if bbox_direction_aligned(
                            bbox_from_span(ts), bbox_from_span(span)) and not (
                                sentence == span.sentence
                                and ts.get_span() in span.get_span()):
                        yield f(ts.get_span())

Пример #2

Показать файл

def test_span_char_start_and_char_end():
    """Test chart_start and char_end of TemporarySpan that comes from Ngrams.apply."""
    ngrams = Ngrams()
    sent = Sentence()
    sent.text = "BC548BG"
    sent.words = ["BC548BG"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 1
    assert result[0].get_span() == "BC548BG"
    assert result[0].char_start == 0
    assert result[0].char_end == 6

Пример #3

Показать файл

Файл: visual.py Проект: vishalbelsare/fonduer

def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    bbox_direction_aligned = (bbox_vert_aligned
                              if direction == "vert" else bbox_horz_aligned)
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])
    f = (lambda w: w.lower()) if lower else (lambda w: w)
    spans = _to_spans(c)
    for span in spans:
        if not span.sentence.is_visual():
            continue
        for sentence in span.sentence.document.sentences:
            # Skip if not in the same page.
            if span.sentence.get_bbox().page != sentence.get_bbox().page:
                continue
            if from_sentence:
                if (bbox_direction_aligned(sentence.get_bbox(),
                                           span.get_bbox()) and sentence
                        is not span.sentence  # not from its Sentence
                    ):
                    for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                                  n_min=n_min,
                                                  n_max=n_max,
                                                  lower=lower):
                        yield ngram
            else:
                for ts in ngrams_space.apply(sentence):
                    if (  # True if visually aligned AND not from itself.
                            bbox_direction_aligned(ts.get_bbox(),
                                                   span.get_bbox())
                            and ts not in span and span not in ts):
                        yield f(ts.get_span())

Пример #4

Показать файл

def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams()
    sent = Sentence()

    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New/Text-Word"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text-Word"

Пример #5

Показать файл

Файл: test_candidates.py Проект: sbrown-ai/fonduer

def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams(split_tokens=["-", "/"])
    sent = Sentence()

    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))

    assert len(result) == 6
    spans = [r.get_span() for r in result]
    assert "New/Text-Word" in spans
    assert "New" in spans
    assert "New/Text" in spans
    assert "Text" in spans
    assert "Text-Word" in spans
    assert "Word" in spans

    sent.text = "A-B/C-D"
    sent.words = ["A-B/C-D"]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C-D" in spans
    assert "A-B/C" in spans
    assert "B/C-D" in spans
    assert "A-B" in spans
    assert "C-D" in spans
    assert "B/C" in spans
    assert "A" in spans
    assert "B" in spans
    assert "C" in spans
    assert "D" in spans

    ngrams = Ngrams(split_tokens=["~", "~~"])
    sent = Sentence()

    sent.text = "a~b~~c~d"
    sent.words = ["a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "a~b~~c~d" in spans
    assert "a" in spans
    assert "a~b" in spans
    assert "a~b~~c" in spans
    assert "b" in spans
    assert "b~~c" in spans
    assert "b~~c~d" in spans
    assert "c" in spans
    assert "c~d" in spans
    assert "d" in spans

    ngrams = Ngrams(split_tokens=["~a", "a~"])
    sent = Sentence()

    sent.text = "~a~b~~c~d"
    sent.words = ["~a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    spans = [r.get_span() for r in result]
    assert "~a~b~~c~d" in spans
    assert "~b~~c~d" in spans

    ngrams = Ngrams(split_tokens=["-", "/", "*"])
    sent = Sentence()

    sent.text = "A-B/C*D"
    sent.words = ["A-B/C*D"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C*D" in spans
    assert "A" in spans
    assert "A-B" in spans
    assert "A-B/C" in spans
    assert "B" in spans
    assert "B/C" in spans
    assert "B/C*D" in spans
    assert "C" in spans
    assert "C*D" in spans
    assert "D" in spans