Example #1
def test_split_sentences_by_char_limit(caplog):
    caplog.set_level(logging.INFO)

    lingual_parser = Spacy("en")
    lingual_parser.load_lang_model()

    text = "This is a text. This is another text."
    all_sentences = [
        Sentence(**parts) for parts in lingual_parser.split_sentences(text)
    ]
    assert len(all_sentences) == 2
    assert [len(sentence.text) for sentence in all_sentences] == [15, 21]

    lingual_parser.model.remove_pipe("sentencizer")
    lingual_parser.model.add_pipe(set_custom_boundary,
                                  before="parser",
                                  name="sentence_boundary_detector")

    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 20)
    assert len(sentence_batches) == 2
    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 100)
    assert len(sentence_batches) == 1

    sentence_batch = sentence_batches[0]
    custom_tokenizer = TokenPreservingTokenizer(lingual_parser.model.vocab)
    doc = custom_tokenizer(sentence_batch)
    doc.user_data = sentence_batch
    for name, proc in lingual_parser.model.pipeline:  # iterate over components in order
        doc = proc(doc)
    assert doc.is_parsed

    # See if the number of parsed spaCy sentences matches that of input sentences
    assert len(list(doc.sents)) == len(sentence_batch)
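
The test relies on fonduer's set_custom_boundary pipeline component together with doc.user_data holding the pre-split Sentence batch. Below is a minimal sketch of what such a spaCy 2.x component might look like; it is illustrative only, not fonduer's actual implementation, and it assumes the tokenizer preserves exactly one token per word of each Sentence stored in doc.user_data.

def custom_boundary_sketch(doc):
    """Mark sentence starts from the pre-split Sentence objects in doc.user_data."""
    token_idx = 0
    for sentence in doc.user_data:
        for word_idx in range(len(sentence.words)):
            # Only the first token of each pre-split sentence starts a new sentence.
            doc[token_idx].is_sent_start = (word_idx == 0)
            token_idx += 1
    return doc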
Example #2
def doc_setup():
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
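
A brief usage note: because each parts dict receives parts["document"] = doc before Sentence(**parts) is constructed, the new sentences become reachable through the document's sentences relationship, which the later examples rely on. A minimal check, assuming the fixture above:

doc = doc_setup()
assert len(doc.sentences) == 1
assert doc.sentences[0].text == "This is apple"
assert doc.sentences[0].words == ["This", "is", "apple"]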
Example #3
def bbox_from_sentence(sentence: Sentence) -> Bbox:
    # TODO: this may have issues where a sentence is linked to words on different pages
    if isinstance(sentence, Sentence) and sentence.is_visual():
        return Bbox(
            sentence.page[0],
            min(sentence.top),
            max(sentence.bottom),
            min(sentence.left),
            max(sentence.right),
        )
    else:
        return None
Example #4
def doc_setup():
    """Set up document."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple. That is orange. Where is banaba? I like Apple."
    lingual_parser = SpacyParser("en")
    # Split sentences
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Enrich sentences
    for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences):
        pass

    # Pick one sentence and add visual information
    # so that all the words get aligned horizontally.
    sentence: Sentence = doc.sentences[0]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 2nd sentence is horizontally aligned with the 1st.
    sentence: Sentence = doc.sentences[1]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [40, 50, 60, 70]
    sentence.right = [50, 60, 70, 80]

    # Assume the 3rd sentence is vertically aligned with the 1st.
    sentence: Sentence = doc.sentences[2]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [10, 10, 10, 10]
    sentence.bottom = [20, 20, 20, 20]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 4th sentence is on the 2nd page.
    sentence: Sentence = doc.sentences[3]
    sentence.page = [2, 2, 2, 2]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    return doc
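
A usage sketch combining this fixture with bbox_from_sentence from Example #3, assuming Bbox exposes page, top, bottom, left, and right fields in the order they are passed there:

doc = doc_setup()
bbox = bbox_from_sentence(doc.sentences[0])
# The 1st sentence lies on page 1, between rows 0-10 and columns 0-40.
assert (bbox.page, bbox.top, bbox.bottom, bbox.left, bbox.right) == (1, 0, 10, 0, 40)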
Example #5
def test_span_char_start_and_char_end():
    """Test chart_start and char_end of TemporarySpan that comes from Ngrams.apply."""
    ngrams = Ngrams()
    sent = Sentence()
    sent.text = "BC548BG"
    sent.words = ["BC548BG"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 1
    assert result[0].get_span() == "BC548BG"
    assert result[0].char_start == 0
    assert result[0].char_end == 6
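
Since char_end is inclusive (0 and 6 for the seven-character span above), the span text can be recovered by slicing the sentence text up to char_end + 1, reusing sent and result from the test:

span = result[0]
assert sent.text[span.char_start:span.char_end + 1] == "BC548BG"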
Example #6
    def _parse_sentence(self, paragraph, node, state):
        """Parse the Sentences of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """

        # Set name for Sentence
        name = node.attrib["name"] if "name" in node.attrib else None

        # Lingual Parse
        document = state["document"]
        sens_parts = []
        sens_words_id = []
        for sentence in node:
            parts = defaultdict(list)
            parts["document"] = document
            flag = 0
            text = ""
            words = []
            char_abs_offsets = []
            start = 0
            for word in sentence:
                w = ""
                for char in word:
                    if "bbox" in char.attrib:
                        flag = 1
                    w += char.text
                words.append(w)
                char_abs_offsets.append(start)
                start += (1 + len(word))
                text += re.sub(r"\s+", " ", w)
                text += " "
            if not flag:
                continue
            if text.isspace():
                continue
            if not any(p and p[0].isalnum() for p in words):
                continue
            if not text:
                continue

            for i, word in enumerate(sentence):
                parts["words"].append(words[i].replace(" ", "_"))
                parts["lemmas"].append(words[i].replace(" ", "_"))
                parts["ner_tags"].append("")  # placeholder for later NLP parsing
                parts["char_offsets"].append(char_abs_offsets[i])
                # parts["abs_char_offsets"].append(char_abs_offsets[i])
                parts["dep_parents"].append(0)  # placeholder for later NLP parsing
                parts["dep_labels"].append("")  # placeholder for later NLP parsing

            parts["text"], parts["pos_tags"] = self.lingual_parser.tagger(text[:-1])

            abs_offset = state["sentence"]["abs_offset"]
            parts["abs_char_offsets"] = [
                char_offset + abs_offset
                for char_offset in parts["char_offsets"]
            ]
            parts["position"] = state["sentence"]["idx"]

            if self.tabular:
                parts["position"] = state["sentence"]["idx"]

                # If tabular, consider own Context first in case a Cell
                # was just created. Otherwise, defer to the parent.
                parent = paragraph
                if isinstance(parent, Paragraph):
                    parts["section"] = parent.section
                    parts["paragraph"] = parent
                else:
                    raise NotImplementedError(
                        "Sentence parent must be Paragraph.")

            if self.structural:
                context_node = sentence
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = []
                temp_attrs = []
                for word in sentence:
                    if len(word) == 0:
                        continue
                    t = ""
                    for k, v in word[0].attrib.items():
                        if k != "bbox":
                            v = v.replace(" ", "")
                            t = t + k + "=" + v + " "
                    t = t[:-1]
                    temp_attrs.append(t)
                for temp_attr in temp_attrs:
                    parts["html_attrs"].append(temp_attr)

            if self.visual:
                page = []
                top = []
                left = []
                right = []
                bottom = []

                p = int(node.getparent().get("id"))
                bbox = node.getparent().get("bbox")
                bbox = bbox.split(",")
                height = int(round(float(bbox[3])))

                # Hack to handle erroneous coordinates in a sentence.
                flag = False
                try:
                    for word in sentence:
                        if len(word) == 0:
                            continue
                        # bbox of the first and last characters of the word
                        coord_f = word[0].attrib["bbox"]
                        coord_l = word[-1].attrib["bbox"]
                        coord_f = coord_f.split(",")
                        coord_l = coord_l.split(",")
                        page.append(p)
                        left.append(int(round(float(coord_f[0]))))
                        bottom.append(height - int(round(float(coord_f[1]))))
                        right.append(int(round(float(coord_l[2]))))
                        if height > int(round(float(coord_f[3]))):
                            top.append(height - int(round(float(coord_f[3]))))
                        else:
                            top.append(0)
                    parts["page"] = page
                    parts["left"] = left
                    parts["top"] = top
                    parts["right"] = right
                    parts["bottom"] = bottom
                except Exception as e:
                    print(e)
                    print(document, "\n", text)
                    continue

            abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                       parts["char_offsets"][-1] +
                                       len(parts["words"][-1]))
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            state["sentence"]["idx"] += 1
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            parts["name"] = name

            yield Sentence(**parts)
Example #7
def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams(split_tokens=["-", "/"])
    sent = Sentence()

    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))

    assert len(result) == 6
    spans = [r.get_span() for r in result]
    assert "New/Text-Word" in spans
    assert "New" in spans
    assert "New/Text" in spans
    assert "Text" in spans
    assert "Text-Word" in spans
    assert "Word" in spans

    sent.text = "A-B/C-D"
    sent.words = ["A-B/C-D"]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C-D" in spans
    assert "A-B/C" in spans
    assert "B/C-D" in spans
    assert "A-B" in spans
    assert "C-D" in spans
    assert "B/C" in spans
    assert "A" in spans
    assert "B" in spans
    assert "C" in spans
    assert "D" in spans

    ngrams = Ngrams(split_tokens=["~", "~~"])
    sent = Sentence()

    sent.text = "a~b~~c~d"
    sent.words = ["a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "a~b~~c~d" in spans
    assert "a" in spans
    assert "a~b" in spans
    assert "a~b~~c" in spans
    assert "b" in spans
    assert "b~~c" in spans
    assert "b~~c~d" in spans
    assert "c" in spans
    assert "c~d" in spans
    assert "d" in spans

    ngrams = Ngrams(split_tokens=["~a", "a~"])
    sent = Sentence()

    sent.text = "~a~b~~c~d"
    sent.words = ["~a~b~~c~d"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    spans = [r.get_span() for r in result]
    assert "~a~b~~c~d" in spans
    assert "~b~~c~d" in spans

    ngrams = Ngrams(split_tokens=["-", "/", "*"])
    sent = Sentence()

    sent.text = "A-B/C*D"
    sent.words = ["A-B/C*D"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 10
    spans = [r.get_span() for r in result]
    assert "A-B/C*D" in spans
    assert "A" in spans
    assert "A-B" in spans
    assert "A-B/C" in spans
    assert "B" in spans
    assert "B/C" in spans
    assert "B/C*D" in spans
    assert "C" in spans
    assert "C*D" in spans
    assert "D" in spans
Example #8
File: parser.py  Project: chyikwei/fonduer
    def _parse_sentence(self, paragraph, node, state):
        """Parse the Sentences of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        text = state["paragraph"]["text"]
        field = state["paragraph"]["field"]
        # Lingual Parse
        document = state["document"]
        for parts in self.lingual_parse(document, text):
            (_, _, _, char_end) = split_stable_id(parts["stable_id"])
            parts["document"] = document
            parts["position"] = state["sentence"]["idx"]
            abs_sentence_offset_end = (
                state["sentence"]["abs_offset"]
                + parts["char_offsets"][-1]
                + len(parts["words"][-1])
            )
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            if self.structural:
                context_node = node.getparent() if field == "tail" else node
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = [
                    "=".join(x) for x in list(context_node.attrib.items())
                ]

                # Extend the html style attribute with the styles defined
                # for the element's CSS class in the document's <style> block.
                cur_style_index = None
                for index, attr in enumerate(parts["html_attrs"]):
                    if attr.find("style") >= 0:
                        cur_style_index = index
                        break
                styles = state["root"].find("head").find("style")
                if styles is not None:
                    for x in list(context_node.attrib.items()):
                        if x[0] == "class":
                            exp = r"(." + x[1] + ")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text)
                                        .group(3)
                                        .replace("\r", "")
                                        .replace("\n", "")
                                        .replace("\t", "")
                                    )
                                else:
                                    parts["html_attrs"].extend(
                                        [
                                            "style="
                                            + re.sub(
                                                r"\s{1,}",
                                                " ",
                                                r.search(styles.text)
                                                .group(3)
                                                .replace("\r", "")
                                                .replace("\n", "")
                                                .replace("\t", "")
                                                .strip(),
                                            )
                                        ]
                                    )
                            break
            if self.tabular:
                parts["position"] = state["sentence"]["idx"]

                # If tabular, consider own Context first in case a Cell
                # was just created. Otherwise, defer to the parent.
                parent = paragraph
                if isinstance(parent, Paragraph):
                    parts["section"] = parent.section
                    parts["paragraph"] = parent
                    if parent.cell:
                        parts["table"] = parent.cell.table
                        parts["cell"] = parent.cell
                        parts["row_start"] = parent.cell.row_start
                        parts["row_end"] = parent.cell.row_end
                        parts["col_start"] = parent.cell.col_start
                        parts["col_end"] = parent.cell.col_end
                else:
                    raise NotImplementedError("Sentence parent must be Paragraph.")
            yield Sentence(**parts)

            state["sentence"]["idx"] += 1
Example #9
def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join([
        "Tim Cook was born in USA in 1960.",
        "He is the CEO of Apple.",
        "He sold 100 million of iPhone.",
    ])
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"
    ]

    # the length of words and that of ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"iPhone"}
Example #10
def test_ngram_split(caplog):
    """Test ngram split."""
    caplog.set_level(logging.INFO)
    ngrams = Ngrams()
    sent = Sentence()

    # When a split_token appears in the middle of the text.
    sent.text = "New-Text"
    sent.words = ["New-Text"]
    sent.char_offsets = [0]
    sent.abs_char_offsets = [0]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New-Text"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text"

    # When a text ends with a split_token.
    sent.text = "New-"
    sent.words = ["New-"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "New-"
    assert result[1].get_span() == "New"

    # When a text starts with a split_token.
    sent.text = "-Text"
    sent.words = ["-Text"]
    result = list(ngrams.apply(sent))

    assert len(result) == 2
    assert result[0].get_span() == "-Text"
    assert result[1].get_span() == "Text"

    # When more than one split_token appears.
    sent.text = "New/Text-Word"
    sent.words = ["New/Text-Word"]
    result = list(ngrams.apply(sent))

    assert len(result) == 3
    assert result[0].get_span() == "New/Text-Word"
    assert result[1].get_span() == "New"
    assert result[2].get_span() == "Text-Word"