Example #1
    def test_not_hashable(self):
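        # Annotation entries are deliberately not hashable: constructing one
        # and calling hash() on it must raise TypeError.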
        anno: Annotation = Annotation(self.pack, 0, 5)
        with self.assertRaises(TypeError):
            hash(anno)
        anno.regret_creation()

        anno1: EntityMention = EntityMention(self.pack, 0, 2)
        with self.assertRaises(TypeError):
            hash(anno1)
        anno1.regret_creation()
Example #2
    def _process(self, input_pack: DataPack):
        entity_text = self.configs.entities_to_insert

        input_text = input_pack.text
        if not all(entity in input_text for entity in entity_text):
            raise ValueError(
                "Entities to be added are not valid for the input text.")
        for entity in entity_text:
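            # str.index returns only the first occurrence, so a repeated
            # entity string is annotated at its first position only.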
            start = input_text.index(entity)
            end = start + len(entity)
            entity_mention = EntityMention(input_pack, start, end)
            input_pack.add_entry(entity_mention)
Example #3
    def pack(self,
             data_pack: DataPack,
             output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
        """
        Write the prediction results back to datapack. by writing the predicted
        ner to the original tokens.
        """

        if output_dict is None:
            return

        current_entity_mention: Tuple[int, str] = (-1, "None")
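        # NER tags follow the BIOES scheme: B(egin), I(nside), O(utside),
        # E(nd), S(ingle).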

        for i in range(len(output_dict["Token"]["tid"])):
            # One instance (e.g. one sentence) at a time.
            for j in range(len(output_dict["Token"]["tid"][i])):
                tid: int = output_dict["Token"]["tid"][i][j]  # type: ignore

                orig_token: Token = data_pack.get_entry(tid)  # type: ignore
                ner_tag: str = output_dict["Token"]["ner"][i][j]

                orig_token.set_fields(ner=ner_tag)

                token = orig_token
                token_ner = token.get_field("ner")
                if token_ner[0] == "B":
                    current_entity_mention = (token.span.begin, token_ner[2:])
                elif token_ner[0] in ("I", "O"):
                    continue

                elif token_ner[0] == "E":
                    if token_ner[2:] != current_entity_mention[1]:
                        continue

                    kwargs_i = {"ner_type": current_entity_mention[1]}
                    entity = EntityMention(data_pack,
                                           current_entity_mention[0],
                                           token.span.end)
                    entity.set_fields(**kwargs_i)
                    data_pack.add_or_get_entry(entity)
                elif token_ner[0] == "S":
                    current_entity_mention = (token.span.begin, token_ner[2:])
                    kwargs_i = {"ner_type": current_entity_mention[1]}
                    entity = EntityMention(data_pack,
                                           current_entity_mention[0],
                                           token.span.end)
                    entity.set_fields(**kwargs_i)
                    data_pack.add_or_get_entry(entity)
Example #4
    def pack(self,
             data_pack: DataPack,
             output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
        """
        Write the prediction results back to datapack. by writing the predicted
        ner to the original subwords and convert predictions to something that
        makes sense in a word-by-word segmentation
        """

        if output_dict is None:
            return

        for i in range(len(output_dict["Subword"]["tid"])):
            tids = output_dict["Subword"]["tid"][i]
            labels = output_dict["Subword"]["ner"][i]

            # Filter to labels not in `self.ignore_labels`
            entities = [
                dict(idx=idx, label=label, tid=tid)
                for idx, (label, tid) in enumerate(zip(labels, tids))
                if label not in self.ft_configs.ignore_labels
            ]

            entity_groups = self._group_entities(entities, data_pack, tids)
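            # Each group is a (first_tid, last_tid) pair covering one run of
            # contiguous subwords that form a single mention.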
            # Add NER tags and create EntityMention ontologies.
            for first_tid, last_tid in entity_groups:
                first_token: Subword = data_pack.get_entry(  # type: ignore
                    first_tid)
                first_token.ner = 'B-' + self.ft_configs.ner_type

                for tid in range(first_tid + 1, last_tid + 1):
                    token: Subword = data_pack.get_entry(tid)  # type: ignore
                    token.ner = 'I-' + self.ft_configs.ner_type

                begin = first_token.span.begin
                end = data_pack.get_entry(last_tid).span.end
                entity = EntityMention(data_pack, begin, end)
                entity.ner_type = self.ft_configs.ner_type
Example #5
    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               components=self.token_component))
            tokens = [(token.text, token.pos) for token in token_entries]
            ne_tree = ne_chunk(tokens)
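            # ne_chunk returns an nltk Tree; labelled subtrees are the
            # recognized named-entity chunks, plain tuples are other tokens.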

            index = 0
            for chunk in ne_tree:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    entity = EntityMention(input_pack, begin_pos, end_pos)
                    entity.ner_type = chunk.label()
                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('This', 'DT')
                    index += 1
Example #6
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + '_copy'
        else:
            copy_pack.pack_name = 'copy'

        ent: EntityMention
        for ent in from_pack.get(EntityMention):
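            # In this version of the API, constructing the annotation
            # registers it with copy_pack, so no explicit add call is needed.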
            EntityMention(copy_pack, ent.begin, ent.end)
Example #7
    def _parse_pack(self, data: dict) -> Iterator[DataPack]:
        """
        Extracts information from input `data` of one document
        output from Prodigy Annotator including the text,
        tokens and its annotations into a DataPack.

        Args:
            data: a dict that contains information for one document.

        Returns: DataPack containing information extracted from `data`.

        """
        pack = DataPack()
        text = data['text']
        tokens = data['tokens']
        spans = data['spans']

        document = Document(pack, 0, len(text))
        pack.set_text(text, replace_func=self.text_replace_operation)
        pack.add_or_get_entry(document)

        for token in tokens:
            begin = token['start']
            end = token['end']
            token_entry = Token(pack, begin, end)
            pack.add_or_get_entry(token_entry)

        for span_items in spans:
            begin = span_items['start']
            end = span_items['end']
            annotation_entry = EntityMention(pack, begin, end)
            annotation_entry.set_fields(ner_type=span_items['label'])
            pack.add_or_get_entry(annotation_entry)

        pack.meta.doc_id = data['meta']['id']

        yield pack
Example #8
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    oie_component: List[str] = line.split("\t")
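                    # Tab-separated columns: 0 = sentence, 2 = relation
                    # type, 3 = argument 1, 4 = argument 2.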

                    # Add the sentence.
                    sentence = oie_component[0]
                    text += sentence + "\n"
                    Sentence(pack, offset, offset + len(sentence))

                    # Find argument 1.
                    arg1_begin = sentence.find(oie_component[3]) + offset
                    arg1_end = arg1_begin + len(oie_component[3])
                    arg1: EntityMention = EntityMention(
                        pack, arg1_begin, arg1_end)

                    # Find argument 2.
                    arg2_begin = sentence.find(oie_component[4]) + offset
                    arg2_end = arg2_begin + len(oie_component[4])
                    arg2: EntityMention = EntityMention(
                        pack, arg2_begin, arg2_end)

                    head_relation = RelationLink(pack, arg1, arg2)
                    head_relation.rel_type = oie_component[2]

                    offset += len(sentence) + 1

        self.set_text(pack, text)
        pack.pack_name = os.path.basename(file_path)
        yield pack
Example #9
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + "_copy"
        else:
            copy_pack.pack_name = "copy"

        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)

        e: EntityMention
        for e in from_pack.get(EntityMention):
            EntityMention(copy_pack, e.begin, e.end)
Example #10
    def pack(
        self,
        pack: DataPack,
        predict_results: Dict[str, Dict[str, List[str]]],
        _: Optional[Annotation] = None,
    ):
        """
        Write the prediction results back to datapack. by writing the predicted
        ner to the original tokens.
        """

        if predict_results is None:
            return

        current_entity_mention: Tuple[int, str] = (-1, "None")
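        # NER tags follow the BIOES scheme: B(egin), I(nside), O(utside),
        # E(nd), S(ingle).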

        for i in range(len(predict_results["Token"]["tid"])):
            # One instance (e.g. one sentence) at a time.
            for j in range(len(predict_results["Token"]["tid"][i])):
                tid: int = predict_results["Token"]["tid"][i][
                    j]  # type: ignore

                orig_token: Token = pack.get_entry(tid)  # type: ignore
                ner_tag: str = predict_results["Token"]["ner"][i][j]

                orig_token.ner = ner_tag

                token = orig_token
                token_ner = token.ner
                assert isinstance(token_ner, str)
                if token_ner[0] == "B":
                    current_entity_mention = (token.begin, token_ner[2:])
                elif token_ner[0] in ("I", "O"):
                    continue

                elif token_ner[0] == "E":
                    if token_ner[2:] != current_entity_mention[1]:
                        continue

                    entity = EntityMention(pack, current_entity_mention[0],
                                           token.end)
                    entity.ner_type = current_entity_mention[1]
                elif token_ner[0] == "S":
                    current_entity_mention = (token.begin, token_ner[2:])
                    entity = EntityMention(pack, current_entity_mention[0],
                                           token.end)
                    entity.ner_type = current_entity_mention[1]
Example #11
    def add_to_pack(self, pack: DataPack, instance: Annotation,
                    prediction: List[int]):
        r"""Add the prediction for attribute to the instance. We make following
        assumptions for prediction.

            1. If we encounter "I" while its tag is different from the previous
               tag, we will consider this "I" as a "B" and start a new tag here.
            2. We will truncate the prediction it according to the number of
               entry. If the prediction contains `<PAD>` element, this should
               remove them.

        Args:
            pack (DataPack):
                The datapack that contains the current instance.
            instance (Annotation):
                The instance to which the extractor add prediction.
            prediction (Iterable[Union[int, Any]]):
                This is the output of the model, which contains the index for
                attributes of one instance.
        """
        instance_tagging_unit: List[Annotation] = \
            list(pack.get(self.config.tagging_unit, instance))
        prediction = prediction[:len(instance_tagging_unit)]
        tags = [self.id2element(x) for x in prediction]
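        # Each decoded tag is a (type, indicator) pair, where the indicator
        # is one of "B", "I" or "O".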
        tag_start = None
        tag_end = None
        tag_type = None
        for entry, tag in zip(instance_tagging_unit, tags):
            if tag[1] == "O" or tag[1] == "B" or \
                    (tag[1] == "I" and tag[0] != tag_type):
                if tag_type:
                    entity_mention = EntityMention(pack, tag_start, tag_end)
                    entity_mention.ner_type = tag_type
                tag_start = entry.begin
                tag_end = entry.end
                tag_type = tag[0]
            else:
                tag_end = entry.end

        # Handle the final tag
        if tag_type is not None and \
                tag_start is not None and \
                tag_end is not None:
            entity_mention = EntityMention(pack, tag_start, tag_end)
            entity_mention.ner_type = tag_type
Example #12
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()

        with open(file_path, 'r', encoding='utf8') as fp:
            txt = ""
            offset = 0
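            # Each record in the file is three lines: the sentence, the
            # relation, and a comment line (skipped below).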

            while True:
                sent_line: str = fp.readline()
                if not sent_line:
                    break

                if len(sent_line.split()) == 0:
                    continue

                relation_line: str = fp.readline()
                # The comment line is not used.
                _ = fp.readline()

                sent_line = sent_line[sent_line.find('"') +
                                      1:sent_line.rfind('"')]
                index1 = sent_line.find("<e1>")
                index2 = sent_line.find("<e2>")
                # 5 is the length of "</e1>", include both <e1> and
                # </e1> when extracting the string.
                e1 = sent_line[index1:sent_line.find("</e1>") + 5]
                e2 = sent_line[index2:sent_line.find("</e2>") + 5]
                # Remove the <e1> and <e2> tags from the sentence.
                sent_line = sent_line.replace(e1, e1[4:-5])
                sent_line = sent_line.replace(e2, e2[4:-5])
                # Strip the tags from e1 and e2 themselves.
                e1 = e1[4:-5]
                e2 = e2[4:-5]
                # Re-calculate the indices after removing the tags from the
                # sentence: stripping <e1> and </e1> (or <e2> and </e2>)
                # shifts later text left by 9 characters.
                if index1 < index2:
                    diff1 = 0
                    diff2 = 9
                else:
                    diff1 = 9
                    diff2 = 0
                index1 += offset - diff1
                index2 += offset - diff2

                Sentence(pack, offset, offset + len(sent_line))
                entry1 = EntityMention(pack, index1, index1 + len(e1))
                entry2 = EntityMention(pack, index2, index2 + len(e2))
                offset += len(sent_line) + 1
                txt += sent_line + " "

                pair = relation_line[relation_line.find("(") +
                                     1:relation_line.find(")")]

                if "," in pair:
                    parent, _ = pair.split(",")
                    if parent == "e1":
                        relation = RelationLink(pack, entry1, entry2)
                    else:
                        relation = RelationLink(pack, entry2, entry1)
                    relation.rel_type = relation_line[:relation_line.find("(")]
                else:
                    # For "Other" relation, just set parent as e1
                    # set child as e2.
                    relation = RelationLink(pack, entry1, entry2)
                    relation.rel_type = relation_line.strip()

        pack.set_text(txt, replace_func=self.text_replace_operation)
        pack.pack_name = os.path.basename(file_path)

        yield pack
Example #13
    def _parse_pack(self, collection: str) -> Iterator[DataPack]:
        with open(collection, "r", encoding="utf8") as doc:
            pack_id: int = 0

            pack: DataPack = DataPack()
            text: str = ""
            offset: int = 0
            has_rows: bool = False

            sentence_begin: int = 0
            sentence_cnt: int = 0

            # An NER tag is either "O" or of the form "X-Y", where X is one
            # of B or I, and Y is an entity type such as ORG or PER.
            prev_y = None
            prev_x = None
            start_index = -1

            for line in doc:
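                # A "DOCSTART" line starts a new document, a non-empty line
                # is one token row, and a blank line ends a sentence.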
                line = line.strip()

                if line.find("DOCSTART") != -1:
                    # Skip the first DOCSTART.
                    if offset == 0:
                        continue
                    # Add the last sentence if one exists.
                    if has_rows:
                        Sentence(pack, sentence_begin, offset - 1)
                        sentence_cnt += 1

                    pack.set_text(text,
                                  replace_func=self.text_replace_operation)
                    Document(pack, 0, len(text))
                    pack.pack_name = collection + "_%d" % pack_id
                    pack_id += 1
                    yield pack

                    # Create a new datapack.
                    pack = DataPack()
                    text = ""
                    offset = 0
                    has_rows = False

                    sentence_begin = 0
                    sentence_cnt = 0

                    prev_y = None
                    prev_x = None
                    start_index = -1

                elif line != "" and not line.startswith("#"):
                    conll_components = line.split()

                    word = conll_components[0]
                    pos = conll_components[1]
                    chunk_id = conll_components[2]

                    ner_tag = conll_components[3]

                    # The previous NER span (if any) ends here.
                    if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                        # Close the previous entity mention if one is open.
                        if prev_y is not None:
                            entity_mention = EntityMention(
                                pack, start_index, offset - 1)
                            entity_mention.ner_type = prev_y

                        # Start processing the current NER tag.
                        if ner_tag == "O":
                            # Current ner tag is O, reset information.
                            prev_x = None
                            prev_y = None
                            start_index = -1
                        else:
                            # Current ner tag is B.
                            prev_x = "B"
                            prev_y = ner_tag.split("-")[1]
                            start_index = offset
                    # This NER tag continues the previous one.
                    else:
                        x, y = ner_tag.split("-")
                        assert x == "I", "Unseen tag %s in the file." % x
                        assert y == prev_y, "Error in %s." % ner_tag
                        assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                        prev_x = "I"

                    word_begin = offset
                    word_end = offset + len(word)

                    # Add the token.
                    token = Token(pack, word_begin, word_end)
                    token.pos = pos
                    token.chunk = chunk_id

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True
                else:
                    if not has_rows:
                        # Skip consecutive empty lines.
                        continue
                    # Add the sentence.
                    Sentence(pack, sentence_begin, offset - 1)

                    # Handle the last NER tag if one exists.
                    if prev_x is not None:
                        entity_mention = EntityMention(pack, start_index,
                                                       offset - 1)
                        entity_mention.ner_type = prev_y

                    # Reset information.
                    sentence_cnt += 1
                    has_rows = False
                    prev_y = None
                    prev_x = None
                    sentence_begin = offset

            if has_rows:
                # Add the last sentence if one exists.
                Sentence(pack, sentence_begin, offset - 1)
                sentence_cnt += 1

            pack.set_text(text, replace_func=self.text_replace_operation)
            Document(pack, 0, len(text))
            pack.pack_name = os.path.basename(collection)

            yield pack