Exemplo n.º 1
    def test_not_hashable(self):
        """Check that annotation entries refuse to be hashed.

        Both a plain ``Annotation`` and an ``EntityMention`` built on
        ``self.pack`` are expected to raise ``TypeError`` when passed to
        ``hash()``.
        """
        anno: Annotation = Annotation(self.pack, 0, 5)
        with self.assertRaises(TypeError):
            hash(anno)

        anno1: EntityMention = EntityMention(self.pack, 0, 2)
        with self.assertRaises(TypeError):
            hash(anno1)
Exemplo n.º 2
    def _process(self, input_pack: DataPack):
        """Create an ``EntityMention`` for every configured entity string.

        Each string in ``self.configs.entities_to_insert`` must occur in the
        text of ``input_pack``; the mention spans its first occurrence.

        Args:
            input_pack: The data pack whose text is searched and on which the
                entity mentions are created.

        Raises:
            Exception: If at least one configured string does not occur in
                the pack text. No mention is created in that case.
        """
        wanted = self.configs.entities_to_insert
        pack_text = input_pack.text

        # Validate the whole configuration first so that a bad entry leaves
        # the pack untouched.
        for candidate in wanted:
            if candidate not in pack_text:
                raise Exception(
                    "Entities to be added are not valid for the input text.")

        for candidate in wanted:
            # ``str.index`` returns the first occurrence only.
            begin = pack_text.index(candidate)
            EntityMention(input_pack, begin, begin + len(candidate))
Exemplo n.º 3
    def pack(self,
             data_pack: DataPack,
             output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
        # NOTE(review): the two prose lines below read like a docstring whose
        # quote delimiters are missing; as written they are not valid Python.
        Write the prediction results back to datapack. by writing the predicted
        ner to the original tokens.

        # NOTE(review): this guard has no body in the visible code --
        # presumably an early exit when there is nothing to write; confirm.
        if output_dict is None:

        # (begin offset, entity type) of the mention currently being built.
        current_entity_mention: Tuple[int, str] = (-1, "None")

        # output_dict["Token"]["tid"] and output_dict["Token"]["ner"] are
        # indexed first by instance (i) and then by token position (j).
        for i in range(len(output_dict["Token"]["tid"])):
            # an instance
            for j in range(len(output_dict["Token"]["tid"][i])):
                tid: int = output_dict["Token"]["tid"][i][j]  # type: ignore

                orig_token: Token = data_pack.get_entry(tid)  # type: ignore
                ner_tag: str = output_dict["Token"]["ner"][i][j]

                # NOTE(review): `ner_tag` is never used in the visible lines;
                # presumably it is stored on the token before the field is
                # read back below -- confirm against the full source.

                token = orig_token
                token_ner = token.get_field("ner")
                # The first character of the tag selects the branch
                # (B / I / O / E / S); the text from index 2 onward is used
                # as the entity type.
                if token_ner[0] == "B":
                    # Remember where the mention starts and what type it has.
                    current_entity_mention = (token.span.begin, token_ner[2:])
                # NOTE(review): the "I" and "O" branches have no body here.
                elif token_ner[0] == "I":
                elif token_ner[0] == "O":

                elif token_ner[0] == "E":
                    # NOTE(review): the body of this type-mismatch check is
                    # missing from the visible code.
                    if token_ner[2:] != current_entity_mention[1]:

                    kwargs_i = {"ner_type": current_entity_mention[1]}
                    # NOTE(review): the remaining arguments of this call are
                    # not visible.
                    entity = EntityMention(data_pack,
                elif token_ner[0] == "S":
                    # "S" starts a mention at this token and, unlike "B",
                    # creates the entity in the same branch.
                    current_entity_mention = (token.span.begin, token_ner[2:])
                    kwargs_i = {"ner_type": current_entity_mention[1]}
                    # NOTE(review): the call below is cut off; the rest of the
                    # method is not visible.
                    entity = EntityMention(data_pack,
Exemplo n.º 4
    def pack(self,
             data_pack: DataPack,
             output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
        # NOTE(review): the three prose lines below read like a docstring
        # whose quote delimiters are missing; as written they are not valid
        # Python.
        Write the prediction results back to datapack. by writing the predicted
        ner to the original subwords and convert predictions to something that
        makes sense in a word-by-word segmentation

        # NOTE(review): this guard has no body in the visible code --
        # presumably an early exit when there is nothing to write; confirm.
        if output_dict is None:

        # One iteration per instance: parallel lists of subword entry ids and
        # predicted labels.
        for i in range(len(output_dict["Subword"]["tid"])):
            tids = output_dict["Subword"]["tid"][i]
            labels = output_dict["Subword"]["ner"][i]

            # Keep only labels that are not in `self.ft_configs.ignore_labels`,
            # remembering each one's position and subword id.
            # NOTE(review): the closing bracket of this list is missing.
            entities = [
                dict(idx=idx, label=label, tid=tid)
                for idx, (label, tid) in enumerate(zip(labels, tids))
                if label not in self.ft_configs.ignore_labels

            # `_group_entities` is defined elsewhere; the loop below unpacks
            # each of its results as a (first_tid, last_tid) pair.
            entity_groups = self._group_entities(entities, data_pack, tids)
            # Add NER tags and create EntityMention ontologies.
            for first_tid, last_tid in entity_groups:
                # NOTE(review): the argument of this call is missing --
                # presumably `first_tid`; confirm.
                first_token: Subword = data_pack.get_entry(  # type: ignore
                first_token.ner = 'B-' + self.ft_configs.ner_type

                # Every following subword id up to and including `last_tid`
                # gets the continuation tag.
                for tid in range(first_tid + 1, last_tid + 1):
                    token: Subword = data_pack.get_entry(tid)  # type: ignore
                    token.ner = 'I-' + self.ft_configs.ner_type

                # The mention spans from the first subword's begin to the last
                # subword's end and always carries the configured type.
                begin = first_token.span.begin
                end = data_pack.get_entry(last_tid).span.end
                entity = EntityMention(data_pack, begin, end)
                entity.ner_type = self.ft_configs.ner_type
Exemplo n.º 5
    def _process(self, input_pack: DataPack):
        """Create an ``EntityMention`` for each labelled chunk of a sentence.

        For every ``Sentence`` in ``input_pack``, (text, pos) pairs of its
        tokens are passed to ``ne_chunk``; each resulting chunk that has a
        ``label`` attribute becomes an entity mention typed with that label.

        Args:
            input_pack: The data pack to read sentences from and to add
                entity mentions to.
        """
        for sentence in input_pack.get(Sentence):
            # NOTE(review): the arguments of this `list(` call are missing;
            # presumably it collects the tokens of `sentence` -- confirm.
            token_entries = list(
            tokens = [(token.text, token.pos) for token in token_entries]
            ne_tree = ne_chunk(tokens)

            # `index` is the position in `token_entries` of the first token of
            # the chunk being visited.
            index = 0
            for chunk in ne_tree:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    entity = EntityMention(input_pack, begin_pos, end_pos)
                    entity.ner_type = chunk.label()
                    index += len(chunk)
                    # NOTE(review): an `else:` appears to be missing here; as
                    # written the extra `index += 1` below runs for labelled
                    # chunks and never for plain tokens.
                    # For example:
                    # chunk: ('This', 'DT')
                    index += 1
Exemplo n.º 6
    def _process(self, input_pack: MultiPack):
        """Copy the entity mention spans of one pack into a newly added pack.

        The source pack is looked up under ``self.configs.copy_from`` and the
        destination pack is added under ``self.configs.copy_to``.

        Args:
            input_pack: The multi pack that holds the source pack and
                receives the copy.
        """
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        # NOTE(review): nothing here copies the source text into `copy_pack`;
        # confirm that the spans created below are valid for the new pack.

        # Derive the copy's name from the source, falling back to a fixed name
        # when the source pack is unnamed.
        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + '_copy'
        else:
            copy_pack.pack_name = 'copy'

        ent: EntityMention
        for ent in from_pack.get(EntityMention):
            # Only the span is copied; no other attribute is carried over.
            EntityMention(copy_pack, ent.begin, ent.end)
Exemplo n.º 7
    def _parse_pack(self, data: dict) -> Iterator[DataPack]:
        r"""Extracts information from input `data` of one document
        output from Prodigy Annotator including the text,
        tokens and its annotations into a DataPack.

        Args:
            data: a dict that contains information for one document. It is
                read at the keys ``'text'``, ``'tokens'``, ``'spans'`` and
                ``'meta'`` (whose ``'id'`` becomes the document id); every
                token and span item must provide ``'start'`` and ``'end'``.

        Yields:
            A single DataPack containing information extracted from `data`.
        """
        pack = DataPack()
        text = data['text']
        tokens = data['tokens']
        spans = data['spans']

        # The document entry covers the whole text.
        Document(pack, 0, len(text))
        pack.set_text(text, replace_func=self.text_replace_operation)

        for token in tokens:
            Token(pack, token['start'], token['end'])

        for span_items in spans:
            # Only the offsets of each span are used here.
            EntityMention(pack, span_items['start'], span_items['end'])

        pack.meta.doc_id = data['meta']['id']

        yield pack
Exemplo n.º 8
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        """Read a tab-separated relation file into a single DataPack.

        Every non-blank line is split on tabs: field 0 is the sentence,
        field 2 the relation type, and fields 3 and 4 the two argument
        strings. Each line contributes a ``Sentence``, one ``EntityMention``
        per argument and a ``RelationLink`` between them. The pack text is
        the sentences joined by newlines, and the pack is named after the
        file's base name.

        Args:
            file_path: Path of the UTF-8 encoded file to read.

        Yields:
            One DataPack holding the content of the whole file.
        """
        pack: DataPack = DataPack()
        pieces: List[str] = []
        # Character offset at which the next sentence starts in the pack text.
        cursor: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for raw_line in f:
                stripped = raw_line.strip()
                if stripped == "":
                    continue

                fields: List[str] = stripped.split("\t")

                # Add sentence.
                sentence = fields[0]
                pieces.append(sentence + "\n")
                Sentence(pack, cursor, cursor + len(sentence))

                # Locate both arguments by their first occurrence in the
                # sentence.
                # NOTE(review): `str.find` returns -1 when an argument is not
                # a substring of the sentence, which would shift the span one
                # character before the sentence start -- confirm the input
                # guarantees the arguments are present.
                first_arg = fields[3]
                first_begin = sentence.find(first_arg) + cursor
                arg1: EntityMention = EntityMention(
                    pack, first_begin, first_begin + len(first_arg))

                second_arg = fields[4]
                second_begin = sentence.find(second_arg) + cursor
                arg2: EntityMention = EntityMention(
                    pack, second_begin, second_begin + len(second_arg))

                link = RelationLink(pack, arg1, arg2)
                link.rel_type = fields[2]

                # +1 accounts for the newline appended after the sentence.
                cursor += len(sentence) + 1

        self.set_text(pack, "".join(pieces))
        pack.pack_name = os.path.basename(file_path)
        yield pack
Exemplo n.º 9
    def _process(self, input_pack: MultiPack):
        """Copy sentence and entity mention spans into a newly added pack.

        The source pack is looked up under ``self.configs.copy_from`` and the
        destination pack is added under ``self.configs.copy_to``.

        Args:
            input_pack: The multi pack that holds the source pack and
                receives the copy.
        """
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        # NOTE(review): nothing here copies the source text into `copy_pack`;
        # confirm that the spans created below are valid for the new pack.

        # Derive the copy's name from the source, falling back to a fixed name
        # when the source pack is unnamed.
        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + "_copy"
        else:
            copy_pack.pack_name = "copy"

        # Only spans are copied; no other attribute is carried over.
        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)

        e: EntityMention
        for e in from_pack.get(EntityMention):
            EntityMention(copy_pack, e.begin, e.end)
Exemplo n.º 10
    # NOTE(review): this signature is never closed (no `):` is visible) and it
    # has no `self` parameter; confirm both against the full source.
    def pack(
        pack: DataPack,
        predict_results: Dict[str, Dict[str, List[str]]],
        _: Optional[Annotation] = None,
        # NOTE(review): the two prose lines below read like a docstring whose
        # quote delimiters are missing; as written they are not valid Python.
        Write the prediction results back to datapack. by writing the predicted
        ner to the original tokens.

        # NOTE(review): this guard has no body in the visible code --
        # presumably an early exit when there is nothing to write; confirm.
        if predict_results is None:

        # (begin offset, entity type) of the mention currently being built.
        current_entity_mention: Tuple[int, str] = (-1, "None")

        # predict_results["Token"]["tid"] and predict_results["Token"]["ner"]
        # are indexed first by instance (i) and then by token position (j).
        for i in range(len(predict_results["Token"]["tid"])):
            # an instance
            for j in range(len(predict_results["Token"]["tid"][i])):
                tid: int = predict_results["Token"]["tid"][i][
                    j]  # type: ignore

                orig_token: Token = pack.get_entry(tid)  # type: ignore
                ner_tag: str = predict_results["Token"]["ner"][i][j]

                # Store the predicted tag on the token itself.
                orig_token.ner = ner_tag

                token = orig_token
                token_ner = token.ner
                assert isinstance(token_ner, str)
                # The first character of the tag selects the branch
                # (B / I / O / E / S); the text from index 2 onward is used
                # as the entity type.
                if token_ner[0] == "B":
                    # Remember where the mention starts and what type it has.
                    current_entity_mention = (token.begin, token_ner[2:])
                # NOTE(review): the "I" and "O" branches have no body here.
                elif token_ner[0] == "I":
                elif token_ner[0] == "O":

                elif token_ner[0] == "E":
                    # NOTE(review): the body of this type-mismatch check is
                    # missing from the visible code.
                    if token_ner[2:] != current_entity_mention[1]:

                    # NOTE(review): the end argument of this call is not
                    # visible.
                    entity = EntityMention(pack, current_entity_mention[0],
                    entity.ner_type = current_entity_mention[1]
                elif token_ner[0] == "S":
                    # "S" starts a mention at this token and, unlike "B",
                    # creates the entity in the same branch.
                    current_entity_mention = (token.begin, token_ner[2:])
                    # NOTE(review): the end argument of this call is not
                    # visible.
                    entity = EntityMention(pack, current_entity_mention[0],
                    entity.ner_type = current_entity_mention[1]
Exemplo n.º 11
    def add_to_pack(self, pack: DataPack, instance: Annotation,
                    prediction: List[int]):
        r"""Add the prediction for attribute to the instance. We make following
        assumptions for prediction.

            1. If we encounter "I" while its tag is different from the previous
               tag, we will consider this "I" as a "B" and start a new tag here.
            2. We will truncate the prediction it according to the number of
               entry. If the prediction contains `<PAD>` element, this should
               remove them.

        Args:
            pack (DataPack):
                The datapack that contains the current instance.
            instance (Annotation):
                The instance to which the extractor add prediction.
            prediction (Iterable[Union[int, Any]]):
                This is the output of the model, which contains the index for
                attributes of one instance.
        """
        instance_tagging_unit: List[Annotation] = \
            list(pack.get(self.config.tagging_unit, instance))
        # Drop any predictions beyond the number of tagging units.
        prediction = prediction[:len(instance_tagging_unit)]
        # Each decoded tag is indexed as tag[0] (the entity type) and tag[1]
        # (the position marker compared against "B", "I" and "O" below).
        tags = [self.id2element(x) for x in prediction]
        tag_start = None
        tag_end = None
        tag_type = None
        for entry, tag in zip(instance_tagging_unit, tags):
            if tag[1] == "O" or tag[1] == "B" or \
                    (tag[1] == "I" and tag[0] != tag_type):
                # A span boundary: emit the span collected so far, if any,
                # then start tracking from the current entry.
                if tag_type:
                    entity_mention = EntityMention(pack, tag_start, tag_end)
                    entity_mention.ner_type = tag_type
                tag_start = entry.begin
                tag_end = entry.end
                tag_type = tag[0]
            else:
                # A continuation of the current span: extend it to this entry.
                tag_end = entry.end

        # Handle the final tag
        if tag_type is not None and \
                tag_start is not None and \
                tag_end is not None:
            entity_mention = EntityMention(pack, tag_start, tag_end)
            entity_mention.ner_type = tag_type
Exemplo n.º 12
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        """Read a file of ``<e1>``/``<e2>``-marked sentences into one DataPack.

        The file is consumed three lines at a time: a sentence line, a
        relation line and a third line that is ignored. Each group adds a
        ``Sentence``, two ``EntityMention`` entries and a ``RelationLink``.

        NOTE(review): several lines of this method appear to be missing
        (empty ``if`` bodies, unterminated slice expressions, absent
        ``else:`` branches); the inline notes mark each gap.

        Args:
            file_path: Path of the UTF-8 encoded file to read.

        Yields:
            One DataPack holding the content of the whole file.
        """
        pack: DataPack = DataPack()

        with open(file_path, 'r', encoding='utf8') as fp:
            txt = ""
            offset = 0

            while True:
                sent_line: str = fp.readline()
                # `readline` returns an empty string at end of file.
                # NOTE(review): this check has no body -- presumably it leaves
                # the loop; confirm.
                if not sent_line:

                # NOTE(review): this whitespace-only-line check has no body --
                # presumably it skips the line; confirm.
                if len(sent_line.split()) == 0:

                relation_line: str = fp.readline()
                # Command line is not used.
                _ = fp.readline()

                # NOTE(review): the slice expression below is cut off after
                # the `+`; its upper bound is not visible.
                sent_line = sent_line[sent_line.find('"') +
                index1 = sent_line.find("<e1>")
                index2 = sent_line.find("<e2>")
                # 5 is the length of "</e1>", include both <e1> and
                # </e1> when extracting the string.
                e1 = sent_line[index1:sent_line.find("</e1>") + 5]
                e2 = sent_line[index2:sent_line.find("</e2>") + 5]
                # Remove <e1> and </e1> in the sentence.
                sent_line = sent_line.replace(e1, e1[4:-5])
                sent_line = sent_line.replace(e2, e2[4:-5])
                # Remove <e1> and </e1> in e1.
                e1 = e1[4:-5]
                e2 = e2[4:-5]
                # Re-calculate the index after removing <e1>, </e1> in
                # in the sentence.
                # 9 is the combined length of an opening and closing marker
                # (4 + 5): whichever entity comes second is shifted left by
                # the markers removed around the first one.
                # NOTE(review): an `else:` appears to be missing between the
                # two pairs of assignments; as written the second pair always
                # overwrites the first.
                if index1 < index2:
                    diff1 = 0
                    diff2 = 9
                    diff1 = 9
                    diff2 = 0
                index1 += offset - diff1
                index2 += offset - diff2

                Sentence(pack, offset, offset + len(sent_line))
                entry1 = EntityMention(pack, index1, index1 + len(e1))
                entry2 = EntityMention(pack, index2, index2 + len(e2))
                # +1 accounts for the separator appended to `txt` below.
                offset += len(sent_line) + 1
                txt += sent_line + " "

                # NOTE(review): the slice expression below is cut off after
                # the `+`; its upper bound is not visible.
                pair = relation_line[relation_line.find("(") +

                if "," in pair:
                    parent, _ = pair.split(",")
                    # NOTE(review): an `else:` appears to be missing between
                    # the two `RelationLink` constructions; as written both
                    # run when the parent is "e1".
                    if parent == "e1":
                        relation = RelationLink(pack, entry1, entry2)
                        relation = RelationLink(pack, entry2, entry1)
                    # The relation type is the text before the "(".
                    relation.rel_type = relation_line[:relation_line.find("(")]
                    # NOTE(review): the comment below describes a separate
                    # case, so an `else:` for the `"," in pair` test appears
                    # to be missing before it.
                    # For "Other" relation, just set parent as e1
                    # set child as e2.
                    relation = RelationLink(pack, entry1, entry2)
                    relation.rel_type = relation_line.strip()

        pack.set_text(txt, replace_func=self.text_replace_operation)
        pack.pack_name = os.path.basename(file_path)

        yield pack
Exemplo n.º 13
    def _parse_pack(self, collection: str) -> Iterator[DataPack]:
        """Read a column-formatted NER file and yield one DataPack per document.

        Each data line is split on whitespace into word, POS tag, chunk tag
        and NER tag (columns 0-3). Words are joined with single spaces to form
        the pack text. Any line containing ``DOCSTART`` closes the current
        document (the very first one is skipped); blank lines and lines
        starting with ``#`` close the current sentence.

        Packs yielded at a ``DOCSTART`` boundary are named
        ``collection + "_<n>"``; the final pack is named after the base name
        of ``collection``.

        Args:
            collection: Path of the UTF-8 encoded file to read.

        Yields:
            One DataPack per document, with ``Token``, ``Sentence``,
            ``EntityMention`` and ``Document`` entries.
        """
        with open(collection, "r", encoding="utf8") as doc:
            pack_id: int = 0

            pack: DataPack = DataPack()
            text: str = ""
            offset: int = 0
            has_rows: bool = False

            sentence_begin: int = 0
            sentence_cnt: int = 0

            # NER tag is either "O" or in the format "X-Y",
            # where X is one of B, I,
            # Y is a tag like ORG, PER etc
            prev_y = None
            prev_x = None
            start_index = -1

            for line in doc:
                line = line.strip()

                if line.find("DOCSTART") != -1:
                    # Skip the first DOCSTART.
                    if offset == 0:
                        continue
                    # Add remaining sentence.
                    if has_rows:
                        # Add the last sentence if exists.
                        Sentence(pack, sentence_begin, offset - 1)
                        sentence_cnt += 1

                    # Set the text before yielding, exactly as the
                    # end-of-file path below does.
                    pack.set_text(
                        text, replace_func=self.text_replace_operation)
                    Document(pack, 0, len(text))
                    pack.pack_name = collection + "_%d" % pack_id
                    pack_id += 1
                    yield pack

                    # Create a new datapack.
                    pack = DataPack()
                    text = ""
                    offset = 0
                    has_rows = False

                    sentence_begin = 0
                    sentence_cnt = 0

                    prev_y = None
                    prev_x = None
                    start_index = -1

                elif line != "" and not line.startswith("#"):
                    conll_components = line.split()

                    word = conll_components[0]
                    pos = conll_components[1]
                    chunk_id = conll_components[2]

                    ner_tag = conll_components[3]

                    # A new ner tag occurs.
                    if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                        # Add previous ner tag to sentence if it exists.
                        # `offset - 1` excludes the space appended after the
                        # previous word.
                        if prev_y is not None:
                            entity_mention = EntityMention(
                                pack, start_index, offset - 1)
                            entity_mention.ner_type = prev_y

                        # Start process current ner tag.
                        if ner_tag == "O":
                            # Current ner tag is O, reset information.
                            prev_x = None
                            prev_y = None
                            start_index = -1
                        else:
                            # Current ner tag is B.
                            prev_x = "B"
                            prev_y = ner_tag.split("-")[1]
                            start_index = offset
                    # This ner tag is connected to previous one.
                    else:
                        x, y = ner_tag.split("-")
                        assert x == "I", "Unseen tag %s in the file." % x
                        assert y == prev_y, "Error in %s." % ner_tag
                        assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                        prev_x = "I"

                    word_begin = offset
                    word_end = offset + len(word)

                    # Add tokens.
                    token = Token(pack, word_begin, word_end)
                    token.pos = pos
                    token.chunk = chunk_id

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True
                else:
                    if not has_rows:
                        # Skip consecutive empty lines.
                        continue
                    # Add sentence
                    Sentence(pack, sentence_begin, offset - 1)

                    # Handle the last ner tag if exists.
                    if prev_x is not None:
                        entity_mention = EntityMention(pack, start_index,
                                                       offset - 1)
                        entity_mention.ner_type = prev_y

                    # Reset information.
                    sentence_cnt += 1
                    has_rows = False
                    prev_y = None
                    prev_x = None
                    sentence_begin = offset

            if has_rows:
                # Add the last sentence if exists.
                Sentence(pack, sentence_begin, offset - 1)
                sentence_cnt += 1

            pack.set_text(text, replace_func=self.text_replace_operation)
            Document(pack, 0, len(text))
            pack.pack_name = os.path.basename(collection)

            yield pack