Example #1
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                token = Token(pack, word_begin, word_end)
                token.pos = pos
                token.chunk = chunk_id
                token.ner = ner_tag

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # add sentence
                Sentence(pack, sentence_begin, offset - 1)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence if exists.
            Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path
        doc.close()

        yield pack
Example #2
    def _process(self, input_pack: DataPack):
        pattern = "\\.\\s*"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            end = m.end()
            Sentence(input_pack, start, end)
            start = end

        if start < len(input_pack.text):
            input_pack.add_entry(
                Sentence(input_pack, start, len(input_pack.text)))
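A processor like the one above only defines `_process`; to run it, it is attached to a Forte pipeline behind a reader. The snippet below is a minimal sketch rather than repository code: the `PeriodSentenceSplitter` class name is hypothetical, its body simply reuses the regex logic of Example #2, and the pipeline wiring assumes the standard `Pipeline` / `StringReader` API of recent Forte releases.

import re

from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Sentence


class PeriodSentenceSplitter(PackProcessor):
    """Hypothetical processor reusing the `_process` logic of Example #2."""

    def _process(self, input_pack: DataPack):
        start = 0
        for m in re.finditer(r"\.\s*", input_pack.text):
            # Recent Forte versions register annotations created with the
            # pack automatically after the component finishes.
            Sentence(input_pack, start, m.end())
            start = m.end()
        if start < len(input_pack.text):
            Sentence(input_pack, start, len(input_pack.text))


pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(PeriodSentenceSplitter())
pipeline.initialize()

pack = pipeline.process("Forte builds NLP pipelines from readers and processors. "
                        "Each sentence becomes an annotation.")
for sentence in pack.get(Sentence):
    print(sentence.text)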
Example #3
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        data_pack: DataPack = self.new_pack()

        sent_begin: int = 0
        doc_text: str = ""

        with open(file_path, encoding="utf8") as doc:
            for para in doc:
                para = self.preprocess_reviews(para)
                sents = para.split("\n")
                for sent in sents:
                    if len(sent) > 0:
                        sent = sent.strip()
                        doc_text += sent + " "
                        doc_offset = sent_begin + len(sent) + 1
                        # Add sentences.
                        Sentence(data_pack, sent_begin, doc_offset - 1)
                        sent_begin = doc_offset

        pos_dir: str = os.path.basename(os.path.dirname(file_path))
        movie_file: str = os.path.basename(file_path)
        title: List = movie_file.split('_')
        doc_id: str = pos_dir + title[0]
        score: float = float(title[1].split('.')[0])
        score /= 10.0

        data_pack.pack_name = doc_id
        data_pack.set_text(doc_text)

        # Add documents.
        document: Document = Document(data_pack, 0, len(doc_text))
        document.sentiment = {doc_id: score}

        yield data_pack
Example #4
    def _parse_pack(self, sent_lines) -> Iterator[DataPack]:
        data_pack: DataPack = DataPack()
        sent_bias: int = 0
        batch_text: str = "\n".join(
            [sent_text for _, sent_text, _ in sent_lines]
        )
        data_pack.set_text(batch_text)

        for i, sent_line in enumerate(sent_lines):
            sent_id: str = sent_line[0]
            sent_text: str = sent_line[1].strip()
            parent_pointer_list: List[int] = sent_line[2]
            # Name the data_pack with the first sentence id.
            if i == 0:
                data_pack.pack_name = sent_id
            # Add sentence to data_pack.
            Sentence(data_pack, sent_bias, sent_bias + len(sent_text))
            self._parse_parent_pointer_list(
                data_pack,
                sent_bias,
                sent_text,
                parent_pointer_list
            )

            sent_bias += len(sent_text) + 1

        yield data_pack
Example #5
    def _process_parser(self, sentences, input_pack: DataPack):
        """Parse the sentence. Default behaviour is to segment sentence, POSTag
        and Lemmatize.

        Args:
            sentences: Generator object which yields sentences in document
            input_pack: input pack which needs to be modified

        Returns:

        """
        for sentence in sentences:
            Sentence(input_pack, sentence.start_char, sentence.end_char)

            if "tokenize" in self.processors:
                # Iterating through spaCy token objects
                for word in sentence:
                    begin_pos_word = word.idx
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word, end_pos_word)

                    if "pos" in self.processors:
                        token.pos = word.tag_

                    if "lemma" in self.processors:
                        token.lemma = word.lemma_
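The `sentences` argument consumed by `_process_parser` above is an iterable of spaCy sentence spans. The fragment below is an illustrative sketch of how such input is usually produced; it assumes spaCy and the `en_core_web_sm` model are installed, and the sample text is made up.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Forte wraps spaCy output as annotations. Each sentence becomes a span.")

# doc.sents yields spaCy Span objects exposing start_char and end_char, and
# iterating a Span yields Token objects with idx, text, tag_ and lemma_,
# which are exactly the attributes _process_parser reads above.
for sentence in doc.sents:
    print(sentence.start_char, sentence.end_char, sentence.text)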
Example #6
    def pack(self, data_pack: MultiPack, output_dict):
        """
        Write the prediction results back to datapack. If :attr:`_overwrite`
        is `True`, write the predicted ner to the original tokens.
        Otherwise, create a new set of tokens and write the predicted ner
        to the new tokens (usually use this configuration for evaluation.)
        """
        assert output_dict is not None
        output_pack = data_pack.get_pack(self.output_pack_name)

        input_sent_tids = output_dict["input_sents_tids"]
        output_sentences = output_dict["output_sents"]

        text = output_pack.text
        input_pack = data_pack.get_pack(self.input_pack_name)
        for input_id, output_sentence in zip(input_sent_tids, output_sentences):
            offset = len(output_pack.text)
            sent = Sentence(output_pack, offset, offset + len(output_sentence))
            output_pack.add_entry(sent)
            text += output_sentence + "\n"

            input_sent = input_pack.get_entry(input_id)
            cross_link = MultiPackLink(
                data_pack, data_pack.subentry(self.input_pack_name, input_sent),
                data_pack.subentry(self.output_pack_name, sent))
            data_pack.add_entry(cross_link)
            # We may also consider adding two link with opposite directions
            # Here the unidirectional link indicates the generation dependency
        output_pack.set_text(text)
Example #7
    def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:

        m_pack: MultiPack = MultiPack()

        input_pack_name = self.config.input_pack_name
        output_pack_name = self.config.output_pack_name

        text = ""
        offset = 0
        with open(file_path, "r", encoding="utf8") as doc:

            input_pack = DataPack(doc_id=file_path)

            for line in doc:
                line = line.strip()

                if len(line) == 0:
                    continue

                # add sentence
                sent = Sentence(input_pack, offset, offset + len(line))
                input_pack.add_entry(sent)
                text += line + '\n'
                offset = offset + len(line) + 1

            input_pack.set_text(text, replace_func=self.text_replace_operation)

            output_pack = DataPack()

            m_pack.update_pack({
                input_pack_name: input_pack,
                output_pack_name: output_pack
            })

            yield m_pack
Example #8
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = self.new_pack()
        text: str = ""
        offset: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if line != "":

                    line_list1 = line.split("\"sentenceTokens\":")
                    line_list2 = line_list1[1].split(",\"verbEntries\"")
                    sentence = line_list2[0]
                    # Add sentence.

                    temp_offset = offset
                    temp_offset = temp_offset + len(line_list1[0])
                    temp_offset = temp_offset + len("\"sentenceTokens\":")

                    Sentence(pack, temp_offset, temp_offset + len(sentence))

                    # +1 for the separator appended after each line.
                    offset += len(line) + 1
                    text += line + " "

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path

        yield pack
Example #9
    def _parse_pack(self, base_and_path: Tuple[str,
                                               str]) -> Iterator[MultiPack]:
        base_dir, file_path = base_and_path

        m_pack: MultiPack = MultiPack()

        input_pack_name = self.config.input_pack_name
        output_pack_name = self.config.output_pack_name

        text = ""
        offset = 0
        with open(file_path, "r", encoding="utf8") as doc:
            # Remove the base directory prefix to form the doc id.
            doc_id = (file_path[len(base_dir):]
                      if file_path.startswith(base_dir) else file_path)
            doc_id = doc_id.strip(os.path.sep)

            input_pack = m_pack.add_pack(input_pack_name)
            input_pack.doc_id = doc_id

            for line in doc:
                line = line.strip()

                if len(line) == 0:
                    continue

                # add sentence
                Sentence(input_pack, offset, offset + len(line))
                text += line + '\n'
                offset = offset + len(line) + 1

            input_pack.set_text(text, replace_func=self.text_replace_operation)
            # Create a output pack without text.
            m_pack.add_pack(output_pack_name)
            yield m_pack
Example #10
    def _process(self, input_pack: DataPack):
        pattern = '\\.\\s*'
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            end = m.end()
            Sentence(input_pack, start, end)
            start = end
Example #11
    def _process(self, input_pack: DataPack):
        doc = input_pack.text

        if len(doc) == 0:
            logging.warning("Find empty text in doc.")

        # sentence parsing
        sentences = self.nlp(doc).sentences

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            Sentence(
                input_pack,
                sentence.tokens[0].start_char,
                sentence.tokens[-1].end_char,
            )

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    misc = word.misc.split("|")

                    t_start = -1
                    t_end = -1
                    for m in misc:
                        k, v = m.split("=")
                        if k == "start_char":
                            t_start = int(v)
                        elif k == "end_char":
                            t_end = int(v)

                    if t_start < 0 or t_end < 0:
                        raise ValueError(
                            "Cannot determine word start or end for "
                            "stanfordnlp."
                        )

                    token = Token(input_pack, t_start, t_end)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.head - 1]  # Head token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.deprel
Example #12
def build_ngram(sent: Sentence, n: int):
    # Should exclude light words from ngrams.

    if n == 1:
        return [[t] for t in sent.get(Token)]

    ngrams = []
    ngram = []
    for t in sent.get(Token):
        ngram.append(t)
        if len(ngram) == n:
            # Emit the current window, then slide it forward by one token.
            ngrams.append(ngram)
            ngram = ngram[1:]
    return ngrams
Example #13
    def test_back_translation(self):
        random.seed(0)
        data_pack = DataPack()
        text = "Natural Language Processing has never been made this simple!"
        data_pack.set_text(text)
        sent = Sentence(data_pack, 0, len(text))
        data_pack.add_entry(sent)

        translated_text = "The treatment of natural language has never been easier!"
        assert(translated_text == self.bta.replace(sent)[1])
Example #14
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    oie_component: List[str] = line.split("\t")
                    sentence: str = oie_component[0]

                    # Add sentence.
                    Sentence(pack, offset, offset + len(sentence))
                    offset += len(sentence) + 1
                    text += sentence + " "

                    head_predicate: str = oie_component[1]
                    full_predicate: str = oie_component[2]

                    # Add head predicate.
                    token: Token = Token(pack,
                                         offset,
                                         offset + len(head_predicate))
                    offset += len(head_predicate) + 1
                    text += head_predicate + " "

                    # Add full predicate.
                    predicate_mention: PredicateMention = PredicateMention(
                        pack, offset, offset + len(full_predicate))
                    predicate_mention.headword = token
                    offset += len(full_predicate) + 1
                    text += full_predicate + " "

                    for arg in oie_component[3:]:
                        # Add predicate argument.
                        predicate_arg: PredicateArgument = \
                            PredicateArgument(pack,
                                              offset,
                                              offset + len(arg))
                        offset += len(arg) + 1
                        text += arg + " "

                        # Add predicate link.
                        PredicateLink(pack, predicate_mention, predicate_arg)

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path

        yield pack
Example #15
    def _process(self, input_pack: DataPack):
        text = input_pack.text
        end_pos = 0
        paragraphs = [p for p in text.split('\n') if p]
        for paragraph in paragraphs:
            sentences = sent_tokenize(paragraph)
            for sentence_text in sentences:
                begin_pos = text.find(sentence_text, end_pos)
                end_pos = begin_pos + len(sentence_text)
                sentence_entry = Sentence(input_pack, begin_pos, end_pos)
                input_pack.add_or_get_entry(sentence_entry)
Example #16
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        with open(file_path, "r", encoding="utf8") as doc:
            for line in doc:
                pack = DataPack(doc_id=file_path)
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = Sentence(pack, 0, len(line))
                pack.add_entry(sent)
                pack.set_text(line)
                self.count += 1
                yield pack
Example #17
    def _process(self, input_pack: DataPack):
        doc = input_pack.text
        end_pos = 0

        # sentence parsing
        sentences = self.nlp(doc).sentences  # type: ignore

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            begin_pos = doc.find(sentence.words[0].text, end_pos)
            end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
                sentence.words[-1].text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)
            input_pack.add_or_get_entry(sentence_entry)

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                offset = sentence_entry.span.begin
                end_pos_word = 0

                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    begin_pos_word = sentence_entry.text. \
                        find(word.text, end_pos_word)
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack,
                                  begin_pos_word + offset,
                                  end_pos_word + offset
                                  )

                    if "pos" in self.processors:
                        token.set_fields(pos=word.pos)
                        token.set_fields(upos=word.upos)
                        token.set_fields(xpos=word.xpos)

                    if "lemma" in self.processors:
                        token.set_fields(lemma=word.lemma)

                    tokens.append(token)
                    input_pack.add_or_get_entry(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.governor - 1]  # Head (governor) token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.set_fields(
                        rel_type=word.dependency_relation)

                    input_pack.add_or_get_entry(relation_entry)
Example #18
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + '_copy'
        else:
            copy_pack.pack_name = 'copy'

        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)
Example #19
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:  # type: ignore
        with open(file_path, "r", encoding="utf8") as doc:
            for line in doc:
                line = line.strip()
                if len(line) == 0:
                    continue

                m_pack = MultiPack()
                pack = m_pack.add_pack("pack")
                pack.set_text(line)

                Sentence(pack, 0, len(line))
                self.count += 1

                yield m_pack  # type: ignore
Example #20
    def _parse_pack(self,
                    file_path: str) -> Iterator[DataPack]:  # type: ignore
        with open(file_path, "r", encoding="utf8") as doc:
            for line in doc:
                m_pack = MultiPack()
                pack = DataPack(doc_id=file_path)
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = Sentence(pack, 0, len(line))
                pack.add_entry(sent)
                pack.set_text(line)
                self.count += 1
                m_pack.update_pack({"pack": pack})
                yield m_pack  # type: ignore
Example #21
    def _process(self, input_pack: DataPack):
        # pylint: disable=no-self-use
        text = input_pack.text

        begin_pos = 0
        while begin_pos < len(text):
            end_pos = text.find('.', begin_pos)
            if end_pos == -1:
                end_pos = len(text) - 1
            sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
            input_pack.add_or_get_entry(sentence_entry)

            begin_pos = end_pos + 1
            while begin_pos < len(text) and text[begin_pos] == " ":
                begin_pos += 1
Example #22
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + "_copy"
        else:
            copy_pack.pack_name = "copy"

        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)

        e: EntityMention
        for e in from_pack.get(EntityMention):
            EntityMention(copy_pack, e.begin, e.end)
Example #23
    def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:
        m_pack: MultiPack = MultiPack()

        input_pack_name = "input_src"
        output_pack_name = "output_tgt"

        with open(file_path, "r", encoding="utf8") as doc:
            text = ""
            offset = 0

            sentence_cnt = 0

            input_pack = DataPack(doc_id=file_path)

            for line in doc:
                line = line.strip()
                if len(line) == 0:
                    # skip empty lines
                    continue
                # add sentence
                sent = Sentence(input_pack, offset, offset + len(line))
                input_pack.add_entry(sent)
                text += line + '\n'
                offset = offset + len(line) + 1

                sentence_cnt += 1

                if sentence_cnt >= 20:
                    break

            input_pack.set_text(text, replace_func=self.text_replace_operation)

        output_pack = DataPack()

        m_pack.update_pack({
            input_pack_name: input_pack,
            output_pack_name: output_pack
        })

        yield m_pack
Example #24
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    oie_component: List[str] = line.split("\t")

                    # Add sentence.
                    sentence = oie_component[0]
                    text += sentence + "\n"
                    Sentence(pack, offset, offset + len(sentence))

                    # Find argument 1.
                    arg1_begin = sentence.find(oie_component[3]) + offset
                    arg1_end = arg1_begin + len(oie_component[3])
                    arg1: EntityMention = EntityMention(
                        pack, arg1_begin, arg1_end)

                    # Find argument 2.
                    arg2_begin = sentence.find(oie_component[4]) + offset
                    arg2_end = arg2_begin + len(oie_component[4])
                    arg2: EntityMention = EntityMention(
                        pack, arg2_begin, arg2_end)

                    head_relation = RelationLink(pack, arg1, arg2)
                    head_relation.rel_type = oie_component[2]

                    offset += len(sentence) + 1

        self.set_text(pack, text)
        pack.pack_name = os.path.basename(file_path)
        yield pack
Example #25
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)

                    if fields.pos_tag is not None:
                        token.pos = fields.pos_tag
                    if fields.word_sense is not None:
                        token.sense = fields.word_sense

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)

                        pred_mention.predicate_lemma = fields.lemmatised_word
                        pred_mention.is_verb = word_is_verbal_predicate

                        if fields.framenet_id is not None:
                            pred_mention.framenet_id = fields.framenet_id

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            link = PredicateLink(pack, predicate, arg[0])
                            link.arg_type = arg[1]

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.speaker = speaker
                    if part_id is not None:
                        sent.part_id = int(part_id)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                group = CoreferenceGroup(pack)
                group.add_members(mention_list)

            text = " ".join(words)
            pack.set_text(text, replace_func=self.text_replace_operation)

            _ = Document(pack, 0, len(text))
            if document_id is not None:
                pack.pack_name = document_id
        yield pack
Example #26
    def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
        # pylint: disable=no-self-use
        token_comp_fields = [
            "id", "form", "lemma", "pos", "ud_xpos", "features", "head",
            "label", "enhanced_dependency_relations", "ud_misc"
        ]

        token_multi_fields = [
            "features", "ud_misc", "enhanced_dependency_relations"
        ]

        token_feature_fields = ["features", "ud_misc"]

        token_entry_fields = ["lemma", "pos", "ud_xpos", "features", "ud_misc"]

        data_pack: DataPack = DataPack()
        doc_sent_begin: int = 0
        doc_num_sent: int = 0
        doc_text: str = ''
        doc_offset: int = 0
        doc_id: str

        sent_text: str
        sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

        for line in doc_lines:
            line = line.strip()
            line_comps = line.split()

            if line.startswith("# newdoc"):
                doc_id = line.split("=")[1].strip()

            elif line.startswith("# sent"):
                sent_text = ''

            elif len(line_comps) > 0 and \
                    line_comps[0].strip().isdigit():
                # token
                token_comps: Dict[str, Any] = {}

                for index, key in enumerate(token_comp_fields):
                    token_comps[key] = str(line_comps[index])

                    if key in token_multi_fields:
                        values = str(token_comps[key]).split("|") \
                            if token_comps[key] != '_' else []
                        if key not in token_feature_fields:
                            token_comps[key] = values
                        else:
                            feature_lst = [
                                elem.split('=', 1) for elem in values
                            ]
                            feature_dict = {
                                elem[0]: elem[1]
                                for elem in feature_lst
                            }
                            token_comps[key] = feature_dict

                word: str = token_comps["form"]
                word_begin = doc_offset
                word_end = doc_offset + len(word)

                token: Token \
                    = Token(data_pack, word_begin, word_end)
                kwargs = {key: token_comps[key] for key in token_entry_fields}

                # add token
                token.set_fields(**kwargs)
                data_pack.add_or_get_entry(token)

                sent_tokens[str(token_comps["id"])] = (token_comps, token)

                sent_text += word + " "
                doc_offset = word_end + 1

            elif line == "":
                # sentence ends
                sent_text = sent_text.strip()
                doc_text += ' ' + sent_text

                # add dependencies for a sentence when all the tokens have been
                # added
                for token_id in sent_tokens:
                    token_comps, token = sent_tokens[token_id]

                    def add_dependency(dep_parent, dep_child, dep_label,
                                       dep_type, data_pack_):
                        """Adds dependency to a data_pack
                        Args:
                            dep_parent: dependency parent token
                            dep_child: dependency child token
                            dep_label: dependency label
                            dep_type: "primary" or "enhanced" dependency
                            data_pack_: data_pack to which the
                            dependency is to be added
                        """
                        dependency = Dependency(data_pack, dep_parent,
                                                dep_child)
                        dependency.dep_label = dep_label
                        dependency.type = dep_type
                        data_pack_.add_or_get_entry(dependency)

                    # add primary dependency
                    label = token_comps["label"]
                    if label == "root":
                        token.is_root = True
                    else:
                        token.is_root = False
                        head = sent_tokens[token_comps["head"]][1]
                        add_dependency(head, token, label, "primary",
                                       data_pack)

                    # add enhanced dependencies
                    for dep in token_comps["enhanced_dependency_relations"]:
                        head_id, label = dep.split(":", 1)
                        if label != "root":
                            head = sent_tokens[head_id][1]
                            add_dependency(head, token, label, "enhanced",
                                           data_pack)

                # add sentence
                sent = Sentence(data_pack, doc_sent_begin, doc_offset - 1)
                data_pack.add_or_get_entry(sent)

                doc_sent_begin = doc_offset
                doc_num_sent += 1

        # add doc to data_pack
        document = Document(data_pack, 0, len(doc_text))
        data_pack.add_or_get_entry(document)
        data_pack.meta.doc_id = doc_id
        data_pack.set_text(doc_text.strip())

        yield data_pack
Example #27
    def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
        token_comp_fields = ["id", "form", "lemma", "pos",
                             "ud_xpos", "ud_features", "head", "label",
                             "enhanced_dependency_relations", "ud_misc"]

        token_multi_fields = ["ud_features", "ud_misc",
                              "enhanced_dependency_relations"]

        token_feature_fields = ["ud_features", "ud_misc"]

        data_pack: DataPack = DataPack()
        doc_sent_begin: int = 0
        doc_num_sent: int = 0
        doc_text: str = ''
        doc_offset: int = 0
        doc_id: str

        sent_text: str
        sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

        for line in doc_lines:
            line = line.strip()
            line_comps = line.split()

            if line.startswith("# newdoc"):
                doc_id = line.split("=")[1].strip()

            elif line.startswith("# sent"):
                sent_text = ''

            elif len(line_comps) > 0 and \
                    line_comps[0].strip().isdigit():
                # token
                token_comps: Dict[str, Any] = {}

                for index, key in enumerate(token_comp_fields):
                    token_comps[key] = str(line_comps[index])

                    if key in token_multi_fields:
                        values = str(token_comps[key]).split("|") \
                            if token_comps[key] != '_' else []
                        if key not in token_feature_fields:
                            token_comps[key] = values
                        else:
                            feature_lst = [elem.split('=', 1)
                                           for elem in values]
                            feature_dict = {elem[0]: elem[1]
                                            for elem in feature_lst}
                            token_comps[key] = feature_dict

                word: str = token_comps["form"]
                word_begin = doc_offset
                word_end = doc_offset + len(word)

                # add token
                token: Token = Token(data_pack, word_begin, word_end)

                token.lemma = token_comps['lemma']
                token.pos = token_comps['pos']
                token.ud_xpos = token_comps['ud_xpos']
                token.ud_features = token_comps['ud_features']
                token.ud_misc = token_comps['ud_misc']

                sent_tokens[str(token_comps["id"])] = (token_comps, token)

                sent_text += word + " "
                doc_offset = word_end + 1

            elif line == "":
                # sentence ends
                sent_text = sent_text.strip()
                doc_text += ' ' + sent_text

                # add dependencies for a sentence when all the tokens have been
                # added
                for token_id in sent_tokens:
                    token_comps, token = sent_tokens[token_id]

                    # add primary dependency
                    label = token_comps["label"]
                    if label == "root":
                        token.is_root = True
                    else:
                        token.is_root = False
                        head = sent_tokens[token_comps["head"]][1]
                        dependency = Dependency(data_pack, head, token)
                        dependency.dep_label = label

                    # add enhanced dependencies
                    for dep in token_comps["enhanced_dependency_relations"]:
                        head_id, label = dep.split(":", 1)
                        if label != "root":
                            head = sent_tokens[head_id][1]
                            enhanced_dependency = \
                                EnhancedDependency(data_pack, head, token)
                            enhanced_dependency.dep_label = label

                # add sentence
                Sentence(data_pack, doc_sent_begin, doc_offset - 1)

                doc_sent_begin = doc_offset
                doc_num_sent += 1

        doc_text = doc_text.strip()
        data_pack.set_text(doc_text)

        # add doc to data_pack
        Document(data_pack, 0, len(doc_text))
        data_pack.pack_name = doc_id

        yield data_pack
Example #28
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()

        with open(file_path, 'r', encoding='utf8') as fp:
            txt = ""
            offset = 0

            while True:
                sent_line: str = fp.readline()
                if not sent_line:
                    break

                if len(sent_line.split()) == 0:
                    continue

                relation_line: str = fp.readline()
                # Command line is not used.
                _ = fp.readline()

                sent_line = sent_line[sent_line.find('"') +
                                      1:sent_line.rfind('"')]
                index1 = sent_line.find("<e1>")
                index2 = sent_line.find("<e2>")
                # 5 is the length of "</e1>", include both <e1> and
                # </e1> when extracting the string.
                e1 = sent_line[index1:sent_line.find("</e1>") + 5]
                e2 = sent_line[index2:sent_line.find("</e2>") + 5]
                # Remove <e1> and </e1> in the sentence.
                sent_line = sent_line.replace(e1, e1[4:-5])
                sent_line = sent_line.replace(e2, e2[4:-5])
                # Remove <e1> and </e1> in e1.
                e1 = e1[4:-5]
                e2 = e2[4:-5]
                # Re-calculate the indices after removing <e1> and </e1>
                # from the sentence.
                if index1 < index2:
                    diff1 = 0
                    diff2 = 9
                else:
                    diff1 = 9
                    diff2 = 0
                index1 += offset - diff1
                index2 += offset - diff2

                Sentence(pack, offset, offset + len(sent_line))
                entry1 = EntityMention(pack, index1, index1 + len(e1))
                entry2 = EntityMention(pack, index2, index2 + len(e2))
                offset += len(sent_line) + 1
                txt += sent_line + " "

                pair = relation_line[relation_line.find("(") +
                                     1:relation_line.find(")")]

                if "," in pair:
                    parent, _ = pair.split(",")
                    if parent == "e1":
                        relation = RelationLink(pack, entry1, entry2)
                    else:
                        relation = RelationLink(pack, entry2, entry1)
                    relation.rel_type = relation_line[:relation_line.find("(")]
                else:
                    # For "Other" relation, just set parent as e1
                    # set child as e2.
                    relation = RelationLink(pack, entry1, entry2)
                    relation.rel_type = relation_line.strip()

        pack.set_text(txt, replace_func=self.text_replace_operation)
        pack.pack_name = os.path.basename(file_path)

        yield pack
Example #29
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag}
                token = Token(pack, word_begin, word_end)

                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence if exists.
            sent = Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1
            pack.add_or_get_entry(sent)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        pack.set_text(text, replace_func=self.text_replace_operation)
        pack.meta.doc_id = file_path
        doc.close()

        yield pack
Example #30
    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)
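Most of the examples above run inside readers or processors, where the framework finalizes the entries. For reference, the same annotations can also be created and queried on a bare DataPack. The sketch below is illustrative only (the sample text and character offsets are hand-picked); it follows the `add_entry` pattern of Example #13 and the `sentence.get(Token)` pattern of Example #12.

from forte.data.data_pack import DataPack
from ft.onto.base_ontology import Sentence, Token

pack = DataPack()
text = "Forte stores annotations as character spans. Tokens nest inside sentences."
pack.set_text(text)

# Two sentences covering the text; offsets are hand-counted for this sample.
first = Sentence(pack, 0, 44)
second = Sentence(pack, 45, len(text))
pack.add_entry(first)
pack.add_entry(second)

# A couple of tokens inside the first sentence.
pack.add_entry(Token(pack, 0, 5))   # "Forte"
pack.add_entry(Token(pack, 6, 12))  # "stores"

for sentence in pack.get(Sentence):
    print(sentence.text)
    for token in sentence.get(Token):
        print("  token:", token.text)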