示例#1
0
文件: common.py 项目: JiKook31/thesis
    def prettify(self, tokens: List[str], tags: List[str],
                 lemmas: List[str]) -> Union[List[str], str]:
        """Render tagger and lemmatizer output as one formatted row per token.

        Every tag is split with ``make_pos_and_tag(tag, sep=",")`` and the
        row is produced by ``self.format_string.format(index, word, pos,
        features, lemma)`` where ``index`` is 1-based.  The three input
        lists are walked in lockstep with ``zip``, so iteration stops at
        the shortest of them.

        Args:
            tokens: tokenized source sentence
            tags: list of tags, the output of a tagger
            lemmas: list of lemmas, the output of a lemmatizer

        Returns:
            The list of formatted rows when ``self.return_string`` is falsy;
            otherwise a single string built as
            ``self.begin + self.sep.join(rows) + self.end``.

        Examples:
            Illustrative only -- the exact column layout is decided by
            ``self.format_string``, which is configured outside this method.

            >>> sent = "John really likes pizza .".split()
            >>> tags = ["PROPN,Number=Sing", "ADV",
            ...         "VERB,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
            ...         "NOUN,Number=Sing", "PUNCT"]
            >>> lemmas = "John really like pizza .".split()
            >>> prettifier = LemmatizedOutputPrettifier()
            >>> prettifier.prettify(sent, tags, lemmas)
                1	John	John	PROPN	_	Number=Sing	_	_	_	_
                2	really	really	ADV	_	_	_	_	_	_
                3	likes	like	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	_	_	_	_
                4	pizza	pizza	NOUN	_	Number=Sing	_	_	_	_
                5	.	.	PUNCT	_	_	_	_	_	_
        """
        rows = []
        aligned = zip(tokens, tags, lemmas)
        for position, (token, raw_tag, lemma) in enumerate(aligned, start=1):
            pos, features = make_pos_and_tag(raw_tag, sep=",")
            row = self.format_string.format(position, token, pos, features,
                                            lemma)
            rows.append(row)
        if not self.return_string:
            return rows
        return self.begin + self.sep.join(rows) + self.end
示例#2
0
def prettify(sent, tags, return_string=True, begin="", end="", sep="\n"):
    """Format tagger output as one tab-separated row per word.

    Each row is ``index<TAB>word<TAB>a<TAB>b`` where ``index`` is 1-based
    and ``a``/``b`` are the values unpacked from ``make_pos_and_tag(tag)``
    (presumably the part of speech and the remaining features -- the helper
    is defined outside this block).

    Args:
        sent: the source sentence, either a string or an already tokenized
            sequence of words.  A string is split into runs of word
            characters and the single characters "," and "."; pieces
            consisting only of whitespace are dropped.
        tags: a sequence of tags, one per word.  Words and tags are paired
            with ``zip``, so pairing stops at the shorter sequence.
        return_string: if true, return a single string; otherwise return
            the list of rows.
        begin: text put before the joined rows (string output only).
        end: text put after the joined rows (string output only).
        sep: separator placed between rows (string output only).

    Returns:
        ``begin + sep.join(rows) + end`` when ``return_string`` is true,
        else the list of row strings.

    Example of the tokenization step:
        "John likes, really likes pizza" is split into
        ["John", "likes", ",", "really", "likes", "pizza"],
        which pairs with tags such as
        ["NNP", "VBZ", "PUNCT", "RB", "VBZ", "NN"].
    """
    if isinstance(sent, str):
        # Raw string literal: "\w" inside a plain literal is an invalid
        # escape sequence and triggers a warning on modern Python versions.
        pieces = re.split(r"(\w+|[,.])", sent)
        words = [piece for piece in pieces if piece.strip()]
    else:
        words = sent
    answer = [
        "{}\t{}\t{}\t{}".format(index, word, *make_pos_and_tag(tag))
        for index, (word, tag) in enumerate(zip(words, tags), start=1)
    ]
    if return_string:
        return begin + sep.join(answer) + end
    return answer
示例#3
0
def prettify(sent: Union[str, List[str]],
             tags: List[str],
             return_string: bool = True,
             begin: str = "",
             end: str = "",
             sep: str = "\n") -> Union[List[str], str]:
    """Prettifies output of morphological tagger.

    Every word is rendered as ``index<TAB>word<TAB>a<TAB>b`` with a 1-based
    index; ``a`` and ``b`` are the values unpacked from
    ``make_pos_and_tag(tag)`` (presumably part of speech and features --
    the helper is defined outside this block).

    Args:
        sent: source sentence (either tokenized or not).  An untokenized
            string is split into runs of word characters and the single
            characters "," and "."; whitespace-only pieces are discarded.
        tags: list of tags, the output of a tagger.  Words and tags are
            paired with ``zip``, so pairing stops at the shorter sequence.
        return_string: whether to return a single string (true) or a list
            of strings (false)
        begin: a string to prepend to the joined output
        end: a string to append to the joined output
        sep: separator between word analyses

    Returns:
        the prettified output of the tagger: ``begin + sep.join(rows) + end``
        if ``return_string`` is true, otherwise the list of rows.

    Example of the tokenization step:
        "John likes, really likes pizza" is split into
        ["John", "likes", ",", "really", "likes", "pizza"],
        matching a tag list such as
        ["NNP", "VBZ", "PUNCT", "RB", "VBZ", "NN"].
    """
    if isinstance(sent, str):
        # Raw string literal avoids the invalid "\w" escape sequence that a
        # plain string literal would contain.
        words = [chunk
                 for chunk in re.split(r"(\w+|[,.])", sent)
                 if chunk.strip()]
    else:
        words = sent
    answer = []
    for number, (word, tag) in enumerate(zip(words, tags), start=1):
        answer.append("{}\t{}\t{}\t{}".format(number, word,
                                              *make_pos_and_tag(tag)))
    if not return_string:
        return answer
    return begin + sep.join(answer) + end
示例#4
0
 def _make_tag_trie(self):
     self._nodes = [defaultdict(dict)]
     self._start_nodes_for_pos = dict()
     self._data = [None]
     for tag, code in self._t2i.items():
         pos, tag = make_pos_and_tag(tag, sep=",", return_mode="sorted_items")
         start = self._start_nodes_for_pos.get(pos)
         if start is None:
             start = self._start_nodes_for_pos[pos] = len(self._nodes)
             self._nodes.append(defaultdict(dict))
             self._data.append(None)
         for key, value in tag:
             values_dict = self._nodes[start][key]
             child = values_dict.get(value)
             if child is None:
                 child = values_dict[value] = len(self._nodes)
                 self._nodes.append(defaultdict(dict))
                 self._data.append(None)
             start = child
         self._data[start] = code
     return self
示例#5
0
    def prettify(self, tokens: List[str],
                 tags: List[str]) -> Union[List[str], str]:
        """Render tagger output as one formatted row per token.

        A row is ``self.format_string.format(index, word, *parts)`` where
        ``index`` is 1-based and ``parts`` is whatever
        ``make_pos_and_tag(tag)`` returns.  Tokens and tags are paired with
        ``zip``, so pairing stops at the shorter list.

        Args:
            tokens: tokenized source sentence
            tags: list of tags, the output of a tagger

        Returns:
            The list of rows when ``self.return_string`` is falsy, otherwise
            ``self.begin + self.sep.join(rows) + self.end``.

        Examples:
            Illustrative only -- the column layout comes from
            ``self.format_string``, which is set up outside this method.

            >>> sent = "John really likes pizza .".split()
            >>> tags = ["PROPN,Number=Sing", "ADV",
            ...         "VERB,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
            ...         "NOUN,Number=Sing", "PUNCT"]
            >>> prettifier = TagOutputPrettifier(mode='basic')
            >>> prettifier.prettify(sent, tags)
                1	John	PROPN	Number=Sing
                2	really	ADV	_
                3	likes	VERB	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
                4	pizza	NOUN	Number=Sing
                5	.	PUNCT	_
            >>> prettifier = TagOutputPrettifier(mode='ud')
            >>> prettifier.prettify(sent, tags)
                1	John	_	PROPN	_	Number=Sing	_	_	_	_
                2	really	_	ADV	_	_	_	_	_	_
                3	likes	_	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	_	_	_	_
                4	pizza	_	NOUN	_	Number=Sing	_	_	_	_
                5	.	_	PUNCT	_	_	_	_	_	_
        """
        rows = [
            self.format_string.format(position, token,
                                      *make_pos_and_tag(raw_tag))
            for position, (token, raw_tag)
            in enumerate(zip(tokens, tags), start=1)
        ]
        if not self.return_string:
            return rows
        return self.begin + self.sep.join(rows) + self.end
示例#6
0
    def prettify(self, tokens: List[str], tags: List[str]) -> Union[List[str], str]:
        """Turn parallel token and tag lists into formatted analysis rows.

        For the n-th (1-based) token the row is
        ``self.format_string.format(n, word, *make_pos_and_tag(tag))``.
        ``zip`` pairs tokens with tags, so surplus items in the longer list
        are ignored.

        Args:
            tokens: tokenized source sentence
            tags: list of tags, the output of a tagger

        Returns:
            ``self.begin + self.sep.join(rows) + self.end`` if
            ``self.return_string`` is truthy, else the list of rows.

        Examples:
            Illustrative only -- the column layout is governed by
            ``self.format_string``, which is defined outside this method.

            >>> sent = "John really likes pizza .".split()
            >>> tags = ["PROPN,Number=Sing", "ADV",
            ...         "VERB,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
            ...         "NOUN,Number=Sing", "PUNCT"]
            >>> prettifier = TagOutputPrettifier(mode='basic')
            >>> prettifier.prettify(sent, tags)
                1	John	PROPN	Number=Sing
                2	really	ADV	_
                3	likes	VERB	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
                4	pizza	NOUN	Number=Sing
                5	.	PUNCT	_
            >>> prettifier = TagOutputPrettifier(mode='ud')
            >>> prettifier.prettify(sent, tags)
                1	John	_	PROPN	_	Number=Sing	_	_	_	_
                2	really	_	ADV	_	_	_	_	_	_
                3	likes	_	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	_	_	_	_
                4	pizza	_	NOUN	_	Number=Sing	_	_	_	_
                5	.	_	PUNCT	_	_	_	_	_	_
        """
        numbered_pairs = enumerate(zip(tokens, tags), start=1)
        lines = list(
            self.format_string.format(number, word, *make_pos_and_tag(tag))
            for number, (word, tag) in numbered_pairs
        )
        return (self.begin + self.sep.join(lines) + self.end
                if self.return_string else lines)