Example No. 1
def __init__(self,
             category: str,
             midasi: str,
             sentence: BList,
             mid_range: range,
             mrph2dmid: Dict[Morpheme, int]):
    self.category: str = category
    self.midasi: str = midasi
    self.sid: str = sentence.sid
    self.mid_range: range = mid_range
    # Map the sentence-level morpheme ID range onto document-level morpheme
    # IDs via the mrph2dmid lookup (both endpoints inclusive).
    dmid_start = mrph2dmid[sentence.mrph_list()[mid_range[0]]]
    dmid_end = mrph2dmid[sentence.mrph_list()[mid_range[-1]]]
    self.dmid_range: range = range(dmid_start, dmid_end + 1)
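This constructor belongs to a class whose name is not shown in the snippet. A minimal usage sketch, assuming a hypothetical enclosing class named Mention, an illustrative file path, and a single-sentence mrph2dmid mapping built on the spot:

# Usage sketch. `Mention` (the enclosing class name), the file path, and the
# argument values are assumptions for illustration; BList comes from pyknp.
from pyknp import BList

with open('sentence.knp') as f:           # KNP tab-format output for one sentence
    sentence = BList(f.read())
mrph2dmid = {m: i for i, m in enumerate(sentence.mrph_list())}  # document = this one sentence
mention = Mention('NE', '京都', sentence, range(0, 1), mrph2dmid)
print(mention.sid, list(mention.dmid_range))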
Example No. 2
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

from pyknp import BList
from transformers import BertTokenizer  # an assumption; the project may use a different BERT tokenizer package


def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int,
             tokenizer: BertTokenizer):
    """
    Split each document into several documents so that each piece is at most
    max_subword_length subwords long after tokenization.
    A sentence that exceeds max_subword_length even as a single-sentence
    document is written out as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(
                            m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''

    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end: extend the first window while it still fits in max_subword_length
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1

        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start: shrink the window from the left until it fits
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(
                    mode='w') as fout:
                fout.write(''.join(
                    sid2knp[sid]
                    for sid in sids[start:end]))  # write out sentences start through end
            idx += 1
            end += 1
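The window search above operates on cumulative subword counts: cum[i] holds the total subword length of the first i sentences, so cum[end] - cum[start] is the length of the chunk sids[start:end]. A small worked example with made-up lengths traces how the two indices move:

# Worked example with made-up sentence lengths (not corpus data).
# Four sentences of 30, 40, 50, and 20 subwords give cum == [0, 30, 70, 120, 140];
# let max_subword_length == 100.
cum = [0, 30, 70, 120, 140]
max_subword_length = 100
n_sents = 4

end = 1
while end < n_sents and cum[end + 1] - cum[0] <= max_subword_length:
    end += 1
# end == 2: the first chunk holds sentences 0-1 (70 subwords); one more would reach 120.

idx = 0
while end < n_sents + 1:
    start = 0
    while cum[end] - cum[start] > max_subword_length:
        start += 1
        if start == end - 1:
            break
    print(idx, start, end, cum[end] - cum[start])
    idx += 1
    end += 1
# Prints 0 0 2 70, then 1 1 3 90, then 2 2 4 70:
# overlapping windows, each at most 100 subwords long.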
Example No. 3
import logging
from collections import ChainMap
from typing import Dict, Iterator, List, Optional

from pyknp import BList, Morpheme

# BasePhrase is a project-local class (not part of pyknp) and is assumed to
# be importable here.

logger = logging.getLogger(__name__)


class Sentence:
    """A class that handles a single sentence from KWDLC (or the Kyoto Corpus).

    Attributes:
        blist (BList): KNP BList object.
        doc_id (str): Document ID.
        bps (List[BasePhrase]): List of base phrases contained in this sentence.
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string (str): KNP tab-format output for this sentence.
            dtid_offset (int): Number of document-level base phrase IDs assigned before this sentence.
            dmid_offset (int): Number of document-level morpheme IDs assigned before this sentence.
            doc_id (str): Document ID.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """文ID"""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """形態素とその文書レベルIDを紐付ける辞書"""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """表層表現"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        return self.blist.bnst_list()

    def tag_list(self):
        return self.blist.tag_list()

    def mrph_list(self):
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """含まれる基本句の数"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
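A minimal usage sketch for this class, assuming the project-local BasePhrase is available; the file path and document ID are hypothetical KWDLC-style values:

# Usage sketch; the path and doc_id are illustrative only.
with open('w201106-0000060050-1.knp') as f:
    knp_string = f.read()                  # KNP tab output for one sentence

sent = Sentence(knp_string, dtid_offset=0, dmid_offset=0, doc_id='w201106-0000060050')
print(sent.sid, len(sent), sent.surf)      # sentence ID, base phrase count, surface string
for bp in sent:                            # iterate over BasePhrase objects
    print(bp.dtid, bp.surf)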
Example No. 4
import logging
from collections import ChainMap
from typing import Dict, Iterator, List, Optional

from pyknp import BList, Morpheme

# BasePhrase is a project-local class (not part of pyknp) and is assumed to
# be importable here.

logger = logging.getLogger(__name__)


class Sentence:
    """A class to represent a single sentence.

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """A sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        """A document-wide tag ID."""
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """A surface expression"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        """Return list of Bunsetsu object in pyknp."""
        return self.blist.bnst_list()

    def tag_list(self):
        """Return list of Tag object in pyknp."""
        return self.blist.tag_list()

    def mrph_list(self):
        """Return list of Morpheme object in pyknp."""
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
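Beyond iteration, the container protocol is worth noting: an out-of-range subscript does not raise, it logs an error and returns None, so callers must check the result:

# Illustrative only; `sent` is a Sentence built as in the previous example.
bp = sent[len(sent)]           # out of range: logs an error instead of raising
assert bp is None
assert list(sent) == sent.bps  # __iter__ delegates to the base phrase list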
Example No. 5
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from pathlib import Path
from typing import Dict, List

from pyknp import BList

# TokenizeHandlerMeta, DocumentDivideUnit, and write_partial_document are
# project-local definitions assumed to be importable here.


def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: TokenizeHandlerMeta):
    """
    Split each document into several documents so that each piece is at most
    max_subword_length subwords long after tokenization.
    A sentence that exceeds max_subword_length even as a single-sentence
    document is written out as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    max_all_tokens_len = 0

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    all_tokens, *_ = tokenizer.get_tokenized_tokens(list(m.midasi for m in blist.mrph_list()))
                    max_all_tokens_len = max(max_all_tokens_len, len(all_tokens))
                    did2cumlens[did].append(
                        did2cumlens[did][-1] + len(all_tokens)
                        # did2cumlens[did][-1] + len(tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list())))
                    )
                    sid2knp[blist.sid] = buff
                    buff = ''

    print(f"max_tokens_length per sentence -> {max_all_tokens_len}")
    # assert max_all_tokens_len <= max_subword_length
    # if max_all_tokens_len > max_subword_length:
    #     raise ValueError(f"max_tokens_length exceeded max_subword_length\n{max_all_tokens_len}>{max_subword_length}")
    document_divide_unit_list = []
    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end: extend the first window while it still fits in max_subword_length
        while end < len(sids) and cum[end+1] - cum[0] <= max_subword_length:
            end += 1

        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start: shrink the window from the left until it fits
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            document_divide_unit_list.append(
                DocumentDivideUnit(did, idx, start, end)
            )
            # with output_dir.joinpath(f'{did}-{idx:02}.knp').open('wt') as fout:
            #     fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))  # write out sentences start through end
            idx += 1
            end += 1

    _write_partial_document = partial(
        write_partial_document,
        did2sids=did2sids,
        sid2knp=sid2knp,
        output_dir=output_dir
    )
    with Pool() as pool:
        list(pool.imap(_write_partial_document, document_divide_unit_list))
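write_partial_document is project-local and not shown. A plausible sketch, inferred only from the call site and from the commented-out serial code above (the real implementation may differ):

# Hypothetical reconstruction of the project-local helpers, matching the
# call site above; names and field order are assumptions.
from pathlib import Path
from typing import Dict, List, NamedTuple


class DocumentDivideUnit(NamedTuple):
    did: str
    idx: int
    start: int
    end: int


def write_partial_document(unit: DocumentDivideUnit,
                           did2sids: Dict[str, List[str]],
                           sid2knp: Dict[str, str],
                           output_dir: Path) -> None:
    sids = did2sids[unit.did]
    with output_dir.joinpath(f'{unit.did}-{unit.idx:02}.knp').open(mode='w') as fout:
        # write out sentences start through end, as in the serial version
        fout.write(''.join(sid2knp[sid] for sid in sids[unit.start:unit.end]))

Binding the shared dictionaries with functools.partial keeps each Pool task down to a single picklable argument; the chunk boundaries are still computed serially, and only the file writing is parallelized.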