def __init__(self,
             category: str,
             midasi: str,
             sentence: BList,
             mid_range: range,
             mrph2dmid: Dict[Morpheme, int]
             ) -> None:
    self.category: str = category
    self.midasi: str = midasi
    self.sid: str = sentence.sid
    self.mid_range: range = mid_range
    # convert the sentence-level morpheme ID range into a document-level one
    dmid_start = mrph2dmid[sentence.mrph_list()[mid_range[0]]]
    dmid_end = mrph2dmid[sentence.mrph_list()[mid_range[-1]]]
    self.dmid_range: range = range(dmid_start, dmid_end + 1)
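# --- Illustration (not part of the original source) ---
# A minimal sketch of the mid_range -> dmid_range conversion performed above,
# assuming the document's morphemes are numbered contiguously so that
# mrph2dmid[m] == dmid_offset + sentence-level morpheme ID.
def _to_dmid_range(mid_range: range, dmid_offset: int) -> range:
    # shift both ends of the sentence-level range by the document offset
    return range(dmid_offset + mid_range[0], dmid_offset + mid_range[-1] + 1)

# e.g. morphemes 1..2 of a sentence that starts at document morpheme ID 7
assert _to_dmid_range(range(1, 3), dmid_offset=7) == range(8, 10)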
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: BertTokenizer):
    """
    Split each document into multiple documents so that the tokenized length of
    each split is at most max_subword_length.
    A long sentence that exceeds max_subword_length even on its own is written
    out as it is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(
                            m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''

    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1
        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(mode='w') as fout:
                fout.write(''.join(
                    sid2knp[sid] for sid in sids[start:end]))  # write out sentences from start to end
            idx += 1
            end += 1
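# --- Usage sketch (hypothetical; the directory paths and the pretrained model
# name below are placeholders, not taken from the source) ---
# split_kc expects a directory of *.knp files and a BertTokenizer; an
# invocation along these lines keeps every split within the encoder limit.
from pathlib import Path
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
split_kc(input_dir=Path('data/knp'), output_dir=Path('data/knp_split'),
         max_subword_length=128, tokenizer=tokenizer)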
class Sentence:
    """A class to handle a single sentence in KWDLC (or the Kyoto Corpus).

    Attributes:
        blist (BList): BList object of KNP.
        doc_id (str): Document ID.
        bps (List[BasePhrase]): List of base phrases contained in this sentence.
    """

    def __init__(self,
                 knp_string: str,
                 dtid_offset: int,
                 dmid_offset: int,
                 doc_id: str,
                 ) -> None:
        """
        Args:
            knp_string (str): KNP tab-format output for one sentence.
            dtid_offset (int): Document-level base phrase ID counted up to where this sentence starts.
            dmid_offset (int): Document-level morpheme ID counted up to where this sentence starts.
            doc_id (str): Document ID.
        """
        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)
        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """Sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A dictionary mapping each morpheme to its document-level ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """Surface string."""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        return self.blist.bnst_list()

    def tag_list(self):
        return self.blist.tag_list()

    def mrph_list(self):
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """The number of base phrases contained in this sentence."""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
class Sentence:
    """A class to represent a single sentence.

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """

    def __init__(self,
                 knp_string: str,
                 dtid_offset: int,
                 dmid_offset: int,
                 doc_id: str,
                 ) -> None:
        """
        Args:
            knp_string (str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id (str): The document ID of this sentence.
        """
        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)
        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """A sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        """A list of document-wide tag IDs."""
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """A surface expression."""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        """Return a list of Bunsetsu objects in pyknp."""
        return self.blist.bnst_list()

    def tag_list(self):
        """Return a list of Tag objects in pyknp."""
        return self.blist.tag_list()

    def mrph_list(self):
        """Return a list of Morpheme objects in pyknp."""
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence."""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
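# --- Usage sketch (hypothetical; `knp_output` is assumed to hold the KNP
# tab-format analysis of one sentence, and `doc-001` is a placeholder ID) ---
sentence = Sentence(knp_output, dtid_offset=0, dmid_offset=0, doc_id='doc-001')
print(len(sentence), sentence.surf)   # number of base phrases and surface string
for bp in sentence:                   # iterate over BasePhrase objects
    print(bp.dtid, bp.surf)           # document-wide tag ID and surface form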
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: TokenizeHandlerMeta):
    """
    Split each document into multiple documents so that the tokenized length of
    each split is at most max_subword_length.
    A long sentence that exceeds max_subword_length even on its own is written
    out as it is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}
    max_all_tokens_len = 0

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    all_tokens, *_ = tokenizer.get_tokenized_tokens(
                        list(m.midasi for m in blist.mrph_list()))
                    max_all_tokens_len = max(max_all_tokens_len, len(all_tokens))
                    did2cumlens[did].append(
                        did2cumlens[did][-1] + len(all_tokens)
                        # did2cumlens[did][-1] + len(tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list())))
                    )
                    sid2knp[blist.sid] = buff
                    buff = ''

    print(f"max_tokens_length per sentence -> {max_all_tokens_len}")
    # assert max_all_tokens_len <= max_subword_length
    # if max_all_tokens_len > max_subword_length:
    #     raise ValueError(f"max_tokens_length exceeded max_subword_length\n{max_all_tokens_len}>{max_subword_length}")

    document_divide_unit_list = []
    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1
        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            document_divide_unit_list.append(
                DocumentDivideUnit(did, idx, start, end)
            )
            # with output_dir.joinpath(f'{did}-{idx:02}.knp').open('wt') as fout:
            #     fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))  # write out sentences from start to end
            idx += 1
            end += 1

    _write_partial_document = partial(
        write_partial_document,
        did2sids=did2sids,
        sid2knp=sid2knp,
        output_dir=output_dir
    )
    with Pool() as pool:
        list(pool.imap(_write_partial_document, document_divide_unit_list))
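# --- Sketch of the helpers referenced above (DocumentDivideUnit and
# write_partial_document are not defined in this excerpt; the definitions
# below are assumptions modelled on the commented-out write logic) ---
from multiprocessing import Pool
from pathlib import Path
from typing import Dict, List, NamedTuple


class DocumentDivideUnit(NamedTuple):
    did: str    # document ID
    idx: int    # index of this split within the document
    start: int  # first sentence index of the split (inclusive)
    end: int    # last sentence index of the split (exclusive)


def write_partial_document(unit: DocumentDivideUnit,
                           did2sids: Dict[str, List[str]],
                           sid2knp: Dict[str, str],
                           output_dir: Path) -> None:
    # write the KNP strings of sentences unit.start..unit.end of document
    # unit.did into <output_dir>/<did>-<idx>.knp
    sids = did2sids[unit.did]
    with output_dir.joinpath(f'{unit.did}-{unit.idx:02}.knp').open(mode='w') as fout:
        fout.write(''.join(sid2knp[sid] for sid in sids[unit.start:unit.end]))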