def __init__(self, category: str, midasi: str, sentence: BList, mid_range: range,
             mrph2dmid: Dict[Morpheme, int]):
    self.category: str = category
    self.midasi: str = midasi
    self.sid: str = sentence.sid
    self.mid_range: range = mid_range
    dmid_start = mrph2dmid[sentence.mrph_list()[mid_range[0]]]
    dmid_end = mrph2dmid[sentence.mrph_list()[mid_range[-1]]]
    self.dmid_range: range = range(dmid_start, dmid_end + 1)
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: BertTokenizer):
    """Split each document into multiple documents so that the tokenized length of each
    part is at most max_subword_length.
    A sentence that exceeds max_subword_length even on its own is output as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''

    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1
        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(mode='w') as fout:
                # write out sentences from start to end
                fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))
            idx += 1
            end += 1
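# Usage sketch for split_kc above (an addition, not part of the original source). It
# assumes the Hugging Face transformers BertTokenizer and .knp files under ./knp_in;
# the checkpoint name and the directory paths are illustrative only.
from pathlib import Path
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
output_dir = Path('./knp_out')
output_dir.mkdir(exist_ok=True)
split_kc(Path('./knp_in'), output_dir, max_subword_length=128, tokenizer=tokenizer)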
def parse(self, sentence):
    """Parse a raw sentence string and return a bunsetsu list object.

    Args:
        sentence (str): A string representing one sentence.

    Returns:
        BList: A bunsetsu list object.
    """
    assert isinstance(sentence, six.text_type)
    juman_lines = self.juman.juman_lines(sentence)
    juman_str = "%s%s" % (juman_lines, self.pattern)
    if not self.socket and not self.subprocess:
        if self.server is not None:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
        else:
            command = [self.command] + self.option
            if self.rcfile:
                command.extend(['-r', self.rcfile])
            self.subprocess = Subprocess(command)
    if self.socket:
        knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % self.pattern)
    else:
        knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % self.pattern)
    return BList(knp_lines, self.pattern)
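# Minimal usage sketch (an addition, assuming the public pyknp API) for the parse method
# above: KNP() spawns Juman and KNP, and parse() returns a BList. The sample sentence is
# illustrative.
from pyknp import KNP

knp = KNP()
result = knp.parse('望遠鏡で泳ぐ少女を見た。')
for bnst in result.bnst_list():
    print(bnst.bnst_id, ''.join(mrph.midasi for mrph in bnst.mrph_list()))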
def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Run syntactic analysis on a JUMAN output and return a bunsetsu list object.

    Args:
        juman_str (str): JUMAN output for one sentence.
        juman_format (JUMAN_FORMAT): Lattice output format of Juman.

    Returns:
        BList: A bunsetsu list object.
    """
    if not self.socket and not self.subprocess:
        if self.server is not None:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
        else:
            command = [self.command] + self.options
            if self.rcfile:
                command.extend(['-r', self.rcfile])
            self.subprocess = Subprocess(command)
    if self.socket:
        knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % self.pattern)
    else:
        knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % self.pattern)
    return BList(knp_lines, self.pattern, juman_format)
def load_knp_result(self, knp_lines: str) -> BList:
    if knp_lines.strip().endswith("EOS"):
        blist = BList(knp_lines.strip(), self.knp.pattern)
        self.__register_attributes(blist)
        return blist
    else:
        raise NoEOSError
def __knp_parse(self, juman_str: str) -> BList:
    if self.knp.socket:
        knp_lines = self.knp.socket.query(juman_str, pattern=r"^%s$" % self.knp.pattern)
    else:
        knp_lines = self.knp.subprocess.query(juman_str, pattern=r"^%s$" % self.knp.pattern)
    return BList(knp_lines, self.knp.pattern)
def result(self, input_str):
    """Convert a KNP analysis result for one sentence into a bunsetsu list object.

    Args:
        input_str (str): KNP output for one sentence.

    Returns:
        BList: A bunsetsu list object.
    """
    return BList(input_str, self.pattern)
def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Convert a KNP analysis result for one sentence into a bunsetsu list object.

    Args:
        input_str (str): KNP output for one sentence.
        juman_format (JUMAN_FORMAT): Lattice output format of Juman.

    Returns:
        BList: A bunsetsu list object.
    """
    return BList(input_str, self.pattern, juman_format)
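# Hedged sketch of how result() is typically used: re-load a previously saved KNP analysis
# of a single sentence into a BList without re-running the parser. The file name is an
# illustrative assumption.
from pyknp import KNP

knp = KNP()
with open('sample.knp') as f:
    blist = knp.result(f.read())
print(blist.sid, len(blist.tag_list()))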
def __call__(self, document: "Document", blist: BList) -> Sentence:
    sentence = Sentence(document, blist.sid, Builder.ssid, blist)
    start: Optional[Tag] = None
    end: Optional[Tag] = None
    head: Optional[Tag] = None
    for tag in blist.tag_list():
        if not start:
            start = tag
        if not head and "節-主辞" in tag.features:
            head = tag
        if not end and "節-区切" in tag.features:
            end = tag
            if head:
                EventBuilder()(sentence, start, head, end)
            start, end, head = None, None, None
    document.sentences.append(sentence)
    Builder.ssid += 1
    for bid, bnst in enumerate(blist.bnst_list()):
        for tag in bnst.tag_list():
            Builder.stid_bid_map[(sentence.ssid, tag.tag_id)] = bid
            Builder.stid_tag_map[(sentence.ssid, tag.tag_id)] = tag
    return sentence
def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
    """Run syntactic analysis on a JUMAN output and return a bunsetsu list object.

    Args:
        juman_str (str): JUMAN output for one sentence.
        juman_format (JUMAN_FORMAT): Lattice output format of Juman.

    Returns:
        BList: A bunsetsu list object.
    """
    knp_lines = self.analyzer.query(juman_str, pattern=r'^%s$' % self.pattern)
    return BList(knp_lines, self.pattern, juman_format)
def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
) -> None:
    """
    Args:
        knp_string (str): KNP format string of this sentence.
        dtid_offset (int): The document-wide tag ID of the previous base phrase.
        dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
        doc_id (str): The document ID of this sentence.
    """
    self.blist = BList(knp_string)
    self.doc_id: str = doc_id
    self.bps: List[BasePhrase] = []
    dtid = dtid_offset
    dmid = dmid_offset
    for tag in self.blist.tag_list():
        base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
        self.bps.append(base_phrase)
        dtid += 1
        dmid += len(base_phrase)
    self._mrph2dmid: Dict[Morpheme, int] = dict(ChainMap(*(bp.mrph2dmid for bp in self.bps)))

    for bp in self.bps:
        if bp.tag.parent_id >= 0:
            bp.parent = self.bps[bp.tag.parent_id]
        for child in bp.tag.children:
            bp.children.append(self.bps[child.tag_id])
def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
) -> None:
    """
    Args:
        knp_string (str): KNP tab-format output for this sentence.
        dtid_offset (int): The document-wide base phrase ID counted up to the start of this sentence.
        dmid_offset (int): The document-wide morpheme ID counted up to the start of this sentence.
        doc_id (str): The document ID of this sentence.
    """
    self.blist = BList(knp_string)
    self.doc_id: str = doc_id
    self.bps: List[BasePhrase] = []
    dtid = dtid_offset
    dmid = dmid_offset
    for tag in self.blist.tag_list():
        base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
        self.bps.append(base_phrase)
        dtid += 1
        dmid += len(base_phrase)
    self._mrph2dmid: Dict[Morpheme, int] = dict(ChainMap(*(bp.mrph2dmid for bp in self.bps)))

    for bp in self.bps:
        if bp.tag.parent_id >= 0:
            bp.parent = self.bps[bp.tag.parent_id]
        for child in bp.tag.children:
            bp.children.append(self.bps[child.tag_id])
def generate_event_pairs_and_svg_from_json(args):
    json_file = args.json_file
    target_sub_category = args.target_sub_category
    target_relations = args.target_relation
    svg_dir = args.svg_dir
    svg_detail_dir = args.svg_detail_dir
    evgviz = EventGraphVisualizer()

    all_event_pairs = []
    with lzma.open(json_file, mode='rt') as f:
        for line in f:
            json_obj = json.loads(line.strip())
            sub_category = json_obj['sub_category']
            fuman_id = json_obj['id']
            fuman_split_knp = json_obj['fuman_split_knp']
            if target_sub_category is None or sub_category == target_sub_category:
                evg = EventGraph.build([BList(''.join(knp_list)) for knp_list in fuman_split_knp])
                # extract and print event pairs
                event_pairs = extract_event_pairs(evg, fuman_id, target_relations)
                all_event_pairs.extend(event_pairs)
                # output an SVG if svg_dir is specified
                if event_pairs and fuman_id not in error_ids:
                    if svg_dir:
                        svg_filename = os.path.join(svg_dir, fuman_id + '.svg')
                        if not os.path.exists(svg_filename):
                            print("generating {}".format(svg_filename), file=sys.stderr)
                            try:
                                evgviz.make_image(evg, svg_filename,
                                                  with_detail=False, with_original_text=False)
                                if svg_detail_dir:
                                    svg_detail_filename = os.path.join(svg_detail_dir, fuman_id + '.svg')
                                    evgviz.make_image(evg, svg_detail_filename, with_original_text=False)
                            except subprocess.CalledProcessError as err:
                                print("subprocess.CalledProcessError: {}".format(err), file=sys.stderr)
    print(json.dumps(all_event_pairs, indent=2, ensure_ascii=False))
def __init__(
        self,
        knp_string: str,
        doc_id: str,
        cases: List[str],
        corefs: List[str],
        relax_cases: bool,
        extract_nes: bool,
        use_pas_tag: bool,
) -> None:
    self.knp_string: str = knp_string
    self.doc_id: str = doc_id
    self.cases: List[str] = cases
    self.corefs: List[str] = corefs
    self.relax_cases: bool = relax_cases
    self.extract_nes: bool = extract_nes
    self.use_pas_tag: bool = use_pas_tag

    self.sid2sentence: Dict[str, BList] = OrderedDict()
    buff = []
    for line in knp_string.strip().split('\n'):
        buff.append(line)
        if line.strip() == 'EOS':
            sentence = BList('\n'.join(buff) + '\n')
            if sentence.sid in self.sid2sentence:
                logger.warning(f'{sentence.sid:24}duplicated sid found')
            self.sid2sentence[sentence.sid] = sentence
            buff = []

    self.bnst2dbid = {}
    self.tag2dtid = {}
    self.mrph2dmid = {}
    self._assign_document_wide_id()

    self._pas: Dict[int, Pas] = OrderedDict()
    self.mentions: Dict[int, Mention] = OrderedDict()
    self.entities: Dict[int, Entity] = OrderedDict()
    if use_pas_tag:
        self._analyze_pas()
    else:
        self._analyze_rel()

    if extract_nes:
        self.named_entities: List[NamedEntity] = []
        self._extract_nes()
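# Hedged construction sketch for the constructor above, assuming the enclosing class is
# named Document. The case/coreference label sets, the file name, and the document ID are
# illustrative assumptions, not values mandated by the original source.
with open('w201106-0000060050.knp') as f:
    document = Document(
        f.read(),
        doc_id='w201106-0000060050',
        cases=['ガ', 'ヲ', 'ニ', 'ガ２'],
        corefs=['=', '=構', '=≒'],
        relax_cases=False,
        extract_nes=True,
        use_pas_tag=False,
    )
print(f'{len(document.sid2sentence)} sentences loaded')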
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--knp-file', required=True, type=str,
                        help='path to knp file')
    parser.add_argument('--tsv-file', required=True, type=str,
                        help='path to tsv file')
    parser.add_argument('--output-dir', required=True, type=str,
                        help='path to directory where split knp files are exported')
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)

    sent2knp = {}
    with open(args.knp_file, mode='rt', errors='ignore') as f:
        buff = ''
        for line in tqdm(f.readlines(), desc='1/3'):
            buff += line
            if line.strip() == 'EOS':
                sent = ''.join(bnst.midasi for bnst in BList(buff).bnst_list())
                sent2knp[sent] = buff
                buff = ''

    did2knp = defaultdict(str)
    ignored_dids = set()
    with open(args.tsv_file, mode='rt', errors='ignore') as f:
        for line in tqdm(f.readlines(), desc='2/3'):
            line = line.strip()
            sid, _, sent = line.split('\t')
            did = '-'.join(sid.split('-')[:-1])
            if sent not in sent2knp:
                ignored_dids.add(did)
                continue
            knp_string = sent2knp[sent]
            assert knp_string.startswith('# ')
            knp_string = knp_string[:2] + f'S-ID:{sid} ' + knp_string[2:]
            did2knp[did] += knp_string

    for did, knp_result in tqdm(did2knp.items(), desc='3/3'):
        if did in ignored_dids:
            continue
        with output_dir.joinpath(f'{did}.knp').open(mode='wt') as f:
            f.write(knp_result)
def parse(self, sentence):
    """Parse the string sentence and return a parse result object."""
    assert isinstance(sentence, six.text_type)
    juman_lines = self.juman.juman_lines(sentence)
    juman_str = "%s%s" % (juman_lines, self.pattern)
    if not self.socket and not self.subprocess:
        if self.server is not None:
            self.socket = Socket(self.server, self.port, "RUN -tab -normal\n")
        else:
            command = "%s %s" % (self.command, self.option)
            if self.rcfile:
                command += " -r %s" % self.rcfile
            self.subprocess = Subprocess(command)
    if self.socket:
        knp_lines = self.socket.query(juman_str, pattern=self.pattern)
    else:
        knp_lines = self.subprocess.query(juman_str, pattern=self.pattern)
    return BList(knp_lines, self.pattern)
def run():
    parser = create_arg_parser()
    args = parser.parse_args()

    docs = []
    # read each document file in the dataset in order
    for doc_file in sorted(glob.glob(f"{args.di_repo}/*/*", recursive=True)):
        results = []
        buf = ""
        with open(doc_file) as f:
            # read the sentences in the document and their named entity labels
            buf = ""
            for line in f:
                buf += line
                if "EOS" in line:
                    result = BList(buf)
                    add_ne_tag_to_mrphs(result)
                    results.append(result)
                    buf = ""
        docs.append(results)

    # shuffle the dataset randomly
    random.shuffle(docs)

    # split the dataset 8:1:1
    num_train = int(0.8 * len(docs))
    num_test = int(0.1 * len(docs))
    train_docs = docs[:num_train]
    validation_docs = docs[num_train:-num_test]
    test_docs = docs[-num_test:]

    # write the dataset to files
    os.makedirs(args.dest, exist_ok=True)
    write_file(f"{args.dest}/kwdlc_ner_train.txt", train_docs)
    write_file(f"{args.dest}/kwdlc_ner_validation.txt", validation_docs)
    write_file(f"{args.dest}/kwdlc_ner_test.txt", test_docs)
class Sentence:
    """A class to represent a single sentence in KWDLC (or the Kyoto Corpus).

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """
    def __init__(
            self,
            knp_string: str,
            dtid_offset: int,
            dmid_offset: int,
            doc_id: str,
    ) -> None:
        """
        Args:
            knp_string (str): KNP tab-format output for this sentence.
            dtid_offset (int): The document-wide base phrase ID counted up to the start of this sentence.
            dmid_offset (int): The document-wide morpheme ID counted up to the start of this sentence.
            doc_id (str): The document ID of this sentence.
        """
        self.blist = BList(knp_string)
        self.doc_id: str = doc_id
        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)
        self._mrph2dmid: Dict[Morpheme, int] = dict(ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """Sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from each morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """Surface string."""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        return self.blist.bnst_list()

    def tag_list(self):
        return self.blist.tag_list()

    def mrph_list(self):
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence."""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
def result(self, input_str):
    return BList(input_str, self.pattern)
def result(self, input_str):
    return BList(input_str, self.EOS)
def knp(self, sentence):
    assert isinstance(sentence, str)
    result = BList(self.parse_sentence(sentence))
    return result
class Sentence:
    """A class to represent a single sentence.

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """
    def __init__(
            self,
            knp_string: str,
            dtid_offset: int,
            dmid_offset: int,
            doc_id: str,
    ) -> None:
        """
        Args:
            knp_string (str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id (str): The document ID of this sentence.
        """
        self.blist = BList(knp_string)
        self.doc_id: str = doc_id
        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)
        self._mrph2dmid: Dict[Morpheme, int] = dict(ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """A sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        """A document-wide tag ID."""
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """A surface expression."""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        """Return list of Bunsetsu object in pyknp."""
        return self.blist.bnst_list()

    def tag_list(self):
        """Return list of Tag object in pyknp."""
        return self.blist.tag_list()

    def mrph_list(self):
        """Return list of Morpheme object in pyknp."""
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence."""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
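# Hedged usage sketch for the Sentence class above: wrap the KNP output of one sentence,
# starting the document-wide IDs at zero. The file name and doc_id are illustrative.
with open('example.knp') as f:
    sentence = Sentence(f.read(), dtid_offset=0, dmid_offset=0, doc_id='example')
print(sentence.sid, sentence.surf)
for bp in sentence:
    print(bp.dtid, bp.surf)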
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: TokenizeHandlerMeta):
    """Split each document into multiple documents so that the tokenized length of each
    part is at most max_subword_length.
    A sentence that exceeds max_subword_length even on its own is output as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}
    max_all_tokens_len = 0

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    all_tokens, *_ = tokenizer.get_tokenized_tokens(list(m.midasi for m in blist.mrph_list()))
                    max_all_tokens_len = max(max_all_tokens_len, len(all_tokens))
                    did2cumlens[did].append(
                        did2cumlens[did][-1] + len(all_tokens)
                        # did2cumlens[did][-1] + len(tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list())))
                    )
                    sid2knp[blist.sid] = buff
                    buff = ''

    print(f"max_tokens_length per sentence -> {max_all_tokens_len}")
    # assert max_all_tokens_len <= max_subword_length
    # if max_all_tokens_len > max_subword_length:
    #     raise ValueError(f"max_tokens_length exceeded max_subword_length\n{max_all_tokens_len}>{max_subword_length}")

    document_divide_unit_list = []
    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1
        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            document_divide_unit_list.append(DocumentDivideUnit(did, idx, start, end))
            # with output_dir.joinpath(f'{did}-{idx:02}.knp').open('wt') as fout:
            #     fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))  # write out sids[start:end]
            idx += 1
            end += 1

    _write_partial_document = partial(
        write_partial_document,
        did2sids=did2sids,
        sid2knp=sid2knp,
        output_dir=output_dir,
    )
    with Pool() as pool:
        list(pool.imap(_write_partial_document, document_divide_unit_list))