def load_file(self, filepath):
    """Read a CoNLL file and yield one column-oriented sample per sentence.

    Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
    :class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.

    Args:
        filepath: ``.conllx`` or ``.conllu`` file path.

    Yields:
        A dict that maps each of the ten column names to the list of that column's values
        over all rows of one sentence. A sentence is skipped when ``self._prune`` is set
        and returns a truthy value for its sample.
    """
    # Only the 4th, 5th, 9th and 10th column names differ between the two formats.
    # The CoNLL-U names follow https://universaldependencies.org/format.html
    columns = (
        ('ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC')
        if filepath.endswith('.conllu') else
        ('ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL')
    )
    reader = TimingFileIterator(filepath)
    for sent_no, rows in enumerate(read_conll(reader), start=1):
        # Transpose the sentence: one list per column instead of one list per row.
        sample = {name: [row[col] for row in rows] for col, name in enumerate(columns)}
        pruned = self._prune and self._prune(sample)
        if not pruned:
            yield sample
        # Progress is reported for every sentence read, including pruned ones.
        reader.log(f'{sent_no} samples [blink][yellow]...[/yellow][/blink]')
def file_to_inputs(self, filepath: str, gold=True):
    """Turn a gold CoNLL file into per-sentence token lists carrying multiple heads.

    Args:
        filepath: Path of the file to read. A ``.conllu`` suffix selects the
            enhanced-dependency handling; any other file is read as repeated-row format.
        gold: Must be truthy; only gold files are supported.

    Yields:
        One list per sentence. Each token is ``[form, cpos, heads, rels]`` when
        ``self.use_pos`` is truthy, otherwise ``[form, heads, rels]``.

        * ``.conllu``: ``heads``/``rels`` come from the ``head:rel`` entries of the 9th
          column (split on ``|``); entries whose head is not all digits are dropped.
          If the ``enhanced_only`` config value is truthy, the basic head (7th column)
          is removed from that list when present; otherwise the basic head and relation
          are appended when the head is missing from it.
        * other files: consecutive rows that share the same ID are merged into a single
          token whose ``heads``/``rels`` collect the 7th/8th column of each row.
    """
    assert gold, 'only support gold file for now'
    with_pos = self.use_pos
    is_conllu = filepath.endswith('.conllu')
    enhanced_only = self.config.get('enhanced_only', None)

    def make_token(form, cpos, heads, rels):
        # The POS tag is only part of the token when the component uses POS features.
        return [form, cpos, heads, rels] if with_pos else [form, heads, rels]

    for sent in read_conll(filepath):
        tokens = []
        if is_conllu:
            for cell in sent:
                form, cpos, head, deprel, deps = cell[1], cell[3], cell[6], cell[7], cell[8]
                pairs = [item.split(':', 1) for item in deps.split('|')]
                numeric = [pair for pair in pairs if pair[0].isdigit()]
                heads = [int(pair[0]) for pair in numeric]
                rels = [pair[1] for pair in numeric]
                # NOTE(review): `head` is compared against ints below, so read_conll
                # presumably delivers the HEAD column already converted to int - confirm.
                if enhanced_only:
                    if head in heads:
                        # Drop the (first) arc that duplicates the basic dependency.
                        at = heads.index(head)
                        del heads[at]
                        del rels[at]
                elif head not in heads:
                    heads.append(head)
                    rels.append(deprel)
                tokens.append(make_token(form, cpos, heads, rels))
        else:
            last = None  # (ID, form, cpos) of the most recently read row
            heads, rels = [], []
            for cell in sent:
                row_id, form, cpos, head, deprel = cell[0], cell[1], cell[3], cell[6], cell[7]
                if last is not None and row_id != last[0]:
                    # The ID changed, so the previous token is complete.
                    tokens.append(make_token(last[1], last[2], heads, rels))
                    heads, rels = [], []
                heads.append(head)
                rels.append(deprel)
                last = (row_id, form, cpos)
            # Flush the token that was still being accumulated when the sentence ended.
            tokens.append(make_token(last[1], last[2], heads, rels))
        yield tokens
def _normalize_ctb_tag(tag):
    """Strip the functional suffix from a CTB tag and map ``X`` to ``FW``."""
    tag = tag.split('-')[0]  # e.g. NT-SHORT -> NT
    # ``X`` marks tokens such as the ``x`` in:
    # copper_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
    return 'FW' if tag == 'X' else tag


def make_ctb_tasks(chtbs, out_root, part):
    """Generate the segmentation, tagging, parsing and dependency files for one CTB split.

    The sub-directories ``cws``, ``pos``, ``par`` and ``dep`` are created under
    ``out_root`` and the following files are written, all as UTF-8:

    * ``cws/{part}.txt``: one line per tree, words joined by a single space.
    * ``pos/{part}.tsv``: one ``word<TAB>tag`` line per word, blank line after each tree.
    * ``par/{part}.txt``: one bracketed tree per line; afterwards passed through
      ``remove_all_ec``.
    * ``dep/{part}.conllx``: produced by ``convert_to_stanford_dependency_330`` from the
      ``par`` file, then rewritten so that the 4th and 5th columns both hold the
      normalized tag.

    Leaves tagged ``-NONE-`` or with an empty tag are left out of the ``cws`` and ``pos``
    files.

    Args:
        chtbs: Sized collection of paths of the tree-bank source files of this split.
        out_root: Directory under which the task sub-directories are created.
        part: Name of the split; used as the stem of every output file.

    Raises:
        SystemExit: With status 1 after printing the offending text when a tree cannot
            be parsed.
    """
    for task in ['cws', 'pos', 'par', 'dep']:
        os.makedirs(join(out_root, task), exist_ok=True)
    timer = CountdownTimer(len(chtbs))
    par_path = join(out_root, 'par', f'{part}.txt')
    with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
            open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
            open(par_path, 'w', encoding='utf-8') as par:
        for f in chtbs:
            # Read everything up front so the source file is closed before parsing.
            with open(f, encoding='utf-8') as src:
                content = src.read()
            for bracketed in split_str_to_trees(content):
                try:
                    tree = Tree.fromstring(bracketed)
                except ValueError:
                    print(bracketed)
                    # sys.exit instead of the site-provided ``exit`` builtin, which is
                    # not guaranteed to exist (e.g. ``python -S``).
                    sys.exit(1)
                words = []
                for word, tag in tree.pos():
                    if tag == '-NONE-' or not tag:
                        continue
                    pos.write(f'{word}\t{_normalize_ctb_tag(tag)}\n')
                    words.append(word)
                cws.write(' '.join(words))
                # A huge margin keeps the whole tree on a single line.
                par.write(tree.pformat(margin=sys.maxsize))
                for fp in cws, pos, par:
                    fp.write('\n')
            timer.log(
                f'Preprocesing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
                erase=False)
    remove_all_ec(par_path)
    dep_path = join(out_root, 'dep', f'{part}.conllx')
    convert_to_stanford_dependency_330(par_path, dep_path)
    # Load every sentence before reopening the same path for writing.
    sents = list(read_conll(dep_path))
    # Explicit UTF-8: the platform default encoding may be unable to encode Chinese text.
    with open(dep_path, 'w', encoding='utf-8') as out:
        for sent in sents:
            for cells in sent:
                cells[3] = cells[4] = _normalize_ctb_tag(cells[3])
                out.write('\t'.join(str(x) for x in cells))
                out.write('\n')
            out.write('\n')
def file_to_inputs(self, filepath: str, gold=True):
    """Turn a gold CoNLL file into per-sentence token lists with a single head each.

    Only the basic dependency columns are used; the 9th (``DEPS``) column is not
    consulted, whatever the file suffix.

    Args:
        filepath: Path of the CoNLL file, passed to ``read_conll``.
        gold: Must be truthy; only gold files are supported.

    Yields:
        Each sentence list obtained from ``read_conll``, with every row replaced in
        place by ``[form, cpos, head, deprel]`` when ``self.use_pos`` is truthy,
        otherwise ``[form, head, deprel]`` (2nd, 4th, 7th and 8th column of the row).
    """
    assert gold, 'only support gold file for now'
    use_pos = self.use_pos
    for sent in read_conll(filepath):
        for i, cell in enumerate(sent):
            form, cpos, head, deprel = cell[1], cell[3], cell[6], cell[7]
            # Rows are overwritten in place so the yielded object is the list
            # produced by read_conll, as before.
            sent[i] = [form, cpos, head, deprel] if use_pos else [form, head, deprel]
        yield sent