def create_traindata(args):
    """Build the training-data artifacts from an .auto treebank.

    Reads the trees under ``args.PATH``, skipping failed parses, then writes
    frequency-filtered category/word/affix vocabularies, the observed binary
    and unary rules, the JSON samples, the raw sentences, and a CoNLL dump —
    all under ``args.OUT``.
    """
    creator = TrainingDataCreator(args.PATH,
                                  args.word_freq_cut,
                                  args.cat_freq_cut,
                                  args.afix_freq_cut)
    # Drop sentences whose parse failed (marked by a single 'FAILED' leaf).
    trees = [
        tree for _, _, tree in read_auto(creator.filepath)
        if tree.word != 'FAILED'
    ]
    logger.info(f'loaded {len(trees)} trees')
    for tree in trees:
        creator._traverse(tree)
    creator._create_samples(trees)

    # Categories are filtered first; the seen-rule filter below depends on them.
    cats = {k: v for k, v in creator.cats.items() if v >= creator.cat_freq_cut}
    creator._write(cats, args.OUT / 'target.txt')

    # Frequency-filtered lexical vocabularies, each written to its own file.
    for counts, cutoff, fname in (
            (creator.words, creator.word_freq_cut, 'words.txt'),
            (creator.suffixes, creator.afix_freq_cut, 'suffixes.txt'),
            (creator.prefixes, creator.afix_freq_cut, 'prefixes.txt')):
        kept = {k: v for k, v in counts.items() if v >= cutoff}
        creator._write(kept, args.OUT / fname)

    # Keep only binary rules whose both categories survived the cutoff.
    seen_rules = {
        f'{c1} {c2}': v
        for (c1, c2), v in creator.seen_rules.items()
        if c1 in cats and c2 in cats
    }
    creator._write(seen_rules, args.OUT / 'seen_rules.txt')

    # Unary rules are kept unconditionally.
    unary_rules = {
        f'{c1} {c2}': v for (c1, c2), v in creator.unary_rules.items()
    }
    creator._write(unary_rules, args.OUT / 'unary_rules.txt')

    with open(args.OUT / 'traindata.json', 'w') as f:
        logger.info(f'writing to {f.name}')
        json.dump(creator.samples, f)

    with open(args.OUT / 'trainsents.txt', 'w') as f:
        logger.info(f'writing to {f.name}')
        for sent in creator.sents:
            print(sent, file=f)

    with open(args.OUT / 'trainsents.conll', 'w') as f:
        logger.info(f'writing to {f.name}')
        creator._to_conll(f)
def get_deps_from_auto(auto_file):
    """Yield C&C-style dependency sets for each sentence in an .auto file.

    Flattens every tree in *auto_file* to a temporary file, runs the C&C
    ``generate`` tool over it, and parses its output into, per sentence,
    a tuple ``(parsed, deps, udeps, rule_ids)`` where ``deps`` is a set of
    (pred, cat, slot, arg) tuples, ``udeps`` the unlabelled (pred, arg)
    pairs, and ``rule_ids`` maps each dep to its rule id.

    Requires the CANDC environment variable to point at a compiled C&C
    checkout; exits via ``die`` otherwise or on any ``generate`` error.
    """
    candc_dir = os.environ.get('CANDC', None)
    if not candc_dir:
        die('did not find C&C parser at CANDC environmental variable.')
    CANDC_DIR = Path(candc_dir).resolve()
    GENERATE = CANDC_DIR / 'bin' / 'generate'
    MARKEDUP = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats' / 'markedup'
    CATS = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats'
    if not GENERATE.exists():
        logger.error(
            'Currently the evalution script requires C&C parser compiled from its source.'
        )
        die('expected: $CANDC/bin/generate')
    # BUG FIX: was `not CATS.exists` — testing the bound method object, which
    # is always truthy, so the check never fired. Must call `.exists()`.
    elif not MARKEDUP.exists() or not CATS.exists():
        logger.error('The C&C directory is not configured expectedly.')
        die('expected: $CANDC/src/data/ccg/cats/markedup')
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp();
    # also drops a leftover debug print of the temp path to stdout.
    fd, tmp = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as f:
        for _, tokens, tree in read_auto(auto_file):
            print(tree.auto_flat(tokens=tokens), file=f)
    command = f'{GENERATE} -j {CATS} {MARKEDUP} {tmp}'
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    results, error = proc.communicate()
    if len(error.decode('utf-8')) > 0:
        die(f'caught error in running $CANDC/bin/generate: {error.decode("utf-8")}'
            )
    lines = iter(results.decode('utf-8').split('\n'))
    deps, udeps = set(), set()
    rule_ids = {}
    # Skip the header: everything up to and including the first blank line.
    line = next(lines)
    while line != '':
        line = next(lines)
    for line in lines:
        line = line.strip()
        if len(line) == 0:
            # Blank line = sentence boundary. No rule ids collected means no
            # dependencies were produced — probably a conversion script error.
            parsed = len(rule_ids) > 0
            yield parsed, deps, udeps, rule_ids
            deps, udeps = set(), set()
            rule_ids = {}
            continue
        fields = line.split()
        pred, cat, slot, arg, rule_id = fields[:5]
        pred_word = pred.rsplit('_')[0]
        arg_word = arg.rsplit('_')[0]
        if not ignore(pred_word, cat, slot, arg_word, rule_id):
            cat = strip_markup(cat)
            deps.add((pred, cat, slot, arg))
            rule_ids[(pred, cat, slot, arg)] = rule_id
            udeps.add((pred, arg))
def convert_json(autopath):
    """Load an .auto treebank and return its JSON-serializable samples.

    Failed parses (trees whose leaf word is 'FAILED') are skipped; no
    frequency cutoffs are applied.
    """
    creator = TrainingDataCreator(autopath, None, None, None)
    parsed_trees = [
        t for _, _, t in read_auto(creator.filepath) if t.word != 'FAILED'
    ]
    logger.info(f'loaded {len(parsed_trees)} trees')
    creator._create_samples(parsed_trees)
    return creator.samples
def create_testdata(args):
    """Build test-data artifacts from an .auto treebank.

    Unlike training-data creation, every tree is kept (including failed
    parses) and no vocabularies are written — only the JSON samples, the
    raw sentences, and a CoNLL dump under ``args.OUT``.
    """
    creator = TrainingDataCreator(args.PATH,
                                  args.word_freq_cut,
                                  args.cat_freq_cut,
                                  args.afix_freq_cut)
    trees = [tree for _, _, tree in read_auto(creator.filepath)]
    creator._create_samples(trees)

    with open(args.OUT / 'testdata.json', 'w') as f:
        logger.info(f'writing to {f.name}')
        json.dump(creator.samples, f)

    with open(args.OUT / 'testsents.txt', 'w') as f:
        logger.info(f'writing to {f.name}')
        for sent in creator.sents:
            print(sent, file=f)

    with open(args.OUT / 'testsents.conll', 'w') as f:
        logger.info(f'writing to {f.name}')
        creator._to_conll(f)
def _read(self, file_path):
    """Yield one instance per tree in *file_path*.

    Converts each CCG tree to an NLTK tree; POS tags are extracted only
    when ``self._use_pos_tags`` is set, otherwise ``None`` is passed.
    """
    for _, _, ccg_tree in read_auto(file_path):
        nltk_tree = ccg_to_nltk_tree(ccg_tree)
        if self._use_pos_tags:
            pos_tags = [tag for _, tag in nltk_tree.pos()]
        else:
            pos_tags = None
        yield self.text_to_instance(nltk_tree.leaves(), pos_tags, nltk_tree)