def main(args):
    """Turn every input line into a one-line bracketed tree and write it out.

    ``args`` is an argv-style sequence; ``args[0]`` is skipped and the rest is
    handed to the parser returned by ``_argparser()``.

    Each line read from the parsed ``input`` option is emitted to the parsed
    ``output`` option as ``(<head> (S (<tag> <token>) ...))`` followed by a
    newline.  Two modes exist:

    * re-parse (``re_parse`` set, or forced by ``override_tags``): the head
      and the token/tag pairs are pulled out of the line with
      ``PTB_HEAD_REGEX`` and ``PTB_TOK_TAG_REGEX``; with ``override_tags``
      the tags found in the line are replaced by ``flattagger`` output.
    * plain text: the line is split on whitespace, optionally escaped, and
      tagged by ``flattagger``; the head comes from ``HEAD_BY_PARSER``.

    Always returns 0.
    """
    opts = _argparser().parse_args(args[1:])

    # Escaping is switched off whenever the re-parse option itself is given.
    escape = False if opts.re_parse else not opts.do_not_escape
    # Overriding the tags forces the re-parse path.
    re_parse = True if opts.override_tags else opts.re_parse

    for raw_line in opts.input:
        line = raw_line.rstrip("\n")

        if re_parse:
            # NOTE(review): assuming PTB_HEAD_REGEX is a compiled pattern,
            # match() yields None for a non-matching line and this raises
            # AttributeError -- confirm the input is always well-formed.
            head = PTB_HEAD_REGEX.match(line).groupdict()["head"]
            # Two independent passes over the same matches: one feeds the
            # tags, the other the tokens.
            for_tags, for_tokens = tee(
                (match.groupdict() for match in PTB_TOK_TAG_REGEX.finditer(line)),
                2,
            )
            tokens = (groups["token"] for groups in for_tokens)
            if opts.override_tags:
                tags = flattagger(groups["token"] for groups in for_tags)
            else:
                tags = (groups["tag"] for groups in for_tags)
        else:
            head = HEAD_BY_PARSER[opts.parser]
            soup = line.split()
            if escape:
                prepared = (
                    ptb_escape(tok, preserve_quotes=True)
                    for tok in ptb_escape_quotes_tokens(soup)
                )
            else:
                prepared = soup
            tokens, tokens_to_tag = tee(prepared, 2)
            tags = flattagger(tokens_to_tag)

        leaves = " ".join(
            "({} {})".format(tag, token) for tag, token in izip(tags, tokens)
        )
        opts.output.write("({} (S {}))\n".format(head, leaves))

    return 0
def main(args):
    '''Write one bracketed tree per input line.

    ``args`` is an argv-style sequence whose first element is ignored; the
    remainder is parsed by the parser that ``_argparser()`` returns.

    For every line of the parsed ``input`` option a line of the form
    ``(<head> (S (<tag> <token>) ...))`` is written to the parsed ``output``
    option.

    * With ``re_parse`` (which ``override_tags`` forces on) the head and the
      token/tag pairs are extracted from the line through ``PTB_HEAD_REGEX``
      and ``PTB_TOK_TAG_REGEX``; ``override_tags`` swaps the extracted tags
      for the ones ``flattagger`` produces.
    * Otherwise the line is whitespace-split, escaped unless
      ``do_not_escape`` is set, tagged by ``flattagger``, and the head is
      looked up in ``HEAD_BY_PARSER``.

    Returns 0.
    '''
    parsed = _argparser().parse_args(args[1:])

    # Overriding the tags forces the re-parse path.
    re_parse = True if parsed.override_tags else parsed.re_parse
    # No escaping at all when the re-parse option itself was requested.
    escape = False if parsed.re_parse else not parsed.do_not_escape

    for unstripped in parsed.input:
        line = unstripped.rstrip('\n')

        if not re_parse:
            head = HEAD_BY_PARSER[parsed.parser]
            soup = line.split()
            if escape:
                stream = (ptb_escape(word, preserve_quotes=True)
                          for word in ptb_escape_quotes_tokens(soup))
            else:
                stream = soup
            tokens, tokens_to_tag = tee(stream, 2)
            tags = flattagger(tokens_to_tag)
        else:
            # NOTE(review): assuming PTB_HEAD_REGEX is a compiled pattern,
            # a non-matching line makes match() return None and this raises
            # AttributeError -- confirm the input is always well-formed.
            head = PTB_HEAD_REGEX.match(line).groupdict()['head']
            # The match dictionaries are consumed twice: once for the tags
            # and once for the tokens.
            tag_side, token_side = tee(
                (found.groupdict()
                 for found in PTB_TOK_TAG_REGEX.finditer(line)), 2)
            tokens = (fields['token'] for fields in token_side)
            if parsed.override_tags:
                tags = flattagger(fields['token'] for fields in tag_side)
            else:
                tags = (fields['tag'] for fields in tag_side)

        pairs = ' '.join('({} {})'.format(tag, token)
                         for tag, token in izip(tags, tokens))
        parsed.output.write('({} (S {}))\n'.format(head, pairs))

    return 0
'[': '-LRB-', ']': '-RRB-', '}': '-LRB-', '{': '-RRB-', # The question mark handling below is consistent with Stanford and # McCCJ, but not with Enju that assigns it a PoS-tag instead. '?': '.', # The exclamation mark handling below is consistent with Stanford, but # not Enju for the same reason as above. '!': '.', '$': '$', } # Generate a mapping from each escaped version of a token to the same tag for token, escaped_token in set(( t, ptb_escape(t), ) for t in TAG_BY_TOK if ptb_escape(t) != t): TAG_BY_TOK[escaped_token] = TAG_BY_TOK[token] ### def flattagger(tokens): for token in tokens: try: tag = TAG_BY_TOK[token] except KeyError: # Go for the majority-class PoS-tag. tag = 'NN' yield tag
"(": "-LRB-", ")": "-RRB-", "[": "-LRB-", "]": "-RRB-", "}": "-LRB-", "{": "-RRB-", # The question mark handling below is consistent with Stanford and # McCCJ, but not with Enju that assigns it a PoS-tag instead. "?": ".", # The exclamation mark handling below is consistent with Stanford, but # not Enju for the same reason as above. "!": ".", "$": "$", } # Generate a mapping from each escaped version of a token to the same tag for token, escaped_token in set((t, ptb_escape(t)) for t in TAG_BY_TOK if ptb_escape(t) != t): TAG_BY_TOK[escaped_token] = TAG_BY_TOK[token] ### def flattagger(tokens): for token in tokens: try: tag = TAG_BY_TOK[token] except KeyError: # Go for the majority-class PoS-tag. tag = "NN" yield tag def _argparser():