Example #1
def main():
    argparser = configargparse.ArgParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    add_verbose_arg(argparser, help="detailed evaluation output")
    add_boolean_option(argparser, "wikification", "Spotlight to wikify any named node (for AMR)")
    argparser.add_argument("-o", "--out-dir", help="output directory (if unspecified, files are not written)")
    args = argparser.parse_args()

    scores = []
    for pattern in args.filenames:
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            print("\rConverting '%s'" % filename, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            evaluator = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(filename, encoding="utf-8") as f:
                for passage, ref, passage_id in converters[0](f, passage_id=basename, return_original=True):
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = converters[1](passage, wikification=args.wikification, use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        s = evaluator(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    scores.append(s)
                    if args.verbose:
                        print(passage_id)
                        s.print()
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
    Scores(scores).print()

    sys.exit(0)
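
All of the examples on this page rely on the add_boolean_option and add_verbose_arg helpers. Their implementations are not shown here; the sketch below is a minimal reconstruction inferred only from the call signatures visible in the examples (parser, name, description, default, short, short_no), not the projects' actual code.

# Hypothetical reconstruction of the helpers used in these examples;
# the real ucca/tupa implementations may differ.
def add_boolean_option(parser, name, description="", default=False, short=None, short_no=None):
    dest = name.replace("-", "_")
    group = parser.add_mutually_exclusive_group()
    positive = ["--" + name] + (["-" + short] if short else [])
    negative = ["--no-" + name] + (["-" + short_no] if short_no else [])
    group.add_argument(*positive, action="store_true", default=default, dest=dest,
                       help=description)
    group.add_argument(*negative, action="store_false", default=default, dest=dest,
                       help="do not " + description)
    return group


def add_verbose_arg(parser, **kwargs):
    # -v may be repeated (e.g. -vv), which is why Example #1 checks args.verbose > 1
    return parser.add_argument("-v", "--verbose", action="count", default=0, **kwargs)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    add_boolean_option(parser, "normalize", "normalize the output", default=True, short_no="N")
    add_verbose_arg(parser, help="detailed output")
    print(parser.parse_args(["--no-normalize", "-vv"]))  # Namespace(normalize=False, verbose=2)
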
Example #2
File: config.py Project: danielhers/tupa
    def __init__(self, *args):
        self.arg_parser = ap = ArgParser(description="Transition-based parser for UCCA.",
                                         formatter_class=ArgumentDefaultsHelpFormatter)
        ap.add_argument("passages", nargs="*", help="passage files/directories to test on/parse")
        ap.add_argument("--version", action="version", version="")
        ap.add_argument("-C", "--config", is_config_file=True, help="configuration file to get arguments from")
        ap.add_argument("-m", "--models", nargs="+", help="model file basename(s) to load/save, ensemble if >1 "
                                                          "(default: <format>_<model_type>")
        ap.add_argument("-c", "--classifier", choices=CLASSIFIERS, default=BIRNN, help="model type")
        ap.add_argument("-B", "--beam", type=int, choices=(1,), default=1, help="beam size for beam search")
        add_boolean_option(ap, "evaluate", "evaluation of parsed passages", short="e")
        add_verbose_arg(ap, help="detailed parse output")
        constructions.add_argument(ap)
        add_boolean_option(ap, "sentences", "split to sentences")
        add_boolean_option(ap, "paragraphs", "split to paragraphs")
        ap.add_argument("--timeout", type=float, help="max number of seconds to wait for a single passage")

        group = ap.add_argument_group(title="Training parameters")
        group.add_argument("-t", "--train", nargs="+", default=(), help="passage files/directories to train on")
        group.add_argument("-d", "--dev", nargs="+", default=(), help="passage files/directories to tune on")
        group.add_argument("-I", "--iterations", nargs="+", type=Iterations,
                           default=(Iterations(50), Iterations("100 --optimizer=" + EXTRA_TRAINER)),
                           help="number of training iterations along with optional hyperparameters per part")
        group.add_argument("--folds", type=int, choices=(3, 5, 10), help="#folds for cross validation")
        group.add_argument("--seed", type=int, default=1, help="random number generator seed")
        add_boolean_option(group, "early-update", "early update procedure (finish example on first error)")
        group.add_argument("--save-every", type=int, help="every this many passages, evaluate on dev and save model")
        add_boolean_option(group, "eval-test", "evaluate on test whenever evaluating on dev, but keep results hidden")
        add_boolean_option(group, "ignore-case", "pre-convert all input files to lower-case in training and test")

        group = ap.add_argument_group(title="Output files")
        group.add_argument("-o", "--outdir", default=".", help="output directory for parsed files")
        group.add_argument("-p", "--prefix", default="", help="output filename prefix")
        add_boolean_option(group, "write", "writing parsed output to files", default=True, short_no="W")
        group.add_argument("-j", "--join", help="if output format is textual, write all to one file with this basename")
        group.add_argument("-l", "--log", help="output log file (default: model filename + .log)")
        group.add_argument("--devscores", help="output CSV file for dev scores (default: model filename + .dev.csv)")
        group.add_argument("--testscores", help="output CSV file for test scores (default: model filename + .test.csv)")
        group.add_argument("--action-stats", help="output CSV file for action statistics")
        add_boolean_option(group, "normalize", "apply normalizations to output in case format is UCCA", default=False)
        ap.add_argument("-f", "--formats", nargs="+", choices=FILE_FORMATS, default=(),
                        help="input formats for creating all parameters before training starts "
                             "(otherwise created dynamically based on filename suffix), "
                             "and output formats for written files (each will be written; default: UCCA XML)")
        ap.add_argument("-u", "--unlabeled", nargs="*", choices=FORMATS, help="to ignore labels in")
        ap.add_argument("--lang", default="en", help="two-letter language code to use as the default language")
        add_boolean_option(ap, "multilingual", "separate model parameters per language (passage.attrib['lang'])")

        group = ap.add_argument_group(title="Sanity checks")
        add_boolean_option(group, "check-loops", "check for parser state loop")
        add_boolean_option(group, "verify", "check for oracle reproducing original passage")
        add_boolean_option(group, "validate-oracle", "require oracle output to respect constraints", default=True)
        add_param_arguments(ap)

        group = ap.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay", type=float, default=1e-5, help="weight decay for parameters")
        add_boolean_option(group, "dynet-apply-weight-decay-on-load", "workaround for clab/dynet#1206", default=False)
        add_boolean_option(group, "dynet-gpu", "GPU for training")
        group.add_argument("--dynet-gpus", type=int, default=1, help="how many GPUs you want to use")
        add_boolean_option(group, "dynet-autobatch", "auto-batching of training examples")
        DYNET_ARG_NAMES.update(get_group_arg_names(group))

        ap.add_argument("-H", "--hyperparams", type=HyperparamsInitializer.action, nargs="*",
                        help="shared hyperparameters or hyperparameters for specific formats, "
                             'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
                        default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
        ap.add_argument("--copy-shared", nargs="*", choices=FORMATS, help="formats whose parameters shall be "
                                                                          "copied from loaded shared parameters")
        self.args = FallbackNamespace(ap.parse_args(args if args else None))

        if self.args.config:
            print("Loading configuration from '%s'." % self.args.config)

        if self.args.passages and self.args.write:
            os.makedirs(self.args.outdir, exist_ok=True)

        if self.args.models:
            if not self.args.log:
                self.args.log = self.args.models[0] + ".log"
            if self.args.dev and not self.args.devscores:
                self.args.devscores = self.args.models[0] + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = self.args.models[0] + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"
        self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
        self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
        self._vocab = {}
        self.original_values = {}
        self.random = np.random
        self.update()
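
Since the constructor forwards *args to ap.parse_args(args if args else None), the configuration can be built either from the command line (with no arguments, parse_args(None) falls back to sys.argv[1:]) or from an explicit list of CLI-style tokens. A hypothetical programmatic use, assuming the surrounding class is tupa's Config and that FallbackNamespace exposes parsed values as attributes, might look like:

# Illustrative only; the class name and the concrete values are assumptions
# based on the arguments defined above.
config = Config("-t", "train/*.xml",    # --train: passages to train on
                "-d", "dev/*.xml",      # --dev: passages to tune on
                "--seed", "42",         # parsed with type=int
                "--timeout", "30")      # parsed with type=float
print(config.args.seed, config.args.timeout)  # expected: 42 30.0
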
Example #3
File: config.py Project: OfirArviv/tupa
    def __init__(self, *args):
        self.arg_parser = ap = ArgParser(
            description="Transition-based parser for UCCA.",
            formatter_class=ArgumentDefaultsHelpFormatter)

        add_boolean_option(ap,
                           "use-bert",
                           default=False,
                           description="whether to use bert embeddings")
        ap.add_argument("--bert-model",
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-base-cased", "bert-large-cased",
                            "bert-base-multilingual-cased"
                        ],
                        default="bert-base-multilingual-cased")
        ap.add_argument("--bert-layers",
                        type=int,
                        nargs='+',
                        default=[-1, -2, -3, -4])
        ap.add_argument("--bert-layers-pooling",
                        choices=["weighted", "sum", "concat"],
                        default="weighted")
        ap.add_argument("--bert-token-align-by",
                        choices=["first", "sum", "mean"],
                        default="sum")
        ap.add_argument("--bert-multilingual", choices=[0], type=int)
        add_boolean_option(
            ap,
            "bert-use-default-word-embeddings",
            default=False,
            description="whether to use default word embeddings")
        ap.add_argument("--bert-dropout",
                        type=float,
                        default=0,
                        choices=np.linspace(0, 0.9, num=10))

        ap.add_argument("passages",
                        nargs="*",
                        help="passage files/directories to test on/parse")
        ap.add_argument("--version", action="version", version="")
        ap.add_argument("-C",
                        "--config",
                        is_config_file=True,
                        help="configuration file to get arguments from")
        ap.add_argument(
            "-m",
            "--models",
            nargs="+",
            help="model file basename(s) to load/save, ensemble if >1 "
            "(default: <format>_<model_type>")
        ap.add_argument("-c",
                        "--classifier",
                        choices=CLASSIFIERS,
                        default=BIRNN,
                        help="model type")
        ap.add_argument("-B",
                        "--beam",
                        type=int,
                        choices=(1, ),
                        default=1,
                        help="beam size for beam search")
        add_boolean_option(ap,
                           "evaluate",
                           "evaluation of parsed passages",
                           short="e")
        add_verbose_arg(ap, help="detailed parse output")
        constructions.add_argument(ap)
        add_boolean_option(ap, "sentences", "split to sentences")
        add_boolean_option(ap, "paragraphs", "split to paragraphs")
        ap.add_argument(
            "--timeout",
            type=float,
            help="max number of seconds to wait for a single passage")

        group = ap.add_argument_group(title="Training parameters")
        group.add_argument("-t",
                           "--train",
                           nargs="+",
                           default=(),
                           help="passage files/directories to train on")
        group.add_argument("-d",
                           "--dev",
                           nargs="+",
                           default=(),
                           help="passage files/directories to tune on")
        group.add_argument(
            "-I",
            "--iterations",
            nargs="+",
            type=Iterations,
            default=(Iterations(50),
                     Iterations("100 --optimizer=" + EXTRA_TRAINER)),
            help="number of training iterations along with optional "
                 "hyperparameters per part")
        group.add_argument("--folds",
                           type=int,
                           choices=(3, 5, 10),
                           help="#folds for cross validation")
        group.add_argument("--seed",
                           type=int,
                           default=1,
                           help="random number generator seed")
        add_boolean_option(
            group, "early-update",
            "early update procedure (finish example on first error)")
        group.add_argument(
            "--save-every",
            type=int,
            help="every this many passages, evaluate on dev and save model")
        add_boolean_option(
            group, "eval-test",
            "evaluate on test whenever evaluating on dev, but keep results hidden"
        )
        add_boolean_option(
            group, "ignore-case",
            "pre-convert all input files to lower-case in training and test")

        group = ap.add_argument_group(title="Output files")
        group.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory for parsed files")
        group.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
        add_boolean_option(group,
                           "write",
                           "writing parsed output to files",
                           default=True,
                           short_no="W")
        group.add_argument(
            "-j",
            "--join",
            help="if output format is textual, write all to one file "
                 "with this basename")
        group.add_argument(
            "-l",
            "--log",
            help="output log file (default: model filename + .log)")
        group.add_argument(
            "--devscores",
            help="output CSV file for dev scores "
                 "(default: model filename + .dev.csv)")
        group.add_argument(
            "--testscores",
            help="output CSV file for test scores "
                 "(default: model filename + .test.csv)")
        group.add_argument("--action-stats",
                           help="output CSV file for action statistics")
        add_boolean_option(
            group,
            "normalize",
            "apply normalizations to output in case format is UCCA",
            default=False)
        ap.add_argument(
            "-f",
            "--formats",
            nargs="+",
            choices=FILE_FORMATS,
            default=(),
            help="input formats for creating all parameters before training starts "
                 "(otherwise created dynamically based on filename suffix), "
                 "and output formats for written files (each will be written; default: UCCA XML)")
        ap.add_argument("-u",
                        "--unlabeled",
                        nargs="*",
                        choices=FORMATS,
                        help="to ignore labels in")
        ap.add_argument(
            "--lang",
            default="en",
            help="two-letter language code to use as the default language")
        add_boolean_option(
            ap, "multilingual",
            "separate model parameters per language (passage.attrib['lang'])")

        group = ap.add_argument_group(title="Sanity checks")
        add_boolean_option(group, "check-loops", "check for parser state loop")
        add_boolean_option(group, "verify",
                           "check for oracle reproducing original passage")
        add_boolean_option(group,
                           "validate-oracle",
                           "require oracle output to respect constraints",
                           default=True)
        add_param_arguments(ap)

        group = ap.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay",
                           type=float,
                           default=1e-5,
                           help="weight decay for parameters")
        add_boolean_option(group,
                           "dynet-apply-weight-decay-on-load",
                           "workaround for clab/dynet#1206",
                           default=False)
        add_boolean_option(group, "dynet-gpu", "GPU for training")
        group.add_argument("--dynet-gpus",
                           type=int,
                           default=1,
                           help="how many GPUs you want to use")
        add_boolean_option(group, "dynet-autobatch",
                           "auto-batching of training examples")
        add_boolean_option(group, "dynet-check-validity",
                           "check validity of expressions immediately")
        DYNET_ARG_NAMES.update(get_group_arg_names(group))

        ap.add_argument(
            "-H",
            "--hyperparams",
            type=HyperparamsInitializer.action,
            nargs="*",
            help="shared hyperparameters or hyperparameters for specific formats, "
                 'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
            default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
        ap.add_argument("--copy-shared",
                        nargs="*",
                        choices=FORMATS,
                        help="formats whose parameters shall be "
                        "copied from loaded shared parameters")
        self.args = FallbackNamespace(ap.parse_args(args if args else None))

        if self.args.config:
            print("Loading configuration from '%s'." % self.args.config)

        if self.args.passages and self.args.write:
            os.makedirs(self.args.outdir, exist_ok=True)

        if self.args.models:
            if not self.args.log:
                self.args.log = self.args.models[0] + ".log"
            if self.args.dev and not self.args.devscores:
                self.args.devscores = self.args.models[0] + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = self.args.models[0] + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"
        self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
        self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
        self._vocab = {}
        self.original_values = {}
        self.random = np.random
        self.update()
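
Example #3 extends the same configuration with BERT-specific options. The --bert-layers and --bert-layers-pooling flags presumably select which hidden layers are used and how they are combined per token; the sketch below only illustrates what the three pooling choices could compute over a stack of per-layer vectors, and is not TUPA's actual implementation.

import numpy as np

def pool_bert_layers(layer_outputs, layers=(-1, -2, -3, -4), pooling="weighted", weights=None):
    # layer_outputs: per-layer hidden vectors for one token, shape (num_layers, dim)
    selected = np.stack([np.asarray(layer_outputs[i]) for i in layers])  # (len(layers), dim)
    if pooling == "sum":
        return selected.sum(axis=0)                                      # (dim,)
    if pooling == "concat":
        return selected.reshape(-1)                                      # (len(layers) * dim,)
    # "weighted": softmax-normalized mixing weights (learned scalars in practice)
    weights = np.zeros(len(layers)) if weights is None else np.asarray(weights, dtype=float)
    w = np.exp(weights) / np.exp(weights).sum()
    return w @ selected                                                  # (dim,)

hidden_states = np.random.rand(13, 768)  # e.g. embedding layer + 12 layers of a base-size model
print(pool_bert_layers(hidden_states, pooling="concat").shape)  # (3072,)
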
Example #4
        help="file to write aggregated counts to, in CSV format")
    add_boolean_option(argparser,
                       "unlabeled",
                       "print unlabeled F1 for individual passages",
                       short="u")
    add_boolean_option(argparser,
                       "enhanced",
                       "read enhanced dependencies",
                       default=True)
    add_boolean_option(argparser,
                       "normalize",
                       "normalize passages before evaluation",
                       short="N",
                       default=True)
    add_boolean_option(argparser,
                       "matching-ids",
                       "skip passages without a match (by ID)",
                       short="i")
    add_boolean_option(argparser,
                       "basename",
                       "force passage ID to be file basename",
                       short="b")
    add_boolean_option(argparser, "units", "print mutual and unique units")
    add_boolean_option(argparser, "errors",
                       "print confusion matrix with error distribution")
    group = argparser.add_mutually_exclusive_group()
    add_verbose_arg(group, help="detailed evaluation output")
    add_boolean_option(group, "quiet", "do not print anything", short="q")
    ucca_constructions.add_argument(argparser)
    main(argparser.parse_args())
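
Here the verbose and quiet options are placed in a mutually exclusive group, so argparse rejects a command line that sets both. A minimal standalone illustration with plain argparse (flag names mirror the snippet; the real helpers are sketched after Example #1):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-v", "--verbose", action="count", default=0, help="detailed evaluation output")
group.add_argument("-q", "--quiet", action="store_true", help="do not print anything")

print(parser.parse_args(["-q"]))   # Namespace(quiet=True, verbose=0)
# parser.parse_args(["-v", "-q"])  # error: argument -q/--quiet: not allowed with argument -v/--verbose
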
Example #5
def main(args):
    for passages, out_dir, lang in read_specs(args):
        scores = []
        if not args.verbose:
            passages = tqdm(passages, unit=" passages", desc="Parsing " + out_dir)
        for passage, parsed in ANNOTATORS[args.parser](passages, lang, args.verbose):
            if args.write:
                write_passage(parsed, args)
            else:
                map_labels(parsed, args.label_map)
            if args.evaluate:
                evaluator = EVALUATORS[args.output_format]
                _, converter = CONVERTERS[args.output_format]
                if converter is not None:
                    passage, parsed = map(converter, (passage, parsed))
                scores.append(evaluator.evaluate(parsed, passage, verbose=args.verbose > 1))
        if scores:
            Scores(scores).print()


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description=desc)
    add_specs_args(argparser)
    argparser.add_argument("--parser", choices=PARSERS, default=SPACY, help="dependency parser to use (default: spacy)")
    argparser.add_argument("--output-format", choices=CONVERTERS, help="output file format (default: UCCA)")
    add_convert_args(argparser)
    argparser.add_argument("-e", "--evaluate", action="store_true", help="evaluate against original passages")
    argparser.add_argument("-W", "--no-write", action="store_false", dest="write", help="write parsed passages")
    add_verbose_arg(argparser, help="detailed output")
    main(argparser.parse_args())
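
Examples #1 and #5 both dispatch on the passage format through CONVERTERS and EVALUATORS registries defined elsewhere in the project. The sketch below only illustrates the shape those lookups assume, with placeholder "amr" stubs: a (from_converter, to_converter) pair per format, plus an evaluator object exposing an evaluate function.

from types import SimpleNamespace

# Placeholder stubs; the real converters and evaluators come from the project.
def from_amr(lines, passage_id=None, return_original=False, **kwargs):
    ...  # would yield (passage, original_reference, passage_id) tuples

def to_amr(passage, wikification=False, use_original=False, **kwargs):
    ...  # would return the converted output as a list of lines

def evaluate_amr(guessed, ref, verbose=False, **kwargs):
    ...  # would return a Scores-like object

CONVERTERS = {"amr": (from_amr, to_amr)}  # converters[0] reads, converters[1] writes back (Example #1)
EVALUATORS = {"amr": SimpleNamespace(evaluate=evaluate_amr)}

_, converter = CONVERTERS["amr"]  # Example #5 keeps only the to-converter
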
Example #6
            "Cannot specify --extra-normalization without --normalize")
    return args


if __name__ == '__main__':
    argparser = configargparse.ArgParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="file names to convert and evaluate")
    argparser.add_argument(
        "-f",
        "--format",
        choices=CONVERTERS,
        default="amr",
        help="default format (if cannot determine by suffix)")
    add_verbose_arg(argparser, help="detailed evaluation output")
    argparser.add_argument("--units",
                           action="store_true",
                           help="print mutual and unique units")
    add_boolean_option(argparser, "wikification",
                       "Spotlight to wikify any named node (for AMR)")
    argparser.add_argument(
        "-o",
        "--out-dir",
        help="output directory (if unspecified, files are not written)")
    argparser.add_argument("-n",
                           "--normalize",
                           action="store_true",
                           help="normalize passages before conversion")
    argparser.add_argument("-e",
                           "--extra-normalization",