예제 #1
0
    xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \
        api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors,
                 constructions=args.constructions, verbose=True)


if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them and hand the
    # resulting namespace to main().
    argparser = ArgumentParser(description="Evaluate passages on UCCA DB")
    argparser.add_argument(
        "--db", "-d",
        dest="db_filename",
        required=True,
        help="the db file name",
    )
    argparser.add_argument("--host", "--hst", help="the host name")

    # --pid and --from_xids may not be given together.
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-p", "--pid", type=int, help="the passage ID")
    group.add_argument(
        "-x", "--from_xids",
        action="store_true",
        help="interpret the ref and the guessed parameters as Xids in the db",
    )

    # Both annotations to compare are mandatory.
    argparser.add_argument(
        "--guessed", "-g",
        required=True,
        help="if a db is defined - the username for the guessed annotation; else - "
             "the xml file name for the guessed annotation",
    )
    argparser.add_argument(
        "-r", "--ref",
        required=True,
        help="if a db is defined - the username for the reference annotation; else - "
             "the xml file name for the reference annotation",
    )

    # The reporting switches are plain on/off flags, registered in this order.
    for flags, description in (
        (("-u", "--units"), "the units the annotations have in common, and those each has separately"),
        (("-f", "--fscore"), "outputs the traditional P,R,F instead of the scene structure evaluation"),
        (("-e", "--errors"), "prints the error distribution according to its frequency"),
    ):
        argparser.add_argument(*flags, action="store_true", help=description)

    # Let the constructions module register its own option(s) on the parser.
    constructions.add_argument(argparser)
    main(argparser.parse_args())
예제 #2
0
from argparse import ArgumentParser

from ucca import constructions
from ucca.ioutil import read_files_and_dirs

if __name__ == "__main__":
    # Script entry point: for every passage given on the command line, print
    # the edges that extract_edges() returns for each requested construction.
    argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    constructions.add_argument(argparser, False)
    argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
    args = argparser.parse_args()

    for passage in read_files_and_dirs(args.passages):
        if args.verbose:
            # Verbose mode prints the ID before extraction, whether or not anything is found.
            print("%s:" % passage.ID)
        extracted = constructions.extract_edges(passage, constructions=args.constructions, verbose=args.verbose)
        if not any(extracted.values()):
            continue  # no construction matched anything in this passage
        if not args.verbose:
            # Non-verbose mode prints the ID only for passages with at least one match.
            print("%s:" % passage.ID)
        for construction, edges in extracted.items():
            if not edges:
                continue
            print("  %s:" % construction.description)
            for edge in edges:
                print("    %s [%s %s]" % (edge, edge.tag, edge.child))
        print()  # blank line between passages
예제 #3
0
파일: config.py 프로젝트: OfirArviv/tupa
    def __init__(self, *args):
        """Declare all command-line/config-file options, parse them and derive default output paths.

        After parsing, the output directory is created when passages are to be
        written, default log/score file names are derived from the first model
        basename, the bookkeeping attributes are reset and ``self.update()`` is called.

        :param args: argument strings to parse; when none are given, ``None`` is passed
                     to ``parse_args`` (presumably making the parser read ``sys.argv``,
                     as plain argparse does -- confirm for ``ArgParser``)
        """
        self.arg_parser = ap = ArgParser(
            description="Transition-based parser for UCCA.",
            formatter_class=ArgumentDefaultsHelpFormatter)

        # --- BERT embedding options ---
        add_boolean_option(ap,
                           "use-bert",
                           default=False,
                           description="whether to use bert embeddings")
        ap.add_argument("--bert-model",
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-base-cased", "bert-large-cased",
                            "bert-base-multilingual-cased"
                        ],
                        default="bert-base-multilingual-cased")
        ap.add_argument("--bert-layers",
                        type=int,
                        nargs='+',
                        default=[-1, -2, -3, -4])
        ap.add_argument("--bert-layers-pooling",
                        choices=["weighted", "sum", "concat"],
                        default="weighted")
        ap.add_argument("--bert-token-align-by",
                        choices=["first", "sum", "mean"],
                        default="sum")
        ap.add_argument("--bert-multilingual", choices=[0], type=int)
        add_boolean_option(
            ap,
            "bert-use-default-word-embeddings",
            default=False,
            description="whether to use default word embeddings")
        # argparse validates choices with ``==`` on the converted value. Steps produced by
        # np.linspace(0, 0.9, num=10) include 0.30000000000000004, 0.6000000000000001 and
        # 0.7000000000000001, which made "0.3", "0.6" and "0.7" invalid choices. i / 10 is
        # correctly rounded, so every entry equals float("0.<i>") exactly.
        ap.add_argument("--bert-dropout",
                        type=float,
                        default=0,
                        choices=[i / 10 for i in range(10)])

        # --- General options ---
        ap.add_argument("passages",
                        nargs="*",
                        help="passage files/directories to test on/parse")
        ap.add_argument("--version", action="version", version="")
        ap.add_argument("-C",
                        "--config",
                        is_config_file=True,
                        help="configuration file to get arguments from")
        ap.add_argument(
            "-m",
            "--models",
            nargs="+",
            help="model file basename(s) to load/save, ensemble if >1 "
            "(default: <format>_<model_type>)")
        ap.add_argument("-c",
                        "--classifier",
                        choices=CLASSIFIERS,
                        default=BIRNN,
                        help="model type")
        ap.add_argument("-B",
                        "--beam",
                        type=int,
                        choices=(1, ),
                        default=1,
                        help="beam size for beam search")
        add_boolean_option(ap,
                           "evaluate",
                           "evaluation of parsed passages",
                           short="e")
        add_verbose_arg(ap, help="detailed parse output")
        constructions.add_argument(ap)
        add_boolean_option(ap, "sentences", "split to sentences")
        add_boolean_option(ap, "paragraphs", "split to paragraphs")
        ap.add_argument(
            "--timeout",
            type=float,
            help="max number of seconds to wait for a single passage")

        group = ap.add_argument_group(title="Training parameters")
        group.add_argument("-t",
                           "--train",
                           nargs="+",
                           default=(),
                           help="passage files/directories to train on")
        group.add_argument("-d",
                           "--dev",
                           nargs="+",
                           default=(),
                           help="passage files/directories to tune on")
        group.add_argument(
            "-I",
            "--iterations",
            nargs="+",
            type=Iterations,
            default=(Iterations(50),
                     Iterations("100 --optimizer=" + EXTRA_TRAINER)),
            help=
            "number of training iterations along with optional hyperparameters per part"
        )
        group.add_argument("--folds",
                           type=int,
                           choices=(3, 5, 10),
                           help="#folds for cross validation")
        group.add_argument("--seed",
                           type=int,
                           default=1,
                           help="random number generator seed")
        add_boolean_option(
            group, "early-update",
            "early update procedure (finish example on first error)")
        group.add_argument(
            "--save-every",
            type=int,
            help="every this many passages, evaluate on dev and save model")
        add_boolean_option(
            group, "eval-test",
            "evaluate on test whenever evaluating on dev, but keep results hidden"
        )
        add_boolean_option(
            group, "ignore-case",
            "pre-convert all input files to lower-case in training and test")

        group = ap.add_argument_group(title="Output files")
        group.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory for parsed files")
        group.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
        add_boolean_option(group,
                           "write",
                           "writing parsed output to files",
                           default=True,
                           short_no="W")
        group.add_argument(
            "-j",
            "--join",
            help=
            "if output format is textual, write all to one file with this basename"
        )
        group.add_argument(
            "-l",
            "--log",
            help="output log file (default: model filename + .log)")
        group.add_argument(
            "--devscores",
            help=
            "output CSV file for dev scores (default: model filename + .dev.csv)"
        )
        group.add_argument(
            "--testscores",
            help=
            "output CSV file for test scores (default: model filename + .test.csv)"
        )
        group.add_argument("--action-stats",
                           help="output CSV file for action statistics")
        add_boolean_option(
            group,
            "normalize",
            "apply normalizations to output in case format is UCCA",
            default=False)
        ap.add_argument(
            "-f",
            "--formats",
            nargs="+",
            choices=FILE_FORMATS,
            default=(),
            help=
            "input formats for creating all parameters before training starts "
            "(otherwise created dynamically based on filename suffix), "
            "and output formats for written files (each will be written; default: UCCA XML)"
        )
        ap.add_argument("-u",
                        "--unlabeled",
                        nargs="*",
                        choices=FORMATS,
                        help="to ignore labels in")
        ap.add_argument(
            "--lang",
            default="en",
            help="two-letter language code to use as the default language")
        add_boolean_option(
            ap, "multilingual",
            "separate model parameters per language (passage.attrib['lang'])")

        group = ap.add_argument_group(title="Sanity checks")
        add_boolean_option(group, "check-loops", "check for parser state loop")
        add_boolean_option(group, "verify",
                           "check for oracle reproducing original passage")
        add_boolean_option(group,
                           "validate-oracle",
                           "require oracle output to respect constraints",
                           default=True)
        add_param_arguments(ap)

        group = ap.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay",
                           type=float,
                           default=1e-5,
                           help="weight decay for parameters")
        add_boolean_option(group,
                           "dynet-apply-weight-decay-on-load",
                           "workaround for clab/dynet#1206",
                           default=False)
        add_boolean_option(group, "dynet-gpu", "GPU for training")
        group.add_argument("--dynet-gpus",
                           type=int,
                           default=1,
                           help="how many GPUs you want to use")
        add_boolean_option(group, "dynet-autobatch",
                           "auto-batching of training examples")
        add_boolean_option(group, "dynet-check-validity",
                           "check validity of expressions immediately")
        # Record the names of the options registered in the DyNet group.
        DYNET_ARG_NAMES.update(get_group_arg_names(group))

        ap.add_argument(
            "-H",
            "--hyperparams",
            type=HyperparamsInitializer.action,
            nargs="*",
            help=
            "shared hyperparameters or hyperparameters for specific formats, "
            'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
            default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
        ap.add_argument("--copy-shared",
                        nargs="*",
                        choices=FORMATS,
                        help="formats whose parameters shall be "
                        "copied from loaded shared parameters")
        # An empty *args is turned into None rather than an empty argument list.
        self.args = FallbackNamespace(ap.parse_args(args if args else None))

        if self.args.config:
            print("Loading configuration from '%s'." % self.args.config)

        # Only create the output directory when there is something to parse and write.
        if self.args.passages and self.args.write:
            os.makedirs(self.args.outdir, exist_ok=True)

        # Fill in log/score file names that were not given explicitly, based on the
        # first model basename when there is one.
        if self.args.models:
            if not self.args.log:
                self.args.log = self.args.models[0] + ".log"
            if self.args.dev and not self.args.devscores:
                self.args.devscores = self.args.models[0] + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = self.args.models[0] + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"
        self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
        self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
        self._vocab = {}
        self.original_values = {}
        self.random = np.random
        self.update()
예제 #4
0
파일: config.py 프로젝트: danielhers/tupa
    def __init__(self, *args):
        """Declare all command-line/config-file options, parse them and derive default output paths.

        After parsing, the output directory is created when passages are to be
        written, default log/score file names are derived from the first model
        basename, the bookkeeping attributes are reset and ``self.update()`` is called.

        :param args: argument strings to parse; when none are given, ``None`` is passed
                     to ``parse_args`` (presumably making the parser read ``sys.argv``,
                     as plain argparse does -- confirm for ``ArgParser``)
        """
        self.arg_parser = ap = ArgParser(description="Transition-based parser for UCCA.",
                                         formatter_class=ArgumentDefaultsHelpFormatter)
        ap.add_argument("passages", nargs="*", help="passage files/directories to test on/parse")
        ap.add_argument("--version", action="version", version="")
        ap.add_argument("-C", "--config", is_config_file=True, help="configuration file to get arguments from")
        # The help text previously ended with an unclosed parenthesis.
        ap.add_argument("-m", "--models", nargs="+", help="model file basename(s) to load/save, ensemble if >1 "
                                                          "(default: <format>_<model_type>)")
        ap.add_argument("-c", "--classifier", choices=CLASSIFIERS, default=BIRNN, help="model type")
        ap.add_argument("-B", "--beam", type=int, choices=(1,), default=1, help="beam size for beam search")
        add_boolean_option(ap, "evaluate", "evaluation of parsed passages", short="e")
        add_verbose_arg(ap, help="detailed parse output")
        constructions.add_argument(ap)
        add_boolean_option(ap, "sentences", "split to sentences")
        add_boolean_option(ap, "paragraphs", "split to paragraphs")
        ap.add_argument("--timeout", type=float, help="max number of seconds to wait for a single passage")

        group = ap.add_argument_group(title="Training parameters")
        group.add_argument("-t", "--train", nargs="+", default=(), help="passage files/directories to train on")
        group.add_argument("-d", "--dev", nargs="+", default=(), help="passage files/directories to tune on")
        group.add_argument("-I", "--iterations", nargs="+", type=Iterations,
                           default=(Iterations(50), Iterations("100 --optimizer=" + EXTRA_TRAINER)),
                           help="number of training iterations along with optional hyperparameters per part")
        group.add_argument("--folds", type=int, choices=(3, 5, 10), help="#folds for cross validation")
        group.add_argument("--seed", type=int, default=1, help="random number generator seed")
        add_boolean_option(group, "early-update", "early update procedure (finish example on first error)")
        group.add_argument("--save-every", type=int, help="every this many passages, evaluate on dev and save model")
        add_boolean_option(group, "eval-test", "evaluate on test whenever evaluating on dev, but keep results hidden")
        add_boolean_option(group, "ignore-case", "pre-convert all input files to lower-case in training and test")

        group = ap.add_argument_group(title="Output files")
        group.add_argument("-o", "--outdir", default=".", help="output directory for parsed files")
        group.add_argument("-p", "--prefix", default="", help="output filename prefix")
        add_boolean_option(group, "write", "writing parsed output to files", default=True, short_no="W")
        group.add_argument("-j", "--join", help="if output format is textual, write all to one file with this basename")
        group.add_argument("-l", "--log", help="output log file (default: model filename + .log)")
        group.add_argument("--devscores", help="output CSV file for dev scores (default: model filename + .dev.csv)")
        group.add_argument("--testscores", help="output CSV file for test scores (default: model filename + .test.csv)")
        group.add_argument("--action-stats", help="output CSV file for action statistics")
        add_boolean_option(group, "normalize", "apply normalizations to output in case format is UCCA", default=False)
        ap.add_argument("-f", "--formats", nargs="+", choices=FILE_FORMATS, default=(),
                        help="input formats for creating all parameters before training starts "
                             "(otherwise created dynamically based on filename suffix), "
                             "and output formats for written files (each will be written; default: UCCA XML)")
        ap.add_argument("-u", "--unlabeled", nargs="*", choices=FORMATS, help="to ignore labels in")
        ap.add_argument("--lang", default="en", help="two-letter language code to use as the default language")
        add_boolean_option(ap, "multilingual", "separate model parameters per language (passage.attrib['lang'])")

        group = ap.add_argument_group(title="Sanity checks")
        add_boolean_option(group, "check-loops", "check for parser state loop")
        add_boolean_option(group, "verify", "check for oracle reproducing original passage")
        add_boolean_option(group, "validate-oracle", "require oracle output to respect constraints", default=True)
        add_param_arguments(ap)

        group = ap.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay", type=float, default=1e-5, help="weight decay for parameters")
        add_boolean_option(group, "dynet-apply-weight-decay-on-load", "workaround for clab/dynet#1206", default=False)
        add_boolean_option(group, "dynet-gpu", "GPU for training")
        group.add_argument("--dynet-gpus", type=int, default=1, help="how many GPUs you want to use")
        add_boolean_option(group, "dynet-autobatch", "auto-batching of training examples")
        # Record the names of the options registered in the DyNet group.
        DYNET_ARG_NAMES.update(get_group_arg_names(group))

        ap.add_argument("-H", "--hyperparams", type=HyperparamsInitializer.action, nargs="*",
                        help="shared hyperparameters or hyperparameters for specific formats, "
                             'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
                        default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
        ap.add_argument("--copy-shared", nargs="*", choices=FORMATS, help="formats whose parameters shall be "
                                                                          "copied from loaded shared parameters")
        # An empty *args is turned into None rather than an empty argument list.
        self.args = FallbackNamespace(ap.parse_args(args if args else None))

        if self.args.config:
            print("Loading configuration from '%s'." % self.args.config)

        # Only create the output directory when there is something to parse and write.
        if self.args.passages and self.args.write:
            os.makedirs(self.args.outdir, exist_ok=True)

        # Fill in log/score file names that were not given explicitly, based on the
        # first model basename when there is one.
        if self.args.models:
            basename = self.args.models[0]
            if not self.args.log:
                self.args.log = basename + ".log"
            if self.args.dev and not self.args.devscores:
                self.args.devscores = basename + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = basename + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"
        self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
        self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
        self._vocab = {}
        self.original_values = {}
        self.random = np.random
        self.update()
예제 #5
0
        help="file to write aggregated counts to, in CSV format")
    add_boolean_option(argparser,
                       "unlabeled",
                       "print unlabeled F1 for individual passages",
                       short="u")
    add_boolean_option(argparser,
                       "enhanced",
                       "read enhanced dependencies",
                       default=True)
    add_boolean_option(argparser,
                       "normalize",
                       "normalize passages before evaluation",
                       short="N",
                       default=True)
    add_boolean_option(argparser,
                       "matching-ids",
                       "skip passages without a match (by ID)",
                       short="i")
    add_boolean_option(argparser,
                       "basename",
                       "force passage ID to be file basename",
                       short="b")
    add_boolean_option(argparser, "units", "print mutual and unique units")
    add_boolean_option(argparser, "errors",
                       "print confusion matrix with error distribution")
    group = argparser.add_mutually_exclusive_group()
    add_verbose_arg(group, help="detailed evaluation output")
    add_boolean_option(group, "quiet", "do not print anything", short="q")
    ucca_constructions.add_argument(argparser)
    main(argparser.parse_args())
예제 #6
0
        words = {}
        xmltoconll(passage)
        t = split2sentences(passage)
        i = 0
        for sen in t:
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            i += 1

        while (1):
            word = input('\nType the word below\n\n')
            for node in passage.nodes:
                t = passage.nodes[node]
                if (re.match(rf'\b{word}\b', t.text, re.IGNORECASE)):
                    #print('Word: %s\nWord ID: %s' %(t.text,t.ID))
                    #ans = input('\nDo you want to continue with wordi Id : %s', t.ID)
                    path = []
                    path = find_path(passage.nodes[t.ID], path)
                    break
            print(' '.join(path))


if __name__ == "__main__":
    # Script entry point: build the command-line parser and hand the parsed
    # options to main().
    argparser = ArgumentParser(description="Xml to conll and find the path of the word from UCCA xml file.")
    argparser.add_argument("passages", help="the corpus, given as xml/pickle file names", nargs="+")
    # NOTE(review): add_argument is defined/imported outside this snippet; presumably
    # the constructions option helper -- confirm.
    add_argument(argparser, False)
    cli_options = argparser.parse_args()
    main(cli_options)
예제 #7
0
from argparse import ArgumentParser

from ucca import constructions
from ucca.ioutil import read_files_and_dirs

if __name__ == "__main__":
    # Script entry point: for every passage given on the command line, print
    # the edges that extract_edges() returns for each requested construction.
    argparser = ArgumentParser(
        description="Extract linguistic constructions from UCCA corpus.",
    )
    argparser.add_argument(
        "passages",
        nargs="+",
        help="the corpus, given as xml/pickle file names",
    )
    constructions.add_argument(argparser, False)
    argparser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="print tagged text for each passage",
    )
    args = argparser.parse_args()

    for passage in read_files_and_dirs(args.passages):
        if args.verbose:
            # Verbose mode always prints the passage ID, before extraction runs.
            print("%s:" % passage.ID)
        extracted = constructions.extract_edges(
            passage,
            constructions=args.constructions,
            verbose=args.verbose,
        )
        # Keep only the constructions that actually matched some edges.
        matched = {construction: edges for construction, edges in extracted.items() if edges}
        if not matched:
            continue
        if not args.verbose:
            # Non-verbose mode prints the ID only for passages with at least one match.
            print("%s:" % passage.ID)
        for construction, edges in matched.items():
            print("  %s:" % construction.description)
            for edge in edges:
                print("    %s [%s %s]" % (edge, edge.tag, edge.child))
        print()  # blank line between passages
예제 #8
0
파일: config.py 프로젝트: StefPac/tupa
    def __init__(self, *args):
        argparser = argparse.ArgumentParser(
            description="""Transition-based parser for UCCA.""",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        argparser.add_argument(
            "passages",
            nargs="*",
            help="passage files/directories to test on/parse")
        argparser.add_argument(
            "-m",
            "--model",
            help=
            "model file basename to load/save (default: <format>_<model_type>")
        argparser.add_argument("-c",
                               "--classifier",
                               choices=CLASSIFIERS,
                               default=SPARSE,
                               help="model type")
        argparser.add_argument("-B",
                               "--beam",
                               type=int,
                               choices=(1, ),
                               default=1,
                               help="beam size for beam search")
        add_boolean_option(argparser,
                           "evaluate",
                           "evaluation of parsed passages",
                           short="e")
        add_verbose_argument(argparser, help="detailed parse output")
        group = argparser.add_argument_group(title="Node labels")
        group.add_argument("--max-node-labels",
                           type=int,
                           default=0,
                           help="max number of node labels to allow")
        group.add_argument("--max-node-categories",
                           type=int,
                           default=0,
                           help="max number of node categories to allow")
        group.add_argument("--min-node-label-count",
                           type=int,
                           default=2,
                           help="min number of occurrences for a label")
        add_boolean_option(group, "use-gold-node-labels",
                           "gold node labels when parsing")
        add_boolean_option(group, "wikification",
                           "use Spotlight to wikify any named node")
        constructions.add_argument(argparser)
        add_boolean_option(argparser, "sentences", "split to sentences")
        add_boolean_option(argparser, "paragraphs", "split to paragraphs")
        group = argparser.add_argument_group(title="Training parameters")
        group.add_argument("-t",
                           "--train",
                           nargs="+",
                           default=(),
                           help="passage files/directories to train on")
        group.add_argument("-d",
                           "--dev",
                           nargs="+",
                           default=(),
                           help="passage files/directories to tune on")
        group.add_argument("-I",
                           "--iterations",
                           type=int,
                           default=1,
                           help="number of training iterations")
        group.add_argument("--folds",
                           type=int,
                           choices=(3, 5, 10),
                           help="#folds for cross validation")
        group.add_argument("--seed",
                           type=int,
                           default=1,
                           help="random number generator seed")
        group = argparser.add_argument_group(title="Output files")
        group.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory for parsed files")
        group.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
        add_boolean_option(group,
                           "write",
                           "writing parsed output to files",
                           default=True,
                           short_no="W")
        group.add_argument(
            "-l",
            "--log",
            help="output log file (default: model filename + .log)")
        group.add_argument(
            "--devscores",
            help=
            "output CSV file for dev scores (default: model filename + .dev.csv)"
        )
        group.add_argument(
            "--testscores",
            help=
            "output CSV file for test scores (default: model filename + .test.csv)"
        )
        argparser.add_argument("-f",
                               "--format",
                               choices=FORMATS,
                               help="input and output format")
        argparser.add_argument(
            "--output-format",
            choices=FORMATS,
            help="output format, if different from input format")
        group = argparser.add_argument_group(title="Structural constraints")
        add_boolean_option(group, "linkage", "linkage nodes and edges")
        add_boolean_option(group, "implicit", "implicit nodes and edges")
        add_boolean_option(group, "remote", "remote edges", default=True)
        add_boolean_option(group,
                           "constraints",
                           "scheme-specific rules",
                           default=True)
        add_boolean_option(group, "require-connected",
                           "constraint that output graph must be connected")
        group.add_argument("--orphan-label",
                           default="orphan",
                           help="edge label to use for nodes without parents")
        group.add_argument("--max-action-ratio",
                           type=float,
                           default=100,
                           help="max action/terminal ratio")
        group.add_argument("--max-node-ratio",
                           type=float,
                           default=10,
                           help="max node/terminal ratio")
        group.add_argument("--max-height",
                           type=int,
                           default=20,
                           help="max graph height")
        group = argparser.add_mutually_exclusive_group()
        group.add_argument("--swap",
                           choices=(REGULAR, COMPOUND),
                           default=REGULAR,
                           help="swap transitions")
        group.add_argument("--no-swap",
                           action="store_false",
                           dest="swap",
                           help="exclude swap transitions")
        argparser.add_argument(
            "--max-swap",
            type=int,
            default=15,
            help="if compound swap enabled, maximum swap size")
        group = argparser.add_argument_group(title="Sanity checks")
        add_boolean_option(group, "check-loops", "check for parser state loop")
        add_boolean_option(group, "verify",
                           "check for oracle reproducing original passage")
        group = argparser.add_argument_group(
            title="General classifier training parameters")
        group.add_argument(
            "--learning-rate",
            type=float,
            help="rate for model weight updates (default: by trainer/1)")
        group.add_argument("--learning-rate-decay",
                           type=float,
                           default=0.0,
                           help="learning rate decay per iteration")
        group.add_argument("--swap-importance",
                           type=int,
                           default=1,
                           help="learning rate factor for Swap")
        add_boolean_option(
            group, "early-update",
            "early update procedure (finish example on first error)")
        group.add_argument(
            "--save-every",
            type=int,
            help="every this many passages, evaluate on dev and save model")
        group = argparser.add_argument_group(title="Perceptron parameters")
        group.add_argument("--min-update",
                           type=int,
                           default=5,
                           help="minimum #updates for using a feature")
        self.sparse_arg_names = get_group_arg_names(group)
        group = argparser.add_argument_group(title="Neural network parameters")
        group.add_argument("--word-dim-external",
                           type=int,
                           default=300,
                           help="dimension for external word embeddings")
        group.add_argument(
            "--word-vectors",
            help="file to load external word embeddings from (default: GloVe)")
        add_boolean_option(group, "update-word-vectors",
                           "external word vectors in training parameters")
        group.add_argument("--word-dim",
                           type=int,
                           default=100,
                           help="dimension for learned word embeddings")
        group.add_argument("--tag-dim",
                           type=int,
                           default=10,
                           help="dimension for POS tag embeddings")
        group.add_argument("--dep-dim",
                           type=int,
                           default=10,
                           help="dimension for dependency relation embeddings")
        group.add_argument("--edge-label-dim",
                           type=int,
                           default=20,
                           help="dimension for edge label embeddings")
        group.add_argument("--node-label-dim",
                           type=int,
                           default=0,
                           help="dimension for node label embeddings")
        group.add_argument("--node-category-dim",
                           type=int,
                           default=0,
                           help="dimension for node category embeddings")
        group.add_argument(
            "--punct-dim",
            type=int,
            default=2,
            help="dimension for separator punctuation embeddings")
        group.add_argument("--action-dim",
                           type=int,
                           default=5,
                           help="dimension for input action type embeddings")
        group.add_argument("--ner-dim",
                           type=int,
                           default=5,
                           help="dimension for input entity type embeddings")
        group.add_argument("--output-dim",
                           type=int,
                           default=50,
                           help="dimension for output action embeddings")
        group.add_argument("--layer-dim",
                           type=int,
                           default=500,
                           help="dimension for hidden layers")
        group.add_argument("--layers",
                           type=int,
                           default=2,
                           help="number of hidden layers")
        group.add_argument("--lstm-layer-dim",
                           type=int,
                           default=500,
                           help="dimension for LSTM hidden layers")
        group.add_argument("--lstm-layers",
                           type=int,
                           default=2,
                           help="number of LSTM hidden layers")
        group.add_argument("--embedding-layer-dim",
                           type=int,
                           default=500,
                           help="dimension for layers before LSTM")
        group.add_argument("--embedding-layers",
                           type=int,
                           default=1,
                           help="number of layers before LSTM")
        group.add_argument("--activation",
                           choices=ACTIVATIONS,
                           default=ACTIVATIONS[0],
                           help="activation function")
        group.add_argument("--init",
                           choices=INITIALIZATIONS,
                           default=INITIALIZATIONS[0],
                           help="weight initialization")
        group.add_argument("--minibatch-size",
                           type=int,
                           default=200,
                           help="mini-batch size for optimization")
        group.add_argument("--optimizer",
                           choices=OPTIMIZERS,
                           default=OPTIMIZERS[0],
                           help="algorithm for optimization")
        group.add_argument("--max-words-external",
                           type=int,
                           help="max external word vectors to use")
        group.add_argument("--max-words",
                           type=int,
                           default=10000,
                           help="max number of words to keep embeddings for")
        group.add_argument(
            "--max-tags",
            type=int,
            default=100,
            help="max number of POS tags to keep embeddings for")
        group.add_argument(
            "--max-deps",
            type=int,
            default=100,
            help="max number of dep labels to keep embeddings for")
        group.add_argument("--max-edge-labels",
                           type=int,
                           default=15,
                           help="max number of edge labels for embeddings")
        group.add_argument("--max-puncts",
                           type=int,
                           default=5,
                           help="max number of punctuations for embeddings")
        group.add_argument("--max-action-types",
                           type=int,
                           default=10,
                           help="max number of action types for embeddings")
        group.add_argument("--max-action-labels",
                           type=int,
                           default=100,
                           help="max number of action labels to allow")
        group.add_argument("--max-ner-types",
                           type=int,
                           default=18,
                           help="max number of entity types to allow")
        group.add_argument("--word-dropout",
                           type=float,
                           default=0.25,
                           help="word dropout parameter")
        group.add_argument("--word-dropout-external",
                           type=float,
                           default=0.25,
                           help="word dropout for word vectors")
        group.add_argument("--dropout",
                           type=float,
                           default=0.5,
                           help="dropout parameter between layers")
        group.add_argument("--max-length",
                           type=int,
                           default=120,
                           help="maximum length of input sentence")
        self.nn_arg_names = get_group_arg_names(group)
        group = argparser.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay",
                           type=float,
                           default=1e-6,
                           help="weight decay for parameters")
        add_boolean_option(group, "dynet-gpu", "GPU for training")
        group.add_argument("--dynet-gpus",
                           type=int,
                           default=1,
                           help="how many GPUs you want to use")
        group.add_argument("--dynet-gpu-ids",
                           help="the GPUs that you want to use by device ID")
        add_boolean_option(group, "dynet-viz",
                           "visualization of neural network structure")
        add_boolean_option(group, "dynet-autobatch",
                           "auto-batching of training examples")
        self.dynet_arg_names = get_group_arg_names(group)
        self.args = argparser.parse_args(args if args else None)

        if self.args.model:
            if not self.args.log:
                self.args.log = self.args.model + ".log"
            if self.args.dev and not self.args.devscores:
                self.args.devscores = self.args.model + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = self.args.model + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"
        if self.args.format == "amr":
            self.node_labels = True
            self.args.implicit = True
            if not self.args.node_label_dim:
                self.args.node_label_dim = 20
            if not self.args.max_node_labels:
                self.args.max_node_labels = 1000
            if not self.args.node_category_dim:
                self.args.node_category_dim = 5
            if not self.args.max_node_categories:
                self.args.max_node_categories = 25
            self.args.max_action_labels = max(self.args.max_action_labels, 600)
            self.args.max_edge_labels = max(self.args.max_edge_labels, 110)
        else:
            self.node_labels = False
            self.args.node_label_dim = self.args.max_node_labels = \
                self.args.node_category_dim = self.args.max_node_categories = 0
        self.input_converter, self.output_converter = CONVERTERS.get(
            self.args.format, (None, None))
        if self.args.output_format:
            _, self.output_converter = CONVERTERS.get(self.args.output_format,
                                                      (None, None))
        else:
            self.args.output_format = self.args.format
        if self.output_converter is not None:
            self.output_converter = partial(
                self.output_converter, wikification=self.args.wikification)
        self._logger = None
        self.set_external()
        self.random = np.random
예제 #9
0
파일: config.py 프로젝트: ml-lab/tupa
    def __init__(self, *args):
        """Build the command-line parser, parse options and fill in file-name defaults.

        :param args: optional argument strings to parse; when none are given,
                     ``None`` is passed to ``parse_args`` so argparse reads the
                     process command line instead.

        After parsing, the result is stored in ``self.args``. If a model file
        name was given, missing log / dev-score / test-score file names are
        derived from it; otherwise the log file defaults to ``parse.log``.
        """
        argparser = argparse.ArgumentParser(
            description="""Transition-based parser for UCCA.""",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

        # --- General options -------------------------------------------------
        argparser.add_argument(
            "passages",
            nargs="*",
            help="passage files/directories to test on/parse")
        argparser.add_argument(
            "-m",
            "--model",
            help="model file to load/save (default: ucca_<model_type>")
        argparser.add_argument("-c",
                               "--classifier",
                               choices=CLASSIFIERS,
                               default=SPARSE_PERCEPTRON,
                               help="model type")
        # type=int is required here: argparse validates the *converted* value
        # against `choices`, so without it "-B 1" arrives as the string "1",
        # fails the membership test against (1,) and is rejected.
        argparser.add_argument("-B",
                               "--beam",
                               type=int,
                               choices=(1, ),
                               default=1,
                               help="beam size for beam search (1 for greedy)")
        argparser.add_argument("-e",
                               "--evaluate",
                               action="store_true",
                               help="evaluate parsed passages")
        argparser.add_argument("-v",
                               "--verbose",
                               nargs="?",
                               action=VAction,
                               default=0,
                               help="detailed parse output")
        constructions.add_argument(argparser)

        # Sentence and paragraph splitting cannot be requested together.
        group = argparser.add_mutually_exclusive_group()
        group.add_argument("-s",
                           "--sentences",
                           action="store_true",
                           help="separate passages to sentences")
        group.add_argument("-a",
                           "--paragraphs",
                           action="store_true",
                           help="separate passages to paragraphs")

        # --- Training parameters ---------------------------------------------
        group = argparser.add_argument_group(title="Training parameters")
        group.add_argument("-t",
                           "--train",
                           nargs="+",
                           default=(),
                           help="passage files/directories to train on")
        group.add_argument("-d",
                           "--dev",
                           nargs="+",
                           default=(),
                           help="passage files/directories to tune on")
        group.add_argument("-I",
                           "--iterations",
                           type=int,
                           default=1,
                           help="number of training iterations")
        group.add_argument("--folds",
                           type=int,
                           choices=(3, 5, 10),
                           help="#folds for cross validation")
        group.add_argument("--seed",
                           type=int,
                           default=1,
                           help="random number generator seed")

        # --- Output files ----------------------------------------------------
        group = argparser.add_argument_group(title="Output files")
        group.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory for parsed files")
        group.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
        group.add_argument("-W",
                           "--no-write",
                           action="store_true",
                           help="do not write parsed passages to files")
        group.add_argument(
            "-l",
            "--log",
            help="output log file (default: model filename + .log)")
        group.add_argument(
            "--devscores",
            help=
            "output CSV file for dev scores (default: model filename + .dev.csv)"
        )
        group.add_argument(
            "--testscores",
            help=
            "output CSV file for test scores (default: model filename + .test.csv)"
        )

        # --- Structural constraints ------------------------------------------
        group = argparser.add_argument_group(title="Structural constraints")
        group.add_argument("--linkage",
                           action="store_true",
                           help="include linkage nodes and edges")
        group.add_argument("--implicit",
                           action="store_true",
                           help="include implicit nodes and edges")
        # The "--no-*" flags below store False into a positively named
        # destination, so the corresponding feature is on unless disabled.
        group.add_argument("--no-remote",
                           action="store_false",
                           dest="remote",
                           help="ignore remote edges")
        group.add_argument("--no-constraints",
                           action="store_false",
                           dest="constraints",
                           help="ignore UCCA rules")
        group.add_argument("--max-nodes",
                           type=float,
                           default=3.0,
                           help="max non-terminal/terminal ratio")
        group.add_argument("--max-height",
                           type=int,
                           default=20,
                           help="max graph height")

        # Disabling swap and enabling compound swap are mutually exclusive.
        group = argparser.add_mutually_exclusive_group()
        group.add_argument("--no-swap",
                           action="store_false",
                           dest="swap",
                           help="disable Swap transitions entirely")
        group.add_argument("--compound-swap",
                           action="store_true",
                           help="enable compound swap")

        # --- Sanity checks ---------------------------------------------------
        group = argparser.add_argument_group(title="Sanity checks")
        group.add_argument("--check-loops",
                           action="store_true",
                           help="abort if the parser enters a state loop")
        group.add_argument("--verify",
                           action="store_true",
                           help="verify oracle reproduces original passage")
        # Binary (Pickle) I/O and an explicit output format exclude each other;
        # the exclusive group is nested inside the group created just above.
        group = group.add_mutually_exclusive_group()
        group.add_argument("-b",
                           "--binary",
                           action="store_true",
                           help="read and write passages in Pickle")
        group.add_argument("-f",
                           "--format",
                           choices=convert.CONVERTERS,
                           help="output format for parsed files")

        # --- General classifier training parameters --------------------------
        group = argparser.add_argument_group(
            title="General classifier training parameters")
        group.add_argument("--swap-importance",
                           type=int,
                           default=1,
                           help="learning rate factor for Swap")
        group.add_argument("--early-update",
                           action="store_true",
                           help="move to next example on incorrect prediction")
        group.add_argument("--word-dim-external",
                           type=int,
                           default=300,
                           help="dimension for external word embeddings")
        group.add_argument(
            "--word-vectors",
            help="file to load external word embeddings from (default: GloVe)")

        # --- Perceptron parameters -------------------------------------------
        group = argparser.add_argument_group(title="Perceptron parameters")
        group.add_argument("--learning-rate",
                           type=float,
                           default=1.0,
                           help="rate for model weight updates")
        group.add_argument("--learning-rate-decay",
                           type=float,
                           default=0.0,
                           help="learning rate decay per iteration")
        group.add_argument("--min-update",
                           type=int,
                           default=5,
                           help="minimum #updates for using a feature")

        # --- Neural network parameters ---------------------------------------
        group = argparser.add_argument_group(title="Neural network parameters")
        group.add_argument("--update-word-vectors",
                           action="store_true",
                           help="tune the external word embeddings")
        group.add_argument("--word-dim",
                           type=int,
                           default=100,
                           help="dimension for learned word embeddings")
        group.add_argument("--tag-dim",
                           type=int,
                           default=10,
                           help="dimension for POS tag embeddings")
        group.add_argument("--dep-dim",
                           type=int,
                           default=10,
                           help="dimension for dependency relation embeddings")
        group.add_argument("--label-dim",
                           type=int,
                           default=20,
                           help="dimension for edge label embeddings")
        group.add_argument(
            "--punct-dim",
            type=int,
            default=2,
            help="dimension for separator punctuation embeddings")
        group.add_argument("--gap-dim",
                           type=int,
                           default=2,
                           help="dimension for gap type embeddings")
        group.add_argument("--action-dim",
                           type=int,
                           default=5,
                           help="dimension for action type embeddings")
        group.add_argument("--layer-dim",
                           type=int,
                           default=500,
                           help="dimension for hidden layers")
        group.add_argument("--layers",
                           type=int,
                           default=2,
                           help="number of hidden layers")
        group.add_argument("--lstm-layer-dim",
                           type=int,
                           default=500,
                           help="dimension for LSTM hidden layers")
        group.add_argument("--lstm-layers",
                           type=int,
                           default=2,
                           help="number of LSTM hidden layers")
        group.add_argument("--embedding-layer-dim",
                           type=int,
                           default=500,
                           help="dimension for layers before LSTM")
        group.add_argument("--embedding-layers",
                           type=int,
                           default=1,
                           help="number of layers before LSTM")
        group.add_argument("--activation",
                           choices=ACTIVATIONS,
                           default=ACTIVATIONS[0],
                           help="activation function")
        group.add_argument("--init",
                           choices=INITIALIZATIONS,
                           default=INITIALIZATIONS[0],
                           help="weight initialization")
        group.add_argument("--max-labels",
                           type=int,
                           default=100,
                           help="max number of actions to allow")
        group.add_argument(
            "--save-every",
            type=int,
            help="every this many passages, evaluate on dev and save model")
        group.add_argument("--minibatch-size",
                           type=int,
                           default=200,
                           help="mini-batch size for optimization")
        group.add_argument("--optimizer",
                           choices=OPTIMIZERS,
                           default=OPTIMIZERS[0],
                           help="algorithm for optimization")
        group.add_argument("--max-words-external",
                           type=int,
                           help="max external word vectors to use")
        group.add_argument("--max-words",
                           type=int,
                           default=10000,
                           help="max number of words to keep embeddings for")
        group.add_argument(
            "--max-tags",
            type=int,
            default=100,
            help="max number of POS tags to keep embeddings for")
        group.add_argument(
            "--max-deps",
            type=int,
            default=100,
            help="max number of dep labels to keep embeddings for")
        group.add_argument("--max-edge-labels",
                           type=int,
                           default=15,
                           help="max number of edge labels for embeddings")
        group.add_argument("--max-puncts",
                           type=int,
                           default=5,
                           help="max number of punctuations for embeddings")
        group.add_argument(
            "--max-gaps",
            type=int,
            default=3,
            help="max number of gap types to keep embeddings for")
        group.add_argument("--max-actions",
                           type=int,
                           default=10,
                           help="max number of action types for embeddings")
        group.add_argument("--word-dropout",
                           type=float,
                           default=0.25,
                           help="word dropout parameter")
        group.add_argument("--word-dropout-external",
                           type=float,
                           default=0.25,
                           help="word dropout for word vectors")
        group.add_argument("--dropout",
                           type=float,
                           default=0.5,
                           help="dropout parameter between layers")

        # --- DyNet parameters ------------------------------------------------
        group = argparser.add_argument_group(title="DyNet parameters")
        group.add_argument("--dynet-mem", help="memory for dynet")
        group.add_argument("--dynet-weight-decay",
                           type=float,
                           help="weight decay for parameters (default 1e-6)")
        group.add_argument("--dynet-gpu",
                           action="store_true",
                           help="use the GPU")
        group.add_argument("--dynet-gpus",
                           type=int,
                           help="how many GPUs you want to use")
        group.add_argument("--dynet-gpu-ids",
                           help="the GPUs that you want to use by device ID")
        group.add_argument("--dynet-viz",
                           action="store_true",
                           help="visualize NN and exit")

        # An empty *args tuple is replaced by None so that argparse falls back
        # to the process command line.
        self.args = argparser.parse_args(args if args else None)

        # Derive unspecified output file names from the model file name.
        if self.args.model:
            if not self.args.log:
                self.args.log = self.args.model + ".log"
            # Score files are only defaulted when the matching data was given.
            if self.args.dev and not self.args.devscores:
                self.args.devscores = self.args.model + ".dev.csv"
            if self.args.passages and not self.args.testscores:
                self.args.testscores = self.args.model + ".test.csv"
        elif not self.args.log:
            self.args.log = "parse.log"

        self._log_file = None
        self.set_external()
        self.random = np.random