# NOTE(review): the statements below are the tail of a function (presumably main(args))
# whose "def" line is outside this view; code left untouched, comments only.
# Fetch annotation XML trees either by explicit Xids or by passage ID + usernames.
xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \
    api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys)
# Convert the two site-format XML trees; unpacks exactly two passages: guessed and reference.
guessed, ref = [convert.from_site(x) for x in xmls]
# Only run the evaluation when at least one report type was requested.
if args.units or args.fscore or args.errors:
    evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors,
             constructions=args.constructions, verbose=True)


if __name__ == '__main__':
    # Command-line interface: evaluate annotated passages stored in a UCCA DB.
    argparser = ArgumentParser(description="Evaluate passages on UCCA DB")
    argparser.add_argument("--db", "-d", required=True, dest="db_filename", help="the db file name")
    # NOTE(review): "--hst" looks like a typo'd alias for "--host" -- confirm before changing.
    argparser.add_argument("--host", "--hst", help="the host name")
    # Either a passage ID or Xid interpretation, not both.
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-p", "--pid", type=int, help="the passage ID")
    group.add_argument("-x", "--from_xids", action="store_true",
                       help="interpret the ref and the guessed parameters as Xids in the db")
    argparser.add_argument("--guessed", "-g", required=True,
                           help="if a db is defined - the username for the guessed annotation; "
                                "else - the xml file name for the guessed annotation")
    argparser.add_argument("-r", "--ref", required=True,
                           help="if a db is defined - the username for the reference annotation; "
                                "else - the xml file name for the reference annotation")
    argparser.add_argument("-u", "--units", action="store_true",
                           help="the units the annotations have in common, and those each has separately")
    argparser.add_argument("-f", "--fscore", action="store_true",
                           help="outputs the traditional P,R,F instead of the scene structure evaluation")
    argparser.add_argument("-e", "--errors", action="store_true",
                           help="prints the error distribution according to its frequency")
    constructions.add_argument(argparser)
    main(argparser.parse_args())
"""Extract linguistic constructions from a UCCA corpus and print them per passage."""
from argparse import ArgumentParser
from ucca import constructions
from ucca.ioutil import read_files_and_dirs


def _report(passage, args):
    """Print every requested construction found in *passage*, grouped by construction."""
    if args.verbose:
        print("%s:" % passage.ID)
    extracted = constructions.extract_edges(passage, constructions=args.constructions, verbose=args.verbose)
    if any(extracted.values()):
        if not args.verbose:
            # Header was not printed yet in non-verbose mode.
            print("%s:" % passage.ID)
        for construction, edges in extracted.items():
            if edges:
                print(" %s:" % construction.description)
                for edge in edges:
                    print(" %s [%s %s]" % (edge, edge.tag, edge.child))
        print()


if __name__ == "__main__":
    argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    constructions.add_argument(argparser, False)
    argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
    args = argparser.parse_args()
    for passage in read_files_and_dirs(args.passages):
        _report(passage, args)
def __init__(self, *args):
    """Build the command-line argument parser for the transition-based UCCA parser
    (BERT-enabled variant) and initialize configuration state from the parsed arguments.

    :param args: optional explicit command-line arguments; when empty, parse_args
                 receives None so argparse falls back to sys.argv
    """
    self.arg_parser = ap = ArgParser(description="Transition-based parser for UCCA.",
                                     formatter_class=ArgumentDefaultsHelpFormatter)
    # BERT embedding options
    add_boolean_option(ap, "use-bert", default=False, description="whether to use bert embeddings")
    ap.add_argument("--bert-model", choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                             "bert-large-cased", "bert-base-multilingual-cased"],
                    default="bert-base-multilingual-cased")
    ap.add_argument("--bert-layers", type=int, nargs='+', default=[-1, -2, -3, -4])
    ap.add_argument("--bert-layers-pooling", choices=["weighted", "sum", "concat"], default="weighted")
    ap.add_argument("--bert-token-align-by", choices=["first", "sum", "mean"], default="sum")
    ap.add_argument("--bert-multilingual", choices=[0], type=int)
    add_boolean_option(ap, "bert-use-default-word-embeddings", default=False,
                       description="whether to use default word embeddings")
    # NOTE(review): argparse checks float choices by exact equality; linspace values
    # such as 0.3 can fail to match user input due to float rounding -- confirm intended.
    ap.add_argument("--bert-dropout", type=float, default=0, choices=np.linspace(0, 0.9, num=10))
    # General options
    ap.add_argument("passages", nargs="*", help="passage files/directories to test on/parse")
    ap.add_argument("--version", action="version", version="")
    ap.add_argument("-C", "--config", is_config_file=True, help="configuration file to get arguments from")
    # Fixed: help text was missing the closing ")" after <model_type>.
    ap.add_argument("-m", "--models", nargs="+",
                    help="model file basename(s) to load/save, ensemble if >1 "
                         "(default: <format>_<model_type>)")
    ap.add_argument("-c", "--classifier", choices=CLASSIFIERS, default=BIRNN, help="model type")
    ap.add_argument("-B", "--beam", type=int, choices=(1,), default=1, help="beam size for beam search")
    add_boolean_option(ap, "evaluate", "evaluation of parsed passages", short="e")
    add_verbose_arg(ap, help="detailed parse output")
    constructions.add_argument(ap)
    add_boolean_option(ap, "sentences", "split to sentences")
    add_boolean_option(ap, "paragraphs", "split to paragraphs")
    ap.add_argument("--timeout", type=float, help="max number of seconds to wait for a single passage")
    group = ap.add_argument_group(title="Training parameters")
    group.add_argument("-t", "--train", nargs="+", default=(), help="passage files/directories to train on")
    group.add_argument("-d", "--dev", nargs="+", default=(), help="passage files/directories to tune on")
    group.add_argument("-I", "--iterations", nargs="+", type=Iterations,
                       default=(Iterations(50), Iterations("100 --optimizer=" + EXTRA_TRAINER)),
                       help="number of training iterations along with optional hyperparameters per part")
    group.add_argument("--folds", type=int, choices=(3, 5, 10), help="#folds for cross validation")
    group.add_argument("--seed", type=int, default=1, help="random number generator seed")
    add_boolean_option(group, "early-update", "early update procedure (finish example on first error)")
    group.add_argument("--save-every", type=int, help="every this many passages, evaluate on dev and save model")
    add_boolean_option(group, "eval-test", "evaluate on test whenever evaluating on dev, but keep results hidden")
    add_boolean_option(group, "ignore-case", "pre-convert all input files to lower-case in training and test")
    group = ap.add_argument_group(title="Output files")
    group.add_argument("-o", "--outdir", default=".", help="output directory for parsed files")
    group.add_argument("-p", "--prefix", default="", help="output filename prefix")
    add_boolean_option(group, "write", "writing parsed output to files", default=True, short_no="W")
    group.add_argument("-j", "--join", help="if output format is textual, write all to one file with this basename")
    group.add_argument("-l", "--log", help="output log file (default: model filename + .log)")
    group.add_argument("--devscores", help="output CSV file for dev scores (default: model filename + .dev.csv)")
    group.add_argument("--testscores", help="output CSV file for test scores (default: model filename + .test.csv)")
    group.add_argument("--action-stats", help="output CSV file for action statistics")
    add_boolean_option(group, "normalize", "apply normalizations to output in case format is UCCA", default=False)
    ap.add_argument("-f", "--formats", nargs="+", choices=FILE_FORMATS, default=(),
                    help="input formats for creating all parameters before training starts "
                         "(otherwise created dynamically based on filename suffix), "
                         "and output formats for written files (each will be written; default: UCCA XML)")
    ap.add_argument("-u", "--unlabeled", nargs="*", choices=FORMATS, help="to ignore labels in")
    ap.add_argument("--lang", default="en", help="two-letter language code to use as the default language")
    add_boolean_option(ap, "multilingual", "separate model parameters per language (passage.attrib['lang'])")
    group = ap.add_argument_group(title="Sanity checks")
    add_boolean_option(group, "check-loops", "check for parser state loop")
    add_boolean_option(group, "verify", "check for oracle reproducing original passage")
    add_boolean_option(group, "validate-oracle", "require oracle output to respect constraints", default=True)
    add_param_arguments(ap)
    group = ap.add_argument_group(title="DyNet parameters")
    group.add_argument("--dynet-mem", help="memory for dynet")
    group.add_argument("--dynet-weight-decay", type=float, default=1e-5, help="weight decay for parameters")
    add_boolean_option(group, "dynet-apply-weight-decay-on-load", "workaround for clab/dynet#1206", default=False)
    add_boolean_option(group, "dynet-gpu", "GPU for training")
    group.add_argument("--dynet-gpus", type=int, default=1, help="how many GPUs you want to use")
    add_boolean_option(group, "dynet-autobatch", "auto-batching of training examples")
    add_boolean_option(group, "dynet-check-validity", "check validity of expressions immediately")
    # Remember which option names belong to DyNet so they can be forwarded separately.
    DYNET_ARG_NAMES.update(get_group_arg_names(group))
    ap.add_argument("-H", "--hyperparams", type=HyperparamsInitializer.action, nargs="*",
                    help="shared hyperparameters or hyperparameters for specific formats, "
                         'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
                    default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
    ap.add_argument("--copy-shared", nargs="*", choices=FORMATS, help="formats whose parameters shall be "
                                                                     "copied from loaded shared parameters")
    # Empty *args falls back to None so argparse reads sys.argv.
    self.args = FallbackNamespace(ap.parse_args(args if args else None))
    if self.args.config:
        print("Loading configuration from '%s'." % self.args.config)
    if self.args.passages and self.args.write:
        os.makedirs(self.args.outdir, exist_ok=True)
    # Derive default output filenames from the first model basename, if any.
    if self.args.models:
        if not self.args.log:
            self.args.log = self.args.models[0] + ".log"
        if self.args.dev and not self.args.devscores:
            self.args.devscores = self.args.models[0] + ".dev.csv"
        if self.args.passages and not self.args.testscores:
            self.args.testscores = self.args.models[0] + ".test.csv"
    elif not self.args.log:
        self.args.log = "parse.log"
    self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
    self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
    self._vocab = {}
    self.original_values = {}
    self.random = np.random
    self.update()
def __init__(self, *args):
    """Build the command-line argument parser for the transition-based UCCA parser
    and initialize configuration state from the parsed arguments.

    :param args: optional explicit command-line arguments; when empty, parse_args
                 receives None so argparse falls back to sys.argv
    """
    self.arg_parser = ap = ArgParser(description="Transition-based parser for UCCA.",
                                     formatter_class=ArgumentDefaultsHelpFormatter)
    ap.add_argument("passages", nargs="*", help="passage files/directories to test on/parse")
    ap.add_argument("--version", action="version", version="")
    ap.add_argument("-C", "--config", is_config_file=True, help="configuration file to get arguments from")
    # Fixed: help text was missing the closing ")" after <model_type>.
    ap.add_argument("-m", "--models", nargs="+",
                    help="model file basename(s) to load/save, ensemble if >1 "
                         "(default: <format>_<model_type>)")
    ap.add_argument("-c", "--classifier", choices=CLASSIFIERS, default=BIRNN, help="model type")
    ap.add_argument("-B", "--beam", type=int, choices=(1,), default=1, help="beam size for beam search")
    add_boolean_option(ap, "evaluate", "evaluation of parsed passages", short="e")
    add_verbose_arg(ap, help="detailed parse output")
    constructions.add_argument(ap)
    add_boolean_option(ap, "sentences", "split to sentences")
    add_boolean_option(ap, "paragraphs", "split to paragraphs")
    ap.add_argument("--timeout", type=float, help="max number of seconds to wait for a single passage")
    group = ap.add_argument_group(title="Training parameters")
    group.add_argument("-t", "--train", nargs="+", default=(), help="passage files/directories to train on")
    group.add_argument("-d", "--dev", nargs="+", default=(), help="passage files/directories to tune on")
    group.add_argument("-I", "--iterations", nargs="+", type=Iterations,
                       default=(Iterations(50), Iterations("100 --optimizer=" + EXTRA_TRAINER)),
                       help="number of training iterations along with optional hyperparameters per part")
    group.add_argument("--folds", type=int, choices=(3, 5, 10), help="#folds for cross validation")
    group.add_argument("--seed", type=int, default=1, help="random number generator seed")
    add_boolean_option(group, "early-update", "early update procedure (finish example on first error)")
    group.add_argument("--save-every", type=int, help="every this many passages, evaluate on dev and save model")
    add_boolean_option(group, "eval-test", "evaluate on test whenever evaluating on dev, but keep results hidden")
    add_boolean_option(group, "ignore-case", "pre-convert all input files to lower-case in training and test")
    group = ap.add_argument_group(title="Output files")
    group.add_argument("-o", "--outdir", default=".", help="output directory for parsed files")
    group.add_argument("-p", "--prefix", default="", help="output filename prefix")
    add_boolean_option(group, "write", "writing parsed output to files", default=True, short_no="W")
    group.add_argument("-j", "--join", help="if output format is textual, write all to one file with this basename")
    group.add_argument("-l", "--log", help="output log file (default: model filename + .log)")
    group.add_argument("--devscores", help="output CSV file for dev scores (default: model filename + .dev.csv)")
    group.add_argument("--testscores", help="output CSV file for test scores (default: model filename + .test.csv)")
    group.add_argument("--action-stats", help="output CSV file for action statistics")
    add_boolean_option(group, "normalize", "apply normalizations to output in case format is UCCA", default=False)
    ap.add_argument("-f", "--formats", nargs="+", choices=FILE_FORMATS, default=(),
                    help="input formats for creating all parameters before training starts "
                         "(otherwise created dynamically based on filename suffix), "
                         "and output formats for written files (each will be written; default: UCCA XML)")
    ap.add_argument("-u", "--unlabeled", nargs="*", choices=FORMATS, help="to ignore labels in")
    ap.add_argument("--lang", default="en", help="two-letter language code to use as the default language")
    add_boolean_option(ap, "multilingual", "separate model parameters per language (passage.attrib['lang'])")
    group = ap.add_argument_group(title="Sanity checks")
    add_boolean_option(group, "check-loops", "check for parser state loop")
    add_boolean_option(group, "verify", "check for oracle reproducing original passage")
    add_boolean_option(group, "validate-oracle", "require oracle output to respect constraints", default=True)
    add_param_arguments(ap)
    group = ap.add_argument_group(title="DyNet parameters")
    group.add_argument("--dynet-mem", help="memory for dynet")
    group.add_argument("--dynet-weight-decay", type=float, default=1e-5, help="weight decay for parameters")
    add_boolean_option(group, "dynet-apply-weight-decay-on-load", "workaround for clab/dynet#1206", default=False)
    add_boolean_option(group, "dynet-gpu", "GPU for training")
    group.add_argument("--dynet-gpus", type=int, default=1, help="how many GPUs you want to use")
    add_boolean_option(group, "dynet-autobatch", "auto-batching of training examples")
    # Remember which option names belong to DyNet so they can be forwarded separately.
    DYNET_ARG_NAMES.update(get_group_arg_names(group))
    ap.add_argument("-H", "--hyperparams", type=HyperparamsInitializer.action, nargs="*",
                    help="shared hyperparameters or hyperparameters for specific formats, "
                         'e.g., "shared --lstm-layer-dim=100 --lstm-layers=1" "ucca --word-dim=300"',
                    default=[HyperparamsInitializer.action("shared --lstm-layers 2")])
    ap.add_argument("--copy-shared", nargs="*", choices=FORMATS, help="formats whose parameters shall be "
                                                                     "copied from loaded shared parameters")
    # Empty *args falls back to None so argparse reads sys.argv.
    self.args = FallbackNamespace(ap.parse_args(args if args else None))
    if self.args.config:
        print("Loading configuration from '%s'." % self.args.config)
    if self.args.passages and self.args.write:
        os.makedirs(self.args.outdir, exist_ok=True)
    # Derive default output filenames from the first model basename, if any.
    if self.args.models:
        if not self.args.log:
            self.args.log = self.args.models[0] + ".log"
        if self.args.dev and not self.args.devscores:
            self.args.devscores = self.args.models[0] + ".dev.csv"
        if self.args.passages and not self.args.testscores:
            self.args.testscores = self.args.models[0] + ".test.csv"
    elif not self.args.log:
        self.args.log = "parse.log"
    self.sub_configs = []  # Copies to be stored in Models so that they do not interfere with each other
    self._logger = self.format = self.hyperparams = self.iteration_hyperparams = None
    self._vocab = {}
    self.original_values = {}
    self.random = np.random
    self.update()
help="file to write aggregated counts to, in CSV format") add_boolean_option(argparser, "unlabeled", "print unlabeled F1 for individual passages", short="u") add_boolean_option(argparser, "enhanced", "read enhanced dependencies", default=True) add_boolean_option(argparser, "normalize", "normalize passages before evaluation", short="N", default=True) add_boolean_option(argparser, "matching-ids", "skip passages without a match (by ID)", short="i") add_boolean_option(argparser, "basename", "force passage ID to be file basename", short="b") add_boolean_option(argparser, "units", "print mutual and unique units") add_boolean_option(argparser, "errors", "print confusion matrix with error distribution") group = argparser.add_mutually_exclusive_group() add_verbose_arg(group, help="detailed evaluation output") add_boolean_option(group, "quiet", "do not print anything", short="q") ucca_constructions.add_argument(argparser) main(argparser.parse_args())
# NOTE(review): the statements below are the tail of a function whose "def" line is
# outside this view; indentation was reconstructed from a collapsed source, so the
# loop nesting is a best guess. Code left untouched, comments only.
words = {}  # NOTE(review): never read or written again in this view -- possibly dead
xmltoconll(passage)
t = split2sentences(passage)
i = 0
# Print each sentence of the passage as plain text, numbered from 0.
for sen in t:
    print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
    i += 1
# Interactive lookup loop.
# NOTE(review): `while (1)` never terminates except via EOF/KeyboardInterrupt.
while (1):
    word = input('\nType the word below\n\n')
    for node in passage.nodes:
        t = passage.nodes[node]
        # NOTE(review): re.match anchors at the start of t.text, so the word is only
        # found when it appears at the beginning; re.search may have been intended.
        # Also assumes every node has a .text attribute -- verify for non-terminal nodes.
        if (re.match(rf'\b{word}\b', t.text, re.IGNORECASE)):
            #print('Word: %s\nWord ID: %s' %(t.text,t.ID))
            #ans = input('\nDo you want to continue with wordi Id : %s', t.ID)
            path = []
            path = find_path(passage.nodes[t.ID], path)
            break
    # NOTE(review): if no node matched, this prints the previous query's path,
    # or raises NameError on the very first query.
    print(' '.join(path))


if __name__ == "__main__":
    # CLI: convert xml to conll and find the path of a word in a UCCA passage.
    argparser = ArgumentParser(description="Xml to conll and find the path of the word from UCCA xml file.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    add_argument(argparser, False)
    main(argparser.parse_args())
"""Command-line tool: list linguistic constructions found in each UCCA passage."""
from argparse import ArgumentParser
from ucca import constructions
from ucca.ioutil import read_files_and_dirs

if __name__ == "__main__":
    parser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.")
    parser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    constructions.add_argument(parser, False)
    parser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
    opts = parser.parse_args()
    for passage in read_files_and_dirs(opts.passages):
        if opts.verbose:
            print("%s:" % passage.ID)
        per_construction = constructions.extract_edges(passage, constructions=opts.constructions,
                                                       verbose=opts.verbose)
        # Skip passages where nothing was found at all.
        if not any(per_construction.values()):
            continue
        if not opts.verbose:
            # Header was not printed yet in non-verbose mode.
            print("%s:" % passage.ID)
        for construction, edges in per_construction.items():
            if not edges:
                continue
            print(" %s:" % construction.description)
            for edge in edges:
                print(" %s [%s %s]" % (edge, edge.tag, edge.child))
        print()
def __init__(self, *args):
    """Build the command-line argument parser for the transition-based UCCA parser
    (older, single-model variant) and initialize configuration state, including
    AMR-specific defaults and input/output converters.

    :param args: optional explicit command-line arguments; when empty, parse_args
                 receives None so argparse falls back to sys.argv
    """
    argparser = argparse.ArgumentParser(description="""Transition-based parser for UCCA.""",
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument("passages", nargs="*", help="passage files/directories to test on/parse")
    # Fixed: help text was missing the closing ")" after <model_type>.
    argparser.add_argument("-m", "--model",
                           help="model file basename to load/save (default: <format>_<model_type>)")
    argparser.add_argument("-c", "--classifier", choices=CLASSIFIERS, default=SPARSE, help="model type")
    argparser.add_argument("-B", "--beam", type=int, choices=(1,), default=1, help="beam size for beam search")
    add_boolean_option(argparser, "evaluate", "evaluation of parsed passages", short="e")
    add_verbose_argument(argparser, help="detailed parse output")
    group = argparser.add_argument_group(title="Node labels")
    group.add_argument("--max-node-labels", type=int, default=0, help="max number of node labels to allow")
    group.add_argument("--max-node-categories", type=int, default=0, help="max number of node categories to allow")
    group.add_argument("--min-node-label-count", type=int, default=2, help="min number of occurrences for a label")
    add_boolean_option(group, "use-gold-node-labels", "gold node labels when parsing")
    add_boolean_option(group, "wikification", "use Spotlight to wikify any named node")
    constructions.add_argument(argparser)
    add_boolean_option(argparser, "sentences", "split to sentences")
    add_boolean_option(argparser, "paragraphs", "split to paragraphs")
    group = argparser.add_argument_group(title="Training parameters")
    group.add_argument("-t", "--train", nargs="+", default=(), help="passage files/directories to train on")
    group.add_argument("-d", "--dev", nargs="+", default=(), help="passage files/directories to tune on")
    group.add_argument("-I", "--iterations", type=int, default=1, help="number of training iterations")
    group.add_argument("--folds", type=int, choices=(3, 5, 10), help="#folds for cross validation")
    group.add_argument("--seed", type=int, default=1, help="random number generator seed")
    group = argparser.add_argument_group(title="Output files")
    group.add_argument("-o", "--outdir", default=".", help="output directory for parsed files")
    group.add_argument("-p", "--prefix", default="", help="output filename prefix")
    add_boolean_option(group, "write", "writing parsed output to files", default=True, short_no="W")
    group.add_argument("-l", "--log", help="output log file (default: model filename + .log)")
    group.add_argument("--devscores", help="output CSV file for dev scores (default: model filename + .dev.csv)")
    group.add_argument("--testscores", help="output CSV file for test scores (default: model filename + .test.csv)")
    argparser.add_argument("-f", "--format", choices=FORMATS, help="input and output format")
    argparser.add_argument("--output-format", choices=FORMATS,
                           help="output format, if different from input format")
    group = argparser.add_argument_group(title="Structural constraints")
    add_boolean_option(group, "linkage", "linkage nodes and edges")
    add_boolean_option(group, "implicit", "implicit nodes and edges")
    add_boolean_option(group, "remote", "remote edges", default=True)
    add_boolean_option(group, "constraints", "scheme-specific rules", default=True)
    add_boolean_option(group, "require-connected", "constraint that output graph must be connected")
    group.add_argument("--orphan-label", default="orphan", help="edge label to use for nodes without parents")
    group.add_argument("--max-action-ratio", type=float, default=100, help="max action/terminal ratio")
    group.add_argument("--max-node-ratio", type=float, default=10, help="max node/terminal ratio")
    group.add_argument("--max-height", type=int, default=20, help="max graph height")
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("--swap", choices=(REGULAR, COMPOUND), default=REGULAR, help="swap transitions")
    group.add_argument("--no-swap", action="store_false", dest="swap", help="exclude swap transitions")
    argparser.add_argument("--max-swap", type=int, default=15, help="if compound swap enabled, maximum swap size")
    group = argparser.add_argument_group(title="Sanity checks")
    add_boolean_option(group, "check-loops", "check for parser state loop")
    add_boolean_option(group, "verify", "check for oracle reproducing original passage")
    group = argparser.add_argument_group(title="General classifier training parameters")
    group.add_argument("--learning-rate", type=float, help="rate for model weight updates (default: by trainer/1)")
    group.add_argument("--learning-rate-decay", type=float, default=0.0, help="learning rate decay per iteration")
    group.add_argument("--swap-importance", type=int, default=1, help="learning rate factor for Swap")
    add_boolean_option(group, "early-update", "early update procedure (finish example on first error)")
    group.add_argument("--save-every", type=int, help="every this many passages, evaluate on dev and save model")
    group = argparser.add_argument_group(title="Perceptron parameters")
    group.add_argument("--min-update", type=int, default=5, help="minimum #updates for using a feature")
    # Remember which options belong to the sparse-perceptron model.
    self.sparse_arg_names = get_group_arg_names(group)
    group = argparser.add_argument_group(title="Neural network parameters")
    group.add_argument("--word-dim-external", type=int, default=300, help="dimension for external word embeddings")
    group.add_argument("--word-vectors", help="file to load external word embeddings from (default: GloVe)")
    add_boolean_option(group, "update-word-vectors", "external word vectors in training parameters")
    group.add_argument("--word-dim", type=int, default=100, help="dimension for learned word embeddings")
    group.add_argument("--tag-dim", type=int, default=10, help="dimension for POS tag embeddings")
    group.add_argument("--dep-dim", type=int, default=10, help="dimension for dependency relation embeddings")
    group.add_argument("--edge-label-dim", type=int, default=20, help="dimension for edge label embeddings")
    group.add_argument("--node-label-dim", type=int, default=0, help="dimension for node label embeddings")
    group.add_argument("--node-category-dim", type=int, default=0, help="dimension for node category embeddings")
    group.add_argument("--punct-dim", type=int, default=2, help="dimension for separator punctuation embeddings")
    group.add_argument("--action-dim", type=int, default=5, help="dimension for input action type embeddings")
    group.add_argument("--ner-dim", type=int, default=5, help="dimension for input entity type embeddings")
    group.add_argument("--output-dim", type=int, default=50, help="dimension for output action embeddings")
    group.add_argument("--layer-dim", type=int, default=500, help="dimension for hidden layers")
    group.add_argument("--layers", type=int, default=2, help="number of hidden layers")
    group.add_argument("--lstm-layer-dim", type=int, default=500, help="dimension for LSTM hidden layers")
    group.add_argument("--lstm-layers", type=int, default=2, help="number of LSTM hidden layers")
    group.add_argument("--embedding-layer-dim", type=int, default=500, help="dimension for layers before LSTM")
    group.add_argument("--embedding-layers", type=int, default=1, help="number of layers before LSTM")
    group.add_argument("--activation", choices=ACTIVATIONS, default=ACTIVATIONS[0], help="activation function")
    group.add_argument("--init", choices=INITIALIZATIONS, default=INITIALIZATIONS[0], help="weight initialization")
    group.add_argument("--minibatch-size", type=int, default=200, help="mini-batch size for optimization")
    group.add_argument("--optimizer", choices=OPTIMIZERS, default=OPTIMIZERS[0], help="algorithm for optimization")
    group.add_argument("--max-words-external", type=int, help="max external word vectors to use")
    group.add_argument("--max-words", type=int, default=10000, help="max number of words to keep embeddings for")
    group.add_argument("--max-tags", type=int, default=100, help="max number of POS tags to keep embeddings for")
    group.add_argument("--max-deps", type=int, default=100, help="max number of dep labels to keep embeddings for")
    group.add_argument("--max-edge-labels", type=int, default=15, help="max number of edge labels for embeddings")
    group.add_argument("--max-puncts", type=int, default=5, help="max number of punctuations for embeddings")
    group.add_argument("--max-action-types", type=int, default=10, help="max number of action types for embeddings")
    group.add_argument("--max-action-labels", type=int, default=100, help="max number of action labels to allow")
    group.add_argument("--max-ner-types", type=int, default=18, help="max number of entity types to allow")
    group.add_argument("--word-dropout", type=float, default=0.25, help="word dropout parameter")
    group.add_argument("--word-dropout-external", type=float, default=0.25, help="word dropout for word vectors")
    group.add_argument("--dropout", type=float, default=0.5, help="dropout parameter between layers")
    group.add_argument("--max-length", type=int, default=120, help="maximum length of input sentence")
    # Remember which options belong to the neural-network models.
    self.nn_arg_names = get_group_arg_names(group)
    group = argparser.add_argument_group(title="DyNet parameters")
    group.add_argument("--dynet-mem", help="memory for dynet")
    group.add_argument("--dynet-weight-decay", type=float, default=1e-6, help="weight decay for parameters")
    add_boolean_option(group, "dynet-gpu", "GPU for training")
    group.add_argument("--dynet-gpus", type=int, default=1, help="how many GPUs you want to use")
    group.add_argument("--dynet-gpu-ids", help="the GPUs that you want to use by device ID")
    add_boolean_option(group, "dynet-viz", "visualization of neural network structure")
    add_boolean_option(group, "dynet-autobatch", "auto-batching of training examples")
    self.dynet_arg_names = get_group_arg_names(group)
    # Empty *args falls back to None so argparse reads sys.argv.
    self.args = argparser.parse_args(args if args else None)
    # Derive default output filenames from the model basename, if any.
    if self.args.model:
        if not self.args.log:
            self.args.log = self.args.model + ".log"
        if self.args.dev and not self.args.devscores:
            self.args.devscores = self.args.model + ".dev.csv"
        if self.args.passages and not self.args.testscores:
            self.args.testscores = self.args.model + ".test.csv"
    elif not self.args.log:
        self.args.log = "parse.log"
    # AMR requires node labels and implicit nodes; raise related caps to sane minimums.
    if self.args.format == "amr":
        self.node_labels = True
        self.args.implicit = True
        if not self.args.node_label_dim:
            self.args.node_label_dim = 20
        if not self.args.max_node_labels:
            self.args.max_node_labels = 1000
        if not self.args.node_category_dim:
            self.args.node_category_dim = 5
        if not self.args.max_node_categories:
            self.args.max_node_categories = 25
        self.args.max_action_labels = max(self.args.max_action_labels, 600)
        self.args.max_edge_labels = max(self.args.max_edge_labels, 110)
    else:
        self.node_labels = False
        self.args.node_label_dim = self.args.max_node_labels = \
            self.args.node_category_dim = self.args.max_node_categories = 0
    # Select converters by format; output converter may differ from input.
    self.input_converter, self.output_converter = CONVERTERS.get(self.args.format, (None, None))
    if self.args.output_format:
        _, self.output_converter = CONVERTERS.get(self.args.output_format, (None, None))
    else:
        self.args.output_format = self.args.format
    if self.output_converter is not None:
        self.output_converter = partial(self.output_converter, wikification=self.args.wikification)
    self._logger = None
    self.set_external()
    self.random = np.random
def __init__(self, *args):
    """Build the command-line configuration for the transition-based UCCA parser.

    Parses ``*args`` if given (each element a command-line token), otherwise
    ``sys.argv``.  Stores the parsed namespace in ``self.args``, derives default
    log/score filenames from the model filename, and exposes ``numpy.random``
    as ``self.random``.
    """
    argparser = argparse.ArgumentParser(
        description="""Transition-based parser for UCCA.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument("passages", nargs="*",
                           help="passage files/directories to test on/parse")
    # NOTE: closed the unbalanced "(" that was in the original help text.
    argparser.add_argument("-m", "--model",
                           help="model file to load/save (default: ucca_<model_type>)")
    argparser.add_argument("-c", "--classifier", choices=CLASSIFIERS,
                           default=SPARSE_PERCEPTRON, help="model type")
    # BUGFIX: choices are ints, so type=int is required — without it an explicit
    # "-B 1" fails because argparse compares the string "1" against int choices.
    argparser.add_argument("-B", "--beam", type=int, choices=(1,), default=1,
                           help="beam size for beam search (1 for greedy)")
    argparser.add_argument("-e", "--evaluate", action="store_true",
                           help="evaluate parsed passages")
    argparser.add_argument("-v", "--verbose", nargs="?", action=VAction, default=0,
                           help="detailed parse output")
    constructions.add_argument(argparser)

    # Passage splitting: sentences and paragraphs are mutually exclusive.
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-s", "--sentences", action="store_true",
                       help="separate passages to sentences")
    group.add_argument("-a", "--paragraphs", action="store_true",
                       help="separate passages to paragraphs")

    group = argparser.add_argument_group(title="Training parameters")
    group.add_argument("-t", "--train", nargs="+", default=(),
                       help="passage files/directories to train on")
    group.add_argument("-d", "--dev", nargs="+", default=(),
                       help="passage files/directories to tune on")
    group.add_argument("-I", "--iterations", type=int, default=1,
                       help="number of training iterations")
    group.add_argument("--folds", type=int, choices=(3, 5, 10),
                       help="#folds for cross validation")
    group.add_argument("--seed", type=int, default=1,
                       help="random number generator seed")

    group = argparser.add_argument_group(title="Output files")
    group.add_argument("-o", "--outdir", default=".",
                       help="output directory for parsed files")
    group.add_argument("-p", "--prefix", default="",
                       help="output filename prefix")
    group.add_argument("-W", "--no-write", action="store_true",
                       help="do not write parsed passages to files")
    group.add_argument("-l", "--log",
                       help="output log file (default: model filename + .log)")
    group.add_argument("--devscores",
                       help="output CSV file for dev scores (default: model filename + .dev.csv)")
    group.add_argument("--testscores",
                       help="output CSV file for test scores (default: model filename + .test.csv)")

    group = argparser.add_argument_group(title="Structural constraints")
    group.add_argument("--linkage", action="store_true",
                       help="include linkage nodes and edges")
    group.add_argument("--implicit", action="store_true",
                       help="include implicit nodes and edges")
    group.add_argument("--no-remote", action="store_false", dest="remote",
                       help="ignore remote edges")
    group.add_argument("--no-constraints", action="store_false", dest="constraints",
                       help="ignore UCCA rules")
    group.add_argument("--max-nodes", type=float, default=3.0,
                       help="max non-terminal/terminal ratio")
    group.add_argument("--max-height", type=int, default=20,
                       help="max graph height")

    # Swap transitions: disabling swap and compound swap are mutually exclusive.
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("--no-swap", action="store_false", dest="swap",
                       help="disable Swap transitions entirely")
    group.add_argument("--compound-swap", action="store_true",
                       help="enable compound swap")

    group = argparser.add_argument_group(title="Sanity checks")
    group.add_argument("--check-loops", action="store_true",
                       help="abort if the parser enters a state loop")
    group.add_argument("--verify", action="store_true",
                       help="verify oracle reproduces original passage")

    # Mutually exclusive I/O formats, nested under the "Sanity checks" group
    # (preserved from the original layout).
    group = group.add_mutually_exclusive_group()
    group.add_argument("-b", "--binary", action="store_true",
                       help="read and write passages in Pickle")
    group.add_argument("-f", "--format", choices=convert.CONVERTERS,
                       help="output format for parsed files")

    group = argparser.add_argument_group(title="General classifier training parameters")
    group.add_argument("--swap-importance", type=int, default=1,
                       help="learning rate factor for Swap")
    group.add_argument("--early-update", action="store_true",
                       help="move to next example on incorrect prediction")
    group.add_argument("--word-dim-external", type=int, default=300,
                       help="dimension for external word embeddings")
    group.add_argument("--word-vectors",
                       help="file to load external word embeddings from (default: GloVe)")

    group = argparser.add_argument_group(title="Perceptron parameters")
    group.add_argument("--learning-rate", type=float, default=1.0,
                       help="rate for model weight updates")
    group.add_argument("--learning-rate-decay", type=float, default=0.0,
                       help="learning rate decay per iteration")
    group.add_argument("--min-update", type=int, default=5,
                       help="minimum #updates for using a feature")

    group = argparser.add_argument_group(title="Neural network parameters")
    group.add_argument("--update-word-vectors", action="store_true",
                       help="tune the external word embeddings")
    group.add_argument("--word-dim", type=int, default=100,
                       help="dimension for learned word embeddings")
    group.add_argument("--tag-dim", type=int, default=10,
                       help="dimension for POS tag embeddings")
    group.add_argument("--dep-dim", type=int, default=10,
                       help="dimension for dependency relation embeddings")
    group.add_argument("--label-dim", type=int, default=20,
                       help="dimension for edge label embeddings")
    group.add_argument("--punct-dim", type=int, default=2,
                       help="dimension for separator punctuation embeddings")
    group.add_argument("--gap-dim", type=int, default=2,
                       help="dimension for gap type embeddings")
    group.add_argument("--action-dim", type=int, default=5,
                       help="dimension for action type embeddings")
    group.add_argument("--layer-dim", type=int, default=500,
                       help="dimension for hidden layers")
    group.add_argument("--layers", type=int, default=2,
                       help="number of hidden layers")
    group.add_argument("--lstm-layer-dim", type=int, default=500,
                       help="dimension for LSTM hidden layers")
    group.add_argument("--lstm-layers", type=int, default=2,
                       help="number of LSTM hidden layers")
    group.add_argument("--embedding-layer-dim", type=int, default=500,
                       help="dimension for layers before LSTM")
    group.add_argument("--embedding-layers", type=int, default=1,
                       help="number of layers before LSTM")
    group.add_argument("--activation", choices=ACTIVATIONS, default=ACTIVATIONS[0],
                       help="activation function")
    group.add_argument("--init", choices=INITIALIZATIONS, default=INITIALIZATIONS[0],
                       help="weight initialization")
    group.add_argument("--max-labels", type=int, default=100,
                       help="max number of actions to allow")
    group.add_argument("--save-every", type=int,
                       help="every this many passages, evaluate on dev and save model")
    group.add_argument("--minibatch-size", type=int, default=200,
                       help="mini-batch size for optimization")
    group.add_argument("--optimizer", choices=OPTIMIZERS, default=OPTIMIZERS[0],
                       help="algorithm for optimization")
    group.add_argument("--max-words-external", type=int,
                       help="max external word vectors to use")
    group.add_argument("--max-words", type=int, default=10000,
                       help="max number of words to keep embeddings for")
    group.add_argument("--max-tags", type=int, default=100,
                       help="max number of POS tags to keep embeddings for")
    group.add_argument("--max-deps", type=int, default=100,
                       help="max number of dep labels to keep embeddings for")
    group.add_argument("--max-edge-labels", type=int, default=15,
                       help="max number of edge labels for embeddings")
    group.add_argument("--max-puncts", type=int, default=5,
                       help="max number of punctuations for embeddings")
    group.add_argument("--max-gaps", type=int, default=3,
                       help="max number of gap types to keep embeddings for")
    group.add_argument("--max-actions", type=int, default=10,
                       help="max number of action types for embeddings")
    group.add_argument("--word-dropout", type=float, default=0.25,
                       help="word dropout parameter")
    group.add_argument("--word-dropout-external", type=float, default=0.25,
                       help="word dropout for word vectors")
    group.add_argument("--dropout", type=float, default=0.5,
                       help="dropout parameter between layers")

    group = argparser.add_argument_group(title="DyNet parameters")
    group.add_argument("--dynet-mem", help="memory for dynet")
    group.add_argument("--dynet-weight-decay", type=float,
                       help="weight decay for parameters (default 1e-6)")
    group.add_argument("--dynet-gpu", action="store_true", help="use the GPU")
    group.add_argument("--dynet-gpus", type=int,
                       help="how many GPUs you want to use")
    group.add_argument("--dynet-gpu-ids",
                       help="the GPUs that you want to use by device ID")
    group.add_argument("--dynet-viz", action="store_true",
                       help="visualize NN and exit")

    # Empty *args falls through to sys.argv via parse_args(None).
    self.args = argparser.parse_args(args if args else None)

    # Derive default output filenames from the model filename when one is set.
    if self.args.model:
        if not self.args.log:
            self.args.log = self.args.model + ".log"
        if self.args.dev and not self.args.devscores:
            self.args.devscores = self.args.model + ".dev.csv"
        if self.args.passages and not self.args.testscores:
            self.args.testscores = self.args.model + ".test.csv"
    elif not self.args.log:
        self.args.log = "parse.log"

    self._log_file = None
    self.set_external()
    # Shared RNG handle; presumably seeded elsewhere via self.args.seed — TODO confirm.
    self.random = np.random