Example #1
0
 def __init__(self,in_dim,h_dim,c_in_dim,c_h_dim,h_layers,pred_layer,learning_algo="sgd", learning_rate=0,
              embeds_file=None,activation=ACTIVATION_MAP["tanh"],mlp=0,activation_mlp=ACTIVATION_MAP["rectify"],
              backprob_embeds=True,noise_sigma=0.1, w_dropout_rate=0.25, c_dropout_rate=0.25,
              initializer=INITIALIZER_MAP["glorot"], builder=BUILDERS["lstmc"], crf=False, viterbi_loss=False,
              mimickx_model_path=None, dictionary=None, type_constraint=False,
              lex_dim=0, embed_lex=False):
     self.w2i = {}  # word to index mapping
     self.c2i = {}  # char to index mapping
     self.w2c_cache = {} # word to char index cache for frequent words
     self.wcount = None # word count
     self.ccount = None # char count
     self.task2tag2idx = {} # need one dictionary per task
     self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task
     self.model = dynet.ParameterCollection() #init model
     self.in_dim = in_dim
     self.h_dim = h_dim
     self.c_in_dim = c_in_dim
     self.c_h_dim = c_h_dim
     self.w_dropout_rate = w_dropout_rate
     self.c_dropout_rate = c_dropout_rate
     self.activation = activation
     self.mlp = mlp
     self.activation_mlp = activation_mlp
     self.noise_sigma = noise_sigma
     self.h_layers = h_layers
     self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors
     self.wembeds = None # lookup: embeddings for words
     self.cembeds = None # lookup: embeddings for characters
     self.lembeds = None # lookup: embeddings for lexical features (optional)
     self.embeds_file = embeds_file
     trainer_algo = TRAINER_MAP[learning_algo]
     if learning_rate > 0:
         ### TODO: better handling of additional learning-specific parameters
         self.trainer = trainer_algo(self.model, learning_rate=learning_rate)
     else:
         # using default learning rate
         self.trainer = trainer_algo(self.model)
     self.backprob_embeds = backprob_embeds
     self.initializer = initializer
     self.char_rnn = None # biRNN for character input
     self.builder = builder # default biRNN is an LSTM
     self.crf = crf
     self.viterbi_loss = viterbi_loss
     self.mimickx_model_path = mimickx_model_path
     if mimickx_model_path: # load
         self.mimickx_model = load_model(mimickx_model_path)
     self.dictionary = None
     self.type_constraint = type_constraint
     self.embed_lex = False
     self.l2i = {UNK: 0}  # lex feature to index mapping
     if dictionary:
         self.dictionary, self.dictionary_values = load_dict(dictionary)
         self.path_to_dictionary = dictionary
         if type_constraint:
             self.lex_dim = 0
         else:
             if embed_lex:
                 self.lex_dim = lex_dim
                 self.embed_lex = True
                 print("Embed lexical features")
                 # register property indices
                 for prop in self.dictionary_values:
                     self.l2i[prop] = len(self.l2i)
             else:
                 self.lex_dim = len(self.dictionary_values) #n-hot encoding
             print("Lex_dim: {}".format(self.lex_dim), file=sys.stderr)
     else:
         self.dictionary = None
         self.path_to_dictionary = None
         self.lex_dim = 0
Example #2
0
 def __init__(self,
              in_dim,
              h_dim,
              c_in_dim,
              c_h_dim,
              h_layers,
              pred_layer,
              learning_algo="sgd",
              learning_rate=0,
              embeds_file=None,
              activation=ACTIVATION_MAP["tanh"],
              mlp=0,
              activation_mlp=ACTIVATION_MAP["rectify"],
              backprob_embeds=True,
              noise_sigma=0.1,
              w_dropout_rate=0.25,
              c_dropout_rate=0.25,
              initializer=INITIALIZER_MAP["glorot"],
              builder=BUILDERS["lstmc"],
              crf=False,
              viterbi_loss=False,
              mimickx_model_path=None,
              dictionary=None,
              type_constraint=False,
              lex_dim=0,
              embed_lex=False):
     self.w2i = {}  # word to index mapping
     self.c2i = {}  # char to index mapping
     self.w2c_cache = {}  # word to char index cache for frequent words
     self.wcount = None  # word count
     self.ccount = None  # char count
     self.task2tag2idx = {}  # need one dictionary per task
     self.pred_layer = [int(layer) for layer in pred_layer
                        ]  # at which layer to predict each task
     self.model = dynet.ParameterCollection()  #init model
     self.in_dim = in_dim
     self.h_dim = h_dim
     self.c_in_dim = c_in_dim
     self.c_h_dim = c_h_dim
     self.w_dropout_rate = w_dropout_rate
     self.c_dropout_rate = c_dropout_rate
     self.activation = activation
     self.mlp = mlp
     self.activation_mlp = activation_mlp
     self.noise_sigma = noise_sigma
     self.h_layers = h_layers
     self.predictors = {
         "inner": [],
         "output_layers_dict": {},
         "task_expected_at": {}
     }  # the inner layers and predictors
     self.wembeds = None  # lookup: embeddings for words
     self.cembeds = None  # lookup: embeddings for characters
     self.lembeds = None  # lookup: embeddings for lexical features (optional)
     self.embeds_file = embeds_file
     trainer_algo = TRAINER_MAP[learning_algo]
     if learning_rate > 0:
         ### TODO: better handling of additional learning-specific parameters
         self.trainer = trainer_algo(self.model,
                                     learning_rate=learning_rate)
     else:
         # using default learning rate
         self.trainer = trainer_algo(self.model)
     self.backprob_embeds = backprob_embeds
     self.initializer = initializer
     self.char_rnn = None  # biRNN for character input
     self.builder = builder  # default biRNN is an LSTM
     self.crf = crf
     self.viterbi_loss = viterbi_loss
     self.mimickx_model_path = mimickx_model_path
     if mimickx_model_path:  # load
         self.mimickx_model = load_model(mimickx_model_path)
     self.dictionary = None
     self.type_constraint = type_constraint
     self.embed_lex = False
     self.l2i = {UNK: 0}  # lex feature to index mapping
     if dictionary:
         self.dictionary, self.dictionary_values = load_dict(dictionary)
         self.path_to_dictionary = dictionary
         if type_constraint:
             self.lex_dim = 0
         else:
             if embed_lex:
                 self.lex_dim = lex_dim
                 self.embed_lex = True
                 print("Embed lexical features")
                 # register property indices
                 for prop in self.dictionary_values:
                     self.l2i[prop] = len(self.l2i)
             else:
                 self.lex_dim = len(self.dictionary_values)  #n-hot encoding
             print("Lex_dim: {}".format(self.lex_dim), file=sys.stderr)
     else:
         self.dictionary = None
         self.path_to_dictionary = None
         self.lex_dim = 0
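
A minimal instantiation sketch for reference. It assumes the constructor above belongs to the NNTagger class built by the main() driver shown later and that the class is importable (the import path in the comment is a placeholder, not taken from the examples); the argument values simply mirror that driver's argparse defaults.

# Hypothetical usage sketch; the import path below is a placeholder.
# from tagger_module import NNTagger

tagger = NNTagger(in_dim=64,        # word embedding dimension (Polyglot default)
                  h_dim=100,        # hidden dimension of the word-level biRNN
                  c_in_dim=100,     # character embedding dimension
                  c_h_dim=100,      # hidden dimension of the character biRNN
                  h_layers=1,       # number of stacked biRNN layers
                  pred_layer=[1],   # predict the (single) task at layer 1
                  learning_algo="sgd",
                  learning_rate=0)  # 0 = fall back to the trainer's default rate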
Example #3
0
def main():
    parser = argparse.ArgumentParser(description="""Run the bi-LSTM tagger""", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    group_main = parser.add_argument_group('Main', 'main arguments')
    group_main.add_argument("--model", help="path to store/load model [required]", required=True)
    group_main.add_argument("--train", nargs='*', help="path to train file [if multiple files are given actives MTL]") # allow multiple train files, each asociated with a task = position in the list
    group_main.add_argument("--dev", nargs='*', help="dev file(s)", required=False)
    group_main.add_argument("--test", nargs='*', help="test file(s) [same order as --train]", required=False)

    group_model = parser.add_argument_group('Model', 'specify model parameters')
    group_model.add_argument("--in_dim", help="input dimension", type=int, default=64) # default Polyglot size
    group_model.add_argument("--h_dim", help="hidden dimension [default: 100]", type=int, default=100)
    group_model.add_argument("--c_in_dim", help="input dimension for character embeddings", type=int, default=100)
    group_model.add_argument("--c_h_dim", help="hidden dimension for character embeddings", type=int, default=100)
    group_model.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False, type=int, default=1)
    group_model.add_argument("--pred_layer", nargs='*', help="predict task at this layer [default: last layer]", required=False) # for each task the layer on which it is predicted (default 1)
    group_model.add_argument("--embeds", help="word embeddings file", required=False, default=None)
    group_model.add_argument("--crf", help="use CRF instead of local decoding", default=False, action="store_true")
    group_model.add_argument("--viterbi-loss", help="Use viterbi loss training (only active if --crf is on)", action="store_true", default=False)
    group_model.add_argument("--transition-matrix", help="store transition matrix from CRF")

    group_model.add_argument("--builder", help="RNN builder (default: lstmc)", choices=BUILDERS.keys(), default="lstmc")

    group_model.add_argument("--mlp", help="add additional MLP layer of this dimension [default 0=disabled]", default=0, type=int)
    group_model.add_argument("--ac-mlp", help="activation function for optional MLP layer [rectify, tanh, ...] (default: tanh)",
                        default="tanh", choices=ACTIVATION_MAP.keys())
    group_model.add_argument("--ac", help="activation function between hidden layers [rectify, tanh, ...]", default="tanh",
                             choices=ACTIVATION_MAP.keys())

    group_input = parser.add_argument_group('Input', 'specific input options')
    group_input.add_argument("--raw", help="expects raw text input (one sentence per line)", required=False, action="store_true", default=False)

    group_output = parser.add_argument_group('Output', 'specific output options')
    group_output.add_argument("--dictionary", help="use dictionary as additional features or type constraints (with --type-constraints)", default=None)
    group_output.add_argument("--type-constraint", help="use dictionary as type constraints", default=False, action="store_true")
    group_output.add_argument("--embed-lex", help="use dictionary as type constraints", default=False, action="store_true")
    group_output.add_argument("--lex-dim", help="input dimension for lexical features", default=0, type=int)
    group_output.add_argument("--output", help="output predictions to file [word|gold|pred]", default=None)
    group_output.add_argument("--output-confidences", help="output tag confidences", action="store_true", default=False)
    group_output.add_argument("--save-embeds", help="save word embeddings to file", required=False, default=None)
    group_output.add_argument("--save-lexembeds", help="save lexicon embeddings to file", required=False, default=None)
    group_output.add_argument("--save-cwembeds", help="save character-based word-embeddings to file", required=False, default=None)
    group_output.add_argument("--save-lwembeds", help="save lexicon-based word-embeddings to file", required=False, default=None)
    group_output.add_argument("--mimickx-model", help="use mimickx model for OOVs", required=False, default=None, type=str)


    group_opt = parser.add_argument_group('Optimizer', 'specify training parameters')
    group_opt.add_argument("--iters", help="training iterations", type=int,default=20)
    group_opt.add_argument("--sigma", help="sigma of Gaussian noise",default=0.2, type=float)
    group_opt.add_argument("--trainer", help="trainer [default: sgd]", choices=TRAINER_MAP.keys(), default="sgd")
    group_opt.add_argument("--learning-rate", help="learning rate [0: use default]", default=0, type=float) # see: http://dynet.readthedocs.io/en/latest/optimizers.html
    group_opt.add_argument("--patience", help="patience [default: 0=not used], requires specification of --dev and model path --save", required=False, default=0, type=int)
    group_opt.add_argument("--log-losses", help="log loss (for each task if multiple active)", required=False, action="store_true", default=False)
    group_opt.add_argument("--word-dropout-rate", help="word dropout rate [default: 0.25], if 0=disabled, recommended: 0.25 (Kiperwasser & Goldberg, 2016)", required=False, default=0.25, type=float)
    group_opt.add_argument("--char-dropout-rate", help="char dropout rate [default: 0=disabled]", required=False, default=0.0, type=float)
    group_opt.add_argument("--disable-backprob-embeds", help="disable backprob into embeddings (default is to update)",
                        required=False, action="store_false", default=True)
    group_opt.add_argument("--initializer", help="initializer for embeddings (default: constant)",
                        choices=INITIALIZER_MAP.keys(), default="constant")


    group_dynet = parser.add_argument_group('DyNet', 'DyNet parameters')
    group_dynet.add_argument("--seed", help="random seed (also for DyNet)", required=False, type=int)
    group_dynet.add_argument("--dynet-mem", help="memory for DyNet", required=False, type=int)
    group_dynet.add_argument("--dynet-gpus", help="1 for GPU usage", default=0, type=int) # warning: non-deterministic results on GPU https://github.com/clab/dynet/issues/399
    group_dynet.add_argument("--dynet-autobatch", help="if 1 enable autobatching", default=0, type=int)
    group_dynet.add_argument("--minibatch-size", help="size of minibatch for autobatching (1=disabled)", default=1, type=int)

    try:
        args = parser.parse_args()
    except:
        parser.print_help()
        exit()

    if args.train:
        if len(args.train) > 1:
            if not args.pred_layer:
                print("--pred_layer required!")
                exit()
        elif len(args.train) == 1 and not args.pred_layer:
            args.pred_layer = [args.h_layers] # assumes h_layers is 1

    if args.c_in_dim == 0:
        print(">>> disable character embeddings <<<")

    if args.minibatch_size > 1:
        print(">>> using minibatch_size {} <<<".format(args.minibatch_size))

    if args.viterbi_loss:
        if not args.crf:
            print("--crf (global decoding) needs to be active when --viterbi is used")
            exit()
    if args.crf:
        if args.viterbi_loss:
            print(">>> using global decoding (Viterbi loss) <<<")
        else:
            print(">>> using global decoding (CRF, neg-log loss) <<<")

    if args.patience:
        if not args.dev or not args.model:
            print("patience requires a dev set and model path (--dev and --model)")
            exit()

    # check if the --model folder exists, create it if needed
    if args.model:
        model_dir = os.path.dirname(args.model)
        if model_dir and not os.path.isdir(model_dir):
            print("Creating {}..".format(model_dir))
            os.makedirs(model_dir)

    if args.output:
        out_dir = os.path.dirname(args.output)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    if not args.seed:
        ## set seed
        seed = random.randint(1, MAX_SEED)
    else:
        seed = args.seed

    print(">>> using seed: {} <<< ".format(seed))
    np.random.seed(seed)
    random.seed(seed)

    init_dynet(seed)

    if args.mimickx_model:
        from mimickx import Mimickx, load_model  # make sure PYTHONPATH is set
        print(">>> Loading mimickx model {} <<<".format(args.mimickx_model))

    model_path = args.model

    start = time.time()

    if args.train and len( args.train ) != 0:

        tagger = NNTagger(args.in_dim,
                          args.h_dim,
                          args.c_in_dim,
                          args.c_h_dim,
                          args.h_layers,
                          args.pred_layer,
                          embeds_file=args.embeds,
                          w_dropout_rate=args.word_dropout_rate,
                          c_dropout_rate=args.char_dropout_rate,
                          activation=ACTIVATION_MAP[args.ac],
                          mlp=args.mlp,
                          activation_mlp=ACTIVATION_MAP[args.ac_mlp],
                          noise_sigma=args.sigma,
                          learning_algo=args.trainer,
                          learning_rate=args.learning_rate,
                          backprob_embeds=args.disable_backprob_embeds,
                          initializer=INITIALIZER_MAP[args.initializer],
                          builder=BUILDERS[args.builder],
                          crf=args.crf,
                          mimickx_model_path=args.mimickx_model,
                          dictionary=args.dictionary, type_constraint=args.type_constraint,
                          lex_dim=args.lex_dim, embed_lex=args.embed_lex)

        dev = None
        train = SeqData(args.train)
        if args.dev:
            dev = SeqData(args.dev)

        tagger.fit(train, args.iters,
                   dev=dev,
                   model_path=model_path, patience=args.patience, minibatch_size=args.minibatch_size, log_losses=args.log_losses)

        if not args.dev and not args.patience:  # in case patience is active it gets saved in the fit function
            save(tagger, model_path)

    if args.test and len( args.test ) != 0:

        tagger = load(args.model, args.dictionary)

        # check if mimickx provided after training
        if args.mimickx_model:
            tagger.mimickx_model_path = args.mimickx_model
            tagger.mimickx_model = load_model(args.mimickx_model)

        stdout = sys.stdout
        # One file per test ...
        if args.test:
            test = SeqData(args.test, raw=args.raw) # read in all test data

            for i, test_file in enumerate(args.test): # expect them in same order
                if args.output is not None:
                    sys.stdout = codecs.open(args.output + ".task{}".format(i), 'w', encoding='utf-8')

                start_testing = time.time()

                print('\nTesting task{}'.format(i),file=sys.stderr)
                print('*******\n',file=sys.stderr)
                correct, total = tagger.evaluate(test, "task{}".format(i),
                                                 output_predictions=args.output,
                                                 output_confidences=args.output_confidences, raw=args.raw,
                                                 unk_tag=None)
                if not args.raw:
                    print("\nTask{} test accuracy on {} items: {:.4f}".format(i, i+1, correct/total),file=sys.stderr)
                print(("Done. Took {0:.2f} seconds in total (testing took {1:.2f} seconds).".format(time.time()-start,
                                                                                                    time.time()-start_testing)),file=sys.stderr)
                sys.stdout = stdout
    if args.train:
        print("Info: biLSTM\n\t"+"\n\t".join(["{}: {}".format(a,v) for a, v in vars(args).items()
                                          if a not in ["train","test","dev","pred_layer"]]))
    else:
        # print less when only testing, as not all train params are stored explicitly
        print("Info: biLSTM\n\t" + "\n\t".join(["{}: {}".format(a, v) for a, v in vars(args).items()
                                                if a not in ["train", "test", "dev", "pred_layer",
                                                             "initializer","ac","word_dropout_rate",
                                                             "patience","sigma","disable_backprob_embed",
                                                             "trainer", "dynet_seed", "dynet_mem","iters"]]))

    tagger = load(args.model, args.dictionary)

    if args.save_embeds:
        tagger.save_embeds(args.save_embeds)

    if args.save_lexembeds:
        tagger.save_lex_embeds(args.save_lexembeds)

    if args.save_cwembeds:
        tagger.save_cw_embeds(args.save_cwembeds)

    if args.save_lwembeds:
        tagger.save_lw_embeds(args.save_lwembeds)
    
    if args.transition_matrix:
        tagger.save_transition_matrix(args.transition_matrix)
Example #4
0
def main():
    parser = argparse.ArgumentParser(
        description="""Run the bi-LSTM tagger""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    group_main = parser.add_argument_group('Main', 'main arguments')
    group_main.add_argument("--model",
                            help="path to store/load model [required]",
                            required=True)
    group_main.add_argument(
        "--train",
        nargs='*',
        help="path to train file [if multiple files are given actives MTL]"
    )  # allow multiple train files, each asociated with a task = position in the list
    group_main.add_argument("--dev",
                            nargs='*',
                            help="dev file(s)",
                            required=False)
    group_main.add_argument("--test",
                            nargs='*',
                            help="test file(s) [same order as --train]",
                            required=False)

    group_model = parser.add_argument_group('Model',
                                            'specify model parameters')
    group_model.add_argument("--in_dim",
                             help="input dimension",
                             type=int,
                             default=64)  # default Polyglot size
    group_model.add_argument("--h_dim",
                             help="hidden dimension [default: 100]",
                             type=int,
                             default=100)
    group_model.add_argument("--c_in_dim",
                             help="input dimension for character embeddings",
                             type=int,
                             default=100)
    group_model.add_argument("--c_h_dim",
                             help="hidden dimension for character embeddings",
                             type=int,
                             default=100)
    group_model.add_argument(
        "--h_layers",
        help="number of stacked LSTMs [default: 1 = no stacking]",
        required=False,
        type=int,
        default=1)
    group_model.add_argument(
        "--pred_layer",
        nargs='*',
        help="predict task at this layer [default: last layer]",
        required=False
    )  # for each task the layer on which it is predicted (default 1)
    group_model.add_argument("--embeds",
                             help="word embeddings file",
                             required=False,
                             default=None)
    group_model.add_argument("--crf",
                             help="use CRF instead of local decoding",
                             default=False,
                             action="store_true")
    group_model.add_argument(
        "--viterbi-loss",
        help="Use viterbi loss training (only active if --crf is on)",
        action="store_true",
        default=False)
    group_model.add_argument("--transition-matrix",
                             help="store transition matrix from CRF")

    group_model.add_argument("--builder",
                             help="RNN builder (default: lstmc)",
                             choices=BUILDERS.keys(),
                             default="lstmc")

    group_model.add_argument(
        "--mlp",
        help="add additional MLP layer of this dimension [default 0=disabled]",
        default=0,
        type=int)
    group_model.add_argument(
        "--ac-mlp",
        help=
        "activation function for optional MLP layer [rectify, tanh, ...] (default: tanh)",
        default="tanh",
        choices=ACTIVATION_MAP.keys())
    group_model.add_argument(
        "--ac",
        help="activation function between hidden layers [rectify, tanh, ...]",
        default="tanh",
        choices=ACTIVATION_MAP.keys())

    group_input = parser.add_argument_group('Input', 'specific input options')
    group_input.add_argument(
        "--raw",
        help="expects raw text input (one sentence per line)",
        required=False,
        action="store_true",
        default=False)

    group_output = parser.add_argument_group('Output',
                                             'specific output options')
    group_output.add_argument(
        "--dictionary",
        help=
        "use dictionary as additional features or type constraints (with --type-constraint)",
        default=None)
    group_output.add_argument("--type-constraint",
                              help="use dictionary as type constraints",
                              default=False,
                              action="store_true")
    group_output.add_argument("--embed-lex",
                              help="use dictionary as type constraints",
                              default=False,
                              action="store_true")
    group_output.add_argument("--lex-dim",
                              help="input dimension for lexical features",
                              default=0,
                              type=int)
    group_output.add_argument(
        "--output",
        help="output predictions to file [word|gold|pred]",
        default=None)
    group_output.add_argument("--output-confidences",
                              help="output tag confidences",
                              action="store_true",
                              default=False)
    group_output.add_argument("--save-embeds",
                              help="save word embeddings to file",
                              required=False,
                              default=None)
    group_output.add_argument("--save-lexembeds",
                              help="save lexicon embeddings to file",
                              required=False,
                              default=None)
    group_output.add_argument(
        "--save-cwembeds",
        help="save character-based word-embeddings to file",
        required=False,
        default=None)
    group_output.add_argument(
        "--save-lwembeds",
        help="save lexicon-based word-embeddings to file",
        required=False,
        default=None)
    group_output.add_argument("--mimickx-model",
                              help="use mimickx model for OOVs",
                              required=False,
                              default=None,
                              type=str)

    group_opt = parser.add_argument_group('Optimizer',
                                          'specify training parameters')
    group_opt.add_argument("--iters",
                           help="training iterations",
                           type=int,
                           default=20)
    group_opt.add_argument("--sigma",
                           help="sigma of Gaussian noise",
                           default=0.2,
                           type=float)
    group_opt.add_argument("--trainer",
                           help="trainer [default: sgd]",
                           choices=TRAINER_MAP.keys(),
                           default="sgd")
    group_opt.add_argument(
        "--learning-rate",
        help="learning rate [0: use default]",
        default=0,
        type=float
    )  # see: http://dynet.readthedocs.io/en/latest/optimizers.html
    group_opt.add_argument(
        "--patience",
        help=
        "patience [default: 0=not used], requires specification of --dev and a model path (--model)",
        required=False,
        default=0,
        type=int)
    group_opt.add_argument("--log-losses",
                           help="log loss (for each task if multiple active)",
                           required=False,
                           action="store_true",
                           default=False)
    group_opt.add_argument(
        "--word-dropout-rate",
        help=
        "word dropout rate [default: 0.25], if 0=disabled, recommended: 0.25 (Kiperwasser & Goldberg, 2016)",
        required=False,
        default=0.25,
        type=float)
    group_opt.add_argument("--char-dropout-rate",
                           help="char dropout rate [default: 0=disabled]",
                           required=False,
                           default=0.0,
                           type=float)
    group_opt.add_argument(
        "--disable-backprob-embeds",
        help="disable backprob into embeddings (default is to update)",
        required=False,
        action="store_false",
        default=True)
    group_opt.add_argument(
        "--initializer",
        help="initializer for embeddings (default: constant)",
        choices=INITIALIZER_MAP.keys(),
        default="constant")

    group_dynet = parser.add_argument_group('DyNet', 'DyNet parameters')
    group_dynet.add_argument("--seed",
                             help="random seed (also for DyNet)",
                             required=False,
                             type=int)
    group_dynet.add_argument("--dynet-mem",
                             help="memory for DyNet",
                             required=False,
                             type=int)
    group_dynet.add_argument(
        "--dynet-gpus", help="1 for GPU usage", default=0, type=int
    )  # warning: non-deterministic results on GPU https://github.com/clab/dynet/issues/399
    group_dynet.add_argument("--dynet-autobatch",
                             help="if 1 enable autobatching",
                             default=0,
                             type=int)
    group_dynet.add_argument(
        "--minibatch-size",
        help="size of minibatch for autobatching (1=disabled)",
        default=1,
        type=int)

    try:
        args = parser.parse_args()
    except:
        parser.print_help()
        exit()

    if args.train:
        if len(args.train) > 1:
            if not args.pred_layer:
                print("--pred_layer required!")
                exit()
        elif len(args.train) == 1 and not args.pred_layer:
            args.pred_layer = [args.h_layers]  # assumes h_layers is 1

    if args.c_in_dim == 0:
        print(">>> disable character embeddings <<<")

    if args.minibatch_size > 1:
        print(">>> using minibatch_size {} <<<".format(args.minibatch_size))

    if args.viterbi_loss:
        if not args.crf:
            print(
                "--crf (global decoding) needs to be active when --viterbi is used"
            )
            exit()
    if args.crf:
        if args.viterbi_loss:
            print(">>> using global decoding (Viterbi loss) <<<")
        else:
            print(">>> using global decoding (CRF, neg-log loss) <<<")

    if args.patience:
        if not args.dev or not args.model:
            print(
                "patience requires a dev set and model path (--dev and --model)"
            )
            exit()

    # check if the --model folder exists, create it if needed
    if args.model:
        model_dir = os.path.dirname(args.model)
        if model_dir and not os.path.isdir(model_dir):
            print("Creating {}..".format(model_dir))
            os.makedirs(model_dir)

    if args.output:
        out_dir = os.path.dirname(args.output)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    if not args.seed:
        ## set seed
        seed = random.randint(1, MAX_SEED)
    else:
        seed = args.seed

    print(">>> using seed: {} <<< ".format(seed))
    np.random.seed(seed)
    random.seed(seed)

    init_dynet(seed)

    if args.mimickx_model:
        from mimickx import Mimickx, load_model  # make sure PYTHONPATH is set
        print(">>> Loading mimickx model {} <<<".format(args.mimickx_model))

    model_path = args.model

    start = time.time()

    if args.train and len(args.train) != 0:

        tagger = NNTagger(args.in_dim,
                          args.h_dim,
                          args.c_in_dim,
                          args.c_h_dim,
                          args.h_layers,
                          args.pred_layer,
                          embeds_file=args.embeds,
                          w_dropout_rate=args.word_dropout_rate,
                          c_dropout_rate=args.char_dropout_rate,
                          activation=ACTIVATION_MAP[args.ac],
                          mlp=args.mlp,
                          activation_mlp=ACTIVATION_MAP[args.ac_mlp],
                          noise_sigma=args.sigma,
                          learning_algo=args.trainer,
                          learning_rate=args.learning_rate,
                          backprob_embeds=args.disable_backprob_embeds,
                          initializer=INITIALIZER_MAP[args.initializer],
                          builder=BUILDERS[args.builder],
                          crf=args.crf,
                          mimickx_model_path=args.mimickx_model,
                          dictionary=args.dictionary,
                          type_constraint=args.type_constraint,
                          lex_dim=args.lex_dim,
                          embed_lex=args.embed_lex)

        dev = None
        train = SeqData(args.train)
        if args.dev:
            dev = SeqData(args.dev)

        tagger.fit(train,
                   args.iters,
                   dev=dev,
                   model_path=model_path,
                   patience=args.patience,
                   minibatch_size=args.minibatch_size,
                   log_losses=args.log_losses)

        if not args.dev and not args.patience:  # in case patience is active it gets saved in the fit function
            save(tagger, model_path)

    if args.test and len(args.test) != 0:

        tagger = load(args.model, args.dictionary)

        # check if mimickx provided after training
        if args.mimickx_model:
            tagger.mimickx_model_path = args.mimickx_model
            tagger.mimickx_model = load_model(args.mimickx_model)

        stdout = sys.stdout
        # One file per test ...
        if args.test:
            test = SeqData(args.test, raw=args.raw)  # read in all test data

            for i, test_file in enumerate(
                    args.test):  # expect them in same order
                if args.output is not None:
                    sys.stdout = codecs.open(args.output + ".task{}".format(i),
                                             'w',
                                             encoding='utf-8')

                start_testing = time.time()

                print('\nTesting task{}'.format(i), file=sys.stderr)
                print('*******\n', file=sys.stderr)
                correct, total = tagger.evaluate(
                    test,
                    "task{}".format(i),
                    output_predictions=args.output,
                    output_confidences=args.output_confidences,
                    raw=args.raw,
                    unk_tag=None)
                if not args.raw:
                    print("\nTask{} test accuracy on {} items: {:.4f}".format(
                        i, total, correct / total),
                          file=sys.stderr)
                print((
                    "Done. Took {0:.2f} seconds in total (testing took {1:.2f} seconds)."
                    .format(time.time() - start,
                            time.time() - start_testing)),
                      file=sys.stderr)
                sys.stdout = stdout
    if args.train:
        print("Info: biLSTM\n\t" + "\n\t".join([
            "{}: {}".format(a, v) for a, v in vars(args).items()
            if a not in ["train", "test", "dev", "pred_layer"]
        ]))
    else:
        # print less when only testing, as not all train params are stored explicitly
        print("Info: biLSTM\n\t" + "\n\t".join([
            "{}: {}".format(a, v) for a, v in vars(args).items() if a not in [
                "train", "test", "dev", "pred_layer", "initializer", "ac",
                "word_dropout_rate", "patience", "sigma",
                "disable_backprob_embed", "trainer", "dynet_seed", "dynet_mem",
                "iters"
            ]
        ]))

    tagger = load(args.model, args.dictionary)

    if args.save_embeds:
        tagger.save_embeds(args.save_embeds)

    if args.save_lexembeds:
        tagger.save_lex_embeds(args.save_lexembeds)

    if args.save_cwembeds:
        tagger.save_cw_embeds(args.save_cwembeds)

    if args.save_lwembeds:
        tagger.save_lw_embeds(args.save_lwembeds)

    if args.transition_matrix:
        tagger.save_transition_matrix(args.transition_matrix)
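
For orientation, a hypothetical end-to-end invocation of the script whose main() is shown above, assembled only from the argument definitions it declares; the script name, data paths, and output path are placeholders, not taken from the examples.

# Hypothetical command line; script name and file paths are placeholders.
python bilstm_tagger.py \
    --model models/en \
    --train data/en-train.conll \
    --dev data/en-dev.conll \
    --test data/en-test.conll \
    --iters 20 \
    --output predictions/en.out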