예제 #1
0
def test_iterations(config, model_type):
    """Train with a growing iteration schedule and check one score per new epoch.

    Plain ints in the schedule are wrapped in Iterations objects and accumulated;
    tuple entries (n, True) pass n straight through as a "simple" schedule.
    Finally, non-increasing schedules must raise ValueError.
    """
    filename = "test_files/models/%s_%s_iterations" % (FORMATS[0], model_type)
    remove_existing(filename)
    config.update(dict(classifier=model_type))
    passages = [load_passage(f) for f in passage_files(FORMATS[0])]
    last = 0
    iterations = []
    for spec in (2, 5, 9, (11, True), (4, True)):
        if isinstance(spec, tuple):
            epochs, simple = spec
        else:
            epochs, simple = spec, False
            iterations.append(Iterations("%d --word-dim=%d" % (epochs, epochs)))
        schedule = epochs if simple else iterations
        parser = Parser(model_files=filename, config=config)
        scores = list(parser.train(passages, dev=passages, iterations=schedule))
        # Only the epochs beyond the previously trained count yield scores.
        assert len(scores) == max(0, epochs - last)
        last = epochs
    for iterations in ((3, 2), (4, 4)):
        # Number of epochs must be strictly increasing
        with pytest.raises(ValueError):
            parser = Parser(model_files=filename, config=config)
            list(parser.train(passages, dev=passages, iterations=iterations))
예제 #2
0
파일: parse.py 프로젝트: necvabolucu/tupa
 def train(self, passages=None, dev=None, test=None, iterations=1):
     """
     Train parser on given passages
     :param passages: iterable of passages to train on
     :param dev: iterable of passages to tune on
     :param test: iterable of passages that would be tested on after train finished
     :param iterations: a single int/Iterations, or an iterable of them, whose epochs
                        attributes give a strictly increasing epoch schedule
     :return: generator yielding the result of eval_and_save once per trained epoch
              (yields nothing when no passages are given)
     """
     self.trained = True
     self.dev = dev
     self.test = test
     if passages:
         self.init_train()
         # Normalize: accept a scalar or an iterable, of ints or Iterations objects,
         # and produce a flat list of Iterations.
         iterations = [
             i if isinstance(i, Iterations) else Iterations(i)
             for i in (iterations if hasattr(iterations, "__iter__") else (
                 iterations, ))
         ]
         # Each schedule entry must train strictly more epochs than the previous one.
         if any(i.epochs >= j.epochs
                for i, j in zip(iterations[:-1], iterations[1:])):
             raise ValueError(
                 "Arguments to --iterations must be increasing: " +
                 " ".join(map(str, iterations)))
         self.config.args.iterations = iterations
         end = None
         for self.iteration, it in enumerate(iterations, start=1):
             # Resume after the last epoch the (possibly reloaded) classifier has seen.
             start = self.model.classifier.epoch + 1 if self.model.classifier else 1
             if end and start < end + 1:
                 # Reloading the best model may rewind past epochs trained after it.
                 print("Dropped %d epochs because best score was on %d" %
                       (end - start + 1, start - 1))
             end = it.epochs + 1  # exclusive bound for range() below
             self.config.update_iteration(it)
             if end < start + 1:
                 print("Skipping %s, already trained %s epochs" %
                       (it, start - 1))
                 continue
             for self.epoch in range(start, end):
                 print("Training epoch %d of %d: " % (self.epoch, end - 1))
                 self.config.random.shuffle(passages)
                 # parse() in train mode yields the passages it managed to train on.
                 if not sum(1 for _ in self.parse(passages,
                                                  mode=ParseMode.train)):
                     raise ParserException("Could not train on any passage")
                 # First argument flags the final epoch of the final iteration.
                 yield self.eval_and_save(self.iteration == len(iterations)
                                          and self.epoch == end - 1,
                                          finished_epoch=True)
             print("Trained %d epochs" % (end - 1))
             if dev:
                 if self.iteration < len(iterations):
                     if self.model.is_retrainable:
                         self.model.load(
                             is_finalized=False
                         )  # Load best model to prepare for next iteration
                 elif test:
                     self.model.load(
                     )  # Load best model to prepare for test
     else:  # No passages to train on, just load model
         for model in self.models:
             model.load()
         self.print_config()
예제 #3
0
def main() -> None:
    """Sample random hyper-parameter combinations and run a tuning trial for each.

    Environment variables: PARAMS_FILE (output CSV path, default "params.csv"),
    PARAMS_NUM (number of combinations, default 30), and any WORD_VECTORS*
    variables naming external word-vector files to choose from.
    """
    if not os.path.exists(MODELS_DIR):
        os.makedirs(MODELS_DIR)
    Config().args.write = False
    if not Config().args.verbose:
        Config().args.verbose = 1
    out_file = os.environ.get("PARAMS_FILE", "params.csv")
    word_vectors_files = [os.environ[f] for f in os.environ if f.startswith("WORD_VECTORS")]
    size = int(os.environ.get("PARAMS_NUM", 30))
    np.random.seed()  # no fixed seed: each invocation draws a fresh random sample
    domains = (
        # Parameter name            Shared  Domain of possible values
        ("seed",                    False,  2147483647),  # max value for int
        ("classifier",              False,  [config.BIRNN, config.HIGHWAY_RNN]),
        ("learning_rate",           False,  [None]),
        ("learning_rate_decay",     False,  5 * [0] + [0.01]),  # repetition weights 0 as 5x likelier
        ("update_word_vectors",     False,  [True, False]),
        ("word_vectors",            False,  [None] + word_vectors_files),
        ("word_dim_external",       False,  [0] + 5 * [300]),
        ("word_dim",                False,  range(150, 251)),
        ("tag_dim",                 False,  range(15, 26)),
        ("dep_dim",                 False,  range(5, 16)),
        ("edge_label_dim",          False,  range(15, 26)),
        ("node_label_dim",          False,  get_values_based_on_format(range(15, 30))),
        ("node_category_dim",       False,  get_values_based_on_format(range(5, 15))),
        ("max_node_categories",     False,  get_values_based_on_format(range(10, 26))),
        ("punct_dim",               False,  3),
        ("action_dim",              False,  range(2, 6)),
        ("ner_dim",                 False,  range(3, 9)),
        ("max_node_labels",         False,  get_values_based_on_format(range(1000, 4001))),
        ("min_node_label_count",    False,  range(1, 101)),
        ("layer_dim",               False,  range(50, 301)),
        ("layers",                  False,  [2]),
        ("lstm_layer_dim",          True,   range(300, 501, 2)),
        ("lstm_layers",             True,   [2]),
        ("embedding_layer_dim",     True,   range(300, 501)),
        ("embedding_layers",        True,   range(1, 3)),
        ("output_dim",              False,  range(40, 101)),
        ("activation",              True,   ["cube", "relu"]),
        ("init",                    True,   ["glorot_uniform"]),
        ("loss",                    False,  [config.DEFAULT_LOSS]),
        ("minibatch_size",          False,  range(50, 201)),
        ("optimizer",               False,  [config.DEFAULT_TRAINER]),
        ("swap_importance",         False,  np.arange(1, 2, step=.1)),
        ("iterations",              False,  [Iterations(MAX_ITERATIONS)]),
        ("word_dropout",            False,  np.arange(.41, step=.01)),
        ("word_dropout_external",   False,  np.arange(.51, step=.01)),
        ("tag_dropout",             False,  np.arange(.41, step=.01)),
        ("dep_dropout",             False,  np.arange(.41, step=.01)),
        ("node_label_dropout",      False,  np.arange(.41, step=.01)),
        ("node_dropout",            False,  np.arange(.41, step=.01)),
        ("dynet_weight_decay",      False,  [0, 1e-5]),
        ("dropout",                 False,  np.arange(.61, step=.01)),
        ("require_connected",       False,  [False]),
        ("swap",                    False,  [config.REGULAR, config.COMPOUND]),
        ("max_swap",                False,  range(2, 6)),
        ("max_words",               False,  range(8000, 30001)),
        ("max_words_external",      False,  [None] + list(range(50000, 250000))),
        ("rnn",                     True,   [config.DEFAULT_RNN]),
    )
    # Two passes over the domains: one sampling all parameters, one sampling only
    # the shared ones; each pass becomes an OrderedDict per sampled combination.
    params = [Params(p, shared=s) for p, s in zip(*[map(OrderedDict, zip(*[sample(name, domain, size)
                                                                           for name, shared, domain in domains
                                                                           if shared or all_parameters]))
                                                    for all_parameters in (True, False)])]
    print("All parameter combinations to try:")
    print("\n".join(map(str, params)))
    print("Saving results to '%s'" % out_file)
    for param in params:
        param.run(out_file)
        # NOTE(review): the best-so-far report runs after every trial, inside the
        # loop — presumably intentional progress output; confirm before moving it.
        best = max(params, key=Params.score)
        print("Best parameters: %s" % best)