Example #1
def tagger(repre, mission, train_file, input_file, saved_model_path,
           output_file):

    # read train and dev data sets
    train, vocab, labels = get_train_dataset(train_file, mission, repre)
    test = read_data(input_file, "test")

    # define vocabulary and help structures
    word2int = {w: i for i, w in enumerate(vocab)}
    label2int = {l: i for i, l in enumerate(labels)}

    # create a transducer classifier and restore its parameters
    # (all representation types load from the same saved basename)
    m = dy.ParameterCollection()
    model_params = dy.load(saved_model_path, m)

    predictions = test_model(repre, test, model_params, word2int, label2int,
                             vocab)

    # write predictions to file
    with open(output_file, "w") as out_f:
        for idx, sentence in enumerate(predictions):
            for word_idx, (word, label) in enumerate(sentence):
                orig_word = test[idx][word_idx]
                out_f.write(orig_word + " " + label + "\n")
            out_f.write("\n")
Example #2

def train_network(config, saver, parser, embeddings, train_examples, dev_set,
                  test_set):
    best_dev_UAS = 0
    model = ParserModel(config, embeddings, parser)
    parser.model = model
    for epoch in range(config.n_epochs):
        print("Epoch {:} out of {:}".format(epoch + 1, config.n_epochs))
        dev_UAS = run_epoch(model, config, parser, train_examples, dev_set)
        if dev_UAS > best_dev_UAS:
            best_dev_UAS = dev_UAS
            if not saver:
                print("New best dev UAS! Saving model in ./data/weights/parser.weights")
                dy.save('./data/weights/parser.weights',
                        [model.pW, model.pB1, model.pU, model.pB2])

    if saver:
        print(80 * "=")
        print("TESTING")
        print(80 * "=")
        print("Restoring the best model weights found on the dev set")
        model.pW, model.pB1, model.pU, model.pB2 = dy.load(
            './data/weights/parser.weights', model.m)
        print("Final evaluation on test set", end=" ")
        UAS, dependencies = parser.parse(test_set)
        print("- test UAS: {:.2f}".format(UAS * 100.0))
        print("Writing predictions")
        with open('q2_test.predicted.pkl', 'wb') as f:  # binary mode for pickle
            pickle.dump(dependencies, f, -1)
        print("Done!")
Example #3

    def build_model(self, pc, best_model_path):

        if best_model_path:
            print('Loading model from: {}'.format(best_model_path))
            self.RNN, self.VOCAB_LOOKUP, self.R, self.bias = dy.load(best_model_path, pc)
        else:
            # LSTM
            self.RNN = dy.CoupledLSTMBuilder(self.hyperparams['LAYERS'], self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM'], pc)

            # embedding lookups for vocabulary
            self.VOCAB_LOOKUP = pc.add_lookup_parameters((self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))

            # softmax parameters
            self.R = pc.add_parameters((self.hyperparams['VOCAB_SIZE'], self.hyperparams['HIDDEN_DIM']))
            self.bias = pc.add_parameters(self.hyperparams['VOCAB_SIZE'])

        print('Model dimensions:')
        print(' * VOCABULARY EMBEDDING LAYER: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))
        print()
        print(' * LSTM: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM']))
        print(' LSTM has {} layer(s)'.format(self.hyperparams['LAYERS']))
        print()
        print(' * SOFTMAX: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['HIDDEN_DIM'], self.hyperparams['VOCAB_SIZE']))
        print()
Example #4
    def __init__(self, c2i, num_lstm_layers=-1,\
                char_dim=-1, hidden_dim=-1, word_embedding_dim=-1, file=None):
        self.c2i = c2i
        self._model = dy.Model()
        if file is None:
            # Char LSTM Parameters
            self.char_lookup = self._model.add_lookup_parameters((len(c2i), char_dim))
            self.char_fwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self._model)
            self.char_bwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self._model)

            # Post-LSTM Parameters
            self.lstm_to_rep_params = self._model.add_parameters((word_embedding_dim, hidden_dim * 2))
            self.lstm_to_rep_bias = self._model.add_parameters(word_embedding_dim)
            self.mlp_out = self._model.add_parameters((word_embedding_dim, word_embedding_dim))
            self.mlp_out_bias = self._model.add_parameters(word_embedding_dim)
        else:
            # read from saved file. c2i mapping to be read by calling function (for now)
            pc = dy.ParameterCollection()
            model_members = iter(dy.load(file,pc))
            # model_members = iter(self._model.load(file))
            self.char_lookup = next(model_members)
            self.char_fwd_lstm = next(model_members)
            self.char_bwd_lstm = next(model_members)
            self.lstm_to_rep_params = next(model_members)
            self.lstm_to_rep_bias = next(model_members)
            self.mlp_out = next(model_members)
            self.mlp_out_bias = next(model_members)
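
This unpacking works only because dy.load yields objects in exactly the order they were passed to dy.save. A sketch of a matching save method, assumed (it is not shown on this page) to live on the same class:

    def save_to_disk(self, file):
        # order must mirror the next(model_members) sequence in __init__
        dy.save(file, [self.char_lookup,
                       self.char_fwd_lstm, self.char_bwd_lstm,
                       self.lstm_to_rep_params, self.lstm_to_rep_bias,
                       self.mlp_out, self.mlp_out_bias])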
Example #5
def run_test(args):
    print("Loading test trees from {}...".format(args.test_path))
    test_treebank = trees.load_trees(args.test_path)
    print("Loaded {:,} test examples.".format(len(test_treebank)))

    print("Loading model from {}...".format(args.model_path_base))
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)

    print("Parsing test sentences...")

    start_time = time.time()

    test_predicted = []
    for tree in test_treebank:
        dy.renew_cg()
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
        predicted, _ = parser.parse(sentence)
        test_predicted.append(predicted.convert())

    test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted,
                                 args.parser_type)

    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))
Example #6
 def user_load_model(path, model):
     edm = dy.load(path + '/edm', model)
     print("Back from spec")
     with open(path + '/params.pkl', 'rb') as f:
         M = pickle.load(f)
     print(edm, M)
     edm.set_M(M)
     return edm
Example #7
def save_components(args):
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)
    parser.f_label.param_collection().save(args.save_path, '/f-label', append=False)
    parser.f_tag.param_collection().save(args.save_path, '/f-tag', append=True)
    parser.f_encoding.param_collection().save(args.save_path, '/f-encoding', append=True)
    parser.word_embeddings.save(args.save_path, '/word-embedding', append=True)
    parser.lstm.param_collection().save(args.save_path, '/lstm', append=True)
Example #8
 def test_save_load(self):
     self.p.forward()
     self.p.backward()
     self.t.update()
     dy.renew_cg()
     v1 = self.p.value()
     dy.save(self.file, [self.p])
     [p2] = dy.load(self.file, self.m2)
     v2 = p2.value()
     self.assertTrue(np.allclose(v1, v2))
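
The fixtures this test relies on are not shown on this page; a minimal setUp that would make the round trip self-contained might look like the sketch below (a scalar parameter so that backward() is well-defined; the path and dimensions are illustrative, and os/tempfile are assumed imported):

 def setUp(self):
     self.file = os.path.join(tempfile.gettempdir(), "dynet-io-test")
     self.m = dy.ParameterCollection()     # collection that owns self.p
     self.m2 = dy.ParameterCollection()    # fresh collection to load into
     self.p = self.m.add_parameters((1,))  # parameters act as expressions in DyNet 2.1+
     self.t = dy.SimpleSGDTrainer(self.m)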
Example #9
 def __init__(self, type, model_file, input_file):
     self.type = type
     self.input_file = input_file
     self.sequences = read_file(input_file)
     model = dn.Model()
     self.model = dn.load("model_type" + self.type, model)[0]
     self.vocab, self.tags, self.chars = pickle.load(
         open(model_file + self.type + ".vocab", "rb"))
     self.tags_to_ix = {id: tag for tag, id in self.tags.items()}
     self.define_data()
Example #11

    def __init__(self, model_path):
        model_folder = model_path
        best_model_path = model_folder + '/bestmodel.txt'
        hyperparams_file = model_folder + '/best.dev'

        hyperparams_file_reader = codecs.open(hyperparams_file, 'r', 'utf-8')
        hyperparams_dict = dict([
            line.strip().split(' = ')
            for line in hyperparams_file_reader.readlines()
        ])
        self.hyperparams = {
            'INPUT_DIM': int(hyperparams_dict['INPUT_DIM']),
            'HIDDEN_DIM': int(hyperparams_dict['HIDDEN_DIM']),
            #'FEAT_INPUT_DIM': int(hyperparams_dict['FEAT_INPUT_DIM']),
            'LAYERS': int(hyperparams_dict['LAYERS']),
            'VOCAB_PATH': hyperparams_dict['VOCAB_PATH'],
            'OVER_SEGS': 'OVER_SEGS' in hyperparams_dict
        }

        self.pc = dy.ParameterCollection()

        print('Loading vocabulary from {}:'.format(
            self.hyperparams['VOCAB_PATH']))
        self.vocab = Vocab.from_file(self.hyperparams['VOCAB_PATH'])
        #        BEGIN_CHAR   = u'<s>'
        #        STOP_CHAR   = u'</s>'
        #        UNK_CHAR = u'<unk>'
        #        self.BEGIN   = self.vocab.w2i[BEGIN_CHAR]
        #        self.STOP   = self.vocab.w2i[STOP_CHAR]
        #        self.UNK       = self.vocab.w2i[UNK_CHAR]
        self.BEGIN = utils.GO_ID
        self.STOP = utils.EOS_ID
        self.UNK = utils.UNK_ID
        self.hyperparams['VOCAB_SIZE'] = self.vocab.size()

        print('Model hyperparameters:')
        for k, v in self.hyperparams.items():
            print('{:20} = {}'.format(k, v))
        print()

        print('Loading model from: {}'.format(best_model_path))
        self.RNN, self.VOCAB_LOOKUP, self.R, self.bias = dy.load(
            best_model_path, self.pc)

        print('Model dimensions:')
        print(' * VOCABULARY EMBEDDING LAYER: IN-DIM: {}, OUT-DIM: {}'.format(
            self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))
        print()
        print(' * LSTM: IN-DIM: {}, OUT-DIM: {}'.format(
            self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM']))
        print(' LSTM has {} layer(s)'.format(self.hyperparams['LAYERS']))
        print()
        print(' * SOFTMAX: IN-DIM: {}, OUT-DIM: {}'.format(
            self.hyperparams['HIDDEN_DIM'], self.hyperparams['VOCAB_SIZE']))
        print()
Example #12
    def __init__(self,
                 options,
                 train_sentences=None,
                 restore_file=None,
                 statistics=None):
        self.model = dn.Model()

        random.seed(1)
        self.trainer = dn.AdamTrainer(self.model)

        self.activation = activations[options.activation]
        # self.decoder = decoders[options.decoder]

        self.labelsFlag = options.labelsFlag
        self.costaugFlag = options.cost_augment
        self.options = options

        if "func" in options:
            del options.func

        if restore_file:
            self.container, = dn.load(restore_file, self.model)
            networks = list(self.container.components)
            self.network = networks.pop(0)
            self.statistics = statistics
            self.has_emptys = len(statistics.emptys) > 0
            if self.has_emptys:
                self.network_for_emptys = networks.pop(0)
            if self.options.use_2nd:
                self.network3 = networks.pop(0)
                if self.has_emptys:
                    self.network3_for_emptys_mid = networks.pop(0)
                    self.network3_for_emptys_out = networks.pop(0)
            assert not networks
        else:
            self.container = nn.Container(self.model)
            self.statistics = statistics = StatisticsWithEmpty.from_sentences(
                train_sentences)
            self.has_emptys = len(statistics.emptys) > 0
            self.network = EdgeEvaluationNetwork(self.container, statistics,
                                                 options)
            if self.has_emptys:
                self.network_for_emptys = EdgeEvaluation(
                    self.container, options)
            if options.use_2nd:
                self.network3 = EdgeSiblingEvaluation(self.container, options)
                if self.has_emptys:
                    self.network3_for_emptys_mid = EdgeSiblingEvaluation(
                        self.container, options)
                    self.network3_for_emptys_out = EdgeSiblingEvaluation(
                        self.container, options)
Example #13
 def __init__(self, nclass, paramcol, loadname=None):
     '''
     @param nclass: int, number of classes to be classified 
     @param paramcol: parameter collection that is to hold the local 
     parameters in CRF
     @param loadname: string, default=None, if it is not None, load 
     parameters instead of creating them from scratch, taking 
     loadname as the basename used in dy.load()
     '''
     if loadname is None:
         self.d = nclass
         self.pb = paramcol.add_parameters((nclass, ))
         self.pe = paramcol.add_parameters((nclass, ))
         self.pT = paramcol.add_parameters((nclass, nclass))
     else:
         self.pb, self.pe, self.pT = dy.load(loadname, paramcol)
         self.d = self.pT.shape()[0]
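
A usage sketch covering both constructor paths, assuming the class is named CRF (the name is not shown on this page):

 pc = dy.ParameterCollection()
 crf = CRF(5, pc)                                # fresh parameters for 5 classes
 dy.save("crf-model", [crf.pb, crf.pe, crf.pT])  # persist in the order __init__ reloads

 pc2 = dy.ParameterCollection()
 crf2 = CRF(5, pc2, loadname="crf-model")        # restore; self.d inferred from pT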
Example #14
def run_test(args):
    #args.test_path = args.test_path.replace('[*]', args.treetype)
    print("Loading test trees from {}...".format(args.test_path))
    test_treebank = trees.load_trees(args.test_path, args.normal)
    print("Loaded {:,} test examples.".format(len(test_treebank)))

    print("Loading model from {}...".format(args.model_path_base))
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)

    label_vocab = vocabulary.Vocabulary()

    label_list = util.load_label_list('../data/labels.txt')
    for item in label_list:
        label_vocab.index((item, ))
    label_vocab.index((parse.EMPTY, ))
    for item in label_list:
        label_vocab.index((item + "'", ))

    label_vocab.freeze()
    latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel)

    print("Parsing test sentences...")

    start_time = time.time()

    test_predicted = []
    test_gold = latent_tree.build_latent_trees(test_treebank)
    for x, chunks in test_treebank:
        dy.renew_cg()
        #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
        sentence = [(parse.XX, ch) for ch in x]
        predicted, _ = parser.parse(sentence)
        test_predicted.append(predicted.convert())

    #test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted, args.expname + '.test.')
    test_fscore = evaluate.eval_chunks(args.evalb_dir,
                                       test_gold,
                                       test_predicted,
                                       output_filename=args.expname +
                                       '.finaltest.txt')  # evalb
    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))
Example #15
def run_test2(args):

    model = dy.ParameterCollection()
    # [parser] = dy.load(args.model_path_base, model)
    [parser] = dy.load(
        "models/chartdyRBTC-model_addr_dytree_giga_0.4_200_1_chartdyRBTC_dytree_1_houseno_0_0_dev=0.90",
        model)

    test_chunk_insts = util.read_chunks(args.test_path, args.normal)

    # ftreelog = open(args.expname + '.test.predtree.txt', 'w', encoding='utf-8')
    ftreelog = open('aaa' + '.test.predtree.txt', 'w', encoding='utf-8')
    test_predicted = []
    test_start_time = time.time()
    test_gold = []
    for inst in test_chunk_insts:
        chunks = util.inst2chunks(inst)
        test_gold.append(chunks)

    for x, chunks in test_chunk_insts:
        dy.renew_cg()
        sentence = [(parse.XX, ch) for ch in x]
        predicted, _ = parser.parse(sentence)
        pred_tree = predicted.convert()
        ftreelog.write(pred_tree.linearize() + '\n')
        test_predicted.append(pred_tree.to_chunks())

    ftreelog.close()

    # test_fscore = evaluate.eval_chunks2(args.evalb_dir, test_gold, test_predicted, output_filename=args.expname + '.test.txt')  # evalb
    test_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                        test_gold,
                                        test_predicted,
                                        output_filename='aaaabbbb' +
                                        '.test.txt')  # evalb

    print("test-fscore {} "
          "test-elapsed {} ".format(
              test_fscore,
              format_elapsed(test_start_time),
          ))
Example #16
 def load_model(path, model_version):
     full_saving_path = os.path.join(path, model_version)
     new_model_obj = pickle.load(open(full_saving_path + ".p", "rb"))
     model_to_load = dy.ParameterCollection()
     W_emb, W_cnn, b_cnn, W_mlp, b_mlp, V_mlp, a_mlp = dy.load(
         full_saving_path, model_to_load)
     new_model_obj.W_emb = W_emb
     new_model_obj.W_cnn = W_cnn
     new_model_obj.b_cnn = b_cnn
     new_model_obj.W_mlp = W_mlp
     new_model_obj.b_mlp = b_mlp
     new_model_obj.V_mlp = V_mlp
     new_model_obj.a_mlp = a_mlp
     # restore defaultdict behavior: the lambda default factory cannot be
     # pickled, so w2i/t2i round-trip through plain dicts
     new_model_obj.w2i = defaultdict(lambda: len(new_model_obj.w2i),
                                     new_model_obj.w2i)
     new_model_obj.t2i = defaultdict(lambda: len(new_model_obj.t2i),
                                     new_model_obj.t2i)
     new_model_obj.model = model_to_load
     return new_model_obj
Example #17
def run_test(args):
    if not os.path.exists(args.experiment_directory):
        os.mkdir(args.experiment_directory)
    print("Loading test trees from {}...".format(args.input_file))

    test_treebank = trees.load_trees(args.input_file)
    test_tokenized_lines = parse_trees_to_string_lines(test_treebank)
    test_embeddings_file = compute_elmo_embeddings(test_tokenized_lines,
                                                   os.path.join(
                                                       args.experiment_directory,
                                                       'test_embeddings'))

    print("Loaded {:,} test examples.".format(len(test_treebank)))

    print("Loading model from {}...".format(args.model_path))
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path, model)

    print("Parsing test sentences...")
    check_performance(parser, test_treebank, test_embeddings_file, args)
Example #18
 def __init__(self, indim, hdim, paramcol, loadname=None):
     '''
     @param indim: int, input dimension of biLSTM
     @param hdim: int, hidden state dimension of both forward 
     and backward LSTM
     @param paramcol: parameter collection that is to hold the 
     local parameters in biLSTM
     @param loadname: string, default=None, if it is not None, 
     load parameters instead of creating them from scratch, 
     taking loadname as the basename used in dy.load()
     '''
     if loadname is None:
         self.flstm = dy.VanillaLSTMBuilder(1, indim, hdim, paramcol)
         self.blstm = dy.VanillaLSTMBuilder(1, indim, hdim, paramcol)
         # self.flstm = dy.LSTMBuilder(1, indim, hdim, paramcol)
         # self.blstm = dy.LSTMBuilder(1, indim, hdim, paramcol)
         self.flstm.set_dropouts(config.dropout, config.dropout)
         self.blstm.set_dropouts(config.dropout, config.dropout)
     else:
         self.flstm, self.blstm = dy.load(loadname, paramcol)
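
And symmetrically for this biLSTM wrapper (class name BiLSTM assumed, dimensions illustrative):

 pc = dy.ParameterCollection()
 enc = BiLSTM(100, 128, pc)                       # fresh forward/backward LSTMs
 dy.save("bilstm-model", [enc.flstm, enc.blstm])  # same order as the dy.load above
 enc2 = BiLSTM(100, 128, dy.ParameterCollection(), loadname="bilstm-model")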
Example #19
def model_load_helper(mode, prefix, model):
    """
    Save/Load helper for backward compatibly.
    It save/load options and model.
    """
    if mode is None:
        mode = detect_saved_model_type(prefix)

    if mode == "dynet":
        with open(prefix + ".options", "rb") as f:
            options = pickle.load(f)
        return options, dn.load(prefix, model)[0]
    elif mode == "pickle":
        with open(prefix, "rb") as f:
            options, picklable = pickle.load(f)
        return options, DynetSaveable.from_picklable_obj(picklable, model)
    elif mode == "pickle-gzip":
        with open(prefix + ".gz", "rb") as f:
            options, picklable = pickle.load(f)
        return options, DynetSaveable.from_picklable_obj(picklable, model)
    else:
        raise TypeError("Invalid model format.")
Example #20
 def load_model(self, d):
     self.init_model()
     param_keys = d["param_keys"]
     self.max_num_labels = d["max_num_labels"]
     Config().args.layers = self.layers = d["layers"]
     Config().args.layer_dim = self.layer_dim = d["layer_dim"]
     Config().args.output_dim = self.output_dim = d.get(
         "output_dim",
         Config().args.output_dim)
     Config().args.activation = self.activation_str = d["activation"]
     self.activation = ACTIVATIONS[self.activation_str]
     Config().args.init = self.init_str = d["init"]
     self.init = INITIALIZERS[self.init_str]
     self.load_extra(d)
     print("Loading model from '%s'... " % self.filename,
           end="",
           flush=True)
     started = time.time()
     try:
         param_values = dy.load(self.filename, self.model)
         print("Done (%.3fs)." % (time.time() - started))
         self.params = OrderedDict(zip(param_keys, param_values))
     except KeyError as e:
         print("Failed loading model: %s" % e)
Example #21
    def __init__(self, options, train_sentences=None, restore_file=None):
        self.model = dn.Model()
        random.seed(1)
        self.trainer = dn.AdamTrainer(self.model)

        self.activation = activations[options.activation]
        self.decoder = decoders[options.decoder]

        self.labelsFlag = options.labelsFlag
        self.costaugFlag = options.cost_augment
        self.options = options

        if "func" in options:
            del options.func

        if restore_file:
            self.container, = dn.load(restore_file, self.model)
            self.network, self.network3 = self.container.components
        else:
            self.container = nn.Container(self.model)
            statistics = Statistics.from_sentences(train_sentences)
            self.network = EdgeEvaluationNetwork(self.container, statistics,
                                                 options)
            self.network3 = EdgeSiblingEvaluation(self.container, options)
Example #22
    def load_proposal_model(self, dir):
        """Load the proposal model to sample with."""
        assert os.path.isdir(dir), dir

        print(f'Loading proposal model from `{dir}`...')
        model_checkpoint_path = os.path.join(dir, 'model')
        state_checkpoint_path = os.path.join(dir, 'state.json')
        [proposal] = dy.load(model_checkpoint_path, dy.ParameterCollection())

        assert (isinstance(proposal, DiscRNNG)
                or isinstance(proposal, ChartParser)), type(proposal)

        with open(state_checkpoint_path, 'r') as f:
            state = json.load(f)
            epochs = state['epochs']
            fscore = state['test-fscore']

        print(
            f'Loaded model trained for {epochs} epochs with test-fscore {fscore}.'
        )

        self.proposal = proposal
        self.proposal.eval()
        self.use_loaded_samples = False
Example #23
def run_train(args):

    args.numpy_seed = seed
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    if args.trial == 1:
        args.train_path = 'data/trial.txt'
        args.dev_path = 'data/trial.txt'
        args.test_path = 'data/trial.txt'

    # args.train_path = args.train_path.replace('[*]', args.treetype)
    # args.dev_path = args.dev_path.replace('[*]', args.treetype)
    # args.test_path = args.test_path.replace('[*]', args.treetype)

    print("Loading training trees from {}...".format(args.train_path))
    train_chunk_insts = util.read_chunks(args.train_path, args.normal)
    print("Loaded {:,} training examples.".format(len(train_chunk_insts)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_chunk_insts = util.read_chunks(args.dev_path, args.normal)
    print("Loaded {:,} development examples.".format(len(dev_chunk_insts)))

    print("Loading test trees from {}...".format(args.test_path))
    test_chunk_insts = util.read_chunks(args.test_path, args.normal)
    print("Loaded {:,} test examples.".format(len(test_chunk_insts)))

    # print("Processing trees for training...")
    # train_parse = [tree.convert() for tree in train_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(parse.START)
    tag_vocab.index(parse.STOP)
    tag_vocab.index(parse.XX)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(parse.START)
    word_vocab.index(parse.STOP)
    word_vocab.index(parse.UNK)
    word_vocab.index(parse.NUM)

    for x, chunks in train_chunk_insts + dev_chunk_insts + test_chunk_insts:
        for ch in x:
            word_vocab.index(ch)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    label_list = util.load_label_list(args.labellist_path)  #'data/labels.txt')
    for item in label_list:
        label_vocab.index((item, ))

    if args.nontlabelstyle != 1:
        for item in label_list:
            label_vocab.index((item + "'", ))

    if args.nontlabelstyle == 1:
        label_vocab.index((parse.EMPTY, ))

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()

    latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel,
                                             args.nontlabelstyle)

    def print_vocabulary(name, vocab):
        special = {parse.START, parse.STOP, parse.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")

    pretrain = {'giga': 'data/giga.vec100', 'none': 'none'}
    pretrainemb = util.load_pretrain(pretrain[args.pretrainemb],
                                     args.word_embedding_dim, word_vocab)

    model = dy.ParameterCollection()
    if args.parser_type == "chartdyRBTC":
        parser = parse.ChartDynamicRBTConstraintParser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            args.tag_embedding_dim,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            args.dropout,
            (args.pretrainemb, pretrainemb),
            args.chunkencoding,
            args.trainc == 1,
            True,
            (args.zerocostchunk == 1),
        )

    else:
        print('Model is not valid!')
        exit()

    if args.loadmodel != 'none':
        tmp = dy.load(args.loadmodel, model)
        parser = tmp[0]
        print('Model is loaded from ', args.loadmodel)

    trainer = dy.AdamTrainer(model)

    total_processed = 0
    current_processed = 0
    check_every = len(train_chunk_insts) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None

    start_time = time.time()

    def check_dev():
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path

        dev_start_time = time.time()

        dev_predicted = []
        #dev_gold = []

        #dev_gold = latent_tree.build_latent_trees(dev_chunk_insts)
        dev_gold = []
        for inst in dev_chunk_insts:
            chunks = util.inst2chunks(inst)
            dev_gold.append(chunks)

        for x, chunks in dev_chunk_insts:
            dy.renew_cg()
            #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
            sentence = [(parse.XX, ch) for ch in x]
            predicted, _ = parser.parse(sentence)
            dev_predicted.append(predicted.convert().to_chunks())

        #dev_fscore = evaluate.evalb(args.evalb_dir, dev_gold, dev_predicted, args.expname + '.dev.') #evalb
        dev_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                           dev_gold,
                                           dev_predicted,
                                           output_filename=args.expname +
                                           '.dev.txt')  # evalb

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore > best_dev_fscore:
            if best_dev_model_path is not None:
                for ext in [".data", ".meta"]:
                    path = best_dev_model_path + ext
                    if os.path.exists(path):
                        print(
                            "Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            best_dev_model_path = "{}_dev={:.2f}".format(
                args.model_path_base + "_" + args.expname, dev_fscore.fscore)
            print("Saving new best model to {}...".format(best_dev_model_path))
            dy.save(best_dev_model_path, [parser])

            test_start_time = time.time()
            test_predicted = []
            #test_gold = latent_tree.build_latent_trees(test_chunk_insts)
            test_gold = []
            for inst in test_chunk_insts:
                chunks = util.inst2chunks(inst)
                test_gold.append(chunks)

            ftreelog = open(args.expname + '.test.predtree.txt',
                            'w',
                            encoding='utf-8')

            for x, chunks in test_chunk_insts:
                dy.renew_cg()
                #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
                sentence = [(parse.XX, ch) for ch in x]
                predicted, _ = parser.parse(sentence)
                pred_tree = predicted.convert()
                ftreelog.write(pred_tree.linearize() + '\n')
                test_predicted.append(pred_tree.to_chunks())

            ftreelog.close()

            #test_fscore = evaluate.evalb(args.evalb_dir, test_chunk_insts, test_predicted, args.expname + '.test.')
            test_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                                test_gold,
                                                test_predicted,
                                                output_filename=args.expname +
                                                '.test.txt')  # evalb

            print("epoch {:,} "
                  "test-fscore {} "
                  "test-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      test_fscore,
                      format_elapsed(test_start_time),
                      format_elapsed(start_time),
                  ))

    train_trees = latent_tree.build_dynamicRBT_trees(train_chunk_insts)
    train_trees = [(x, tree.convert(), chunks, latentscope)
                   for x, tree, chunks, latentscope in train_trees]

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_chunk_insts)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_chunk_insts), args.batch_size):
            dy.renew_cg()
            batch_losses = []

            for x, tree, chunks, latentscope in train_trees[
                    start_index:start_index + args.batch_size]:

                discard = False
                for chunk in chunks:
                    length = chunk[2] - chunk[1]
                    if length > args.maxllimit:
                        discard = True
                        break

                if discard:
                    continue

                sentence = [(parse.XX, ch) for ch in x]
                if args.parser_type == "top-down":
                    _, loss = parser.parse(sentence, tree, args.explore)
                else:
                    _, loss = parser.parse(sentence, tree, chunks, latentscope)
                batch_losses.append(loss)
                total_processed += 1
                current_processed += 1

            batch_loss = dy.average(batch_losses)
            batch_loss_value = batch_loss.scalar_value()
            batch_loss.backward()
            trainer.update()

            print("Epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_chunk_insts) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ),
                  flush=True)

            if current_processed >= check_every:
                current_processed -= check_every
                if epoch > 7:
                    check_dev()
Example #24
    def __init__(self, graphs, embeddings, assoc_mode=BILINEAR_MODE, reg=0.0, dropout=0.0,
                 no_assoc=False, model_path=None, ergm_path=None,
                 path_only_init=False):
        """
        :param graphs: dictionary of {relation:CSR-format graph}s, node-aligned
        :param embeddings: list of numpy array embeddings, indices aligned to nodes
        :param model_path: optional path for files with pre-trained association model (read by super)
        :param ergm_path: optional path for files with pre-trained model
        :param path_only_init: model_path only used for initialization
        """
        # input validation
        AssociationModel.__init__(self, graphs, embeddings, assoc_mode, dropout, model_path=model_path)

        # raw members
        self.no_assoc = no_assoc
        self.regularize = reg

        # cache members
        self.cache = {}
        self.edge_counts = self.add_cache_dict('ec')                   # keys are single relations
        self.mutual_edge_counts = self.add_cache_dict('mec')           # keys are unordered relation pairs
        self.two_path_counts = self.add_cache_dict('tpc')              # keys are ordered relation pairs
        self.transitive_closure_counts = self.add_cache_dict('tcc')    # keys are ordered relation triplets
        self.directed_triangle_counts = self.add_cache_dict('dtc')     # keys are ordered relation triplets
        self.in_degs = self.add_cache_dict('ins')                      # keys are single relations, values are big lists
        self.out_degs = self.add_cache_dict('outs')                    # keys are single relations, values are big lists
        self.in_one_star_counts = self.add_cache_dict('i1sc')          # keys are single relations
        self.out_one_star_counts = self.add_cache_dict('o1sc')         # keys are single relations
        self.in_two_star_counts = self.add_cache_dict('i2sc')          # keys are unordered relation pairs
        self.out_two_star_counts = self.add_cache_dict('o2sc')         # keys are unordered relation pairs
        self.in_three_star_counts = self.add_cache_dict('i3sc')        # keys are unordered relation triplets
        self.out_three_star_counts = self.add_cache_dict('o3sc')       # keys are unordered relation triplets
        # 'at least k' stars - 'one/two/three plus'
        self.in_one_p_star_counts = self.add_cache_dict('i1psc')       # keys are single relations
        self.out_one_p_star_counts = self.add_cache_dict('o1psc')      # keys are single relations
        self.in_two_p_star_counts = self.add_cache_dict('i2psc')       # keys are unordered relation pairs
        self.out_two_p_star_counts = self.add_cache_dict('o2psc')      # keys are unordered relation pairs
        self.in_three_p_star_counts = self.add_cache_dict('i3psc')     # keys are unordered relation triplets
        self.out_three_p_star_counts = self.add_cache_dict('o3psc')    # keys are unordered relation triplets

        self.missing_node_indices = []          # updates during training (NOT SURE IF NEEDED)

        timeprint('computing ERGM features...')
        self.init_ergm_features()               # populates self.feature_vals
        timeprint('finished! computed {} features'.format(len(self.feature_vals)))
        timeprint('{} non-zero features'.format(np.count_nonzero(list(self.feature_vals.values()))))

        # documentationy again, for efficient updates
        encountered_features = list(self.feature_vals.keys()) # canonical ordering from now on
        
        if ergm_path is not None:
            ergm_model_path = ergm_path
        elif (model_path is not None) and (not path_only_init):
            ergm_model_path = model_path
        else:
            ergm_model_path = None
                
        if ergm_model_path is None:
            self.feature_set = encountered_features
        else:
            self.feature_set = pickle.load(open(ergm_model_path + '.feats', 'rb'))
            assert sorted(self.feature_set) == sorted(encountered_features)
        
        if ergm_model_path is None:
            self.ergm_weights = self.model.add_parameters(len(self.feature_set))
        
        if model_path is None and ergm_model_path is None:
            # 'model_path is not None' is initialized in super()
            # TODO support other association modes (affects downstream)
            if self.no_assoc:
                self.word_assoc_weights = {r:self.model.add_parameters((self.emb_dim, self.emb_dim), init=dy.ConstInitializer(0.0)) for r in self.relation_names}
            else:
                self.word_assoc_weights = {r:self.model.add_parameters((self.emb_dim, self.emb_dim)) for r in self.relation_names}
        elif ergm_model_path is not None:
            pc = dy.ParameterCollection()
            dy.load(ergm_model_path + '.dyn', pc)
            pc_list = pc.parameters_list()
            i = 0
            self.ergm_weights = pc_list[i]
            if not path_only_init:
                self.word_assoc_weights = {}
                rel_order = self.relation_names
                for r in rel_order:
                    i += 1
                    self.word_assoc_weights[r] = pc_list[i]
                i += 1
                assert i == len(pc_list),\
                       '{} relation params read but length is {}'.format(i, len(pc_list))
        
        self.dy_score = self.ergm_score()
        self.score = self.dy_score.scalar_value()

        self.score_is_stale = False

        timeprint('finished initialization. initial ERGM score = {}'.format(self.score))
Example #25
 def test_save_load(self):
     dy.save(self.file, [self.b])
     [b] = dy.load(self.file, self.m2)
Example #26
def load_model(model_file_name):
    m = dy.ParameterCollection()
    return [m] + list(dy.load(model_file_name, m))
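
The return convention puts the fresh ParameterCollection first, followed by whatever objects were saved, so a caller that stored a single parser object would unpack it as below (basename illustrative):

pc, parser = load_model("parser-model")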
Example #27
    LAS, UAS = [
        float(line.strip().split()[-2])
        for line in open('score_tmp').readlines()[:2]
    ]
    print('LAS %.2f, UAS %.2f' % (LAS, UAS))
    #os.system('rm tmp score_tmp')
    return LAS, UAS


if __name__ == "__main__":
    args, config = parser_arg_cfg()

    # load model with high level save/load API
    load_model_path = config.get("load", "load_model_path")
    pc = dy.ParameterCollection()
    biaffine_parser = dy.load(load_model_path, pc)[0]

    # get vocabs from the model, which is then used for create fields
    vocab_form, vocab_upos, vocab_deprel = biaffine_parser.vocab_form, biaffine_parser.vocab_pos, biaffine_parser.vocab_deprel

    # create data fields for building test dataset, vocabs is extracted from model
    # instead of built from data itself
    f_form = pytext.data.Field(lower=True, tokenize=list, include_lengths=True)
    f_upos = pytext.data.Field(tokenize=list)
    f_head = pytext.data.Field(use_vocab=False, pad_token=0)
    f_deprel = pytext.data.Field(tokenize=list)
    f_form.vocab = vocab_form
    f_upos.vocab = vocab_upos
    f_deprel.vocab = vocab_deprel

    # build test dataset
Example #28
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training',
                           action='store_true',
                           help='Load model Continue Training')
    argparser.add_argument('--name',
                           default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model',
                           default='s2s',
                           help='s2s: seq2seq-head-selection-model'
                           's2tBFS: seq2tree-BFS-decoder-model'
                           's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 to cpu)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',
    )
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {
        'train': DatasetSetting(cfg.TRAIN, True),
        'dev': DatasetSetting(cfg.DEV, False),
        'test': DatasetSetting(cfg.TEST, False),
    }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters,
                           no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    trainer = dy.AdamTrainer(pc,
                             alpha=cfg.LR,
                             beta_1=cfg.ADAM_BETA1,
                             beta_2=cfg.ADAM_BETA2,
                             eps=cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary)
    # BiLSTM Encoder Layer
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim,
                               cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)

    # Build Training Batch
    def cmp(ins):
        return len(ins['word'])

    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True,
                                       cmp, True)
    valid_batch = list(
        datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False, cmp, False))
    test_batch = list(
        datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp, False))

    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = cnt_iter = 0
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        dy.renew_cg()
        cnt_iter += 1
        indexes, masks, truth = train_batch.__next__()
        vectors = token_repre(indexes, True)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, True)
        loss, part_loss = decoder(vectors, masks, truth, True, True)
        for i, l in enumerate([loss] + part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        trainer.learning_rate = cfg.LR * cfg.LR_DECAY**(cnt_iter /
                                                        cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER:
            continue

        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter / cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))

        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter / cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)

    my_eval.clear('Test')
    test_batch = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp,
                                      False)
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False)
        pred = decoder(vectors, masks, None, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
Example #29
 def load_from_disk(self, filename):
     # NOTE: `model` must be the ParameterCollection in the enclosing scope;
     # storing it as self.model would make this method self-contained
     (self.builder, self.lookup, self.R, self.bias) = dy.load(filename, model)
Example #30
def load_or_create_model(args, parses_for_vocab):
    components = args.model_path_base.split('/')
    directory = '/'.join(components[:-1])
    if os.path.isdir(directory):
        relevant_files = [f for f in os.listdir(directory) if f.startswith(components[-1])]
    else:
        relevant_files = []
    assert len(relevant_files) <= 2, "Multiple possibilities {}".format(relevant_files)
    if len(relevant_files) > 0:
        print("Loading model from {}...".format(args.model_path_base))

        model = dy.ParameterCollection()
        [parser] = dy.load(args.model_path_base, model)
    else:
        assert parses_for_vocab is not None
        print("Constructing vocabularies using train parses...")

        tag_vocab = vocabulary.Vocabulary()
        tag_vocab.index(parse.START)
        tag_vocab.index(parse.STOP)

        word_vocab = vocabulary.Vocabulary()
        word_vocab.index(parse.START)
        word_vocab.index(parse.STOP)
        word_vocab.index(parse.UNK)

        label_vocab = vocabulary.Vocabulary()
        label_vocab.index(())

        for tree in parses_for_vocab:
            nodes = [tree]
            while nodes:
                node = nodes.pop()
                if isinstance(node, trees.InternalParseNode):
                    label_vocab.index(node.label)
                    nodes.extend(reversed(node.children))
                else:
                    assert isinstance(node, trees.LeafParseNode)
                    tag_vocab.index(node.tag)
                    word_vocab.index(node.word)

        tag_vocab.freeze()
        word_vocab.freeze()
        label_vocab.freeze()

        print("Initializing model...")
        model = dy.ParameterCollection()
        parser = parse.Parser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            None,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            None,
            args.dropout,
            not args.no_elmo
        )
    return parser, model
Example #32
def test_on_parses(args):
    if not os.path.exists(args.experiment_directory):
        os.mkdir(args.experiment_directory)
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)

    treebank = trees.load_trees(args.input_file, strip_top=True, filter_none=True)
    output = [tree.linearize() for tree in treebank]
    with open(os.path.join(args.experiment_directory, 'parses.txt'), 'w') as f:
        f.write('\n'.join(output))
    sentence_embeddings = h5py.File(args.elmo_embeddings_file_path, 'r')

    test_predicted = []
    start_time = time.time()
    total_log_likelihood = 0
    total_confusion_matrix = {}
    total_turned_off = 0
    ranks = []
    num_correct = 0
    total = 0
    for tree_index, tree in enumerate(treebank):
        if tree_index % 100 == 0:
            print(tree_index)
            dy.renew_cg()
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves]
        elmo_embeddings_np = sentence_embeddings[str(tree_index)][:, :, :]
        assert elmo_embeddings_np.shape[1] == len(sentence), (
            elmo_embeddings_np.shape[1], len(sentence), [word for pos, word in sentence])
        elmo_embeddings = dy.inputTensor(elmo_embeddings_np)
        predicted, (additional_info, c, t) = parser.span_parser(sentence, is_train=False,
                                                                elmo_embeddings=elmo_embeddings)
        num_correct += c
        total += t
        rank = additional_info[3]
        ranks.append(rank)
        total_log_likelihood += additional_info[-1]
        test_predicted.append(predicted.convert())
    print('pos accuracy', num_correct / total)
    print("total time", time.time() - start_time)
    print("total loglikelihood", total_log_likelihood)
    print("total turned off", total_turned_off)
    print(total_confusion_matrix)

    print(ranks)
    print("avg", np.mean(ranks), "median", np.median(ranks))

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=True,
                                               name="without-labels")
    print("dev-fscore without labels", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=True,
                                               flatten=True,
                                               name="without-label-flattened")
    print("dev-fscore without labels and flattened", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=False,
                                               flatten=True,
                                               name="flattened")
    print("dev-fscore with labels and flattened", dev_fscore_without_labels)

    test_fscore = evaluate.evalb('EVALB/', treebank, test_predicted, args=args,
                                 name="regular")

    print(
        "test-fscore {} "
        "test-elapsed {}".format(
            test_fscore,
            format_elapsed(start_time),
        )
    )
    with open(os.path.join(args.experiment_directory, "confusion_matrix.pickle"), "wb") as f:
        pickle.dump(total_confusion_matrix, f)
Example #33
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model Continue Training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model'
                           's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 to cpu)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder
    from models.jackknife_decoder import JackKnifeGraphNNDecoder

    

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'}, no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    LR = 0.0005
    trainer = dy.AdamTrainer(pc, LR, cfg.ADAM_BETA1, cfg.ADAM_BETA2, cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary, include_pos=True)
    # BiLSTM Encoder Layer
    #encoder = BiaffineAttention()
    #encoder = MultiHeadedAttention(pc, 10, token_repre.token_dim)
    #encoder = MultiLayerMultiHeadAttention(pc, 10, token_repre.token_dim, num_layers=1)
    #encoder = MyMultiHeadAttention(None, 6, token_repre.token_dim, 32, 32, model=pc)
    
    #encoder = LabelAttention(None, token_repre.token_dim, 128, 128, 112, 128, use_resdrop=True, q_as_matrix=False, residual_dropout=0.1, attention_dropout=0.1, d_positional=None, model=pc)
    # encoder = Encoder(None, token_repre.token_dim,
    #                 num_layers=1, num_heads=2, d_kv = 32, d_ff=1024, d_l=112,
    #                 d_positional=None,
    #                 num_layers_position_only=0,
    #                 relu_dropout=0.1, residual_dropout=0.1, attention_dropout=0.1,
    #                 use_lal=True,
    #                 lal_d_kv=128,
    #                 lal_d_proj=128,
    #                 lal_resdrop=True,
    #                 lal_pwff=True,
    #                 lal_q_as_matrix=False,
    #                 lal_partitioned=True,
    #                 model=pc)
    #encoder = ScaledDotProductAttention(pc, 10)
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim, cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)

    #decoder = JackKnifeGraphNNDecoder(pc, cfg, datasets.vocabulary)
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)

    #dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
    #exit(0)

    # Build Training Batch
    def cmp(ins):
        return len(ins['word'])
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True, cmp, True)
    valid_batch = list(datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False, cmp, False))
    test_batch = list(datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp, False))

    #print('-----------------------')
    # print('TRAIN BATCH IS: ')
    # # print(train_batch)
    # indexes, masks, truth = train_batch.__next__()
    # print(indexes)
    # print('------------------',end='\n\n\n\n\n\n\n')
    # print(len(indexes))
    # exit(0)
    # exit(0)
    # for k in indexes:
    #     print(k)
    #print(indexes)
    #print(masks)


    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = 0
    cnt_iter = -cfg.WARM * cfg.GRAPH_LAYERS
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS+3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        print(cnt_iter, cfg.MAX_ITER)
        #dy.renew_cg()
        dy.renew_cg(immediate_compute=True, check_validity=True)
        cnt_iter += 1
        indexes, masks, truth = train_batch.__next__()
        vectors = token_repre(indexes, True)
        
        

        #vectors = encoder(vectors, np.array(masks['1D']).T)
        
        #print(vectors.npvalue)
        #vectors= encoder(vectors, vectors, vectors, np.array(masks['1D']).T)
        #vectors= encoder(vectors, vectors, vectors, np.array(masks['1D']).T, cfg.RNN_DROP)

        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP, np.array(masks['1D']).T, False, True)
       
        loss, part_loss = decoder(vectors, masks, truth, cnt_iter, True, True)
        for i, l in enumerate([loss]+part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        trainer.learning_rate = LR*cfg.LR_DECAY**(max(cnt_iter, 0)/cfg.LR_ANNEAL)
        #trainer.learning_rate = cfg.LR*cfg.LR_DECAY**(max(cnt_iter, 0)/cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER: continue
        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter/cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))

        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS+3)]
        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)

            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)
            #vectors = encoder(vectors, np.array(masks['1D']).T)
            #vectors= encoder(vectors, vectors, vectors, np.array(masks['1D']).T)
            #vectors = encoder(vectors, vectors, vectors, np.array(masks['1D']).T, cfg.RNN_DROP)

            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter/cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)

            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)
            #vectors = encoder(vectors, np.array(masks['1D']).T)
            #vectors= encoder(vectors, vectors, vectors, np.array(masks['1D']).T)
            #vectors = encoder(vectors, vectors, vectors, np.array(masks['1D']).T, cfg.RNN_DROP)

            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    # Final Test
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)
    my_eval.clear('Test')
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)

        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False, False)
        #vectors = encoder(vectors, np.array(masks['1D']).T)
        #vectors= encoder(vectors, vectors, vectors, np.array(masks['1D']).T)
        #vectors = encoder(vectors, vectors, vectors, np.array(masks['1D']).T, cfg.RNN_DROP)

        pred = decoder(vectors, masks, None, 0, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)