def _train_model(_, args):
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)
    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
def _run_model(_, args):
    run_file = args.test
    out_file = args.output
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)

    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    run_data = vocab.tokenize_conll(run_file)
    predictions = parser.run(run_data, batch_size)

    write_predictions_to_file(
        predictions, reference_file=run_file, output_file=out_file, vocab=vocab
    )
    print(">> Wrote predictions to conllu file %s" % out_file)
def _eval_model(_, args):
    test_file = args.filename
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)

    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    test_data = vocab.tokenize_conll(test_file)
    metrics = parser.evaluate(test_file, test_data, batch_size=batch_size)

    for key, value in metrics.items():
        print(key, round(value, 3))
parser.add_argument("--test", required=True) parser.add_argument("--model", required=True) arguments, unknown = parser.parse_known_args() TRAIN_FILE = arguments.train DEV_FILE = arguments.dev TEST_FILE = arguments.test MODEL_FILE = arguments.model n_epochs = 5 vocab = Vocabulary() vocab.fit(TRAIN_FILE) print(">> Loading in data") TRAIN = vocab.tokenize_conll(arguments.train) DEV = vocab.tokenize_conll(arguments.dev) TEST = vocab.tokenize_conll(arguments.test) encoder = BetaEncodeHandler() print("> pre-encoding edges") s = time.time() TRAIN = pre_encode(encoder, TRAIN, accumulate_vocab=True) DEV = pre_encode(encoder, DEV) TEST = pre_encode(encoder, TEST) print(">> done pre-encoding", time.time() - s) # 5m is completely arbitrary # REQUEST: fix this to be inferred from the encoder parser = MST(5_000_000)
class EmbeddingsExtractor(object):
    def __init__(self, logging_file, model_config):
        # configure logging
        self.logging_file = logging_file
        self._configure_logging()

        self.model_config = model_config
        logging.info(model_config)

        # load vocabulary, parser and model
        self._load_model()

        # create lstms
        self._create_lstms()

    def _configure_logging(self):
        logging.basicConfig(
            filename=self.logging_file,
            level=logging.DEBUG,
            format="%(asctime)s:%(levelname)s:\t%(message)s",
        )

    def _load_model(self):
        """Load the original K&G model and vocabulary."""
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(
            self.vocab,
            self.model_config['upos_dim'],
            self.model_config['word_dim'],
            self.model_config['hidden_dim'],
        )
        self.model = ParserModel(
            self.parser,
            decoder="eisner",
            loss="kiperwasser",
            optimizer="adam",
            strategy="bucket",
            vocab=self.vocab,
        )
        self.model.load_from_file(self.model_config['model_file'])

    def _create_lstms(self):
        # create and initialize FWD and BWD biLSTMs with model parameters
        input_size = self.model_config['word_dim'] + self.model_config['upos_dim']
        state_dict = self.parser.deep_bilstm.state_dict()

        self.lstm_fwd_0 = nn.LSTM(
            input_size=input_size,
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0']
        self.lstm_fwd_0.load_state_dict(new_state_dict)

        self.lstm_bwd_0 = nn.LSTM(
            input_size=input_size,
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0_reverse']
        self.lstm_bwd_0.load_state_dict(new_state_dict)

        # NOTICE! input_size = 2*hidden_dim because layer 1 consumes the
        # concatenated fwd/bwd outputs of layer 0.
        self.lstm_fwd_1 = nn.LSTM(
            input_size=2 * self.model_config['hidden_dim'],
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1']
        self.lstm_fwd_1.load_state_dict(new_state_dict)

        # NOTICE! input_size = 2*hidden_dim here as well.
        self.lstm_bwd_1 = nn.LSTM(
            input_size=2 * self.model_config['hidden_dim'],
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1_reverse']
        self.lstm_bwd_1.load_state_dict(new_state_dict)

    def generate_embeddings(self, input_file):
        logging.info(
            "\n\n\n==================================================================================================="
        )
        logging.info("Generating K&G contextual embeddings for %s" % input_file)
        logging.info(
            "===================================================================================================\n"
        )

        # generate tokenized data
        tokenized_sentences = self.vocab.tokenize_conll(input_file)

        embs = {}
        for i, sample in enumerate(tokenized_sentences):
            self.model.backend.renew_cg()  # for pytorch it is just 'pass'

            # get embeddings
            words, lemmas, tags, heads, rels, chars = sample
            words = self.model.backend.input_tensor(np.array([words]), dtype="int")
            tags = self.model.backend.input_tensor(np.array([tags]), dtype="int")
            word_embs = self.parser.wlookup(words)
            tags_embs = self.parser.tlookup(tags)  # TODO think if it makes sense to use tag_embs or not!
            input_data0 = torch.cat(
                [word_embs, tags_embs], dim=-1
            )  # dim 1x8x125 (if we have 8 words in the sentence)
            input_data0_reversed = torch.flip(input_data0, (1,))

            # feed data
            out_lstm_fwd_0, hidden_lstm_fwd_0 = self.lstm_fwd_0(input_data0)
            out_lstm_bwd_0, hidden_lstm_bwd_0 = self.lstm_bwd_0(input_data0_reversed)

            input_data1 = torch.cat((out_lstm_fwd_0, out_lstm_bwd_0), 2)
            input_data1_reversed = torch.flip(input_data1, (1,))

            out_lstm_fwd_1, hidden_lstm_fwd_1 = self.lstm_fwd_1(input_data1)
            out_lstm_bwd_1, hidden_lstm_bwd_1 = self.lstm_bwd_1(input_data1_reversed)

            # generate embeddings
            out_lstm_bwd_0 = torch.flip(out_lstm_bwd_0, (1,))
            out_lstm_bwd_1 = torch.flip(out_lstm_bwd_1, (1,))

            # TODO in ELMo they perform a task-dependent weighted sum of the concatenation
            # of L0 (initial embeddings), L1 and L2; as our input has varying sizes and we
            # are not weighting the layers, we'll just concatenate everything.
            # TODO for the syntactic probes, ELMo stores the three layers separately, so
            # maybe we can do the same, at least with layer 0 and layer 1?
            sentence_embeddings = torch.cat(
                (input_data0, out_lstm_fwd_0, out_lstm_bwd_0, out_lstm_fwd_1, out_lstm_bwd_1),
                2,
            )  # 1 x 8 x (125 + 100 + 100 + 100 + 100) = 525
            embs[i] = sentence_embeddings

        return embs

    @staticmethod
    def save_to_hdf5(embeddings, file_path, skip_root=False):
        # Save embeddings in HDF5 format.
        # Write contextual word representations to disk for each of the train, dev, and
        # test splits in HDF5 format, where the index of the sentence in the conllx file
        # is the key to the HDF5 dataset object. That is, the dataset file should look
        # a bit like {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>,
        # '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}, etc.
        # Note that SEQLEN for each sentence must be the number of tokens in the sentence
        # as specified by the conllx file.
        with h5py.File(file_path, 'w') as f:
            for k, v in embeddings.items():
                logging.info('creating dataset for k %s' % str(k))
                sentence_embs = v.detach().numpy()
                if skip_root:
                    sentence_embs = sentence_embs[:, 1:, :]
                f.create_dataset(str(k), data=sentence_embs)

    @staticmethod
    def check_hdf5_file(file_path):
        with h5py.File(file_path, 'r') as f:
            for item in f.items():
                logging.info(item)
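
# Illustrative only: a minimal sketch of how a consumer (e.g. a probing script) might
# read the file written by save_to_hdf5 above, given the layout described in its
# comments (keys are stringified sentence indices, values are (1, SEQLEN, FEATURE_COUNT)
# arrays). The helper name `load_embeddings_from_hdf5` is hypothetical, not part of
# this codebase.
def load_embeddings_from_hdf5(file_path):
    """Return {sentence_index: np.ndarray of shape (1, SEQLEN, FEATURE_COUNT)}."""
    embeddings = {}
    with h5py.File(file_path, 'r') as f:
        for key in f.keys():
            # keys were written as str(sentence_index); [()] reads the full dataset
            embeddings[int(key)] = f[key][()]
    return embeddings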
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]
    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
if arguments.embs is None:
    vocab = vocab.fit(arguments.train)
    embs = None
else:
    vocab = vocab.fit(arguments.train, arguments.embs)
    embs = vocab.load_embedding()
    print('shape', embs.shape)

# save vocab for reproducibility later
if arguments.vocab_dest:
    print("> saving vocab to", arguments.vocab_dest)
    vocab.save(arguments.vocab_dest)

# prep data
print(">> Loading in data")
training_data = vocab.tokenize_conll(arguments.train)
if arguments.dev_mode:
    training_data = training_data[:100]
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

# instantiate model
model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)

callbacks = []
tensorboard_logger = None
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
    callbacks.append(tensorboard_logger)
ARGPARSER.add_argument("--test", required=True) ARGPARSER.add_argument("--model", required=True) ARGUMENTS, UNK = ARGPARSER.parse_known_args() TRAIN_FILE = ARGUMENTS.train DEV_FILE = ARGUMENTS.dev TEST_FILE = ARGUMENTS.test MODEL_FILE = ARGUMENTS.model N_EPOCHS = 5 VOCAB = Vocabulary() VOCAB.fit(TRAIN_FILE) print("> Loading in data") TRAIN = VOCAB.tokenize_conll(ARGUMENTS.train) DEV = VOCAB.tokenize_conll(ARGUMENTS.dev) TEST = VOCAB.tokenize_conll(ARGUMENTS.test) ENCODER = BetaEncodeHandler() print("> Pre-encoding edges") START_TIME = time.time() TRAIN = pre_encode(ENCODER, TRAIN, accumulate_vocab=True) DEV = pre_encode(ENCODER, DEV) TEST = pre_encode(ENCODER, TEST) print(">> Done pre-encoding edges", time.time() - START_TIME) # 5m is completely arbitrary but fits all features for PTB. # TODO: Infer this from the encoder by letting it grow PARAMS = MST(5_000_000)
def main():
    """Main function."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)
    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    argparser.add_argument("--dropout", type=float, default=0.33)

    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None
    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )
    parser.load_from_file(model_destination)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
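
# Illustrative only: the learning_rate/decay/decay_steps values above suggest the
# exponential annealing schedule used by Dozat & Manning,
#   lr_t = learning_rate * decay ** (t / decay_steps).
# Whether UpdateParamsCallback implements exactly this is an assumption; the sketch
# below only spells out the formula for reference.
def annealed_learning_rate(step, learning_rate=2e-3, decay=0.75, decay_steps=5000):
    """Learning rate after `step` parameter updates under exponential annealing."""
    return learning_rate * decay ** (step / decay_steps)

# e.g. annealed_learning_rate(0) == 2e-3 and annealed_learning_rate(5000) == 1.5e-3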
model_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/model.model'
only_words = True

vocab = Vocabulary(only_words)
vocab.load(vocab_file)

embs = None
parser = DependencyParser(vocab, embs, False)

model = ParserModel(
    parser,
    decoder="eisner",
    loss="kiperwasser",
    optimizer="adam",
    strategy="bucket",
    vocab=vocab,
)
model.load_from_file(model_file)

# input_file = '/home/lpmayos/hd/code/cvt_text/data/raw_data/depparse/test_mini.txt'
input_file = '/home/lpmayos/hd/code/structural-probes/example/data/en_ewt-ud-sample/en_ewt-ud-dev.conllu'
input_file = transform_to_conllu(input_file)

input_data = vocab.tokenize_conll(input_file)

embeddings = parser.extract_embeddings(
    input_data,
    model.backend,
    format='concat',
    save=True,
    file_path='babau.hdf5',
)  # {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>, '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}
print(embeddings)