def _train_model(_, args):
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)
    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)
    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
def _run_model(_, args):
    run_file = args.test
    out_file = args.output
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)

    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    run_data = vocab.tokenize_conll(run_file)
    predictions = parser.run(run_data, batch_size)

    write_predictions_to_file(
        predictions, reference_file=run_file, output_file=out_file, vocab=vocab
    )
    print(">> Wrote predictions to conllu file %s" % out_file)
def _eval_model(_, args):
    test_file = args.filename
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)

    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.load_from_file(model_file)

    test_data = vocab.tokenize_conll(test_file)
    metrics = parser.evaluate(test_file, test_data, batch_size=batch_size)

    for key, value in metrics.items():
        print(key, round(value, 3))
parser.add_argument("--test", required=True) parser.add_argument("--model", required=True) arguments, unknown = parser.parse_known_args() TRAIN_FILE = arguments.train DEV_FILE = arguments.dev TEST_FILE = arguments.test MODEL_FILE = arguments.model n_epochs = 5 vocab = Vocabulary() vocab.fit(TRAIN_FILE) print(">> Loading in data") TRAIN = vocab.tokenize_conll(arguments.train) DEV = vocab.tokenize_conll(arguments.dev) TEST = vocab.tokenize_conll(arguments.test) encoder = BetaEncodeHandler() print("> pre-encoding edges") s = time.time() TRAIN = pre_encode(encoder, TRAIN, accumulate_vocab=True) DEV = pre_encode(encoder, DEV) TEST = pre_encode(encoder, TEST) print(">> done pre-encoding", time.time() - s) # 5m is completely arbitrary # REQUEST: fix this to be inferred from the encoder parser = MST(5_000_000)
class EmbeddingsExtractor(object):
    def __init__(self, logging_file, model_config):
        # configure logging
        self.logging_file = logging_file
        self._configure_logging()

        self.model_config = model_config
        logging.info(model_config)

        # load vocabulary, parser and model
        self._load_model()

        # create lstms
        self._create_lstms()

    def _configure_logging(self):
        logging.basicConfig(
            filename=self.logging_file,
            level=logging.DEBUG,
            format="%(asctime)s:%(levelname)s:\t%(message)s",
        )

    def _load_model(self):
        """Load the original K&G model and vocabulary."""
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(
            self.vocab,
            self.model_config['upos_dim'],
            self.model_config['word_dim'],
            self.model_config['hidden_dim'],
        )
        self.model = ParserModel(
            self.parser,
            decoder="eisner",
            loss="kiperwasser",
            optimizer="adam",
            strategy="bucket",
            vocab=self.vocab,
        )
        self.model.load_from_file(self.model_config['model_file'])

    def _create_lstms(self):
        # create and initialize FWD and BWD biLSTMs with model parameters
        input_size = self.model_config['word_dim'] + self.model_config['upos_dim']
        state_dict = self.parser.deep_bilstm.state_dict()

        self.lstm_fwd_0 = nn.LSTM(
            input_size=input_size,
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0']
        self.lstm_fwd_0.load_state_dict(new_state_dict)

        self.lstm_bwd_0 = nn.LSTM(
            input_size=input_size,
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0_reverse']
        self.lstm_bwd_0.load_state_dict(new_state_dict)

        # NOTICE! input_size = 2*hidden_dim because layer 1 consumes the
        # concatenated fwd/bwd outputs of layer 0.
        self.lstm_fwd_1 = nn.LSTM(
            input_size=2 * self.model_config['hidden_dim'],
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1']
        self.lstm_fwd_1.load_state_dict(new_state_dict)

        # NOTICE! input_size = 2*hidden_dim here as well.
        self.lstm_bwd_1 = nn.LSTM(
            input_size=2 * self.model_config['hidden_dim'],
            hidden_size=self.model_config['hidden_dim'],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1_reverse']
        self.lstm_bwd_1.load_state_dict(new_state_dict)

    def generate_embeddings(self, input_file):
        logging.info(
            "\n\n\n==================================================================================================="
        )
        logging.info("Generating K&G contextual embeddings for %s" % input_file)
        logging.info(
            "===================================================================================================\n"
        )

        # generate tokenized data
        tokenized_sentences = self.vocab.tokenize_conll(input_file)

        embs = {}
        for i, sample in enumerate(tokenized_sentences):
            self.model.backend.renew_cg()  # for pytorch it is just 'pass'

            # get embeddings
            words, lemmas, tags, heads, rels, chars = sample
            words = self.model.backend.input_tensor(np.array([words]), dtype="int")
            tags = self.model.backend.input_tensor(np.array([tags]), dtype="int")
            word_embs = self.parser.wlookup(words)
            tags_embs = self.parser.tlookup(tags)  # TODO think if it makes sense to use tag_embs or not!
            input_data0 = torch.cat(
                [word_embs, tags_embs], dim=-1
            )  # dim 1x8x125 (if we have 8 words in the sentence)
            input_data0_reversed = torch.flip(input_data0, (1,))

            # feed data
            out_lstm_fwd_0, hidden_lstm_fwd_0 = self.lstm_fwd_0(input_data0)
            out_lstm_bwd_0, hidden_lstm_bwd_0 = self.lstm_bwd_0(input_data0_reversed)

            input_data1 = torch.cat((out_lstm_fwd_0, out_lstm_bwd_0), 2)
            input_data1_reversed = torch.flip(input_data1, (1,))

            out_lstm_fwd_1, hidden_lstm_fwd_1 = self.lstm_fwd_1(input_data1)
            out_lstm_bwd_1, hidden_lstm_bwd_1 = self.lstm_bwd_1(input_data1_reversed)

            # generate embeddings
            out_lstm_bwd_0 = torch.flip(out_lstm_bwd_0, (1,))
            out_lstm_bwd_1 = torch.flip(out_lstm_bwd_1, (1,))

            # TODO in ELMo they perform a task-dependent weighted sum of the concatenation
            # of L0 (initial embeddings), L1 and L2; as our input has varying sizes and we
            # are not weighting the layers, we'll just concatenate everything.
            # TODO for the syntactic probes, ELMo stores the three layers separately, so
            # maybe we can do the same, at least with layer 0 and layer 1?
            sentence_embeddings = torch.cat(
                (input_data0, out_lstm_fwd_0, out_lstm_bwd_0, out_lstm_fwd_1, out_lstm_bwd_1),
                2,
            )  # 1 x 8 x (125 + 100 + 100 + 100 + 100) = 525
            embs[i] = sentence_embeddings

        return embs

    @staticmethod
    def save_to_hdf5(embeddings, file_path, skip_root=False):
        # Save embeddings in HDF5 format.
        # Write contextual word representations to disk for each of the train, dev, and
        # test splits in HDF5 format, where the index of the sentence in the conllx file
        # is the key to the HDF5 dataset object. That is, the dataset file should look
        # a bit like {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>,
        # '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}, etc.
        # Note that SEQLEN for each sentence must be the number of tokens in the sentence
        # as specified by the conllx file.
        with h5py.File(file_path, 'w') as f:
            for k, v in embeddings.items():
                logging.info('creating dataset for k %s' % str(k))
                sentence_embs = v.detach().numpy()
                if skip_root:
                    sentence_embs = sentence_embs[:, 1:, :]
                f.create_dataset(str(k), data=sentence_embs)

    @staticmethod
    def check_hdf5_file(file_path):
        with h5py.File(file_path, 'r') as f:
            for item in f.items():
                logging.info(item)
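
# Illustrative only: a minimal sketch of how a consumer (e.g. a probing script) might
# read the file written by save_to_hdf5 above, given the layout described in its
# comments (keys are stringified sentence indices, values are (1, SEQLEN, FEATURE_COUNT)
# arrays). The helper name `load_embeddings_from_hdf5` is hypothetical, not part of
# this codebase.
def load_embeddings_from_hdf5(file_path):
    """Return {sentence_index: np.ndarray of shape (1, SEQLEN, FEATURE_COUNT)}."""
    embeddings = {}
    with h5py.File(file_path, 'r') as f:
        for key in f.keys():
            # keys were written as str(sentence_index); [()] reads the full dataset
            embeddings[int(key)] = f[key][()]
    return embeddings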
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL test file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]
    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
if arguments.embs is None:
    vocab = vocab.fit(arguments.train)
    embs = None
else:
    vocab = vocab.fit(arguments.train, arguments.embs)
    embs = vocab.load_embedding()
    print('shape', embs.shape)

# save vocab for reproducibility later
if arguments.vocab_dest:
    print("> saving vocab to", arguments.vocab_dest)
    vocab.save(arguments.vocab_dest)

# prep data
print(">> Loading in data")
training_data = vocab.tokenize_conll(arguments.train)
if arguments.dev_mode:
    training_data = training_data[:100]
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

# instantiate model
model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)

callbacks = []
tensorboard_logger = None
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
    callbacks.append(tensorboard_logger)
ARGPARSER.add_argument("--test", required=True) ARGPARSER.add_argument("--model", required=True) ARGUMENTS, UNK = ARGPARSER.parse_known_args() TRAIN_FILE = ARGUMENTS.train DEV_FILE = ARGUMENTS.dev TEST_FILE = ARGUMENTS.test MODEL_FILE = ARGUMENTS.model N_EPOCHS = 5 VOCAB = Vocabulary() VOCAB.fit(TRAIN_FILE) print("> Loading in data") TRAIN = VOCAB.tokenize_conll(ARGUMENTS.train) DEV = VOCAB.tokenize_conll(ARGUMENTS.dev) TEST = VOCAB.tokenize_conll(ARGUMENTS.test) ENCODER = BetaEncodeHandler() print("> Pre-encoding edges") START_TIME = time.time() TRAIN = pre_encode(ENCODER, TRAIN, accumulate_vocab=True) DEV = pre_encode(ENCODER, DEV) TEST = pre_encode(ENCODER, TEST) print(">> Done pre-encoding edges", time.time() - START_TIME) # 5m is completely arbitrary but fits all features for PTB. # TODO: Infer this from the encoder by letting it grow PARAMS = MST(5_000_000)
def main():
    """Main function."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)
    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    argparser.add_argument("--dropout", type=float, default=0.33)

    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None
    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )
    parser.load_from_file(model_destination)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
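
# Illustrative only: the learning_rate/decay/decay_steps values above suggest the
# exponential annealing schedule used by Dozat & Manning,
#   lr_t = learning_rate * decay ** (t / decay_steps).
# Whether UpdateParamsCallback implements exactly this is an assumption; the sketch
# below only spells out the formula for reference.
def annealed_learning_rate(step, learning_rate=2e-3, decay=0.75, decay_steps=5000):
    """Learning rate after `step` parameter updates under exponential annealing."""
    return learning_rate * decay ** (step / decay_steps)

# e.g. annealed_learning_rate(0) == 2e-3 and annealed_learning_rate(5000) == 1.5e-3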
model_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/model.model'
only_words = True

vocab = Vocabulary(only_words)
vocab.load(vocab_file)

embs = None
parser = DependencyParser(vocab, embs, False)

model = ParserModel(
    parser,
    decoder="eisner",
    loss="kiperwasser",
    optimizer="adam",
    strategy="bucket",
    vocab=vocab,
)
model.load_from_file(model_file)

# input_file = '/home/lpmayos/hd/code/cvt_text/data/raw_data/depparse/test_mini.txt'
input_file = '/home/lpmayos/hd/code/structural-probes/example/data/en_ewt-ud-sample/en_ewt-ud-dev.conllu'
input_file = transform_to_conllu(input_file)

input_data = vocab.tokenize_conll(input_file)

embeddings = parser.extract_embeddings(
    input_data,
    model.backend,
    format='concat',
    save=True,
    file_path='babau.hdf5',
)  # {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>, '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}
print(embeddings)