示例#1
0
def test_serialization():
    """Smoke-test store()/load() round-trips for every reader with test data.

    Each reader gets fresh shared resources, is set up on a small dataset,
    stored to a temporary directory and loaded back.  Readers without a
    matching data branch below are skipped (``data`` stays ``None``).
    """
    all_readers = [
        fastqa_reader,
        modular_qa_reader,
        # fastqa_reader_torch,
        dam_snli_reader,
        cbilstm_nli_reader,
        modular_nli_reader,
        distmult_reader,
        complex_reader,
        transe_reader,
    ]

    for reader in all_readers:
        vocab, config = {}, {}

        data = None
        if reader in {distmult_reader, complex_reader, transe_reader}:
            # Knowledge-base readers: no word embeddings, only a repr dim.
            data = loaders['jack'](
                'tests/test_data/WN18/wn18-snippet.jack.json')
            config['repr_dim'] = 50
        elif reader in {cbilstm_nli_reader, dam_snli_reader}:
            data = loaders['snli'](
                'tests/test_data/SNLI/1000_samples_snli_1.0_train.jsonl')

            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt",
                                         'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50
        elif reader in {fastqa_reader}:
            data = loaders['squad']('data/SQuAD/snippet.json')

            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt",
                                         'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50

        if data is not None:
            # Readers build TF graphs; start from a clean graph per reader.
            tf.reset_default_graph()

            shared_resources = SharedResources(vocab, config)
            reader_instance = reader(shared_resources)
            reader_instance.setup_from_data(data)

            # Context manager removes the directory even if store()/load()
            # raises; the previous mkdtemp() call leaked it on every run.
            with tempfile.TemporaryDirectory() as temp_dir_path:
                reader_instance.store(temp_dir_path)
                reader_instance.load(temp_dir_path)

            assert reader_instance is not None
示例#2
0
def test_shared_resources_store():
    """SharedResources round-trip: store() then load() must reproduce the
    vocab, the config and the embedding matrix."""
    import os
    import tempfile

    embeddings_file = "data/GloVe/glove.the.50d.txt"
    embeddings = load_embeddings(embeddings_file, 'glove')
    config = {"embedding_file": embeddings_file, "embedding_format": "glove"}
    some_vocab = Vocab(vocab=embeddings.vocabulary)
    some_vocab('foo')
    shared_resources = SharedResources(some_vocab, config, embeddings)

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Store *inside* the managed directory so the data is cleaned up;
        # the old ``tmp_dir + "_resources"`` was a sibling path that the
        # TemporaryDirectory context never deleted, leaking files on disk.
        path = os.path.join(tmp_dir, "resources")
        shared_resources.store(path)

        new_shared_resources = SharedResources()
        new_shared_resources.load(path)

        type_a, type_b = type(new_shared_resources.vocab), type(
            shared_resources.vocab)
        assert type_a == type_b

        # Compare every vocab attribute, not just a chosen subset.
        for k in new_shared_resources.vocab.__dict__:
            assert new_shared_resources.vocab.__dict__[
                k] == shared_resources.vocab.__dict__[k]
        assert new_shared_resources.config == shared_resources.config
        assert new_shared_resources.embeddings.lookup.shape == embeddings.lookup.shape
        # NOTE(review): byte-string key b"the" matches the original test --
        # presumably the vocabulary is byte-keyed here; confirm against get().
        assert np.array_equal(new_shared_resources.embeddings.get(b"the"),
                              embeddings.get(b"the"))
示例#3
0
def test_memory_map_dir():
    """Round-trip the snippet GloVe embeddings through a memory-map
    directory and check that vocabulary and vectors survive."""
    import tempfile
    from jack.io.embeddings.memory_map import save_as_memory_map_dir, load_memory_map_dir

    original = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove')
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = tmp_dir + "/glove.the.50d.memmap"
        save_as_memory_map_dir(target, original)
        restored = load_memory_map_dir(target)

        # Shape and the single-word vocabulary must be preserved exactly.
        assert restored.shape == original.shape
        assert len(restored.vocabulary) == 1
        assert restored.vocabulary["the"] == 0
        assert "foo" not in restored.vocabulary
        # Vectors go through a float32 mem-map, so compare with tolerance.
        assert np.isclose(restored.get("the"), original.get("the"), 1.e-5).all()
示例#4
0
def main():
    """Command-line entry point: convert an embedding file into a
    memory-map directory."""
    import argparse

    arg_parser = argparse.ArgumentParser(description='Convert embeddings to memory map directory')
    arg_parser.add_argument("input_file", help="The input embedding file.")
    arg_parser.add_argument("output_dir",
                            help="The name of the directory to store the memory map in. Will be created if it doesn't "
                                 "exist.")
    arg_parser.add_argument("-f", "--input_format", help="Format of input embeddings.", default="glove",
                            choices=["glove", "word2vec", "memory_map_dir"])
    parsed = arg_parser.parse_args()

    embeddings = load_embeddings(parsed.input_file, typ=parsed.input_format)
    logging.info("Loaded embeddings from {}".format(parsed.input_file))
    save_as_memory_map_dir(parsed.output_dir, embeddings)
    logging.info("Stored embeddings to {}".format(parsed.output_dir))
示例#5
0
    def load(self, path: str):
        conf_file = os.path.join(path, "conf.yaml")
        emb_file = os.path.join(path, "emb.pkl")
        remainder_file = os.path.join(path, "remainder.pkl")
        if os.path.exists(conf_file):
            with open(conf_file, "r") as f:
                config = yaml.load(f)
            if config["embedding_file"] is not None:
                emb = load_embeddings(config["embedding_file"], typ=config.get("emb_format", None))
            elif os.path.exists(emb_file):
                with open(emb_file, "rb") as f:
                    emb = pickle.load(f)
        elif os.path.exists(emb_file):
            with open(emb_file, "rb") as f:
                emb = pickle.load(f)

        with open(remainder_file, "rb") as f:
            remaining = pickle.load(f)

        self.__dict__ = remaining
        self.__dict__["emb"] = emb
示例#6
0
    shape = meta['shape']
    mem_map = np.memmap(mem_map_file, dtype='float32', mode='r+', shape=shape)
    result = Embeddings(meta['vocab'],
                        mem_map,
                        filename=file_prefix,
                        emb_format="mem_map")
    return result


def save_as_memory_map(file_prefix: str, emb: Embeddings):
    """Persist *emb* as a pickled meta file plus a raw float32 memory map.

    Writes ``<file_prefix>_meta.pkl`` (vocabulary and shape) and
    ``<file_prefix>_memmap`` (the embedding matrix itself).
    """
    meta_path = file_prefix + "_meta.pkl"
    matrix_path = file_prefix + "_memmap"

    with open(meta_path, "wb") as meta_out:
        pickle.dump({"vocab": emb.vocabulary, "shape": emb.shape}, meta_out)

    # Copy the lookup table into a writable on-disk array, then flush and
    # drop the handle so everything is written out before returning.
    on_disk = np.memmap(matrix_path, dtype='float32', mode='w+', shape=emb.shape)
    on_disk[:] = emb.lookup[:]
    on_disk.flush()
    del on_disk


if __name__ == "__main__":
    # CLI usage: <input embeddings file> <output file prefix>
    source_path, target_prefix = sys.argv[1], sys.argv[2]
    loaded = load_embeddings(source_path)
    logging.info("Loaded embeddings from {}".format(source_path))
    save_as_memory_map(target_prefix, loaded)
    logging.info("Stored embeddings to {}".format(target_prefix))
示例#7
0
# ---- Command-line flags ----------------------------------------------------
tf.app.flags.DEFINE_string('file', None, 'dataset file')
tf.app.flags.DEFINE_string('dataset_type', 'squad', 'either squad or jack')
tf.app.flags.DEFINE_string('model', None, 'Name of the reader')
tf.app.flags.DEFINE_string('model_dir', None, 'directory to saved model')
tf.app.flags.DEFINE_string('embedding_path', None, 'path to embeddings')
tf.app.flags.DEFINE_string('embedding_format', 'glove', 'embeddings format')
tf.app.flags.DEFINE_string('device', "/cpu:0", 'device to use')
tf.app.flags.DEFINE_string('out', "results.json", 'Result file path.')
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size')
tf.app.flags.DEFINE_integer('beam_size', 1, 'beam size')
tf.app.flags.DEFINE_string('kwargs', '{}', 'additional reader-specific configurations')

FLAGS = tf.app.flags.FLAGS

# Build the vocabulary directly from the pre-trained embedding file.
logger.info("Loading embeddings from {}...".format(FLAGS.embedding_path))
emb = load_embeddings(FLAGS.embedding_path, FLAGS.embedding_format)
vocab = Vocab(emb=emb, init_from_embeddings=True)

# Instantiate the requested reader and restore its stored parameters on the
# chosen device.  `kwargs` is a JSON string whose entries override the base
# config built here.
logger.info("Creating and loading reader from {}...".format(FLAGS.model_dir))
config = {"beam_size": FLAGS.beam_size, 'batch_size': FLAGS.batch_size, "max_support_length": None}
config.update(json.loads(FLAGS.kwargs))
reader = readers[FLAGS.model](vocab, config)
with tf.device(FLAGS.device):
    reader.load_and_setup(FLAGS.model_dir)

# Load the evaluation data.  NOTE(review): convert_squad presumably emits
# the jack JSON format so both branches yield the same structure -- confirm.
if FLAGS.dataset_type == "squad":
    dataset_jtr = convert_squad(FLAGS.file)
else:
    with open(FLAGS.file) as f:
        dataset_jtr = json.load(f)