예제 #1
0
def build_vocab(questions):
    """Create an initialized vocabulary (and matching embeddings) for readers that need one.

    Every distinct token produced by ``tokenize`` over the ``question`` attribute
    of each item is given the next free integer id, in first-seen order. The
    resulting mapping is wrapped in an ``Embeddings`` object backed by a random
    matrix with one 10-dimensional row per token.

    Args:
        questions: iterable of objects exposing a ``question`` attribute.

    Returns:
        A ``(vocab, embeddings)`` tuple, where ``vocab`` is a ``Vocab`` built from
        ``embeddings.vocabulary``.
    """
    token_ids = {}
    for item in questions:
        for token in tokenize(item.question):
            # setdefault keeps the id of an already-seen token untouched.
            token_ids.setdefault(token, len(token_ids))

    random_matrix = np.random.random([len(token_ids), 10])
    embeddings = Embeddings(token_ids, random_matrix)

    return Vocab(vocab=embeddings.vocabulary), embeddings
예제 #2
0
 def load(self, path):
     """
     Loads this (potentially empty) resource from path (all object attributes).

     The pickled ``remainder`` file, if present, is merged into the instance
     attributes first. Every other entry in the directory is then loaded by
     name: ``config.yaml`` into ``self.config``, ``embeddings`` into
     ``self.embeddings``, and any other entry as a ``Vocab`` stored as an
     attribute named after the entry.

     Args:
         path: path to shared resources
     """
     remainder_name = 'remainder'
     remainder_path = os.path.join(path, remainder_name)
     if os.path.exists(remainder_path):
         # NOTE(review): unpickling can execute arbitrary code; only load
         # resources from trusted locations.
         with open(remainder_path, 'rb') as f:
             self.__dict__.update(pickle.load(f))
     for entry in os.listdir(path):
         if entry == remainder_name:
             # Already merged above. Without this guard the pickle file would
             # fall through to the Vocab branch and be attached as a bogus
             # ``remainder`` attribute.
             continue
         entry_path = os.path.join(path, entry)
         if entry == 'config.yaml':
             # NOTE(review): yaml.load without an explicit Loader is deprecated
             # since PyYAML 5.1 and rejected by PyYAML 6; consider
             # yaml.safe_load if configs only contain plain data.
             with open(entry_path, 'r') as config_file:
                 self.config = yaml.load(config_file)
         elif entry == 'embeddings':
             self.embeddings = Embeddings.from_dir(entry_path)
         else:
             # Every remaining entry is treated as a stored vocabulary.
             v = Vocab()
             v.load(entry_path)
             self.__dict__[entry] = v
예제 #3
0
def load_memory_map(file_prefix: str) -> Embeddings:
    """
    Loads embeddings from a memory map file to allow lazy loading (and reduce the memory usage).

    Two files are read: ``<file_prefix>_meta.pkl``, a pickled dict holding the
    ``'shape'`` of the matrix and the ``'vocab'``, and ``<file_prefix>_memmap``,
    the raw float32 matrix, which is opened in ``'r+'`` mode.

    Args:
        file_prefix: common prefix of the two files that make up the stored embeddings.

    Returns:
        Embeddings object with a lookup matrix that is backed by a memory map.

    """
    # NOTE(review): pickle is only safe on trusted files.
    with open(file_prefix + "_meta.pkl", "rb") as meta_handle:
        metadata = pickle.load(meta_handle)

    lookup = np.memmap(
        file_prefix + "_memmap",
        dtype='float32',
        mode='r+',
        shape=metadata['shape'],
    )
    return Embeddings(metadata['vocab'], lookup, filename=file_prefix, emb_format="mem_map")
예제 #4
0
def load_memory_map_dir(directory: str) -> Embeddings:
    """
    Loads embeddings from a memory map directory to allow lazy loading (and reduce the memory usage).

    Args:
        directory: directory holding two files: ``meta.json``, with the matrix
        ``'shape'`` and the ``'vocab'``, and ``memory_map``, the raw float32
        matrix, which is opened in ``'r+'`` mode.

    Returns:
        Embeddings object with a lookup matrix that is backed by a memory map.

    """
    with open(os.path.join(directory, "meta.json"), "r") as meta_handle:
        metadata = json.load(meta_handle)

    # JSON has no tuple type, so the stored list is converted back for numpy.
    matrix_shape = tuple(metadata['shape'])
    vocabulary = metadata['vocab']

    lookup = np.memmap(
        os.path.join(directory, "memory_map"),
        dtype='float32',
        mode='r+',
        shape=matrix_shape,
    )
    return Embeddings(vocabulary, lookup, filename=directory, emb_format="memory_map_dir")