def build_vocab(questions):
    """Create a vocabulary and matching embeddings from the given questions.

    Some readers require an initialized vocabulary, so one is built here.

    Args:
        questions: iterable of objects whose ``question`` attribute is passed
            to ``tokenize``.

    Returns:
        A ``(vocab, embeddings)`` pair. ``embeddings`` wraps a randomly
        initialized matrix with one row per distinct token and 10 columns;
        ``vocab`` is a ``Vocab`` built from ``embeddings.vocabulary``.
    """
    token_to_id = {}
    for item in questions:
        for token in tokenize(item.question):
            # First occurrence gets the next free index; repeats are no-ops.
            token_to_id.setdefault(token, len(token_to_id))

    random_matrix = np.random.random([len(token_to_id), 10])
    embeddings = Embeddings(token_to_id, random_matrix)
    return Vocab(vocab=embeddings.vocabulary), embeddings
def load(self, path):
    """Load this (potentially empty) resource from ``path``.

    The directory is read as follows:

    * ``remainder`` -- a pickled dict that is merged into ``self.__dict__``.
    * ``config.yaml`` -- parsed with YAML and stored as ``self.config``.
    * ``embeddings`` -- loaded via ``Embeddings.from_dir`` and stored as
      ``self.embeddings``.
    * every other entry -- loaded into a fresh ``Vocab`` and stored as an
      attribute named after the entry.

    NOTE: both ``pickle`` and the full YAML loader can execute arbitrary code
    while deserializing; only call this on directories you trust.

    Args:
        path: path to shared resources
    """
    remainder_path = os.path.join(path, 'remainder')
    if os.path.exists(remainder_path):
        with open(remainder_path, 'rb') as remainder_file:
            self.__dict__.update(pickle.load(remainder_file))

    for entry in os.listdir(path):
        if entry == 'remainder':
            # Already merged above; it is a pickle, not a vocabulary, so it
            # must not fall through to the Vocab branch below.
            continue
        entry_path = os.path.join(path, entry)
        if entry == 'config.yaml':
            with open(entry_path, 'r') as config_file:
                # An explicit Loader is mandatory in PyYAML >= 6; yaml.Loader
                # matches the historical default of a bare yaml.load call.
                self.config = yaml.load(config_file, Loader=yaml.Loader)
        elif entry == 'embeddings':
            self.embeddings = Embeddings.from_dir(entry_path)
        else:
            v = Vocab()
            v.load(entry_path)
            self.__dict__[entry] = v
def load_memory_map(file_prefix: str) -> Embeddings:
    """Load embeddings backed by a memory map so the matrix is read lazily.

    Two files are read: ``<file_prefix>_meta.pkl`` (a pickled dict providing
    the ``'shape'`` and ``'vocab'`` entries) and ``<file_prefix>_memmap``
    (the raw float32 matrix).

    NOTE: the meta file is unpickled; only load files from a trusted source.

    Args:
        file_prefix: common prefix shared by the meta and memory map files.

    Returns:
        Embeddings object with a lookup matrix that is backed by a memory map
        (opened in ``'r+'`` mode).
    """
    with open(file_prefix + "_meta.pkl", "rb") as meta_handle:
        meta = pickle.load(meta_handle)

    lookup = np.memmap(
        file_prefix + "_memmap",
        dtype='float32',
        mode='r+',
        shape=meta['shape'],
    )
    return Embeddings(meta['vocab'], lookup, filename=file_prefix, emb_format="mem_map")
def load_memory_map_dir(directory: str) -> Embeddings:
    """Load embeddings from a memory map directory so the matrix is read lazily.

    Two files inside ``directory`` are read: ``meta.json`` (providing the
    ``'shape'`` and ``'vocab'`` entries) and ``memory_map`` (the raw float32
    matrix).

    Args:
        directory: directory containing ``meta.json`` and ``memory_map``.

    Returns:
        Embeddings object with a lookup matrix that is backed by a memory map
        (opened in ``'r+'`` mode).
    """
    with open(os.path.join(directory, "meta.json"), "r") as meta_handle:
        meta = json.load(meta_handle)

    # JSON yields a list for the shape; it is converted to a tuple here.
    dims = tuple(meta['shape'])
    lookup = np.memmap(
        os.path.join(directory, "memory_map"),
        dtype='float32',
        mode='r+',
        shape=dims,
    )
    return Embeddings(meta['vocab'], lookup, filename=directory, emb_format="memory_map_dir")