def load_word2vec(path, delimiter=' ', cache=True) -> Tuple[Dict[str, np.ndarray], int]:
    """Load pretrained word2vec embeddings from a text file into a dict.

    Each data line is expected to look like ``word<delimiter>v1<delimiter>v2 ...``.
    Lines with two or fewer fields (e.g. the ``<vocab_size> <dim>`` header some
    word2vec tools emit) are skipped. The dimension is inferred from the first
    data line; later lines of a different length are warned about and dropped.

    Args:
        path: Path (or resource identifier resolvable by ``get_resource``) of
            the embedding text file.
        delimiter: Field separator between the word and its vector components.
        cache: If True, try to load a ``.pkl`` cached copy next to the file
            first, and write one after parsing for faster subsequent loads.

    Returns:
        Tuple of (word -> float32 vector dict, embedding dimension).

    Raises:
        ValueError: If no embedding line could be parsed from the file.
    """
    realpath = get_resource(path)
    binpath = replace_ext(realpath, '.pkl')
    if cache:
        try:
            flash('Loading word2vec from cache [blink][yellow]...[/yellow][/blink]')
            word2vec, dim = load_pickle(binpath)
            flash('')
            return word2vec, dim
        except IOError:
            # No cache yet (or it is unreadable) -- fall through to the text file.
            pass
    dim = None
    word2vec = dict()
    f = TimingFileIterator(realpath)
    for idx, line in enumerate(f):
        f.log('Loading word2vec from text file [blink][yellow]...[/yellow][/blink]')
        line = line.rstrip().split(delimiter)
        if len(line) > 2:
            if dim is None:
                # This count still includes the word field; corrected below.
                dim = len(line)
            else:
                if len(line) != dim:
                    logger.warning('{}#{} length mismatches with {}'.format(path, idx + 1, dim))
                    continue
            word, vec = line[0], line[1:]
            word2vec[word] = np.array(vec, dtype=np.float32)
    if dim is None:
        # Previously this fell through to ``dim -= 1`` and crashed with an
        # opaque TypeError on ``None``; fail loudly with the offending path.
        raise ValueError('No embeddings loaded from {}'.format(path))
    dim -= 1  # drop the word field from the per-line token count
    if cache:
        flash('Caching word2vec [blink][yellow]...[/yellow][/blink]')
        save_pickle((word2vec, dim), binpath)
        flash('')
    return word2vec, dim
def load_word2vec_as_vocab_tensor(
        path, delimiter=' ', cache=True) -> Tuple[Dict[str, int], torch.Tensor]:
    """Load word2vec embeddings as a vocabulary plus an embedding matrix.

    The vocabulary maps each word to its row index, and the matrix stacks the
    vectors in that same order. When ``cache`` is True, results are stored
    next to the source file (a ``.vocab`` pickle and a ``.pt`` tensor) and
    reused on subsequent calls.

    Args:
        path: Path (or resource identifier) of the embedding text file.
        delimiter: Field separator passed through to :func:`load_word2vec`.
        cache: Whether to read/write the on-disk cache.

    Returns:
        Tuple of (word -> row index dict, 2-D float tensor of embeddings).
    """
    realpath = get_resource(path)
    vocab_path = replace_ext(realpath, '.vocab')
    matrix_path = replace_ext(realpath, '.pt')
    if cache:
        try:
            flash(
                'Loading vocab and matrix from cache [blink][yellow]...[/yellow][/blink]'
            )
            token_to_id = load_pickle(vocab_path)
            embedding_matrix = torch.load(matrix_path, map_location='cpu')
            flash('')
            return token_to_id, embedding_matrix
        except IOError:
            # Cache miss: build everything from the text file below.
            pass
    embeddings, _dim = load_word2vec(path, delimiter, cache)
    # dicts preserve insertion order, so row i of the matrix is word i's vector.
    token_to_id = {token: index for index, token in enumerate(embeddings.keys())}
    embedding_matrix = torch.Tensor(list(embeddings.values()))
    if cache:
        flash('Caching vocab and matrix [blink][yellow]...[/yellow][/blink]')
        save_pickle(token_to_id, vocab_path)
        torch.save(embedding_matrix, matrix_path)
        flash('')
    return token_to_id, embedding_matrix
def save_pickle(self, path):
    """Serialize this object to ``path`` as a pickle file.

    Inside the method body the name ``save_pickle`` resolves to the
    module-level helper function (not this method), per Python's
    function-body scoping — presumably the ``save_pickle`` utility defined
    alongside the loaders in this module; verify against the imports.

    Args:
        path: Destination file path for the pickled object.

    Returns:
        None.
    """
    save_pickle(self, path)