def test_cached_path(self):
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes)

    # non-existent file
    with pytest.raises(FileNotFoundError):
        filename = cached_path("tests/fixtures/does_not_exist/fake_file.tar.gz")

    # unparsable URI
    with pytest.raises(ValueError):
        filename = cached_path("fakescheme://path/to/fake/file.tar.gz")

    # existing file as path
    assert cached_path(self.glove_file) == self.glove_file

    # caches urls
    filename = cached_path(url, cache_dir=self.TEST_DIR)

    assert len(responses.calls) == 2
    assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

    with open(filename, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes
def __init__(self,
             file_uri: str,
             encoding: str = DEFAULT_ENCODING,
             cache_dir: str = None) -> None:
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None   # only if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        self._open_inside_zip(main_file_uri, path_inside_archive)

    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        self._open_inside_tar(main_file_uri, path_inside_archive)

    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            raise ValueError('Unsupported archive format: %s' % main_file_uri)

        # All the python packages for compressed files share the same interface of io.open
        extension = get_file_extension(main_file_uri)
        package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
                }.get(extension, None)

        if package is None:
            warn('The embeddings file has an unknown file extension "%s". '
                 'We will assume the file is an (uncompressed) text file' % extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move back the file iterator
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    first_line = next(self._handle)     # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)
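# Hedged usage sketch (not part of the class above). The URL is a hypothetical
# placeholder; any local path or URL accepted by cached_path should work, since the
# constructor routes the main file through it. Assuming EmbeddingsTextFile is iterable
# (it builds its line iterator in __init__ as self._iterator), a caller can do roughly:
embeddings_file = EmbeddingsTextFile('https://example.com/embeddings/glove.txt.gz')
print(embeddings_file.num_tokens)    # may be None if the first line is not a header line
for line in embeddings_file:
    token = line.split(' ', 1)[0]    # the first field of each line is the token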
def add_file_to_archive(self, name: str) -> None:
    """
    Any class in its ``from_params`` method can request that some of its
    input files be added to the archive by calling this method.

    For example, if some class ``A`` had an ``input_file`` parameter, it could call

    ```
    params.add_file_to_archive("input_file")
    ```

    which would store the supplied value for ``input_file`` at the key
    ``previous.history.and.then.input_file``. The ``files_to_archive`` dict
    is shared with child instances via the ``_check_is_dict`` method, so that
    the final mapping can be retrieved from the top-level ``Params`` object.

    NOTE: You must call ``add_file_to_archive`` before you ``pop()``
    the parameter, because the ``Params`` instance looks up the value
    of the filename inside itself.

    If the ``loading_from_archive`` flag is True, this will be a no-op.
    """
    if not self.loading_from_archive:
        self.files_to_archive[f"{self.history}{name}"] = cached_path(self.get(name))
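# Hedged usage sketch for the NOTE above: inside some from_params method, register the
# file parameter *before* popping it ("input_file" is a hypothetical parameter name).
params.add_file_to_archive("input_file")
input_file = cached_path(params.pop("input_file"))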
def _load_cnn_weights(self):
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    char_embed_dim = cnn_options['embedding']['dim']

    convolutions = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(in_channels=char_embed_dim,
                               out_channels=num,
                               kernel_size=width,
                               bias=True)
        # load the weights
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            weight = fin['CNN']['W_cnn_{}'.format(i)][...]
            bias = fin['CNN']['b_cnn_{}'.format(i)][...]

        w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
        if w_reshaped.shape != tuple(conv.weight.data.shape):
            raise ValueError("Invalid weight file")
        conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
        conv.bias.data.copy_(torch.FloatTensor(bias))

        conv.weight.requires_grad = self.requires_grad
        conv.bias.requires_grad = self.requires_grad

        convolutions.append(conv)
        self.add_module('char_conv_{}'.format(i), conv)

    self._convolutions = convolutions
def _read_pretrained_words(embeddings_filename: str) -> Set[str]:
    words = set()
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            word = fields[0]
            words.add(word)
    return words
def _load_char_embedding(self):
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        char_embed_weights = fin['char_embed'][...]

    weights = numpy.zeros(
            (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
            dtype='float32')
    weights[1:, :] = char_embed_weights

    self._char_embedding_weights = torch.nn.Parameter(
            torch.FloatTensor(weights), requires_grad=self.requires_grad)
def from_file(params_file: str, params_overrides: str = "") -> 'Params':
    """
    Load a `Params` object from a configuration file.
    """
    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    file_dict = pyhocon.ConfigFactory.parse_file(params_file)
    overrides_dict = pyhocon.ConfigFactory.parse_string(params_overrides)
    param_dict = overrides_dict.with_fallback(file_dict)

    return Params(param_dict)
def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = zipfile.ZipFile(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.namelist()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member_file = archive.open(member_path, 'r')
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def from_file(params_file: str, params_overrides: str = "") -> 'Params':
    """
    Load a `Params` object from a configuration file.
    """
    # redirect to cache, if necessary
    params_file = cached_path(params_file)
    ext_vars = dict(os.environ)

    file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

    overrides_dict = parse_overrides(params_overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return Params(param_dict)
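# Hedged usage sketch: the config file name and the override string are hypothetical
# placeholders. from_file resolves the path through cached_path first, so a URL to a
# config file works as well as a local path.
params = Params.from_file("experiments/my_model.jsonnet",
                          params_overrides='{"trainer": {"num_epochs": 1}}')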
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = tarfile.open(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.getnames()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member = archive.getmember(member_path)  # raises exception if not present
    member_file = cast(IO[bytes], archive.extractfile(member))
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def _load_projection(self):
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)

    self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        weight = fin['CNN_proj']['W_proj'][...]
        bias = fin['CNN_proj']['b_proj'][...]
        self._projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
        self._projection.bias.data.copy_(torch.FloatTensor(bias))

    self._projection.weight.requires_grad = self.requires_grad
    self._projection.bias.requires_grad = self.requires_grad
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False,
             vocab_to_cache: List[str] = None) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file,
                                                 requires_grad=requires_grad)

    self._requires_grad = requires_grad
    if requires_grad and vocab_to_cache:
        warn("You are fine tuning ELMo and caching char CNN word vectors. "
             "This behaviour is not guaranteed to be well defined, particularly "
             "if not all of your inputs will occur in the vocabulary cache.")
    # This is an embedding, used to look up cached
    # word vectors built from character level cnn embeddings.
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None
    if vocab_to_cache:
        info("Caching character cnn layers for words in vocabulary.")
        # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
        # They are set in the method so they can be accessed from outside the
        # constructor.
        self.create_cached_cnn_embeddings(vocab_to_cache)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError('We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                               hidden_size=options['lstm']['projection_dim'],
                               cell_size=options['lstm']['dim'],
                               num_layers=options['lstm']['n_layers'],
                               memory_cell_clip_value=options['lstm']['cell_clip'],
                               state_projection_clip_value=options['lstm']['proj_clip'],
                               requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False) -> None:
    super(_ElmoCharacterEncoder, self).__init__()

    with open(cached_path(options_file), 'r') as fin:
        self._options = json.load(fin)
    self._weight_file = weight_file

    self.output_dim = self._options['lstm']['projection_dim']
    self.requires_grad = requires_grad

    self._load_weights()

    # Cache the arrays for use in forward -- +1 due to masking.
    self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
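# Hedged usage sketch: the two file locations are hypothetical placeholders. Both local
# paths and URLs work here, because the constructor reads the options file through
# cached_path and the weight-loading helpers above do the same for the weight file.
encoder = _ElmoCharacterEncoder(
        options_file="https://example.com/elmo/options.json",
        weight_file="https://example.com/elmo/weights.hdf5",
        requires_grad=False)
print(encoder.output_dim)   # the projection_dim entry from the options file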
def _load_highway(self):
    # pylint: disable=protected-access
    # the highway layers have same dimensionality as the number of cnn filters
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)
    n_highway = cnn_options['n_highway']

    # create the layers, and load the weights
    self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
    for k in range(n_highway):
        # The AllenNLP highway is one matrix multiplication with concatenation of
        # transform and carry weights.
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            # The weights are transposed due to multiplication order assumptions in tf
            # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
            w_transform = numpy.transpose(fin['CNN_high_{}'.format(k)]['W_transform'][...])
            # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
            w_carry = -1.0 * numpy.transpose(fin['CNN_high_{}'.format(k)]['W_carry'][...])
            weight = numpy.concatenate([w_transform, w_carry], axis=0)
            self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
            self._highways._layers[k].weight.requires_grad = self.requires_grad

            b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
            b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
            bias = numpy.concatenate([b_transform, b_carry], axis=0)
            self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
            self._highways._layers[k].bias.requires_grad = self.requires_grad
def load_weights(self, weight_file: str) -> None:
    """
    Load the pre-trained weights from the file.
    """
    requires_grad = self.requires_grad

    with h5py.File(cached_path(weight_file), 'r') as fin:
        for i_layer, lstms in enumerate(
                zip(self.forward_layers, self.backward_layers)):
            for j_direction, lstm in enumerate(lstms):
                # lstm is an instance of LSTMCellWithProjection
                cell_size = lstm.cell_size

                dataset = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell'][
                        'Cell%s' % i_layer]['LSTMCell']

                # tensorflow packs together both W and U matrices into one matrix,
                # but pytorch maintains individual matrices. In addition, tensorflow
                # packs the gates as input, memory, forget, output but pytorch
                # uses input, forget, memory, output. So we need to modify the weights.
                tf_weights = numpy.transpose(dataset['W_0'][...])
                torch_weights = tf_weights.copy()

                # split the W from U matrices
                input_size = lstm.input_size
                input_weights = torch_weights[:, :input_size]
                recurrent_weights = torch_weights[:, input_size:]
                tf_input_weights = tf_weights[:, :input_size]
                tf_recurrent_weights = tf_weights[:, input_size:]

                # handle the different gate order convention
                for torch_w, tf_w in [[input_weights, tf_input_weights],
                                      [recurrent_weights, tf_recurrent_weights]]:
                    torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
                    torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

                lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
                lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
                lstm.input_linearity.weight.requires_grad = requires_grad
                lstm.state_linearity.weight.requires_grad = requires_grad

                # the bias weights
                tf_bias = dataset['B'][...]
                # tensorflow adds 1.0 to forget gate bias instead of modifying the
                # parameters...
                tf_bias[(2 * cell_size):(3 * cell_size)] += 1
                torch_bias = tf_bias.copy()
                torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(2 * cell_size):(3 * cell_size)]
                torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(1 * cell_size):(2 * cell_size)]
                lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
                lstm.state_linearity.bias.requires_grad = requires_grad

                # the projection weights
                proj_weights = numpy.transpose(dataset['W_P_0'][...])
                lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
                lstm.state_projection.weight.requires_grad = requires_grad
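# Hedged illustration of the gate reordering performed above, on a toy bias vector with
# cell_size = 2. TensorFlow packs the gates as [input, memory, forget, output]; the torch
# layout built here is [input, forget, memory, output].
import numpy

cell_size = 2
tf_bias = numpy.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=float)   # i, i, m, m, f, f, o, o
torch_bias = tf_bias.copy()
torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(2 * cell_size):(3 * cell_size)]
torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(1 * cell_size):(2 * cell_size)]
# torch_bias is now [0, 0, 2, 2, 1, 1, 3, 3]: the memory and forget blocks have swapped.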