def get_infersent_model(self): """Retrieves the InferSent model. Returns: The pretrained InferSent model. """ infersent_version = 2 model_folder_path = utils.download_if_needed(InferSent.MODEL_PATH) model_path = os.path.join(model_folder_path, f"infersent{infersent_version}.pkl") params_model = { "bsize": 64, "word_emb_dim": 300, "enc_lstm_dim": 2048, "pool_type": "max", "dpout_model": 0.0, "version": infersent_version, } infersent = InferSentModel(params_model) infersent.load_state_dict(torch.load(model_path)) word_embedding_path = utils.download_if_needed( InferSent.WORD_EMBEDDING_PATH) w2v_path = os.path.join(word_embedding_path, "fastText", "crawl-300d-2M.vec") infersent.set_w2v_path(w2v_path) infersent.build_vocab_k_words(K=100000) return infersent
def _load_classification_text_file(self, text_file_name, offset=0, shuffle=False):
    """Loads tuples from lines of a classification text file.

    Format must look like:

        1 this is a great little ...
        0 "i love hot n juicy . ...
        0 "\""this world needs a ...

    Arguments:
        text_file_name (str): name of the text file to load from.
        offset (int): line to start reading from.
        shuffle (bool): If True, randomly shuffle loaded data.
    """
    text_file_path = utils.download_if_needed(text_file_name)
    with open(text_file_path, "r") as text_file:
        raw_lines = text_file.readlines()[offset:]
    raw_lines = [self._clean_example(ex) for ex in raw_lines]
    self.examples = [self._process_example_from_file(ex) for ex in raw_lines]
    self._i = 0
    if shuffle:
        random.shuffle(self.examples)
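# The helpers _clean_example and _process_example_from_file are defined
# elsewhere in the class. Given the "<label> <text>" format documented above,
# the per-line parsing plausibly looks like this hypothetical sketch (name
# and behavior are assumptions, not the original implementation):
def _parse_classification_line(raw_line):
    """Split a '<label> <text>' line into a (text, int label) tuple."""
    label, _, text = raw_line.strip().partition(" ")
    return text, int(label)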
def __init__(self, max_candidates=15, embedding_type='paragramcf', **kwargs):
    super().__init__(**kwargs)
    self.max_candidates = max_candidates
    self.embedding_type = embedding_type

    if embedding_type == 'paragramcf':
        word_embeddings_folder = 'paragramcf'
        word_embeddings_file = 'paragram.npy'
        word_list_file = 'wordlist.pickle'
        nn_matrix_file = 'nn.npy'
    else:
        raise ValueError(f'Could not find word embedding {embedding_type}')

    # Download embeddings if they're not cached.
    cache_path = utils.download_if_needed('{}/{}'.format(
        WordSwapEmbedding.PATH, embedding_type))

    # Concatenate folder names to create full paths to the files.
    word_embeddings_file = os.path.join(cache_path, word_embeddings_file)
    word_list_file = os.path.join(cache_path, word_list_file)
    nn_matrix_file = os.path.join(cache_path, nn_matrix_file)

    # Actually load the files from disk.
    self.word_embeddings = np.load(word_embeddings_file)
    self.word_embedding_word2index = np.load(word_list_file, allow_pickle=True)
    self.nn = np.load(nn_matrix_file)

    # Build the inverse index (word ID -> word) for candidate lookup.
    self.word_embedding_index2word = {}
    for word, index in self.word_embedding_word2index.items():
        self.word_embedding_index2word[index] = word
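# A sketch of how the nearest-neighbor matrix loaded above would typically
# be queried (hypothetical helper, not the original method). Row nn[i] is
# assumed to hold the indices of the words closest to word i, with entry 0
# being the word itself.
def _nearest_neighbor_words(swap, word):
    try:
        word_id = swap.word_embedding_word2index[word.lower()]
    except KeyError:
        return []  # out-of-vocabulary words have no candidates
    neighbor_ids = swap.nn[word_id][1 : swap.max_candidates + 1]
    return [swap.word_embedding_index2word[i] for i in neighbor_ids]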
@classmethod
def from_pretrained(cls, name_or_path):
    """Load trained LSTM model by name or from path.

    Args:
        name_or_path (str): Name of the model (e.g. "lstm-imdb") or a
            model folder saved via `save_pretrained`.
    """
    if name_or_path in TEXTATTACK_MODELS:
        path = utils.download_if_needed(TEXTATTACK_MODELS[name_or_path])
    else:
        path = name_or_path

    config_path = os.path.join(path, "config.json")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config = json.load(f)
    else:
        # Default config.
        config = {
            "architectures": "LSTMForClassification",
            "hidden_size": 150,
            "depth": 1,
            "dropout": 0.3,
            "num_labels": 2,
            "max_seq_length": 128,
            "model_path": None,
            "emb_layer_trainable": True,
        }
    # "architectures" is metadata, not a constructor argument; drop it,
    # tolerating configs that omit the key (where a bare `del` would raise).
    config.pop("architectures", None)
    model = cls(**config)
    state_dict = load_cached_state_dict(path)
    model.load_state_dict(state_dict)
    return model
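# Usage sketch: "lstm-imdb" is the example name from the docstring above;
# the owning class is assumed to be LSTMForClassification, per the default
# config's "architectures" entry.
def _demo_from_pretrained():
    model = LSTMForClassification.from_pretrained("lstm-imdb")
    model.eval()
    return model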
def __init__(self, model_path, num_labels=2):
    model_file_path = utils.download_if_needed(model_path)
    self.model = BertForSequenceClassification.from_pretrained(
        model_file_path, num_labels=num_labels
    )
    self.model.to(utils.device)
    self.model.eval()
    self.tokenizer = BERTTokenizer(model_file_path)
def load_glove200(self):
    glove_path = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
    glove_word_list_path = os.path.join(glove_path, "glove.wordlist.npy")
    word_list = np.load(glove_word_list_path)
    glove_matrix_path = os.path.join(glove_path, "glove.6B.200d.mat.npy")
    embedding_matrix = np.load(glove_matrix_path)
    return embedding_matrix, word_list
def counterfitted_GLOVE_embedding():
    """Returns a prebuilt counter-fitted GloVe word embedding, proposed in
    "Counter-fitting Word Vectors to Linguistic Constraints"
    (Mrkšić et al., 2016)."""
    if "textattack_counterfitted_GLOVE_embedding" in utils.GLOBAL_OBJECTS and isinstance(
        utils.GLOBAL_OBJECTS["textattack_counterfitted_GLOVE_embedding"],
        WordEmbedding,
    ):
        # Avoid recreating the same embedding (and duplicating its memory);
        # share one instance across components instead.
        return utils.GLOBAL_OBJECTS["textattack_counterfitted_GLOVE_embedding"]

    word_embeddings_folder = "paragramcf"
    word_embeddings_file = "paragram.npy"
    word_list_file = "wordlist.pickle"
    mse_dist_file = "mse_dist.p"
    cos_sim_file = "cos_sim.p"
    nn_matrix_file = "nn.npy"

    # Download embeddings if they're not cached.
    word_embeddings_folder = os.path.join(
        WordEmbedding.PATH, word_embeddings_folder
    ).replace("\\", "/")
    word_embeddings_folder = utils.download_if_needed(word_embeddings_folder)

    # Concatenate folder names to create full paths to the files.
    word_embeddings_file = os.path.join(word_embeddings_folder, word_embeddings_file)
    word_list_file = os.path.join(word_embeddings_folder, word_list_file)
    mse_dist_file = os.path.join(word_embeddings_folder, mse_dist_file)
    cos_sim_file = os.path.join(word_embeddings_folder, cos_sim_file)
    nn_matrix_file = os.path.join(word_embeddings_folder, nn_matrix_file)

    # Load the files from disk.
    embedding_matrix = np.load(word_embeddings_file)
    word2index = np.load(word_list_file, allow_pickle=True)
    index2word = {}
    for word, index in word2index.items():
        index2word[index] = word
    nn_matrix = np.load(nn_matrix_file)

    embedding = WordEmbedding(embedding_matrix, word2index, index2word, nn_matrix)

    with open(mse_dist_file, "rb") as f:
        mse_dist_mat = pickle.load(f)
    with open(cos_sim_file, "rb") as f:
        cos_sim_mat = pickle.load(f)
    embedding._mse_dist_mat = mse_dist_mat
    embedding._cos_sim_mat = cos_sim_mat

    utils.GLOBAL_OBJECTS["textattack_counterfitted_GLOVE_embedding"] = embedding
    return embedding
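# An illustrative (non-original) helper showing the effect of the
# utils.GLOBAL_OBJECTS memoization above: repeated calls share one object,
# so the pickled distance matrices are loaded only once per process.
def _demo_shared_embedding():
    emb_a = counterfitted_GLOVE_embedding()
    emb_b = counterfitted_GLOVE_embedding()
    assert emb_a is emb_b
    return emb_a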
def __init__(self):
    glove_path = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
    glove_word_list_path = os.path.join(glove_path, "glove.wordlist.npy")
    word_list = np.load(glove_word_list_path)
    glove_matrix_path = os.path.join(glove_path, "glove.6B.200d.mat.npy")
    embedding_matrix = np.load(glove_matrix_path)
    super().__init__(embedding_matrix=embedding_matrix, word_list=word_list)
def __init__(self, model_path, num_labels=2, entailment=False):
    model_file_path = utils.download_if_needed(model_path)
    self.model = BertForSequenceClassification.from_pretrained(
        model_file_path, num_labels=num_labels
    )
    self.model.to(utils.get_device())
    self.model.eval()
    if entailment:
        self.tokenizer = BERTEntailmentTokenizer()
    else:
        self.tokenizer = BERTTokenizer(model_file_path)
def __init__(self, emb_layer_trainable=True):
    glove_path = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
    glove_word_list_path = os.path.join(glove_path, "glove.wordlist.npy")
    word_list = np.load(glove_word_list_path)
    glove_matrix_path = os.path.join(glove_path, "glove.6B.200d.mat.npy")
    embedding_matrix = np.load(glove_matrix_path)
    super().__init__(embedding_matrix=embedding_matrix, word_list=word_list)
    self.embedding.weight.requires_grad = emb_layer_trainable
def __init__(
    self,
    embedding_type="paragramcf",
    include_unknown_words=True,
    min_cos_sim=None,
    max_mse_dist=None,
    cased=False,
    compare_against_original=True,
):
    super().__init__(compare_against_original)
    self.include_unknown_words = include_unknown_words
    self.cased = cased
    self.min_cos_sim = min_cos_sim
    self.max_mse_dist = max_mse_dist
    self.embedding_type = embedding_type

    if embedding_type == "paragramcf":
        word_embeddings_folder = "paragramcf"
        word_embeddings_file = "paragram.npy"
        word_list_file = "wordlist.pickle"
        mse_dist_file = "mse_dist.p"
        cos_sim_file = "cos_sim.p"
    else:
        raise ValueError(f"Could not find word embedding {embedding_type}")

    # Download embeddings if they're not cached.
    word_embeddings_folder = os.path.join(
        WordEmbeddingDistance.PATH, word_embeddings_folder
    )
    word_embeddings_folder = utils.download_if_needed(word_embeddings_folder)

    # Concatenate folder names to create full paths to the files.
    word_embeddings_file = os.path.join(word_embeddings_folder, word_embeddings_file)
    word_list_file = os.path.join(word_embeddings_folder, word_list_file)
    mse_dist_file = os.path.join(word_embeddings_folder, mse_dist_file)
    cos_sim_file = os.path.join(word_embeddings_folder, cos_sim_file)

    # Actually load the files from disk.
    self.word_embeddings = np.load(word_embeddings_file)
    self.word_embedding_word2index = np.load(word_list_file, allow_pickle=True)

    # Precomputed distance matrices store distances at mat[x][y], where
    # x and y are word IDs and x < y.
    if self.max_mse_dist is not None and os.path.exists(mse_dist_file):
        with open(mse_dist_file, "rb") as f:
            self.mse_dist_mat = pickle.load(f)
    else:
        self.mse_dist_mat = {}
    if self.min_cos_sim is not None and os.path.exists(cos_sim_file):
        with open(cos_sim_file, "rb") as f:
            self.cos_sim_mat = pickle.load(f)
    else:
        self.cos_sim_mat = {}
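# The precomputed matrices above are triangular: mat[x][y] is populated only
# for word IDs x < y, so a lookup must order the two IDs first. A
# hypothetical helper (not the original method) showing the access pattern:
def _lookup_mse_dist(constraint, id_a, id_b):
    small, big = min(id_a, id_b), max(id_a, id_b)
    try:
        return constraint.mse_dist_mat[small][big]
    except KeyError:
        return None  # distance was not precomputed for this pair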
def load_cached_state_dict(model_folder_path):
    if not os.path.exists(model_folder_path):
        model_folder_path = utils.download_if_needed(model_folder_path)
    # Take the first model matching the pattern *model.bin.
    model_path_list = glob.glob(os.path.join(model_folder_path, "*model.bin"))
    if not model_path_list:
        raise FileNotFoundError(
            f"model.bin not found in model folder {model_folder_path}."
        )
    model_path = model_path_list[0]
    state_dict = torch.load(model_path, map_location=utils.device)
    return state_dict
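# Usage sketch (illustrative only): the returned state dict plugs straight
# into any torch.nn.Module whose parameter names match the checkpoint.
def _demo_load_state_dict(model, model_folder_path):
    state_dict = load_cached_state_dict(model_folder_path)
    model.load_state_dict(state_dict)
    model.eval()
    return model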
def get_infersent_model(self): """ Retrieves the InferSent model. Returns: The pretrained InferSent model. """ infersent_version = 2 model_folder_path = utils.download_if_needed(InferSent.MODEL_PATH) model_path = os.path.join(model_folder_path, f'infersent{infersent_version}.pkl') params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': infersent_version} infersent = InferSentModel(params_model) infersent.load_state_dict(torch.load(model_path)) word_embedding_path = utils.download_if_needed(InferSent.WORD_EMBEDDING_PATH) w2v_path = os.path.join(word_embedding_path, 'fastText', 'crawl-300d-2M.vec') infersent.set_w2v_path(w2v_path) infersent.build_vocab_k_words(K=100000) return infersent
def __init__(self, max_candidates=-1, **kwargs):
    super().__init__(**kwargs)
    self.max_candidates = max_candidates

    # Download the synonym candidate bank if it isn't cached.
    cache_path = utils.download_if_needed("{}/{}".format(
        WordSwapHowNet.PATH, "word_candidates_sense.pkl"))

    # Actually load the file from disk.
    with open(cache_path, "rb") as fp:
        self.candidates_bank = pickle.load(fp)

    self.pos_dict = {"JJ": "adj", "NN": "noun", "RB": "adv", "VB": "verb"}
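# A sketch of how the HowNet candidate bank loaded above would be queried
# (hypothetical helper; the real lookup lives elsewhere in the class).
# pos_dict maps the first two letters of a Penn Treebank tag to the coarse
# part-of-speech keys the bank is assumed to be indexed by.
def _hownet_candidates(swap, word, treebank_pos):
    pos = swap.pos_dict.get(treebank_pos[:2])
    if pos is None:
        return []  # only adjectives, nouns, adverbs, and verbs are swapped
    candidates = swap.candidates_bank.get(word.lower(), {}).get(pos, [])
    if swap.max_candidates > 0:
        candidates = candidates[: swap.max_candidates]
    return candidates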
def __init__(self):
    lm_folder = utils.download_if_needed(GoogLMHelper.CACHE_PATH)
    self.PBTXT_PATH = os.path.join(lm_folder, "graph-2016-09-10-gpu.pbtxt")
    self.CKPT_PATH = os.path.join(lm_folder, "ckpt-*")
    self.VOCAB_PATH = os.path.join(lm_folder, "vocab-2016-09-10.txt")

    self.BATCH_SIZE = 1
    self.NUM_TIMESTEPS = 1
    self.MAX_WORD_LEN = 50

    self.vocab = lm_data_utils.CharsVocabulary(self.VOCAB_PATH, self.MAX_WORD_LEN)
    with tf.device("/gpu:1"):
        self.graph = tf.Graph()
        self.sess = tf.compat.v1.Session(graph=self.graph)
    with self.graph.as_default():
        self.t = lm_utils.LoadModel(
            self.sess, self.graph, self.PBTXT_PATH, self.CKPT_PATH
        )
    self.lm_cache = lru.LRU(2 ** 18)
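# The lm_cache above is an lru.LRU mapping with a fixed capacity of 2**18
# entries: it behaves like a dict that silently evicts the least recently
# used key once full. A minimal, self-contained illustration:
def _demo_lru_eviction():
    cache = lru.LRU(2)
    cache["a"], cache["b"] = 1, 2
    _ = cache["a"]       # touch "a" so "b" becomes least recently used
    cache["c"] = 3       # inserting a third key evicts "b"
    return "b" in cache  # False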
def _load_classification_text_file(self, text_file_name, offset=0):
    """Loads tuples from lines of a classification text file.

    Format must look like:

        1 this is a great little ...
        0 "i love hot n juicy . ...
        0 "\""this world needs a ...

    Arguments:
        text_file_name (str): name of the text file to load from.
        offset (int): line to start reading from.
    """
    text_file_path = utils.download_if_needed(text_file_name)
    with open(text_file_path, 'r') as text_file:
        raw_lines = text_file.readlines()[offset:]
    raw_lines = [self._clean_example(ex) for ex in raw_lines]
    self.examples = [self._process_example_from_file(ex) for ex in raw_lines]
    self.i = 0
def __init__(self, embedding_type='paragramcf', include_unknown_words=True,
             min_cos_sim=None, max_mse_dist=None, cased=False):
    self.include_unknown_words = include_unknown_words
    self.cased = cased
    self.min_cos_sim = min_cos_sim
    self.max_mse_dist = max_mse_dist
    self.embedding_type = embedding_type

    if embedding_type == 'paragramcf':
        word_embeddings_folder = 'paragramcf'
        word_embeddings_file = 'paragram.npy'
        word_list_file = 'wordlist.pickle'
        mse_dist_file = 'mse_dist.p'
        cos_sim_file = 'cos_sim.p'
    else:
        raise ValueError(f'Could not find word embedding {embedding_type}')

    # Download embeddings if they're not cached.
    word_embeddings_path = utils.download_if_needed(WordEmbeddingDistance.PATH)
    word_embeddings_folder = os.path.join(word_embeddings_path,
                                          word_embeddings_folder)

    # Concatenate folder names to create full paths to the files.
    word_embeddings_file = os.path.join(word_embeddings_folder,
                                        word_embeddings_file)
    word_list_file = os.path.join(word_embeddings_folder, word_list_file)
    mse_dist_file = os.path.join(word_embeddings_folder, mse_dist_file)
    cos_sim_file = os.path.join(word_embeddings_folder, cos_sim_file)

    # Actually load the files from disk.
    self.word_embeddings = np.load(word_embeddings_file)
    self.word_embedding_word2index = np.load(word_list_file, allow_pickle=True)

    # Precomputed distance matrices store distances at mat[x][y], where
    # x and y are word IDs and x < y.
    if self.max_mse_dist is not None and os.path.exists(mse_dist_file):
        with open(mse_dist_file, 'rb') as f:
            self.mse_dist_mat = pickle.load(f)
    else:
        self.mse_dist_mat = {}
    if self.min_cos_sim is not None and os.path.exists(cos_sim_file):
        with open(cos_sim_file, 'rb') as f:
            self.cos_sim_mat = pickle.load(f)
    else:
        self.cos_sim_mat = {}
def __init__(self):
    path = BERTForSNLI.MODEL_PATH
    utils.download_if_needed(path)
    super().__init__(path, entailment=True, num_labels=3)
def _load_pickle_file(self, file_name, offset=0):
    self._i = 0
    file_path = utils.download_if_needed(file_name)
    with open(file_path, "rb") as f:
        self.examples = pickle.load(f)
    self.examples = self.examples[offset:]
def __init__(self):
    glove_path = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
    glove_path = os.path.join(glove_path, 'glove.6B.200d.txt')
    super().__init__(embs=load_embedding(glove_path))
def load_cached_state_dict(model_folder_path):
    model_folder_path = utils.download_if_needed(model_folder_path)
    model_path = os.path.join(model_folder_path, "model.bin")
    state_dict = torch.load(model_path, map_location=utils.device)
    return state_dict