def download_languages_bin(directory: AnyStr, language_codes: List[AnyStr]):
    """Download all the required language fastText word embeddings.

    Takes a list of language codes as defined on
    https://fasttext.cc/docs/en/crawl-vectors.html and downloads each model
    into ``directory``, deleting the compressed archive afterwards.

    Args:
        directory: Sub-directory of the current working directory in which
            the ``cc.<lang>.300.bin`` files are stored.
        language_codes: Language codes to download (e.g. ``["en", "fr"]``).
    """
    root = getcwd()
    chdir(root + f"/{directory}")
    # BUG FIX: the original did not restore the working directory when a
    # download raised, leaving the whole process chdir'd into `directory`.
    try:
        for language in language_codes:
            # 'ignore' makes this a no-op for models already on disk.
            download_model(language, if_exists='ignore')
            # Drop the leftover gzip archive; only the extracted .bin is used.
            run(["rm", "-f", f"cc.{language}.300.bin.gz"])
    finally:
        chdir(root)
def load_embeddings(filename, language="", reduced_dim=None):
    """Load pre-trained fastText word embeddings from ``filename``.

    Args:
        filename: Path to a fastText ``.bin`` model file.
        language: Optional language code; when non-empty, the official
            fastText distribution for that language is downloaded first
            (skipped if already present).
        reduced_dim: Optional target embedding dimension, must satisfy
            ``0 < reduced_dim < 300``; when given, the model is reduced
            in place to that dimension.

    Returns:
        The loaded (and possibly dimension-reduced) fastText model.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    # Download the official fasttext embedding distribution:
    if language:
        util.download_model(language, if_exists='ignore')
    # Load model:
    try:
        model = fasttext.load_model(filename)
    except FileNotFoundError:
        # BUG FIX: the original printed a message that never interpolated the
        # filename and then fell through with `model` unbound, crashing below
        # with UnboundLocalError. Re-raise with the actual filename instead.
        raise FileNotFoundError(f"File with name {filename} does not exist.")
    # Reduce embedding dimension if needed:
    if reduced_dim:
        assert reduced_dim < 300, f"The new embedding dimension {reduced_dim} is too big"
        assert reduced_dim > 0, f"The new embedding dimension {reduced_dim} must be strictly positive"
        util.reduce_model(model, reduced_dim)
    return model
def __init__(
    self,
    model_dir: str = "models",
    fasttext_model_name: str = "ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin",
    fasttext_en_model_name: str = "cc.en.300.bin",
):
    """Load a fastText model and wrap its input matrix in a CUDA EmbeddingBag.

    Args:
        model_dir: Directory containing the fastText ``.bin`` models.
        fasttext_model_name: Model file to load; the sentinel value ``"en"``
            switches to the official English model instead.
        fasttext_en_model_name: File name of the English model, loaded from
            the current directory when ``fasttext_model_name == "en"``.
            NOTE(review): this path is NOT joined with ``model_dir`` — confirm
            that is intentional.
    """
    self.model_dir = model_dir
    self.fasttext_model_name = fasttext_model_name
    self.fasttext_en_model_name = fasttext_en_model_name
    self.model_path = os.path.join(self.model_dir, self.fasttext_model_name)
    if self.fasttext_model_name == "en":
        # Download the official English vectors first (no-op if cached).
        util.download_model('en', if_exists='ignore')
        self.model = load_model(self.fasttext_en_model_name)
    else:
        self.model = load_model(self.model_path)
    self.input_matrix = torch.FloatTensor(self.model.get_input_matrix())
    self.matrix_shape = self.input_matrix.shape
    # BUG FIX: `from_pretrained` is a classmethod; the original first built a
    # throwaway EmbeddingBag(rows, cols), allocating an unused weight matrix
    # that was immediately discarded. Call the classmethod directly.
    self.embedding_bag = EmbeddingBag.from_pretrained(
        self.input_matrix, mode="mean"
    ).cuda()
def load_embeddings(language):
    """Download (if absent) and load the official fastText model for a language.

    Args:
        language: Language code as listed on fasttext.cc (e.g. "en", "fr").

    Returns:
        The fastText model loaded from ``cc.<language>.300.bin``.
    """
    from fasttext import load_model
    from fasttext.util import download_model

    # 'ignore' turns the download into a no-op when the file already exists.
    download_model(language, if_exists='ignore')
    model_file = f"cc.{language}.300.bin"
    return load_model(model_file)
import gdown
from fasttext.util import download_model

# (progress message, Google Drive source URL, local destination) per asset.
_DRIVE_ASSETS = [
    ('Download job offers data',
     'https://drive.google.com/uc?export=download&confirm=A6wL&id=1tI4SctLNkZU6vJuBw1Hf1lVqephc35cG',
     'data/all_offers.csv'),
    ('Download FastText representations of job offers',
     'https://drive.google.com/uc?export=download&confirm=-GH4&id=1m_ckxOk4Ga884ai9mopnSj7gmvb1t5tG',
     'data/offers_fasttext.npy'),
]

if __name__ == '__main__':
    # Fetch the dataset and its precomputed fastText vectors from Drive.
    for message, url, destination in _DRIVE_ASSETS:
        print(message)
        gdown.download(url, destination, quiet=False)
    print('Download FastText French model')
    # Pre-trained French vectors; skipped when already present on disk.
    download_model('fr', if_exists='ignore')