def _emb(self):
    """Return the FastText vectors, loading them from ``self._path`` on first access and caching the result."""
    cached = self._embeddings
    if cached is None:
        cached = load_facebook_vectors(self._path)
        self._embeddings = cached
    return cached
def __init__(self, filepath, text_field, label_field, embeddings=None,
             max_text_len=cfg.max_text_len, alphabet=None, noise_level=0,
             elmo=False):
    """Dataset that reads a CSV file and resolves FastText embeddings.

    :param filepath: path to the CSV data file.
    :param text_field: name of the column containing the text.
    :param label_field: name of the column containing the labels.
    :param embeddings: path to a FastText ``.bin`` file, or an already
        loaded ``FastTextKeyedVectors`` / ``KeyedVectors`` instance. Any
        other value (including the default ``None``) raises ValueError.
    :param max_text_len: maximum text length (defaults to ``cfg.max_text_len``).
    :param alphabet: character alphabet; falls back to ``cfg.alphabet``.
    :param noise_level: character-noise level used in preprocessing —
        presumably a probability; TODO confirm semantics.
    :param elmo: deprecated; must be False.
    :raises ValueError: when ``embeddings`` is neither a path nor a gensim
        keyed-vectors object.
    """
    assert not elmo, 'ELMo support is deprecated'
    if isinstance(embeddings, str):
        self.embeddings = load_facebook_vectors(embeddings)
    elif isinstance(embeddings, (FastTextKeyedVectors, KeyedVectors)):
        self.embeddings = embeddings
    else:
        # BUG FIX: the original literals concatenated into
        # "...objectgot <type>..." — add the missing separator.
        raise ValueError('embeddings should be path to FastText file or '
                         'gensim FastTextKeyedVectors object, '
                         f'got {type(embeddings)} instead')
    self._noise_level = noise_level
    self.alphabet = alphabet or cfg.alphabet
    self.text_field = text_field
    self.label_field = label_field
    self.data = pd.read_csv(filepath)
    self.max_text_len = max_text_len
    if self.embeddings is not None:
        # Random vector used for out-of-vocabulary tokens.
        self.unk_vec = np.random.rand(self.embeddings.vector_size)
    self.label2int = {
        label: index
        for index, label in enumerate(sorted(self.data[self.label_field].unique()))
    }
    self._data = self._preprocess_df(self.data)
def __init__(self, hyperparameters, verbose=True):
    """Set up a keyword-detector instance from a hyperparameter mapping.

    When ``distance_method`` is ``"fasttext"``, pretrained embeddings are
    loaded immediately from ``pretrained_embedding_path``.
    """
    self.hyperparameters = hyperparameters
    self.verbose = verbose
    self.distance_method = hyperparameters["distance_method"]
    self.keyword_graph = None
    self.inverse_lemmatizer_mapping = {}
    self.default_visualization_parameters = {
        "top_n": 10,
        "max_node_size": 8,
        "min_node_size": 2,
        "label_font_size": 10,
        "text_color": "red",
        "num_layout_iterations": 50,
        "edge_width": 0.08,
        "alpha_channel": 0.5,
    }
    if self.distance_method == "fasttext":
        from gensim.models import fasttext
        self.pretrained_embedding_path = hyperparameters['pretrained_embedding_path']
        self.model = fasttext.load_facebook_vectors(self.pretrained_embedding_path)
    if self.verbose:
        logging.info("Initiated a keyword detector instance.")
def __init__(self, TH, query,
             model_path="/home/ubuntu/seungho/fastText/build/run11_chat_mecab_190824.bin"):
    """Hold a query/threshold pair and load the FastText model.

    :param TH: threshold value — semantics defined by the caller; TODO confirm.
    :param query: the query string/object stored on the instance.
    :param model_path: path to the native fastText ``.bin`` file.
        GENERALIZATION: previously hard-coded; the default preserves the
        original behavior for existing callers.
    """
    self.query = query
    self.TH = TH
    self.cap_path = datapath(model_path)
    self.model = load_facebook_vectors(self.cap_path)
    # Probe lookup to force subword machinery; also kept for inspection.
    self.example = self.model['안녕']
def __init__(self, embeddings_path: str, verbose: bool = True) -> None:
    """Initialize the embedder, picking the loader that works on this OS.

    Windows uses gensim's ``load_facebook_vectors``; every other platform
    uses the project's ``load_fasttext_embeddings`` helper.
    """
    super().__init__(verbose=verbose)
    if platform.system() == "Windows":
        loader = load_facebook_vectors
    else:
        loader = load_fasttext_embeddings
    self.model = loader(embeddings_path)
def __init__(self, vectors_path='data/fasttext/wiki.en.bin', cuda=True):
    """FastText-backed word embedder.

    :param vectors_path: location of the native fastText ``.bin`` file.
    :param cuda: whether downstream code should run on the GPU.
    """
    super().__init__()
    print('Init FastText embedder')
    self.cuda = cuda
    self.word_vec_dim = 300  # wiki.en fastText vectors are 300-dimensional
    self.wv = load_facebook_vectors(vectors_path)
def load_fasttext_model(fasttext_path):
    """Load FastText word vectors from a native binary file.

    Args:
        fasttext_path: Path to fastText binary file

    Returns:
        The loaded keyed vectors.
    """
    import gensim.models.fasttext as ft
    return ft.load_facebook_vectors(datapath(fasttext_path))
def train_model(self, corpus):
    """Lazily load the pretrained FastText model and return it.

    ``corpus`` is accepted for interface compatibility but unused: the
    model is restored from ``self.pretrained_model_path`` (native ``.bin``
    via ``load_facebook_vectors``, otherwise a gensim-saved
    ``FastTextKeyedVectors``) the first time this is called.
    """
    if self.model is not None:
        return self.model
    path = self.pretrained_model_path
    logging.info(f"Start loading model {path}")
    if path.endswith(".bin"):
        loaded = load_facebook_vectors(path)
    else:
        loaded = FastTextKeyedVectors.load(path)
    self.model = loaded
    self.model.init_sims(True)
    logging.info(f"Finished loading model {path}")
    return self.model
def load(path: str, name: str):
    """Load word embeddings by naming convention.

    :param path: filesystem path of the embeddings artifact.
    :param name: artifact name; its prefix/suffix selects the loader:
        ``cc*`` → native fastText vectors, ``*bin`` → project-trained
        gensim FastText model, ``*zip`` → gensim model archive.
    :raises ValueError: if the name matches none of the conventions.
    """
    if name.startswith('cc'):
        # Case: Native fastText embeddings.
        return load_facebook_vectors(path, encoding='latin1')
    if name.endswith('bin'):
        # Case: Models trained specifically for this project.
        model = FastText.load(path)
        # Pre-compute L2-normalized vectors.
        model.init_sims(replace=True)
        return model.wv
    if name.endswith('zip'):
        return load_gensim_model(filepath=path)
    # BUG FIX: previously fell through and returned None silently.
    raise ValueError(f"Don't know how to load embeddings named {name!r}")
def test_get_fasttext_model(self):
    """Train a small fastText model on the prepared data and verify vector dimensionality."""
    frame = pandas.read_csv(str(TEST_DATA_DIR / "prepared_data.csv.xz"),
                            index_col=0, keep_default_na=False)
    with tempfile.TemporaryDirectory(prefix="lookout_typos_fasttext_") as tmp:
        ft_config = {
            "size": 100,
            "path": os.path.join(tmp, "ft.bin"),
            "dim": 5,
        }
        train_fasttext(frame, ft_config)
        vectors = load_facebook_vectors(ft_config["path"])
        self.assertTupleEqual(vectors["get"].shape, (5,))
def get_embeddings(
    embeddings: str,
    embeddings_format: str = 'glove',
    embeddings_binary: bool = False,
) -> KeyedVectors:
    """
    Get the embeddings model and matrix used in the setup function

    Parameters
    ----------
    embeddings : str
        Path to pretrained embeddings (or a gensim-data alias when
        ``embeddings_format='gensim'``).
    embeddings_format : str, optional
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
        be used to download embeddings hosted on gensim on the fly. See
        https://github.com/RaRe-Technologies/gensim-data
        for the list of available embedding aliases.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False

    Returns
    -------
    KeyedVectors
        The embeddings object specified by the parameters.

    Raises
    ------
    ValueError
        If ``embeddings_format`` is not one of the supported formats.
    """
    model = None
    if embeddings_format == 'glove':
        # GloVe files need a header line before gensim can read them.
        with temporary_file('temp.txt') as temp:
            glove2word2vec(embeddings, temp)
            model = KeyedVectors.load_word2vec_format(temp, binary=embeddings_binary)
    elif embeddings_format == 'word2vec':
        model = KeyedVectors.load_word2vec_format(embeddings,
                                                  binary=embeddings_binary)
    elif embeddings_format == 'fasttext':
        model = fasttext.load_facebook_vectors(embeddings)
    elif embeddings_format == 'gensim':
        # Prefer a local gensim dump; fall back to downloading by alias.
        try:
            model = KeyedVectors.load(embeddings)
        except FileNotFoundError:
            model = api.load(embeddings)
    else:
        # BUG FIX: the original message omitted 'glove' even though it is
        # a supported format.
        raise ValueError(
            "Only formats supported are glove, word2vec, fasttext and gensim")
    return model
def load_words_embeddings(filepath, base_file, vocab_file=""):
    """Load FastText vectors for a vocabulary and return (words, embeddings).

    :param filepath: path to the native fastText ``.bin`` file.
    :param base_file: truthy → build the vocabulary now and write it to
        'words.txt'; falsy → reuse the vocabulary previously saved there.
    :param vocab_file: optional path to a pre-made word list; when given
        (and ``base_file`` is truthy) it replaces the model vocabulary.
    :return: list of words and a numpy array of their embedding vectors.
    """
    model = load_facebook_vectors(filepath)
    stop_words = stopwords.words('english')
    words = []
    embeddings = []
    if base_file:
        vocab = model.vocab
        # Context manager guarantees the vocabulary file is closed
        # (the original left it open on any exception).
        with open('words.txt', 'w') as f:
            if vocab_file != "":
                # BUG FIX: the original built new_vocab from the given file
                # but never assigned it to `vocab`, so the loop below
                # iterated the full model vocabulary instead of the
                # requested word list.
                vocab = list(load_words(vocab_file))
            else:
                print("Initial vocab length: " + str(len(vocab)))
                seen = set()  # O(1) dedup instead of O(n) list scans
                new_vocab = []
                for word in vocab:
                    word = re.sub(r'[^a-z-_]+', '', word.lower()).strip()
                    if (word not in seen and word not in stop_words
                            and word != "" and len(word) > 2):
                        seen.add(word)
                        new_vocab.append(word)
                vocab = new_vocab
                print("Processed vocab length: " + str(len(vocab)))
            for word in vocab:
                words.append(word)
                f.write(word + "\n")
                embeddings.append(model[word].tolist())
    else:
        vocab = load_words('words.txt')
        vocab = [word.strip("\n") for word in vocab]
        for word in vocab:
            words.append(word)
            embeddings.append(model[word].tolist())
    return words, np.array(embeddings)
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              config: Optional[Mapping[str, Any]] = None) -> None:
    """Build the correction-candidates generator from its resource files.

    :param vocabulary_file: text file whose first token on every line is
        added to the vocabulary of correction candidates.
    :param frequencies_file: text file with one "token count" pair
        (whitespace-separated) per line.
    :param embeddings_file: path to the dump of the FastText model.
    :param config: candidates generation configuration, options:
        neighbors_number: number of neighbors of context and typo
            embeddings to consider as candidates (int).
        edit_dist_number: number of the most frequent tokens among tokens
            on equal edit distance from the typo to consider as
            candidates (int).
        max_distance: maximum edit distance for symspell candidate
            lookup (int).
        radius: maximum edit distance from typo allowed for candidates (int).
        max_corrected_length: maximum length of prefix in which symspell
            lookup for typos is conducted (int).
        start_pool_size: data length starting from which multiprocessing
            is desired (int).
        chunksize: max chunk size for one process during
            multiprocessing (int).
        set_min_freq: True to set the frequency of unknown tokens to the
            minimum frequency in the vocabulary, zero otherwise.
    """
    self.set_config(config)
    self.wv = load_facebook_vectors(embeddings_file)
    self.tokens = set(read_vocabulary(vocabulary_file))
    self.frequencies = read_frequencies(frequencies_file)
    self.checker = SymSpell(
        max_dictionary_edit_distance=self.config["max_distance"],
        prefix_length=self.config["max_corrected_length"])
    self.checker.load_dictionary(vocabulary_file)
    if self.config["set_min_freq"]:
        self.min_freq = min(self.frequencies.values())
def cossim_compare(infile, outfile, visim):
    """Compare fastText cosine similarities against gold similarity scores.

    Reads word pairs and gold scores (column ``Sim2``, 0-10 scale) from the
    tab-separated *visim* file, loads a fastText model from *infile*, writes
    one "<cosine>\\t<gold/10>" line per pair to *outfile*, then appends the
    summed absolute error as the final line.
    """
    fields = ['Word1', 'Word2', 'Sim2']
    df = pandas.read_csv(visim, sep='\t', skipinitialspace=True, usecols=fields)
    print('loading model')
    model = load_facebook_vectors(infile)
    print('calculating cosine similarity')
    score_list = []
    # Single open in write mode replaces the original write-then-reopen-append
    # pair; the produced file contents are identical.
    with open(outfile, 'w') as result:
        for i in range(df.shape[0]):
            vec1 = model[df.iloc[i, 0]]
            vec2 = model[df.iloc[i, 1]]
            gold = df.iloc[i, 2] / 10
            cossim = 1 - distance.cosine(vec1, vec2)
            result.write(f'{cossim}\t{gold}\n')
            score_list.append(abs(cossim - gold))
        result.write(str(sum(score_list)))
def load_vectors(self, pmodel_name):
    """
    Loads the VECTORS of an already trained model. Using just the vectors
    is much quicker and less cumbersome than the full model, while still
    providing the important syntactic/semantic tools. If the vectors of
    the specified model are not found but another model's vectors are
    already loaded, this instance will continue using the loaded ones.

    Parameters
    -----------
    pmodel_name (str) - Name of the model to load vectors from

    Throws
    -----------
    FileNotFoundError - If specified model is not found.
    """
    name = pmodel_name if pmodel_name.endswith('.bin') else pmodel_name + '.bin'
    try:
        self.wordvectors = ft.load_facebook_vectors(
            os.path.join(self.__PATH_PREFIX__, name))
    except FileNotFoundError as err:
        raise FileNotFoundError("Model with name {} not found.".format(
            name[:-4])) from err
def load_wv_with_gensim(pretrained_embedding: str,
                        cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """
    Loads word embeddings with Gensim.

    :param str pretrained_embedding: name of the embedding; its extension
        picks the loader — ``.wv`` → word2vec keyed vectors, ``.swv`` →
        fastText subword vectors
    :param cache_dir: the directory for storing cached data
    :param bool verbose: `True` to increase verbosity
    :raises ValueError: if the embedding name has an unsupported extension
    :return: KeyedVectors or FastTextKeyedVectors
    """
    _word_embeddings_available(pretrained_embedding, can_use_subword=True)
    download_model(pretrained_embedding, cache_dir,
                   _process_downloaded_embeddings, verbose=verbose)
    wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin")
    suffix = pretrained_embedding.split(".")[-1]
    if suffix == 'wv':
        return KeyedVectors.load_word2vec_format(wv_path, binary=True)
    if suffix == 'swv':
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(wv_path)
    # BUG FIX: previously fell through and returned None silently for
    # unrecognized extensions.
    raise ValueError(
        f"Unsupported pretrained embedding: {pretrained_embedding}")
# Module-level accumulators, filled elsewhere in the pipeline.
REPEATED = []
W2VCANDIDAT = []
#LM_EM = []
# NOTE(review): TOKEN_LEN, BLACKLISTS and CONTEXT are defined outside this
# chunk — presumably sibling accumulators; verify against the full file.
TRAIN_DICTIONARY = {
    'tok_len': TOKEN_LEN,
    'blacklist': BLACKLISTS,
    'repeats': REPEATED,
    'word2vec': W2VCANDIDAT,
    'context': CONTEXT
}
TARGETS = []
ALL_RESULTS = []
# Loads the Russian Common Crawl fastText vectors at import time.
MODEL = load_facebook_vectors("./cc.ru.300.bin")


class Word():
    # Wrapper around a single token, with a blacklist check.

    def __init__(self, word):
        self.word = word
        self.token_length = len(word)

    def check_blacklist(self):
        """
        Check if the word in blacklist (number, latin or one-symbol)
        :return 0 if not in blacklist, 1 if in
        """
        # Character class covering digits, punctuation and latin letters.
        pattern = '[0-9\\.\\:\\-\\/a-z]+'
        # One-symbol words are always blacklisted.
        if len(self.word) == 1:
            return 1
        # NOTE(review): the method appears truncated in this chunk —
        # `pattern` is presumably matched against the word further below.
args.input_data, delimiter=';',
    names=['idf', 'labels', 'sentences', 'pivot_words', 'src', 'alea'])
# NOTE(review): this chunk opens inside a file-reading call (presumably
# pandas read_csv assigned to `df`) whose beginning lies above this excerpt.
print(df.head(5))
# Gold labels for the evaluation below.
y_test = df['labels']
print('\n ** Transform sentences to ' + str(args.ngram_size) + ' ngrams... \n')
# Convert raw sentences to n-gram lists; `fr_nouns_file` presumably supplies
# a French noun list — TODO confirm against sentences_to_ngrams.
ngrams_list = sentences_to_ngrams(df['sentences'], args.ngram_size, args.fr_nouns_file)
print(ngrams_list)
print("\n ** Loading fastText model...\n")
fasttext_model = fasttext.load_facebook_vectors(args.model_fasttext)
print('\n ** Vectorisation of inputs... \n')
x_test = vectorization(args.ngram_size, ngrams_list, args.we_vector_size, fasttext_model)
# Fixed seed for reproducibility.
np.random.seed(1)
print('\n ** Loading model ' + args.model_path + ' \n')
# Keras-based classifiers need keras' load_model; other algorithms use
# the plain `load` (presumably joblib/pickle — TODO confirm).
keras_models = ['GRU', 'MLP_PCA', 'MLP_AE']
if args.algorithm in keras_models:
    clf = load_model(args.model_path)
else:
    clf = load(args.model_path)
def main():
    """Run generative evaluation for Visual Dialogue: compute overlap
    (CIDEr 1..n, METEOR) and/or embedding (BERT, fastText) metrics of
    generated answers against reference sets, then print the scores."""
    parser = argparse.ArgumentParser(description='Generative Evaluation for Visual Dialogue')
    parser.add_argument('--generations', dest='generations', default='./generations.json',
                        help='Path to file with answer generations.')
    parser.add_argument('--references', dest='references', default='densevisdial/refs_S_val.json',
                        help='Path to file with answer reference sets.')
    # overlap (CIDER, METEOR) parameters
    parser.add_argument('--n', dest='n', type=int, default=4,
                        help='Cider n-gram (computes 1 to n).')
    parser.add_argument('--no_overlap', dest='no_overlap', action='store_true',
                        help='Do not compute overlap metrics.')
    # embedding distance FastText parameters
    parser.add_argument('--fast_text_model', dest='fast_text_model', required=True,
                        help='Path to FastText .bin model.')
    parser.add_argument('--no_embedding', dest='no_embedding', action='store_true',
                        help='Do not compute embedding metrics.')
    args = parser.parse_args()

    # load answer generations and reference sets
    print ('loading generations and references from .json files...')
    with open(args.generations) as f:
        gens = json.load(f)
    with open(args.references) as f:
        refs = json.load(f)
    print ('preparing data...')
    generations, references = prepare_data(gens, refs)
    print ('# question-answer pairs: ' + str(len(refs)))

    # load models (skipped per metric family when the corresponding
    # --no_* flag is set)
    print ('loading models and word embeddings (may take a few minutes)...')
    if not args.no_overlap:
        cider_model = CiderScorer(references, n=args.n)
        meteor_model = Meteor()
    if not args.no_embedding:
        bert_client = BertClient(check_length=False)
        fasttext_wordvectors = FastText.load_facebook_vectors(args.fast_text_model)
        numconverter = inflect.engine()
    print ('models loaded!')
    scores = initialise_score_dicts(args)

    print ('evaluating generations...')
    for i, (gs, rs) in enumerate(zip(generations, references)):
        # progress indicator on a single rewritten line
        sys.stdout.write('\r{}/{} --> {:3.1f}%'.format(str(i+1), str(len(references)),
                                                       (i+1)/float(len(references))*100))
        sys.stdout.flush()
        cider_list, meteor_list = [], []
        bert_list, fasttext_list = [], []
        # get bert embeddings of references
        if not args.no_embedding:
            bert_refs = get_bert_features(rs, bert_client)
            fasttext_refs = get_fasttext_features(rs, fasttext_wordvectors, numconverter)
        for ii, g in enumerate(gs):
            # loops through answer generations, if multiple
            if g == "":
                # ignore empty string
                scores['empty'] += 1
            else:
                if not args.no_overlap:
                    cider_list.append(compute_cider(g, rs, cider_model))
                    meteor_list.append(compute_meteor(g, rs, meteor_model))
                if not args.no_embedding:
                    bert_list.append(compute_bert(g, bert_refs, bert_client))
                    fasttext_list.append(compute_fasttext(g, fasttext_refs,
                                                          fasttext_wordvectors, numconverter))
        # average over multiple generations
        if not args.no_overlap:
            n_grams_cider = np.mean(cider_list, axis=0)
            for n, n_gram_cider in enumerate(n_grams_cider):
                scores['cider_{:d}'.format(n+1)].append(n_gram_cider)
            scores['meteor'].append(np.mean(meteor_list))
        if not args.no_embedding:
            bert_scores = np.mean(bert_list, axis=0)
            scores['bert_l2'].append(bert_scores[0])
            scores['bert_cs'].append(bert_scores[1])
            fasttext_scores = np.mean(fasttext_list, axis=0)
            scores['fasttext_l2'].append(fasttext_scores[0])
            scores['fasttext_cs'].append(fasttext_scores[1])
    sys.stdout.write('\n')
    print_scores(scores)
    # NOTE(review): this assumes 'meteor' is only present in `scores` when
    # the METEOR model was created (i.e. --no_overlap not set); otherwise
    # `meteor_model` would be unbound here — verify initialise_score_dicts.
    if 'meteor' in scores:
        meteor_model.close()
def load_fasttext(self, fasttext_path):
    """Load FastText vectors from *fasttext_path* into ``self.word_embedding``."""
    self.word_embedding = load_facebook_vectors(datapath(fasttext_path))
parser.add_argument('--save-history', action='store_true', help='save history')

# parse arguments
args = parser.parse_args()
if args.save_epoch is not None:
    os.makedirs(args.save_epoch, exist_ok=True)

# device setting
on_gpu = args.gpu and torch.cuda.is_available()
device = torch.device('cuda' if on_gpu else 'cpu')

# load embedding
# loading gensim embedding
print(f'Loading embedding from {args.embedding}')
gensim_emb = fasttext.load_facebook_vectors(args.embedding)
emb_dim = gensim_emb.vector_size
n_vocabs = len(gensim_emb.vocab)

# make torch embedding for generator
torch_emb = nn.Embedding(n_vocabs, emb_dim)
torch_emb.weight.data.copy_(torch.tensor(gensim_emb.vectors))
# BUG FIX: the original set `torch_emb.require_grad` (a typo) — that is a
# plain attribute on the Module and does not freeze anything; the flag
# must be set on the weight tensor's `requires_grad`.
torch_emb.weight.requires_grad = False  # disable update
torch_emb = torch_emb.to(device)

# make torch linear embedding for discriminator
linear_emb = nn.Linear(n_vocabs, emb_dim, bias=False)
linear_emb.weight.data.copy_(torch.tensor(gensim_emb.vectors).t())
# linear_emb.require_grad = False # disable update
linear_emb = linear_emb.to(device)
def _load_bin(self, path):
    """Populate ``self.vectors`` with FastText vectors read from the native binary at *path*."""
    from gensim.models.fasttext import load_facebook_vectors as _load
    self.vectors = _load(path)