Example #1
    def load(self, embedding_fname, embedding_url=None, *args, **kwargs):
        """
        Method initializes dict of embeddings from file
        Args:
            fname: file name

        Returns:
            Nothing
        """

        if not embedding_fname:
            raise RuntimeError('No pretrained fasttext intent_model provided')
        fasttext_model_file = embedding_fname

        if not Path(fasttext_model_file).is_file():
            emb_path = embedding_url
            if not emb_path:
                raise RuntimeError(
                    'No pretrained fasttext intent_model provided')
            embedding_fname = Path(fasttext_model_file).name
            try:
                download_path = './'
                download_untar(embedding_url, download_path)
            except Exception as e:
                raise RuntimeError(
                    'Looks like the `EMBEDDINGS_URL` variable is set incorrectly',
                    e)
        self.model = FastText.load_fasttext_format(fasttext_model_file)
        return

    @classmethod
    def load_fasttext_format(cls, *args, **kwargs):
        """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible with
        the original fasttext implementation.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
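A minimal usage sketch for the loader above; the wrapper class name FasttextEmbedder, the file name, and the URL are placeholders, not part of the original example.

# Hypothetical usage of the load() method above; the class name, path and URL
# are assumptions.
embedder = FasttextEmbedder()
embedder.load(embedding_fname='wiki.en.bin',
              embedding_url='http://example.com/wiki.en.tar.gz')
vector = embedder.model['hello']  # gensim FastText lookup for a single word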
Example #3
def make_w2v(vocab):
    # Map vocabulary indices to pretrained vectors for the selected language.
    d = {}
    if opt.lang == 'en_w2v':
        model = KeyedVectors.load_word2vec_format(
            '../../../GoogleNews-vectors-negative300.bin', binary=True)
    elif opt.lang == 'en_fast':
        model = KeyedVectors.load_word2vec_format(
            '../../../wiki-news-300d-1M.vec')
    elif opt.lang == 'es':
        model = FastText.load_fasttext_format('../../../cc.es.300.bin')
    elif opt.lang == 'fr':
        model = FastText.load_fasttext_format('../../../cc.fr.300.bin')
    # Indices 0-3 are skipped (presumably reserved for special tokens).
    for i in range(4, vocab.size()):
        word = vocab.idxToLabel[i]
        if word in model:
            d[i] = model[word]
    return d
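A sketch of packing the returned {index: vector} dict into a dense matrix; the dimensionality and the random initialisation for missing rows are assumptions.

import numpy as np

# Hypothetical follow-up to make_w2v(): build a dense embedding matrix.
def dict_to_matrix(d, vocab_size, dim=300):
    matrix = np.random.uniform(-0.1, 0.1, (vocab_size, dim)).astype(np.float32)
    for i, vec in d.items():
        matrix[i] = vec  # rows without a pretrained vector stay random
    return matrix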
Example #4
def make_embedding_matrix(train_captions):
    tokenizer.fit_on_texts(train_captions)
    model = FastText.load_fasttext_format(cfg.fasttext)
    # --------- build the embedding matrix ---------
    vocab_size = len(tokenizer.word_index)
    embedding_matrix = np.random.random((vocab_size, 256))
    for word, i in tokenizer.word_index.items():  # indices start at 1
        try:
            embedding_vector = model[word]
        except KeyError:
            # word seen fewer than min_count times; keep its random row
            # print(word, 'not found')
            continue
        embedding_matrix[i - 1] = embedding_vector
    return embedding_matrix
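The Tokenizer API suggests Keras; under that assumption, a sketch of wiring the matrix into a frozen Embedding layer (the 256-dimensional size mirrors the matrix above).

from tensorflow.keras.layers import Embedding

# Sketch only: use the pretrained matrix as fixed weights of an Embedding layer.
embedding_matrix = make_embedding_matrix(train_captions)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)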
Example #5
def embedding_weights_load(words_map, embedding_weights_path):
    pre_trained_embedding = None
    try:
        # Prefer a fastText binary (.bin) model if one is available.
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except Exception:
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model = KeyedVectors.load_word2vec_format(
                './Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size, word_dimension), dtype=np.float32)

    for k, v in words_map.items():
        word = k
        word_number = v

        try:
            w[word_number][:] = model[word]
        except KeyError:
            # Missing word: fall back to a deterministic random vector.
            if pre_trained_embedding == "bin":
                w[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                w[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return w
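A usage sketch for embedding_weights_load(); the toy vocabulary and the model path are placeholders.

# Hypothetical call; words_map maps each word to its row index.
words_map = {"the": 0, "cat": 1, "sat": 2}
weights = embedding_weights_load(words_map, "./Word_embedding/wiki.en.bin")
print(weights.shape)  # (len(words_map), word_dimension)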
Example #6
def embedding_weights_load(words_map, embeddingWeights_path):
    pre_trained_embedding = None
    try:
        # Load the fastText binary (.bin) model if one is available.
        model = FastText.load_fasttext_format(embeddingWeights_path)
        pre_trained_embedding = "bin"

    except Exception:
        # Otherwise fall back to the Wikipedia (en) pretrained word vectors.
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print("Loading vectors...")
        model = KeyedVectors.load_word2vec_format(
            './Word_embedding/wiki.en.vec')
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]  # embedding dimensionality
    W = np.zeros((vocab_size, word_dimension),
                 dtype=np.float32)  # matrix that will hold the embeddings
    for k, v in words_map.items():  # k is the word, v is the word ID
        word = k
        word_number = v
        # Words that are missing from the model get a random embedding.
        try:
            W[word_number][:] = model[word]
        except KeyError:
            if pre_trained_embedding == "bin":
                W[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                W[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return W
Example #7
    @classmethod
    def load_fasttext_format(cls, *args, **kwargs):
        return Ft_Wrapper.load_fasttext_format(*args, **kwargs)

import gensim

from gensim.models.wrappers.fasttext import FastText

from file_path_manager import FilePathManager

if __name__ == '__main__':
    model = FastText.load_fasttext_format(FilePathManager.resolve("data/wiki.en"))
    model.save(FilePathManager.resolve("data/fasttext.model"))
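Once the script above has saved the converted model, it can presumably be reloaded from gensim's native format instead of re-parsing the .bin file; a sketch under that assumption:

from gensim.models.wrappers.fasttext import FastText
from file_path_manager import FilePathManager

# Sketch: reload the natively saved model (faster than load_fasttext_format).
model = FastText.load(FilePathManager.resolve("data/fasttext.model"))
print(model.wv.most_similar("king", topn=5))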
Example #9
start = time.time()
clf = lgb.LGBMClassifier(objective="multiclass")
clf.fit(plain_fasttext, train["class"])
Y_true, Y_pred = test["class"], clf.predict(plain_fasttext_test)
print("Report")
print(classification_report(Y_true, Y_pred, digits=6))
print("Accuracy: ", clf.score(plain_fasttext_test, test["class"]))
print("Time taken:", time.time() - start, "\n")

# In[ ]:

## SCDV based fasttext
from gensim.models.wrappers.fasttext import FastText

fasttext_model_200 = FastText.load_fasttext_format(
    '../japanese-dataset/livedoor-news-corpus/for-fasttext/fasttext_model_200dim'
)

# In[ ]:

# Get wordvectors for all words in vocabulary.
word_vectors = fasttext_model_200.wv.syn0

# Set number of clusters.
num_clusters = 60
# Create new clusters with a Gaussian mixture model.
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

# Uncomment the lines below to load saved cluster assignments and their probabilities instead.
# idx_name = "gmm_latestclusmodel_len2alldata.pkl"
# idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"

def get_fasttext():
    global _fasttext
    if _fasttext is None:
        log.debug("Loading fasttext model..")
        _fasttext = FastText.load_fasttext_format(FASTTEXT_PATH)
    return _fasttext
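A usage sketch for the lazy loader above; FASTTEXT_PATH, the module-level cache, and the query word are assumptions taken from the snippet's context.

# Module-level cache assumed by get_fasttext().
_fasttext = None

ft = get_fasttext()        # loads the model on the first call only
print(ft.wv['apple'][:5])  # first five dimensions of the word vector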
Example #11
    def load(self, fname):
        self.fasttext = FastText.load_fasttext_format(fname)
Example #12
# In[1]:

from gensim.models import KeyedVectors
from gensim.models.wrappers.fasttext import FastText

# In[2]:

## load model
word2vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec200.model"
)
word2vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec_weighted.model"
)
fasttext = FastText.load_fasttext_format(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_model_200dim"
)
fasttext_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_weighted.model"
)
poincare_vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec.model"
)
poincare_vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec_weighted.model"
)

# In[3]:

len(word2vec.most_similar("独身"))  # number of neighbours returned for "独身" ("single/unmarried"); defaults to 10
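A sketch comparing the neighbours the different models return for the same query, assuming each loaded model exposes most_similar():

# Sketch: run the same query against several of the models loaded above.
query = "独身"
for name, m in [("word2vec", word2vec), ("fasttext", fasttext),
                ("poincare", poincare_vec)]:
    print(name, [w for w, _ in m.most_similar(query, topn=3)])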
Example #13
    @classmethod
    def load_fasttext_format(cls, *args, **kwargs):
        return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
Example #14
                            ] and not c[1] in ["非自立", "代名詞"]:
                words.append(cols[0])
    return words


questions_src = []
questions = []
answers = []
for line in open(args.input, "r", encoding="utf-8", errors="ignore"):
    cols = line.strip().split('\t')
    #print(cols[0])
    questions_src.append(cols[0])
    questions.append(wakati(cols[0]))
    answers.append(cols[1])

model = FastText.load_fasttext_format(args.model)


def part_minus(v):
    # Store positive and negative components in separate halves of a 2*DIM vector.
    tmp_v = np.zeros(DIM * 2)
    for i in range(DIM):
        if v[i] >= 0:
            tmp_v[i] = v[i]
        else:
            tmp_v[DIM + i] = -v[i]
    return tmp_v


questions_vec = []
tf_vecs = []
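The snippet stops before the question vectors are filled in; a plausible continuation, averaging each question's fastText vectors and splitting signs with part_minus(), is sketched below as an assumption rather than the original author's code.

# Hypothetical continuation: average token vectors per question, then
# apply part_minus() to separate positive and negative components.
for words in questions:
    vecs = [model[w] for w in words if w in model]
    avg = np.mean(vecs, axis=0) if vecs else np.zeros(DIM)
    questions_vec.append(part_minus(avg))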