import os

from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def save_fasttext(vocab):
    """Trim the full pre-trained fastText .vec file down to the given vocabulary."""
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # Build a fresh KeyedVectors holding only the words we actually need
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:  # word not covered by the pre-trained vectors
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)  # renamed add_vector() in gensim >= 4
    print('words missing from pre-trained vectors:', loss)
    kmodel.save('../../corpora/fasttext.wv')
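
# A minimal round-trip sketch: load the trimmed KeyedVectors back and query
# them. The probe word 'computer' is an illustrative assumption, not taken
# from the snippet above.
wv = KeyedVectors.load('../../corpora/fasttext.wv')
print(wv['computer'].shape)               # -> (300,)
print(wv.most_similar('computer', topn=5))
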
import os

from gensim.models.wrappers import FastText


def build_vocab(filenames, vocabfile):
    """Write unique words from a set of files to a new file."""
    if os.path.isfile(vocabfile):
        print('Loading existing vocabulary from', vocabfile)
        return
    vocab = set()
    for filename in filenames:
        if filename.endswith('.vec'):
            # .vec files are word2vec text format: take the model's vocabulary
            model = FastText.load_word2vec_format(filename)
            vocab |= set(model.vocab.keys())
        else:
            # plain-text corpus: lowercase and split on whitespace
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.rstrip('\n').lower().split()
                    vocab |= set(tokens)
    with open(vocabfile, 'w', encoding='utf-8') as f:
        for token in vocab:
            f.write(token + '\n')
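
# An illustrative pipeline chaining the two helpers above; the corpus and
# vocab paths are assumptions. Collect the vocabulary, read it back, then
# trim the pre-trained vectors down to it with save_fasttext().
build_vocab(['../../data/train.txt', '../../data/dev.txt'], '../../data/vocab.txt')
with open('../../data/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = [line.rstrip('\n') for line in f]
save_fasttext(vocab)
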
import os

import torch
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def load_word_vectors(embeddings_path):
    # Fast path: a cached tensor + vocab pair from a previous run
    if os.path.isfile(embeddings_path + '.pth') and \
            os.path.isfile(embeddings_path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(embeddings_path + '.pth')
        vocab = Vocab(filename=embeddings_path + '.vocab')
        return vocab, vectors
    # Otherwise load the embeddings with gensim, preferring a native .model
    # file over the word2vec-format .vec file
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + '.model')
    elif os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    else:
        raise FileNotFoundError('no embeddings found at ' + embeddings_path)
    list_of_tokens = model.vocab.keys()
    vectors = torch.zeros(len(list_of_tokens), model.vector_size)
    with open(embeddings_path + '.vocab', 'w', encoding='utf-8') as f:
        for token in list_of_tokens:
            f.write(token + '\n')
    # Vocab comes from the surrounding project; it maps tokens to row indices
    vocab = Vocab(filename=embeddings_path + '.vocab')
    for index, word in enumerate(list_of_tokens):
        vectors[index, :] = torch.from_numpy(model[word])
    return vocab, vectors
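
# A small consumption sketch, assuming the vectors feed a PyTorch model; the
# base path is illustrative. Rows of `vectors` follow the token order written
# to the .vocab file, so the tensor can seed an embedding layer directly.
import torch.nn as nn

vocab, vectors = load_word_vectors('../../corpora/fasttext')
embedding_layer = nn.Embedding.from_pretrained(vectors, freeze=True)
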
import nltk
import numpy as np
from gensim.models.wrappers import FastText
from nltk.corpus import stopwords
from scipy.spatial.distance import (braycurtis, canberra, cityblock, cosine,
                                    euclidean, jaccard, minkowski)
from scipy.stats import kurtosis, skew

# stop-word list used by sent2vec() below
stop_words = set(stopwords.words('english'))

# df_feat is an existing feature DataFrame; question1_vectors and
# question2_vectors are per-question embedding matrices computed earlier.
# Pairwise distance features between the two question vectors
# (nan_to_num guards against questions that produced all-NaN vectors).
df_feat['glove_cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]

# Per-vector distribution features
df_feat['glove_skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df_feat['glove_skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df_feat['glove_kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df_feat['glove_kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# fasttext w2v distance
model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')


def sent2vec(s):
    """L2-normalised sum of fastText vectors for the content words in s."""
    words = str(s).lower()
    words = nltk.word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:  # skip out-of-vocabulary words
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
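
# For context: a hedged sketch of how question1_vectors / question2_vectors
# are typically built with sent2vec() before computing distance features like
# those above. The DataFrame name `df` and its 'question1'/'question2'
# columns are assumptions, not taken from this snippet.
question1_vectors = np.array([sent2vec(q) for q in df['question1']])
question2_vectors = np.array([sent2vec(q) for q in df['question2']])
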
from gensim.models.wrappers import FastText

# load_word2vec_format is a classmethod that returns a new model, so call it
# on the class rather than on a throwaway instance (in recent gensim the same
# loader lives on KeyedVectors.load_word2vec_format)
model = FastText.load_word2vec_format('/home/ltp/WorkShop/fastText/model/wiki.zh.vec')
import os

from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def load_embeddings(embeddings_path):
    # Prefer a gensim-native .model file; fall back to word2vec-format .vec
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + '.model')
    elif os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    else:
        raise FileNotFoundError('no embeddings found at ' + embeddings_path)
    return model
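
# Illustrative call, assuming embeddings were saved under this base path
# (so '../../corpora/wiki.en.vec' exists); the probe word is also an
# assumption.
model = load_embeddings('../../corpora/wiki.en')
print(model.most_similar('king', topn=3))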