Example #1
def train(self):
    print("Training... (grab a coffee ^0^)")
    # Load the segmented corpus
    sentence = word2vec.Text8Corpus(
        "D:\\word2vec\\trash\\segmentation.txt")
    # Set the hyperparameters and train the model
    model = word2vec.Word2Vec(sentence,
                              size=100,
                              window=5,
                              min_count=5,
                              sg=1,
                              workers=8)
    # Save the model in word2vec binary format
    model.wv.save_word2vec_format(
        u"D:\\word2vec\\model\\zhfn100w5m5sg1.model.bin", binary=True)
    print("model1 saved")
    return model
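
# Not part of the original snippet: the file written above is in word2vec binary
# format, so it is normally loaded back with gensim's KeyedVectors rather than
# Word2Vec.load (a minimal sketch, reusing the path from train()):
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format(
    "D:\\word2vec\\model\\zhfn100w5m5sg1.model.bin", binary=True)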


def wordsimilarity(word, model):
    """Print the 10 most similar words to `word`."""
    semi = []
    try:
        semi = model.wv.most_similar(word, topn=10)
    except KeyError:
        print('The word is not in the vocabulary!')
    for term in semi:
        print('%s,%s' % (term[0], term[1]))


import codecs


def LineSentence(path):
    """Convert the text file at `path` into an iterable of token lists."""
    sentences = []
    i = 0
    with codecs.open(path, 'r', encoding="UTF-8") as raw_texts:
        for line in raw_texts.readlines():
            line = line.strip()
            sent_list = line.split()
            i += 1
            print("sent " + str(i))
            sentences.append(sent_list)
    print("read sentences done!")
    return sentences
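
# Not part of the original snippet: the token lists returned by LineSentence()
# can be fed directly to gensim (a sketch, reusing the corpus path from train()):
from gensim.models import word2vec
line_sentences = LineSentence("D:\\word2vec\\trash\\segmentation.txt")
line_model = word2vec.Word2Vec(line_sentences, size=100, window=5, min_count=5)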


# Here `word2vec` is the snippet's own wrapper class driven by a Config object,
# not gensim's Word2Vec.
config = Config()
model = word2vec(config, saved=True)
Example #3
#Feature 1
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True,
                             sublinear_tf=False, ngram_range=(2,2))
#Feature 2
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()

#Feature 3
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer()

#Feature 4
# gensim's Word2Vec is not a scikit-learn transformer, so it cannot be dropped
# into a FeatureUnion as-is; it needs a thin wrapper (see the sketch after this
# example).
from gensim.models import word2vec

from sklearn.pipeline import FeatureUnion
combined_features = FeatureUnion([("tfidf_vect", tfidf_vect),
                                  ("bow", bow),
                                  ("hash", hash_vect)])


X_combined_features = combined_features.fit_transform(df['content'].values)
y = df['label'].values

print(X_combined_features.toarray())
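
# Not part of the original snippet: a minimal sketch of the kind of wrapper that
# would let word2vec participate in the FeatureUnion above. The class name, the
# tokenisation by str.split and the averaging of word vectors are all assumptions,
# not the original author's code.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec


class MeanWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its word vectors (gensim < 4.0 uses `size`)."""

    def __init__(self, size=100, min_count=1):
        self.size = size
        self.min_count = min_count

    def fit(self, X, y=None):
        # train a Word2Vec model on the whitespace-tokenised documents
        tokenized = [doc.split() for doc in X]
        self.model_ = Word2Vec(tokenized, size=self.size, min_count=self.min_count)
        return self

    def transform(self, X):
        rows = []
        for doc in X:
            words = [w for w in doc.split() if w in self.model_.wv]
            if words:
                rows.append(np.mean([self.model_.wv[w] for w in words], axis=0))
            else:
                rows.append(np.zeros(self.size))
        return np.array(rows)

# With such a wrapper, ("word2vec", MeanWord2VecVectorizer()) could be added to
# the FeatureUnion above.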


Example #4
    # train word2vec model ; shuffle data every epoch
    for i in range(n_epoch):
        random.shuffle(data)
        model.train(data, total_examples=len(data), epochs=1)

    # save model
    model.save('word2vec_model/CBOW.wv.syn0.npy')
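
# Not part of the original snippet: the indented loop above is the tail of a
# user-defined word2vec() training helper that is called in the main section
# below. A minimal sketch of such a helper, taking the sentences explicitly
# (the original's data loading is not shown, so it is omitted here):
import random
from gensim.models import Word2Vec

def word2vec_sketch(sentences, sg, vec_size, min_count_of_each_word, window_size, n_epoch):
    model = Word2Vec(sentences, sg=sg, size=vec_size,
                     min_count=min_count_of_each_word, window=window_size)
    # re-train on a reshuffled corpus every epoch, as in the fragment above
    for _ in range(n_epoch):
        random.shuffle(sentences)
        model.train(sentences, total_examples=len(sentences), epochs=1)
    model.save('word2vec_model/CBOW.wv.syn0.npy')
    return model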


# main()
sg = 0
vec_size = 256
min_count_of_each_word = 5
window_size = 5
n_epoch = 5
word2vec(sg, vec_size, min_count_of_each_word, window_size, n_epoch)

###############  Application of word2vec  ###############

# load word2vec model
model = word2vec.Word2Vec.load('word2vec_model/CBOW.wv.syn0.npy')
# get most similarity with given words
model.wv.most_similar('nvidia')
# Print >>>
#[('GPU', 0.5550138354301453),
# ('TPU', 0.5424560308456421),
# ('Pro', 0.5173478126525879),
# ('intel', 0.5163905620574951),
# ('NVIDIA', 0.5157663226127625),
# ('Intel', 0.5154422521591187),
# ('PSV', 0.4950483441352844),
Example #5
            for j in range(length_of_walk):
                # pick the next node to move to
                next_node = random.choice(list(G.neighbors(now_node)))
                # append the reached node to the path
                path.append(str(next_node))
                # the reached node becomes the current position
                now_node = next_node
            # append the completed walk to the list of walks
            paths.append(path)
        # return the collected walks
        return paths
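
# Not part of the original snippet: the indented loop above is the inner part of
# a make_random_walks() helper whose signature is visible in the calls below. A
# minimal self-contained sketch; the outer loop structure (one walk of length
# `length_of_walk` starting from every node, repeated `num_walk` times) is an
# assumption:
import random

def make_random_walks(G, num_walk, length_of_walk):
    paths = []
    for _ in range(num_walk):
        for start_node in G.nodes:
            now_node = start_node
            path = [str(start_node)]
            for j in range(length_of_walk):
                next_node = random.choice(list(G.neighbors(now_node)))
                path.append(str(next_node))
                now_node = next_node
            paths.append(path)
    return paths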


G = sample4()
walking = make_random_walks(G, num_walk=512, length_of_walk=1000)
model = word2vec.Word2Vec(walking, min_count=1, size=2, window=10, workers=1)

x = []
y = []
node_list = []
colors = []
fig, ax = plt.subplots()
for node in G.nodes:
    vector = model.wv[str(node)]
    x.append(vector[0])
    y.append(vector[1])
    ax.annotate(str(node), (vector[0], vector[1]))
    if 0 <= node <= 2:
        colors.append("r")
    else:
        colors.append("b")
Example #6

f = open(pg_file_path, mode="r")

reader = csv.reader(f, delimiter="\t")
# Convert rows into nodes that NetworkX's Graph.add_node() accepts
nodes = [(r[0], {"label": label(r[2])}) for r in reader if r[1] == ":page_id"]
f.seek(0)
# Convert rows into attributed edges that NetworkX can read
edges = [(r[0], r[2], {"property": r[3]}) for r in reader if r[1] == "->"]
labels = [{"id": x[0], "label": x[1]["label"]} for x in nodes]

G = nx.DiGraph()
# Add the nodes
G.add_nodes_from(nodes)
# Add the edges
G.add_edges_from(edges)

# The parameters still need tuning!!!!
walks = make_random_walks(G, 20, 20)
model = word2vec.Word2Vec(walks, min_count=0, size=2, window=5, workers=1)

wiki_page_id = convert_artist2id(wiki_page_label)
vector = model.wv[wiki_page_id]
ranking = model.wv.most_similar([vector], [], int(sample_size))

for e in ranking:
    page_id = e[0]
    artist_name = convert_id2artist(page_id)
    print(artist_name)
Example #7
        sentences.append(cut)
        for elem in cut:
            new_data[elem] = new_data.get(elem, 0) + 1

    from gensim.models import word2vec
    model = word2vec.Word2Vec(sentences, size=128, min_count=1)
    token_list = ['PAD_token', 'UNK_token', 'END_token']
    emb_vectors = []
    emb_vectors.append(np.zeros((128)))
    emb_vectors.append(np.random.rand((128)) / 1000.0)
    emb_vectors.append(np.random.rand((128)) / 1000.0)
    for k, v in new_data.items():
        #print k.encode('utf-8')
        #print model.wv[k]
        #print
        token_list.append(k)
        emb_vectors.append(np.array(model.wv[k]))
    emb_vectors = np.array(emb_vectors)
    np.save("../data/emb.npy", emb_vectors)
    with open("../data/token_list.json", 'w') as f:
        json.dump(token_list, f)
    load_emb = np.load('../data/emb.npy')
    pdb.set_trace()
    print('ok')
    #for k, v in new_data.items():
    #    print k.encode('utf-8'), v


# Invoke the snippet's own word2vec() routine (presumably the function whose body is shown above)
word2vec()
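
# Not part of the original snippet: a minimal sketch of how the artefacts saved
# above (../data/emb.npy and ../data/token_list.json) could be loaded back and
# used to look up a token's embedding; the lookup logic is an assumption:
import json
import numpy as np

emb = np.load("../data/emb.npy")
with open("../data/token_list.json") as f:
    tokens = json.load(f)
token2id = {tok: i for i, tok in enumerate(tokens)}
# e.g. the vector stored for the unknown-word token:
unk_vec = emb[token2id["UNK_token"]]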
Example #8
for item in full_sentences:
    sentence = item.split('\t')
    sentences.append(sentence)
    weighted_feats += sentence

weighted_feats = set(weighted_feats)

topn = len(weighted_feats)

#######################################################################################

print("Starting training...")

tic = time()

model = word2vec.Word2Vec(sentences, min_count=min_count, size=size, iter=epochs, window=window)

print("\tTime: ", time()-tic)

#######################################################################################

print("Starting prediction...")

file = open(users_sentences, "r")
users = file.read().splitlines()
file.close()

dicto = dict()
for count, user in enumerate(users):
    dicto[user] = count
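
# Not part of the original snippet (the prediction step is cut off here): one
# plausible continuation, assuming each user id appears as a token in the
# training sentences, is to rank the `topn` most similar tokens per user:
predictions = {}
for user in users:
    if user in model.wv:
        predictions[user] = model.wv.most_similar(user, topn=topn)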
Example #9
import numpy as np

import gensim
from gensim import models, similarities
from gensim.models import word2vec
from sklearn.model_selection import train_test_split


with open('twitter_data/pos_tweets.txt', 'r') as infile:
    pos_tweets = infile.readlines()

with open('twitter_data/neg_tweets.txt', 'r') as infile:
    neg_tweets = infile.readlines()

#use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)

#Do some very minor text preprocessing
def cleanText(corpus):
    corpus = [z.lower().replace('\n','').split() for z in corpus]
    return corpus

x_train = cleanText(x_train)
x_test = cleanText(x_test)

n_dim = 300
#Initialize model and build vocab
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)

#Train the model over train_reviews (this may take several minutes)
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.iter)
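
# Not part of the original snippet: the usual next step in this kind of pipeline
# is to turn each tweet into one vector by averaging its word vectors and feed
# those into a classifier. A minimal sketch; the averaging scheme and the choice
# of classifier are assumptions:
def build_tweet_vector(tokens, model, n_dim):
    vec = np.zeros(n_dim)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    return vec / count if count else vec

train_vecs = np.array([build_tweet_vector(t, imdb_w2v, n_dim) for t in x_train])
test_vecs = np.array([build_tweet_vector(t, imdb_w2v, n_dim) for t in x_test])

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(train_vecs, y_train)
print("accuracy:", clf.score(test_vecs, y_test))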