Python WordVector.WordVector 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: wordvector

클래스/타입: WordVector

메소드/함수: WordVector

hotexamples.com에서의 예제들: 10

Python WordVector.WordVector - 예제 10개를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python wordvector.WordVector.WordVector의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

WordVector(10)

analogy(1)

closest_row_indices(1)

get_dict(1)

get_embed(1)

get_reverse_dict(1)

get_vector_by_name(1)

get_vector_by_num(1)

most_common(1)

n_closest(1)

num_words(1)

project_2d(1)

topk(1)

words_in_range(1)

예제 #1

파일 보기

 def test_analogy(self):
     """Verify the words ``analogy`` returns for a tiny six-word vocabulary.

     Requests two results for ('the', 'fox', 'quick') with the euclidean
     metric and expects 'jumped' first and 'over' second.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     # One 7-dimensional row per vocabulary word, in vocabulary order.
     rows = [
         [1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
         [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1],
     ]
     vectors = WordVector(np.array(rows), word_to_row)
     answers = vectors.analogy(
         'the', 'fox', 'quick', num=2, metric='euclidean')
     self.assertEqual(2, len(answers), 'wrong number of analogies returned')
     self.assertEqual('jumped', answers[0],
                      'wrong most likely analogy returned')
     self.assertEqual('over', answers[1],
                      'wrong 2nd most likely analogy returned')

예제 #2

파일 보기

 def test_gets(self):
     """Verify the getters hand out copies, not the internal state.

     Fetches the dictionary, reverse dictionary and embedding matrix,
     mutates all three, fetches them again and checks that the second
     fetch still shows the original contents.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     rows = [[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
             [1.0, 0.0], [0, 1.01], [-1.0, 0.0]]
     vectors = WordVector(np.array(rows), word_to_row)

     # First fetch: tamper with everything that was returned.
     forward = vectors.get_dict()
     backward = vectors.get_reverse_dict()
     matrix = vectors.get_embed()
     forward.pop('the')
     backward.pop(1)
     matrix[0, 0] = 10

     # Second fetch: none of the tampering above may be visible.
     forward = vectors.get_dict()
     backward = vectors.get_reverse_dict()
     matrix = vectors.get_embed()
     self.assertEqual(6, len(forward), 'wrong dictionary length')
     self.assertEqual(6, len(backward), 'wrong dictionary length')
     self.assertEqual(1.0, matrix[0, 0], 'wrong value in embed matrix')
     self.assertEqual(3, forward['fox'], 'wrong value from dictionary')
     self.assertEqual('jumped', backward[4],
                      'wrong value from reverse dictionary')

예제 #3

파일 보기

 def test_closest_row_indices(self):
     """Verify ``closest_row_indices`` for three query/metric combinations.

     Each case passes a 1x2 query vector, the number of rows requested
     and a metric name, and compares the returned indices to a fixed
     expected ordering.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     rows = [[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
             [1.0, 0.0], [0, 1.01], [-1.0, 0.0]]
     vectors = WordVector(np.array(rows), word_to_row)

     # (query point, rows requested, metric, expected row indices)
     cases = (
         ([2.0, 2.0], 3, 'euclidean', [1, 2, 0]),
         ([2.0, 2.0], 3, 'cosine', [1, 0, 2]),
         ([1.0, 1.0], 6, 'euclidean', [0, 3, 4, 1, 2, 5]),
     )
     for point, count, metric, expected in cases:
         found = vectors.closest_row_indices(
             np.array([point]), count, metric)
         # Indices are integers, so any mismatch pushes the sum to >= 1.
         mismatch = np.sum(np.abs(np.array(expected) - found))
         self.assertTrue(mismatch < 0.1, 'incorrest closest indices')

예제 #4

파일 보기

 def test_get_vector_by_num(self):
     """Verify ``get_vector_by_num`` returns the embedding row for an index.

     Rows 3, 5 and 0 are fetched and compared element-wise against the
     values used to build the embedding matrix.
     """
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     # Expected values are spelled out rather than read back from
     # embed_matrix so the check does not depend on the input array.
     expected_rows = (
         (3, [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1]),
         (5, [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]),
         (0, [1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]),
     )
     for row, expected in expected_rows:
         # Rows 0 and 1 differ by only 0.05 in one component, so a summed
         # tolerance of 0.1 could not tell them apart; compare each
         # element with a tight absolute tolerance instead.
         self.assertTrue(
             np.allclose(np.array(expected),
                         word_embedding.get_vector_by_num(row),
                         rtol=0.0, atol=1e-6),
             'wrong vector returned for row {}'.format(row))

예제 #5

파일 보기

파일: main.py 프로젝트: nguyenthithuy97/datamining_word2vec

def load():
    """Build a word array from three text files, train a WindowModel and
    wrap the resulting weight matrices in WordVector objects.

    Steps, in order:
      1. ``docload.build_word_array`` reads the files under ``../data``.
      2. ``WindowModel.build_training_set`` turns the word array into
         ``x``/``y`` arrays.
      3. The samples are shuffled (fixed seed) and split 90% / 10% into
         training and validation sets.
      4. A ``WindowModel`` is trained for 120 epochs.
      5. ``results['embed_weights']`` and ``results['nce_weights']`` are
         each wrapped in a ``WordVector``.

    Progress is reported with ``print``. Nothing is returned.

    NOTE(review): the two WordVector objects are created but never
    returned or stored in the code shown here -- confirm whether the
    function is meant to return them.
    """
    files = ['../data/adventures_of_sherlock_holmes.txt',
             '../data/hound_of_the_baskervilles.txt',
             '../data/sign_of_the_four.txt']
    word_array, dictionary, num_lines, num_words = docload.build_word_array(
        files, vocab_size=50000, gutenberg=True)

    print('Document loaded and processed: {} lines, {} words.'
          .format(num_lines, num_words))

    print('Building training set ...')
    x, y = WindowModel.build_training_set(word_array)

    # Shuffle, then hold out the last 10% as validation data.
    x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
    split = round(x_shuf.shape[0]*0.9)
    x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
    # Both halves must come from the SAME shuffled arrays; slicing the
    # unshuffled x/y here would put validation samples into training.
    x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])

    print('Training set built.')
    # Computed once and reused for the graph parameters and the message.
    vocab_size = np.max(x)+1
    graph_params = {'batch_size': 32,
                    'vocab_size': vocab_size,
                    'embed_size': 64,
                    'hid_size': 64,
                    'neg_samples': 64,
                    'learn_rate': 0.01,
                    'momentum': 0.9,
                    'embed_noise': 0.1,
                    'hid_noise': 0.3,
                    'optimizer': 'Momentum'}
    model = WindowModel(graph_params)
    print('Model built. Vocab size = {}. Document length = {} words.'
          .format(vocab_size, len(word_array)))

    print('Training ...')
    results = model.train(x_train, y_train, x_val, y_val, epochs=120, verbose=False)

    word_vector_embed = WordVector(results['embed_weights'], dictionary)
    word_vector_nce = WordVector(results['nce_weights'], dictionary)

예제 #6

파일 보기

 def test_num_words(self):
     """Verify ``num_words`` reports 6 for a six-word vocabulary."""
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     # One 7-dimensional row per vocabulary word, in vocabulary order.
     rows = [
         [1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
         [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
         [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1],
     ]
     vectors = WordVector(np.array(rows), word_to_row)
     counted = vectors.num_words()
     self.assertEqual(6, counted, 'incorrect number of words')

예제 #7

파일 보기

 def n_closest(self):
     """Compare ``n_closest('quick', ...)`` output against fixed word lists.

     Checks a 3-word euclidean query and a 2-word cosine query.

     NOTE(review): the name has no ``test_`` prefix, so if the enclosing
     class is a ``unittest.TestCase`` default discovery presumably skips
     this method -- confirm whether that is intentional before renaming.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     rows = [[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
             [1.0, 0.0], [0, 1.01], [-1.0, 0.0]]
     vectors = WordVector(np.array(rows), word_to_row)

     by_distance = vectors.n_closest('quick', 3, metric='euclidean')
     self.assertEqual(['quick', 'brown', 'the'], by_distance,
                      'wrong n-closest words returned')

     by_angle = vectors.n_closest('quick', 2, metric='cosine')
     self.assertEqual(['the', 'fox'], by_angle,
                      'wrong n-closest words returned')

예제 #8

파일 보기

 def test_most_common(self):
     """Verify ``most_common(n)`` for n = 3 and n = 1.

     The expected lists are the first n words of the vocabulary in
     index order.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     # One 7-dimensional row per vocabulary word, in vocabulary order.
     rows = [
         [1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
         [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1],
     ]
     vectors = WordVector(np.array(rows), word_to_row)

     top_three = vectors.most_common(3)
     self.assertEqual(['the', 'quick', 'brown'], top_three,
                      'wrong most common words returned')

     top_one = vectors.most_common(1)
     self.assertEqual(['the'], top_one, 'wrong most common words returned')

예제 #9

파일 보기

 def test_project_2D_2(self):
     """Verify ``project_2d(0, 6)`` on a six-word, 7-dimensional embedding.

     Expects a (6, 2) projection array plus a word sequence with 'the'
     at index 0 and 'fox' at index 3.
     """
     from wordvector import WordVector
     vocab = ('the', 'quick', 'brown', 'fox', 'jumped', 'over')
     word_to_row = {word: row for row, word in enumerate(vocab)}
     # One 7-dimensional row per vocabulary word, in vocabulary order.
     rows = [
         [1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
         [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
         [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
         [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
         [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1],
     ]
     vectors = WordVector(np.array(rows), word_to_row)
     projection, labels = vectors.project_2d(0, 6)
     self.assertEqual((6, 2), projection.shape,
                      'incorrect projection array size returned')
     self.assertEqual('the', labels[0], 'incorrect word at index 0')
     self.assertEqual('fox', labels[3], 'incorrect word at index 3')

예제 #10

파일 보기

 def test_words_in_range(self):
     """Verify ``words_in_range(start, end)`` for two index ranges.

     ``words_in_range(3, 6)`` is expected to give the words at indices
     3, 4 and 5; ``words_in_range(0, 2)`` the words at indices 0 and 1.
     """
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     # The failure messages name this method's subject; the previous text
     # was copied from the most_common test and misdescribed the failure.
     range_list = word_embedding.words_in_range(3, 6)
     self.assertEqual(['fox', 'jumped', 'over'], range_list,
                      'wrong words in range returned')
     range_list = word_embedding.words_in_range(0, 2)
     self.assertEqual(['the', 'quick'], range_list,
                      'wrong words in range returned')