def test_03_negative_samples_2(self):  # 2p
    """Check that positive_and_negative_cooccurrences returns a generator."""
    corpus = ["a", "rose", "is", "a", "rose"]
    vocab = {"rose": 0, "is": 1, "a": 2}
    result = utils.positive_and_negative_cooccurrences(
        corpus,
        max_distance=1,
        neg_samples_factor=0,
        vocab_to_id=vocab,
    )
    self.assertIsNotNone(result)
    # The function is expected to yield lazily rather than return a list.
    self.assertIsInstance(result, types.GeneratorType)
def test_03_negative_samples_0(self):  # 1p
    """Check how many positive/negative tuples each target word receives.

    With max_distance=1 and neg_samples_factor=100 the test expects, for
    target id 2, exactly 2 positive and 200 negative tuples, and it expects
    the target ids of all negative tuples to appear grouped in corpus order.
    """
    test_tokens = ["0", "1", "2", "3", "4"]
    vocab_dict = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4}
    cooccurrences = utils.positive_and_negative_cooccurrences(
        test_tokens,
        max_distance=1,
        neg_samples_factor=100,
        vocab_to_id=vocab_dict,
    )
    self.assertIsNotNone(cooccurrences)
    samples = list(cooccurrences)

    # Single pass: tally tuples whose target id is 2 and collect the target
    # id of every negative tuple (tuple layout: index 0 target, index 2 label).
    positives_for_two = 0
    negatives_for_two = 0
    negative_targets = []
    for sample in samples:
        if sample[2]:
            if sample[0] == 2:
                positives_for_two += 1
        else:
            negative_targets.append(sample[0])
            if sample[0] == 2:
                negatives_for_two += 1

    # The first and last token have one neighbour (100 negatives each);
    # every other token has two neighbours (200 negatives each).
    last_index = len(test_tokens) - 1
    expected_negative_targets = [
        index
        for index in range(len(test_tokens))
        for _ in range(100 if index in (0, last_index) else 200)
    ]

    self.assertEqual(positives_for_two, 2)
    self.assertEqual(negatives_for_two, 200)
    self.assertEqual(str(negative_targets), str(expected_negative_targets))
def test_03_negative_samples_1(self):  # 1p
    """Check the set of positive tuples produced when no negatives are requested."""
    corpus = ["a", "rose", "is", "a", "rose"]
    vocab = {"rose": 0, "is": 1, "a": 2}
    # Every ordered pair of adjacent words (as ids), labelled True.
    # Repeated pairs collapse because the comparison is done on sets.
    expected_pairs = [(0, 2), (2, 0), (1, 0), (0, 1), (2, 1), (1, 2)]
    expected = {(first, second, True) for first, second in expected_pairs}

    produced = utils.positive_and_negative_cooccurrences(
        corpus,
        max_distance=1,
        neg_samples_factor=0,
        vocab_to_id=vocab,
    )
    self.assertIsNotNone(produced)
    self.assertEqual(set(produced), expected)
def test_03_negative_samples_4(self):  # 2p
    """Check that negative tuples cover every id of the vocabulary.

    The second element of the negative tuples, taken over all samples, must
    include every id in the vocabulary -- also id 1, whose word "1" never
    occurs in the test corpus.
    """
    test_tokens = ["0", "0", "2", "3", "4"]
    vocab_dict = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4}
    cooccurrences = utils.positive_and_negative_cooccurrences(
        test_tokens,
        max_distance=1,
        neg_samples_factor=10,
        vocab_to_id=vocab_dict,
    )
    self.assertIsNotNone(cooccurrences)

    # Collect the distinct sampled ids (index 1) of all tuples labelled False.
    sampled_ids = set()
    for sample in list(cooccurrences):
        if sample[2] == False:
            sampled_ids.add(sample[1])

    self.assertEqual(sampled_ids, set(vocab_dict.values()))
def test_04_skipgram_update_2(self):
    """Check that an update on a negative tuple changes both embeddings correctly."""
    # Make sure "positive_and_negative_cooccurrences" is implemented.
    self.assertIsNotNone(utils.positive_and_negative_cooccurrences([], 1, 0, {}))

    corpus = ["b", "a", "c", "b", "c", "c"]
    sg = skipgram.SkipGram(corpus, window_size=1, neg_samples_factor=0, vocab_size=3, num_dims=5)
    # Overwrite both embedding matrices with fixed values so the outcome of
    # the update can be compared against hand-computed numbers.
    sg.context_word_matrix = np.array(
        [[1, 0, 1],
         [0, 1, 0],
         [0, 0, -2]],
        dtype='float64',
    )
    sg.target_word_matrix = np.array(
        [[-1, 0, -1],
         [1, 0, 1],
         [1, 1, 1]],
        dtype='float64',
    )

    ll = sg.update(context_id=0, target_id=1, label=False, learning_rate=1.0)

    # Both rows start out as [1, 0, 1], so they must still agree afterwards.
    rows_agree = sg.context_word_matrix[0] == sg.target_word_matrix[1]
    self.assertTrue(rows_agree.all())

    expected_updated = (1 - utils.sigmoid(2)) * np.array([1., 0., 1.], dtype='float64')
    matches_expected = sg.context_word_matrix[0] == expected_updated
    self.assertTrue(matches_expected.all())

    # NOTE(review): -2.127 is approximately log(1 - sigmoid(2)); presumably
    # update() returns the log-likelihood of the tuple -- confirm in SkipGram.
    self.assertAlmostEqual(ll, -2.127, delta=0.001)
def __init__(self, tokens, window_size=1, neg_samples_factor=10, vocab_size=10000, num_dims=50):
    """
    Sets up everything needed to train skipgram embeddings on a corpus.

    Builds the word-to-id mapping, materializes all positive and negative
    co-occurrence tuples, and initializes both embedding matrices with
    random values drawn uniformly from [0, 0.1).

    :param tokens: List of strings, the corpus.
    :param window_size: Maximum distance of context words.
    :param neg_samples_factor: Number of sampled negative tuples for each positive tuple.
    :param vocab_size: Forwarded to utils.vocabulary_to_id_for_wordlist;
        presumably the maximum number of words kept in the vocabulary
        (TODO confirm against that helper).
    :param num_dims: Number of dimensions (columns) of the embedding matrices.
    """
    vocabulary = utils.vocabulary_to_id_for_wordlist(tokens, vocab_size)
    self.word_to_id = vocabulary

    # The helper's result is consumed once here and stored as a list.
    cooccurrences = utils.positive_and_negative_cooccurrences(
        tokens, window_size, neg_samples_factor, vocabulary
    )
    self.pos_neg_list = list(cooccurrences)

    # One row per vocabulary entry; the target matrix is drawn first.
    matrix_shape = (len(vocabulary), num_dims)
    self.target_word_matrix = 0.1 * np.random.rand(*matrix_shape)
    self.context_word_matrix = 0.1 * np.random.rand(*matrix_shape)
def test_03_negative_samples_3(self):  # 2p
    """Check that negative samples look uniformly drawn from the whole vocabulary.

    A chi-squared goodness-of-fit test compares how often each vocabulary id
    was sampled in a negative tuple against a uniform expectation; the test
    passes when the p-value exceeds 0.01.
    """
    test_tokens = ["0", "0", "2", "3", "4"]
    vocab_dict = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4}
    pos_neg_cooccurrences = utils.positive_and_negative_cooccurrences(
        test_tokens,
        max_distance=1,
        neg_samples_factor=10,
        vocab_to_id=vocab_dict,
    )
    self.assertIsNotNone(pos_neg_cooccurrences)
    with_negatives = list(pos_neg_cooccurrences)
    # 8 positive tuples plus 10 negatives for each of them.
    self.assertEqual(len(with_negatives), 88)

    # Count how often each id was sampled (index 1 of tuples labelled False).
    neg_label_distribution = Counter(t[1] for t in with_negatives if not t[2])

    word_ids = sorted(vocab_dict.values())
    # Fail with a clear message instead of a shape error inside chisquare
    # when an id outside the vocabulary was sampled.
    unknown_ids = set(neg_label_distribution) - set(word_ids)
    self.assertEqual(unknown_ids, set(), "negative samples contain ids outside the vocabulary")

    # Index the counter by id so that an id that was never sampled counts as
    # 0; using Counter.values() would silently drop it and make the observed
    # list shorter than the expected one.
    observed_distribution = [neg_label_distribution[word_id] for word_id in word_ids]
    # 80 negative samples spread evenly over 5 ids.
    expected_distribution = [16] * len(word_ids)

    # Use chi-squared test, in order to determine whether values are likely to be random.
    p_value = stats.chisquare(observed_distribution, expected_distribution)[1]
    self.assertGreater(p_value, 0.01)