示例#1
0
    def test_hashvalue(self):
        """Check how the 'avg-bow hashvalue' result reacts to input changes.

        Two expectations are verified:

        * Feeding the data after ``shuffle_instances``, an in-place shuffle of
          every instance's reference list and ``split_batch`` must give the
          same result dict as feeding the original data in one call.
        * Feeding altered data (one reference shortened by two tokens, or any
          variant from ``generate_unequal_data``) or using embeddings shifted
          by +1 must give a different 'avg-bow hashvalue'.
        """
        hash_key = 'avg-bow hashvalue'
        dataloader = FakeMultiDataloader()

        # One random 5-dimensional vector per valid-vocabulary word, plus a
        # shifted copy that serves as the "different embedding" case.
        emb = {}
        emb_unequal = {}
        valid_words = dataloader.all_vocab_list[:dataloader.valid_vocab_len]
        for word in valid_words:
            vector = np.array([random.random() for _ in range(5)])
            emb[word] = vector
            emb_unequal[word] = vector + 1

        reference_key, gen_key = self.default_keywords
        key_list = [reference_key, gen_key]
        data = dataloader.get_data(
            reference_key=reference_key, gen_key=gen_key,
            to_list=True, pad=False,
            ref_len='non-empty', gen_len='non-empty',
            ref_vocab='valid_vocab', gen_vocab='valid_vocab',
            test_prec_rec=True)

        def build_metric(embedding):
            # Every metric in this test shares the same dataloader, mode and
            # fourth positional argument; only the embedding varies.
            return EmbSimilarityPrecisionRecallMetric(
                dataloader, embedding, 'avg', 3)

        espr = build_metric(emb)
        espr_shuffle = build_metric(emb)

        # Shuffled variant of the data, additionally shuffling the references
        # inside each instance, then split into batches.
        data_shuffle = shuffle_instances(data, key_list)
        for references in data_shuffle[reference_key]:
            np.random.shuffle(references)
        batches_shuffle = split_batch(data_shuffle, key_list)

        espr.forward(data)
        res = espr.close()

        for batch in batches_shuffle:
            espr_shuffle.forward(batch)
        res_shuffle = espr_shuffle.close()

        assert same_dict(res, res_shuffle, False)

        # Drop the last two tokens of the very first reference.
        data_less_word = copy.deepcopy(data)
        first_instance_refs = data_less_word[reference_key][0]
        first_instance_refs[0] = first_instance_refs[0][:-2]

        altered_datasets = [data_less_word] + generate_unequal_data(
            data, key_list, dataloader.pad_id,
            reference_key, reference_is_3D=True)
        for data_unequal in altered_datasets:
            espr_unequal = build_metric(emb)
            espr_unequal.forward(data_unequal)
            res_unequal = espr_unequal.close()
            assert res[hash_key] != res_unequal[hash_key]

        # Same data, different embedding values.
        espr_unequal = build_metric(emb_unequal)
        espr_unequal.forward(data)
        res_unequal = espr_unequal.close()
        assert res[hash_key] != res_unequal[hash_key]
示例#2
0
    def test_close(self, argument, shape, type, batch_len, ref_len, gen_len,
                   ref_vocab, gen_vocab, emb_mode, emb_type, emb_len):
        """Exercise construction, ``forward`` and ``close`` of the metric.

        The arguments are supplied by the test parametrization (not visible
        in this method) and are used as follows:

        * ``argument``: ``'default'`` uses ``self.default_keywords`` as the
          reference/generation keys; anything else uses ``('rk', 'gk')`` and
          passes them to the constructor explicitly.
        * ``shape`` / ``type``: ``'pad'`` and ``'list'`` switch on the ``pad``
          and ``to_list`` options of ``dataloader.get_data``.
        * ``batch_len``: ``'unequal'`` drops the first reference so that
          ``forward`` must raise "Batch num is not matched.".
        * ``ref_len``, ``gen_len``, ``ref_vocab``, ``gen_vocab``: forwarded
          unchanged to ``dataloader.get_data``.
        * ``emb_mode``: must be ``'avg'`` or ``'extrema'``; otherwise the
          constructor is expected to raise ``ValueError``.
        * ``emb_type``: anything but ``'dict'`` expects an "invalid type"
          ``ValueError``; ``'list'`` additionally converts the embedding to a
          nested list first.
        * ``emb_len``: ``'unequal'`` shortens one embedding vector so the
          constructor must reject the inconsistent sizes.

        In every non-error path the input data must be left unmodified by
        ``forward``.
        """
        dataloader = FakeMultiDataloader()

        # One random 5-dimensional vector per valid-vocabulary word.
        emb = {
            word: [random.random() for _ in range(5)]
            for word in dataloader.all_vocab_list[:dataloader.valid_vocab_len]
        }
        if emb_len == 'unequal':
            # Make exactly one vector one element shorter than the others.
            first_word = next(iter(emb))
            emb[first_word] = emb[first_word][:-1]
        if emb_type == 'list':
            # Build the nested list directly.  Going through
            # np.array(..., dtype=np.float32) raises its own ValueError on a
            # ragged input (emb_len == 'unequal'), which would abort the test
            # before the constructor's "invalid type" check is ever reached.
            emb = [list(vec) for vec in emb.values()]

        # Constructor-time validation errors, checked in the order the test
        # expects them to take precedence.
        if emb_type != 'dict':
            with pytest.raises(ValueError, match="invalid type"):
                EmbSimilarityPrecisionRecallMetric(
                    dataloader, emb, emb_mode, 3)
            return
        if emb_len == 'unequal':
            with pytest.raises(
                    ValueError,
                    match=
                    "word embeddings have inconsistent embedding size or are empty"
            ):
                EmbSimilarityPrecisionRecallMetric(
                    dataloader, emb, emb_mode, 3)
            return
        if emb_mode not in ['avg', 'extrema']:
            with pytest.raises(ValueError,
                               match="mode should be 'avg' or 'extrema'."):
                EmbSimilarityPrecisionRecallMetric(
                    dataloader, emb, emb_mode, 3)
            return

        if argument == 'default':
            reference_key, gen_key = self.default_keywords
            espr = EmbSimilarityPrecisionRecallMetric(dataloader, emb,
                                                      emb_mode, 3)
        else:
            reference_key, gen_key = ('rk', 'gk')
            espr = EmbSimilarityPrecisionRecallMetric(
                dataloader, emb, emb_mode, 3, reference_key, gen_key)

        # TODO: might need adaptation of dataloader.get_data for test_prec_rec
        # turn_length is not generated_num_per_context conceptually
        data = dataloader.get_data(
            reference_key=reference_key, gen_key=gen_key,
            to_list=(type == 'list'), pad=(shape == 'pad'),
            ref_len=ref_len, gen_len=gen_len,
            ref_vocab=ref_vocab, gen_vocab=gen_vocab, test_prec_rec=True)

        if batch_len == 'unequal':
            # Remove one reference so the two keys disagree on batch size;
            # the snapshot is taken after this deliberate modification.
            data[reference_key] = data[reference_key][1:]
            _data = copy.deepcopy(data)
            with pytest.raises(ValueError, match="Batch num is not matched."):
                espr.forward(data)
        else:
            _data = copy.deepcopy(data)
            # NOTE: an "index out of range" expectation for all_vocab inputs
            # used to be sketched here; it is currently not checked.
            espr.forward(data)
            ans = espr.close()
            prefix = emb_mode + '-bow'
            assert sorted(ans.keys()) == [
                prefix + ' hashvalue', prefix + ' precision',
                prefix + ' recall'
            ]

        # forward() must not mutate the data it was given.
        assert same_dict(data, _data)
示例#3
0
	def test_close(self, argument, shape, type, batch_len, ref_len, gen_len,
			ref_vocab, gen_vocab, emb_mode, emb_type, emb_len):
		"""Exercise construction, ``forward`` and ``close`` of the metric.

		The embedding is a table with ``dataloader.vocab_size`` rows of five
		random floats (one extra row when ``emb_len == 'unequal'``), turned
		into a numpy array only when ``emb_type == 'array'``.

		Expected constructor failures, in order of precedence:

		* ``emb_type != 'array'``: "invalid type or shape or embed."
		* ``emb_len == 'unequal'``: "embed size not equal to vocab size."
		* ``emb_mode`` not 'avg'/'extrema': "mode should be 'avg' or 'extrema'."

		Otherwise data is fetched from the dataloader and fed to ``forward``.
		With ``batch_len == 'unequal'`` the first reference is dropped and
		``forward`` must raise "Batch num is not matched."; in the normal case
		``close`` must return exactly the precision and recall entries.
		Finally the data passed to ``forward`` must be unchanged.
		"""
		dataloader = FakeMultiDataloader()

		row_count = dataloader.vocab_size
		if emb_len == 'unequal':
			row_count += 1
		emb = [[random.random() for _ in range(5)] for _ in range(row_count)]
		if emb_type == 'array':
			emb = np.array(emb)

		# Select the single constructor error (if any) this parameter
		# combination is expected to trigger.
		if emb_type != 'array':
			expected_error = "invalid type or shape or embed."
		elif emb_len == 'unequal':
			expected_error = "embed size not equal to vocab size."
		elif emb_mode not in ['avg', 'extrema']:
			expected_error = "mode should be 'avg' or 'extrema'."
		else:
			expected_error = None

		if expected_error is not None:
			with pytest.raises(ValueError, match=expected_error):
				EmbSimilarityPrecisionRecallMetric(dataloader, emb, emb_mode)
			return

		if argument == 'default':
			keys = ('resp_allvocabs', 'gen')
			print(emb)
			espr = EmbSimilarityPrecisionRecallMetric(dataloader, emb, emb_mode)
		else:
			keys = ('rk', 'gk')
			espr = EmbSimilarityPrecisionRecallMetric(
				dataloader, emb, emb_mode, keys[0], keys[1])
		reference_key, gen_key = keys

		data = dataloader.get_data(
			reference_key=reference_key, gen_key=gen_key,
			to_list=(type == 'list'), pad=(shape == 'pad'),
			ref_len=ref_len, gen_len=gen_len,
			ref_vocab=ref_vocab, gen_vocab=gen_vocab)

		if batch_len == 'unequal':
			# Make the reference batch one instance shorter than the
			# generated batch, then snapshot the modified data.
			data[reference_key] = data[reference_key][1:]
			snapshot = copy.deepcopy(data)
			with pytest.raises(ValueError, match="Batch num is not matched."):
				espr.forward(data)
		else:
			snapshot = copy.deepcopy(data)
			# NOTE: an "index out of range" expectation for all_vocab inputs
			# was once sketched here; it is currently not checked.
			espr.forward(data)
			ans = espr.close()
			prefix = emb_mode + '-bow'
			expected_keys = [prefix + ' precision', prefix + ' recall']
			assert sorted(ans.keys()) == expected_keys

		# forward() must leave its input untouched.
		assert same_dict(data, snapshot)