Пример #1
0
    def test_sparse_abnormal_data(self):
        """Exercise binning on sparse instances that hold an abnormal value.

        Builds ``self.data_num`` instances. Each one stores the integers
        ``0 .. self.feature_num - 1`` at those same indices, with the entry
        at position ``i % self.feature_num`` replaced by the string
        ``'nan'``. The instances are handed to ``session.parallelize`` with
        ``partition=1``, split points are fitted by a binning object created
        with ``abnormal_list=['nan']``, and the table is transformed with
        ``transform_type='bin_num'``.

        The test only prints intermediate and final results; it makes no
        assertions, so it fails only if one of the calls raises.
        """
        final_result = []
        numpy_array = []
        # The declared length exceeds the number of stored entries, so the
        # last 15 positions never get an explicit value.
        sparse_inst_shape = self.feature_num + 15
        for i in range(self.data_num):
            tmp = list(range(self.feature_num))
            tmp[i % self.feature_num] = 'nan'
            data_index = list(range(self.feature_num))
            sparse_inst = SparseVector(data_index,
                                       tmp,
                                       shape=sparse_inst_shape)
            if i == 0:
                aa = sparse_inst.get_data(0, 'a')
                print('in for loop: {}, type: {}'.format(aa, type(aa)))
            inst = Instance(inst_id=i, features=sparse_inst, label=0)
            final_result.append((str(i), inst))

            # Dense copy of the same row: the stored value where an index
            # exists, 0 everywhere else.
            stored = dict(zip(data_index, tmp))
            numpy_array.append(
                [stored.get(n, 0) for n in range(sparse_inst_shape)])

        abnormal_value = final_result[0][1].features.get_data(0, 'a')
        print('abnormal_value: {}, type: {}'.format(abnormal_value,
                                                    type(abnormal_value)))
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=1)
        header = ['x' + str(i) for i in range(sparse_inst_shape)]
        numpy_table = np.array(numpy_array)
        table.schema = {'header': header}
        # Registered on the fixture list; presumably cleaned up in tearDown
        # (not visible here).
        self.used_data_set.append(table)

        bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
        split_points = bin_obj.fit_split_points(table)
        print('split_points: {}'.format(split_points))
        print(numpy_table)

        trans_result = bin_obj.transform(table,
                                         transform_cols_idx=-1,
                                         transform_type='bin_num')
        trans_result = trans_result.collect()
        print('transform result: ')
        for k, v in trans_result:
            value_list = [(value_k, value_v)
                          for value_k, value_v in v.features.get_all_data()]
            print(k, value_list)
Пример #2
0
    def test_instance(self):
        """Check SparseVector construction, counts and element access.

        Stores the value ``i**3`` at index ``i*i`` for ``i = 1..9`` in a
        vector of declared length 100, then verifies the reported shape,
        the zero / non-zero counts, lookups of stored indices, the default
        returned for indices that are not stored, and the full contents.
        """
        shape = 100
        indices = [i * i for i in range(1, 10)]
        data = [i**3 for i in range(1, 10)]
        stored_count = len(indices)

        sparse_data = SparseVector(indices, data, shape)
        # assertEqual reports both operands on failure, unlike
        # assertTrue(a == b), and separate checks show which one broke.
        self.assertEqual(sparse_data.shape, shape)
        self.assertEqual(len(sparse_data.sparse_vec), stored_count)
        self.assertEqual(sparse_data.count_zeros(), shape - stored_count)
        self.assertEqual(sparse_data.count_non_zeros(), stored_count)

        for idx, val in zip(indices, data):
            self.assertEqual(sparse_data.get_data(idx), val)

        # Any index that was not stored must return the supplied default.
        stored_indices = set(indices)
        for i in range(shape):
            if i in stored_indices:
                continue
            self.assertEqual(sparse_data.get_data(i, i**4), i**4)

        self.assertEqual(dict(sparse_data.get_all_data()),
                         dict(zip(indices, data)))