# Imports assumed by the snippets below (module paths follow FATE 1.x's layout;
# the session import and the binning helpers -- BaseBinning, Binning,
# BinInnerParam, BinResults -- vary by FATE version, so check your source tree):
import copy
import random

import numpy as np

from federatedml.feature.instance import Instance
from federatedml.feature.sparse_vector import SparseVector
from federatedml.util import consts


def _gen_data(self, data_num, feature_num, partition, expect_split_points,
              is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    header = [str(i) for i in range(feature_num)]
    bin_num = len(expect_split_points)
    for data_key in range(data_num):
        value = expect_split_points[data_key % bin_num]
        if value == expect_split_points[-1]:
            if shift_iter % bin_num == 0:
                value = expect_split_points[0]
            shift_iter += 1
        if not is_sparse:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            inst = Instance(inst_id=data_key, features=features, label=data_key % 2)
        else:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            data_index = [x for x in range(feature_num)]
            sparse_inst = SparseVector(data_index, data=features, shape=feature_num)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.table_list.append(result)
    return result
def to_instance(param_list, value):
    delimitor = param_list[0]
    data_type = param_list[1]
    label_type = param_list[2]
    output_format = param_list[3]
    max_fid = param_list[4]

    if output_format not in ["dense", "sparse"]:
        raise ValueError(
            "output format {} is not defined".format(output_format))

    cols = value.split(delimitor, -1)
    label = cols[0]
    if label_type == 'int':
        label = int(label)
    elif label_type in ["float", "float64"]:
        label = float(label)

    fid_value = []
    for i in range(1, len(cols)):
        fid, val = cols[i].split(":", -1)
        fid = int(fid)
        if data_type in ["float", "float64"]:
            val = float(val)
        elif data_type in ["int", "int64"]:
            val = int(val)
        fid_value.append((fid, val))

    if output_format == "dense":
        features = [0 for i in range(max_fid + 1)]
        for fid, val in fid_value:
            features[fid] = val
        features = np.asarray(features, dtype=data_type)
    else:
        indices = []
        data = []
        for fid, val in fid_value:
            indices.append(fid)
            data.append(val)
        features = SparseVector(indices, data, max_fid + 1)

    return Instance(inst_id=None, features=features, label=label)
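# A minimal standalone sketch of the same svmlight-style parsing, assuming the
# param_list layout above (delimiter, data type, label type, output format,
# max feature id); the function name and defaults are illustrative, not part
# of the original API.
def parse_svmlight_line(line, delimiter=" ", max_fid=4):
    cols = line.split(delimiter)
    label = int(cols[0])
    pairs = [col.split(":") for col in cols[1:]]
    dense = [0.0] * (max_fid + 1)
    for fid, val in pairs:
        dense[int(fid)] = float(val)
    return label, dense

# parse_svmlight_line("1 0:0.5 3:2.0") -> (1, [0.5, 0.0, 0.0, 2.0, 0.0])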
def data_format_transform(row):
    if type(row.features).__name__ != consts.SPARSE_VECTOR:
        feature_shape = row.features.shape[0]
        indices = []
        data = []
        for i in range(feature_shape):
            if np.abs(row.features[i]) < consts.FLOAT_ZERO:
                continue
            indices.append(i)
            data.append(row.features[i])
        row.features = SparseVector(indices, data, feature_shape)
    return row
def gen_data(self, data_num, feature_num, partition, is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    bin_num = 10  # assumed; the original relies on a bin_num defined outside this function
    header = [str(i) for i in range(feature_num)]
    for data_key in range(data_num):
        value = data_key % bin_num
        if value == 0:
            if shift_iter % bin_num == 0:
                value = bin_num - 1
            shift_iter += 1
        if not is_sparse:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            inst = Instance(inst_id=data_key, features=features, label=data_key % 2)
        else:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            data_index = [x for x in range(feature_num)]
            # the sparse shape is deliberately larger than the number of stored values
            sparse_inst = SparseVector(data_index, data=features, shape=10 * feature_num)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
            header = [str(i) for i in range(feature_num * 10)]
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    return result
def to_instance(param_list, value):
    delimitor = param_list[0]
    data_type = param_list[1]
    with_label = param_list[2]
    label_type = param_list[3]
    output_format = param_list[4]
    tags_dict = param_list[5]

    if output_format not in ["dense", "sparse"]:
        raise ValueError("output format {} is not defined".format(output_format))

    cols = value.split(delimitor, -1)
    start_pos = 0
    label = None
    if with_label:
        start_pos = 1
        label = cols[0]
        if label_type == 'int':
            label = int(label)
        elif label_type in ["float", "float64"]:
            label = float(label)

    if output_format == "dense":
        features = [0 for i in range(len(tags_dict))]
        for tag in cols[start_pos:]:
            features[tags_dict.get(tag)] = 1
        features = np.asarray(features, dtype=data_type)
    else:
        indices = []
        data = []
        for tag in cols[start_pos:]:
            indices.append(tags_dict.get(tag))
            _data = 1
            if data_type in ["float", "float64"]:
                _data = float(1)
            data.append(_data)
        features = SparseVector(indices, data, len(tags_dict))

    return Instance(inst_id=None, features=features, label=label)
def convert(instances):
    # closure: is_sparse, transform_cols_idx, abnormal_list, bin_inner_param,
    # split_points_dict and bin_res are captured from the enclosing scope
    if is_sparse:
        all_data = instances.features.get_all_data()
        indice = []
        sparse_value = []
        data_shape = instances.features.get_shape()
        for col_idx, col_value in all_data:
            if col_idx in transform_cols_idx:
                if col_value in abnormal_list:
                    # abnormal values (e.g. missing values stored in the
                    # sparse vector) are kept as-is rather than binned
                    indice.append(col_idx)
                    sparse_value.append(col_value)
                    continue
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                indice.append(col_idx)
                col_results = bin_res.all_cols_results.get(col_name)
                woe_value = col_results.woe_array[bin_num]
                sparse_value.append(woe_value)
            else:
                indice.append(col_idx)
                sparse_value.append(col_value)
        sparse_vector = SparseVector(indice, sparse_value, data_shape)
        instances.features = sparse_vector
    else:
        features = instances.features
        assert isinstance(features, np.ndarray)
        transform_cols_idx_set = set(transform_cols_idx)
        for col_idx, col_value in enumerate(features):
            if col_idx in transform_cols_idx_set:
                if col_value in abnormal_list:
                    features[col_idx] = col_value
                    continue
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                col_results = bin_res.all_cols_results.get(col_name)
                woe_value = col_results.woe_array[bin_num]
                features[col_idx] = woe_value
        instances.features = features
    return instances
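# A standalone sketch of the WOE substitution above: a raw value is mapped to
# its bin, then replaced by that bin's precomputed weight of evidence. The
# split points and woe_array here are illustrative, not FATE's BinColResults.
split_points = [0.25, 0.5, 0.75]
woe_array = [-1.2, -0.3, 0.4, 1.1]  # one WOE value per bin

def woe_transform(value):
    bin_num = len(split_points)
    for idx, sp in enumerate(split_points):
        if value <= sp:
            bin_num = idx
            break
    return woe_array[bin_num]

# woe_transform(0.1) -> -1.2 (bin 0); woe_transform(0.9) -> 1.1 (bin 3)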
def join_feature_with_label(inst, leaf_indices, leaf_mapping_list, vec_len, dense):
    label = inst.label
    if dense:
        vec = np.zeros(vec_len)
        offset = 0
        for tree_idx, leaf_idx in enumerate(leaf_indices):
            vec[leaf_mapping_list[tree_idx][leaf_idx] + offset] = 1
            offset += len(leaf_mapping_list[tree_idx])
        return Instance(features=vec, label=label)
    else:
        indices, value = [], []
        offset = 0
        for tree_idx, leaf_idx in enumerate(leaf_indices):
            indices.append(leaf_mapping_list[tree_idx][leaf_idx] + offset)
            value.append(1)
            offset += len(leaf_mapping_list[tree_idx])
        return Instance(features=SparseVector(indices=indices, data=value, shape=vec_len),
                        label=label)
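# A small worked example of the leaf one-hot encoding above (values are
# illustrative): two trees, tree 0 with leaves {2, 5} and tree 1 with
# leaves {1, 3, 7}; each tree contributes a block of positions.
leaf_mapping_list = [{2: 0, 5: 1}, {1: 0, 3: 1, 7: 2}]
leaf_indices = [5, 3]  # the sample lands in leaf 5 of tree 0, leaf 3 of tree 1

offset, hot = 0, []
for tree_idx, leaf_idx in enumerate(leaf_indices):
    hot.append(leaf_mapping_list[tree_idx][leaf_idx] + offset)
    offset += len(leaf_mapping_list[tree_idx])
# hot == [1, 3], i.e. the dense encoding of length 5 is [0, 1, 0, 1, 0]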
def gen_output_format(features, data_type='float', output_format='dense', missing_impute=None):
    if output_format not in ["dense", "sparse"]:
        raise ValueError(
            "output format {} is not defined".format(output_format))

    if output_format == "dense":
        return np.asarray(features, dtype=data_type)

    # The code below handles the sparse vector
    indices = []
    data = []
    column_shape = len(features)
    non_zero = 0
    for i in range(column_shape):
        if (missing_impute is not None and features[i] in missing_impute) or \
                (missing_impute is None and features[i] in ['', 'NULL', 'null', "NA"]):
            continue
        if data_type in ['float', 'float64']:
            if np.fabs(float(features[i])) < consts.FLOAT_ZERO:
                continue
            indices.append(i)
            data.append(float(features[i]))
            non_zero += 1
        elif data_type in ['int']:
            if int(features[i]) == 0:
                continue
            indices.append(i)
            data.append(int(features[i]))
        else:
            indices.append(i)
            data.append(features[i])

    return SparseVector(indices, data, column_shape)
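# A standalone sketch of the sparse filtering above, assuming FATE is not on
# the path: zeros and missing tokens are dropped, everything else is kept.
# FLOAT_ZERO and the function name are illustrative stand-ins.
FLOAT_ZERO = 1e-8  # stand-in for consts.FLOAT_ZERO

def to_sparse_pairs(features, missing_tokens=('', 'NULL', 'null', 'NA')):
    pairs = []
    for i, v in enumerate(features):
        if v in missing_tokens:
            continue
        if abs(float(v)) < FLOAT_ZERO:
            continue
        pairs.append((i, float(v)))
    return pairs

# to_sparse_pairs(['1.5', '0', 'NA', '2.0']) -> [(0, 1.5), (3, 2.0)]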
def setUp(self):
    self.data = []
    for i in range(100):
        used_idx = {}
        indices = []
        data = []
        for j in range(40):
            idx = random.randint(0, 49)
            if idx in used_idx:
                continue
            used_idx[idx] = 1
            val = random.random()
            indices.append(idx)
            data.append(val)
        sparse_vec = SparseVector(indices, data, 50)
        self.data.append((str(i), Instance(features=sparse_vec, label=i % 2)))
    self.table = session.parallelize(self.data, include_key=True)
    self.table.schema = {"header": ["fid" + str(i) for i in range(50)]}
def data_format_transform(row):
    """
    Transform data into sparse format
    """
    if type(row.features).__name__ != consts.SPARSE_VECTOR:
        feature_shape = row.features.shape[0]
        indices = []
        data = []
        for i in range(feature_shape):
            if np.isnan(row.features[i]):
                indices.append(i)
                data.append(NoneType())
            elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                continue
            else:
                indices.append(i)
                data.append(row.features[i])
        new_row = copy.deepcopy(row)
        new_row.features = SparseVector(indices, data, feature_shape)
        return new_row
    else:
        sparse_vec = row.features.get_sparse_vector()
        replace_key = []
        for key in sparse_vec:
            if sparse_vec.get(key) == NoneType() or np.isnan(sparse_vec.get(key)):
                replace_key.append(key)
        if len(replace_key) == 0:
            return row
        else:
            new_row = copy.deepcopy(row)
            new_sparse_vec = new_row.features.get_sparse_vector()
            for key in replace_key:
                new_sparse_vec[key] = NoneType()
            return new_row
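# A standalone sketch of the dense branch above: true zeros are dropped from
# the sparse encoding, while NaNs are kept under a missing-value placeholder
# (MISSING is an illustrative stand-in for FATE's NoneType sentinel).
import math

MISSING = object()

def dense_to_sparse(features):
    pairs = []
    for i, v in enumerate(features):
        if math.isnan(v):
            pairs.append((i, MISSING))
        elif v != 0.0:
            pairs.append((i, v))
    return pairs

# dense_to_sparse([0.0, 1.5, float('nan')]) -> [(1, 1.5), (2, MISSING)]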
def _convert_sparse_data(instances, bin_inner_param: BinInnerParam, bin_results: BinResults,
                         abnormal_list: list, convert_type: str = 'bin_num'):
    instances = copy.deepcopy(instances)
    all_data = instances.features.get_all_data()
    data_shape = instances.features.get_shape()
    indice = []
    sparse_value = []
    transform_cols_idx = bin_inner_param.transform_bin_indexes
    split_points_dict = bin_results.all_split_points
    for col_idx, col_value in all_data:
        if col_idx in transform_cols_idx:
            if col_value in abnormal_list:
                # abnormal values (e.g. missing values stored in the
                # sparse vector) are kept as-is rather than binned
                indice.append(col_idx)
                sparse_value.append(col_value)
                continue
            col_name = bin_inner_param.header[col_idx]
            split_points = split_points_dict[col_name]
            bin_num = BaseBinning.get_bin_num(col_value, split_points)
            indice.append(col_idx)
            if convert_type == 'bin_num':
                sparse_value.append(bin_num)
            elif convert_type == 'woe':
                col_results = bin_results.all_cols_results.get(col_name)
                woe_value = col_results.woe_array[bin_num]
                sparse_value.append(woe_value)
            else:
                sparse_value.append(col_value)
        else:
            indice.append(col_idx)
            sparse_value.append(col_value)
    sparse_vector = SparseVector(indice, sparse_value, data_shape)
    instances.features = sparse_vector
    return instances
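# The bin lookup itself (BaseBinning.get_bin_num) amounts to finding where a
# value falls among sorted split points; a minimal stand-in using bisect,
# not claimed to match FATE's exact edge-case handling:
import bisect

def get_bin_num(value, split_points):
    # split_points are sorted upper bounds; value <= split_points[k] -> bin k
    return bisect.bisect_left(split_points, value)

# get_bin_num(0.7, [0.25, 0.5, 0.75]) -> 2
# get_bin_num(0.9, [0.25, 0.5, 0.75]) -> 3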
def _convert_sparse_data(instances, transform_cols_idx, split_points_dict, header):
    all_data = instances.features.get_all_data()
    data_shape = instances.features.get_shape()
    indice = []
    sparse_value = []
    for col_idx, col_value in all_data:
        if col_idx in transform_cols_idx:
            col_name = header[col_idx]
            split_points = split_points_dict[col_name]
            bin_num = Binning.get_bin_num(col_value, split_points)
            indice.append(col_idx)
            sparse_value.append(bin_num)
        else:
            indice.append(col_idx)
            sparse_value.append(col_value)
    sparse_vector = SparseVector(indice, sparse_value, data_shape)
    instances.features = sparse_vector
    return instances
def gen_output_format(features, data_type='float', output_format='dense'):
    if output_format not in ["dense", "sparse"]:
        raise ValueError("output format %s is not defined" % output_format)

    if output_format == "dense":
        return np.asarray(features, dtype=data_type)
    elif output_format == "sparse":
        indices = []
        data = []
        column_shape = len(features)
        non_zero = 0
        for i in range(column_shape):
            if features[i] in ['', 'NULL', 'null', "NA"]:
                continue
            if data_type in ['float', 'float64']:
                if np.fabs(float(features[i])) < consts.FLOAT_ZERO:
                    continue
                indices.append(i)
                data.append(float(features[i]))
                non_zero += 1
            elif data_type in ['int']:
                if int(features[i]) == 0:
                    continue
                indices.append(i)
                data.append(int(features[i]))
            else:
                raise ValueError("data type %s is not defined" % data_type)
        return SparseVector(indices, data, column_shape)
def test_instance(self):
    indices = []
    data = []
    for i in range(1, 10):
        indices.append(i * i)
        data.append(i ** 3)
    shape = 100
    sparse_data = SparseVector(indices, data, shape)
    self.assertTrue(sparse_data.shape == shape and len(sparse_data.sparse_vec) == 9)
    self.assertTrue(sparse_data.count_zeros() == 91)
    self.assertTrue(sparse_data.count_non_zeros() == 9)
    for idx, val in zip(indices, data):
        self.assertTrue(sparse_data.get_data(idx) == val)
    for i in range(100):
        if i in indices:
            continue
        self.assertTrue(sparse_data.get_data(i, i ** 4) == i ** 4)
    self.assertTrue(
        dict(sparse_data.get_all_data()) == dict(zip(indices, data)))
def to_instance(param_list, value):
    delimitor = param_list[0]
    data_type = param_list[1]
    tag_with_value = param_list[2]
    tag_value_delimitor = param_list[3]
    with_label = param_list[4]
    label_type = param_list[5]
    output_format = param_list[6]
    tags_dict = param_list[7]

    if output_format not in ["dense", "sparse"]:
        raise ValueError(
            "output format {} is not defined".format(output_format))

    cols = value.split(delimitor, -1)
    start_pos = 0
    label = None
    if with_label:
        start_pos = 1
        label = cols[0]
        if label_type == 'int':
            label = int(label)
        elif label_type in ["float", "float64"]:
            label = float(label)

    if output_format == "dense":
        features = [0 for i in range(len(tags_dict))]
        for fea in cols[start_pos:]:
            if tag_with_value:
                _tag, _val = fea.split(tag_value_delimitor, -1)
                if _tag in tags_dict:
                    features[tags_dict.get(_tag)] = _val
            else:
                if fea in tags_dict:
                    features[tags_dict.get(fea)] = 1
        features = np.asarray(features, dtype=data_type)
    else:
        indices = []
        data = []
        for fea in cols[start_pos:]:
            if tag_with_value:
                _tag, _val = fea.split(tag_value_delimitor, -1)
            else:
                _tag = fea
                _val = 1
            if _tag not in tags_dict:
                continue
            indices.append(tags_dict.get(_tag))
            if data_type in ["float", "float64"]:
                _val = float(_val)
            elif data_type in ["int", "int64", "long"]:
                _val = int(_val)
            elif data_type == "str":
                _val = str(_val)
            data.append(_val)
        features = SparseVector(indices, data, len(tags_dict))

    return Instance(inst_id=None, features=features, label=label)
def gen_data(self, data_num, partition):
    col_data = []
    header = [str(i) for i in range(6)]

    mode_num = int(0.8 * data_num)
    other_num = data_num - mode_num
    col_1 = np.array([1] * mode_num + [0] * other_num)
    random.shuffle(col_1)
    col_data.append(col_1)

    mode_num = int(0.799 * data_num)
    other_num = data_num - mode_num
    col_1 = np.array([1] * mode_num + [0] * other_num)
    random.shuffle(col_1)
    col_data.append(col_1)

    mode_num = int(0.801 * data_num)
    other_num = data_num - mode_num
    col_1 = np.array([1] * mode_num + [0] * other_num)
    random.shuffle(col_1)
    col_data.append(col_1)

    col_2 = np.random.randn(data_num)
    col_data.append(col_2)

    mode_num = int(0.2 * data_num)
    other_num = data_num - mode_num
    col_1 = np.array([0.5] * mode_num + list(np.random.randn(other_num)))
    print("col 0.5 count: {}".format(list(col_1).count(0.5)))
    random.shuffle(col_1)
    col_data.append(col_1)

    mode_num = int(0.79 * data_num)
    other_num = data_num - mode_num
    col_1 = np.array([0.5] * mode_num + list(np.random.randn(other_num)))
    random.shuffle(col_1)
    col_data.append(col_1)

    data = []
    data_2 = []
    for key in range(data_num):
        features = np.array([col[key] for col in col_data])
        inst = Instance(inst_id=key, features=features, label=key % 2)
        data.append((key, inst))
        sparse_vec = SparseVector(
            indices=[i for i in range(len(features))], data=features, shape=len(features))
        inst_2 = Instance(inst_id=key, features=sparse_vec, label=key % 2)
        data_2.append((key, inst_2))
    result = session.parallelize(data, include_key=True, partition=partition)
    result_2 = session.parallelize(data_2, include_key=True, partition=partition)
    result.schema = {'header': header}
    result_2.schema = {'header': header}
    self.header = header
    return result, result_2
def gen_output_format(features, data_type='float', exclusive_data_type_fid_map=None,
                      output_format='dense', missing_impute=None):
    if output_format not in ["dense", "sparse"]:
        raise ValueError(
            "output format {} is not defined".format(output_format))

    if output_format == "dense":
        format_features = copy.deepcopy(features)
        if data_type in ["int", "int64", "long", "float", "float64", "double"]:
            for i in range(len(features)):
                if (missing_impute is not None and features[i] in missing_impute) or \
                        (missing_impute is None and features[i] in ['', 'NULL', 'null', "NA"]):
                    format_features[i] = np.nan
        if exclusive_data_type_fid_map:
            for fid in range(len(features)):
                if fid in exclusive_data_type_fid_map:
                    dtype = exclusive_data_type_fid_map[fid]
                else:
                    dtype = data_type
                format_features[fid] = getattr(np, dtype)(features[fid])
            return np.asarray(format_features, dtype=object)
        else:
            return np.asarray(format_features, dtype=data_type)

    indices = []
    data = []
    column_shape = len(features)
    non_zero = 0
    for i in range(column_shape):
        if (missing_impute is not None and features[i] in missing_impute) or \
                (missing_impute is None and features[i] in ['', 'NULL', 'null', "NA"]):
            indices.append(i)
            data.append(np.nan)
            non_zero += 1
        elif data_type in ['float', 'float64', "double"]:
            if np.fabs(float(features[i])) < consts.FLOAT_ZERO:
                continue
            indices.append(i)
            data.append(float(features[i]))
            non_zero += 1
        elif data_type in ['int', "int64", "long"]:
            if int(features[i]) == 0:
                continue
            indices.append(i)
            data.append(int(features[i]))
        else:
            indices.append(i)
            data.append(features[i])

    return SparseVector(indices, data, column_shape)
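# A short sketch of the per-column dtype mapping above: columns listed in the
# map keep their own type, everything else gets the default, and the result is
# an object-dtype array so the mixed types survive. The map contents here are
# illustrative.
import numpy as np

features = ['3', '0.5', '7']
exclusive_map = {0: 'int64', 2: 'int64'}  # columns 0 and 2 are ints
out = [getattr(np, exclusive_map.get(fid, 'float64'))(v) for fid, v in enumerate(features)]
out = np.asarray(out, dtype=object)
# out -> array([3, 0.5, 7], dtype=object)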
def convert_instance_to_bin(instance, bin_split_points=None):
    """
    Method used by the mapValues API: convert an instance object's features to bins

    Parameters
    ----------
    instance : Instance object

    bin_split_points : 2D numpy ndarray, split points of each feature to bin

    Returns
    -------
    instance : Instance object with its features converted to bins
    """
    sparse_data = False
    if type(instance.features).__name__ == "ndarray":
        feature_shape = instance.features.shape[0]
    else:
        feature_shape = instance.features.get_shape()
        sparse_data = True
    indices = []
    data_format = type(instance.features).__name__
    features = instance.features
    bins = []
    if sparse_data:
        feature_values = [kv for kv in features.get_all_data()]
    else:
        feature_values = list(zip(range(features.shape[0]), features.tolist()))
    for fid, feature_value in feature_values:
        bin_id = 0
        if sparse_data:
            indices.append(fid)
        if bin_split_points[fid].shape[0] == 0:
            bins.append(bin_id)
            continue
        if bin_split_points[fid].shape[0] <= 20:
            # few split points: linear scan
            bin_id = bin_split_points[fid].shape[0]
            for idx in range(bin_split_points[fid].shape[0]):
                if feature_value <= bin_split_points[fid][idx]:
                    bin_id = idx
                    break
            bins.append(bin_id)
        else:
            # many split points: binary search for the first split point >= value
            if feature_value <= bin_split_points[fid][0]:
                bin_id = 0
            elif feature_value > bin_split_points[fid][bin_split_points[fid].shape[0] - 1]:
                bin_id = bin_split_points[fid].shape[0]
            else:
                left = 0
                right = bin_split_points[fid].shape[0] - 1
                while left <= right:
                    idx = (left + right) >> 1
                    if feature_value <= bin_split_points[fid][idx]:
                        bin_id = idx
                        right = idx - 1
                    else:
                        left = idx + 1
            bins.append(bin_id)
    if data_format == "ndarray":
        instance.features = np.array(bins, dtype='int')
    else:
        instance.features = SparseVector(indices, bins, feature_shape)
    return instance
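# A quick check of the bin-assignment convention above (values illustrative):
# with split points [0.25, 0.5, 0.75], a value v maps to the first index i with
# v <= split_points[i], and to len(split_points) when v exceeds them all.
import numpy as np

split_points = np.array([0.25, 0.5, 0.75])
for v, expected in [(0.1, 0), (0.5, 1), (0.6, 2), (0.9, 3)]:
    bin_id = len(split_points)
    for idx, sp in enumerate(split_points):
        if v <= sp:
            bin_id = idx
            break
    assert bin_id == expected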