def relative_error(x, y):
    """
    Calculate error between original value and the masked value, for each attribute in given vector.

    1) For numeric attribute j (value of the first vector is int/float):
       | a_j - a'_j | / (Max_Val_j - Min_Val_j), using the attribute's metadata range.
       Identical values contribute no error and are skipped before the metadata is read,
       so an attribute with a zero range cannot trigger a division by zero.
    2) For categorical attribute j (value of the second vector is a string):
       ordinal or nominal distance (chosen by the 'Val_Type' metadata), divided by the
       number of attributes m = len(x).
    Values matching neither case are ignored. Error is summed over all attributes.

    :param x: First vector
    :param y: Second vector
    :return: Sum of relative error for given pair of vectors (float)
    :raises ZeroDivisionError: If two numeric values differ while the attribute's metadata range is zero
    """
    diff = 0.0
    length = len(x)
    for idx, (first, second) in enumerate(zip(x, y)):
        if isinstance(first, (int, float)):  # numeric attribute
            if first == second:
                # Zero error by definition; no need to divide by a possibly-zero range
                continue
            min_val = MetaUtils.get_attr_metadata(idx, 'Min_Val')
            max_val = MetaUtils.get_attr_metadata(idx, 'Max_Val')
            diff += float(abs(first - second)) / (max_val - min_val)
        elif isinstance(second, str):  # Categorical attribute
            dtype = MetaUtils.get_attr_metadata(idx, 'Val_Type')
            w = MetaUtils.get_attr_metadata(idx, 'Weight')
            max_rank = MetaUtils.get_attr_metadata(idx, 'Max_Rank')
            if dtype == "ordinal":
                diff += MetricsUtils.distance_ordinal_feature(first, second, max_rank, w) / float(length)
            elif dtype == "nominal":
                diff += MetricsUtils.distance_nominal_feature(first, second, w) / float(length)
    return diff
def randomize(t, w=None):
    """
    Randomize values of each attribute in tuple.

    For numerical attributes (any non-string value):
      - without a buffer, return a uniform random value in the [Min_Val, Max_Val] metadata range;
      - with a buffer, draw a value estimated from the buffer's values of that attribute and
        truncate it to the [Min_Val, Max_Val] range.
    For categorical attributes (string values) - return random value from the attribute's
    'Distinct_Val' metadata.

    The input tuple is not modified; a shallow copy carrying the new quasi-identifier is returned.

    :param t: Tuple to be randomized
    :param w: Current buffer batch of tuples in the cluster to which t belongs. Default None
    :return: Randomized version of the tuple
    """
    qi = list(t.quasi_identifier)
    # Per-attribute columns of the buffer; built once on first use instead of
    # re-transposing the whole buffer for every numerical attribute.
    columns = None
    for i, value in enumerate(qi):
        attr_type = type(value)
        if isinstance(value, str):  # Categorical attributes
            unique_values = MetaUtils.get_attr_metadata(i, 'Distinct_Val')
            qi[i] = DistributionUtils.get_uniform_rand(sample_batch=unique_values, dtype=attr_type)
            continue

        # Numerical attributes
        min_val = MetaUtils.get_attr_metadata(i, 'Min_Val')
        max_val = MetaUtils.get_attr_metadata(i, 'Max_Val')
        if not w:
            qi[i] = DistributionUtils.get_uniform_rand(min=min_val, max=max_val, dtype=attr_type)
        else:
            if columns is None:
                columns = list(zip(*(record.quasi_identifier for record in w)))
            estimate = DistributionUtils.get_estimated_rand(sample_batch=columns[i], dtype=attr_type)
            qi[i] = MetricsUtils.truncate_value(estimate, l=min_val, u=max_val, dtype=attr_type)

    anonymized = copy.copy(t)
    anonymized.quasi_identifier = qi
    return anonymized
def distance(x, y, normalize=True):
    """
    Calculate distance between two tuples (vectors).

    When the stream is flagged as all-nominal or all-numeric, the work is delegated to the
    dedicated MetricsUtils routine. Otherwise every feature contributes a weighted
    per-type distance (ordinal / continuous or discrete / nominal) and the sum is divided
    by the sum of the attributes' weights.

    :param normalize: Whether normalization is required for calculating distance (Default: True).
                      NOTE: this flag is not read anywhere in the function body.
    :param x: First tuple
    :param y: Second tuple
    :return: Distance between possibly mixed types, considering the attributes' weights
    """
    # Ignoring the last class attribute
    feature_count = len(MetaUtils.stream_metadata) - 1

    if MetaUtils.is_all_nominal:
        return MetricsUtils.distance_all_nominal(x, y)
    if MetaUtils.is_all_numeric:
        return MetricsUtils.transformed_euclidean_distance(x, y)

    weighted_dist = 0
    total_weight = 0
    for idx in range(feature_count):
        kind, weight, top_rank, low, high, _ = MetaUtils.get_all_attr_metadata(idx)
        total_weight += weight
        first, second = x[idx], y[idx]
        if kind == "ordinal":
            weighted_dist += MetricsUtils.distance_ordinal_feature(first, second, top_rank, weight)
        elif kind == "continuous" or kind == "discrete":
            weighted_dist += MetricsUtils.distance_interval_feature(first, second, low, high, weight)
        elif kind == "nominal":
            weighted_dist += MetricsUtils.distance_nominal_feature(first, second, weight)
    return weighted_dist / total_weight
def convert_CSV_to_ARFF(dir, file_name):
    """
    Converts file of CSV format to a WEKA / MOA readable ARFF format, using Weka's CSVLoader.

    The ARFF file name is the CSV file name with its final extension replaced by '.arff';
    both paths are resolved inside the given directory. The indices returned by
    MetaUtils.get_all_nominal_indx() are passed to the loader (option 'N'), so that
    categorical attributes are not interpreted as numerical by Weka.
    The outcome of the external process is not inspected here.

    :param dir: Directory of source and target files paths
    :param file_name: Name of file to be converted (dataset name)
    :return: Absolute path of the target ARFF file
    """
    # Drop only the last extension: split('.')[0] would cut "data.v2.csv" down to "data"
    arff_file = os.path.splitext(file_name)[0] + '.arff'
    csv_path = os.path.abspath(os.path.join(dir, file_name))
    arff_path = os.path.abspath(os.path.join(dir, arff_file))
    weka_csv_class = 'weka.core.converters.CSVLoader'

    # Correct only categorical attributes, which are interpreted as numerical by Weka
    nominal_indx = ','.join(str(i) for i in MetaUtils.get_all_nominal_indx())

    ExternalProcesses.run_process(
        OrderedDict([('p_type', 'java'),
                     ('t_type', 'weka'),
                     ('jclass', weka_csv_class),
                     ('path', csv_path),
                     ('N', nominal_indx),
                     ('gt', arff_path),
                     ('B', '')]))
    return arff_path
def replace_with_centroid(t, publish_qi):
    """
    Replace each tuple's values with the values of the centroid of cluster to which the tuple belongs.
    Each centroid value is passed through MetaUtils.validate_dtype together with its attribute
    index, to validate the original type of attribute (int, float, string...) and maintain
    utility of data.

    Neither the input tuple nor publish_qi is modified: the result is a shallow copy of the
    tuple whose quasi-identifier is a newly built list.

    :param t: Tuple to be published and replaced
    :param publish_qi: Centroid of cluster to which the original tuple belongs (if required, after noise addition)
    :return: Anonymized version of tuple
    """
    anonymized = copy.copy(t)
    # Build a fresh list rather than writing into publish_qi: in-place assignment would
    # alter the caller's centroid object (and fail outright for immutable sequences).
    anonymized.quasi_identifier = [
        MetaUtils.validate_dtype(attr, i) for i, attr in enumerate(publish_qi)
    ]
    return anonymized
def build_numeric_vec(z):
    """
    Build a numeric representation of a possibly mixed vector.

    Non-string values are copied through unchanged. Each string value is replaced by a
    binary indicator vector (0,...,0,1,0,...,0) with one entry per value in the attribute's
    'Distinct_Val' metadata, holding 1 where the value equals the distinct value.
    Numeric representation is used for calculating distance/similarity with numeric
    similarity functions.

    :param z: Non-numeric vector to be converted
    :return: Flat list with the numeric representation (empty list for an empty vector)
    """
    encoded = []
    for position in range(len(z)):
        value = z[position]
        if isinstance(value, str):
            # Categorical attribute: expand into indicator entries
            categories = MetaUtils.get_attr_metadata(position, 'Distinct_Val')
            encoded.extend([1 if value == category else 0 for category in categories])
        else:
            # Numerical attribute: keep as is
            encoded.append(value)
    return encoded
def correct_ARFF_attribute_types(arff_path):
    """
    Correct definition of categorical attributes in ARFF file, which are interpreted as
    numerical by Weka, by running Weka's NumericToNominal filter as an external process.

    The output path is the input path with every '_temp' occurrence removed. Attributes
    whose 'Type' metadata equals "categorical" are selected, and their positions shifted
    by one (i + 1) are passed to the filter as a comma-separated list (option 'R').

    :param arff_path: Path of ARFF file
    :return: Path of the corrected ARFF file
    """
    corrected_path = arff_path.replace('_temp', '')
    weka_filter_class = 'weka.filters.unsupervised.attribute.NumericToNominal'

    # Correct only categorical attributes, which are interpreted as numerical by Weka
    selected = []
    for position in range(len(MetaUtils.stream_attr_names)):
        if MetaUtils.get_attr_metadata(position, 'Type') == "categorical":
            selected.append(str(position + 1))
    nominal_indx = ','.join(selected)

    process_args = OrderedDict([
        ('p_type', 'java'),
        ('t_type', 'weka'),
        ('jclass', weka_filter_class),
        ('i', arff_path),
        ('o', corrected_path),
        ('R', nominal_indx),
    ])
    ExternalProcesses.run_process(process_args)
    return corrected_path
def normalize(v, method='minmax', new_min=0, new_max=1):
    """
    Normalize the numeric values of a vector, for efficient distance calculations.
    String values are copied through unchanged, so the result always has the same length as v.

    :param v: Vector to be standardized
    :param method: Method used for normalization:
        'minmax': linear transformation on the original data to the range [new_min, new_max]
                  [(val - min_val) / (max_val - min_val)] * (new_max - new_min) + new_min
                  An attribute with zero range (max_val == min_val) is mapped to new_min.
        'zscore_std': z-score normalization; denominator is the 'std' metadata of the attribute
                  (val - mean) / std
        'zscore_mad': z-score normalization; denominator is the 'mad' metadata of the attribute
                  (val - mean) / mad
        For both z-score methods a zero denominator yields 0.0 instead of a division by zero.
    :param new_max: New maximum value of range to normalize to (Default: 1).
    :param new_min: New minimum value of range to normalize to (Default: 0).
    :return: Standardized vector
    :raises ValueError: If method is not one of the supported names
    """
    # Reject unknown methods up front: silently skipping them would drop every numeric
    # value and return a vector that is shorter than, and misaligned with, the input.
    if method not in ('minmax', 'zscore_std', 'zscore_mad'):
        raise ValueError('Unknown normalization method: {!r}'.format(method))

    standardize = []
    for i, val in enumerate(v):
        if isinstance(val, str):
            standardize.append(val)
        elif method == 'minmax':
            min_val = MetaUtils.get_attr_metadata(i, 'Min_Val')
            max_val = MetaUtils.get_attr_metadata(i, 'Max_Val')
            span = max_val - min_val
            # Constant attribute: every value sits at the bottom of the target range
            fraction = float(val - min_val) / span if span else 0.0
            standardize.append(fraction * (new_max - new_min) + new_min)
        else:
            mean = MetaUtils.get_attr_metadata(i, 'Mean')
            # Metadata key matches the method suffix: 'std' or 'mad'
            s = MetaUtils.get_attr_metadata(i, 'std' if method == 'zscore_std' else 'mad')
            standardize.append((val - mean) / s if s else 0.0)
    return standardize