Example #1
    def relative_error(x, y):
        """
        Calculate the error between the original value and the masked value for each attribute in the given vector.
        1) For a numeric attribute j: |a_j - a'_j| / (max(DOM(A_j)) - min(DOM(A_j))).
        2) For a categorical attribute j: dist(a_j, a'_j) / m, i.e. the nominal/ordinal distance normalized by the number of attributes m.
        The error is summed over all attributes.
        :param x: First vector
        :param y: Second vector
        :return: Sum of relative error for given pair of vectors
        """
        diff = 0
        length = len(x)
        pair_wise_list = list(zip(x, y))
        for idx, pair in enumerate(pair_wise_list):
            if isinstance(pair[0], (int, float)):  # numeric attribute
                min_val = MetaUtils.get_attr_metadata(idx, 'Min_Val')
                max_val = MetaUtils.get_attr_metadata(idx, 'Max_Val')
                if pair[0] != 0 or pair[1] != 0:
                    diff += float(abs(pair[0] - pair[1])) / (max_val - min_val)

            elif isinstance(pair[0], str):  # Categorical attribute
                dtype = MetaUtils.get_attr_metadata(idx, 'Val_Type')
                w = MetaUtils.get_attr_metadata(idx, 'Weight')
                max_rank = MetaUtils.get_attr_metadata(idx, 'Max_Rank')
                if dtype == "ordinal":
                    diff += MetricsUtils.distance_ordinal_feature(pair[0], pair[1], max_rank, w) / float(length)
                elif dtype == "nominal":
                    diff += MetricsUtils.distance_nominal_feature(pair[0], pair[1], w) / float(length)
        return diff
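
A minimal self-contained sketch of the same per-attribute relative error, with hard-coded attribute ranges standing in for the MetaUtils metadata lookups (the vectors and ranges below are purely illustrative):

def relative_error_sketch(x, y, ranges):
    """ranges[i] is (min, max) for a numeric attribute, None for a categorical one."""
    error = 0.0
    m = len(x)
    for i, (a, b) in enumerate(zip(x, y)):
        if isinstance(a, (int, float)):           # numeric: |a - b| / attribute range
            lo, hi = ranges[i]
            if hi > lo:
                error += abs(a - b) / float(hi - lo)
        else:                                     # nominal: 0/1 distance, normalized by m
            error += (0.0 if a == b else 1.0) / m
    return error

# Illustrative call: age in [17, 90], hours-per-week in [1, 99]
print(relative_error_sketch([39, 'Private', 40], [42, 'Self-emp', 38],
                            [(17, 90), None, (1, 99)]))
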
 def randomize(t, w=None):
     """
     Randomize the value of each attribute in the tuple.
     For numerical attributes - return a uniform random value in the [min_val, max_val] range
     For categorical attributes - return a random value from the set of unique attribute values
     :param t: Tuple to be randomized
     :param w: Current buffer batch of tuples in the cluster to which t belongs. Default None
     :return: Randomized version of the tuple
     """
     qi = list(t.quasi_identifier)
     for i in range(0, len(qi)):
         if not isinstance(qi[i], str):  # Numerical attributes
             min_val = MetaUtils.get_attr_metadata(i, 'Min_Val')
             max_val = MetaUtils.get_attr_metadata(i, 'Max_Val')
             if not w:
                 qi[i] = DistributionUtils.get_uniform_rand(min=min_val, max=max_val, dtype=type(qi[i]))
             else:
                 records = map(lambda x: x.quasi_identifier, w)
                 attr_vals = list(zip(*records))[i]
                 x = DistributionUtils.get_estimated_rand(sample_batch=attr_vals, dtype=type(qi[i]))
                 qi[i] = MetricsUtils.truncate_value(x, l=min_val, u=max_val, dtype=type(qi[i]))
         else:  # Categorical attributes
             unique_values = MetaUtils.get_attr_metadata(i, 'Distinct_Val')
             qi[i] = DistributionUtils.get_uniform_rand(sample_batch=unique_values, dtype=type(qi[i]))
     anonymized = copy.copy(t)
     anonymized.quasi_identifier = qi
     return anonymized
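
As a rough illustration of the two randomization modes when no buffer batch is available, a numeric attribute can be replaced by a uniform draw from its domain and a categorical attribute by a uniform draw from its distinct values; the sketch below uses Python's random module and hypothetical domains in place of DistributionUtils and MetaUtils:

import random

def randomize_sketch(qi, domains):
    """domains[i] is (min, max) for a numeric attribute or a list of distinct values."""
    out = list(qi)
    for i, val in enumerate(out):
        if isinstance(val, str):                  # categorical: any distinct value, uniformly
            out[i] = random.choice(domains[i])
        else:                                     # numeric: uniform value in [min, max], cast to the original type
            lo, hi = domains[i]
            out[i] = type(val)(random.uniform(lo, hi))
    return out

print(randomize_sketch([39, 'Private'], [(17, 90), ['Private', 'Self-emp', 'Gov']]))
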
Example #3
    def distance(x, y, normalize=True):
        """
        Calculate distance between two tuples (vectors)
        :param normalize: Whether normalization is required for calculating distance (Default: True)
        :param x: First tuple
        :param y: Second tuple
        :return: Distance between possibly mixed types, considering the attributes' weights
        """
        f_list_len = len(MetaUtils.stream_metadata) - 1  # Ignoring the last class attribute

        dist, sum_weights = 0, 0

        if MetaUtils.is_all_nominal:
            return MetricsUtils.distance_all_nominal(x, y)

        elif MetaUtils.is_all_numeric:
            return MetricsUtils.transformed_euclidean_distance(x, y)
        else:
            for i in range(0, f_list_len):
                dtype, w, max_rank, min_val, max_val, distinct_val = MetaUtils.get_all_attr_metadata(i)

                sum_weights += w
                if dtype == "ordinal":
                    dist += MetricsUtils.distance_ordinal_feature(x[i], y[i], max_rank, w)
                elif dtype == "continuous" or dtype == "discrete":
                    dist += MetricsUtils.distance_interval_feature(x[i], y[i], min_val, max_val, w)
                elif dtype == "nominal":
                    dist += MetricsUtils.distance_nominal_feature(x[i], y[i], w)
            return dist / sum_weights
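
The mixed-type branch is essentially a weighted, Gower-style combination of per-attribute distances; a compact sketch with inline metadata (dtype, weight, min, max per attribute, all illustrative) could look like this:

def mixed_distance_sketch(x, y, meta):
    """meta[i] = (dtype, weight, min_val, max_val), with dtype in {'numeric', 'nominal'}."""
    dist, sum_weights = 0.0, 0.0
    for i, (dtype, w, lo, hi) in enumerate(meta):
        sum_weights += w
        if dtype == 'numeric':                    # interval distance, normalized by the attribute range
            dist += w * abs(x[i] - y[i]) / float(hi - lo)
        else:                                     # nominal distance: 0 if equal, 1 otherwise
            dist += w * (0.0 if x[i] == y[i] else 1.0)
    return dist / sum_weights

meta = [('numeric', 1.0, 17, 90), ('nominal', 1.0, None, None)]
print(mixed_distance_sketch([39, 'Private'], [50, 'Gov'], meta))
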
Example #4
    def convert_CSV_to_ARFF(dir, file_name):
        """
        Converts file of CSV format to a WEKA / MOA readable ARFF format
        :param dir: Directory of source and target files paths
        :param file_name: Name of file to be converted (dataset name)
        :return: Path of the generated ARFF file
        """
        # Build the path for the target ARFF file; Weka does not interpret attribute types correctly on its own
        arff_file = file_name.split('.')[0] + '.arff'

        csv_path = os.path.abspath(os.path.join(dir, file_name))
        arff_path = os.path.abspath(os.path.join(dir, arff_file))
        weka_csv_class = 'weka.core.converters.CSVLoader'

        #  Correct only categorical attributes, which are interpreted as numerical by Weka
        nominal_indx = MetaUtils.get_all_nominal_indx()
        nominal_indx = ','.join(str(i) for i in nominal_indx)

        ExternalProcesses.run_process(
            OrderedDict([('p_type', 'java'), ('t_type', 'weka'),
                         ('jclass', weka_csv_class), ('path', csv_path),
                         ('N', nominal_indx), ('gt', arff_path), ('B', '')]))

        # Save corrected attribute types to a new ARFF file and delete the temp file
        # file_path = StreamWriter.correct_ARFF_attribute_types(tmp_arff_path)
        # os.remove(tmp_arff_path)
        # return file_path
        return arff_path
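
The ExternalProcesses wrapper is not shown here. Assuming a local weka.jar is available on the classpath, an equivalent CSVLoader call could be issued directly with subprocess, redirecting stdout into the ARFF file (the paths, jar location, and nominal index string below are placeholders):

import subprocess

def csv_to_arff_sketch(csv_path, arff_path, nominal_indx, weka_jar='weka.jar'):
    # weka.core.converters.CSVLoader prints the ARFF representation to stdout;
    # -N forces the listed attribute indices to be loaded as nominal.
    cmd = ['java', '-cp', weka_jar, 'weka.core.converters.CSVLoader',
           csv_path, '-N', nominal_indx]
    with open(arff_path, 'w') as out:
        subprocess.run(cmd, stdout=out, check=True)
    return arff_path
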
    def replace_with_centroid(t, publish_qi):
        """
        Replace each tuple's values with the values of the centroid of cluster to which the tuple belongs.
        Validate the original type of attribute (int, float, string...), to maintain utility of data.
        :param t: Tuple to be published and replaced
        :param publish_qi: Centroid of cluster to which the original tuple belongs (if required, after noise addition)
        :return: Anonymized version of tuple
        """
        anonymized = copy.copy(t)
        anonymized.quasi_identifier = publish_qi

        for i, attr in enumerate(anonymized.quasi_identifier):
            anonymized.quasi_identifier[i] = MetaUtils.validate_dtype(attr, i)
        return anonymized
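
For context, the centroid that gets published could be formed as the mean of each numeric attribute and the most frequent value of each categorical attribute; the sketch below is a simplified stand-in that works on plain lists of quasi-identifiers rather than the original tuple class:

from collections import Counter

def cluster_centroid_sketch(records):
    """records: list of quasi-identifier lists with mixed numeric/categorical values."""
    centroid = []
    for col in zip(*records):
        if isinstance(col[0], str):               # categorical: majority value
            centroid.append(Counter(col).most_common(1)[0][0])
        else:                                     # numeric: mean, cast back to the original type
            centroid.append(type(col[0])(sum(col) / float(len(col))))
    return centroid

cluster = [[39, 'Private'], [45, 'Private'], [51, 'Gov']]
print(cluster_centroid_sketch(cluster))           # [45, 'Private']
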
Example #6
 def build_numeric_vec(z):
     """
     Convert a mixed-type vector to a purely numeric vector, replacing each nominal value with a binary vector.
     Each nominal attribute is replaced with a (0,...,0,1,0,...,0) vector whose length equals the number of its unique values.
     The numeric representation is used for calculating distance/similarity with numeric similarity functions.
     :param z: Mixed-type vector to be converted
     :return: Numeric vector with categorical attributes one-hot encoded
     """
     v = []
     m = len(z)
     if m:
         for i in range(0, m):
             if not isinstance(z[i], str):  # Numerical attribute
                 v.append(z[i])
             else:  # Categorical attribute
                 distinct_vals = MetaUtils.get_attr_metadata(i, 'Distinct_Val')
                 cat = [1 if z[i] == d else 0 for d in distinct_vals]
                 v.extend(cat)
     return v
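
A self-contained version of the same one-hot expansion, with the distinct-value sets passed in explicitly instead of being read from MetaUtils (the example values are hypothetical):

def build_numeric_vec_sketch(z, distinct_vals):
    """distinct_vals[i] lists the unique values of attribute i (None for numeric attributes)."""
    v = []
    for i, val in enumerate(z):
        if isinstance(val, str):                  # categorical -> one-hot block
            v.extend(1 if val == d else 0 for d in distinct_vals[i])
        else:                                     # numeric -> copied through unchanged
            v.append(val)
    return v

print(build_numeric_vec_sketch([39, 'Private'],
                               [None, ['Private', 'Self-emp', 'Gov']]))  # [39, 1, 0, 0]
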
Example #7
    def correct_ARFF_attribute_types(arff_path):
        """
        Correct definition of categorical attributes in ARFF file, which are interpreted as numerical by Weka
        :param arff_path: Path of ARFF file
        :return: Path of the corrected ARFF file
        """
        corrected_path = arff_path.replace('_temp', '')
        weka_filter_class = 'weka.filters.unsupervised.attribute.NumericToNominal'

        # Correct only categorical attributes, which are interpreted as numerical by Weka
        # nominal_indx = [i + 1 for i, key in enumerate(sr.STREAM_METADATA)
        #                 if sr.STREAM_METADATA[key]['Type'] == "categorical"]
        nominal_indx = [
            i + 1 for i in range(0, len(MetaUtils.stream_attr_names))
            if MetaUtils.get_attr_metadata(i, 'Type') == "categorical"
        ]
        nominal_indx = ','.join(str(i) for i in nominal_indx)

        ExternalProcesses.run_process(
            OrderedDict([('p_type', 'java'), ('t_type', 'weka'),
                         ('jclass', weka_filter_class), ('i', arff_path),
                         ('o', corrected_path), ('R', nominal_indx)]))

        return corrected_path
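
Again assuming a local weka.jar, the same NumericToNominal correction could be run directly; -R takes the 1-based attribute indices to convert, and -i/-o the input and output ARFF paths (all values below are placeholders):

import subprocess

def correct_arff_types_sketch(arff_path, corrected_path, nominal_indx, weka_jar='weka.jar'):
    # Convert the listed numeric attributes to nominal and write the corrected ARFF file.
    cmd = ['java', '-cp', weka_jar,
           'weka.filters.unsupervised.attribute.NumericToNominal',
           '-R', nominal_indx, '-i', arff_path, '-o', corrected_path]
    subprocess.run(cmd, check=True)
    return corrected_path
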
Example #8
 def normalize(v, method='minmax', new_min=0, new_max=1):
     """
     Normalize a vector of numeric values to a standardized measurement, for efficient distance calculations.
     For the z-score methods, each numeric variable f is standardized as z_if = (x_if - mean_f) / s_f,
     where s_f is either the standard deviation or the mean absolute deviation of f.
     :param v: Vector to be standardized
     :param method: Method used for normalization:
      'minmax': linear transformation of the original data to the range [new_min, new_max]:
               ((val - min_val) / (max_val - min_val)) * (new_max - new_min) + new_min
      'zscore_std': z-score normalization (zero mean, unit variance); the denominator is the standard deviation:
               (val - mean) / std
      'zscore_mad': z-score normalization (zero mean); the denominator is the mean absolute deviation:
               (val - mean) / mad
     :param new_max: New maximum value of the range to normalize to (Default: 1).
     :param new_min: New minimum value of the range to normalize to (Default: 0).
     :return: Standardized vector
     """
     standardize = []
     for i, val in enumerate(v):
         if not isinstance(val, str):
             if method == 'minmax':  # linear transformation of the original data to the range [new_min, new_max]
                 min_val = MetaUtils.get_attr_metadata(i, 'Min_Val')
                 max_val = MetaUtils.get_attr_metadata(i, 'Max_Val')
                 standardize.append((float(val - min_val) / (max_val - min_val)) * (new_max - new_min) + new_min)
             elif method == 'zscore_std':
                 mean = MetaUtils.get_attr_metadata(i, 'Mean')
                 s = MetaUtils.get_attr_metadata(i, 'std')
                 standardize.append((val - mean) / s)
             elif method == 'zscore_mad':
                 mean = MetaUtils.get_attr_metadata(i, 'Mean')
                 s = MetaUtils.get_attr_metadata(i, 'mad')
                 standardize.append((val - mean) / s)
         else:
             standardize.append(val)
     return standardize
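
The three normalization variants reduce to short closed forms; the sketch below computes the statistics from the data itself rather than looking them up in the stream metadata, which is a simplification of the original per-attribute approach:

def normalize_column_sketch(col, method='minmax', new_min=0.0, new_max=1.0):
    n = float(len(col))
    mean = sum(col) / n
    if method == 'minmax':                        # linear rescaling to [new_min, new_max]
        lo, hi = min(col), max(col)
        return [(v - lo) / (hi - lo) * (new_max - new_min) + new_min for v in col]
    elif method == 'zscore_std':                  # zero mean, unit variance
        std = (sum((v - mean) ** 2 for v in col) / n) ** 0.5
        return [(v - mean) / std for v in col]
    elif method == 'zscore_mad':                  # zero mean, mean-absolute-deviation scaling
        mad = sum(abs(v - mean) for v in col) / n
        return [(v - mean) / mad for v in col]
    raise ValueError('unknown method: %s' % method)

print(normalize_column_sketch([10, 20, 30, 40]))  # [0.0, 0.333..., 0.666..., 1.0]
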