Example #1
import logging

import numpy

import toolkit  # project-local helpers (euclidean_dist, ...)


def find_distinct_distance(normalized_data_set):
    """
    Find the median of the distances that can distinguish the data.
    :param normalized_data_set: the dataset to calculate on. The dataset must be normalized.
    :return: the distinguishing distance as a float
    """

    n = len(normalized_data_set)

    # find all the classes present in the dataset
    classes = list(set(i[-1] for i in normalized_data_set))
    assert len(classes) > 1, "unfortunately, all selected data are in the same class."

    # map each class to the indices of its instances
    classes_index = dict()
    for c in classes:
        classes_index[c] = [index for index, i in enumerate(normalized_data_set) if i[-1] == c]

    # for each instance, record the distance to its nearest different-class neighbor
    distances = []
    for data in normalized_data_set:
        c = data[-1]
        diff_class_data_indices = [index for index in range(n) if index not in classes_index[c]]
        distances.append(min(toolkit.euclidean_dist(data, normalized_data_set[index])
                             for index in diff_class_data_indices))

    median_dist = numpy.median(numpy.array(distances))
    logging.debug("The distinguishing distance is %f" % median_dist)
    return median_dist
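
A minimal usage sketch (the toy rows below are hypothetical; it assumes every row is normalized to [0, 1] with the class label in the last column, and that toolkit.euclidean_dist accepts two full rows):

toy_rows = [  # hypothetical normalized data: two attributes plus a class label
    [0.1, 0.2, 0],
    [0.2, 0.1, 0],
    [0.8, 0.9, 1],
    [0.9, 0.8, 1],
]
print(find_distinct_distance(toy_rows))  # median nearest cross-class distance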
Example #2
import toolkit  # project-local helpers (euclidean_dist, ...)


def whether_add_to_private_cache(data_instance, existed_cache, distinguish_distance):
    """
    Decide whether an instance is distinct enough to be added to the private cache.
    :param data_instance: the candidate instance; this should be normalized
    :param existed_cache: the instances already in the cache; these should be normalized
    :param distinguish_distance: the minimum distance at which two instances count as distinct
    :return: True if data_instance is farther than distinguish_distance from every cached instance
    """
    # TODO: do we need to check the class as well?
    for data in existed_cache:
        if toolkit.euclidean_dist(data_instance, data) <= distinguish_distance:
            return False

    return True
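
A hedged usage sketch: filtering a stream of normalized instances into a private cache. Here normalized_stream and distinguish_distance are hypothetical names standing in for the caller's data:

cache = []
for instance in normalized_stream:  # hypothetical iterable of normalized rows
    if whether_add_to_private_cache(instance, cache, distinguish_distance):
        cache.append(instance)  # keep only instances distinct from everything cached so far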
Example #3
import math
import random

import toolkit  # project-local helpers (euclidean_dist, ...)


def simplify_morph(data, alpha, beta):
    """
    Same as MORPH, but requires that
    1) the data is already prepared: no header, class in the last column
    2) the data has been normalized
    """
    classes = [row[-1] for row in data]  # fetch the classes from the last column
    data = [list(row) for row in data]  # copy the rows so the caller's data is not mutated

    for row_index, row in enumerate(data):  # for each row
        # indices of all instances that belong to a different class
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        # distance to the nearest instance of a different class
        boundary_dist = min(
            toolkit.euclidean_dist(row, data[heg])
            for heg in heterogeneous_index
        )
        boundary_dist /= math.sqrt(len(data[0]) - 2)
        for i in range(len(row) - 1):  # shake every attribute, but leave the class column intact
            data[row_index][i] += boundary_dist * random.uniform(
                alpha, beta) * random.choice([1, -1])
    return data
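
A minimal sketch of calling it (the rows are hypothetical toy values; it assumes the data is normalized with a numeric class label in the last column):

shaken = simplify_morph(
    [[0.1, 0.2, 0], [0.9, 0.8, 1], [0.2, 0.3, 0], [0.8, 0.7, 1]],  # hypothetical normalized rows
    alpha=0.15, beta=0.35)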
Example #4
import math
import random

import toolkit  # project-local helpers (str2num, apply_bin_range, attr_norm, euclidean_dist, ...)


def morph(attribute_names,
          data_matrix,
          independent_attrs,
          objective_attr,
          objective_as_binary=False,
          data_has_normalized=False,
          alpha=0.15,
          beta=0.35):
    """
    MORPH is an instance mutator that shakes each instance within its class boundary.
    :param attribute_names: the names of the attributes; must match the columns of data_matrix
    :param data_matrix: the original data
    :param independent_attrs: the independent attributes in the dataset. Note: 'name', 'id', etc. might not be
        considered independent attributes
    :param objective_attr: which attribute is the objective to be considered
    :param objective_as_binary: whether to treat the objective as a binary attribute. Default: False
    :param data_has_normalized: whether the data matrix has already been normalized
    :param alpha: MORPH algorithm parameter (lower bound of the shake)
    :param beta: MORPH algorithm parameter (upper bound of the shake)
    :return: the morphed data, in the same layout as data_matrix
    """
    # split the columns into the independent attributes and the objective
    dataset_t = [list(col) for col in zip(*data_matrix)]
    dataset = list()
    classes = list()
    for d, a in zip(dataset_t, attribute_names):
        if a in independent_attrs:
            dataset.append(d)
        if a == objective_attr:
            classes = list(d)

    dataset = [list(row) for row in zip(*dataset)]
    dataset = [[toolkit.str2num(v) for v in row] for row in dataset]  # str to numeric
    classes = [toolkit.str2num(c) for c in classes]

    if objective_as_binary:
        classes = [1 if i > 0 else 0 for i in classes]
    else:
        classes = toolkit.apply_bin_range(classes)

    # save which columns hold ints, for a better representation of the output table
    is_int = [type(i) is int for i in dataset[0]]

    if data_has_normalized:
        # add two instances (all zeros and all ones) so that the normalization and
        # de-normalization process do not damage the original data
        dataset.append([0] * len(dataset[0]))
        dataset.append([1] * len(dataset[0]))

    # dataset transposed mode begins...
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    norm_funcs = []
    denorm_funcs = []

    # normalizing
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        f1, f2 = toolkit.attr_norm(attr_elements)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
        dataset[attr_index] = [f1(v) for v in attr_elements]

    # dataset mode recovers...
    dataset = [list(row) for row in zip(*dataset)]  # transpose again

    for row_index, row in enumerate(dataset):  # for each row
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        boundary_dist = min(
            toolkit.euclidean_dist(row, dataset[heg])
            for heg in heterogeneous_index
        )
        boundary_dist /= math.sqrt(len(independent_attrs) - 1)
        for i in range(len(row)):
            dataset[row_index][i] += boundary_dist * random.uniform(
                alpha, beta) * random.choice([1, -1])  # shake

    # dataset transposed mode begins...
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        # scale back to the original range
        dataset[attr_index] = [denorm_funcs[attr_index](v) for v in attr_elements]
        for i in range(len(dataset[attr_index])):
            if is_int[attr_index]:
                dataset[attr_index][i] = int(round(dataset[attr_index][i]))  # round when needed
            else:
                dataset[attr_index][i] = round(dataset[attr_index][i], 4)
    morphed = [list(row) for row in zip(*dataset)]  # recover the original mode; morph done

    if data_has_normalized:
        morphed = morphed[:-2]  # drop the two sentinel instances

    # stitch the morphed independent attributes back into the original rows
    res = list()
    for x, dm in zip(morphed, data_matrix):
        row = list()
        tmp = 0
        for attri, attr in enumerate(attribute_names):
            if attr in independent_attrs:
                row.append(x[tmp])
                tmp += 1
            elif attr == objective_attr:
                row.append(toolkit.str2num(dm[attri]))
            else:
                row.append(dm[attri])
        res.append(row)

    return res
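
A hedged usage sketch with hypothetical column names and values (it assumes toolkit.str2num, toolkit.apply_bin_range, toolkit.attr_norm, and toolkit.euclidean_dist behave as the code above expects):

names = ['id', 'x1', 'x2', 'bug']  # hypothetical attribute names
matrix = [  # hypothetical raw rows, strings as read from a CSV
    ['a', '1.0', '2.0', '0'],
    ['b', '2.0', '1.0', '1'],
    ['c', '1.5', '2.5', '0'],
    ['d', '2.5', '1.5', '1'],
]
mutated = morph(names, matrix,
                independent_attrs=['x1', 'x2'],
                objective_attr='bug',
                objective_as_binary=True)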
Example #5
import copy
import csv
import math
import random

import settings  # project-local configuration (MORPH_alpha, MORPH_beta)
import toolkit  # project-local helpers (str2num, attr_norm, euclidean_dist, ...)


def MORPH(database, db_folder='not_from_csv_file', write_out_folder=None,
          db_has_normalized=False, effect_scope=None):
    """
    MORPH is an instance mutator that shakes each instance within its class boundary.
    :param database: the original data, or the CSV file name (without extension) when db_folder is given
    :param db_folder: where to fetch the database from; by default, the database is a list of lists
    :param write_out_folder: the folder to write the result to. None means no writing
    :param db_has_normalized: whether the database has already been normalized
    :param effect_scope: the [start, end) range of rows that is backed up before morphing and restored
        afterwards, so the data in that range remains the same
    :return: the morphed data
    """
    if effect_scope is None:  # avoid a mutable default argument: this function modifies effect_scope
        effect_scope = [0, -1]
    alpha = settings.MORPH_alpha
    beta = settings.MORPH_beta

    # load the database
    if db_folder != 'not_from_csv_file':
        with open(db_folder + '/' + database + '.csv', 'r') as db:
            reader = csv.reader(db)
            attributes = next(reader)  # including the last one -- the class tag
            dataset = []
            for line in reader:
                dataset.append(line)
        dataset = [[toolkit.str2num(v) for v in row] for row in dataset]  # str to numeric
    else:
        dataset = database
        attributes = ['foo'] * len(dataset[0])

    # back up the data that must not be morphed
    if effect_scope[1] < 0:
        effect_scope[1] += len(dataset) + 1
    backup_data_set = copy.deepcopy(dataset[effect_scope[0]:effect_scope[1]])

    if db_has_normalized:
        # add two instances (all zeros and all ones) so that the normalization and
        # de-normalization process do not damage the original data
        dataset.append([0] * len(dataset[0]))
        dataset.append([1] * len(dataset[0]))

    is_int = [type(i) is int for i in dataset[0]]  # save which columns hold ints, for a better output table
    classes = [row[-1] for row in dataset]  # fetch the classes
    dataset = [row[:-1] for row in dataset]  # separate the raw data from the class

    # dataset transposed mode begins...
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    norm_funcs = []
    denorm_funcs = []

    # normalizing
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        f1, f2 = toolkit.attr_norm(attr_elements)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
        dataset[attr_index] = [f1(v) for v in attr_elements]

    # dataset mode recovers...
    dataset = [list(row) for row in zip(*dataset)]  # transpose again

    for row_index, row in enumerate(dataset):  # for each row
        heterogeneous_index = [i for i in range(len(classes)) if classes[i] != classes[row_index]]
        boundary_dist = min(toolkit.euclidean_dist(row, dataset[heg]) for heg in heterogeneous_index)
        boundary_dist /= math.sqrt(len(attributes) - 1)
        for i in range(len(row)):
            dataset[row_index][i] += boundary_dist * random.uniform(alpha, beta) * random.choice([1, -1])  # shake

    # dataset transposed mode begins...
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        dataset[attr_index] = [denorm_funcs[attr_index](v) for v in attr_elements]  # scale back to the original
        for i in range(len(dataset[attr_index])):
            if is_int[attr_index]:
                dataset[attr_index][i] = int(round(dataset[attr_index][i]))  # round when needed
    morphed = [list(row) for row in zip(*dataset)]  # recover the original mode; MORPH done

    # write out and return the results
    for row_index in range(len(morphed)):
        morphed[row_index].append(classes[row_index])
    header_offset = 0
    if db_folder != 'not_from_csv_file':
        morphed.insert(0, attributes)
        header_offset = 1  # the header row shifts every data row down by one

    if write_out_folder:
        with open(write_out_folder + '/' + database + '.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(morphed)

    if db_has_normalized:
        morphed = morphed[:-2]  # drop the two sentinel instances

    # restore the data that was not supposed to be morphed
    for backup, row_index in zip(backup_data_set, range(effect_scope[0], effect_scope[1])):
        morphed[row_index + header_offset] = backup

    return morphed
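
A hedged usage sketch (the folder and file names are hypothetical; it assumes settings.MORPH_alpha and settings.MORPH_beta are configured and that data/ant.csv exists):

morphed_rows = MORPH('ant',  # hypothetical CSV name, i.e. data/ant.csv
                     db_folder='data',  # hypothetical input folder
                     write_out_folder='morphed',  # hypothetical output folder
                     effect_scope=[0, 100])  # rows 0..99 are restored unchanged afterwards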