Exemplo n.º 1
0
def clustering_oka(nec_set, k=25):
    """
    Group records into clusters according to NCP. OKA: one-pass k-means.

    :param nec_set: iterable of cluster objects; each must support len().
        Clusters smaller than k are merged via merge_cluster / add_record.
    :param k: size threshold used to decide when a cluster is complete
    :return: list of the resulting clusters
    """
    # Split the input in one pass: clusters that already reach k are
    # candidates right away, the smaller ones still have to be merged.
    can_clusters = []
    small_clusters = []
    for cluster in nec_set:
        if len(cluster) >= k:
            can_clusters.append(cluster)
        else:
            small_clusters.append(cluster)
    remain = sum(len(t) for t in small_clusters)
    clusters = []
    # Randomly choose seeds among the small clusters; the others are merged
    # into candidates afterwards. Floor division is required here:
    # random.sample needs an integer count and `/` yields a float on Python 3.
    seed_index = random.sample(range(len(small_clusters)), remain // k)
    for index in seed_index:
        can_clusters.append(small_clusters[index])
    # Build the lookup set once instead of once per filtered element.
    seed_set = set(seed_index)
    nec_set = [t for i, t in enumerate(small_clusters) if i not in seed_set]
    while nec_set:
        nec = nec_set.pop()
        # presumably selects the candidate with the least information loss
        # -- confirm against find_best_cluster_iloss
        index = find_best_cluster_iloss(nec, can_clusters)
        can_clusters[index].merge_cluster(nec)
    residual = []
    less_clusters = []
    for cluster in can_clusters:
        if len(cluster) < k:
            less_clusters.append(cluster)
        else:
            if len(cluster) > k:
                # presumably moves the surplus records into residual
                # -- confirm against adjust_cluster
                adjust_cluster(cluster, residual, k)
            clusters.append(cluster)
    # Hand residual records out: undersized clusters are served first and are
    # promoted to the result as soon as they reach k.
    while residual:
        record = residual.pop()
        record_key = qid_to_key(record[:QI_LEN])
        if less_clusters:
            index = find_best_cluster_iloss(record, less_clusters)
            less_clusters[index].add_record(record)
            residual_handle(residual, record_key, less_clusters[index])
            if len(less_clusters[index]) >= k:
                clusters.append(less_clusters.pop(index))
        else:
            index = find_best_cluster_iloss(record, clusters)
            clusters[index].add_record(record)
            residual_handle(residual, record_key, clusters[index])
    # Sometimes the residual records cannot fill every undersized cluster,
    # so the leftovers are dissolved and their members spread over the
    # finished clusters.
    # NOTE(review): if `clusters` is empty at this point the lookup below has
    # nothing to choose from -- confirm how find_best_cluster_iloss behaves.
    if less_clusters:
        for cluster in less_clusters:
            residual.extend(cluster.member)
        while residual:
            record = residual.pop()
            record_key = qid_to_key(record[:QI_LEN])
            index = find_best_cluster_iloss(record, clusters)
            clusters[index].add_record(record)
            residual_handle(residual, record_key, clusters[index])
    return clusters
def clustering_oka(nec_set, k=25):
    """
    Group records into clusters according to NCP. OKA: one-pass k-means.

    :param nec_set: iterable of cluster objects; each must support len().
        Clusters smaller than k are merged via merge_cluster / add_record.
    :param k: size threshold used to decide when a cluster is complete
    :return: list of the resulting clusters
    """
    # Split the input in one pass: clusters that already reach k are
    # candidates right away, the smaller ones still have to be merged.
    can_clusters = []
    small_clusters = []
    for cluster in nec_set:
        if len(cluster) >= k:
            can_clusters.append(cluster)
        else:
            small_clusters.append(cluster)
    remain = sum(len(t) for t in small_clusters)
    clusters = []
    # Randomly choose seeds among the small clusters; the others are merged
    # into candidates afterwards. Floor division is required here:
    # random.sample needs an integer count and `/` yields a float on Python 3.
    seed_index = random.sample(range(len(small_clusters)), remain // k)
    for index in seed_index:
        can_clusters.append(small_clusters[index])
    # Build the lookup set once instead of once per filtered element.
    seed_set = set(seed_index)
    nec_set = [t for i, t in enumerate(small_clusters) if i not in seed_set]
    while nec_set:
        nec = nec_set.pop()
        # presumably selects the candidate with the least information loss
        # -- confirm against find_best_cluster_iloss
        index = find_best_cluster_iloss(nec, can_clusters)
        can_clusters[index].merge_cluster(nec)
    residual = []
    less_clusters = []
    for cluster in can_clusters:
        if len(cluster) < k:
            less_clusters.append(cluster)
        else:
            if len(cluster) > k:
                # presumably moves the surplus records into residual
                # -- confirm against adjust_cluster
                adjust_cluster(cluster, residual, k)
            clusters.append(cluster)
    # Hand residual records out: undersized clusters are served first and are
    # promoted to the result as soon as they reach k.
    while residual:
        record = residual.pop()
        record_key = qid_to_key(record[:QI_LEN])
        if less_clusters:
            index = find_best_cluster_iloss(record, less_clusters)
            less_clusters[index].add_record(record)
            residual_handle(residual, record_key, less_clusters[index])
            if len(less_clusters[index]) >= k:
                clusters.append(less_clusters.pop(index))
        else:
            index = find_best_cluster_iloss(record, clusters)
            clusters[index].add_record(record)
            residual_handle(residual, record_key, clusters[index])
    # Sometimes the residual records cannot fill every undersized cluster,
    # so the leftovers are dissolved and their members spread over the
    # finished clusters.
    # NOTE(review): if `clusters` is empty at this point the lookup below has
    # nothing to choose from -- confirm how find_best_cluster_iloss behaves.
    if less_clusters:
        for cluster in less_clusters:
            residual.extend(cluster.member)
        while residual:
            record = residual.pop()
            record_key = qid_to_key(record[:QI_LEN])
            index = find_best_cluster_iloss(record, clusters)
            clusters[index].add_record(record)
            residual_handle(residual, record_key, clusters[index])
    return clusters
Exemplo n.º 3
0
def NCP(record):
    """Return the NCP (Normalized Certainty Penalty) of a generalized record.

    The penalty is the sum, over the first QI_LEN attributes, of the
    attribute's width divided by QI_RANGE[i]. Results are memoized in
    NCP_CACHE under the key produced by qid_to_key(record).
    """
    cache_key = qid_to_key(record)
    try:
        return NCP_CACHE[cache_key]
    except KeyError:
        # not cached yet, compute it below
        pass
    total = 0.0
    for pos in range(QI_LEN):
        if IS_CAT[pos] is False:
            # Numeric attribute: a value that parses as a single number has
            # zero width, otherwise it is read as a "low,high" interval.
            span = 0.0
            try:
                float(record[pos])
            except ValueError:
                bounds = record[pos].split(',')
                span = float(bounds[1]) - float(bounds[0])
        else:
            # Categorical attribute: width is the length of the entry that
            # ATT_TREES holds for this value.
            span = len(ATT_TREES[pos][record[pos]]) * 1.0
        total += span / QI_RANGE[pos]
    NCP_CACHE[cache_key] = total
    return total
def NCP(record):
    """Return the NCP (Normalized Certainty Penalty) of a generalized record.

    The penalty is the sum, over the first QI_LEN attributes, of the
    attribute's width divided by QI_RANGE[i]. Results are memoized in
    NCP_CACHE under the key produced by qid_to_key(record).
    """
    cache_key = qid_to_key(record)
    try:
        return NCP_CACHE[cache_key]
    except KeyError:
        # not cached yet, compute it below
        pass
    total = 0.0
    for pos in range(QI_LEN):
        if IS_CAT[pos] is False:
            # Numeric attribute: a value that parses as a single number has
            # zero width, otherwise it is read as a "low,high" interval.
            span = 0.0
            try:
                float(record[pos])
            except ValueError:
                bounds = record[pos].split(',')
                span = float(bounds[1]) - float(bounds[0])
        else:
            # Categorical attribute: width is the length of the entry that
            # ATT_TREES holds for this value.
            span = len(ATT_TREES[pos][record[pos]]) * 1.0
        total += span / QI_RANGE[pos]
    NCP_CACHE[cache_key] = total
    return total
Exemplo n.º 5
0
def residual_handle(residual, record_key, cluster):
    """Move matching records from the tail of residual into cluster.

    Records are popped from the end of residual and added to cluster for as
    long as the key of the last record's first QI_LEN fields equals
    record_key. Stops at the first non-matching record or when residual
    is empty.
    """
    while residual and record_key == qid_to_key(residual[-1][:QI_LEN]):
        cluster.add_record(residual.pop())
def residual_handle(residual, record_key, cluster):
    """Move matching records from the tail of residual into cluster.

    Records are popped from the end of residual and added to cluster for as
    long as the key of the last record's first QI_LEN fields equals
    record_key. Stops at the first non-matching record or when residual
    is empty.
    """
    while residual and record_key == qid_to_key(residual[-1][:QI_LEN]):
        cluster.add_record(residual.pop())
Exemplo n.º 7
0
def create_nec(data):
    """
    Create NECs from a dataset using a dict.

    Records whose first QI_LEN fields map to the same key (via qid_to_key)
    end up in the same Cluster.

    :param data: dataset, an iterable of records
    :return: NEC in dict format: key is the qid_to_key result, value is Cluster
    """
    nec_dict = {}
    for record in data:
        key = qid_to_key(record[:QI_LEN])
        # Only the lookup is guarded: a KeyError raised inside
        # add_same_record must propagate instead of silently replacing the
        # cluster that was already accumulated for this key.
        try:
            cluster = nec_dict[key]
        except KeyError:
            nec_dict[key] = Cluster([record], record)
        else:
            cluster.add_same_record(record)
    return nec_dict
def create_nec(data):
    """
    Create NECs from a dataset using a dict.

    Records whose first QI_LEN fields map to the same key (via qid_to_key)
    end up in the same Cluster.

    :param data: dataset, an iterable of records
    :return: NEC in dict format: key is the qid_to_key result, value is Cluster
    """
    nec_dict = {}
    for record in data:
        key = qid_to_key(record[:QI_LEN])
        # Only the lookup is guarded: a KeyError raised inside
        # add_same_record must propagate instead of silently replacing the
        # cluster that was already accumulated for this key.
        try:
            cluster = nec_dict[key]
        except KeyError:
            nec_dict[key] = Cluster([record], record)
        else:
            cluster.add_same_record(record)
    return nec_dict