def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """
    1.  get distinct user ids
    2.  for each user id, compute which group it should belong to
        2.1 convert the data to an Orange data table
        2.2 kmeans
    3.  save them into database
    """
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    h1, uids = sqlite.get_distinct_user_id()

    user_group_dict = {}

    for uid in uids:
        # retrieve the user group info of a specific user
        h2, uid_group_info = sqlite.get_group_info_by_user_id(uid[SQLDao.LABEL_USER_GROUP_INFO_USERID])
        # convert the uid group info into the orange data table
        features = []
        features.append(Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT))
        domain = Orange.data.Domain(features, False)
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(), Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_USERID)
        )
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(), Orange.feature.Continuous(SQLDao.LABEL_USER_GROUP_INFO_GROUPID)
        )
        datas = []
        for i in uid_group_info:
            data = Orange.data.Instance(domain, [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT]])
            data[SQLDao.LABEL_USER_GROUP_INFO_USERID] = i[SQLDao.LABEL_USER_GROUP_INFO_USERID]
            data[SQLDao.LABEL_USER_GROUP_INFO_GROUPID] = i[SQLDao.LABEL_USER_GROUP_INFO_GROUPID]
            datas.append(data)

        table = Orange.data.Table(domain, datas)
        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(data=table, distance=GroupCountDistance)
            clusters = km.clusters
            d = {}
            for idx, c_label in enumerate(clusters):
                if c_label in d:
                    d[c_label].append(table[idx])
                else:
                    d[c_label] = [table[idx]]

            if len(d) == 3:
                # figure out which cluster contains the largest group counts
                max_label = None
                max_value = -1
                for label, instances in d.items():
                    temp_list = [i[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value for i in instances]
                    if max(temp_list) > max_value:
                        max_value = max(temp_list)
                        max_label = label
                for instance in d[max_label]:
                    target_instances.append(instance)
        else:
            # too few rows to cluster: just pick the group with the largest group_count, if it is large enough
            table.sort([SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT])
            if table[-1][SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value > 20:
                target_instances.append(table[-1])

        # print 'processing %s'%uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]
        user_group_dict[uid[SQLDao.LABEL_USER_GROUP_INFO_USERID]] = target_instances

    print "finish cluster"
    sqlite.save_user_group_clustered(user_group_dict)
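
# GroupCountDistance is referenced above but never defined in this listing. Below is
# a minimal sketch of what such a distance constructor might look like; this is an
# assumption, not the original implementation. In Orange 2.x the `distance` argument
# of Orange.clustering.kmeans.Clustering is called with the data table to obtain a
# measure, which is then called with pairs of instances.
class GroupCountDistance(object):
    def __init__(self, data=None):
        # the data table is not needed for this simple one-dimensional distance
        pass

    def __call__(self, e1, e2):
        # compare two instances by their group_count attribute only
        return abs(e1[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value -
                   e2[SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT].value)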
Example #2
    def KMeansClustering_Iterative(self, step=False):
        '''
        Integrate Orange here.
        There is a subtlety here:
        1.  k-means is probably not the best way to decide which community a node (user) belongs to,
            but it is the most general-purpose choice.

        2.  TODO: it may be better to tune the number of clusters manually (rather than determine it automatically), but check the results first.
        '''
        eig_data = self.build_orange_data_from_eig_vector()
        # clustering
        self.km = Orange.clustering.kmeans.Clustering(data=eig_data,
                                                      centroids=5,
                                                      distance=EigDistance)
        # build an igraph VertexClustering from the result so that modularity can be computed; it is no longer reasonable to use Orange to insert the result into the database
        clusters = self.km.clusters

        d = {}
        for idx, c in enumerate(clusters):
            if c not in d:
                d[c] = [idx]
            else:
                d[c].append(idx)

        import Queue
        q = Queue.Queue()

        for v in d.values():
            q.put(v)

        res_list = []

        import CommunityExtraction as ce
        while q.qsize() > 0:
            v = q.get()
            print 'qsize:%s cluster size: %s res list size: %s' % (
                q.qsize(), len(v), len(res_list))
            if len(v) < ce.CRITERION_CLUSTER_NODES_LOWER_BOUND:
                res_list.append(v)
            elif len(v) > ce.CRITERION_CLUSTER_NODES_UPPER_BOUND:
                # this cluster is too large: split it again and re-queue the pieces
                sub_data = eig_data.get_items(v)
                sub_km = Orange.clustering.kmeans.Clustering(
                    data=sub_data, centroids=5, distance=EigDistance)
                sub_clusters = sub_km.clusters
                temp_d = dict()
                for idx, c in enumerate(sub_clusters):
                    if c not in temp_d:
                        temp_d[c] = [v[idx]]
                    else:
                        temp_d[c].append(v[idx])

                for sub_v in temp_d.values():
                    q.put(sub_v)
            else:
                res_list.append(v)

        clusters = [0] * len(eig_data)
        for idx, res in enumerate(res_list):
            for r in res:
                clusters[r] = idx

        import igraph
        self.vertex_clustering = igraph.clustering.VertexClustering(
            self.g, clusters)
        print 'writing vertex_clustering'
        FSDao.write_pickle(self.expr_dir,
                           SQLDao.ce.properties['vertex_clustering_file_name'],
                           self.vertex_clustering)
        print 'finished writing vertex_clustering'
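
# The comment in KMeansClustering_Iterative notes that a VertexClustering is built so
# that modularity can be computed. A minimal usage sketch follows; it assumes `g` is
# the igraph.Graph used above and `membership` is the cluster-id list built there.
import igraph

def report_modularity(g, membership):
    # igraph exposes modularity as a property of VertexClustering
    vc = igraph.clustering.VertexClustering(g, membership)
    print 'modularity: %s' % vc.modularity
    return vc.modularity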