def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    """Guess the dominant group(s) of every user and save the result.

    1. get distinct user ids
    2. for each user id, compute which group(s) it should belong to
       2.1 convert the user's group rows to an Orange data table
       2.2 run k-means on the group counts and keep the members of the
           cluster that contains the largest group count
    3. save the per-user result into the database

    Users with three or fewer group rows are not clustered; for them the
    row with the largest group count is kept only when that count is
    greater than 20.

    Both arguments are passed unchanged to ``SQLDao.SQLiteDao``.
    Nothing is returned; the result is handed to
    ``save_user_group_clustered`` as a dict mapping user id to a list of
    Orange instances.
    """
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    # Both DAO queries return a pair; only the second element is used here.
    _, uids = sqlite.get_distinct_user_id()

    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT

    user_group_dict = {}
    for uid in uids:
        # retrieve the group info of this specific user
        _, uid_group_info = sqlite.get_group_info_by_user_id(uid[userid_label])

        # convert the group info into an Orange data table: the group count
        # is the only feature, user id and group id are attached as metas
        domain = Orange.data.Domain(
            [Orange.feature.Continuous(count_label)], False)
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(userid_label))
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(groupid_label))
        datas = []
        for row in uid_group_info:
            data = Orange.data.Instance(domain, [row[count_label]])
            data[userid_label] = row[userid_label]
            data[groupid_label] = row[groupid_label]
            datas.append(data)
        table = Orange.data.Table(domain, datas)

        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(
                data=table, distance=GroupCountDistance)
            # bucket the instances by the cluster label assigned to them
            members_by_label = {}
            for idx, c_label in enumerate(km.clusters):
                members_by_label.setdefault(c_label, []).append(table[idx])
            # the result is only used when exactly three clusters came out
            if len(members_by_label) == 3:
                # figure out which cluster contains the largest group count
                max_label = None
                max_value = -1
                for label, instances in members_by_label.items():
                    cluster_max = max(
                        inst[count_label].value for inst in instances)
                    if cluster_max > max_value:
                        max_value = cluster_max
                        max_label = label
                target_instances.extend(members_by_label[max_label])
        else:
            # NOTE(review): the source layout was collapsed, so pairing this
            # fallback with ``len(table) > 3`` is an assumption -- confirm.
            # just pick the group which has the largest group_count if it
            # is large enough?
            table.sort([count_label])
            # an empty table has no last row; leave the result empty then
            if len(table) > 0 and table[-1][count_label].value > 20:
                target_instances.append(table[-1])
        user_group_dict[uid[userid_label]] = target_instances

    # parenthesised form prints the same text under Python 2 and Python 3
    print("finish cluster")
    sqlite.save_user_group_clustered(user_group_dict)
def Guess_User_Group_by_KMeans(db_dir, db_file_name):
    '''Guess the dominant group(s) of every user and save the result.

    1. get distinct user ids
    2. for each user id, compute which group(s) it should belong to
       2.1 convert the user's group rows to an Orange data table
       2.2 run k-means on the group counts and keep the members of the
           cluster that contains the largest group count
    3. save the per-user result into the database

    Users with three or fewer group rows are not clustered; for them the
    row with the largest group count is kept only when that count is
    greater than 20.

    Both arguments are passed unchanged to ``SQLDao.SQLiteDao``.
    Nothing is returned; the result is handed to
    ``save_user_group_clustered`` as a dict mapping user id to a list of
    Orange instances.
    '''
    sqlite = SQLDao.SQLiteDao(db_dir, db_file_name)
    # Both DAO queries return a pair; only the second element is used here.
    _, uids = sqlite.get_distinct_user_id()

    userid_label = SQLDao.LABEL_USER_GROUP_INFO_USERID
    groupid_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPID
    count_label = SQLDao.LABEL_USER_GROUP_INFO_GROUPCOUNT

    user_group_dict = {}
    for uid in uids:
        # retrieve the group info of this specific user
        _, uid_group_info = sqlite.get_group_info_by_user_id(uid[userid_label])

        # convert the group info into an Orange data table: the group count
        # is the only feature, user id and group id are attached as metas
        domain = Orange.data.Domain(
            [Orange.feature.Continuous(count_label)], False)
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(userid_label))
        domain.add_meta(
            Orange.feature.Descriptor.new_meta_id(),
            Orange.feature.Continuous(groupid_label))
        datas = []
        for row in uid_group_info:
            data = Orange.data.Instance(domain, [row[count_label]])
            data[userid_label] = row[userid_label]
            data[groupid_label] = row[groupid_label]
            datas.append(data)
        table = Orange.data.Table(domain, datas)

        target_instances = []
        if len(table) > 3:
            km = Orange.clustering.kmeans.Clustering(
                data=table, distance=GroupCountDistance)
            # bucket the instances by the cluster label assigned to them
            members_by_label = {}
            for idx, c_label in enumerate(km.clusters):
                members_by_label.setdefault(c_label, []).append(table[idx])
            # the result is only used when exactly three clusters came out
            if len(members_by_label) == 3:
                # figure out which cluster contains the largest group count
                max_label = None
                max_value = -1
                for label, instances in members_by_label.items():
                    cluster_max = max(
                        inst[count_label].value for inst in instances)
                    if cluster_max > max_value:
                        max_value = cluster_max
                        max_label = label
                target_instances.extend(members_by_label[max_label])
        else:
            # NOTE(review): the source layout was collapsed, so pairing this
            # fallback with ``len(table) > 3`` is an assumption -- confirm.
            # just pick the group which has the largest group_count if it
            # is large enough?
            table.sort([count_label])
            # an empty table has no last row; leave the result empty then
            if len(table) > 0 and table[-1][count_label].value > 20:
                target_instances.append(table[-1])
        user_group_dict[uid[userid_label]] = target_instances

    # parenthesised form prints the same text under Python 2 and Python 3
    print('finish cluster')
    sqlite.save_user_group_clustered(user_group_dict)