def __init__(self):
    """Read the application properties and cache the database settings.

    The values are stored exactly as ``ReadProperties.get`` returns them;
    in particular the port is not converted to ``int`` here.
    """
    props = ReadProperties("data/app.properties")
    self.prop = props
    self.logger = LoggingUtil("data/logs/")
    # (attribute name, key looked up in data/app.properties)
    settings = (
        ("host", "db_host"),
        ("port", "db_port"),
        ("db", "db"),
        ("user_name", "user_name"),
        ("password", "password"),
    )
    for attribute, key in settings:
        setattr(self, attribute, props.get(key))
Exemplo n.º 2
0
 def __init__(self, media_ap_centers, media_ap_labels, media_ap_indices,
              mediaList_ap_centers, mediaList_ap_labels,
              mediaList_ap_indices):
     """Keep the six arguments on the instance and create the logger.

     The arguments are stored unchanged under attributes of the same name.
     NOTE(review): they look like output locations for clustering results
     (centers / labels / indices) -- confirm against the caller.
     """
     # Values for the "media" group.
     self.media_ap_centers = media_ap_centers
     self.media_ap_labels = media_ap_labels
     self.media_ap_indices = media_ap_indices
     # Values for the "mediaList" group.
     self.mediaList_ap_centers = mediaList_ap_centers
     self.mediaList_ap_labels = mediaList_ap_labels
     self.mediaList_ap_indices = mediaList_ap_indices
     self.logger = LoggingUtil("data/logs/")
Exemplo n.º 3
0
    def __init__(self):
        """Load configuration, reset the working state and log the limits.

        Database settings are read from ``data/app.properties``; the port is
        converted to ``int``. All result containers start out empty.
        """
        props = ReadProperties("data/app.properties")
        self.prop = props
        self.logger = LoggingUtil("data/logs/")

        # Database settings.
        self.host = props.get("db_host")
        self.port = int(props.get("db_port"))
        self.db = props.get("db")
        self.user = props.get("user_name")
        self.password = props.get("password")

        self.trainset, self.testset = {}, {}

        # n_sim_media: size of the "top 20" similar set,
        # n_rec_media: size of the "top 10" recommendation result.
        self.n_sim_media, self.n_rec_media = 20, 10

        # media_sim_mat: similarity between songs,
        # media_popular: number of times each song occurs,
        # media_count: total number of played songs.
        self.media_sim_mat = {}
        self.media_popular = {}
        self.media_count = 0
        self.rec_media_dict = {}
        self.all_rec_medias = {}

        messages = (
            ('Similar media number = %d', self.n_sim_media),
            ('Recommended media number = %d', self.n_rec_media),
        )
        for template, value in messages:
            self.logger.log().info(template % value)
 def __init__(self, media_relation_ap_centers, media_relation_ap_labels):
     """Store both arguments unchanged and create the logger and properties.

     NOTE(review): the two arguments look like output locations for the
     cluster centers and labels -- confirm against the caller.
     """
     self.media_relation_ap_labels = media_relation_ap_labels
     self.media_relation_ap_centers = media_relation_ap_centers
     log_util = LoggingUtil("data/logs/")
     self.logger = log_util
     properties = ReadProperties("data/app.properties")
     self.prop = properties
class MediaLabelKMeans:
    """Build a media x tag indicator matrix and cluster it with k-means.

    The two constructor arguments are handed to ``np.save`` as the
    destinations for the cluster centers and the per-media labels.
    """

    def __init__(self, media_relation_ap_centers, media_relation_ap_labels):
        """
        :param media_relation_ap_centers: ``np.save`` destination for the
            cluster centers
        :param media_relation_ap_labels: ``np.save`` destination for the
            labels
        """
        self.media_relation_ap_centers = media_relation_ap_centers
        self.media_relation_ap_labels = media_relation_ap_labels
        self.logger = LoggingUtil("data/logs/")
        self.prop = ReadProperties("data/app.properties")

    @staticmethod
    def _first_index(values):
        """Map each value to the position of its first occurrence.

        Gives the same answer as ``list.index`` but in O(1) per lookup.
        """
        positions = {}
        for position, value in enumerate(values):
            positions.setdefault(value, position)
        return positions

    @staticmethod
    def _nearest_sample_indices(data, centers):
        """Return, for every center, the row index of the closest sample.

        k-means centroids are not members of the data set, so this is the
        closest analogue of an "index of the cluster center".

        :param data: 2-D array-like of samples (rows)
        :param centers: 2-D array-like of cluster centers (rows)
        :return: 1-D integer array with one sample index per center
        """
        samples = np.asarray(data, dtype=float)
        nearest = [
            int(((samples - center) ** 2).sum(axis=1).argmin())
            for center in np.asarray(centers, dtype=float)
        ]
        return np.array(nearest, dtype=int)

    def isExists(self, file_name):
        """
        Check whether a path exists.
        :param file_name: path to test
        :return: boolean, True when the path exists
        """
        return os.path.exists(file_name)

    def file_read_fun(self, file_name, data):
        """
        Read a tab-separated file and append the second column of every
        line to ``data``.
        :param file_name: path of the file to read
        :param data: list that receives the values (modified in place)
        :return: ``data``; unchanged (and an error is logged) when the file
            does not exist
        :raises IndexError: when a line has no second column
        """
        if not self.isExists(file_name):
            self.logger.log().error("%s is not exists" % file_name)
            return data
        # The context manager closes the file even when a line is malformed.
        with open(file_name, encoding="utf-8") as f:
            for line in f:
                row = line.split("\t")
                data.append(row[1].rstrip("\n"))
        return data

    def media_tag_matrix_fun(self, media_list, tag_list,
                             media_tag_relation_file):
        """
        Build the media/tag indicator matrix.

        Every line of the relation file is ``media_id<TAB>tag``. A cell is
        set to 1.0 when the tag is listed in ``tag_list``; lines with other
        tags are ignored. A missing file yields an all-zero matrix.
        :param media_list: media ids, one per matrix row
        :param tag_list: tags, one per matrix column
        :param media_tag_relation_file: path of the relation file
        :return: array of shape (len(media_list), len(tag_list))
        :raises ValueError: when a line with a known tag names a media id
            that is not in ``media_list``
        """
        self.logger.log().info("media tag matrix building...")
        array = np.zeros((len(media_list), len(tag_list)))
        if self.isExists(media_tag_relation_file):
            # Dictionaries replace the per-line list.index() scans.
            row_of = self._first_index(media_list)
            column_of = self._first_index(tag_list)
            with open(media_tag_relation_file, encoding="utf-8") as f:
                for line in f:
                    row = line.split("\t")
                    media_id = row[0].strip()
                    tag = row[1].strip()
                    column = column_of.get(tag)
                    if column is None:
                        continue
                    if media_id not in row_of:
                        raise ValueError("%r is not in list" % media_id)
                    array[row_of[media_id], column] = 1.0
        self.logger.log().info("media tag matrix finished!!!")
        return array

    def k_Means_cluster_func(self, data, n_clusters=50):
        """
        Cluster ``data`` with k-means (k-means++ init, 10 restarts, at most
        300 iterations) and log the elapsed time.
        :param data: sample matrix passed to ``KMeans.fit``
        :param n_clusters: number of clusters (default 50, as before)
        :return: (cluster_centers_, labels_, cluster_centers_indices) where
            the last item holds, per cluster, the index of the sample that
            lies closest to the centroid
        """
        tim_pre = time.time()
        self.logger.log().info(tim_pre)
        k_means = KMeans(init='k-means++',
                         n_clusters=n_clusters,
                         n_init=10,
                         max_iter=300)
        arrayk = k_means.fit(data)
        duration = time.time() - tim_pre
        self.logger.log().info("the k-Means cluster used time %s" % duration)
        cluster_centers_ = arrayk.cluster_centers_
        labels_ = arrayk.labels_
        # KMeans (unlike AffinityPropagation) exposes no
        # cluster_centers_indices_ attribute, so derive the indices here.
        cluster_centers_indices = self._nearest_sample_indices(
            data, cluster_centers_)
        return cluster_centers_, labels_, cluster_centers_indices

    def k_Means_algrothm(self, array):
        """
        Cluster the given matrix; thin wrapper around
        ``k_Means_cluster_func``.
        :param array: sample matrix
        :return: cluster_centers_, labels_, cluster_centers_indices
        """
        return self.k_Means_cluster_func(data=array)

    def run_media_tag_algrothm(self):
        """
        Read the index and relation files named in the properties, cluster
        the media/tag matrix and save centers and labels with ``np.save``.
        :return: cluster_centers_, labels_, cluster_centers_indices
        """
        prop = ReadProperties("data/app.properties")
        # Index file listing the media ids.
        media_index_file = prop.get("all_mediaId_index_path")
        # Index file listing the tags.
        tag_index_file = prop.get("commonTag_index_path")
        # Relation file with one "media id / tag" pair per line.
        media_tag_relation_file = prop.get("mediaId_common_tag_path")
        media_list = self.file_read_fun(media_index_file, [])
        tag_list = self.file_read_fun(tag_index_file, [])
        array = self.media_tag_matrix_fun(media_list, tag_list,
                                          media_tag_relation_file)
        cluster_centers_, labels_, cluster_centers_indices = self.k_Means_algrothm(
            array)
        self.logger.log().info("ap cluster centers nums %s" %
                               cluster_centers_.shape[0])
        np.save(self.media_relation_ap_centers, cluster_centers_)
        np.save(self.media_relation_ap_labels, labels_)
        return cluster_centers_, labels_, cluster_centers_indices
 def __init__(self):
     """Create the properties reader and logger; start with no tag data."""
     properties = ReadProperties("data/app.properties")
     self.prop = properties
     log_util = LoggingUtil("data/logs/")
     self.logger = log_util
     # Filled later; starts out as an empty mapping.
     self.user_tag_dict = dict()
class UserCenterTagFn(object):
    """Collect the tag scores of each cluster-center user and store them.

    ``main`` fills ``user_tag_dict`` with
    ``{center_id: {user_id: [ {tag: score, ...} ]}}`` (the list is empty when
    the user has no tag scores); ``saveToSql`` writes that mapping to the
    ``x_media__user_userCenter_tag`` table.
    """

    def __init__(self):
        """Create the properties reader and logger; start with no tag data."""
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        self.user_tag_dict = {}

    def main(self):
        """Build ``self.user_tag_dict`` from the files named in the properties.

        The saved cluster-center indices are loaded with ``np.load``; each
        index is resolved to a user id through the user index file, and the
        user's tag scores (if any) are attached.
        """
        # The indices array holds, per cluster center, the position of the
        # representative user in the user index file.
        user_media_indices_file = self.prop.get("media_ap_indices") + ".npy"
        userCenter_user_relation = np.load(user_media_indices_file)
        user_file = self.prop.get("user_now_play_index_path")
        user_list = self.file_read_fun_list(fileName=user_file, data=[])
        user_tag_list = self.file_read_fun_dict(
            fileName=self.prop.get("user_play_tag_score_single_mat_path"),
            data={})

        user_tag_dict = {}
        for center_id in range(userCenter_user_relation.shape[0]):
            user_id = user_list[userCenter_user_relation[center_id]]
            tag_scores = []
            if user_id in user_tag_list:
                tag_scores.append(user_tag_list[user_id])
            user_tag_dict[center_id] = {user_id: tag_scores}
        self.user_tag_dict = user_tag_dict

    def _tag_rows(self, timestamp):
        """Flatten ``self.user_tag_dict`` into insertable rows.

        :param timestamp: value used for both create_time and update_time
        :return: list of (center_id, user_id, tag, score, timestamp,
            timestamp) tuples, one per tag of every center user
        """
        rows = []
        for center_id, users in self.user_tag_dict.items():
            for user_id, tag_maps in users.items():
                for tag_scores in tag_maps:
                    for tag, score in tag_scores.items():
                        rows.append((center_id, int(user_id), tag,
                                     float(score), timestamp, timestamp))
        return rows

    # Store into MySQL.
    def saveToSql(self):
        """Replace the content of ``x_media__user_userCenter_tag``.

        The table is truncated and then filled from ``self.user_tag_dict``
        with a parameterized bulk insert, so tag text is passed to the
        driver verbatim instead of being spliced into the SQL string.
        Errors are logged and printed, not raised.
        """
        conn = None
        cur = None
        try:
            conn = pymysql.connect(host=self.prop.get("db_host"), port=int(self.prop.get("db_port")),
                                   db=self.prop.get("db"), user=self.prop.get("user_name"),
                                   password=self.prop.get("password"),
                                   charset='utf8')
            cur = conn.cursor()
            cur.execute("TRUNCATE x_media__user_userCenter_tag")
            insert_sql = ("INSERT INTO x_media__user_userCenter_tag "
                          "(user_center_id,user_id,tag,score,create_time,update_time) "
                          "VALUES (%s,%s,%s,%s,%s,%s)")
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            rows = self._tag_rows(now)
            # Nothing to insert would otherwise produce an invalid statement.
            if rows:
                conn.ping(reconnect=True)
                cur.executemany(insert_sql, rows)
            conn.commit()
        except Exception as e:
            # conn is still None when the connection itself failed.
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error(" insert error message : %s" % e)
        finally:
            self.logger.log().info("close streaming finished!!!")
            # Close the cursor before the connection it belongs to.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

    def file_read_fun_dict(self, fileName, data):
        """
        Read a tab-separated ``key<TAB>sub_key<TAB>number`` file into a
        nested dict, summing the numbers of repeated (key, sub_key) pairs.
        :param fileName: path of the file to read
        :param data: returned unchanged when the file does not exist
        :return: ``{key: {sub_key: total}}`` built from the file, or ``data``
            (after logging an error) when the file is missing
        """
        if not os.path.exists(fileName):
            self.logger.log().error("%s is not exists" % fileName)
            return data
        data = {}
        with open(fileName, encoding="utf-8") as f:
            for line in f:
                row = line.split("\t")
                scores = data.setdefault(row[0], {})
                scores[row[1]] = scores.get(row[1], 0) + float(row[2].strip("\n"))
        return data

    def file_read_fun_list(self, fileName, data):
        """
        Read a tab-separated file and append the second column of every
        line to ``data``.
        :param fileName: path of the file to read
        :param data: list that receives the values (modified in place)
        :return: ``data``; unchanged (and an error is logged) when the file
            does not exist
        """
        if not os.path.exists(fileName):
            self.logger.log().error("%s is not exists" % fileName)
            return data
        with open(fileName, encoding="utf-8") as f:
            for line in f:
                row = line.split("\t")
                data.append(row[1].rstrip("\n"))
        return data

    def dict_key_value(self, data):
        """Return the keys and values of ``data`` as two flattened strings.

        Both are the ``str()`` of the dict views with the view name,
        parentheses and square brackets removed; quotes are additionally
        removed from the key string. Kept for callers that rely on it;
        ``saveToSql`` no longer uses it because the text form is lossy.
        """
        k = str(data.keys())
        for token in ("dict_keys", "(", ")", "[", "]", "'"):
            k = k.replace(token, "")
        v = str(data.values())
        for token in ("dict_values", "(", ")", "[", "]"):
            v = v.replace(token, "")
        return k, v
Exemplo n.º 8
0
class UserLabelAP:
    """Cluster users by their tag scores with affinity propagation.

    The six constructor arguments are handed to ``np.save`` as destinations
    for the centers, labels and center indices of the two clustering runs
    (``run_Ap_single_algrothm`` and ``run_Ap_songList_algrothm``).
    """

    def __init__(self, media_ap_centers, media_ap_labels, media_ap_indices,
                 mediaList_ap_centers, mediaList_ap_labels,
                 mediaList_ap_indices):
        """
        :param media_ap_centers: ``np.save`` destination, single-run centers
        :param media_ap_labels: ``np.save`` destination, single-run labels
        :param media_ap_indices: ``np.save`` destination, single-run indices
        :param mediaList_ap_centers: ``np.save`` destination, list-run centers
        :param mediaList_ap_labels: ``np.save`` destination, list-run labels
        :param mediaList_ap_indices: ``np.save`` destination, list-run indices
        """
        self.media_ap_labels = media_ap_labels
        self.media_ap_centers = media_ap_centers
        self.mediaList_ap_centers = mediaList_ap_centers
        self.mediaList_ap_labels = mediaList_ap_labels
        self.media_ap_indices = media_ap_indices
        self.mediaList_ap_indices = mediaList_ap_indices
        self.logger = LoggingUtil("data/logs/")

    @staticmethod
    def _first_index(values):
        """Map each value to the position of its first occurrence.

        Gives the same answer as ``list.index`` but in O(1) per lookup.
        """
        positions = {}
        for position, value in enumerate(values):
            positions.setdefault(value, position)
        return positions

    def isExists(self, fileName):
        """
        Check whether a path exists.
        :param fileName: path to test
        :return: boolean, True when the path exists
        """
        return os.path.exists(fileName)

    def file_read_fun(self, fileName, data):
        """
        Read a tab-separated file and append the second column of every
        line to ``data``.
        :param fileName: path of the file to read
        :param data: list that receives the values (modified in place)
        :return: ``data``; unchanged (and an error is logged) when the file
            does not exist
        """
        if not self.isExists(fileName):
            self.logger.log().error("%s is not exists" % fileName)
            return data
        # The context manager closes the file even when a line is malformed.
        with open(fileName, encoding="utf-8") as f:
            for line in f:
                row = line.split("\t")
                data.append(row[1].rstrip("\n"))
        return data

    def user_tag_matrix_fun(self, userId_list, tag_list, user_tag_fileName):
        """
        Build the user/tag score matrix.

        Every line of the file is ``user_id<TAB>tag<TAB>score``. Lines whose
        tag is not in ``tag_list`` are ignored; a missing file yields an
        all-zero matrix.
        :param userId_list: user ids, one per matrix row
        :param tag_list: tags, one per matrix column
        :param user_tag_fileName: path of the score file
        :return: array of shape (len(userId_list), len(tag_list))
        :raises ValueError: when a line with a known tag names a user id
            that is not in ``userId_list``, or its score is not a number
        """
        self.logger.log().info("user tag matrix building...")
        array = np.zeros((len(userId_list), len(tag_list)))
        if self.isExists(user_tag_fileName):
            # Dictionaries replace the per-line list.index() scans.
            row_of = self._first_index(userId_list)
            column_of = self._first_index(tag_list)
            with open(user_tag_fileName, encoding="utf-8") as f:
                for line in f:
                    row = line.split("\t")
                    userId = row[0].strip()
                    tag = row[1].strip()
                    score = row[2].rstrip("\n")
                    column = column_of.get(tag)
                    if column is None:
                        continue
                    if userId not in row_of:
                        raise ValueError("%r is not in list" % userId)
                    # Convert explicitly instead of relying on numpy
                    # coercing the text on assignment.
                    array[row_of[userId], column] = float(score)
        self.logger.log().info("user tag matrix finished!!!")
        return array

    def affinityPropagation(self, data, ap_damping, ap_max_iter,
                            ap_convergence_iter, ap_copy, ap_preference,
                            ap_affinity, ap_verbose):
        """
        Fit ``AffinityPropagation`` on ``data`` and log start time and
        duration. Each ``ap_*`` argument is forwarded to the constructor
        parameter of the same name without the prefix.
        :param data: sample matrix (or similarity matrix, depending on
            ``ap_affinity``)
        :param ap_damping: damping factor; must be >= 0.5 and < 1
        :param ap_max_iter: maximum number of iterations
        :param ap_convergence_iter: iterations without change that stop
            the algorithm
        :param ap_copy: whether to work on a copy of the input
        :param ap_preference: values for the diagonal of the similarity
            matrix
        :param ap_affinity: similarity to use, e.g. ``euclidean`` or
            ``precomputed``
        :param ap_verbose: verbosity flag
        :return: cluster_centers_ (center matrix), labels_ (cluster label
            per sample), cluster_centers_indices (sample index of each
            center)
        """
        tim_pre = time.time()
        self.logger.log().info(tim_pre)
        af = AffinityPropagation(
            damping=ap_damping,
            max_iter=ap_max_iter,
            convergence_iter=ap_convergence_iter,
            copy=ap_copy,
            preference=ap_preference,
            affinity=ap_affinity,
            verbose=ap_verbose).fit(data)
        duration = time.time() - tim_pre
        self.logger.log().info(duration)
        cluster_centers_ = af.cluster_centers_
        labels_ = af.labels_
        cluster_centers_indices = af.cluster_centers_indices_
        return cluster_centers_, labels_, cluster_centers_indices

    def AP_algrothm(self, array):
        """
        Cluster ``array`` using the affinity-propagation settings
        (``ap_damping``, ``ap_max_iter``, ``ap_convergence_iter``,
        ``ap_affinity``) read from ``data/app.properties``.
        :param array: user/tag score matrix
        :return: cluster_centers_, labels_, cluster_centers_indices
        """
        prop = ReadProperties("data/app.properties")
        return self.affinityPropagation(
            data=array,
            ap_damping=float(prop.get("ap_damping")),
            ap_max_iter=int(prop.get("ap_max_iter")),
            ap_convergence_iter=int(prop.get("ap_convergence_iter")),
            ap_copy=True,
            ap_preference=None,
            ap_affinity=prop.get("ap_affinity"),
            ap_verbose=False)

    def run_Ap_single_algrothm(self):
        """
        Clustering run for single tracks: uses the combined common + artist
        tag index and scores, then saves centers, labels and indices.
        :return: cluster_centers_, labels_, cluster_centers_indices
        """
        prop = ReadProperties("data/app.properties")
        # User index file.
        user_index_file = prop.get("userId_index_path")
        # Tag index file (common tags and artist tags).
        tag_index_file = prop.get("common_artist_tag_index_path")
        # User tag scores including artist tags.
        user_tag_file1 = prop.get("user_common_artist_tag_score_path")
        user_list = self.file_read_fun(user_index_file, [])
        tag_list = self.file_read_fun(tag_index_file, [])
        array = self.user_tag_matrix_fun(user_list, tag_list, user_tag_file1)
        cluster_centers_, labels_, cluster_centers_indices = self.AP_algrothm(
            array)
        self.logger.log().info("ap cluster centers nums %s" %
                               cluster_centers_.shape[0])
        np.save(self.media_ap_centers, cluster_centers_)
        np.save(self.media_ap_labels, labels_)
        np.save(self.media_ap_indices, cluster_centers_indices)
        return cluster_centers_, labels_, cluster_centers_indices

    def run_Ap_songList_algrothm(self):
        """
        Clustering run for song lists / subjects: uses the common tag index
        and scores only (no artist tags), then saves centers, labels and
        indices.
        :return: cluster_centers_, labels_, cluster_centers_indices
        """
        prop = ReadProperties("data/app.properties")
        # User index file.
        user_index_file = prop.get("userId_index_path")
        # Tag index file (common tags only).
        tag_common_file = prop.get("commonTag_index_path")
        # User tag scores without artist tags.
        user_common_tag_file = prop.get("user_common_tag_score_path")
        user_list = self.file_read_fun(user_index_file, [])
        tag_common_list = self.file_read_fun(tag_common_file, [])
        array = self.user_tag_matrix_fun(user_list, tag_common_list,
                                         user_common_tag_file)
        self.logger.log().info("song_list ap start...")
        cluster_centers_, labels_, cluster_centers_indices = self.AP_algrothm(
            array)
        self.logger.log().info("song_list ap finished!!!")
        self.logger.log().info("ap cluster centers nums: %s " %
                               cluster_centers_.shape[0])
        self.logger.log().info("ap finished!!!")
        np.save(self.mediaList_ap_centers, cluster_centers_)
        np.save(self.mediaList_ap_labels, labels_)
        np.save(self.mediaList_ap_indices, cluster_centers_indices)
        return cluster_centers_, labels_, cluster_centers_indices
class ConnectToMysql:
    """Write clustering and similarity results into MySQL tables.

    Connection settings come from ``data/app.properties``. Every ``save*`` /
    ``insert*`` method truncates its target table and refills it; database
    errors are logged (not raised) and the connection is always released.
    """

    # Upper bound of rows per INSERT statement for the user/center tables.
    _BATCH_SIZE = 2000

    def __init__(self):
        """Read the properties and cache the connection settings."""
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        self.host = self.prop.get("db_host")
        self.port = self.prop.get("db_port")
        self.db = self.prop.get("db")
        self.user_name = self.prop.get("user_name")
        self.password = self.prop.get("password")

    def file_read_fun(self, fileName, data):
        """
        Read a tab-separated file and append the second column of every
        line to ``data``.
        :param fileName: path of the file to read
        :param data: list that receives the values (modified in place)
        :return: ``data``; unchanged when the file does not exist
        """
        if self.isExists(fileName):
            # The context manager closes the file even on a malformed line.
            with open(fileName, encoding="utf-8") as f:
                for line in f:
                    row = line.split("\t")
                    data.append(row[1].rstrip("\n"))
        return data

    def isExists(self, fileName):
        """
        Check whether a path exists.
        :param fileName: path to test
        :return: boolean, True when the path exists
        """
        return os.path.exists(fileName)

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _now():
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _date_sql(timestamp):
        """Return the ``str_to_date(...)`` SQL expression for a timestamp."""
        return "str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s')" % timestamp

    @classmethod
    def _user_center_tuples(cls, user_id_list, labels, timestamp):
        """Build one ``(user_id, center_id, created, updated)`` SQL tuple per label."""
        date_sql = cls._date_sql(timestamp)
        return ["(%d,%d,%s,%s)" % (int(user_id_list[i]), labels[i], date_sql, date_sql)
                for i in range(len(labels))]

    @classmethod
    def _similarity_tuples(cls, center_id, scores, media_ids, timestamp):
        """Build the SQL tuples of one matrix row, skipping zero scores."""
        date_sql = cls._date_sql(timestamp)
        return ["(%d,%d,%f,%s,%s)" % (center_id, int(media_ids[j]), score, date_sql, date_sql)
                for j, score in enumerate(scores) if score != 0]

    @classmethod
    def _relation_tuples(cls, row_index, scores, timestamp):
        """Build the SQL tuples of one relation row, skipping the diagonal.

        Ids written to the database are 1-based (index + 1).
        """
        date_sql = cls._date_sql(timestamp)
        return ["(%s,%s,%f,%s,%s)" % (row_index + 1, j + 1, score, date_sql, date_sql)
                for j, score in enumerate(scores) if j != row_index]

    def _connect(self):
        """Open a connection with the cached settings."""
        return pymysql.connect(host=self.host, port=int(self.port), db=self.db, user=self.user_name,
                               password=self.password,
                               charset='utf8')

    def _execute_insert(self, conn, cur, insert_sql, tuples):
        """Run and commit one multi-row INSERT; do nothing for no rows."""
        if not tuples:
            return
        conn.ping(reconnect=True)
        cur.execute(insert_sql + ",".join(tuples))
        conn.commit()

    def _release(self, conn, cur):
        """Log and close cursor and connection, tolerating ``None``."""
        self.logger.log().info("stream closed")
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def saveUserCenterToMysql(self, truncate_sql, user_id_list_file, user_labels_file, insert_sql):
        """
        Store the user -> cluster center relation.

        User ids come from the second column of ``user_id_list_file`` and
        the center of each user from the array in ``user_labels_file``; both
        are matched by position. Rows are inserted in batches of at most
        2000.
        :param truncate_sql: statement that empties the target table
        :param user_id_list_file: tab-separated user index file
        :param user_labels_file: ``.npy`` file with one label per user
        :param insert_sql: ``insert ... values `` prefix the tuples are
            appended to
        :return: None
        """
        conn = None
        cur = None
        try:
            conn = self._connect()
            self.logger.log().info("connect to mysql success !!!")
            user_id_list = self.file_read_fun(user_id_list_file, [])
            labels = np.load(user_labels_file)
            cur = conn.cursor()
            cur.execute(truncate_sql)
            self.logger.log().info("truncate success table !!!")
            tuples = self._user_center_tuples(user_id_list, labels, self._now())
            for start in range(0, len(tuples), self._BATCH_SIZE):
                self._execute_insert(conn, cur, insert_sql,
                                     tuples[start:start + self._BATCH_SIZE])
            self.logger.log().info("insert  success !!!")
        except Exception as e:
            # conn is still None when the connection itself failed.
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error(" insert error message : %s" % e)
        finally:
            self._release(conn, cur)

    def saveSimilarityMatrixToMysql(self, truncate_sql, similiarity_mat, media_file, insert_sql):
        """
        Store a cluster center x item score matrix.

        One INSERT is issued per matrix row; zero scores are not written
        and rows without any non-zero score are skipped. The row index is
        used as the center id, the item id is taken from ``media_file`` by
        column position.
        :param truncate_sql: statement that empties the target table
        :param similiarity_mat: text file readable by ``np.loadtxt``
        :param media_file: tab-separated item index file
        :param insert_sql: ``insert ... values `` prefix
        :return: None
        """
        conn = None
        cur = None
        try:
            conn = self._connect()
            self.logger.log().info("connect to mysql success !!!")
            # ndmin=2 keeps a single-row or single-column file two-dimensional.
            similiarity_matrix = np.loadtxt(similiarity_mat, ndmin=2)
            media_list_id = self.file_read_fun(media_file, [])
            cur = conn.cursor()
            cur.execute(truncate_sql)
            for i in range(similiarity_matrix.shape[0]):
                tuples = self._similarity_tuples(i, similiarity_matrix[i],
                                                 media_list_id, self._now())
                self._execute_insert(conn, cur, insert_sql, tuples)
        except Exception as e:
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error("error message:%s" % e)
        finally:
            self._release(conn, cur)

    def insertMediaRelationMatToMysql(self, truncate_sql, similarite_mat, insert_sql):
        """
        Store the media x media similarity matrix.

        One INSERT is issued and committed per matrix row; the diagonal
        (an item compared with itself) is not written. Ids are 1-based.
        Loading the matrix happens before connecting, so a load failure is
        raised to the caller.
        :param truncate_sql: statement that empties the target table
        :param similarite_mat: text file readable by ``np.loadtxt``
        :param insert_sql: ``INSERT ... VALUES `` prefix
        :return: None
        """
        conn = None
        cur = None
        self.logger.log().info("load file start...")
        media_relation_similarity_mat = np.loadtxt(similarite_mat, ndmin=2)
        self.logger.log().info("load file finished")
        try:
            conn = self._connect()
            self.logger.log().info("connect to Mysql success!!!")
            cur = conn.cursor()
            cur.execute(truncate_sql)
            for i in range(media_relation_similarity_mat.shape[0]):
                # Each row gets its own statement built from scratch, so no
                # tuple is repeated and every row is actually written.
                tuples = self._relation_tuples(i, media_relation_similarity_mat[i],
                                               self._now())
                self._execute_insert(conn, cur, insert_sql, tuples)
                self.logger.log().info("insert success part_%s !!!" % (i + 1))
        except Exception as e:
            if conn is not None:
                conn.rollback()
            self.logger.log().error("error message:%s" % e)
        finally:
            self._release(conn, cur)

    def main(self):
        """Refresh every result table from the files named in the properties."""
        user_id_list_file = self.prop.get("user_now_play_index_path")
        user_mediaListSubject_labels_file = self.prop.get("media_list_ap_labels") + ".npy"
        # Single tracks.
        truncate_media = "TRUNCATE `x_user__user_center`"
        insert_media_sql = "insert into x_user__user_center(user_id,user_center_id,create_time,update_time) values "
        user_media_labels_file = self.prop.get("media_ap_labels") + ".npy"
        truncate_mat_media = "TRUNCATE `x_user_center__rank`"
        similiarity_mat_file = self.prop.get("media_similarity_matrix")
        media_mat_file = self.prop.get("all_mediaId_index_path")
        insert_mat_media_sql = "insert into x_user_center__rank(user_center_id,media_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_media, user_id_list_file=user_id_list_file,
                                   user_labels_file=user_media_labels_file, insert_sql=insert_media_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_media, similiarity_mat=similiarity_mat_file,
                                         media_file=media_mat_file, insert_sql=insert_mat_media_sql)

        # Song lists.
        truncate_mediaList = "TRUNCATE `x_user__user_center_list`"
        insert_mediaList_sql = "insert into x_user__user_center_list(user_id,user_center_id,create_time,update_time) values "

        truncate_mat_mediaList = "TRUNCATE `x_user_center__rank_list`"
        similiarity_mat_file_mediaList = self.prop.get("media_list_similarity_matrix")
        mediaList_mat_file = self.prop.get("mediaList_index_path")
        insert_mat_mediaList_sql = "insert into x_user_center__rank_list(user_center_id,media_list_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_mediaList, user_id_list_file=user_id_list_file,
                                   user_labels_file=user_mediaListSubject_labels_file, insert_sql=insert_mediaList_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_mediaList,
                                         similiarity_mat=similiarity_mat_file_mediaList,
                                         media_file=mediaList_mat_file, insert_sql=insert_mat_mediaList_sql)
        # Subjects (use the same labels file as the song lists).
        truncate_mediaSubject = "TRUNCATE `x_user__user_center_subject`"
        insert_mediaSubject_sql = "insert into x_user__user_center_subject(user_id,user_center_id,create_time,update_time) values "

        truncate_mat_mediaSubject = "TRUNCATE `x_user_center__rank_subject`"
        similiarity_mat_file_mediaSubject = self.prop.get("media_subject_similarity_matrix")
        mediaSubject_mat_file = self.prop.get("subject_index_path")
        insert_mat_mediaSubject_sql = "insert into x_user_center__rank_subject(user_center_id,subject_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_mediaSubject, user_id_list_file=user_id_list_file,
                                   user_labels_file=user_mediaListSubject_labels_file,
                                   insert_sql=insert_mediaSubject_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_mediaSubject,
                                         similiarity_mat=similiarity_mat_file_mediaSubject,
                                         media_file=mediaSubject_mat_file, insert_sql=insert_mat_mediaSubject_sql)

        # Song-to-song similarity for same-type recommendation.
        truncate_media_relation_sql = "TRUNCATE `x_cf_base_media_recommend`"
        insert_media_relation_sql = "INSERT INTO `x_cf_base_media_recommend`(mediaId_1,mediaId_2,score,create_time,update_time) VALUES "
        media_relation_similarity_mat_file = self.prop.get("media_relation_matrix_path")
        self.insertMediaRelationMatToMysql(truncate_sql=truncate_media_relation_sql,
                                           similarite_mat=media_relation_similarity_mat_file,
                                           insert_sql=insert_media_relation_sql)
Exemplo n.º 10
0
class ItemBaseMediaCF:
    """Compute tag-based media-to-media similarity and store the best matches in MySQL.

    Database settings are read from ``data/app.properties``. ``media_list`` is
    filled by :meth:`main` and gives the media id for each matrix row position.
    """

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        # Media ids in matrix-row order; main() replaces this with the list
        # read from the media index file.
        self.media_list = []
        self.host = self.prop.get("db_host")
        self.port = self.prop.get("db_port")
        self.db = self.prop.get("db")
        self.user_name = self.prop.get("user_name")
        self.password = self.prop.get("password")

    @staticmethod
    def _first_positions(values):
        """Map each value to the position of its first occurrence (what ``list.index`` returns)."""
        positions = {}
        for pos, value in enumerate(values):
            positions.setdefault(value, pos)
        return positions

    def is_exist(self, file_name):
        """
        Tell whether a path exists.
        :param file_name: path to check
        :return: boolean, True when the path exists
        """
        return os.path.exists(file_name)

    def file_read_fun(self, file_name, data):
        """
        Read a tab-separated file and append the second column of every line to ``data``.
        Completely empty lines are skipped; a missing file is logged and ``data`` is
        returned untouched.
        :param file_name: path of the tab-separated file
        :param data: list that receives the values (mutated in place)
        :return: the same ``data`` list
        """
        if not self.is_exist(file_name):
            self.logger.log().error("%s is not exists" % file_name)
            return data
        # The context manager closes the file even when a malformed line raises.
        with open(file_name, encoding="utf-8") as f:
            for line in f:
                if not line.rstrip("\n"):
                    continue
                data.append(line.split("\t")[1].rstrip("\n"))
        return data

    def media_tag_matrix_fun(self, media_id_list, tag_list,
                             media_tag_file_name):
        """
        Build the media/tag matrix: one row per media id, one column per tag,
        1.0 where the file lists the (media, tag) pair and 0.0 elsewhere.
        Lines whose tag is not in ``tag_list`` are ignored; a known tag on a media id
        missing from ``media_id_list`` raises ValueError.
        :param media_id_list: media ids in row order
        :param tag_list: tags in column order
        :param media_tag_file_name: tab-separated file with media id and tag columns
        :return: numpy array of shape (len(media_id_list), len(tag_list))
        """
        self.logger.log().info("media tag matrix building...")
        array = np.zeros((len(media_id_list), len(tag_list)))
        if self.is_exist(media_tag_file_name):
            # Dict lookups replace list.index()/``in list`` so each line costs O(1).
            media_pos = self._first_positions(media_id_list)
            tag_pos = self._first_positions(tag_list)
            with open(media_tag_file_name, encoding="utf-8") as f:
                for line in f:
                    if not line.rstrip("\n"):
                        continue
                    row = line.split("\t")
                    media_id = row[0].strip()
                    tag = row[1].strip()
                    column = tag_pos.get(tag)
                    if column is None:
                        continue
                    if media_id not in media_pos:
                        # Same exception type list.index() raised before.
                        raise ValueError("%r is not in list" % media_id)
                    array[media_pos[media_id]][column] = 1.0
        self.logger.log().info("media tag matrix finished!!!")
        return array

    def cal_similarity_func(self, array):
        """
        Score every row of ``array`` against every other row and insert, for each
        media, its 10 highest-scoring medias into ``x_cf_base_media_recommend``.
        Rows are sent to MySQL every 200 source medias and once more at the end.
        Any exception is logged (after a rollback) and not re-raised.
        :param array: media/tag matrix whose rows follow ``self.media_list``
        :return: None
        """
        sim_mat = Similarity_Matrix()
        row_count = array.shape[0]
        insert_prefix = "INSERT INTO `x_cf_base_media_recommend`(mediaId_1,mediaId_2,score,create_time,update_time) VALUES"
        row_template = "(%d,%d,%f,str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s')," \
                       "str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s'))"
        self.logger.log().info("cal similarity_mat start...")
        conn = None
        cur = None

        try:
            conn = pymysql.connect(host=self.host,
                                   port=int(self.port),
                                   db=self.db,
                                   user=self.user_name,
                                   password=self.password,
                                   charset='utf8')
            self.logger.log().info("connect to Mysql success!!!")
            cur = conn.cursor()
            pending_rows = []
            for i in range(row_count):
                media_id_1 = self.media_list[i]
                scores = {}
                for j in range(row_count):
                    if i == j:
                        # A media is never scored against itself.
                        scores[self.media_list[j]] = 0.0
                    else:
                        scores[self.media_list[j]] = float(math.fabs(
                            sim_mat.similarity_func(array[i], array[j])))
                # Keep the 10 best-scoring medias for this row.
                best = sorted(scores.items(),
                              key=lambda x: x[1],
                              reverse=True)[:10]
                for media_id_2, score in best:
                    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    pending_rows.append(row_template % (
                        int(media_id_1), int(media_id_2), float(score), now, now))

                is_last = (i + 1) == row_count
                if is_last or (i + 1) % 200 == 0:
                    conn.ping(reconnect=True)
                    cur.execute(insert_prefix + ",".join(pending_rows))
                    conn.commit()
                    pending_rows = []
                    if is_last:
                        self.logger.log().info(
                            "cal media cosine similarity completed !!!")
                    else:
                        self.logger.log().info("cal media cosine similarity %s" %
                                               str(i + 1))
        except Exception as e:
            # conn is still None when connect() itself failed.
            if conn is not None:
                conn.rollback()
            self.logger.log().error("error message:%s" % e)
        finally:
            self.logger.log().info("stream closed")
            # Close the cursor before its connection, and only what was opened.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

    def main(self):
        """
        Run the same-type recommendation job: load the media and tag indexes,
        build the media/tag matrix and store the similarity results.
        :return: None
        """
        # media index
        media_id_index_file = self.prop.get("all_mediaId_index_path")
        # commonTag index
        tag_index_file = self.prop.get("commonTag_index_path")
        # media id / common tag relation
        media_id_tag_file = self.prop.get("mediaId_common_tag_path")
        media_id_list = self.file_read_fun(media_id_index_file, [])
        self.media_list = media_id_list
        tag_list = self.file_read_fun(tag_index_file, [])
        array = self.media_tag_matrix_fun(media_id_list, tag_list,
                                          media_id_tag_file)
        self.cal_similarity_func(array=array)
Exemplo n.º 11
0
class ItemBasedCF():
    """
    TopN recommendation - ItemBasedCF
    """

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        self.host = self.prop.get("db_host")
        self.port = int(self.prop.get("db_port"))
        self.db = self.prop.get("db")
        self.user = self.prop.get("user_name")
        self.password = self.prop.get("password")
        self.trainset = {}
        self.testset = {}

        # n_sim_media: number of most similar medias considered per listened media,
        # n_rec_media: number of recommendations returned per user
        self.n_sim_media = 20
        self.n_rec_media = 10

        # media_sim_mat: media-to-media similarity, media_popular: number of users per
        # media, media_count: number of distinct medias in the train set
        self.media_sim_mat = {}
        self.media_popular = {}
        self.media_count = 0
        self.rec_media_dict = {}
        self.all_rec_medias = {}

        self.logger.log().info('Similar media number = %d' % self.n_sim_media)
        self.logger.log().info('Recommended media number = %d' % self.n_rec_media)

    @staticmethod
    def loadfile(filename):
        """Yield the lines of a file with trailing CR/LF characters removed.

        Args:
            filename   path of the file to read
        Returns:
            generator of lines
        """
        # ``with`` closes the file even when the generator is abandoned early.
        with open(filename, 'r') as fp:
            for line in fp:
                yield line.strip('\r\n')

    def generate_dataset(self, filename, pivot=0.7):
        """Load the rating file and split it randomly into train and test sets.

        Each line holds five tab-separated fields: user, media, rating, count,
        percentage_count. Empty lines are ignored.

        Args:
            filename   path of the rating file
            pivot      probability that a record goes to the train set
        """
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            if not line:
                continue
            user, media, rating, count, percentage_count = line.split('\t')
            value = rating + "\t" + count + "\t" + percentage_count
            # One random draw per record decides which set receives it.
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[media] = value
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[media] = value
                testset_len += 1
        self.logger.log().info('分离训练集和测试集成功')
        self.logger.log().info('train set = %s' % trainset_len)
        self.logger.log().info('test set = %s' % testset_len)

    def calc_media_sim(self):
        """
        Fill ``self.media_popular``, ``self.media_count`` and ``self.media_sim_mat``
        from the train set.
        :return: None (the similarity matrix is stored in ``self.media_sim_mat``)
        """
        self.logger.log().info('counting medias number and popularity...')
        # Popularity = number of users whose train set contains the media.
        for medias in self.trainset.values():
            for media in medias:
                self.media_popular[media] = self.media_popular.get(media, 0) + 1

        self.logger.log().info('count medias number and popularity success')

        # total numbers of media
        self.media_count = len(self.media_popular)
        self.logger.log().info('total media number = %d' % self.media_count)

        # Co-occurrence: how many users have both m1 and m2.
        item_sim_mat = self.media_sim_mat
        self.logger.log().info('building co-rated users matrix...')
        for medias in self.trainset.values():
            for m1 in medias:
                for m2 in medias:
                    if m1 == m2:
                        continue
                    related = item_sim_mat.setdefault(m1, {})
                    related[m2] = related.get(m2, 0) + 1
        self.logger.log().info('build co-rated users matrix success')

        # calculate similarity matrix
        self.logger.log().info('calculating media similarity matrix...')
        simfactor_count = 0
        for m1, related_medias in item_sim_mat.items():
            for m2, count in related_medias.items():
                # co-occurrence normalised by the geometric mean of both popularities
                related_medias[m2] = count / math.sqrt(
                    self.media_popular[m1] * self.media_popular[m2])
                simfactor_count += 1
        self.logger.log().info('calculate media similarity matrix(similarity factor) success')
        self.logger.log().info('total similarity factor number = %d' % simfactor_count)

    def recommend(self, user):
        """Recommend medias for a user of the train set.

        For every media the user listened to, the K (``n_sim_media``) most similar
        medias are taken; each contributes similarity * rating to the score of the
        medias the user has not listened to. The N (``n_rec_media``) highest-scoring
        medias are returned.

        Args:
            user       user id, must be a key of ``self.trainset``
        Returns:
            list of (media, score) tuples sorted by descending score, at most N long
        """
        K = self.n_sim_media
        N = self.n_rec_media
        rank = {}
        listened_media = self.trainset[user]

        # rating holds "rating\tcount\tpercentage"; only the rating part is used.
        for media, rating in listened_media.items():
            if media not in self.media_sim_mat:
                continue
            most_similar = sorted(self.media_sim_mat[media].items(),
                                  key=itemgetter(1),
                                  reverse=True)[:K]
            for related_media, w in most_similar:
                if related_media in listened_media:
                    continue
                rank.setdefault(related_media, 0)
                rank[related_media] += w * float(str(rating).split("\t")[0])
        # return the N best medias
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """
        Recommend for every train-set user, record the results in
        ``self.all_rec_medias`` and log the quality metrics.
        A metric whose denominator is zero is reported as 0.0.
        :return: tuple (precision, recall, coverage, popularity)
        """
        self.logger.log().info('Evaluation start...')

        N = self.n_rec_media
        # hit: recommended medias found in the user's test set,
        # rec_count: N per user, test_count: size of the users' test sets
        hit = 0
        rec_count = 0
        test_count = 0
        # distinct medias recommended to anyone (coverage numerator)
        recommended_medias = set()
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i > 0 and i % 500 == 0:
                self.logger.log().info('recommended for %d users' % i)
            test_medias = self.testset.get(user, {})
            rec_medias = self.recommend(user)

            for media, score in rec_medias:
                if media in test_medias:
                    hit += 1
                self.all_rec_medias.setdefault(user, {})[media] = float(score)
                recommended_medias.add(media)
                popular_sum += math.log(1 + self.media_popular[str(media).split("\t")[0]])
            rec_count += N
            test_count += len(test_medias)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0  # hits / recommendations
        recall = hit / (1.0 * test_count) if test_count else 0.0  # hits / test records
        # share of all medias that were recommended at least once
        coverage = len(recommended_medias) / (1.0 * self.media_count) if self.media_count else 0.0
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0

        self.logger.log().info('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
            precision, recall, coverage, popularity))
        return precision, recall, coverage, popularity

    def insert_to_mysql(self):
        """
        Truncate ``x_cf_item_recommend`` and insert the first 5 recommendations of
        every user in ``self.all_rec_medias``, 2000 rows per INSERT statement.
        Any exception is logged and not re-raised.
        :return: None
        """
        insert_prefix = "insert into x_cf_item_recommend(user_id,media_id,score,create_time,update_time) values "
        # NOTE(review): media ids are interpolated into the SQL text as-is; they
        # come from the rating file, so confirm that source is trusted.
        row_template = "('%d','%s',%f,str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s'),str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s'))"
        conn = None
        cur = None
        try:
            self.logger.log().info("connect to mysql start ....")
            conn = pymysql.connect(host=self.host, port=self.port, db=self.db, user=self.user,
                                   password=self.password,
                                   charset='utf8')
            self.logger.log().info("connect to mysql success !!!")

            cur = conn.cursor()
            cur.execute('truncate  x_cf_item_recommend')
            self.logger.log().info("truncate table:x_cf_item_recommend success !!!")

            # Total number of rows to write, needed to recognise the last batch.
            total_count = sum(min(len(media_score), 5)
                              for media_score in self.all_rec_medias.values())
            pending_rows = []
            count_index = 0
            for user, media_score in self.all_rec_medias.items():
                for media, score in list(media_score.items())[:5]:
                    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    pending_rows.append(row_template % (
                        int(user), str(media).split("\t")[0], float(score), now, now))
                    count_index += 1
                    is_last = count_index == total_count
                    if is_last or count_index % 2000 == 0:
                        conn.ping(reconnect=True)
                        cur.execute(insert_prefix + ",".join(pending_rows))
                        conn.commit()
                        pending_rows = []
                        if not is_last:
                            self.logger.log().info("insert to mysql part-%s success !!!" % (int(count_index / 2000)))
        except Exception as e:
            self.logger.log().error("insert error %s" % e)
        finally:
            # Either handle is still None when connecting failed.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()
            self.logger.log().info("close streaming...")
Exemplo n.º 12
0
 def __init__(self, host, user_name):
     """Create the logger and remember the given host and user name.

     :param host: stored unchanged as ``self.host``
     :param user_name: stored unchanged as ``self.user_name``
     """
     self.logger = LoggingUtil("data/logs/")
     self.host, self.user_name = host, user_name
Exemplo n.º 13
0
class DownLoadData:
    """Copy the recommendation job's input files between HDFS and the local disk."""

    # Files fetched by main(): (local path property, HDFS path property,
    # append today's date to the HDFS path, text appended after the date).
    _DOWNLOADS = (
        # tag index (commonTag, artistTag)
        ("common_artist_tag_index_path", "fs_common_artist_tag_index", True, ""),
        # user data
        ("userId_index_path", "fs_user_index", True, ""),
        # playlist (mediaList) tag data
        ("mediaList_tag_path", "fs_media_list_tag", False, ""),
        # subject tag data
        ("subject_tag_path", "fs_subject_tag", False, ""),
        # commonTag index
        ("commonTag_index_path", "fs_common_tag_index", False, ""),
        # user base tag scores (playlist and subject recommendation)
        ("user_common_tag_score_path", "fs_user_common_tag_score", True, ""),
        # user tag scores (single-track recommendation)
        ("user_common_artist_tag_score_path", "fs_user_common_artist_tag_score", True, ""),
        # playlist index (index, mediaList_id)
        ("mediaList_index_path", "fs_media_list_index", False, ""),
        # per-user track play scores (userId, mediaId, score)
        ("media_play_score_path", "fs_user_play_score", True, "/"),
        # subject index
        ("subject_index_path", "fs_subject_index", False, ""),
        # mediaId index
        ("mediaId_index_path", "fs_mediaId_index", True, ""),
        # mediaId tag
        ("mediaId_tag_path", "fs_mediaId_tag", True, ""),
        # play-only user/commonTag scores (playlist and subject recommendation)
        ("user_play_tag_score_path", "fs_user_play_tag_score_path", True, ""),
        # collect-only user/commonTag scores (playlist and subject recommendation)
        ("user_collect_tag_score_path", "fs_user_collect_tag_score_path", True, ""),
        # play-only user/commonTag discretised scores (playlist and subject recommendation)
        ("user_play_tag_disperse_score_path",
         "fs_user_play_tag_disperse_score_mat_list_subject_path", True, ""),
        # index of current users, historical collections excluded
        ("user_now_play_index_path", "fs_user_now_play_index_path", True, ""),
        # play-only user/(common & artist) tag scores (single-track recommendation)
        ("user_play_tag_score_single_mat_path", "fs_user_play_tag_score_single_mat_path", True, ""),
        # index of all medias
        ("all_mediaId_index_path", "fs_all_mediaId_index_path", False, ""),
        # common tags of all medias
        ("mediaId_common_tag_path", "fs_mediaId_common_tag_path", False, ""),
        # one month of media play scores
        ("media_play_score_one_month_path", "fs_media_play_score_one_month_path", True, ""),
    )

    def __init__(self, host, user_name):
        self.host = host
        self.user_name = user_name
        self.logger = LoggingUtil("data/logs/")

    def file_upload(self, local_path, hdfs_path):
        """Copy a local file to HDFS.

        :param local_path: source path on the local disk
        :param hdfs_path: destination path on HDFS
        """
        self.logger.log().info("file upload start...")
        fs = pyhdfs.HdfsClient(hosts=self.host, user_name=self.user_name)
        # Debug aid: prints the listing of the HDFS root before uploading.
        print(fs.listdir('/'))
        fs.copy_from_local(local_path, hdfs_path)
        self.logger.log().info("file upload finished !!!")

    def file_down(self, local_path, hdfs_path):
        """Copy an HDFS file, or the ``part*`` files of an HDFS directory, to ``local_path``.

        A missing HDFS path and any exception raised while copying are logged,
        never raised.

        :param local_path: destination path on the local disk
        :param hdfs_path: HDFS file or directory to fetch
        """
        self.logger.log().info("file download start")
        fs = pyhdfs.HdfsClient(hosts=self.host, user_name=self.user_name)
        try:
            if not fs.exists(hdfs_path):
                self.logger.log().error("filepath :  %s is not exists!!!" % hdfs_path)
                return
            if fs.get_file_status(path=hdfs_path).get("type") == "DIRECTORY":
                entries = fs.listdir(hdfs_path)
                # NOTE(review): every part file is written to the same local_path,
                # so with several part files only the last one copied remains.
                # Confirm the upstream job writes a single part file.
                for file in entries:
                    if file.startswith("part"):
                        part_path = hdfs_path + "/" + file
                        fs.copy_to_local(part_path, local_path, overwrite=True)
                        self.logger.log().info(
                            "file download finished the fileName is %s" % part_path)
                self.logger.log().info("down load file nums is %s" % len(entries))
            else:
                # Report success only after the copy has actually completed.
                fs.copy_to_local(hdfs_path, local_path, overwrite=True)
                self.logger.log().info("file download finished ther fileName is %s" % hdfs_path)
        except Exception as e:
            print(e, file=sys.stderr)
            # Log the real cause; not every failure is a missing file.
            self.logger.log().error("file download failed for %s: %s" % (hdfs_path, e))

    def makeDir(self, path):
        """Create a directory (and missing parents) unless the path already exists.

        :param path: directory to create
        :return: True when the directory was created, False when the path already existed
        """
        try:
            # Creating directly avoids the gap between an existence check and the creation.
            os.makedirs(path)
        except FileExistsError:
            return False
        return True

    def main(self):
        """Create the local working directories and download every input file.

        HDFS paths flagged as dated in ``_DOWNLOADS`` get today's date
        (``YYYYMMDD``) appended.
        """
        prop_path = "data/app.properties"
        prop = ReadProperties(prop_path)
        date_path = datetime.today().strftime("%Y%m%d")
        # local working directories: logs, input, output
        self.makeDir("data/logs/")
        self.makeDir("data/input/")
        self.makeDir("data/output")
        for local_key, hdfs_key, dated, tail in self._DOWNLOADS:
            local_path = prop.get(local_key)
            hdfs_path = prop.get(hdfs_key)
            if dated:
                hdfs_path = hdfs_path + date_path + tail
            self.file_down(local_path=local_path, hdfs_path=hdfs_path)
class Similarity_Matrix:
    """Cosine-similarity helpers for index files, relation matrices and cluster centers."""

    def __init__(self):
        self.logger = LoggingUtil("data/logs/")

    @staticmethod
    def _first_positions(values):
        """Map each value to the position of its first occurrence (what ``list.index`` returns)."""
        positions = {}
        for pos, value in enumerate(values):
            positions.setdefault(value, pos)
        return positions

    def similarity_func(self, A, B):
        """
        Cosine similarity of two vectors.
        :param A: first vector (anything ``np.array`` accepts)
        :param B: second vector, same length as A
        :return: dot(A, B) / (|A| * |B|), or 0 when either vector has zero norm
        """
        X = np.array(A)
        Y = np.array(B)
        AB = np.sum(X * Y)
        A2 = np.sqrt(np.sum(X * X))
        B2 = np.sqrt(np.sum(Y * Y))
        if A2 == 0 or B2 == 0:
            # The cosine is undefined for a zero vector; report no similarity.
            return 0
        return AB / (A2 * B2)

    def file_read_fun(self, fileName, data):
        """
        Read a tab-separated file and append the second column of every line to ``data``.
        Completely empty lines are skipped; a missing file leaves ``data`` untouched.
        :param fileName: path of the tab-separated file
        :param data: list that receives the values (mutated in place)
        :return: the same ``data`` list
        """
        if self.isExists(fileName):
            # The context manager closes the file even when a malformed line raises.
            with open(fileName, encoding="utf-8") as f:
                for line in f:
                    if not line.rstrip("\n"):
                        continue
                    data.append(line.split("\t")[1].rstrip("\n"))
        return data

    def isExists(self, fileName):
        """
        Tell whether a path exists.
        :param fileName: path to check
        :return: True when the path exists
        """
        return os.path.exists(fileName)

    def cluster_media_list_fun(self, cluster_centers, array):
        """
        Absolute cosine similarity between every cluster center and every row of ``array``.
        :param cluster_centers: 2-D array, one center per row
        :param array: 2-D array, one item per row
        :return: matrix of shape (number of centers, number of rows of ``array``)
        """
        len_row = cluster_centers.shape[0]
        len_column = array.shape[0]
        similarityMatrix = np.zeros((len_row, len_column))
        for i in range(len_row):
            for j in range(len_column):
                similarityMatrix[i][j] = math.fabs(
                    self.similarity_func(cluster_centers[i], array[j]))
        return similarityMatrix

    def generate_relation_mat(self, row, column, file):
        """
        Build a 0/1 relation matrix from a tab-separated file of (row key, column key) pairs.
        Lines whose column key is not in ``column`` are ignored, as are empty lines;
        a known column key on a row key missing from ``row`` raises ValueError.
        A missing file yields an all-zero matrix.
        :param row: row keys in row order
        :param column: column keys in column order
        :param file: path of the relation file
        :return: numpy array of shape (len(row), len(column))
        """
        array = np.zeros((len(row), len(column)))
        if self.isExists(file):
            # Dict lookups replace list.index()/``in list`` so each line costs O(1).
            row_pos = self._first_positions(row)
            column_pos = self._first_positions(column)
            with open(file, encoding="utf-8") as f:
                for line in f:
                    if not line.rstrip("\n"):
                        continue
                    arr = line.split("\t")
                    media = arr[0]
                    tag = arr[1].rstrip("\n")
                    col = column_pos.get(tag)
                    if col is None:
                        continue
                    if media not in row_pos:
                        # Same exception type list.index() raised before.
                        raise ValueError("%r is not in list" % media)
                    array[row_pos[media]][col] = 1.0
        return array

    def cal_similarity_func(self, row_file, column_file,
                            row_coloum_relation_file, cluster_centers,
                            mat_out_file):
        """
        Read the row and column indexes, build their relation matrix, score it
        against the cluster centers and save the result with ``np.savetxt``.
        :param row_file: index file of the row keys
        :param column_file: index file of the column keys
        :param row_coloum_relation_file: file of (row key, column key) pairs
        :param cluster_centers: 2-D array, one center per row
        :param mat_out_file: path the similarity matrix is written to
        :return: the similarity matrix
        """
        self.logger.log().info("read row_index_file start...")
        row_index = self.file_read_fun(row_file, [])
        self.logger.log().info("read row_index_file finished!!!")
        self.logger.log().info("read column_index_file start...")
        column_index = self.file_read_fun(column_file, [])
        self.logger.log().info("read column_index_file finished!!!")
        array = self.generate_relation_mat(row_index, column_index,
                                           row_coloum_relation_file)
        self.logger.log().info("calculate similarity matrix start...")
        similarity_matrix_media = self.cluster_media_list_fun(
            cluster_centers, array)
        self.logger.log().info("calculate similarity matrix finished !!!")
        np.savetxt(mat_out_file, similarity_matrix_media)
        return similarity_matrix_media
 def __init__(self):
     self.logger = LoggingUtil("data/logs/")