def tmp_generate_X_y_arrays(f_train_set='%s/train_set.csv' % (data_path)):
    """Build the classifier inputs X and y from a CSV file, leaving out one column.

    The first line of the file is treated as a header and ignored. In every
    remaining row, column 0 is parsed as the integer class label, and
    columns 1-3 together with columns 5 onwards are parsed as float
    features; column 4 is temporarily excluded. The feature matrix is
    standardized with ``sklearn.preprocessing.scale`` before being returned.

    Args:
        f_train_set: path of the training-set CSV file.

    Returns:
        X: scaled training samples, size=[n_samples, n_features]
        y: class labels, a 1-D array with n_samples entries
    """
    import numpy as np
    from sklearn import preprocessing

    features = []
    labels = []
    with open(f_train_set, 'r') as fin:
        fin.readline()  # skip the header line
        for row in fin:
            fields = row.strip().split(',')
            kept = fields[1:4] + fields[5:]  # every feature column except column 4
            features.append([float(value) for value in kept])
            labels.append(int(fields[0]))  # the tag is stored in the first column
    logger.debug('classifier input X_size=[%s, %s] y_size=[%s, 1]',
                 len(features), len(features[0]), len(labels))
    return preprocessing.scale(np.array(features)), np.array(labels)
def generate_X_y_arrays(f_train_set='%s/train_set.csv' % (data_path)):
    """Build the classifier inputs X and y from a CSV file.

    The first line of the file is treated as a header and ignored. In every
    remaining row, column 0 is parsed as the integer class label and all
    other columns are parsed as float features. The feature matrix is
    standardized with ``sklearn.preprocessing.scale`` before being returned.

    Note that the "Scale params" debug line reports the mean and standard
    deviation of the matrix *after* scaling.

    Args:
        f_train_set: path of the training-set CSV file.

    Returns:
        X: scaled training samples, size=[n_samples, n_features]
        y: class labels, a 1-D array with n_samples entries
    """
    import numpy as np
    from sklearn import preprocessing

    features = []
    labels = []
    with open(f_train_set, 'r') as fin:
        fin.readline()  # skip the header line
        for row in fin:
            fields = row.strip().split(',')
            labels.append(int(fields[0]))  # the tag is stored in the first column
            features.append([float(value) for value in fields[1:]])
    logger.debug('classifier input X_size=[%s, %s] y_size=[%s, 1]',
                 len(features), len(features[0]), len(labels))
    X = preprocessing.scale(np.array(features))
    y = np.array(labels)
    logger.debug('Scale params: mean=%s, std=%s', X.mean(axis=0), X.std(axis=0))
    return X, y
def insert_train_user_2table(connect, fin='tianchi_mobile_recommend_train_user.csv'):
    """Insert the rows of the train_user CSV file into table ``train_user_new``.

    The header line is skipped. Each remaining line is split on commas and
    its first six fields are stored as user_id, item_id, behavior_type,
    user_geohash, item_category and time; the time field is parsed with the
    format 'YYYY-MM-DD HH' and stored as a Unix timestamp. Work is committed
    every 5000 rows and once more at the end.

    Values are handed to the driver as query parameters rather than being
    interpolated into the SQL text, so a quote inside a field can neither
    break the statement nor inject SQL.

    Relies on the module-level names ``arrow`` and ``logger``.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, path of the user-behaviour CSV file

    Returns:
        None
    """
    sql = ("INSERT INTO train_user_new SET user_id=%s, item_id=%s,"
           "behavior_type=%s, user_geohash=%s, item_category=%s,"
           "time=%s;")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                timestamp = arrow.get(cols[5], 'YYYY-MM-DD HH').timestamp
                cursor.execute(sql, (cols[0], cols[1], cols[2], cols[3],
                                     cols[4], timestamp))
                counter += 1
                if counter % 5000 == 0:
                    # commit in batches to keep the transaction small
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # release the cursor even when a row fails to parse or insert
        cursor.close()
def get_buy_list(train_user_connect, timerange=('2014-12-18', '2014-12-19')):
    """Group the items bought inside a date window by user and by category.

    Queries ``train_user_connect.find`` for documents whose
    ``behavior_type`` is the string '4' and whose ``time`` lies strictly
    between the two dates (``$gt`` start, ``$lt`` end).

    Args:
        train_user_connect: collection object exposing a MongoDB-style
            ``find(filter)`` method
        timerange: (start, end) pair of dates formatted as '%Y-%m-%d'

    Returns:
        dict: {user_id: {item_category: [item_id, ...]}}; an item appears
        once per matching document, so repeats are kept.
    """
    from datetime import datetime

    logger.info('get_buy_list start,timerange = %s to %s' % (timerange[0], timerange[1]))
    starttime = datetime.strptime(str(timerange[0]), '%Y-%m-%d')
    stoptime = datetime.strptime(str(timerange[1]), '%Y-%m-%d')
    buys = train_user_connect.find({'behavior_type': '4',
                                    'time': {'$gt': starttime, '$lt': stoptime}})
    logger.debug('database qury done')

    buy_dict = {}
    count = 0
    for doc in buys:
        # setdefault creates the per-user / per-category container on first use
        categories = buy_dict.setdefault(doc['user_id'], {})
        categories.setdefault(doc['item_category'], []).append(doc['item_id'])
        count += 1
        if count % 1000 == 0:
            logger.debug('No.%s done' % count)
    return buy_dict
def cal_user_behavior(connect, timerange, f_train_set='%s/train_set.csv' % (data_path)):
    """Compute time-weighted behaviour counts for every (user, item) pair.

    For each row of ``f_train_set`` the matching records of table
    ``train_user`` inside ``timerange`` (start exclusive, end inclusive) are
    fetched, and every record adds ``exp((time - predict_time) / time_atten)``
    to the slot of its behaviour type. ``predict_time`` is fixed to
    2014-12-19 and ``time_atten`` to 48 hours.

    Output file format:
        user_id,item_id,see,favorite,cart,buy,tag
    where see / favorite / cart / buy are the weighted sums for behaviour
    types 1 / 2 / 3 / 4 (browse, favourite, add to cart, buy).

    Args:
        connect: MySQLdb.connect(), database connection handle
        timerange: (start, end) pair accepted by ``arrow.get``
        f_train_set: string, input CSV with rows ``user_id,item_id,tag``
            (header line skipped)

    Returns:
        f_output: string, path of the output file (the input path with
        '.csv' replaced by '_calUserBehavior.csv')
    """
    import arrow
    from math import exp

    f_output = f_train_set.replace('.csv', '_calUserBehavior.csv')  # output file name
    predict_timestamp = arrow.get('2014-12-19').timestamp
    # Decay constant of exp(delta_t / a). Kept as a float so that the
    # division below is a true division on Python 2 as well; with integers
    # the exponent would be floored to whole multiples of 48 hours.
    time_atten = 3600 * 48.0
    (timerange_start, timerange_end) = map(lambda elem: arrow.get(elem).timestamp,
                                           timerange)
    # ids come from a file, so pass them as parameters instead of formatting
    sql = ('select behavior_type, time from train_user where user_id=%s '
           'and item_id=%s and time>%s and time<=%s;')
    cursor = connect.cursor()
    try:
        with open(f_train_set, 'r') as fin, open(f_output, 'w') as fout:
            fin.readline()  # skip the header line
            fout.write('user_id,item_id,see,favorite,cart,buy,tag\n')
            counter = 0  # for logging only
            logger.debug('start generate...')
            for in_line in fin:
                [user_id, item_id, tag] = in_line.strip().split(',')
                cursor.execute(sql, (user_id, item_id, timerange_start, timerange_end))
                time_weights = [0.0, 0.0, 0.0, 0.0]
                for [behavior_type, timestamp] in cursor.fetchall():
                    # behaviour types are 1-based, the list is 0-based
                    time_weights[int(behavior_type) - 1] += exp(
                        float(timestamp - predict_timestamp) / time_atten)
                fout.write('%s,%s,%s,%s,%s,%s,%s\n' %
                           (user_id, item_id, time_weights[0], time_weights[1],
                            time_weights[2], time_weights[3], tag))
                counter += 1
                if counter % 300 == 0:
                    logger.debug(
                        'NO.%s: user_id=%s, item_id=%s, time_weights=%s, tag=%s' %
                        (counter, user_id, item_id, time_weights, tag))
    finally:
        # release the cursor even if a query or a malformed row raises
        cursor.close()
    return f_output
def output(connect, fout="first_try.csv", top_N=5):
    """Write, for every user, the most recently browsed items as recommendations.

    All distinct user ids of table ``train_user`` are fetched first; for
    each of them up to ``top_N`` distinct items with ``behavior_type=1``
    are selected, newest first, and written as ``user_id,item_id`` lines.

    The per-user statement used to be assembled from two adjacent string
    literals with no space between them, producing
    ``... user_id=<uid>and behavior_type=1 ...``. The statement is now a
    single well-formed string and its values are passed as parameters.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fout: string, path of the result file
        top_N: int, number of recommendations per user

    Returns:
        None
    """
    sql_uids = 'select distinct user_id from train_user;'
    sql_items = ('select distinct item_id from train_user where user_id=%s '
                 'and behavior_type=1 order by time DESC limit %s;')
    limit = int(top_N)  # LIMIT needs a numeric literal, not a quoted string
    cursor = connect.cursor()
    try:
        cursor.execute(sql_uids)
        result_uids = cursor.fetchall()
        count = 0
        with open(fout, 'w') as f:
            for (uid,) in result_uids:
                cursor.execute(sql_items, (uid, limit))
                result_item_ids = cursor.fetchall()
                count += 1
                if count % 500 == 0:
                    logger.debug('output %s users' % (count))
                for (item_id,) in result_item_ids:
                    f.write('%s,%s\n' % (uid, item_id))
    finally:
        cursor.close()
def cal_popularity_in_category(item_id, stoptime_str, train_user_connect):
    """
    Compute how popular an item is inside its own category.

    The popularity is the number of purchase records (behavior_type '4') of
    the item before the stop date divided by the number of purchase records
    of the whole category before that date. 0.0 is returned when the item
    is unknown to the collection (``find_one`` yields None) or when its
    category has no purchase at all.

    :param item_id: item id
    :param stoptime_str: stop date formatted as '%Y-%m-%d'; records at or
        after 00:00 of that day are ignored
    :param train_user_connect: MongoDB connection to the train_user collection
    :return: float, share of the category's purchases that belong to the item
    """
    from datetime import datetime

    stoptime = datetime.strptime(str(stoptime_str + ' 00'), '%Y-%m-%d %H')
    current_item = train_user_connect.find_one({'item_id': item_id})
    if current_item is None:
        # unknown item: honour the documented contract instead of failing
        # with a TypeError on the subscript below
        return 0.0
    category_id = current_item['item_category']

    item_bought_count = train_user_connect.find({
        'item_id': item_id,
        'behavior_type': '4',
        'time': {'$lt': stoptime}
    }).count()
    bought_max_count = train_user_connect.find({
        'item_category': category_id,
        'behavior_type': '4',
        'time': {'$lt': stoptime}
    }).count()

    if bought_max_count == 0:
        return 0.0
    popularity_in_category = float(item_bought_count) / bought_max_count
    logger.debug('item %s popularity_in_category = %s',
                 item_id, popularity_in_category)
    return popularity_in_category
def insert_train_item_2table(connect, fin='tianchi_mobile_recommend_train_item.csv'):
    """Insert the rows of the train_item CSV file into table ``train_item_new``.

    The header line is skipped. Each remaining line is split on commas and
    its first three fields are stored as item_id, item_geohash and
    item_category. Work is committed every 5000 rows and once more at the
    end.

    Values are handed to the driver as query parameters rather than being
    interpolated into the SQL text, so a quote inside a field can neither
    break the statement nor inject SQL.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, path of the item CSV file

    Returns:
        None
    """
    sql = ("INSERT INTO train_item_new SET item_id=%s, item_geohash=%s,"
           "item_category=%s")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                cursor.execute(sql, (cols[0], cols[1], cols[2]))
                counter += 1
                if counter % 5000 == 0:
                    # commit in batches to keep the transaction small
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # release the cursor even when a row fails to parse or insert
        cursor.close()
def insert_train_user_2table(connect, fin='tianchi_mobile_recommend_train_user.csv'):
    """Insert the rows of the train_user CSV file into table ``train_user_new``.

    The header line is skipped. Each remaining line is split on commas and
    its first six fields are stored as user_id, item_id, behavior_type,
    user_geohash, item_category and time; the time field is parsed with the
    format 'YYYY-MM-DD HH' and stored as a Unix timestamp. Work is committed
    every 5000 rows and once more at the end.

    Values are handed to the driver as query parameters rather than being
    interpolated into the SQL text, so a quote inside a field can neither
    break the statement nor inject SQL.

    Relies on the module-level names ``arrow`` and ``logger``.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, path of the user-behaviour CSV file

    Returns:
        None
    """
    sql = ("INSERT INTO train_user_new SET user_id=%s, item_id=%s,"
           "behavior_type=%s, user_geohash=%s, item_category=%s,"
           "time=%s;")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                timestamp = arrow.get(cols[5], 'YYYY-MM-DD HH').timestamp
                cursor.execute(sql, (cols[0], cols[1], cols[2], cols[3],
                                     cols[4], timestamp))
                counter += 1
                if counter % 5000 == 0:
                    # commit in batches to keep the transaction small
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # release the cursor even when a row fails to parse or insert
        cursor.close()
def cal_popularity_in_category(item_id, stoptime_str, train_user_connect):
    """
    Compute how popular an item is inside its own category.

    The popularity is the number of purchase records (behavior_type '4') of
    the item before the stop date divided by the number of purchase records
    of the whole category before that date. 0.0 is returned when the item
    is unknown to the collection (``find_one`` yields None) or when its
    category has no purchase at all.

    :param item_id: item id
    :param stoptime_str: stop date formatted as '%Y-%m-%d'; records at or
        after 00:00 of that day are ignored
    :param train_user_connect: MongoDB connection to the train_user collection
    :return: float, share of the category's purchases that belong to the item
    """
    from datetime import datetime

    stoptime = datetime.strptime(str(stoptime_str + ' 00'), '%Y-%m-%d %H')
    current_item = train_user_connect.find_one({'item_id': item_id})
    if current_item is None:
        # unknown item: honour the documented contract instead of failing
        # with a TypeError on the subscript below
        return 0.0
    category_id = current_item['item_category']

    item_bought_count = train_user_connect.find({
        'item_id': item_id,
        'behavior_type': '4',
        'time': {'$lt': stoptime}
    }).count()
    bought_max_count = train_user_connect.find({
        'item_category': category_id,
        'behavior_type': '4',
        'time': {'$lt': stoptime}
    }).count()

    if bought_max_count == 0:
        return 0.0
    popularity_in_category = float(item_bought_count) / bought_max_count
    logger.debug('item %s popularity_in_category = %s',
                 item_id, popularity_in_category)
    return popularity_in_category
def tmp_generate_X_y_arrays(f_train_set='%s/train_set.csv' % (data_path)):
    """Build the classifier inputs X and y from a CSV file, leaving out one column.

    The first line of the file is treated as a header and ignored. In every
    remaining row, column 0 is parsed as the integer class label, and
    columns 1-3 together with columns 5 onwards are parsed as float
    features; column 4 is temporarily excluded. The feature matrix is
    standardized with ``sklearn.preprocessing.scale`` before being returned.

    Args:
        f_train_set: path of the training-set CSV file.

    Returns:
        X: scaled training samples, size=[n_samples, n_features]
        y: class labels, a 1-D array with n_samples entries
    """
    import numpy as np
    from sklearn import preprocessing

    features = []
    labels = []
    with open(f_train_set, 'r') as fin:
        fin.readline()  # skip the header line
        for row in fin:
            fields = row.strip().split(',')
            kept = fields[1:4] + fields[5:]  # every feature column except column 4
            features.append([float(value) for value in kept])
            labels.append(int(fields[0]))  # the tag is stored in the first column
    logger.debug('classifier input X_size=[%s, %s] y_size=[%s, 1]',
                 len(features), len(features[0]), len(labels))
    return preprocessing.scale(np.array(features)), np.array(labels)
def insert_train_item_2table(connect, fin='tianchi_mobile_recommend_train_item.csv'):
    """Insert the rows of the train_item CSV file into table ``train_item_new``.

    The header line is skipped. Each remaining line is split on commas and
    its first three fields are stored as item_id, item_geohash and
    item_category. Work is committed every 5000 rows and once more at the
    end.

    Values are handed to the driver as query parameters rather than being
    interpolated into the SQL text, so a quote inside a field can neither
    break the statement nor inject SQL.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, path of the item CSV file

    Returns:
        None
    """
    sql = ("INSERT INTO train_item_new SET item_id=%s, item_geohash=%s,"
           "item_category=%s")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                cursor.execute(sql, (cols[0], cols[1], cols[2]))
                counter += 1
                if counter % 5000 == 0:
                    # commit in batches to keep the transaction small
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # release the cursor even when a row fails to parse or insert
        cursor.close()
def classifier_comparison(X, y):
    """Run a fixed line-up of classifiers on the same data for comparison.

    Each candidate is passed to ``train_classifier(clf, X, y)``, a helper
    defined elsewhere in this module. The line-up covers several SVM
    variants (including an exhaustive and a randomized hyper-parameter
    search over an RBF SVM), a decision tree, a random forest, AdaBoost,
    Gaussian naive Bayes, LDA and QDA. Afterwards logistic regression is
    trained with L1 and L2 penalties for a range of C values and the
    coefficient matrix of the object returned by ``train_classifier`` is
    logged.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    from sklearn.linear_model import LogisticRegression
    # A bare `import scipy` does not load the `stats` subpackage by itself;
    # importing it explicitly makes `scipy.stats.expon` reliably available.
    import scipy.stats

    # Exhaustive grid search
    exhaustive_parameters = {'kernel': ['rbf'],
                             'C': [1, 10, 100, 1000],
                             'gamma': [1e-3, 1e-4]}
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # Randomized parameter optimization
    randomized_parameter = {'kernel': ['rbf'],
                            'C': scipy.stats.expon(scale=100),
                            'gamma': scipy.stats.expon(scale=.1)}
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    # name and estimator are kept side by side so they cannot drift apart
    candidates = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", clf_SVC_exhaustive),
        ("RBF SVM with Random Grid Search", clf_SVC_randomized),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10,
                                                 max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]
    for name, clf in candidates:
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)

    # Logistic regression: L1 first, then L2, for every C
    for C in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
        for penalty in ('l1', 'l2'):
            logger.info('Use LR with %s penalty, C=%s:' % (penalty, C))
            clf = LogisticRegression(C=C, penalty=penalty, tol=0.01)
            clf = train_classifier(clf, X, y)
            logger.debug('coef matrix: %s' % (clf.coef_))
def cal_user_behavior(connect, timerange, f_train_set='%s/train_set.csv' % (data_path)):
    """Compute time-weighted behaviour counts for every (user, item) pair.

    For each row of ``f_train_set`` the matching records of table
    ``train_user`` inside ``timerange`` (start exclusive, end inclusive) are
    fetched, and every record adds ``exp((time - predict_time) / time_atten)``
    to the slot of its behaviour type. ``predict_time`` is fixed to
    2014-12-19 and ``time_atten`` to 48 hours.

    Output file format:
        user_id,item_id,see,favorite,cart,buy,tag
    where see / favorite / cart / buy are the weighted sums for behaviour
    types 1 / 2 / 3 / 4 (browse, favourite, add to cart, buy).

    Args:
        connect: MySQLdb.connect(), database connection handle
        timerange: (start, end) pair accepted by ``arrow.get``
        f_train_set: string, input CSV with rows ``user_id,item_id,tag``
            (header line skipped)

    Returns:
        f_output: string, path of the output file (the input path with
        '.csv' replaced by '_calUserBehavior.csv')
    """
    import arrow
    from math import exp

    f_output = f_train_set.replace('.csv', '_calUserBehavior.csv')  # output file name
    predict_timestamp = arrow.get('2014-12-19').timestamp
    # Decay constant of exp(delta_t / a). Kept as a float so that the
    # division below is a true division on Python 2 as well; with integers
    # the exponent would be floored to whole multiples of 48 hours.
    time_atten = 3600 * 48.0
    (timerange_start, timerange_end) = map(lambda elem: arrow.get(elem).timestamp,
                                           timerange)
    # ids come from a file, so pass them as parameters instead of formatting
    sql = ('select behavior_type, time from train_user where user_id=%s '
           'and item_id=%s and time>%s and time<=%s;')
    cursor = connect.cursor()
    try:
        with open(f_train_set, 'r') as fin, open(f_output, 'w') as fout:
            fin.readline()  # skip the header line
            fout.write('user_id,item_id,see,favorite,cart,buy,tag\n')
            counter = 0  # for logging only
            logger.debug('start generate...')
            for in_line in fin:
                [user_id, item_id, tag] = in_line.strip().split(',')
                cursor.execute(sql, (user_id, item_id, timerange_start, timerange_end))
                time_weights = [0.0, 0.0, 0.0, 0.0]
                for [behavior_type, timestamp] in cursor.fetchall():
                    # behaviour types are 1-based, the list is 0-based
                    time_weights[int(behavior_type) - 1] += exp(
                        float(timestamp - predict_timestamp) / time_atten)
                fout.write('%s,%s,%s,%s,%s,%s,%s\n' %
                           (user_id, item_id, time_weights[0], time_weights[1],
                            time_weights[2], time_weights[3], tag))
                counter += 1
                if counter % 300 == 0:
                    logger.debug(
                        'NO.%s: user_id=%s, item_id=%s, time_weights=%s, tag=%s' %
                        (counter, user_id, item_id, time_weights, tag))
    finally:
        # release the cursor even if a query or a malformed row raises
        cursor.close()
    return f_output
def generate_predict_set(connect, timerange, f_train_set='%s/test_set.csv' % (data_path)):
    """Build the prediction set.

    Selects every distinct (user_id, item_id) pair of table ``train_user``
    whose time lies in ``timerange`` (start exclusive, end inclusive) and
    writes it with the placeholder tag -1.

    Args:
        connect: Mysqldb.connect(), database connection handle
        timerange: tuple, time filter of the set, (start, end)
        f_train_set: string, path of the result file, written as
            ---- content --------
            | user_id,item_id,tag |
            ---------------------

    Returns:
        None
    """
    import arrow

    cursor = connect.cursor()
    timerange_start, timerange_end = [arrow.get(bound).timestamp
                                      for bound in timerange]
    with open(f_train_set, 'w') as fout:
        fout.write('user_id,item_id,tag\n')
        sql = 'select distinct user_id, item_id from train_user where time>%s and time<=%s;' % (
            timerange_start, timerange_end)
        logger.debug('sql: %s', sql)
        cursor.execute(sql)
        pairs = cursor.fetchall()
        logger.debug('start generate test set')
        # every pair gets the placeholder tag -1
        fout.writelines('%s,%s,%s\n' % (user_id, item_id, -1)
                        for (user_id, item_id) in pairs)
        logger.debug('success generate test set')
    cursor.close()
def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """Predict a tag for every sample and store it next to its ids.

    The feature vectors are loaded with ``generate_X_y_arrays`` (defined
    elsewhere in this module), classified with ``clf.predict`` and written
    to ``f_predict_out``: a header line followed by every data line of
    ``f_predict_id_set`` with the predicted tag appended.

    Fixes over the previous version: the header line is terminated with a
    newline (it used to run into the first data row), surplus id lines no
    longer cause an IndexError, and a size mismatch is logged before the
    AssertionError is raised (the log call used to be unreachable).

    Args:
        clf: fitted classifier exposing ``predict``
        f_predict_vect: input path, feature vectors to predict
        f_predict_id_set: input path, user_id/item_id rows matching the
            vectors (header line skipped)
        f_predict_out: output path for the prediction result

    Returns:
        f_predict_out: the output path

    Raises:
        AssertionError: if the number of id rows differs from the number
            of predictions.
    """
    predict_X, predict_y = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            # keep counting surplus rows so the mismatch can be reported
            counter += 1

    if counter != n_predictions:
        message = 'predict result size:%s, but uid_iid_set size:%s' % (
            n_predictions, counter)
        logger.error(message)
        # raised explicitly so the check also holds under `python -O`
        raise AssertionError(message)
    logger.info('predict success, generate predict result in %s' % (f_predict_out))
    return f_predict_out
def generate_predict_result(f_predict='%s/predict_set/predict_result.csv' % (data_path),
                            f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
                            f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """Generate the prediction result file.

    The feature vectors are loaded with ``generate_X_y_arrays`` and
    classified with ``clf.predict``; ``clf`` is not a parameter and is
    looked up in the enclosing module scope. The output is a header line
    followed by every data line of ``f_uid_iid_set`` with the predicted
    tag appended. A mismatch between the number of id rows and the number
    of predictions is logged as an error.

    Fixes over the previous version: the header line is terminated with a
    newline (it used to run into the first data row) and surplus id lines
    are counted and reported instead of raising IndexError.

    Args:
        f_predict: string, output path for the prediction result
        f_vec_set: string, path of the file holding the vectors to predict
        f_uid_iid_set: string, path of the file holding the user_id/item_id
            rows matching the vectors (header line skipped)

    Returns:
        None
    """
    predict_X, predict_y = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            # keep counting surplus rows so the mismatch can be reported
            counter += 1

    if counter != n_predictions:
        logger.error('predict result size:%s, but uid_iid_set size:%s' %
                     (n_predictions, counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))
def dump_train_item(csv_path, db_address):
    """
    Store the rows of train_item.csv in MongoDB.

    The first line of the CSV supplies the field names. Every following
    line becomes one document of collection ``TianchiData.train_item_new``
    mapping each field name to the raw string found in that column. The
    connection is opened on port 27017.

    The CSV file is now closed through a ``with`` block and the connection
    is released in a ``finally`` clause, so neither leaks if an insert or a
    short row raises.

    :param csv_path: path of the csv file
    :param db_address: address of the MongoDB server
    :return: None
    """
    logger.info('dump_train_item start')
    with open(csv_path) as csvfile:
        title = csvfile.readline().replace('\n', '').split(',')
        print(title)
        conn = pymongo.Connection(db_address, 27017)
        try:
            train_item_db = conn.TianchiData.train_item_new
            count = 0
            for line in csvfile:
                temp = line.replace('\n', '').split(',')
                # a row with fewer columns than the header still raises
                # IndexError, exactly as before
                data = dict((name, temp[i]) for i, name in enumerate(title))
                train_item_db.insert(data)
                count += 1
                if count % 10000 == 0:
                    logger.debug('%s inserted' % count)
        finally:
            conn.disconnect()
    logger.info('dump_train_item done')
    print('处理完毕')
def add_time_affect(connect, time_atten=3600 * 48,
                    fout_name='%s/UserCF_result.json' % (data_path),
                    fin_name='%s/split_json_item_rates.json' % (data_path)):
    """Scale existing recommendation scores by a time-decay factor.

    Every line of ``fin_name`` is a JSON record with a ``user_id`` and an
    ``items`` mapping of item id to score. For each (user, item) pair all
    ``time`` values of table ``train_user`` are fetched and the score is
    multiplied by the sum of ``exp((time - predict_time) / time_atten)``
    over them, with ``predict_time`` fixed to 2014-12-19. A pair without
    any record therefore ends up with a score of zero. The updated records
    are written to ``fout_name``, one JSON object per line.

    Relies on the module-level names ``arrow``, ``json``, ``exp`` and
    ``logger``.

    Args:
        connect: Mysqldb.connect(), database connection handle
        time_atten: int, decay constant a (seconds) of exp(delta_t / a)
        fout_name: string, path of the result file
        fin_name: string, path of the score file without the time factor

    Returns:
        None
    """
    # ids come from a file, so pass them as parameters instead of formatting
    sql = 'select time from train_user where user_id=%s and item_id=%s;'
    # float() forces true division on Python 2; with integers the exponent
    # would be floored to whole multiples of time_atten
    decay = float(time_atten)
    predict_timestamp = arrow.get('2014-12-19').timestamp
    counter = 0
    cursor = connect.cursor()
    try:
        with open(fin_name, 'r') as fin, open(fout_name, 'w') as fout:
            for line in fin:
                record = json.loads(line.strip())
                user_id = record['user_id']
                # iterate over a snapshot of the keys; only values are updated
                for item_id in list(record['items']):
                    cursor.execute(sql, (user_id, item_id))
                    time_weight = 0  # accumulated time factor
                    for [timestamp] in cursor.fetchall():
                        time_weight += exp(float(timestamp - predict_timestamp) / decay)
                    record['items'][item_id] *= time_weight
                counter += 1
                logger.debug('user_id=%s done, %s/10000' % (user_id, counter))
                fout.write('%s\n' % (json.dumps(record)))
    finally:
        # release the cursor even if a query or a malformed record raises
        cursor.close()
def get_popularity_in_category(self, item_id):
    """
    Return how popular an item is inside its own category, with caching.

    The popularity is the number of purchase records (behavior_type '4') of
    the item before ``self.stoptime`` divided by the number of purchase
    records of its category before that time. 0.0 is returned when the item
    is unknown to ``self.train_user``, was never bought, or its category
    has no purchase.

    Results are memoised in ``self.result_dict`` and per-category totals in
    ``self.category_bought_dict``. Unknown and never-bought items are now
    cached as well, so repeated lookups no longer hit the database, and an
    unknown item no longer raises TypeError.

    :param item_id: item id
    :return: float, share of the category's purchases that belong to the item
    """
    if item_id in self.result_dict:
        return self.result_dict[item_id]

    popularity_in_category = 0.0
    current_item = self.train_user.find_one({'item_id': item_id})
    if current_item is not None:
        item_bought_count = self.train_user.find({
            'item_id': item_id,
            'behavior_type': '4',
            'time': {'$lt': self.stoptime}
        }).count()
        if item_bought_count != 0:
            category_id = current_item['item_category']
            if category_id in self.category_bought_dict:
                bought_max_count = self.category_bought_dict[category_id]
            else:
                bought_max_count = self.train_user.find({
                    'item_category': category_id,
                    'behavior_type': '4',
                    'time': {'$lt': self.stoptime}
                }).count()
                self.category_bought_dict[category_id] = bought_max_count
            if bought_max_count != 0:
                popularity_in_category = float(item_bought_count) / bought_max_count

    logger.debug('item %s popularity_in_category = %s',
                 item_id, popularity_in_category)
    self.result_dict[item_id] = popularity_in_category
    return popularity_in_category
def add_time_affect(connect, time_atten=3600*48,
                    fout_name='%s/UserCF_result.json' % (data_path),
                    fin_name='%s/split_json_item_rates.json' % (data_path)):
    """Scale existing recommendation scores by a time-decay factor.

    Every line of ``fin_name`` is a JSON record with a ``user_id`` and an
    ``items`` mapping of item id to score. For each (user, item) pair all
    ``time`` values of table ``train_user`` are fetched and the score is
    multiplied by the sum of ``exp((time - predict_time) / time_atten)``
    over them, with ``predict_time`` fixed to 2014-12-19. A pair without
    any record therefore ends up with a score of zero. The updated records
    are written to ``fout_name``, one JSON object per line.

    Relies on the module-level names ``arrow``, ``json``, ``exp`` and
    ``logger``.

    Args:
        connect: Mysqldb.connect(), database connection handle
        time_atten: int, decay constant a (seconds) of exp(delta_t / a)
        fout_name: string, path of the result file
        fin_name: string, path of the score file without the time factor

    Returns:
        None
    """
    # ids come from a file, so pass them as parameters instead of formatting
    sql = 'select time from train_user where user_id=%s and item_id=%s;'
    # float() forces true division on Python 2; with integers the exponent
    # would be floored to whole multiples of time_atten
    decay = float(time_atten)
    predict_timestamp = arrow.get('2014-12-19').timestamp
    counter = 0
    cursor = connect.cursor()
    try:
        with open(fin_name, 'r') as fin, open(fout_name, 'w') as fout:
            for line in fin:
                record = json.loads(line.strip())
                user_id = record['user_id']
                # iterate over a snapshot of the keys; only values are updated
                for item_id in list(record['items']):
                    cursor.execute(sql, (user_id, item_id))
                    time_weight = 0  # accumulated time factor
                    for [timestamp] in cursor.fetchall():
                        time_weight += exp(float(timestamp - predict_timestamp) / decay)
                    record['items'][item_id] *= time_weight
                counter += 1
                logger.debug('user_id=%s done, %s/10000' % (user_id, counter))
                fout.write('%s\n' % (json.dumps(record)))
    finally:
        # release the cursor even if a query or a malformed record raises
        cursor.close()
def generate_from_popularity_in_category(f_recommend, stoptime_str, train_user_connect):
    """Pick the final recommendations from in-category popularity.

    Every (user_id, item_id) row of ``f_recommend`` is scored with
    ``cal_popularity_in_category``. The output consists of the top 25% of
    pairs by that score plus a random sample of the same size drawn from
    the remaining 75%.

    The per-item cache was previously never filled, so every row triggered
    fresh database queries even for items already scored; each item is now
    scored only once.

    Args:
        f_recommend: input path, recommendation file (header line skipped,
            user_id in column 0, item_id in column 1)
        stoptime_str: string, prediction date
        train_user_connect: MongoDB connection to train_user

    Returns:
        f_output: output path, the input path with '.csv' replaced by
        '_categoryPopularity.csv'
    """
    import random

    category_popularity_item = dict()  # key: item_id, value: in-category popularity
    category_popularity = dict()  # key: (user_id, item_id), value: in-category popularity
    with open(f_recommend, 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            user_id, item_id = cols[0], cols[1]
            if item_id not in category_popularity_item:
                category_popularity_item[item_id] = cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect)
            category_popularity[(user_id, item_id)] = category_popularity_item[item_id]

    # final result: the top 25% by popularity plus a random 25%
    sorted_result = sorted(category_popularity.items(),
                           key=lambda d: d[1], reverse=True)
    index_slice = int(0.25 * len(sorted_result))
    front_25 = sorted_result[:index_slice]
    last_75 = sorted_result[index_slice:]
    random.shuffle(last_75)
    random_25 = last_75[:index_slice]

    # write out the final result
    f_output = f_recommend.replace('.csv', '_categoryPopularity.csv')
    logger.debug('Start store result to %s' % (f_output))
    with open(f_output, 'w') as fout:
        fout.write('user_id,item_id\n')
        for elem in front_25:
            logger.debug('front 25, popularity:%s' % (elem[1]))
            fout.write('%s,%s\n' % (elem[0][0], elem[0][1]))
        for elem in random_25:
            logger.debug('random 25, popularity:%s' % (elem[1]))
            fout.write('%s,%s\n' % (elem[0][0], elem[0][1]))
    return f_output
def generate_from_popularity_in_category(f_recommend, stoptime_str, train_user_connect):
    """Pick the final recommendations from in-category popularity.

    Every (user_id, item_id) row of ``f_recommend`` is scored with
    ``cal_popularity_in_category``. The output consists of the top 25% of
    pairs by that score plus a random sample of the same size drawn from
    the remaining 75%.

    The per-item cache was previously never filled, so every row triggered
    fresh database queries even for items already scored; each item is now
    scored only once.

    Args:
        f_recommend: input path, recommendation file (header line skipped,
            user_id in column 0, item_id in column 1)
        stoptime_str: string, prediction date
        train_user_connect: MongoDB connection to train_user

    Returns:
        f_output: output path, the input path with '.csv' replaced by
        '_categoryPopularity.csv'
    """
    import random

    category_popularity_item = dict()  # key: item_id, value: in-category popularity
    category_popularity = dict()  # key: (user_id, item_id), value: in-category popularity
    with open(f_recommend, 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            user_id, item_id = cols[0], cols[1]
            if item_id not in category_popularity_item:
                category_popularity_item[item_id] = cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect)
            category_popularity[(user_id, item_id)] = category_popularity_item[item_id]

    # final result: the top 25% by popularity plus a random 25%
    sorted_result = sorted(category_popularity.items(),
                           key=lambda d: d[1], reverse=True)
    index_slice = int(0.25 * len(sorted_result))
    front_25 = sorted_result[:index_slice]
    last_75 = sorted_result[index_slice:]
    random.shuffle(last_75)
    random_25 = last_75[:index_slice]

    # write out the final result
    f_output = f_recommend.replace('.csv', '_categoryPopularity.csv')
    logger.debug('Start store result to %s' % (f_output))
    with open(f_output, 'w') as fout:
        fout.write('user_id,item_id\n')
        for elem in front_25:
            logger.debug('front 25, popularity:%s' % (elem[1]))
            fout.write('%s,%s\n' % (elem[0][0], elem[0][1]))
        for elem in random_25:
            logger.debug('random 25, popularity:%s' % (elem[1]))
            fout.write('%s,%s\n' % (elem[0][0], elem[0][1]))
    return f_output
def get_popularity_in_category(self, item_id):
    """
    Return how popular an item is inside its own category, with caching.

    The popularity is the number of purchase records (behavior_type '4') of
    the item before ``self.stoptime`` divided by the number of purchase
    records of its category before that time. 0.0 is returned when the item
    is unknown to ``self.train_user``, was never bought, or its category
    has no purchase.

    Results are memoised in ``self.result_dict`` and per-category totals in
    ``self.category_bought_dict``. Unknown and never-bought items are now
    cached as well, so repeated lookups no longer hit the database, and an
    unknown item no longer raises TypeError.

    :param item_id: item id
    :return: float, share of the category's purchases that belong to the item
    """
    if item_id in self.result_dict:
        return self.result_dict[item_id]

    popularity_in_category = 0.0
    current_item = self.train_user.find_one({'item_id': item_id})
    if current_item is not None:
        item_bought_count = self.train_user.find({
            'item_id': item_id,
            'behavior_type': '4',
            'time': {'$lt': self.stoptime}
        }).count()
        if item_bought_count != 0:
            category_id = current_item['item_category']
            if category_id in self.category_bought_dict:
                bought_max_count = self.category_bought_dict[category_id]
            else:
                bought_max_count = self.train_user.find({
                    'item_category': category_id,
                    'behavior_type': '4',
                    'time': {'$lt': self.stoptime}
                }).count()
                self.category_bought_dict[category_id] = bought_max_count
            if bought_max_count != 0:
                popularity_in_category = float(item_bought_count) / bought_max_count

    logger.debug('item %s popularity_in_category = %s',
                 item_id, popularity_in_category)
    self.result_dict[item_id] = popularity_in_category
    return popularity_in_category
def generate_test_set(connect, timerange, f_train_set='%s/test_set.csv' % (data_path)):
    """Build the test set, i.e. the file used to check the predictions.

    Selects every distinct (user_id, item_id) pair of table ``train_user``
    with ``behavior_type=4`` whose time lies in ``timerange`` (start
    exclusive, end inclusive) and writes one pair per line.

    Args:
        connect: Mysqldb.connect(), database connection handle
        timerange: tuple, time filter of the test set, (start, end)
        f_train_set: string, path of the result file, written as
            ---- content ----
            | user_id,item_id |
            -----------------

    Returns:
        None
    """
    import arrow

    cursor = connect.cursor()
    timerange_start, timerange_end = [arrow.get(bound).timestamp
                                      for bound in timerange]
    counter = 0
    with open(f_train_set, 'w') as fout:
        fout.write('user_id,item_id\n')
        sql = 'select distinct user_id, item_id from train_user where behavior_type=4 and time>%s and time<=%s;' % (
            timerange_start, timerange_end)
        logger.debug('sql: %s', sql)
        cursor.execute(sql)
        bought_pairs = cursor.fetchall()
        logger.debug('start generate test set')
        for counter, (user_id, item_id) in enumerate(bought_pairs, 1):
            fout.write('%s,%s\n' % (user_id, item_id))
        logger.debug('success generate test set, and size=%s.', counter)
        logger.info('Result store in: %s', f_train_set)
    cursor.close()
def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """Predict a tag for every sample and store it next to its ids.

    The feature vectors are loaded with ``generate_X_y_arrays`` (defined
    elsewhere in this module), classified with ``clf.predict`` and written
    to ``f_predict_out``: a header line followed by every data line of
    ``f_predict_id_set`` with the predicted tag appended.

    Fixes over the previous version: the header line is terminated with a
    newline (it used to run into the first data row), surplus id lines no
    longer cause an IndexError, and a size mismatch is logged before the
    AssertionError is raised (the log call used to be unreachable).

    Args:
        clf: fitted classifier exposing ``predict``
        f_predict_vect: input path, feature vectors to predict
        f_predict_id_set: input path, user_id/item_id rows matching the
            vectors (header line skipped)
        f_predict_out: output path for the prediction result

    Returns:
        f_predict_out: the output path

    Raises:
        AssertionError: if the number of id rows differs from the number
            of predictions.
    """
    predict_X, predict_y = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            # keep counting surplus rows so the mismatch can be reported
            counter += 1

    if counter != n_predictions:
        message = 'predict result size:%s, but uid_iid_set size:%s' % (
            n_predictions, counter)
        logger.error(message)
        # raised explicitly so the check also holds under `python -O`
        raise AssertionError(message)
    logger.info('predict success, generate predict result in %s' % (f_predict_out))
    return f_predict_out
def generate_predict_result(
        f_predict='%s/predict_set/predict_result.csv' % (data_path),
        f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
        f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """Generate the prediction result file.

    The feature vectors are loaded with ``generate_X_y_arrays`` and
    classified with ``clf.predict``; ``clf`` is not a parameter and is
    looked up in the enclosing module scope. The output is a header line
    followed by every data line of ``f_uid_iid_set`` with the predicted
    tag appended. A mismatch between the number of id rows and the number
    of predictions is logged as an error.

    Fixes over the previous version: the header line is terminated with a
    newline (it used to run into the first data row) and surplus id lines
    are counted and reported instead of raising IndexError.

    Args:
        f_predict: string, output path for the prediction result
        f_vec_set: string, path of the file holding the vectors to predict
        f_uid_iid_set: string, path of the file holding the user_id/item_id
            rows matching the vectors (header line skipped)

    Returns:
        None
    """
    predict_X, predict_y = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            # keep counting surplus rows so the mismatch can be reported
            counter += 1

    if counter != n_predictions:
        logger.error('predict result size:%s, but uid_iid_set size:%s' %
                     (n_predictions, counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))
def classifier_comparison(X, y):
    """Run a fixed line-up of classifiers on the same data for comparison.

    Each candidate is passed to ``train_classifier(clf, X, y)``, a helper
    defined elsewhere in this module. The line-up covers several SVM
    variants (including an exhaustive and a randomized hyper-parameter
    search over an RBF SVM), a decision tree, a random forest, AdaBoost,
    Gaussian naive Bayes, LDA and QDA. Afterwards logistic regression is
    trained with L1 and L2 penalties for a range of C values and the
    coefficient matrix of the object returned by ``train_classifier`` is
    logged.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    from sklearn.linear_model import LogisticRegression
    # A bare `import scipy` does not load the `stats` subpackage by itself;
    # importing it explicitly makes `scipy.stats.expon` reliably available.
    import scipy.stats

    # Exhaustive grid search
    exhaustive_parameters = {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [1e-3, 1e-4]
    }
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # Randomized parameter optimization
    randomized_parameter = {
        'kernel': ['rbf'],
        'C': scipy.stats.expon(scale=100),
        'gamma': scipy.stats.expon(scale=.1)
    }
    clf_SVC_randomized = grid_search.RandomizedSearchCV(
        SVC(), randomized_parameter)

    # name and estimator are kept side by side so they cannot drift apart
    candidates = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", clf_SVC_exhaustive),
        ("RBF SVM with Random Grid Search", clf_SVC_randomized),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10,
                                                 max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]
    for name, clf in candidates:
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)

    # Logistic regression: L1 first, then L2, for every C
    for C in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
        for penalty in ('l1', 'l2'):
            logger.info('Use LR with %s penalty, C=%s:' % (penalty, C))
            clf = LogisticRegression(C=C, penalty=penalty, tol=0.01)
            clf = train_classifier(clf, X, y)
            logger.debug('coef matrix: %s' % (clf.coef_))
def generate_train_set(connect, positive_set_timerange, negative_set_timerange,
                       f_train_set='%s/train_set.csv' % (data_path)):
    """Build the training set.

    Positive samples (tag=1) are all distinct (user_id, item_id) pairs of
    table ``train_user`` with ``behavior_type=4`` inside the positive time
    range. Negative samples (tag=-1) are drawn by picking random
    ``record_id`` values between the record ids of the earliest and the
    latest row of the negative time range, until as many negatives as
    positives have been written; pairs that are positive samples are
    rejected. ``order by rand()`` was too slow, hence this two-step
    sampling.

    NOTE: the sampling loop only ends once enough negatives were found, and
    the same pair may be drawn more than once.

    Changes over the previous version: the two record-id bounds are ordered
    before being handed to ``randint`` (the earliest row by time is not
    guaranteed to carry the smaller record_id, in which case ``randint``
    raised ValueError), the cursor is released in a ``finally`` clause, and
    an obsolete implementation kept in a string literal was removed.

    Args:
        connect: Mysqldb.connect(), database connection handle
        positive_set_timerange: tuple, time filter of the positive samples,
            (start, end), e.g. ('2014-12-17', '2014-12-18')
        negative_set_timerange: tuple, time filter of the negative samples,
            (start, end), e.g. ('2014-11-17', '2014-12-17')
        f_train_set: string, path of the result file, written as
            ------ content ------
            | user_id,item_id,tag |
            ---------------------

    Returns:
        None
    """
    import arrow
    from random import randint

    time2timestamp = lambda elem: arrow.get(elem).timestamp
    (positive_timestamp_start, positive_timestamp_end) = map(
        time2timestamp, positive_set_timerange)
    (negative_timestamp_start, negative_timestamp_end) = map(
        time2timestamp, negative_set_timerange)

    cursor = connect.cursor()
    try:
        with open(f_train_set, 'w') as fout:
            fout.write('user_id,item_id,tag\n')
            set_counter = 0

            # positive samples
            tag = 1
            sql = 'select distinct user_id, item_id from train_user where behavior_type=4 and time>%s and time <=%s;' % (
                positive_timestamp_start, positive_timestamp_end)
            logger.debug('positive sql: %s' % (sql))
            cursor.execute(sql)
            result = cursor.fetchall()
            positive_set = set()  # remembered so that no negative equals a positive
            logger.debug('start store positive set')
            for [user_id, item_id] in result:
                set_counter += 1
                positive_set.add((user_id, item_id))
                fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                if set_counter % 300 == 0:
                    logger.debug('[train set] positive No.%s' % (set_counter))
            logger.info('[train set] positive set DONE, num of set = %s' % (set_counter))

            # negative samples
            tag = -1
            log_counter = 0

            # Step 1: record ids of the earliest and the latest row in range
            sql_PK_min = 'select record_id from train_user where time>%s and time <=%s order by time limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_min)
            result = cursor.fetchall()
            PK_min = int(result[0][0])
            logger.debug('min Primary Key = %s' % (PK_min))
            sql_PK_max = 'select record_id from train_user where time>%s and time <=%s order by time DESC limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_max)
            result = cursor.fetchall()
            PK_max = int(result[0][0])
            logger.debug('max Primary Key = %s' % (PK_max))
            # randint needs low <= high; time order need not match id order
            pk_low, pk_high = min(PK_min, PK_max), max(PK_min, PK_max)

            # Step 2: draw random record ids until enough negatives are stored
            logger.debug('start store negtive set')
            while log_counter < set_counter:
                sql = 'select user_id, item_id from train_user where record_id=%s' % (
                    randint(pk_low, pk_high))
                cursor.execute(sql)
                result = cursor.fetchall()
                for [user_id, item_id] in result:
                    if (user_id, item_id) not in positive_set:
                        log_counter += 1
                        fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                        if log_counter % 300 == 0:
                            logger.debug('[train set] negtive No.%s' % (log_counter))
            logger.info('[train set] negtive set DONE, num of set = %s' % (log_counter))
    finally:
        cursor.close()
def filter_with_category_popularity(connect, train_user_connect, f_recommend,
                                    f_category_relationship, stoptime_str):
    """Filter recommendations by category succession and in-category popularity.

    A recommended (user, item) pair is kept when the tuple
    (user's last category before ``stoptime_str``, item's category) appears
    in the category-relationship file.  Otherwise it is kept at random,
    with probability given by ``cal_popularity_in_category`` (defined
    elsewhere in this file).  The survivors are written to a file named
    after ``f_recommend`` with ``.csv`` replaced by ``_filter.csv``.

    NOTE(review): this file defines ``filter_with_category_popularity``
    twice; a later definition with the same name replaces this one at
    import time.
    NOTE(review): categories read from the relationship CSV are strings;
    if the database returns numeric categories the membership test can
    never match -- verify the column types.

    Args:
        connect: MySQLdb.connect(), database connection handle
        train_user_connect: MongoDB connection to the train_user table
        f_recommend: path of the recommendation result file (input)
        f_category_relationship: path of the category succession file (input)
        stoptime_str: string, cut-off date

    Returns:
        f_output: path of the filtered result file
    """
    import arrow
    import random

    f_output = f_recommend.replace('.csv', '_filter.csv')
    logger.debug('Start filter recommend result..')
    stoptime_timestamp = arrow.get(stoptime_str).timestamp

    # Step 1: group recommended items by user,
    # {u_id1: [i_id1, i_id2], u_id2: [i_id3, i_id4]}.
    recommend_dict = {}
    with open(f_recommend, 'r') as fin:
        fin.readline()  # skip the header row
        for line in fin:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])
    logger.debug('完成根据推荐文件生成第一步, len:%s' % (len(recommend_dict)))

    # Step 2: attach categories.  Grouping by user first means the user's
    # last category is queried once per user instead of once per pair.
    # {(u_id, i_id): (user_last_category, item_category)}
    recommend_tuple_dict = {}
    user_counter = 0
    cursor = connect.cursor()
    try:
        for u_id, i_ids in recommend_dict.items():
            # The ids come from a file, so they are passed as query
            # parameters rather than formatted into the SQL text.
            cursor.execute(
                'SELECT item_category FROM train_user WHERE user_id=%s and time<%s ORDER BY time DESC limit 1;',
                (u_id, stoptime_timestamp))
            result = cursor.fetchall()
            user_last_category = result[0][0]
            user_counter += 1
            if user_counter % 200 == 0:
                logger.debug('No.%s user, user_id=%s, last_item_category=%s'
                             % (user_counter, u_id, user_last_category))
            for i_id in i_ids:
                cursor.execute(
                    'SELECT item_category FROM train_item WHERE item_id=%s;',
                    (i_id,))
                result = cursor.fetchall()
                recommend_tuple_dict[(u_id, i_id)] = (user_last_category,
                                                      result[0][0])
    finally:
        cursor.close()
    logger.debug('原推荐结果长度:%s' % (len(recommend_tuple_dict)))

    # Load the (source_category, target_category) succession pairs.
    relationship_set = set()
    with open(f_category_relationship, 'r') as fin:
        fin.readline()  # skip the header row
        for line in fin:
            cols = line.strip().split(',')
            relationship_set.add((cols[0], cols[1]))
    logger.debug('承接关系结果长度:%s' % (len(relationship_set)))

    # Write the filtered recommendations.
    in_counter = 0
    random_counter = 0
    with open(f_output, 'w') as fout:
        fout.write('user_id,item_id\n')
        for (user_id, item_id), category_tuple in recommend_tuple_dict.items():
            if category_tuple in relationship_set:
                in_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
            elif random.random() <= cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect):
                random_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
                logger.debug('NO.%s random pick, [%s,%s]' %
                             (random_counter, user_id, item_id))

    # The original formatted this message with an empty tuple, which raises
    # TypeError; report the output path as the message intends.
    logger.info('对推荐结果的筛选完成,结果路径:%s' % (f_output))
    logger.info('筛选前%s, 筛选后%s. 其中在承接关系的有%s, 随机挑选的有%s' %
                (len(recommend_tuple_dict), in_counter + random_counter,
                 in_counter, random_counter))
    return f_output
def find_category_relationship(
        train_user_connect,
        train_item_connect,
        json_output_path=r'%s/relationData_new.json' % data_path,
        csv_output_path=r'%s/relationData_new.csv' % data_path,
        time_window=1):
    """Count category-to-category succession links from purchase histories.

    For every user, the purchases (behavior_type '4') are read in ascending
    time order.  For each purchase, every later purchase of a *different*
    category whose time difference in whole days is at most ``time_window``
    adds one to the link count source_category -> target_category.  The
    counts are written as JSON to ``json_output_path`` and as
    ``source_category,target_category,link_count`` rows to
    ``csv_output_path``.

    NOTE(review): this file defines ``find_category_relationship`` twice; a
    later definition with the same name replaces this one at import time.
    NOTE(review): the ``.days`` access assumes the stored 'time' values
    subtract to a timedelta (e.g. datetime objects) -- confirm.

    :param train_user_connect: MongoDB collection holding user behaviours
    :param train_item_connect: not used by this function
    :param json_output_path: path of the JSON output file
    :param csv_output_path: path of the CSV output file
    :param time_window: maximum gap in days between two linked purchases
    :return: None
    """
    import pymongo
    import json

    logger.info('find_category_relationship start!')
    userids = train_user_connect.distinct('user_id')
    logger.debug('userids loaded!')

    relationDict = {}  # {source_category: {target_category: link_count}}
    for usercount, user_id in enumerate(userids, 1):
        # All purchases of this user, oldest first.
        user_buy_behaviors = train_user_connect.find({
            'user_id': user_id,
            'behavior_type': '4'
        }).sort('time', pymongo.ASCENDING)
        # (category id, behaviour time) tuples
        categoryList = [(buy_behavior['item_category'], buy_behavior['time'])
                        for buy_behavior in user_buy_behaviors]

        len_category = len(categoryList)
        for i in range(len_category):
            currentCategory, current_time = categoryList[i]
            # Reuse (and mutate in place) the counts gathered so far.
            targetCategoryDict = relationDict.get(currentCategory, {})
            # Starting at i + 1 is equivalent to starting at i: an entry
            # never links to itself because the categories are equal.
            for j in range(i + 1, len_category):
                targetCategory, target_time = categoryList[j]
                if (target_time - current_time).days > time_window:
                    # The list is time-sorted, so every later purchase is
                    # outside the window as well.
                    break
                if targetCategory != currentCategory:
                    targetCategoryDict[targetCategory] = (
                        targetCategoryDict.get(targetCategory, 0) + 1)
            # Only categories with at least one link are recorded.
            if targetCategoryDict:
                relationDict[currentCategory] = targetCategoryDict

        if usercount % 1000 == 0:
            logger.debug('No.%s user done, user_index:%s\tlen_category = %s' %
                         (usercount, usercount, len_category))

    # Context managers make sure both files are flushed and closed.
    with open(json_output_path, 'w') as output:
        output.write(json.dumps(relationDict))

    # Store the same dict as CSV rows.
    with open(csv_output_path, 'w') as csvout:
        csvout.write('source_category,target_category,link_count\n')
        for source, targets in relationDict.items():
            for target, link_count in targets.items():
                csvout.write('%s,%s,%s\n' % (source, target, link_count))

    logger.info(
        'find_category_relationship done, json_output_path=%s\tcsv_output_path=%s'
        % (json_output_path, csv_output_path))
def filter_with_mongodb(train_user_connect,
                        f_recommend_path,
                        f_category_relation_path,
                        fout_path='%s/result/filter_output.csv' % data_path,
                        time_range=('2014-12-04', '2014-12-05')):
    """Filter recommendations using MongoDB (category succession + popularity).

    A recommended (user, item) pair is kept when the item's category is
    listed as a successor of the category of the user's most recent
    purchase inside ``time_range``.  Otherwise it is kept at random with
    probability ``1 / (1 + e ** (popularity + 1))``, where ``popularity``
    comes from ``PopularityInCategory.get_popularity_in_category`` (defined
    elsewhere in this file).  Kept pairs are written to ``fout_path``.

    NOTE(review): this file defines ``filter_with_mongodb`` twice; a later
    definition with the same name replaces this one at import time.

    :param train_user_connect: MongoDB connection to train_user
    :param f_recommend_path: path of the recommendation CSV
    :param f_category_relation_path: path of the reduced category-relation CSV
    :param fout_path: path the filtered result is written to
    :param time_range: (start, end) range that counts as "recently bought"
    :return: list of the kept (user_id, item_id) tuples
    """
    import pymongo
    import random
    import math

    logger.info('start filter_with_mongoDB')
    popularity_calculator = PopularityInCategory(train_user_connect,
                                                 time_range[1])

    # Recommendation list: {user_id: [item_id, ...]}
    recommend_dict = {}
    with open(f_recommend_path) as f_recommend:
        f_recommend.readline()  # skip the header row
        for line in f_recommend:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])

    # Category relations: {source_category: [target_category, ...]}
    category_dict = {}
    with open(f_category_relation_path) as f_category_relation:
        f_category_relation.readline()  # skip the header row
        for line in f_category_relation:
            cols = line.strip().split(',')
            category_dict.setdefault(cols[0], []).append(cols[1])

    logger.debug('start filtering!')
    result = []
    old_count = 0
    new_count = 0
    # Both lookups are keyed by item id only, so the caches are shared
    # across users instead of being rebuilt for every user.
    item_category_dict = {}
    item_popularity_in_category_dict = {}
    with open(fout_path, 'w') as fout:
        fout.write('user_id,item_id\n')
        for usercount, u_id in enumerate(recommend_dict, 1):
            logger.debug(usercount)
            # Most recent purchase of this user inside the time range.
            last_buy_cursor = train_user_connect.find({
                'user_id': u_id,
                'behavior_type': '4',
                'time': {
                    '$gt': time_range[0],
                    '$lt': time_range[1]
                }
            }).sort('time', pymongo.DESCENDING)
            last_buy = next(last_buy_cursor, None)

            for i_id in recommend_dict[u_id]:
                old_count += 1
                if i_id in item_category_dict:
                    temp_category = item_category_dict[i_id]
                else:
                    temp = train_user_connect.find_one({'item_id': i_id})
                    temp_category = temp['item_category']
                    item_category_dict[i_id] = temp_category

                # A last-bought category with no entry in the relation file
                # has no successors (the original raised KeyError here).
                if last_buy and temp_category in category_dict.get(
                        last_buy['item_category'], ()):
                    logger.debug('有承接关系 u_id=%s,i_id=%s' % (u_id, i_id))
                    fout.write('%s,%s\n' % (u_id, i_id))
                    result.append((u_id, i_id))
                    new_count += 1
                else:
                    # No succession link: keep the pair at random.
                    if i_id in item_popularity_in_category_dict:
                        item_popularity = item_popularity_in_category_dict[
                            i_id]
                    else:
                        item_popularity = (
                            popularity_calculator.get_popularity_in_category(
                                i_id))
                        item_popularity_in_category_dict[i_id] = (
                            item_popularity)
                    # Map the raw popularity to a keep-probability.
                    item_popularity = 1 / (1 + math.e**(item_popularity + 1))
                    logger.debug('random prob = %s' % item_popularity)
                    if random.random() <= item_popularity:
                        fout.write('%s,%s\n' % (u_id, i_id))
                        result.append((u_id, i_id))
                        new_count += 1
                if old_count % 10 == 0:  # progress sentinel
                    logger.debug('No.%s origin recommend filtered' %
                                 old_count)

    logger.info(
        'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s'
        % (old_count, new_count, fout_path))
    return result
def find_category_relationship(train_user_connect,
                               train_item_connect,
                               json_output_path=r'%s/relationData_new.json' % data_path,
                               csv_output_path=r'%s/relationData_new.csv' % data_path,
                               time_window=1):
    """Count category-to-category succession links from purchase histories.

    For every user, the purchases (behavior_type '4') are read in ascending
    time order.  For each purchase, every later purchase of a *different*
    category whose time difference in whole days is at most ``time_window``
    adds one to the link count source_category -> target_category.  The
    counts are written as JSON to ``json_output_path`` and as
    ``source_category,target_category,link_count`` rows to
    ``csv_output_path``.

    NOTE(review): an earlier definition of ``find_category_relationship``
    exists in this file; being later, this one replaces it at import time.
    NOTE(review): the ``.days`` access assumes the stored 'time' values
    subtract to a timedelta (e.g. datetime objects) -- confirm.

    :param train_user_connect: MongoDB collection holding user behaviours
    :param train_item_connect: not used by this function
    :param json_output_path: path of the JSON output file
    :param csv_output_path: path of the CSV output file
    :param time_window: maximum gap in days between two linked purchases
    :return: None
    """
    import pymongo
    import json

    logger.info('find_category_relationship start!')
    userids = train_user_connect.distinct('user_id')
    logger.debug('userids loaded!')

    relationDict = {}  # {source_category: {target_category: link_count}}
    for usercount, user_id in enumerate(userids, 1):
        # All purchases of this user, oldest first.
        user_buy_behaviors = train_user_connect.find(
            {'user_id': user_id, 'behavior_type': '4'}
        ).sort('time', pymongo.ASCENDING)
        # (category id, behaviour time) tuples
        categoryList = [(buy_behavior['item_category'], buy_behavior['time'])
                        for buy_behavior in user_buy_behaviors]

        len_category = len(categoryList)
        for i in range(len_category):
            currentCategory, current_time = categoryList[i]
            # Reuse (and mutate in place) the counts gathered so far.
            targetCategoryDict = relationDict.get(currentCategory, {})
            # Starting at i + 1 is equivalent to starting at i: an entry
            # never links to itself because the categories are equal.
            for j in range(i + 1, len_category):
                targetCategory, target_time = categoryList[j]
                if (target_time - current_time).days > time_window:
                    # The list is time-sorted, so every later purchase is
                    # outside the window as well.
                    break
                if targetCategory != currentCategory:
                    targetCategoryDict[targetCategory] = (
                        targetCategoryDict.get(targetCategory, 0) + 1)
            # Only categories with at least one link are recorded.
            if targetCategoryDict:
                relationDict[currentCategory] = targetCategoryDict

        if usercount % 1000 == 0:
            logger.debug('No.%s user done, user_index:%s\tlen_category = %s' %
                         (usercount, usercount, len_category))

    # Context managers make sure both files are flushed and closed.
    with open(json_output_path, 'w') as output:
        output.write(json.dumps(relationDict))

    # Store the same dict as CSV rows.
    with open(csv_output_path, 'w') as csvout:
        csvout.write('source_category,target_category,link_count\n')
        for source, targets in relationDict.items():
            for target, link_count in targets.items():
                csvout.write('%s,%s,%s\n' % (source, target, link_count))

    logger.info('find_category_relationship done, json_output_path=%s\tcsv_output_path=%s' % (
        json_output_path, csv_output_path))
def generate_train_set(connect, positive_set_timerange, negative_set_timerange,
                       f_train_set='%s/train_set.csv' % (data_path)):
    """Build the training-set CSV of (user_id, item_id, tag) rows.

    Positive samples (tag=1) are the distinct (user_id, item_id) pairs with
    behavior_type=4 inside ``positive_set_timerange``.  Negative samples
    (tag=-1) are drawn by picking random ``record_id`` values between the
    ids of the earliest and latest records inside
    ``negative_set_timerange`` until as many negatives as positives have
    been written; pairs that are already positive are skipped.

    NOTE(review): an earlier definition of ``generate_train_set`` exists in
    this file; being later, this one replaces it at import time.
    NOTE(review): the same negative pair can be written more than once, and
    the sampling loop does not terminate if it can never find enough
    non-positive records -- confirm this is acceptable for the data.

    Args:
        connect: Mysqldb.connect(), database connection handle
        positive_set_timerange: tuple, time filter for positive samples,
            (start, end)  e.g. ('2014-12-17', '2014-12-18')
        negative_set_timerange: tuple, time filter for negative samples,
            (start, end)  e.g. ('2014-11-17', '2014-12-17')
        f_train_set: string, path of the training-set output file
            ------ content ------
            | user_id,item_id,tag |
            ---------------------

    Returns:
        None
    """
    import arrow
    from random import randint

    def time2timestamp(elem):
        return arrow.get(elem).timestamp

    positive_timestamp_start, positive_timestamp_end = [
        time2timestamp(t) for t in positive_set_timerange]
    negative_timestamp_start, negative_timestamp_end = [
        time2timestamp(t) for t in negative_set_timerange]

    cursor = connect.cursor()
    try:
        with open(f_train_set, 'w') as fout:
            fout.write('user_id,item_id,tag\n')

            # ---- positive samples ----
            set_counter = 0
            tag = 1
            sql = 'select distinct user_id, item_id from train_user where behavior_type=4 and time>%s and time <=%s;' % (
                positive_timestamp_start, positive_timestamp_end)
            logger.debug('positive sql: %s' % (sql))
            cursor.execute(sql)
            result = cursor.fetchall()
            # Remember the positives so a negative never duplicates one.
            positive_set = set()
            logger.debug('start store positive set')
            for user_id, item_id in result:
                set_counter += 1
                positive_set.add((user_id, item_id))
                fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                if set_counter % 300 == 0:
                    logger.debug('[train set] positive No.%s' %
                                 (set_counter))
            logger.info('[train set] positive set DONE, num of set = %s' %
                        (set_counter))

            # ---- negative samples ----
            # "order by rand()" is too slow, so sampling is done in two
            # steps: find the record_id range, then probe random ids.
            tag = -1
            log_counter = 0

            # Step 1: record ids of the earliest and latest rows in range.
            sql_PK_min = 'select record_id from train_user where time>%s and time <=%s order by time limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_min)
            result = cursor.fetchall()
            pk_first = int(result[0][0])

            sql_PK_max = 'select record_id from train_user where time>%s and time <=%s order by time DESC limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_max)
            result = cursor.fetchall()
            pk_last = int(result[0][0])

            # The rows are picked by time, not by id; order the two ids so
            # randint() always receives a valid (low, high) range.
            PK_min, PK_max = sorted((pk_first, pk_last))
            logger.debug('min Primary Key = %s' % (PK_min))
            logger.debug('max Primary Key = %s' % (PK_max))

            # Step 2: probe random ids until negatives match positives.
            logger.debug('start store negtive set')
            while log_counter < set_counter:
                sql = 'select user_id, item_id from train_user where record_id=%s' % (
                    randint(PK_min, PK_max))
                cursor.execute(sql)
                result = cursor.fetchall()
                for user_id, item_id in result:
                    if (user_id, item_id) not in positive_set:
                        log_counter += 1
                        fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                        if log_counter % 300 == 0:
                            logger.debug('[train set] negtive No.%s' %
                                         (log_counter))
            logger.info('[train set] negtive set DONE, num of set = %s' %
                        (log_counter))
    finally:
        # Release the cursor even when a query or a write fails.
        cursor.close()
def filter_with_category_popularity(connect, train_user_connect, f_recommend,
                                    f_category_relationship, stoptime_str):
    """Filter recommendations by category succession and in-category popularity.

    A recommended (user, item) pair is kept when the tuple
    (user's last category before ``stoptime_str``, item's category) appears
    in the category-relationship file.  Otherwise it is kept at random,
    with probability given by ``cal_popularity_in_category`` (defined
    elsewhere in this file).  The survivors are written to a file named
    after ``f_recommend`` with ``.csv`` replaced by ``_filter.csv``.

    NOTE(review): an earlier definition of
    ``filter_with_category_popularity`` exists in this file; being later,
    this one replaces it at import time.
    NOTE(review): categories read from the relationship CSV are strings;
    if the database returns numeric categories the membership test can
    never match -- verify the column types.

    Args:
        connect: MySQLdb.connect(), database connection handle
        train_user_connect: MongoDB connection to the train_user table
        f_recommend: path of the recommendation result file (input)
        f_category_relationship: path of the category succession file (input)
        stoptime_str: string, cut-off date

    Returns:
        f_output: path of the filtered result file
    """
    import arrow
    import random

    f_output = f_recommend.replace('.csv', '_filter.csv')
    logger.debug('Start filter recommend result..')
    stoptime_timestamp = arrow.get(stoptime_str).timestamp

    # Step 1: group recommended items by user,
    # {u_id1: [i_id1, i_id2], u_id2: [i_id3, i_id4]}.
    recommend_dict = {}
    with open(f_recommend, 'r') as fin:
        fin.readline()  # skip the header row
        for line in fin:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])
    logger.debug('完成根据推荐文件生成第一步, len:%s' % (len(recommend_dict)))

    # Step 2: attach categories.  Grouping by user first means the user's
    # last category is queried once per user instead of once per pair.
    # {(u_id, i_id): (user_last_category, item_category)}
    recommend_tuple_dict = {}
    user_counter = 0
    cursor = connect.cursor()
    try:
        for u_id, i_ids in recommend_dict.items():
            # The ids come from a file, so they are passed as query
            # parameters rather than formatted into the SQL text.
            cursor.execute(
                'SELECT item_category FROM train_user WHERE user_id=%s and time<%s ORDER BY time DESC limit 1;',
                (u_id, stoptime_timestamp))
            result = cursor.fetchall()
            user_last_category = result[0][0]
            user_counter += 1
            if user_counter % 200 == 0:
                logger.debug('No.%s user, user_id=%s, last_item_category=%s'
                             % (user_counter, u_id, user_last_category))
            for i_id in i_ids:
                cursor.execute(
                    'SELECT item_category FROM train_item WHERE item_id=%s;',
                    (i_id,))
                result = cursor.fetchall()
                recommend_tuple_dict[(u_id, i_id)] = (user_last_category,
                                                      result[0][0])
    finally:
        cursor.close()
    logger.debug('原推荐结果长度:%s' % (len(recommend_tuple_dict)))

    # Load the (source_category, target_category) succession pairs.
    relationship_set = set()
    with open(f_category_relationship, 'r') as fin:
        fin.readline()  # skip the header row
        for line in fin:
            cols = line.strip().split(',')
            relationship_set.add((cols[0], cols[1]))
    logger.debug('承接关系结果长度:%s' % (len(relationship_set)))

    # Write the filtered recommendations.
    in_counter = 0
    random_counter = 0
    with open(f_output, 'w') as fout:
        fout.write('user_id,item_id\n')
        for (user_id, item_id), category_tuple in recommend_tuple_dict.items():
            if category_tuple in relationship_set:
                in_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
            elif random.random() <= cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect):
                random_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
                logger.debug('NO.%s random pick, [%s,%s]' %
                             (random_counter, user_id, item_id))

    # The original formatted this message with an empty tuple, which raises
    # TypeError; report the output path as the message intends.
    logger.info('对推荐结果的筛选完成,结果路径:%s' % (f_output))
    logger.info('筛选前%s, 筛选后%s. 其中在承接关系的有%s, 随机挑选的有%s' %(len(recommend_tuple_dict), in_counter+random_counter, in_counter, random_counter))
    return f_output
def filter_with_mongodb(train_user_connect, f_recommend_path, f_category_relation_path,
                        fout_path='%s/result/filter_output.csv' % data_path,
                        time_range=('2014-12-04', '2014-12-05')):
    """Filter recommendations using MongoDB (category succession + popularity).

    A recommended (user, item) pair is kept when the item's category is
    listed as a successor of the category of the user's most recent
    purchase inside ``time_range``.  Otherwise it is kept at random with
    probability ``1 / (1 + e ** (popularity + 1))``, where ``popularity``
    comes from ``PopularityInCategory.get_popularity_in_category`` (defined
    elsewhere in this file).  Kept pairs are written to ``fout_path``.

    NOTE(review): an earlier definition of ``filter_with_mongodb`` exists
    in this file; being later, this one replaces it at import time.

    :param train_user_connect: MongoDB connection to train_user
    :param f_recommend_path: path of the recommendation CSV
    :param f_category_relation_path: path of the reduced category-relation CSV
    :param fout_path: path the filtered result is written to
    :param time_range: (start, end) range that counts as "recently bought"
    :return: list of the kept (user_id, item_id) tuples
    """
    import pymongo
    import random
    import math

    logger.info('start filter_with_mongoDB')
    popularity_calculator = PopularityInCategory(train_user_connect,
                                                 time_range[1])

    # Recommendation list: {user_id: [item_id, ...]}
    recommend_dict = {}
    with open(f_recommend_path) as f_recommend:
        f_recommend.readline()  # skip the header row
        for line in f_recommend:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])

    # Category relations: {source_category: [target_category, ...]}
    category_dict = {}
    with open(f_category_relation_path) as f_category_relation:
        f_category_relation.readline()  # skip the header row
        for line in f_category_relation:
            cols = line.strip().split(',')
            category_dict.setdefault(cols[0], []).append(cols[1])

    logger.debug('start filtering!')
    result = []
    old_count = 0
    new_count = 0
    # Both lookups are keyed by item id only, so the caches are shared
    # across users instead of being rebuilt for every user.
    item_category_dict = {}
    item_popularity_in_category_dict = {}
    with open(fout_path, 'w') as fout:
        fout.write('user_id,item_id\n')
        for usercount, u_id in enumerate(recommend_dict, 1):
            logger.debug(usercount)
            # Most recent purchase of this user inside the time range.
            last_buy_cursor = train_user_connect.find({
                'user_id': u_id,
                'behavior_type': '4',
                'time': {
                    '$gt': time_range[0],
                    '$lt': time_range[1]
                }
            }).sort('time', pymongo.DESCENDING)
            last_buy = next(last_buy_cursor, None)

            for i_id in recommend_dict[u_id]:
                old_count += 1
                if i_id in item_category_dict:
                    temp_category = item_category_dict[i_id]
                else:
                    temp = train_user_connect.find_one({'item_id': i_id})
                    temp_category = temp['item_category']
                    item_category_dict[i_id] = temp_category

                # A last-bought category with no entry in the relation file
                # has no successors (the original raised KeyError here).
                if last_buy and temp_category in category_dict.get(
                        last_buy['item_category'], ()):
                    logger.debug('有承接关系 u_id=%s,i_id=%s' % (u_id, i_id))
                    fout.write('%s,%s\n' % (u_id, i_id))
                    result.append((u_id, i_id))
                    new_count += 1
                else:
                    # No succession link: keep the pair at random.
                    if i_id in item_popularity_in_category_dict:
                        item_popularity = item_popularity_in_category_dict[
                            i_id]
                    else:
                        item_popularity = (
                            popularity_calculator.get_popularity_in_category(
                                i_id))
                        item_popularity_in_category_dict[i_id] = (
                            item_popularity)
                    # Map the raw popularity to a keep-probability.
                    item_popularity = 1 / (1 + math.e**(item_popularity + 1))
                    logger.debug('random prob = %s' % item_popularity)
                    if random.random() <= item_popularity:
                        fout.write('%s,%s\n' % (u_id, i_id))
                        result.append((u_id, i_id))
                        new_count += 1
                if old_count % 10 == 0:  # progress sentinel
                    logger.debug('No.%s origin recommend filtered' %
                                 old_count)

    logger.info(
        'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s'
        % (old_count, new_count, fout_path))
    return result