def main():
    """Run the recommender evaluation grid and append results to ../testres.txt.

    For every (test_percentage, k) combination, evaluates `dataset_name`
    with `Evaluater` and writes one timestamped result line per combination.
    The recommender type is user-based ('ub') unless the module-level
    `item_based` flag is set, in which case it is item-based ('ib').
    """
    testbase = 'ub'
    dataset_name = "u1m.data"
    # Renamed from k_val/per so the loop variables below no longer shadow
    # the parameter lists (the original rebound `per` inside the loop).
    k_values = [5]
    percentages = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'
    #k_val = [5 9, 13, 17, 21]
    test_name = "New User-based Test on: " + dataset_name + ' '
    # `item_based` is a module-level flag -- presumably set elsewhere in the
    # file; TODO confirm it is defined before main() runs.
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '
    # `with` guarantees the log file is closed even if Evaluater raises;
    # the original called f.close() only on the success path.
    with open('../testres.txt', 'a+') as f:
        for per, k in product(percentages, k_values):
            f.write('\n')
            timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
            f.write(test_name + timestamp + ' --> ')
            e = Evaluater(dataset_name, rec_type=testbase, k=k,
                          test_percentage=per, eval_metric=metric)
            # NOTE(review): func_name is a Python 2 function attribute
            # (__name__ in Python 3) -- left unchanged for compatibility.
            f.write(str([per, k, e.eval_metric, e.sim_method.func_name]) +
                    ' Error: ')
            f.write(str(e.evaluate()))
def iterate_epoch(self, model, lr, epoch, weight_decay=0, warmup=0,
                  lr_decay_rate=1, lr_decay_every=10, eval_every=5,
                  early_stop=False):
    """Train `model` for up to `epoch` epochs and return its final top-n MAP.

    Args:
        model: the model passed through to self.iterate_train / Evaluater.
        lr: initial learning rate; decayed by `lr_decay_rate` every
            `lr_decay_every` epochs once `warmup` epochs have passed.
        epoch: maximum number of epochs.
        weight_decay: L2 regularization forwarded to iterate_train.
        warmup: number of epochs before lr decay starts.
        eval_every: evaluate (top-n MAP) every this many epochs.
        early_stop: if True, consult EarlyStop each epoch and return the
            restored model's MAP as soon as it triggers.

    Returns:
        Top-n MAP score of the final (or early-stopped) model.
    """
    eval_model = Evaluater(self.data_dir, model_name=self.model_name)
    #es = EarlyStop(self.data_dir[0:-6] + 'early_stopping/', self.model_name, patience=6)
    # NOTE(review): early-stopping checkpoint dir is hard-coded; presumably
    # intentional for this dataset -- confirm before reusing elsewhere.
    es = EarlyStop('../data_beauty_2core_es/early_stopping/',
                   self.model_name, patience=6)
    plot_loss_list = []
    plot_score_list = []
    for i in range(epoch):
        plot_loss_list.extend(
            self.iterate_train(model, lr=lr, weight_decay=weight_decay,
                               print_every=10000))
        # early stop
        if early_stop:
            pre_model = es.early_stop(model)
            if pre_model:
                # Fixed message typo: was 'Early Stop eposh'.
                print('Early Stop epoch: {}'.format(i + 1))
                return eval_model.topn_map(pre_model)
        # lr scheduling (merged the nested ifs; same condition)
        if i > warmup and (i - warmup) % lr_decay_every == 0:
            lr = lr * lr_decay_rate
        if (i + 1) % eval_every == 0:
            #score = eval_model.topn_precision(model)
            #print('epoch: {} precision: {}'.format(i, score))
            score = eval_model.topn_map(model)
            print('epoch: {} map: {}'.format(i, score))
            plot_score_list.append(score)
    #self._plot(plot_loss_list)
    #self._plot(plot_score_list)
    #return eval_model.topn_precision(model)
    return eval_model.topn_map(model)
def cal_loss_redundancy(resource, availability, replication, lease): replication_count = 0 data_loss = 0 data_count = 100 data_iterated = 0 lease_period = int(lease) desired_availability = float(availability) iteration = 0 bad_iteration = 0 if replication == 'random1copy' or replication == 'min1copy': copy_number = 1 elif replication == 'random2copy' or replication == 'min2copy': copy_number = 2 elif replication == 'random3copy' or replication == 'min3copy': copy_number = 3 while iteration < 30: quantile = int(range_minute * 0.05) time_point = random.randint(start_minute + quantile, end_minute - quantile) job_count = len(interval_tree[time_point]) # evaluate sizes of data set and job set if job_count < data_count * 3 * copy_number: # print "Error : job set is less than 3 times of data set" bad_iteration += 1 continue availability_dict = dict() for data_index in range(data_count): data_name = "data" + str(data_index) availability_dict[data_name] = desired_availability match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict) if not match_job_dict: # print "Error : match_job_dict is none" bad_iteration += 1 continue for job in match_job_dict: replication_count += len(match_job_dict[job]) evaluater = Evaluater(interval_tree, job_dict) data_iterated = data_iterated + data_count data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict) iteration += 1 data_loss_rate = float(data_loss) / float(data_iterated) redundancy_rate = float(replication_count) / float(data_iterated) print "data loss rate : ", data_loss_rate print "redundancy : ", redundancy_rate print "bad iteration : ", bad_iteration return (data_loss_rate, redundancy_rate)
def __init__(self, data_dir):
    """Build the evaluation graph for the dataset under `data_dir`.

    Loads the dataset and its triplets, then constructs a directed graph
    over all entities where every (head, tail) triplet becomes an edge and
    user-item relations (relation id 0) additionally get the reverse
    item-user edge.
    """
    # NOTE: passing an AmazonDataset instance directly would be faster,
    # but we construct one from the directory path here.
    self.evaluater = Evaluater(data_dir)
    self.dataset = AmazonDataset(data_dir, model_name='TransE')
    triplets = self.dataset.triplet_df.values
    edge_list = [[row[0], row[1]] for row in triplets]
    # Relation 0 is user-item: add the reverse edge as well.
    edge_list.extend([row[1], row[0]] for row in triplets if row[2] == 0)
    graph = nx.DiGraph()
    graph.add_nodes_from(range(len(self.dataset.entity_list)))
    graph.add_edges_from(edge_list)
    self.G = graph
def objective(trial):
    """Optuna objective: mean top-n precision over the two validation folds.

    Samples alpha/beta, builds the entity graph and similarity matrix for
    each fold, ranks items, and returns the negated average precision
    (Optuna minimizes).
    """
    start = time.time()
    # hyperparameter sampling
    # gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    # lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    fold_dirs = ['../' + data_path + '/valid1/',
                 '../' + data_path + '/valid2/']
    total = 0
    for fold_dir in fold_dirs:
        dataset = AmazonDataset(fold_dir)
        # load model
        #slim = train_SLIM(fold_dir, load=True)
        # fold_dir[-2] is the fold digit ('1' or '2') before the trailing '/'.
        sim_mat = load_sim_mat('sim_mat' + fold_dir[-2] + '.csr',
                               len(dataset.user_list),
                               len(dataset.item_list))
        triplets = dataset.triplet_df.values
        edge_list = [[row[0], row[1]] for row in triplets]
        # Relation 0 is user-item: add the reverse item-user edge too.
        edge_list.extend([row[1], row[0]] for row in triplets if row[2] == 0)
        graph = nx.DiGraph()
        graph.add_nodes_from(range(len(dataset.entity_list)))
        graph.add_edges_from(edge_list)
        evaluater = Evaluater(fold_dir)
        #ranking_mat = get_ranking_mat(graph, slim, alpha, beta, dataset)
        ranking_mat = get_ranking_mat(graph, sim_mat, alpha, beta, dataset)
        #score = evaluater.topn_map(ranking_mat)
        total += evaluater.topn_precision(ranking_mat)
    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))
    return -1 * total / 2
def objective(trial):
    """Optuna objective: mean top-n MAP over the two luxury-5core folds.

    Samples alpha, beta and three gamma weights, builds the entity graph
    per fold, ranks with the pre-trained TransE model for that fold, and
    returns the negated average MAP (Optuna minimizes).
    """
    start = time.time()
    # hyper parameter
    #gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    #slim = train_SLIM(lin_model, gamma)
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    # Three gamma components sampled in the same order as before.
    gamma = [trial.suggest_uniform(name, 0, 1)
             for name in ('gamma1', 'gamma2', 'gamma3')]
    fold_dirs = ['../data_luxury_5core/valid1', '../data_luxury_5core/valid2']
    total = 0
    for fold, fold_dir in enumerate(fold_dirs):
        dataset = AmazonDataset(fold_dir, model_name='TransE')
        triplets = dataset.triplet_df.values
        edge_list = [[row[0], row[1]] for row in triplets]
        # Relation 0 is user-item: add the reverse item-user edge too.
        edge_list.extend([row[1], row[0]] for row in triplets if row[2] == 0)
        #user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))
        graph = nx.DiGraph()
        graph.add_nodes_from(range(len(dataset.entity_list)))
        graph.add_edges_from(edge_list)
        # `model` is a module-level list with one trained model per fold.
        ranking_mat = get_ranking_mat(graph, dataset, model[fold],
                                      gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(fold_dir)
        total += evaluater.topn_map(ranking_mat)
    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))
    return -1 * total / 2
def objective(trial):
    """Optuna objective: mean top-n MAP over the two validation folds
    using the per-fold SparseTransE models.

    Returns the negated average MAP so that Optuna's minimization
    maximizes MAP.
    """
    start = time.time()
    # hyper parameter
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]
    fold_dirs = ['../' + data_path + '/valid1', '../' + data_path + '/valid2']
    total = 0
    for fold, fold_dir in enumerate(fold_dirs):
        # dataload
        dataset = AmazonDataset(fold_dir, model_name='SparseTransE')
        # Build the entity graph: every triplet is a head->tail edge, and
        # user-item relations (id 0) also get the reverse edge.
        triplets = dataset.triplet_df.values
        edge_list = [[row[0], row[1]] for row in triplets]
        edge_list.extend([row[1], row[0]] for row in triplets if row[2] == 0)
        graph = nx.DiGraph()
        graph.add_nodes_from(range(len(dataset.entity_list)))
        graph.add_edges_from(edge_list)
        # `model` is a module-level list with one trained model per fold.
        ranking_mat = get_ranking_mat(graph, dataset, model[fold],
                                      gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(fold_dir)
        total += evaluater.topn_map(ranking_mat)
    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))
    return -1 * total / 2
# load param params = load_params() alpha = params['alpha'] beta = params['beta'] gamma1 = params['gamma1'] gamma2 = params['gamma2'] gamma3 = params['gamma3'] gamma = [gamma1, gamma2, gamma3] # dataload dataset = AmazonDataset(data_dir, model_name='TransE') # load network edges = [[r[0], r[1]] for r in dataset.triplet_df.values] # user-itemとitem-userどちらの辺も追加 for r in dataset.triplet_df.values: if r[2] == 0: edges.append([r[1], r[0]]) G = nx.DiGraph() G.add_nodes_from([i for i in range(len(dataset.entity_list))]) G.add_edges_from(edges) ranking_mat = get_ranking_mat(G, dataset, model, gamma, alpha, beta) evaluater = Evaluater(data_dir) score = evaluater.topn_map(ranking_mat) mi, sec = time_since(time.time() - start) print('{}m{}sec'.format(mi, sec)) np.savetxt('score_transe3.txt', np.array([score]))
def main(p):
    """Process every gzipped JSON archive under directory *p*.

    For each '*.json.gz' file: clean the raw records, split them into
    records whose actor already exists vs. records with new actors,
    normalize the new-actor records, persist actors/records/val totals to
    MongoDB, and update the Redis actor counters. Finally emits a CSV
    summary and logs the total elapsed time.
    """
    start = time.time()
    # Select files whose name ends with 'json.gz'.
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))
    # TODO: check that there are exactly 24 files (glob module)
    for file_name in file_name_list:
        # Open in binary mode: GzipFile reads raw bytes from fileobj, and
        # text mode breaks on Python 3 (and corrupts on Windows Python 2).
        with open(os.path.join(p, file_name), 'rb') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # --- data cleaning ---
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))

            # --- grouping: group 1 = actor already exists, group 2 = new ---
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # Records whose actor already exists: just drop the actor attrs.
            log.log('Begin processing actor-exist records...')
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # Records with new actors: normalize them.
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # Push today's newly-seen actors to the database.
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # Update the Redis counters for the new actors.
            counter.count_actor_list(actors)

            # Compute the val of every record (both groups).
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()
            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # Insert the records into the database.
            # NOTE(review): 'insert_new_reocrds' typo is in the MongoHelper
            # API itself; left as-is so the call keeps working.
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # Update today's per-user val totals.
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # Generate the CSV summary.
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
bad_iteration += 1 continue availability_dict = dict() for data_index in range(data_count): data_name = "data" + str(data_index) availability_dict[data_name] = desired_availability # print availability_dict match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict) if not match_job_dict: print "Error : match_job_dict is none" bad_iteration += 1 continue # for job in match_job_dict: # print job, match_job_dict[job] evaluater = Evaluater(interval_tree, job_dict) data_iterated = data_iterated + data_count data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict) iteration += 1 print "data_loss : ", data_loss print "data_iterated : ", data_iterated print "iteration, bad_iteration : ", iteration, bad_iteration loss_rate_list.append(float(data_loss) / float(data_iterated)) print loss_rate_list print loss_rate_list resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'} avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'} file_name = resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[ 2]] + '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt'