# Evaluater and the item_based flag are assumed to be defined at module level.
from itertools import product
from time import gmtime, strftime


def main():
    testbase = 'ub'
    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')
    # lists renamed from k_val/per so the loop variables below do not shadow them
    k_vals = [5]
    percentages = [30]
    #percentages = [1.4]
    #metric = 'pr'
    metric = 'mae'
    #k_vals = [5, 9, 13, 17, 21]
    test_name = "New User-based Test on: " + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '
    for per, k in product(percentages, k_vals):
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name, rec_type=testbase, k=k,
                      test_percentage=per, eval_metric=metric)
        f.write(str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
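# For reference, a minimal sketch of what the 'mae' eval_metric above might
# compute. The pair-based signature is an assumption, not the actual
# Evaluater implementation, which presumably holds back test_percentage of
# the ratings and predicts them.
def mae(pairs):
    """Mean absolute error over (predicted, actual) rating pairs."""
    errors = [abs(predicted - actual) for predicted, actual in pairs]
    return sum(errors) / float(len(errors))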
import random

# interval_tree, scheduler, job_dict, and the start/end/range_minute trace
# bounds are assumed to be defined at module level.


def cal_loss_redundancy(resource, availability, replication, lease):
    replication_count = 0
    data_loss = 0
    data_count = 100
    data_iterated = 0
    lease_period = int(lease)
    desired_availability = float(availability)
    iteration = 0
    bad_iteration = 0
    if replication == 'random1copy' or replication == 'min1copy':
        copy_number = 1
    elif replication == 'random2copy' or replication == 'min2copy':
        copy_number = 2
    elif replication == 'random3copy' or replication == 'min3copy':
        copy_number = 3
    else:
        raise ValueError("unknown replication policy: %s" % replication)
    while iteration < 30:
        # sample a time point at least 5% away from either end of the trace
        quantile = int(range_minute * 0.05)
        time_point = random.randint(start_minute + quantile, end_minute - quantile)
        job_count = len(interval_tree[time_point])
        # evaluate sizes of data set and job set
        if job_count < data_count * 3 * copy_number:
            # print "Error : job set is less than 3 times of data set"
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
        match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict)
        if not match_job_dict:
            # print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
        for job in match_job_dict:
            replication_count += len(match_job_dict[job])
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated += data_count
        data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict)
        iteration += 1
    data_loss_rate = float(data_loss) / float(data_iterated)
    redundancy_rate = float(replication_count) / float(data_iterated)
    print "data loss rate : ", data_loss_rate
    print "redundancy : ", redundancy_rate
    print "bad iteration : ", bad_iteration
    return (data_loss_rate, redundancy_rate)
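# Hypothetical driver for cal_loss_redundancy, mirroring the positional
# command-line arguments used by the pickling script at the end of this
# section (resource, availability, replication, lease); the example values
# in the comments are illustrative only.
import sys

if __name__ == '__main__':
    loss_rate, redundancy = cal_loss_redundancy(
        sys.argv[1],   # resource, e.g. 'GLOW'
        sys.argv[2],   # availability, e.g. '0.99'
        sys.argv[3],   # replication policy, e.g. 'random2copy'
        sys.argv[4])   # lease period in minutes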
import gzip
import os
import time

# db, log, util, and the Cleaner/Grouper/Normalizer/MongoHelper/
# ActorCounter/Evaluater classes are assumed to be imported at module level.


def main(p):
    start = time.time()
    # Select the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))
    # TODO check that there are exactly 24 files (glob module);
    # a sketch follows this function
    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'rb') as f:  # binary mode for gzip
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # Clean the data
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))

            # Process the data: split into two groups
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # Handle the records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # Dropping the record's actor_attributes is all that is needed
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # Handle the records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # Push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)
            # Update the corresponding Redis counters for the new actors
            counter.count_actor_list(actors)

            # Compute the val of each record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()
            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # Insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)
            # Update today's per-user val increments in the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater
    # Generate the CSV file
    util.grcount2csv()
    end = time.time()
    log.log('total: %s s' % (end - start))
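# One possible sketch for the TODO above: verify that the directory holds a
# full day of hourly archives before processing. The helper name, the
# expected count of 24 (one archive per hour), and the boolean return
# convention are assumptions.
import glob
import os

def has_full_day(p, expected=24):
    archives = glob.glob(os.path.join(p, '*json.gz'))
    return len(archives) == expected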
#k = [5]
#percentage = [10]
# datafiles, k, metrics, r_method, n, getXy, normalize, KNeighborClassifier
# and Evaluater are assumed to be defined above; the loop variables are
# renamed so they do not shadow the k and n lists.
prod = product(datafiles, k, metrics, r_method, n)
for filename, k_i, metric, r_m, do_norm in prod:
    if filename == 'datahouse':
        X, y = getXy(filename)
    else:
        X, y = getXy(filename, delimiter=',')
    if do_norm:
        normalize(X)
        print 'data is normalized'
    else:
        print 'data is not normalized'
    reg = KNeighborClassifier(X, y, k_i, metric, r_method=r_m)
    e = Evaluater(reg, test_percentage=30)
    print '\t', e.evaluate()
    print '\n'
#reg = KNeighborClassifier(X, y, m, sim_cosine, r_method="uniform")
#e = Evaluater(reg, test_percentage=l)
#print 'Test: k=%s, test/all_data=%s' % (m, l)
#print "\t", e.evaluate()
#del reg, e
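# A sketch of what the normalize() call above might do: in-place min-max
# scaling of each feature column to [0, 1]. The real helper could equally be
# z-score based; this implementation is an assumption.
def normalize(X):
    n_features = len(X[0])
    for j in range(n_features):
        column = [row[j] for row in X]
        lo, hi = min(column), max(column)
        span = (hi - lo) or 1.0  # guard against constant columns
        for row in X:
            row[j] = (row[j] - lo) / float(span)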
# Excerpt: the body of the sampling loop (compare cal_loss_redundancy above),
# followed by the per-run bookkeeping and the final pickle dump; sys and
# pickle are assumed to be imported at module level.
    availability_dict = dict()
    for data_index in range(data_count):
        data_name = "data" + str(data_index)
        availability_dict[data_name] = desired_availability
    # print availability_dict
    match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict)
    if not match_job_dict:
        print "Error : match_job_dict is none"
        bad_iteration += 1
        continue
    # for job in match_job_dict:
    #     print job, match_job_dict[job]
    evaluater = Evaluater(interval_tree, job_dict)
    data_iterated += data_count
    data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict)
    iteration += 1
    print "data_loss : ", data_loss
    print "data_iterated : ", data_iterated
    print "iteration, bad_iteration : ", iteration, bad_iteration

loss_rate_list.append(float(data_loss) / float(data_iterated))
print loss_rate_list

resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'}
avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'}
file_name = (resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[2]] +
             '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt')
print file_name
with open(file_name, 'wb') as fp:
    pickle.dump(loss_rate_list, fp)
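# Reading one of the dumped result files back, e.g. for plotting. The file
# name shown is illustrative; it follows the naming pattern built above.
import pickle

with open('glow_avail_099_replication_random2copy_lease_60.txt', 'rb') as fp:
    loss_rate_list = pickle.load(fp)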