Example #1
def main():
    # Needs at module level (not shown in this snippet):
    #   from itertools import product
    #   from time import strftime, gmtime
    testbase = 'ub'

    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')
    
    k_val = [5]
    per = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'

    #k_val = [5, 9, 13, 17, 21]

    test_name = "New User-based Test on: "  + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name  = "New Item-based test on: " + dataset_name + ' '
    iterate = product(per, k_val)
    for per, k in iterate:
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name, rec_type=testbase, k=k, test_percentage=per,
                      eval_metric=metric)
        f.write(str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
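The call above is the only place this page shows how this recommender-style Evaluater is constructed, so the stub below records the interface it implies. It is a minimal sketch reconstructed from the snippet, not the project's real class; anything beyond the constructor keywords, eval_metric, sim_method and evaluate() is an assumption.

# Sketch only -- inferred from the calls in the example above, not the real implementation.
class Evaluater(object):
    def __init__(self, dataset_name, rec_type='ub', k=5,
                 test_percentage=30, eval_metric='mae'):
        self.eval_metric = eval_metric   # 'mae' or 'pr' in the snippet
        self.sim_method = None           # some similarity function; the snippet reads its func_name
        # ... load dataset_name and build the user-based ('ub') or item-based ('ib') model

    def evaluate(self):
        # Hold out test_percentage percent of the ratings, predict them with the
        # k-neighbour model and return the error under eval_metric.
        raise NotImplementedError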
Example #2
def cal_loss_redundancy(resource, availability, replication, lease):
    # Relies on module-level state from the original script: random, scheduler,
    # interval_tree, job_dict, start_minute, end_minute and range_minute.
    replication_count = 0
    data_loss = 0
    data_count = 100
    data_iterated = 0
    lease_period = int(lease)
    desired_availability = float(availability)
    iteration = 0
    bad_iteration = 0
    if replication == 'random1copy' or replication == 'min1copy':
        copy_number = 1
    elif replication == 'random2copy' or replication == 'min2copy':
        copy_number = 2
    elif replication == 'random3copy' or replication == 'min3copy':
        copy_number = 3
    else:
        raise ValueError('unknown replication policy: %s' % replication)
    while iteration < 30:
        quantile = int(range_minute * 0.05)
        time_point = random.randint(start_minute + quantile,
                                    end_minute - quantile)
        job_count = len(interval_tree[time_point])
        # evaluate sizes of data set and job set
        if job_count < data_count * 3 * copy_number:
            #			print "Error : job set is less than 3 times of data set"
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            #			print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
        for job in match_job_dict:
            replication_count += len(match_job_dict[job])
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    data_loss_rate = float(data_loss) / float(data_iterated)
    redundancy_rate = float(replication_count) / float(data_iterated)
    print "data loss rate : ", data_loss_rate
    print "redundancy : ", redundancy_rate
    print "bad iteration : ", bad_iteration
    return (data_loss_rate, redundancy_rate)
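Here Evaluater is a different class from the one in the first example: it is built from an interval tree of job lifetimes and a job dictionary, and evaluate() receives the lease-expiry time together with the data-to-job assignment and returns how many data items were lost. The stub below only records that inferred contract; the loss-counting logic is not shown on this page, so it is left unimplemented.

# Stub of the scheduling-oriented Evaluater used above; internals are assumptions.
class Evaluater(object):
    def __init__(self, interval_tree, job_dict):
        self.interval_tree = interval_tree   # jobs alive at a given minute, indexed by time point
        self.job_dict = job_dict             # per-job metadata

    def evaluate(self, end_time, match_job_dict):
        # match_job_dict maps each data item to the jobs holding its replicas.
        # Return the number of data items whose replica-hosting jobs all end
        # before end_time (the value added to data_loss above).
        raise NotImplementedError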
Example #3
def main(p):
    # Relies on module-level objects from the original script (db, log, util and the
    # helper classes used below) and on imports of os, time and gzip.
    start = time.time()

    # Keep only the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: add a check that there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # Data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # Data processing

            # Grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # Process records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # Only the record's actor_attributes needs to be deleted
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # Process records whose actor is new
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # Push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # Update the corresponding Redis counters for the new actors
            counter.count_actor_list(actors)

            # Compute each record's val
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # Insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # Push today's per-user val increments to the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # Generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
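In this pipeline Evaluater follows a set-then-get pattern: set_records() loads a batch, evaluate() computes a 'val' score for each record, and get_val_cache() returns the accumulated results that are later written to MongoDB. The stub below only captures that contract; the scoring itself is not shown anywhere on this page.

# Stub of the stateful interface these calls imply; evaluate() is left unimplemented
# because the actual 'val' computation is not visible in the snippet.
class Evaluater(object):
    def __init__(self):
        self._records = None
        self._val_cache = None

    def set_records(self, records):
        self._records = records

    def evaluate(self):
        # Compute a 'val' score for every record in self._records and store the
        # results so that get_val_cache() can return them.
        raise NotImplementedError

    def get_val_cache(self):
        return self._val_cache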
Example #4
#k = [5]
#percentage = [10]
# datafiles, k, metrics, r_method and n are lists defined earlier in the original script (not shown here)
prod = product(datafiles, k, metrics, r_method, n)
for filename, k, metric, r_m, n in prod:
    if filename == 'datahouse':
        X, y = getXy(filename)
    else:
        X, y = getXy(filename, delimiter=',')
    if n:
        normalize(X)
        print 'data is normalized'
    else:
        print 'data is not normalized'
    reg = KNeighborClassifier(X, y, k, metric, r_method=r_m)
    e = Evaluater(reg, test_percentage=30)
    print '\t', e.evaluate()
    print '\n'
    #reg = KNeighborClassifier(X, y, m, sim_cosine, r_method="uniform")
    #e = Evaluater(reg, test_percentage = l)
    #print 'Test: k=%s, test/all_data=%s' % (m, l)
    #print "\t", e.evaluate()
    #del reg, e
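In this example Evaluater wraps an already-built KNeighborClassifier rather than a dataset file name, and only the constructor arguments and evaluate() are visible. A minimal sketch of that wrapper, not the real implementation:

# Sketch inferred from the two lines above; the hold-out logic is assumed, not shown.
class Evaluater(object):
    def __init__(self, regressor, test_percentage=30):
        self.regressor = regressor               # e.g. a KNeighborClassifier
        self.test_percentage = test_percentage   # share of the data held out for testing

    def evaluate(self):
        # Score self.regressor on the held-out portion of the data and
        # return the resulting error figure.
        raise NotImplementedError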
Example #5
        # (Fragment: this snippet starts inside the sampling loop of a script like the
        # cal_loss_redundancy example above, then falls through to module-level code.)
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
#		print availability_dict
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
#		for job in match_job_dict:
#			print job, match_job_dict[job]
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    print "data_loss : ", data_loss
    print "data_iterated : ", data_iterated
    print "iteration, bad_iteration : ", iteration, bad_iteration
    loss_rate_list.append(float(data_loss) / float(data_iterated))
    print loss_rate_list
print loss_rate_list

resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'}
avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'}
file_name = (resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[2]] +
             '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt')
print file_name
with open(file_name, 'wb') as fp:
    pickle.dump(loss_rate_list, fp)
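Reading the results back later mirrors the dump above. A short sketch, assuming the same file_name the script just wrote:

# Sketch: reload the pickled loss-rate list for later analysis.
import pickle

with open(file_name, 'rb') as fp:
    loss_rate_list = pickle.load(fp)
print loss_rate_list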