Example #1
def main():
    testbase = 'ub'

    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')

    k_val = [5]
    per = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'

    #k_val = [5, 9, 13, 17, 21]

    test_name = "New User-based Test on: " + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '
    iterate = product(per, k_val)
    for per, k in iterate:
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name,
                      rec_type=testbase,
                      k=k,
                      test_percentage=per,
                      eval_metric=metric)
        f.write(
            str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
Example #2
    def iterate_epoch(self,
                      model,
                      lr,
                      epoch,
                      weight_decay=0,
                      warmup=0,
                      lr_decay_rate=1,
                      lr_decay_every=10,
                      eval_every=5,
                      early_stop=False):
        eval_model = Evaluater(self.data_dir, model_name=self.model_name)
        #es = EarlyStop(self.data_dir[0:-6] + 'early_stopping/', self.model_name, patience=6)
        es = EarlyStop('../data_beauty_2core_es/early_stopping/',
                       self.model_name,
                       patience=6)
        plot_loss_list = []
        plot_score_list = []

        for i in range(epoch):
            plot_loss_list.extend(
                self.iterate_train(model,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   print_every=10000))

            # early stop
            if early_stop:
                pre_model = es.early_stop(model)
                if pre_model:
                    print('Early Stop epoch: {}'.format(i + 1))
                    return eval_model.topn_map(pre_model)

            # lr scheduling
            if i > warmup:
                if (i - warmup) % lr_decay_every == 0:
                    lr = lr * lr_decay_rate

            if (i + 1) % eval_every == 0:
                #score = eval_model.topn_precision(model)
                #print('epoch: {}  precision: {}'.format(i, score))
                score = eval_model.topn_map(model)
                print('epoch: {}  map: {}'.format(i, score))
                plot_score_list.append(score)

        #self._plot(plot_loss_list)
        #self._plot(plot_score_list)

        #return eval_model.topn_precision(model)
        return eval_model.topn_map(model)
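
A hedged usage sketch of the method above, assuming a trainer object that exposes iterate_epoch and an already constructed model (the names and hyperparameter values below are illustrative, not taken from the source):

# Hypothetical call; `trainer` and `model` come from the surrounding project.
best_map = trainer.iterate_epoch(model,
                                 lr=0.005,
                                 epoch=50,
                                 warmup=5,
                                 lr_decay_rate=0.5,
                                 lr_decay_every=10,
                                 eval_every=5,
                                 early_stop=True)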
Example #3
def cal_loss_redundancy(resource, availability, replication, lease):
    replication_count = 0
    data_loss = 0
    data_count = 100
    data_iterated = 0
    lease_period = int(lease)
    desired_availability = float(availability)
    iteration = 0
    bad_iteration = 0
    if replication == 'random1copy' or replication == 'min1copy':
        copy_number = 1
    elif replication == 'random2copy' or replication == 'min2copy':
        copy_number = 2
    elif replication == 'random3copy' or replication == 'min3copy':
        copy_number = 3
    while iteration < 30:
        quantile = int(range_minute * 0.05)
        time_point = random.randint(start_minute + quantile,
                                    end_minute - quantile)
        job_count = len(interval_tree[time_point])
        # evaluate sizes of data set and job set
        if job_count < data_count * 3 * copy_number:
            #			print "Error : job set is less than 3 times of data set"
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            #			print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
        for job in match_job_dict:
            replication_count += len(match_job_dict[job])
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    data_loss_rate = float(data_loss) / float(data_iterated)
    redundancy_rate = float(replication_count) / float(data_iterated)
    print "data loss rate : ", data_loss_rate
    print "redundancy : ", redundancy_rate
    print "bad iteration : ", bad_iteration
    return (data_loss_rate, redundancy_rate)
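
A minimal driver sketch for the function above, assuming the same command-line convention visible in Example #10 (resource, availability, replication scheme, lease); the invocation itself is an assumption, not shown in the source:

import sys

# Hypothetical entry point: argv = [script, resource, availability, replication, lease]
loss_rate, redundancy = cal_loss_redundancy(sys.argv[1], sys.argv[2],
                                            sys.argv[3], sys.argv[4])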
Example #4
    def __init__(self, data_dir):

        # It would actually be faster to pass in an AmazonDataset instance here, but...
        self.evaluater = Evaluater(data_dir)
        self.dataset = AmazonDataset(data_dir, model_name='TransE')
        edges = [[r[0], r[1]] for r in self.dataset.triplet_df.values]
        # add both user-item and item-user edges
        for r in self.dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            [i for i in range(len(self.dataset.entity_list))])
        self.G.add_edges_from(edges)
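
The constructor above mirrors only relation-0 (user-item) edges, so the graph can be traversed from users to items and back while other relations stay one-directional. A self-contained sketch of the same edge-mirroring pattern on toy triplets (the numbers are illustrative):

import networkx as nx

# toy (head, tail, relation) triplets; relation 0 is assumed to be user-item
triplets = [(0, 2, 0), (1, 3, 0), (2, 4, 1)]
edges = [[h, t] for h, t, _ in triplets]
edges += [[t, h] for h, t, r in triplets if r == 0]  # mirror only user-item edges

G = nx.DiGraph()
G.add_nodes_from(range(5))
G.add_edges_from(edges)
print(G.number_of_edges())  # 5: three forward edges plus two mirrored ones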
Example #5
def objective(trial):
    start = time.time()
    # load hyperparameters
    # gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    # lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)

    data_dirs = [
        '../' + data_path + '/valid1/', '../' + data_path + '/valid2/'
    ]

    score_sum = 0
    for data_dir in data_dirs:
        # dataload
        dataset = AmazonDataset(data_dir)

        # load model
        #slim = train_SLIM(data_dir, load=True)
        sim_mat = load_sim_mat('sim_mat' + data_dir[-2] + '.csr',
                               len(dataset.user_list), len(dataset.item_list))

        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user-item and item-user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        evaluater = Evaluater(data_dir)
        #ranking_mat = get_ranking_mat(G, slim, alpha, beta, dataset)
        ranking_mat = get_ranking_mat(G, sim_mat, alpha, beta, dataset)
        #score = evaluater.topn_map(ranking_mat)
        score = evaluater.topn_precision(ranking_mat)

        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))

    return -1 * score_sum / 2
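
Because the objective returns the negated average score, it is meant to be minimized. A minimal sketch of how such an objective is typically driven with Optuna (the number of trials is illustrative):

import optuna

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print(study.best_params)  # e.g. {'alpha': ..., 'beta': ...}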
Example #6
def objective(trial):
    start = time.time()
    # hyper parameter
    #gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    #slim = train_SLIM(lin_model, gamma)
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]

    data_dir = ['../data_luxury_5core/valid1', '../data_luxury_5core/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # dataload
        dataset = AmazonDataset(data_dir[i], model_name='TransE')
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user-item and item-user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])
        #user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    return -1 * score_sum / 2
Example #7
def objective(trial):
    start = time.time()
    # hyper parameter
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]
    
    data_dir = ['../' + data_path + '/valid1', '../' + data_path + '/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # dataload
        dataset = AmazonDataset(data_dir[i], model_name='SparseTransE')

        # load network
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user-item and item-user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))
    
    return -1 * score_sum / 2
Example #8
    # load param
    params = load_params()
    alpha = params['alpha']
    beta = params['beta']
    gamma1 = params['gamma1']
    gamma2 = params['gamma2']
    gamma3 = params['gamma3']
    gamma = [gamma1, gamma2, gamma3]

    # dataload
    dataset = AmazonDataset(data_dir, model_name='TransE')

    # load network
    edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
    # add both user-item and item-user edges
    for r in dataset.triplet_df.values:
        if r[2] == 0:
            edges.append([r[1], r[0]])

    G = nx.DiGraph()
    G.add_nodes_from([i for i in range(len(dataset.entity_list))])
    G.add_edges_from(edges)

    ranking_mat = get_ranking_mat(G, dataset, model, gamma, alpha, beta)
    evaluater = Evaluater(data_dir)
    score = evaluater.topn_map(ranking_mat)

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    np.savetxt('score_transe3.txt', np.array([score]))
Example #9
def main(p):
    start = time.time()

    # select the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: add a check that there are 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'rb') as f:  # gzip needs the file opened in binary mode
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # data processing

            # grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # process the records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # just delete each record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # process the records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # write today's newly added actors (held locally) to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # for the new actors, update the corresponding counts in Redis
            counter.count_actor_list(actors)

            # compute the val of each record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # update the database with today's newly added val for each user
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
Example #10
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
#		print availability_dict
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
#		for job in match_job_dict:
#			print job, match_job_dict[job]
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    print "data_loss : ", data_loss
    print "data_iterated : ", data_iterated
    print "iteration, bad_iteration : ", iteration, bad_iteration
    loss_rate_list.append(float(data_loss) / float(data_iterated))
    print loss_rate_list
print loss_rate_list

resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'}
avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'}
file_name = (resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[2]] +
             '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt')