Example #1
def main():
    testbase = 'ub'

    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')

    k_val = [5]
    per = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'

    #k_val = [5, 9, 13, 17, 21]

    test_name = "New User-based Test on: " + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '
    iterate = product(per, k_val)
    for per, k in iterate:
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name,
                      rec_type=testbase,
                      k=k,
                      test_percentage=per,
                      eval_metric=metric)
        f.write(
            str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
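
Examples #1 and #2 use several names that are defined elsewhere in their source repository (`Evaluater`, the `item_based` flag) plus two standard-library helpers. A minimal preamble sketch; only the two standard-library imports are certain, while the `Evaluater` import path and the flag's default are assumptions:

from itertools import product            # grid over (test_percentage, k)
from time import gmtime, strftime        # timestamp for the log line

# from evaluater import Evaluater        # assumed local import; real path unknown
item_based = False                       # module-level flag the snippet checks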
Example #2
def main():
    testbase = 'ub'
    
    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')
    
    k_val = [5]
    per = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'

    #k_val = [5, 9, 13, 17, 21]

    test_name = "New User-based Test on: " + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '
    iterate = product(per, k_val)
    for per, k in iterate:
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name, rec_type=testbase, k=k, test_percentage=per,
                      eval_metric=metric)
        f.write(str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
Example #3
    def iterate_epoch(self,
                      model,
                      lr,
                      epoch,
                      weight_decay=0,
                      warmup=0,
                      lr_decay_rate=1,
                      lr_decay_every=10,
                      eval_every=5,
                      early_stop=False):
        eval_model = Evaluater(self.data_dir, model_name=self.model_name)
        #es = EarlyStop(self.data_dir[0:-6] + 'early_stopping/', self.model_name, patience=6)
        es = EarlyStop('../data_beauty_2core_es/early_stopping/',
                       self.model_name,
                       patience=6)
        plot_loss_list = []
        plot_score_list = []

        for i in range(epoch):
            plot_loss_list.extend(
                self.iterate_train(model,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   print_every=10000))

            # early stop
            if early_stop:
                pre_model = es.early_stop(model)
                if pre_model:
                    print('Early Stop epoch: {}'.format(i + 1))
                    return eval_model.topn_map(pre_model)

            # learning-rate scheduling
            if i > warmup:
                if (i - warmup) % lr_decay_every == 0:
                    lr = lr * lr_decay_rate

            if (i + 1) % eval_every == 0:
                #score = eval_model.topn_precision(model)
                #print('epoch: {}  precision: {}'.format(i, score))
                score = eval_model.topn_map(model)
                print('epoch: {}  map: {}'.format(i, score))
                plot_score_list.append(score)

        #self._plot(plot_loss_list)
        #self._plot(plot_score_list)

        #return eval_model.topn_precision(model)
        return eval_model.topn_map(model)
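
The schedule inside `iterate_epoch` is a plain step decay: after `warmup` epochs, `lr` is multiplied by `lr_decay_rate` once every `lr_decay_every` epochs. A standalone sketch of the same rule; the helper name and the sample numbers are illustrative, not from the original code:

def stepped_lr(base_lr, epoch, warmup=0, lr_decay_rate=1.0, lr_decay_every=10):
    # lr in effect once epoch `epoch` has applied its decay step
    if epoch <= warmup:
        return base_lr
    return base_lr * lr_decay_rate ** ((epoch - warmup) // lr_decay_every)

# e.g. base_lr=0.01, warmup=5, lr_decay_rate=0.5, lr_decay_every=10:
# the rate stays at 0.01 until epoch 15 halves it to 0.005, epoch 25 to 0.0025, ...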
Example #4
def cal_loss_redundancy(resource, availability, replication, lease):
    replication_count = 0
    data_loss = 0
    data_count = 100
    data_iterated = 0
    lease_period = int(lease)
    desired_availability = float(availability)
    iteration = 0
    bad_iteration = 0
    if replication == 'random1copy' or replication == 'min1copy':
        copy_number = 1
    elif replication == 'random2copy' or replication == 'min2copy':
        copy_number = 2
    elif replication == 'random3copy' or replication == 'min3copy':
        copy_number = 3
    while iteration < 30:
        quantile = int(range_minute * 0.05)
        time_point = random.randint(start_minute + quantile,
                                    end_minute - quantile)
        job_count = len(interval_tree[time_point])
        # evaluate sizes of data set and job set
        if job_count < data_count * 3 * copy_number:
            #			print "Error : job set is less than 3 times of data set"
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            #			print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
        for job in match_job_dict:
            replication_count += len(match_job_dict[job])
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    data_loss_rate = float(data_loss) / float(data_iterated)
    redundancy_rate = float(replication_count) / float(data_iterated)
    print "data loss rate : ", data_loss_rate
    print "redundancy : ", redundancy_rate
    print "bad iteration : ", bad_iteration
    return (data_loss_rate, redundancy_rate)
Example #5
    def __init__(self, data_dir):

        # it would actually be faster to pass an AmazonDataset instance here, but
        self.evaluater = Evaluater(data_dir)
        self.dataset = AmazonDataset(data_dir, model_name='TransE')
        edges = [[r[0], r[1]] for r in self.dataset.triplet_df.values]
        # add both the user->item and item->user edges
        for r in self.dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            [i for i in range(len(self.dataset.entity_list))])
        self.G.add_edges_from(edges)
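
Examples #5, #6, #7, #8, #9, #12, and #13 all build the graph the same way: each (head, tail, relation) row of `triplet_df` becomes one directed edge, and user-item rows (relation 0) additionally get the reverse edge. A tiny self-contained illustration with made-up triples; the column layout is assumed from how `r[0]`, `r[1]`, `r[2]` are used above:

import networkx as nx
import pandas as pd

# Made-up triples: (head, tail, relation); relation 0 means user-item.
triplet_df = pd.DataFrame([[0, 2, 0],   # user 0 interacted with item 2
                           [1, 2, 0],   # user 1 interacted with item 2
                           [2, 3, 1]],  # item 2 linked to brand 3
                          columns=['head', 'tail', 'relation'])

edges = [[r[0], r[1]] for r in triplet_df.values]
for r in triplet_df.values:             # reverse edge for user-item rows only
    if r[2] == 0:
        edges.append([r[1], r[0]])

G = nx.DiGraph()
G.add_nodes_from(range(4))
G.add_edges_from(edges)
print(sorted(G.edges()))                # [(0, 2), (1, 2), (2, 0), (2, 1), (2, 3)]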
Example #6
def objective(trial):
    start = time.time()
    # load hyperparameters
    # gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    # lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)

    data_dirs = [
        '../' + data_path + '/valid1/', '../' + data_path + '/valid2/'
    ]

    score_sum = 0
    for data_dir in data_dirs:
        # dataload
        dataset = AmazonDataset(data_dir)

        # load model
        #slim = train_SLIM(data_dir, load=True)
        sim_mat = load_sim_mat('sim_mat' + data_dir[-2] + '.csr',
                               len(dataset.user_list), len(dataset.item_list))

        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both the user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        evaluater = Evaluater(data_dir)
        #ranking_mat = get_ranking_mat(G, slim, alpha, beta, dataset)
        ranking_mat = get_ranking_mat(G, sim_mat, alpha, beta, dataset)
        #score = evaluater.topn_map(ranking_mat)
        score = evaluater.topn_precision(ranking_mat)

        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))

    return -1 * score_sum / 2
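
The `objective` functions in Examples #6, #7, and #8 follow the usual Optuna pattern: they return the negated mean validation score, so minimizing the objective maximizes precision/MAP. A minimal driver sketch; the trial count is arbitrary and the study settings are assumptions:

import optuna

study = optuna.create_study(direction='minimize')  # minimize the negated score
study.optimize(objective, n_trials=100)

print('best params:', study.best_params)   # e.g. {'alpha': ..., 'beta': ...}
print('best score :', -study.best_value)   # undo the sign flip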
Example #7
def objective(trial):
    start = time.time()
    # hyper parameter
    #gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    #slim = train_SLIM(lin_model, gamma)
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]

    data_dir = ['../data_luxury_5core/valid1', '../data_luxury_5core/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # dataload
        dataset = AmazonDataset(data_dir[i], model_name='TransE')
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both the user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])
        #user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    return -1 * score_sum / 2
Example #8
def objective(trial):
    start = time.time()
    # hyper parameter
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]
    
    data_dir = ['../' + data_path + '/valid1', '../' + data_path + '/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # dataload
        dataset = AmazonDataset(data_dir[i], model_name='SparseTransE')

        # load network
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both the user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))
    
    return -1 * score_sum / 2
Example #9
    # load param
    params = load_params()
    alpha = params['alpha']
    beta = params['beta']
    gamma1 = params['gamma1']
    gamma2 = params['gamma2']
    gamma3 = params['gamma3']
    gamma = [gamma1, gamma2, gamma3]

    # dataload
    dataset = AmazonDataset(data_dir, model_name='TransE')

    # load network
    edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
    # add both the user->item and item->user edges
    for r in dataset.triplet_df.values:
        if r[2] == 0:
            edges.append([r[1], r[0]])

    G = nx.DiGraph()
    G.add_nodes_from([i for i in range(len(dataset.entity_list))])
    G.add_edges_from(edges)

    ranking_mat = get_ranking_mat(G, dataset, model, gamma, alpha, beta)
    evaluater = Evaluater(data_dir)
    score = evaluater.topn_map(ranking_mat)

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    np.savetxt('score_transe3.txt', np.array([score]))
Example #10
def main(p):
    start = time.time()

    # keep only the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: check that there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # data processing

            # grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))


            # handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # it is enough to drop the record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')


            # handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # update the corresponding counters in Redis for the new actors
            counter.count_actor_list(actors)

            # compute the val of every record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # update the database with today's per-user val increments
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
Example #11
def main(p):
    start = time.time()

    # keep only the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: check that there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # data processing

            # grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # it is enough to drop the record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # update the corresponding counters in Redis for the new actors
            counter.count_actor_list(actors)

            # compute the val of every record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # update the database with today's per-user val increments
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
Example #12
    # dataload
    data_dir = '../data_luxury_5core/test/'
    dataset = AmazonDataset(data_dir)

    # load model
    slim_param = pickle.load(open('best_param_slim.pickle', 'rb'))
    slim = train_SLIM2(data_dir, slim_param)

    # load network
    edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
    ## add both the user->item and item->user edges
    for r in dataset.triplet_df.values:
        if r[2] == 0:
            edges.append([r[1], r[0]])

    G = nx.DiGraph()
    G.add_nodes_from([i for i in range(len(dataset.entity_list))])
    G.add_edges_from(edges)

    evaluater = Evaluater(data_dir)
    model_mat = load_sim_mat('sim_mat_test.csr', len(dataset.user_list),
                             len(dataset.item_list))
    ranking_mat = get_ranking_mat(G, model_mat, alpha, beta, dataset)
    #score = evaluater.topn_map(ranking_mat)
    score = evaluater.topn_precision(ranking_mat)

    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))

    np.savetxt('score.txt', np.array([score]))
Example #13
class Inference():
    def __init__(self, data_dir):

        # it would actually be faster to pass an AmazonDataset instance here, but
        self.evaluater = Evaluater(data_dir)
        self.dataset = AmazonDataset(data_dir, model_name='TransE')
        edges = [[r[0], r[1]] for r in self.dataset.triplet_df.values]
        # add both the user->item and item->user edges
        for r in self.dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            [i for i in range(len(self.dataset.entity_list))])
        self.G.add_edges_from(edges)

    def get_score(self, model, gamma, alpha, beta):
        ranking_mat = self.get_ranking_mat(model, gamma, alpha, beta)
        score = self.evaluater.topn_map(ranking_mat)
        return score

    def mk_sparse_sim_mat(self, model, gamma):
        item_idx = torch.tensor([
            self.dataset.entity_list.index(i) for i in self.dataset.item_list
        ],
                                dtype=torch.long,
                                device=device)

        user_idx = torch.tensor([
            self.dataset.entity_list.index(u) for u in self.dataset.user_list
        ],
                                dtype=torch.long,
                                device=device)

        brand_idx = torch.tensor([
            self.dataset.entity_list.index(b) for b in self.dataset.brand_list
        ],
                                 dtype=torch.long,
                                 device=device)

        # would like to write this part more cleanly
        item_embed = model.entity_embed(item_idx)
        item_sim_mat = F.relu(torch.mm(item_embed, torch.t(item_embed)))
        item_sim_mat = gamma[0] * scipy.sparse.csr_matrix(
            item_sim_mat.to('cpu').detach().numpy().copy())

        user_embed = model.entity_embed(user_idx)
        user_sim_mat = F.relu(torch.mm(user_embed, torch.t(user_embed)))
        user_sim_mat = gamma[1] * scipy.sparse.csr_matrix(
            user_sim_mat.to('cpu').detach().numpy().copy())

        brand_embed = model.entity_embed(brand_idx)
        brand_sim_mat = F.relu(torch.mm(brand_embed, torch.t(brand_embed)))
        brand_sim_mat = gamma[2] * scipy.sparse.csr_matrix(
            brand_sim_mat.to('cpu').detach().numpy().copy())

        M = scipy.sparse.block_diag(
            (item_sim_mat, user_sim_mat, brand_sim_mat))
        M_ = np.array(1 - M.sum(axis=1) / np.max(M.sum(axis=1)))

        M = M / np.max(M.sum(axis=1)) + scipy.sparse.diags(M_.transpose()[0])
        return M

    def pagerank_scipy(self,
                       sim_mat,
                       personal_vec=None,
                       alpha=0.85,
                       beta=0.01,
                       max_iter=100,
                       tol=1.0e-6,
                       weight='weight',
                       dangling=None):

        N = len(self.G)
        if N == 0:
            return {}

        nodelist = self.G.nodes()
        M = nx.to_scipy_sparse_matrix(self.G,
                                      nodelist=nodelist,
                                      weight=weight,
                                      dtype=float)
        S = scipy.array(M.sum(axis=1)).flatten()
        S[S != 0] = 1.0 / S[S != 0]
        Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
        M = Q * M

        # blend the transition matrix with sim_mat
        #sim_mat = mk_sparse_sim_mat(G, item_mat)
        M = beta * M + (1 - beta) * sim_mat

        # initial vector
        x = scipy.repeat(1.0 / N, N)

        # Personalization vector
        p = personal_vec

        dangling_weights = p
        is_dangling = scipy.where(S == 0)[0]

        #print(x.shape)
        #print(M.shape)
        #print(p.shape)

        ppr_mat = []
        for i in range(p.shape[1]):
            ppr = self.power_iterate(N, M, x, p[:, i], dangling_weights[:, i],
                                     is_dangling, alpha, max_iter, tol)
            ppr_mat.append(ppr)

            #if i > 100:
            #    print(np.array(ppr_mat).shape)
            #    break

        return np.array(ppr_mat)

    def power_iterate(self,
                      N,
                      M,
                      x,
                      p,
                      dangling_weights,
                      is_dangling,
                      alpha,
                      max_iter=500,
                      tol=1.0e-6):
        #print(M.shape)
        #print(x.shape)
        #print(p.shape)
        # power iteration: make up to max_iter iterations
        for i in range(max_iter):
            xlast = x
            x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
                (1 - alpha) * p
            # check convergence, l1 norm
            x = x / x.sum()
            err = scipy.absolute(x - xlast).sum()
            if err < N * tol:
                #return dict(zip(nodelist, map(float, x)))
                #print(i)
                return x
        # TODO: handle PageRank convergence failure properly
        #print(x.sum())
        #print(err)
        #print(N * tol)
        #raise NetworkXError('pagerank_scipy: power iteration failed to converge '
        #'in %d iterations.' % max_iter)

        #return dict(zip(nodelist, map(float, x)))
        return x

    def item_ppr(self, sim_mat, alpha, beta):

        # build personal_vec (entity_size x user_size)
        user_idx = [
            self.dataset.entity_list.index(u) for u in self.dataset.user_list
        ]
        personal_vec = []
        for u in user_idx:
            val = np.zeros(len(self.G.nodes()))
            val[u] = 1
            personal_vec.append(val[np.newaxis, :])
        personal_vec = np.concatenate(personal_vec, axis=0).transpose()

        #ppr = pagerank_torch(G, sim_mat, personal_vec, alpha, beta)
        ppr = self.pagerank_scipy(sim_mat, personal_vec, alpha, beta)

        item_idx = [
            self.dataset.entity_list.index(i) for i in self.dataset.item_list
        ]
        pred = ppr[:, item_idx]
        #print(pred.shape)
        return pred

    def get_ranking_mat(self, model, gamma, alpha=0.85, beta=0.01):
        ranking_mat = []
        #sim_mat = reconstruct_kg(model)
        sim_mat = self.mk_sparse_sim_mat(model, gamma)
        pred = self.item_ppr(sim_mat, alpha, beta)
        #print(pred.shape)
        for i in range(len(self.dataset.user_list)):
            sorted_idx = np.argsort(np.array(pred[i]))[::-1]
            ranking_mat.append(sorted_idx)
            #break
        return ranking_mat
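
`pagerank_scipy` and `power_iterate` in Example #13 implement personalized PageRank by power iteration over the blended matrix `beta * M + (1 - beta) * sim_mat`, with one restart vector per user. The core update can be checked on a toy graph; a self-contained numpy sketch of the same iteration (no dangling nodes, made-up 3-node cycle):

import numpy as np

# Toy personalized PageRank on the cycle 0 -> 1 -> 2 -> 0, restarting at node 0.
M = np.array([[0, 1, 0],
              [0, 0, 1],
              [1, 0, 0]], dtype=float)   # row-stochastic transition matrix
p = np.array([1.0, 0.0, 0.0])            # personalization (restart) vector
alpha, x = 0.85, np.full(3, 1 / 3)       # damping factor, uniform start

for _ in range(100):
    xlast = x
    x = alpha * (x @ M) + (1 - alpha) * p   # same update as power_iterate()
    x = x / x.sum()
    if np.abs(x - xlast).sum() < 3 * 1.0e-6:
        break

print(x)   # mass is highest at node 0 and decays along 0 -> 1 -> 2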
Example #14
#percentage = [10, 20, 30]
#k = [5]
#percentage = [10]
prod = product(datafiles, k, metrics, r_method, n)
for filename, k, metric, r_m, n in prod:
    if filename == 'datahouse':
        X, y = getXy(filename)
    else:
        X, y = getXy(filename, delimiter=',')
    if n:
        normalize(X)
        print 'data is normalized'
    else:
        print 'data is not normalized'
    reg = KNeighborClassifier(X, y, k, metric, r_method=r_m)
    e = Evaluater(reg, test_percentage=30)
    print '\t', e.evaluate()
    print '\n'

    #reg = KNeighborClassifier(X, y, m, sim_cosine, r_method="uniform")
    #e = Evaluater(reg, test_percentage = l)
    #print 'Test: k=%s, test/all_data=%s' % (m, l)
    #print "\t", e.evaluate()
    #del reg, e
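
Example #14 iterates over module-level grids that are defined elsewhere in its source file. An illustrative, self-contained set of definitions matching the shapes the loop implies; every value below is a guess, and only 'datahouse' and the name sim_cosine appear in the original snippet:

from itertools import product

# Guessed module-level settings for Example #14; the real definitions live
# elsewhere in its source file, so everything here is illustrative.
def sim_cosine(a, b):                    # placeholder similarity function
    num = sum(x * y for x, y in zip(a, b))
    den = (sum(x * x for x in a) * sum(y * y for y in b)) ** 0.5
    return num / den if den else 0.0

datafiles = ['datahouse', 'dataother']   # dataset names passed to getXy()
k = [5, 9, 13]                           # neighbour counts to try
metrics = [sim_cosine]                   # similarity functions, cf. the commented-out call above
r_method = ['uniform']                   # rating aggregation method
n = [True, False]                        # whether to normalize X first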
Example #15
            bad_iteration += 1
            continue
        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability
#		print availability_dict
        match_job_dict = scheduler.schedule(time_point, lease_period,
                                            availability_dict)
        if not match_job_dict:
            print "Error : match_job_dict is none"
            bad_iteration += 1
            continue
#		for job in match_job_dict:
#			print job, match_job_dict[job]
        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period,
                                        match_job_dict)
        iteration += 1
    print "data_loss : ", data_loss
    print "data_iterated : ", data_iterated
    print "iteration, bad_iteration : ", iteration, bad_iteration
    loss_rate_list.append(float(data_loss) / float(data_iterated))
    print loss_rate_list
print loss_rate_list

resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'}
avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'}
file_name = (resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[2]] +
             '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt')