Example #1
def train_model(para, data, path_excel):
    ## data and hyperparameters
    [train_data, train_data_interaction, user_num, item_num, test_data, pre_train_feature, hypergraph_embeddings, graph_embeddings, propagation_embeddings, sparse_propagation_matrix, _] = data
    [_, _, MODEL, LR, LAMDA, LAYER, EMB_DIM, BATCH_SIZE, TEST_USER_BATCH, N_EPOCH, IF_PRETRAIN, _, TOP_K] = para[0:13]
    if MODEL == 'LightLCFN': [_, _, _, KEEP_PROB, SAMPLE_RATE, GRAPH_CONV, PREDICTION, LOSS_FUNCTION, GENERALIZATION, OPTIMIZATION, IF_TRANSFORMATION, ACTIVATION, POOLING] = para[13:]
    if MODEL == 'SGNN': [_, PROP_EMB, _] = para[13:]
    para_test = [train_data, test_data, user_num, item_num, TOP_K, TEST_USER_BATCH]
    ## Define the model
    if MODEL == 'MF': model = model_MF(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA)
    if MODEL == 'NCF': model = model_NCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'GCMC': model = model_GCMC(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, sparse_graph=sparse_propagation_matrix)
    if MODEL == 'NGCF': model = model_NGCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, sparse_graph=sparse_propagation_matrix)
    if MODEL == 'SCF': model = model_SCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, sparse_graph=sparse_propagation_matrix)
    if MODEL == 'CGMC': model = model_CGMC(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, sparse_graph=sparse_propagation_matrix)
    if MODEL == 'LightGCN': model = model_LightGCN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, sparse_graph=sparse_propagation_matrix)
    if MODEL == 'LCFN': model = model_LCFN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN, graph_embeddings=hypergraph_embeddings)
    if MODEL == 'LightLCFN': model = model_LightLCFN(n_users=user_num, n_items=item_num, lr=LR, lamda=LAMDA, emb_dim=EMB_DIM, layer=LAYER, pre_train_latent_factor=pre_train_feature, graph_embeddings=graph_embeddings, graph_conv=GRAPH_CONV, prediction=PREDICTION, loss_function=LOSS_FUNCTION, generalization=GENERALIZATION, optimization=OPTIMIZATION, if_pretrain=IF_PRETRAIN, if_transformation=IF_TRANSFORMATION, activation=ACTIVATION, pooling=POOLING)
    if MODEL == 'SGNN': model = model_SGNN(n_users=user_num, n_items=item_num, lr=LR, lamda=LAMDA, emb_dim=EMB_DIM, layer=LAYER, pre_train_latent_factor=pre_train_feature, propagation_embeddings=propagation_embeddings, if_pretrain=IF_PRETRAIN, prop_emb=PROP_EMB)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    ## Split the training samples into batches
    batches = list(range(0, len(train_data_interaction), BATCH_SIZE))
    batches.append(len(train_data_interaction))
    ## Training iteratively
    F1_max = 0
    F1_df = pd.DataFrame(columns=TOP_K)
    NDCG_df = pd.DataFrame(columns=TOP_K)
    t1 = time.time()
    for epoch in range(N_EPOCH):
        for batch_num in range(len(batches) - 1):
            train_batch_data = []
            for sample in range(batches[batch_num], batches[batch_num + 1]):
                (user, pos_item) = train_data_interaction[sample]
                sample_num = 0
                while sample_num < (SAMPLE_RATE if MODEL == 'LightLCFN' else 1):
                    neg_item = int(rd.uniform(0, item_num))
                    if not (neg_item in train_data[user]):
                        sample_num += 1
                        train_batch_data.append([user, pos_item, neg_item])
            train_batch_data = np.array(train_batch_data)
            _, loss = sess.run([model.updates, model.loss], feed_dict={model.users: train_batch_data[:, 0], model.pos_items: train_batch_data[:, 1], model.neg_items: train_batch_data[:, 2], model.keep_prob: KEEP_PROB if MODEL == 'LightLCFN' else 1})
        ## test the model each epoch
        F1, NDCG = test_model(sess, model, para_test)
        F1_max = max(F1_max, F1[0])
        ## print performance
        # print_value([epoch + 1, loss, F1_max, F1, NDCG])
        if epoch % 10 == 0: print('%.5f' % F1_max, end=' ', flush=True)
        ## save performance
        F1_df.loc[epoch + 1] = F1
        NDCG_df.loc[epoch + 1] = NDCG
        save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)
        if loss > 10 ** 10: break
    t2 = time.time()
    print('time cost:', (t2 - t1) / 200)
    return F1_max
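The inner sampling loop above draws, for each observed (user, positive item) pair, one or more items the user has never interacted with; these (user, pos, neg) triples feed the pairwise loss. A minimal standalone sketch of that step (the helper name sample_negatives is hypothetical, and train_data is assumed to map each user to the set of items they interacted with):

import random as rd

def sample_negatives(user, pos_item, train_data, item_num, n_neg=1):
    # Uniformly draw n_neg items the user has not interacted with.
    triples = []
    while len(triples) < n_neg:
        neg_item = rd.randrange(item_num)
        if neg_item not in train_data[user]:
            triples.append([user, pos_item, neg_item])
    return triples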
Example #2
    for key in list_index:
        value = para_dict.get(key)
        if len(value) > 1:  # the value stored for this parameter (a set) contains more than one entry
            changed_para.append(key)  # so this parameter's value changed between experiments
        temp_list = []
        temp_list.append(key)
        for v in value:
            temp_list.append(v)  # join the parameter name and its values into one list
        para_df = para_df.append(pd.DataFrame(temp_list).T)  # store this list as one dataframe row

    para_df = para_df.set_index(0)  # set column 0 (the parameter names) as the index
    changed_para_str = ', '.join(changed_para)  # join the names of the changed parameters into a string, used in the output filename
    path_write = path + '\\data_collection' + '\\' + model_dataset + '_' + changed_para_str + '_' + str(
        int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'
    save_value([[para_df, 'Parameters']], path_write, first_sheet=True)  # save every parameter name and value that appeared

    if len(changed_para) == 1 or len(changed_para) == 0:  # only one parameter changed (no change is treated as a special case of one change)
        if len(changed_para) == 1:
            index_name = str(changed_para[0])
            print("(*/ω\*)     ", model_dataset + ": " + changed_para[0] + " is the variable")  # print the changed parameter
        else:
            print("o(≧口≦)o    ", model_dataset + ": there are no changed parameters")
            index_name = "dataset"
        top_k = str(para_df.loc['top_k', 1])  # some string cleanup of the top_k value
        top_k = top_k.strip(',')
        top_k = top_k.strip('[')
        top_k = top_k.strip(']')
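The parameter table above is assembled row by row with DataFrame.append, which was removed in pandas 2.0. A minimal sketch of the same accumulation with pd.concat, assuming para_dict maps each parameter name to the set of values it took (build_para_df is a hypothetical helper, not part of the original code):

import pandas as pd

def build_para_df(para_dict, list_index):
    # One row per parameter: [name, value1, value2, ...]; column 0 then becomes the index.
    rows = [pd.DataFrame([key] + list(para_dict[key])).T for key in list_index]
    return pd.concat(rows, ignore_index=True).set_index(0)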
Example #3
        for e in wb.sheetnames:
            sheets.add(e)
    sheets = list(sheets)
    sheets.sort()

    parameter = pd.DataFrame(pd.read_excel(path_read + '\\' + value[0], sheet_name=0, header=None, index_col=0))  # load the file's parameters as a dataframe, with column 0 as the index
    parameter.index.name = 'para'                       # set the dataframe's index and column names
    parameter.columns.name = 'value'

    dataset = str(parameter.loc['DATASET', 1])       # keep the file's dataset and model for the output filename
    model = str(parameter.loc['MODEL', 1])
    #eta = str(parameter.loc['eta',1])
    #lambda_r = str(parameter.loc['lambda_r',1])
    path_write = path + '\\data_process\\' + dataset + '_' + model + '_' + str(int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'  # path of the output file
    #path_write = path + '\\data_process\\' +dataset+'_'+model+'_eta='+eta+'_lambda='+lambda_r+'_' + str(int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'
    save_value([[parameter, 'Parameters']], path_write, first_sheet=True)  # save the parameters into the excel file
    for sheet in sheets:                                                     # process every sheet in the workbook, including F1 and NDCG
        if sheet != 'Parameters' and sheet != 'Filename':                    # skip the Parameters and Filename sheets
            df_max = pd.DataFrame()                                          # results for this sheet (F1 or NDCG) built from the max value of each source table
            df_top = pd.DataFrame()                                          # results built from the average of the top_ave best values of each source table
            for file_p in value:                                             # process every file in the list stored under this key
                temp_f = load_workbook(path_read + '\\' + file_p)
                temp_sn = temp_f.sheetnames
                if sheet in temp_sn:
                    metric = pd.DataFrame(pd.read_excel(path_read + '\\' + file_p, sheet_name=sheet, header=0, index_col=0))  # read one sheet of one file
                    list_max = process_metric(metric, method='max', para=top_ave)   # process this metric to get F1_max or NDCG_max; the result is one row
                    list_top = process_metric(metric, method='top', para=top_ave)   # process this metric to get F1_top or NDCG_top; the result is one row
                    df_max = df_max.append(list_max, ignore_index=True)             # append list_max and list_top to their dataframes
                    df_top = df_top.append(list_top, ignore_index=True)             # each row of df_top and df_max now comes from one result file
            df_top = df_top.append(df_top.mean(), ignore_index=True)                # finally append the mean over all runs to each dataframe
            df_max = df_max.append(df_max.mean(), ignore_index=True)
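process_metric is not shown in this snippet; judging from the comments, it condenses one metric sheet (rows = epochs, columns = top-k cutoffs) into a single row, either the per-column maximum or the mean of the best few values. A rough sketch of what such a helper could look like (this is an assumption inferred from the comments, not the original implementation):

import pandas as pd

def process_metric(metric, method='max', para=10):
    # 'max': best value per column over all epochs;
    # 'top': mean of the `para` largest values per column.
    if method == 'max':
        row = metric.max()
    else:
        row = metric.apply(lambda col: col.nlargest(para).mean())
    return row.to_frame().T  # one-row DataFrame, ready to collect into df_max / df_top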