Пример #1
0
def bl_IsRec_best(a_dataset):
    """Run the 'IsRec_best' baseline for each configured cluster topic count.

    For every topic count a ``HINRec_new`` model is built. If its weight file
    already exists nothing else happens for that configuration; otherwise the
    model is trained, saved and evaluated with ``evalute_by_epoch``.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :return: None
    """
    # Other topic counts that were tried: 10, 25, 75, 100, 125, 150
    topic_nums = [50]
    # Settings shared by every model built below.
    # Alternative model names: 'IsRec', 'IsRec_best_modified'
    shared_settings = dict(
        model_name='IsRec_best',
        semantic_mode='TF_IDF',
        epoch_num=20,
        neighbor_size=15,
        topTopicNum=3,
        cluster_mode='LDA',
    )

    # NOTE(review): train_data is unpacked but never used in this function.
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    for topic_num in topic_nums:
        HINRec_model = HINRec_new(**shared_settings,
                                  cluster_mode_topic_num=topic_num)

        if os.path.exists(HINRec_model.weight_path):
            # Despite the wording of the message, the loop simply moves on to
            # the next topic count; nothing is evaluated for a trained model.
            print('have trained,return!')
            continue

        # test_data is what gets handed to train() here.
        # NOTE(review): presumably for periodic evaluation while training
        # -- confirm against HINRec_new.train.
        HINRec_model.train(test_data)
        HINRec_model.save_model()

        # The same object is passed as both the recommend model and the core.
        evalute_by_epoch(HINRec_model,
                         HINRec_model,
                         HINRec_model.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
Пример #2
0
def bl_PasRec(a_dataset):
    """Train (when no weight file exists yet) and evaluate 'PasRec_2path'.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :return: None
    """
    # NOTE(review): train_data is unpacked but never used in this function.
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    # epoch_num was 40 previously; per the original author 40 performed
    # slightly worse than 20.
    # An LDA-based similarity variant was also tried by passing
    # semantic_mode='LDA' and LDA_topic_num in (50, 100, 150).
    HINRec_model = HINRec_new(model_name='PasRec_2path',
                              epoch_num=20,
                              neighbor_size=15,
                              topTopicNum=3)

    if os.path.exists(HINRec_model.weight_path):
        print('have trained,return!')
        return

    # The model is tested every 20 epochs during training, which is why
    # test_data is passed to train().
    HINRec_model.train(test_data)
    HINRec_model.save_model()

    # The same object is passed as both the recommend model and the core.
    evalute_by_epoch(HINRec_model,
                     HINRec_model,
                     HINRec_model.model_name,
                     test_data,
                     evaluate_by_slt_apiNum=True)
Пример #3
0
def bl_IsRec(a_dataset):
    """Run the 'IsRec' baseline over a grid of epoch counts and topic counts.

    Each (epoch_num, topTopicNum) pair gets its own ``HINRec_new`` model. A
    pair whose weight file already exists is skipped; otherwise the model is
    trained, saved and evaluated with ``evalute_by_epoch``.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :return: None
    """
    epoch_nums = [20]  # other values tried: 15, 100, 1000
    topTopicNums = [3]  # other values tried: 3, 4, 5, 6
    # Same visiting order as a nested loop: epochs outer, topic counts inner.
    grid = [(n_epochs, n_topics)
            for n_epochs in epoch_nums
            for n_topics in topTopicNums]

    # NOTE(review): train_data is unpacked but never used in this function.
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    for n_epochs, n_topics in grid:
        HINRec_model = HINRec_new(model_name='IsRec',
                                  epoch_num=n_epochs,
                                  neighbor_size=15,
                                  topTopicNum=n_topics)

        if os.path.exists(HINRec_model.weight_path):
            # Message says "return" but the remaining grid points still run.
            print('have trained,return!')
            continue

        HINRec_model.train(test_data)
        HINRec_model.save_model()

        # The same object is passed as both the recommend model and the core.
        evalute_by_epoch(HINRec_model,
                         HINRec_model,
                         HINRec_model.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
Пример #4
0
def bl_DHSR(a_dataset):
    """Build the DHSR baseline, obtain its trained core model and evaluate it.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :return: None
    """
    recommender = DHSR_model()
    core_model = recommender.get_model()

    # The original author left an open question here: should duplicate
    # samples be removed via a_dataset.transfer() for the 'newScene' setting
    # with need_slt_apis=False?

    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    # Training settings come from the global parameter object and are passed
    # positionally ('monitor loss&acc' in the original note).
    core_model = load_preTrained_model(recommender,
                                       core_model,
                                       train_data,
                                       test_data,
                                       *new_Para.param.train_paras)
    recommender.save_sth()

    evalute_by_epoch(recommender,
                     core_model,
                     recommender.model_name,
                     test_data)
Пример #5
0
def bl_DHSR_new(a_dataset):
    """Train and evaluate one DHSR model per number of already-selected services.

    :param a_dataset: object providing ``transfer_false_test_DHSR``
    :return: None
    """
    # if_reduct_train controls whether the training set is reduced.
    train_datas, test_datas = a_dataset.transfer_false_test_DHSR(
        if_reduct_train=True)

    # A separate model is trained and evaluated for each count of selected
    # services, from 1 up to slt_item_num inclusive.
    max_slt_num = new_Para.param.slt_item_num
    for slt_num in range(1, max_slt_num + 1):
        position = slt_num - 1  # the data lists are 0-based
        train_data = train_datas[position]
        test_data = test_datas[position]

        # old_new may be 'new', 'new_sigmoid' or 'new_reduct'; the original
        # author reports 'new_reduct' as the best performing.
        recommender = DHSR_model(old_new='new_reduct', slt_num=slt_num)
        core_model = load_preTrained_model(recommender,
                                           recommender.get_model(),
                                           train_data,
                                           test_data,
                                           *new_Para.param.train_paras)

        evalute_by_epoch(recommender,
                         core_model,
                         recommender.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
        recommender.save_sth()
        print('DHSR, slt_num:{}, train_predict,done!'.format(slt_num))
Пример #6
0
def DINRec(a_dataset, new_old='new'):
    """Prepare a CI model, stack a DIN_Rec model on top of it, and evaluate.

    :param a_dataset: object exposing ``train_data`` and ``test_data``
    :param new_old: forwarded to ``CI_Model`` ('new' by default, or 'old')
    :return: None
    """
    train_data = a_dataset.train_data
    test_data = a_dataset.test_data

    def _prepare_and_load(recommender):
        # Prepare the recommender, fetch its core model and obtain the
        # (pre)trained version using the global training settings.
        recommender.prepare()
        core = recommender.get_model()
        return load_preTrained_model(recommender, core, train_data, test_data,
                                     *new_Para.param.train_paras)

    # Stage 1: the CI model. Its loaded core is not used again directly; the
    # CI recommender object itself is what DIN_Rec receives.
    CI_recommend_model = CI_Model(new_old)
    _prepare_and_load(CI_recommend_model)

    # Stage 2: DIN_Rec built from the CI recommender.
    DINRec_model = DIN_Rec(CI_recommend_model,
                           new_Para.param.predict_fc_unit_nums)
    DINRec_model_obj = _prepare_and_load(DINRec_model)

    evalute_by_epoch(DINRec_model,
                     DINRec_model_obj,
                     DINRec_model.simple_name,
                     test_data,
                     if_save_recommend_result=True,
                     evaluate_by_slt_apiNum=True)
Пример #7
0
def train_save_by_early_stop(recommend_model, model, train_data, test_data):
    """
    Fit with early stopping on the validation loss, save the weights, then
    evaluate the model on the test data.

    :param recommend_model: the overall recommendation model (provides the
        instances, the optimizer, the names and ``model_dir``)
    :param model: the core model; used directly for training unless pairwise
        mode is on, and always the one whose weights are saved and evaluated
    :param train_data: sequence whose last element holds the labels and whose
        other elements are passed to ``recommend_model.get_instances``
    :param test_data: forwarded to ``evalute_by_epoch``
    :return: ``model``
    """
    pairwise = new_Para.param.pairwise

    train_labels = train_data[-1]
    # NOTE(review): test_phase_flag is True exactly when pairwise is truthy,
    # the opposite polarity of train_save_best_NDCG_model -- confirm intent.
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], test_phase_flag=bool(pairwise))

    if pairwise:
        train_model = recommend_model.get_pairwise_model()
        # In pairwise mode the loss is the network output itself.
        loss = lambda y_true, y_pred: y_pred
    else:
        train_model = model
        loss = 'binary_crossentropy'
    train_model.compile(optimizer=recommend_model.optimizer,
                        loss=loss,
                        metrics=['accuracy'])

    stopper = EarlyStopping(monitor='val_loss',
                            patience=10,
                            verbose=2,
                            mode='min')
    hist = train_model.fit(list(train_instances_tuple),
                           train_labels,
                           epochs=new_Para.param.num_epochs,
                           batch_size=new_Para.param.small_batch_size,
                           callbacks=[stopper],
                           validation_split=new_Para.param.validation_split,
                           shuffle=True)

    # Flagged "to be corrected" by the original author.
    # NOTE(review): the weights written are whatever `model` holds when fit()
    # returns; EarlyStopping is created without restore_best_weights, so
    # verify they really are the min-val-loss weights.
    weights_path = dataset.crt_ds.new_model_para_path.format(
        recommend_model.model_dir, 'min_loss')
    model.save_weights(weights_path)

    model_name = (recommend_model.get_simple_name() +
                  recommend_model.get_name() + '_min_loss')
    save_loss_acc(hist, model_name, if_multi_epoch=True)

    # The evaluation result is not used further here.
    evalute_by_epoch(recommend_model, model, model_name, test_data)
    return model
Пример #8
0
 def test_model(self, test_data):
     """Evaluate this model on ``test_data`` via ``evalute_by_epoch``.

     The instance is passed as both the recommend model and the core model,
     with ``self.model_name`` as the label. The evaluation result is
     discarded, so this method returns None.
     """
     evalute_by_epoch(self, self, self.model_name, test_data)
Пример #9
0
def train_save_best_NDCG_model(recommend_model,
                               model,
                               train_data,
                               test_data,
                               start_epoch=0,
                               true_candidates_dict=None,
                               CI_start_test_epoch=2,
                               earlyStop_epochs=5):
    """
    Train one epoch at a time, evaluate after each epoch, and keep the weights
    of the epoch with the best NDCG@5 (read from ``result[0][3]`` of the value
    returned by ``evalute_by_epoch``).

    :param recommend_model: the overall recommendation model
    :param model: the core model (model_core); its weights are the ones saved
        and reloaded
    :param train_data: sequence whose last element holds the labels and whose
        other elements are passed to ``recommend_model.get_instances``
    :param test_data: data passed to ``evalute_by_epoch`` after each epoch
    :param start_epoch: number of epochs this model was already trained for;
        training continues from there and the previously stored best epoch and
        best NDCG@5 are read back from disk
    :param true_candidates_dict: forwarded unchanged to ``evalute_by_epoch``
    :param CI_start_test_epoch: first epoch that is evaluated when
        ``recommend_model`` is a ``CI_Model`` but not a ``NI_Model_online``
        (earlier epochs are skipped to save time)
    :param earlyStop_epochs: stop once this many epochs have passed since the
        best epoch without an improvement
    :return: ``model``; the best epoch's weights are loaded into it when the
        final cleanup/reload step succeeds
    """
    print('training_save_best_NDCG_model...')
    # Evaluation results keyed by ABSOLUTE epoch number. A positional list is
    # misaligned as soon as start_epoch > 0 and has no entry for a best epoch
    # that came from an earlier run.
    epoch_evaluate_results = {}

    train_labels = train_data[-1]
    if new_Para.param.final_activation == 'softmax':
        # softmax output needs one-hot labels
        train_labels = utils.to_categorical(train_labels, num_classes=2)
    train_targets = np.array(train_labels)

    # Instances are built in "train" mode for pairwise training, online NI
    # ('OL_GE') and train_mashup_best; in every other case the test-phase
    # instance builder is used.
    test_phase_flag = not (new_Para.param.pairwise
                           or new_Para.param.NI_OL_mode == 'OL_GE'
                           or new_Para.param.train_mashup_best)
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], test_phase_flag=test_phase_flag)
    # A tuple of inputs is handed to fit() as a list; anything else as is.
    if isinstance(train_instances_tuple, tuple):
        train_inputs = list(train_instances_tuple)
    else:
        train_inputs = train_instances_tuple

    # The best-epoch / best-NDCG files are read from the same location they
    # are written to at the end of this function, so a resumed run picks up
    # what the previous run stored.
    best_epoch_path = dataset.crt_ds.new_best_epoch_path.format(
        recommend_model.model_dir)
    best_NDCG_path = dataset.crt_ds.new_best_NDCG_path.format(
        recommend_model.model_dir)

    if start_epoch > 0:
        with open(best_epoch_path, 'r') as f:
            best_epoch = int(f.read().strip())
        with open(best_NDCG_path, 'r') as f:
            best_NDCG_5 = float(f.read().strip())
    else:
        best_epoch, best_NDCG_5 = 0, 0

    # Switch to the pairwise model when required.
    train_model = recommend_model.get_pairwise_model(
    ) if new_Para.param.pairwise else model

    # The first epoch worth evaluating does not change between epochs.
    if isinstance(recommend_model, CI_Model) and not isinstance(
            recommend_model, NI_Model_online):
        first_test_epoch = CI_start_test_epoch
    else:
        first_test_epoch = 0

    # Weights are stored under model_dir, so it has to exist.
    os.makedirs(recommend_model.model_dir, exist_ok=True)

    for offset in range(new_Para.param.num_epochs - start_epoch):
        if start_epoch == 0 and offset == 0:  # compile only on a fresh run
            if new_Para.param.pairwise:
                # In pairwise mode the loss is the network output itself.
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss=lambda y_true, y_pred: y_pred,
                                    metrics=['accuracy'])
            else:
                train_model.compile(optimizer=recommend_model.optimizer,
                                    loss='binary_crossentropy',
                                    metrics=['accuracy'])
            print('model compile,done!')

        if start_epoch > 0:
            # Resume from the previously trained model.
            # NOTE(review): this runs on every iteration, not only the first,
            # and also replaces a pairwise train_model -- confirm that
            # load_trained_model does not discard the epochs trained so far.
            train_model = load_trained_model(recommend_model, model)

        epoch = start_epoch + offset
        print('Epoch {}'.format(epoch))

        # validation_split makes over-/under-fitting visible in the history.
        hist = train_model.fit(
            train_inputs,
            train_targets,
            batch_size=new_Para.param.batch_size,
            epochs=1,
            verbose=2,
            shuffle=True,
            validation_split=new_Para.param.validation_split)
        print('model train,done!')

        # The descriptive name (dataset, model, training setup) is only
        # emitted for epoch 0; later epochs use an empty name.
        if epoch == 0:
            model_name = (dataset.crt_ds.data_name +
                          recommend_model.get_name() +
                          new_Para.param.train_name)
        else:
            model_name = ''

        save_loss_acc(hist, model_name, epoch=epoch)

        if epoch < first_test_epoch:  # not evaluated yet, to save time
            continue

        if epoch == first_test_epoch:
            # Label the timing file for the first evaluated epoch.
            with open(new_Para.param.time_path, 'a+') as f1:
                f1.write(recommend_model.get_simple_name() + '\n')

        # `model` is passed as is: per the original author it shares its
        # parameters with the pairwise model, so no separate single model has
        # to be fetched.
        # NOTE(review): record_time is tied to epoch == 1, which is skipped
        # whenever first_test_epoch > 1 -- confirm whether it should follow
        # first_test_epoch instead.
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            model_name,
            test_data,
            record_time=(epoch == 1),
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results[epoch] = epoch_evaluate_result

        current_NDCG_5 = epoch_evaluate_result[0][3]
        if current_NDCG_5 >= best_NDCG_5:
            # Only an epoch at least as good as the best so far is stored.
            best_NDCG_5 = current_NDCG_5
            best_epoch = epoch
            model.save_weights(
                dataset.crt_ds.new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        elif epoch - best_epoch >= earlyStop_epochs:
            # No improvement for earlyStop_epochs epochs: stop early.
            break

    # Persist the best epoch and the best NDCG@5.
    with open(best_epoch_path, 'w') as f:
        f.write(str(best_epoch))
    with open(best_NDCG_path, 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # Write the best indicators, provided the best epoch was evaluated in
    # this run (it may stem from an earlier run, or never have been tested).
    csv_table_name = 'best_indicaters\n'
    best_result = epoch_evaluate_results.get(best_epoch)
    if best_result is not None:
        summary(new_Para.param.evaluate_path, csv_table_name, best_result,
                new_Para.param.topKs)
    else:
        print('no evaluation result for best epoch {} in this run, '
              'summary skipped'.format(best_epoch))

    # Delete the stored weights of every non-best epoch, then reload the best
    # ones. This stays best-effort (the model is returned either way), but a
    # failure is now reported instead of being swallowed by a `return` inside
    # `finally`, and KeyboardInterrupt/SystemExit propagate.
    try:
        for i in range(new_Para.param.num_epochs):
            temp_path = dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)

        model.load_weights(
            dataset.crt_ds.new_model_para_path.format(
                recommend_model.model_dir, best_epoch))
    except Exception as err:
        print('cleaning up / reloading best weights failed: {!r}'.format(err))
    return model