Example #1
def bl_IsRec_best(a_dataset):
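    # Train and evaluate the IsRec_best baseline, sweeping the number of
    # LDA topics used for clustering; training is skipped if saved weights exist.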
    model_name = 'IsRec_best'  # alternatives: 'IsRec', 'IsRec_best_modified'
    epoch_num = 20
    neighbor_size = 15
    topTopicNum = 3
    cluster_mode = 'LDA'
    cluster_mode_topic_nums = [50]  # other options: 10, 25, 75, 100, 125, 150
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)
    for cluster_mode_topic_num in cluster_mode_topic_nums:
        HINRec_model = HINRec(model_name=model_name,
                              semantic_mode='TF_IDF',
                              epoch_num=epoch_num,
                              neighbor_size=neighbor_size,
                              topTopicNum=topTopicNum,
                              cluster_mode=cluster_mode,
                              cluster_mode_topic_num=cluster_mode_topic_num)

        if os.path.exists(HINRec_model.weight_path):
            print('have trained,return!')
        else:
            HINRec_model.train(test_data)
            HINRec_model.save_model()

            evalute_by_epoch(HINRec_model,
                             HINRec_model,
                             HINRec_model.model_name,
                             test_data,
                             evaluate_by_slt_apiNum=True)
Example #2
def bl_IsRec(a_dataset):
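    # Train and evaluate the IsRec baseline for every combination of epoch_num
    # and topTopicNum; combinations whose weights are already saved are skipped.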
    model_name = 'IsRec'
    epoch_nums = [20]  # other options: 15, 100, 1000
    neighbor_size = 15
    topTopicNums = [3]  # other options: [3, 4, 5, 6]

    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)

    for epoch_num in epoch_nums:
        for topTopicNum in topTopicNums:
            HINRec_model = HINRec(model_name=model_name,
                                  epoch_num=epoch_num,
                                  neighbor_size=neighbor_size,
                                  topTopicNum=topTopicNum)

            if os.path.exists(HINRec_model.weight_path):
                print('have trained,return!')
            else:
                HINRec_model.train(test_data)
                # HINRec_model.test_model(test_data)
                HINRec_model.save_model()

                evalute_by_epoch(HINRec_model,
                                 HINRec_model,
                                 HINRec_model.model_name,
                                 test_data,
                                 evaluate_by_slt_apiNum=True
                                 )  # optional: if_save_recommend_result=True
Example #3
def bl_PasRec():
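    # Train and evaluate the PasRec (2-path) baseline with fixed hyperparameters,
    # reusing previously saved weights when available.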
    model_name = 'PasRec_2path'
    epoch_num = 60  # previously 40; 40 was slightly worse than 20
    neighbor_size = 15
    topTopicNum = 3

    args = data_repository.get_args()
    train_data = data_repository.get_ds().train_data
    test_data = data_repository.get_ds().test_data

    HINRec_model = HINRec(args,
                          model_name=model_name,
                          epoch_num=epoch_num,
                          neighbor_size=neighbor_size,
                          topTopicNum=topTopicNum)
    if os.path.exists(HINRec_model.weight_path):
        print('have trained,return!')
    else:
        # the model is evaluated every 20 epochs during training, so test_data is passed into train()
        HINRec_model.train(test_data)
        HINRec_model.save_model()
        evalute_by_epoch(
            HINRec_model,
            HINRec_model,
            HINRec_model.model_name,
            test_data,
            evaluate_by_slt_apiNum=False)  # optional: if_save_recommend_result=True
Example #4
def bl_DHSR(a_dataset):
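    # Build and train the DHSR baseline on the train split, save it,
    # then evaluate it on the test split with evalute_by_epoch.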
    dhsr_recommend_model = DHSR_model()
    dhsr_model = dhsr_recommend_model.get_model()

    # a_dataset.transfer()  # remove duplicate samples? used when 'newScene' and need_slt_apis=False

    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)
    dhsr_model = train_model(dhsr_recommend_model, dhsr_model, train_data,
                             test_data,
                             *new_Para.param.train_paras)  # 'monitor loss&acc'
    dhsr_recommend_model.save_sth()
    evalute_by_epoch(
        dhsr_recommend_model, dhsr_model, dhsr_recommend_model.model_name,
        test_data
    )  # optional: if_save_recommend_result=True, evaluate_by_slt_apiNum=True
Example #5
def bl_DHSR_new(a_dataset):
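    # Train one DHSR model per number of already-selected services (slt_num)
    # and evaluate each model on its matching test split.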
    train_datas, test_datas = a_dataset.transfer_false_test_DHSR(
        if_reduct_train=True)  # whether to reduce the training set
    # train a separate model for each number of selected services and evaluate it
    for slt_num in range(1, new_Para.param.slt_item_num + 1):
        train_data, test_data = train_datas[slt_num - 1], test_datas[slt_num - 1]
        # old_new options: 'new', 'new_sigmoid', 'new_reduct'; 'new_reduct' works best
        dhsr_recommend_model = DHSR_model(old_new='new_reduct',
                                          slt_num=slt_num)
        dhsr_model = dhsr_recommend_model.get_model()
        dhsr_model = train_model(
            dhsr_recommend_model, dhsr_model, train_data, test_data,
            *new_Para.param.train_paras)  # 'monitor loss&acc'
        evalute_by_epoch(dhsr_recommend_model,
                         dhsr_model,
                         dhsr_recommend_model.model_name,
                         test_data,
                         evaluate_by_slt_apiNum=True)
        dhsr_recommend_model.save_sth()
        print('DHSR, slt_num:{}, train_predict,done!'.format(slt_num))
Example #6
def DINRec(a_dataset, new_old='new'):
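    # First train the CI model, then build and train a DIN_Rec model on top of it,
    # and finally evaluate the DIN_Rec model per epoch on the test data.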
    train_data, test_data = a_dataset.train_data, a_dataset.test_data
    CI_recommend_model = CI_Model(new_old)  # or 'old'
    CI_recommend_model.prepare()
    CI_model_obj = CI_recommend_model.get_model()
    CI_model_obj = train_model(
        CI_recommend_model, CI_model_obj, train_data, test_data,
        *new_Para.param.train_paras
    )  # optional: true_candidates_dict=HINRec_model.get_true_candi_apis(); 'monitor loss&acc'

    DINRec_model = DIN_Rec(CI_recommend_model,
                           new_Para.param.predict_fc_unit_nums)
    DINRec_model.prepare()
    DINRec_model_obj = DINRec_model.get_model()
    DINRec_model_obj = train_model(
        DINRec_model, DINRec_model_obj, train_data, test_data,
        *new_Para.param.train_paras
    )  # optional: true_candidates_dict=HINRec_model.get_true_candi_apis(); 'monitor loss&acc'
    evalute_by_epoch(DINRec_model,
                     DINRec_model_obj,
                     DINRec_model.simple_name,
                     test_data,
                     if_save_recommend_result=True,
                     evaluate_by_slt_apiNum=True)
Example #7
def train_early_stop(recommend_model, model, train_data, test_data):
    """
    训练时按照验证集的loss,early stopping得到最优的模型;最后基于该模型测试
    :return:
    """
    if_Train = True if data_repository.get_args().pairwise else False
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=if_Train)

    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model
    if data_repository.get_args().pairwise:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss=lambda y_true, y_pred: y_pred,
                            metrics=['accuracy'])
    else:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss='binary_crossentropy',
                            metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=10,
                                   verbose=2,
                                   mode='min')
    hist = train_model.fit(
        [*train_instances_tuple],
        train_labels,
        epochs=data_repository.get_args().num_epochs,
        batch_size=data_repository.get_args().small_batch_size,
        callbacks=[early_stopping],
        validation_split=data_repository.get_args().validation_split,
        shuffle=True)
    model.save_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, 'min_loss'))  # !!! fix this

    model_name = recommend_model.get_simple_name() + recommend_model.get_name(
    ) + '_min_loss'
    save_loss_acc(hist, model_name, if_multi_epoch=True)

    epoch_evaluate_result = evalute_by_epoch(recommend_model, model,
                                             model_name, test_data)
    return model
Example #8
def train_best_NDCG_model(recommend_model,
                          model,
                          train_data,
                          test_data,
                          true_candidates_dict=None,
                          CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """
    训练多个epoch,每个之后均测试,选择并返回NDCG等最终指标最优的模型
    :param recommend_model:  整体的推荐模型
    :param model:  model_core
    :param train_data:
    :param test_data:
    :param start_epoch: 之前该模型已经训练过多个epoch,在这个基础上接着训练
    :param true_candidates_dict:
    :return:
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []

    # model
    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model

    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args().final_activation == 'softmax':  # transform labels for softmax
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile the model before the first epoch
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_,metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))

        hist = train_model.fit(
            train_instances_dict,
            np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1,
            verbose=1,
            shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # record the dataset setup, model architecture, and training settings
        record_name = recommend_model.get_name() + data_repository.get_args(
        ).train_name if epoch == 0 else ''  # test-set results are written to evalute.csv
        save_loss_acc(hist, record_name, epoch=epoch)  # record every epoch

        # CI performs poorly in its first 3 epochs; usually skip testing them to speed things up
        first_test_epoch = CI_start_test_epoch if isinstance(
            recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue

        # per-epoch evaluation
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            record_name,
            test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # save the model weights only when the result beats the current best_NDCG_5  TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        else:
            if epoch - best_epoch >= earlyStop_epochs:  # no improvement for several epochs; stop early
                break

    # record the best epoch and the best NDCG@5
    with open(
            data_repository.get_ds().new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            data_repository.get_ds().new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # record the best metrics
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name, epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)

    # check whether the word-embedding matrix changed during training, especially the padding zeros
    # print('some embedding parameters after {} epoch:'.format(epoch))
    # print(recommend_model.embedding_layer.get_weights()[0][:2])

    # delete the saved weights of every non-best epoch
    try:
        for i in range(data_repository.get_args().num_epochs):
            temp_path = data_repository.get_ds().new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(data_repository.get_ds().new_model_para_path.format(
            recommend_model.model_dir, best_epoch))
    finally:
        return model
Example #9
def test_model(self, test_data):
    evalute_by_epoch(self, self, self.model_name, test_data)