Example #1
def rank_by_feature(
        save_folder,
        work_df,
        curr_feat,
        inner_fold,
        round_,
        qrel_filepath):
    # Rank documents by one feature in both sort orders, score each run with
    # NDCG@5, and persist the better of the two as the '_best' run file.
    # (qrel_filepath is taken as a parameter so the function is self-contained.)

    fin_res_dict = {}
    asc_df = get_trec_prepared_df_form_res_df(work_df, curr_feat)
    curr_file_name = inner_fold + '_0' + str(round_) + '_' + curr_feat + '_Asc.txt'
    with open(os.path.join(save_folder, curr_file_name), 'w') as f:
        f.write(convert_df_to_trec(asc_df))
    res_dict = get_ranking_effectiveness_for_res_file_per_query(
        file_path=save_folder,
        filename=curr_file_name,
        qrel_filepath=qrel_filepath,
        calc_ndcg_mrr=True)
    fin_res_dict['Asc'] = res_dict['all']['NDCG@5']

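    # Negating the feature flips the sort order, so the same ranking routine
    # produces the descending ('Dec') run. Note this mutates work_df in place.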
    work_df[curr_feat] = work_df[curr_feat].apply(lambda x: x*(-1))
    dec_df = get_trec_prepared_df_form_res_df(work_df, curr_feat)
    curr_file_name = inner_fold + '_0' + str(round_) + '_' + curr_feat + '_Dec.txt'
    with open(os.path.join(save_folder, curr_file_name), 'w') as f:
        f.write(convert_df_to_trec(dec_df))
    res_dict = get_ranking_effectiveness_for_res_file_per_query(
        file_path=save_folder,
        filename=curr_file_name,
        qrel_filepath=qrel_filepath,
        calc_ndcg_mrr=True)
    fin_res_dict['Dec'] = res_dict['all']['NDCG@5']
    best_df = asc_df if fin_res_dict['Asc'] > fin_res_dict['Dec'] else dec_df
    curr_file_name = inner_fold + '_0' + str(round_) + '_' + curr_feat + '_best.txt'
    with open(os.path.join(save_folder, curr_file_name), 'w') as f:
        f.write(convert_df_to_trec(best_df))
    return fin_res_dict
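
# A tiny self-contained illustration (toy data, not from the source corpus):
# negating a score column and sorting ascending matches sorting the original
# column descending, which is exactly what the Asc/Dec pair above exploits.
import pandas as pd

toy_df = pd.DataFrame({'doc': ['a', 'b', 'c'], 'score': [0.2, 0.9, 0.5]})
asc_of_negated = toy_df.assign(score=-toy_df['score']).sort_values('score')
desc_of_original = toy_df.sort_values('score', ascending=False)
assert list(asc_of_negated['doc']) == list(desc_of_original['doc'])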
Example #2
def get_result_for_feature_set(
        base_res_folder,
        train_df,
        valid_df,
        curr_feature_list,
        tree_num,
        leaf_num,
        qrel_filepath):
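    # Write train/validation files restricted to curr_feature_list, fit a
    # LambdaMART model, and return the validation NDCG results.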

    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(train_df, feature_list=curr_feature_list))

    with open(os.path.join(base_res_folder, 'valid.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(valid_df, feature_list=curr_feature_list))

    model_filename = learn_lambdamart_model(
        train_file=os.path.join(base_res_folder, 'train.dat'),
        models_folder=base_res_folder,
        tree_num=tree_num,
        leaf_num=leaf_num)

    predictions_filename = run_lambdamart_model(
        test_file=os.path.join(base_res_folder, 'valid.dat'),
        model_file=model_filename,
        predictions_folder=base_res_folder)

    predictions = get_predictions_list(predictions_filename)

    valid_df['ModelScore'] = predictions
    valid_df['ModelScore'] = valid_df['ModelScore'].apply(lambda x: float(x))
    curr_res_df = get_trec_prepared_df_form_res_df(
        scored_docs_df=valid_df,
        score_colname='ModelScore')
    curr_file_name = 'Curr_valid_res.txt'
    with open(os.path.join(base_res_folder, curr_file_name), 'w') as f:
        f.write(convert_df_to_trec(curr_res_df))

    res_dict = calc_ndcg_at_x_for_file(
        file_path=base_res_folder,
        filename=curr_file_name,
        qrel_filepath=qrel_filepath)

    return res_dict
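Example #3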
def run_grid_search_over_params_for_config(base_feature_filename,
                                           snapshot_limit, retrieval_model,
                                           normalize_method,
                                           snap_chosing_method,
                                           train_leave_one_out,
                                           feat_group_list, calc_ndcg_mrr):

    # optional_c_list = [0.2, 0.1, 0.01, 0.001]
    ## num 1
    # optional_feat_groups_list = ['All','Static','MG','LG','M','RMG','Static_LG','Static_MG'
    #                                 ,'Static_M', 'Static_RMG']
    ## num 2
    # optional_feat_groups_list = ['Static','MGXXSnap', 'MXXSnap','RMGXXSnap','Static_MGXXSnap'
    #                                     ,'Static_MXXSnap', 'Static_RMGXXSnap','MGXXSnap_MXXSnap_RMGXXSnap']
    ## num 3
    if feat_group_list is None:
        optional_feat_groups_list = ['Historical']
    else:
        optional_feat_groups_list = feat_group_list
    save_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/per_snap_lambdamart_res/ret_res/'
    save_summary_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/per_snap_lambdamart_res/'
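    # Pick the qrel file matching the collection encoded in the feature
    # filename; the cw12 adhoc qrels are the fallback.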
    if '2008' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels.adhoc"
    elif 'ASRC' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents.rel"
    elif 'BOT' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents_fixed.relevance"
    elif 'HERD_CONTROL' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/control.rel"
    elif 'UNITED' in base_feature_filename:
        qrel_filepath = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/united.rel'
    else:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels_cw12.adhoc"

    if snap_chosing_method == 'Months':
        snap_limit_options = ['All']
    elif snap_chosing_method == 'SnapNum':
        snap_limit_options = ['All']
    else:
        raise Exception("Unknown snap_chosing_method!")

    model_base_filename = base_feature_filename.replace(
        'All_features_with_meta.tsv', ''
    ) + 'SNL' + str(
        snapshot_limit) + "_" + retrieval_model + "_By" + snap_chosing_method

    if not os.path.exists(os.path.join(save_folder, model_base_filename)):
        os.mkdir(os.path.join(save_folder, model_base_filename))
    save_folder = os.path.join(save_folder, model_base_filename)

    model_base_filename += '_' + normalize_method
    if train_leave_one_out:
        model_base_filename += '_LoO'
    additional_measures = []
    if calc_ndcg_mrr:
        additional_measures = ['NDCG@1', 'NDCG@3', 'MRR', 'nMRR']
    model_summary_df = pd.DataFrame(
        columns=['FeatureGroup', 'Map', 'P@5', 'P@10'] + additional_measures)
    next_idx = 0
    per_q_res_dict = {}
    feat_group_list_str = ""
    # for optional_c in optional_c_list:
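    # One pass per feature group: run the cross-validation, write the TREC
    # run file, and keep per-query results for the significance tests below.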
    for curr_feat_group in optional_feat_groups_list:
        snap_limit = None
        feat_group = curr_feat_group
        test_res_df, tmp_params_df = run_cv_for_config(
            base_feature_filename=base_feature_filename,
            snapshot_limit=snapshot_limit,
            feature_groupname=feat_group,
            retrieval_model=retrieval_model,
            normalize_method=normalize_method,
            qrel_filepath=qrel_filepath,
            snap_chosing_method=snap_chosing_method,
            train_leave_one_out=train_leave_one_out,
            snap_calc_limit=snap_limit)

        tmp_params_df['FeatGroup'] = feat_group
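        # First iteration only: also score the plain retrieval-model run as
        # the baseline row of the summary.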
        if next_idx == 0:
            curr_res_df = get_trec_prepared_df_form_res_df(
                scored_docs_df=test_res_df,
                score_colname=retrieval_model + 'Score')
            insert_row = [retrieval_model]
            curr_file_name = model_base_filename + '_' + retrieval_model + '.txt'
            with open(os.path.join(save_folder, curr_file_name), 'w') as f:
                f.write(convert_df_to_trec(curr_res_df))

            res_dict = get_ranking_effectiveness_for_res_file_per_query(
                file_path=save_folder,
                filename=curr_file_name,
                qrel_filepath=qrel_filepath,
                calc_ndcg_mrr=calc_ndcg_mrr)
            for measure in ['Map', 'P_5', 'P_10'] + additional_measures:
                insert_row.append(res_dict['all'][measure])
            per_q_res_dict[retrieval_model] = res_dict
            model_summary_df.loc[next_idx] = insert_row
            next_idx += 1
            params_df = tmp_params_df
        else:
            params_df = params_df.append(tmp_params_df, ignore_index=True)

        curr_res_df = get_trec_prepared_df_form_res_df(
            scored_docs_df=test_res_df, score_colname='ModelScore')
        insert_row = [feat_group.replace('_', '+')]
        curr_file_name = model_base_filename + '_' + feat_group + '.txt'
        with open(os.path.join(save_folder, curr_file_name), 'w') as f:
            f.write(convert_df_to_trec(curr_res_df))

        res_dict = get_ranking_effectiveness_for_res_file_per_query(
            file_path=save_folder,
            filename=curr_file_name,
            qrel_filepath=qrel_filepath,
            calc_ndcg_mrr=calc_ndcg_mrr)
        for measure in ['Map', 'P_5', 'P_10'] + additional_measures:
            insert_row.append(res_dict['all'][measure])
        per_q_res_dict[feat_group.replace('_', '+')] = res_dict
        model_summary_df.loc[next_idx] = insert_row
        next_idx += 1

    significance_df = create_sinificance_df(per_q_res_dict, calc_ndcg_mrr)
    model_summary_df = pd.merge(model_summary_df,
                                significance_df,
                                on=['FeatureGroup'],
                                how='inner')

    model_summary_df.to_csv(os.path.join(
        save_summary_folder,
        model_base_filename + feat_group_list_str + '.tsv'),
                            sep='\t',
                            index=False)
    params_df.to_csv(os.path.join(
        save_summary_folder,
        model_base_filename + feat_group_list_str + '_Params.tsv'),
                     sep='\t',
                     index=False)
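Example #4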
def train_and_test_model_on_config(base_feature_filename,
                                   snapshot_limit,
                                   feature_list,
                                   start_test_q,
                                   end_test_q,
                                   feature_groupname,
                                   normalize_method,
                                   qrel_filepath,
                                   snap_chosing_method=None,
                                   snap_calc_limit=None):
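    # One cross-validation fold: tune tree/leaf counts on the validation
    # queries, retrain on train+validation, and score the held-out test fold.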

    base_res_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/per_snap_lambdamart_res/'
    model_inner_folder = base_feature_filename.replace(
        'All_features_with_meta.tsv', '') + 'SNL' + str(snapshot_limit)
    feature_folder = feature_groupname
    # if normalize_relevance == True:
    feature_folder += '_' + normalize_method
    fold_folder = str(start_test_q) + '_' + str(end_test_q) + "_" + str(
        snap_chosing_method)

    for hierarchy_folder in [model_inner_folder, feature_folder, fold_folder]:
        base_res_folder = os.path.join(base_res_folder, hierarchy_folder)
        if not os.path.exists(base_res_folder):
            os.mkdir(base_res_folder)

    best_snap_num = snap_calc_limit

    feat_df, new_feat_list = prepare_svmr_model_data_per_snap(
        base_feature_filename=base_feature_filename,
        snapshot_limit=int(snapshot_limit),
        feature_list=feature_list,
        normalize_method=normalize_method,
        lambdamart=True)

    print("Model Data Prepared...")
    sys.stdout.flush()
    train_df, test_df, valid_df, seed = split_to_train_test(
        start_test_q=start_test_q,
        end_test_q=end_test_q,
        feat_df=feat_df,
        base_feature_filename=base_feature_filename)

    valid_df_cp = valid_df.copy()
    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(train_df,
                                             feature_list=new_feat_list))

    with open(os.path.join(base_res_folder, 'valid.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(valid_df,
                                             feature_list=new_feat_list))

    num_tree_optional_list = [250, 500]
    num_leaf_optional_list = [3, 2, 5]
    best_map = 0.0

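    # Grid-search tree and leaf counts on the validation queries; the pair
    # with the best validation MAP is kept for the final model.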
    for tree_num in num_tree_optional_list:
        for leaf_num in num_leaf_optional_list:
            print("Running validation tree num: " +
                  str(tree_num)) + " leaf num: " + str(leaf_num)
            model_filename = learn_lambdamart_model(
                train_file=os.path.join(base_res_folder, 'train.dat'),
                models_folder=base_res_folder,
                tree_num=tree_num,
                leaf_num=leaf_num)

            predictions_filename = run_lambdamart_model(
                test_file=os.path.join(base_res_folder, 'valid.dat'),
                model_file=model_filename,
                predictions_folder=base_res_folder)

            predictions = get_predictions_list(predictions_filename)

            valid_df['ModelScore'] = predictions
            valid_df['ModelScore'] = valid_df['ModelScore'].apply(
                lambda x: float(x))
            curr_res_df = get_trec_prepared_df_form_res_df(
                scored_docs_df=valid_df, score_colname='ModelScore')
            curr_file_name = 'Curr_valid_res.txt'
            with open(os.path.join(base_res_folder, curr_file_name), 'w') as f:
                f.write(convert_df_to_trec(curr_res_df))

            res_dict = get_ranking_effectiveness_for_res_file(
                file_path=base_res_folder,
                filename=curr_file_name,
                qrel_filepath=qrel_filepath)

            if float(res_dict['Map']) > best_map:
                best_map = float(res_dict['Map'])
                best_tree_num = tree_num
                best_leaf_num = leaf_num

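    # Hyperparameters are fixed, so fold the validation queries back into
    # the training set for the final fit.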
    train_df = train_df.append(valid_df_cp, ignore_index=True)
    train_df.sort_values('QueryNum', inplace=True)

    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(train_df,
                                             feature_list=new_feat_list))

    best_params_str = 'SnapLim: ' + str(
        best_snap_num) + '\n' + "TreeNum: " + str(
            best_tree_num) + '\n' + "LeafNum: " + str(best_leaf_num)
    with open(os.path.join(base_res_folder, 'hyper_params.txt'), 'w') as f:
        f.write(best_params_str)

    with open(os.path.join(base_res_folder, 'test.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(test_df,
                                             feature_list=new_feat_list))

    print("Strating Train : " + model_inner_folder + ' ' + feature_folder +
          ' ' + fold_folder)
    sys.stdout.flush()
    model_filename = learn_lambdamart_model(train_file=os.path.join(
        base_res_folder, 'train.dat'),
                                            models_folder=base_res_folder,
                                            tree_num=best_tree_num,
                                            leaf_num=best_leaf_num)

    print("Strating Test : " + model_inner_folder + ' ' + feature_folder +
          ' ' + fold_folder)
    sys.stdout.flush()

    predictions_filename = run_lambdamart_model(
        test_file=os.path.join(base_res_folder, 'test.dat'),
        model_file=model_filename,
        predictions_folder=base_res_folder)

    predictions = get_predictions_list(predictions_filename)

    test_df['ModelScore'] = predictions
    test_df['ModelScore'] = test_df['ModelScore'].apply(lambda x: float(x))

    params_list = [best_tree_num, best_leaf_num]
    hyper_params = ['Tree', 'Leaf']
    if best_snap_num is not None:
        hyper_params.append('SnapLimit')
        params_list.append(best_snap_num)
    params_df = pd.DataFrame(columns=['Fold'] + hyper_params)
    params_df.loc[0] = [str(start_test_q) + '_' + str(end_test_q)
                        ] + params_list

    return test_df, params_df
Example #5
def learn_best_num_of_snaps(base_feature_filename, snapshot_limit,
                            feature_list, start_test_q, end_test_q,
                            base_res_folder, qrel_filepath,
                            normalize_relevance, snap_chosing_method):
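    # Sweep the candidate snapshot limits, retraining once per limit, and
    # return the limit that achieves the best validation MAP.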

    if snap_chosing_method == 'SnapNum':
        optional_snap_limit = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 'All']
    elif snap_chosing_method == 'Months':
        optional_snap_limit = [
            '2M', '3M', '5M', '6M', '7M', '8M', '9M', '10M', '1Y', '1.5Y',
            'All'
        ]
    else:
        raise Exception('learn_best_num_of_snaps: Unknown snap_chosing_method')

    tree_num = 250
    leaf_num = 5
    best_snap_lim = None
    best_map = 0.0
    seed = None
    for snap_lim in optional_snap_limit:
        print("Running validation snap limit: " + str(snap_lim))
        sys.stdout.flush()
        feat_df = prepare_svmr_model_data(
            base_feature_filename=base_feature_filename,
            snapshot_limit=int(snapshot_limit),
            feature_list=feature_list,
            normalize_relvance=normalize_relevance,
            limited_snaps_num=snap_lim,
            lambdamart=True)

        train_df, test_df, valid_df, seed = split_to_train_test(
            start_test_q=start_test_q,
            end_test_q=end_test_q,
            feat_df=feat_df,
            seed=seed)

        with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
            f.write(
                turn_df_to_feature_str_for_model(train_df,
                                                 feature_list=feature_list))

        with open(os.path.join(base_res_folder, 'valid.dat'), 'w') as f:
            f.write(
                turn_df_to_feature_str_for_model(valid_df,
                                                 feature_list=feature_list))

        model_filename = learn_lambdamart_model(train_file=os.path.join(
            base_res_folder, 'train.dat'),
                                                models_folder=base_res_folder,
                                                tree_num=tree_num,
                                                leaf_num=leaf_num)

        predictions_filename = run_lambdamart_model(
            test_file=os.path.join(base_res_folder, 'valid.dat'),
            model_file=model_filename,
            predictions_folder=base_res_folder)

        predictions = get_predictions_list(predictions_filename)

        valid_df['ModelScore'] = predictions
        valid_df['ModelScore'] = valid_df['ModelScore'].apply(
            lambda x: float(x))
        curr_res_df = get_trec_prepared_df_form_res_df(
            scored_docs_df=valid_df, score_colname='ModelScore')
        curr_file_name = 'Curr_valid_res.txt'
        with open(os.path.join(base_res_folder, curr_file_name), 'w') as f:
            f.write(convert_df_to_trec(curr_res_df))

        res_dict = get_ranking_effectiveness_for_res_file(
            file_path=base_res_folder,
            filename=curr_file_name,
            qrel_filepath=qrel_filepath)

        if float(res_dict['Map']) > best_map:
            best_map = float(res_dict['Map'])
            best_snap_lim = snap_lim

    return best_snap_lim
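Example #6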
def train_and_test_model_on_config(base_feature_filename,
                                   feature_list,
                                   feature_groupname,
                                   normalize_method,
                                   qrel_filepath,
                                   snap_calc_limit=None):
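    # Variant that trains a final model on (almost) all queries: the nominal
    # test split is folded straight back into training, and no test-fold
    # scoring is performed.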

    base_res_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/lambdamart_res/trained_models/'

    model_inner_folder = base_feature_filename.replace('All_features_',
                                                       '').replace(
                                                           'with_meta.tsv', '')
    feature_folder = feature_groupname.replace('XXSnap', 'XS')
    feature_folder += '_' + normalize_method

    for hierarchy_folder in [model_inner_folder, feature_folder]:
        base_res_folder = os.path.join(base_res_folder, hierarchy_folder)
        if not os.path.exists(base_res_folder):
            os.mkdir(base_res_folder)

    best_snap_num = snap_calc_limit

    feat_df = prepare_svmr_model_data(
        base_feature_filename=base_feature_filename,
        snapshot_limit=int(1),
        feature_list=feature_list,
        normalize_method=normalize_method,
        limited_snaps_num=best_snap_num,
        lambdamart=True)

    print("Model Data Prepared...")
    sys.stdout.flush()
    train_df, test_df, valid_df, seed = split_to_train_test(
        start_test_q=2,
        end_test_q=2,
        feat_df=feat_df,
        base_feature_filename=base_feature_filename)

    train_df = train_df.append(test_df, ignore_index=True)
    train_df.sort_values('QueryNum', inplace=True)

    valid_df_cp = valid_df.copy()
    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(train_df,
                                             feature_list=feature_list))

    with open(os.path.join(base_res_folder, 'valid.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(valid_df,
                                             feature_list=feature_list))

    num_tree_optional_list = [250, 500]
    num_leaf_optional_list = [3, 2, 5]
    best_map = 0.0

    for tree_num in num_tree_optional_list:
        for leaf_num in num_leaf_optional_list:
            print("Running validation tree num: " +
                  str(tree_num)) + " leaf num: " + str(leaf_num)
            model_filename = learn_lambdamart_model(
                train_file=os.path.join(base_res_folder, 'train.dat'),
                models_folder=base_res_folder,
                tree_num=tree_num,
                leaf_num=leaf_num)

            predictions_filename = run_lambdamart_model(
                test_file=os.path.join(base_res_folder, 'valid.dat'),
                model_file=model_filename,
                predictions_folder=base_res_folder)

            predictions = get_predictions_list(predictions_filename)

            valid_df['ModelScore'] = predictions
            valid_df['ModelScore'] = valid_df['ModelScore'].apply(
                lambda x: float(x))
            curr_res_df = get_trec_prepared_df_form_res_df(
                scored_docs_df=valid_df, score_colname='ModelScore')
            curr_file_name = 'Curr_valid_res.txt'
            with open(os.path.join(base_res_folder, curr_file_name), 'w') as f:
                f.write(convert_df_to_trec(curr_res_df))

            res_dict = calc_ndcg_at_x_for_file(file_path=base_res_folder,
                                               filename=curr_file_name,
                                               qrel_filepath=qrel_filepath)

            if float(res_dict['NDCG@X']) > best_map:
                best_map = float(res_dict['NDCG@X'])
                best_tree_num = tree_num
                best_leaf_num = leaf_num

    new_feature_list = feature_list[:]

    train_df = train_df.append(valid_df_cp, ignore_index=True)
    train_df.sort_values('QueryNum', inplace=True)

    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(
            turn_df_to_feature_str_for_model(train_df,
                                             feature_list=new_feature_list))

    best_params_str = 'SnapLim: ' + str(
        best_snap_num) + '\n' + "TreeNum: " + str(
            best_tree_num) + '\n' + "LeafNum: " + str(best_leaf_num)
    with open(os.path.join(base_res_folder, 'hyper_params.txt'), 'w') as f:
        f.write(best_params_str)

    print("Strating Train : " + model_inner_folder + ' ' + feature_folder)
    sys.stdout.flush()

    model_filename = learn_lambdamart_model(train_file=os.path.join(
        base_res_folder, 'train.dat'),
                                            models_folder=base_res_folder,
                                            tree_num=best_tree_num,
                                            leaf_num=best_leaf_num)
Example #7
def run_grid_search_over_params_for_config(
        base_feature_filename,
        snapshot_limit,
        retrieval_model,
        normalize_method,
        snap_chosing_method,
        train_leave_one_out,
        feat_group_list,
        calc_ndcg_mrr,
        backward_elimination,
        snap_num_as_hyper_param,
        snap_choosing_config,
        is_new_server,
        with_bert_as_feature,
        limited_features_list = None,
        feature_for_ablation = None):
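    # Extended grid search: repeats the CV sweep over five trials and encodes
    # every enabled option (LoO, BElim, SnapLim, Bert, ablation, limited
    # feature list) into the output filenames.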

    # optional_c_list = [0.2, 0.1, 0.01, 0.001]
    ## num 1
    # optional_feat_groups_list = ['All','Static','MG','LG','M','RMG','Static_LG','Static_MG'
    #                                 ,'Static_M', 'Static_RMG']
    ## num 2
    # optional_feat_groups_list = ['Static','MGXXSnap', 'MXXSnap','RMGXXSnap','Static_MGXXSnap'
    #                                     ,'Static_MXXSnap', 'Static_RMGXXSnap','MGXXSnap_MXXSnap_RMGXXSnap']
    ## num 3
    if feat_group_list is None:
        optional_feat_groups_list = ['Static',
                                     # 'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap_MGXXSnap_RMGXXSnap',
                                     # 'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap',
                                     # 'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap_MGXXSnap',
                                     # 'Static_MGXXSnap',
                                     # 'Static_RMGXXSnap'
                                     ]
    else:
        optional_feat_groups_list = feat_group_list
    save_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/lambdamart_res/ret_res/'
    save_summary_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/lambdamart_res/'
    if is_new_server:
        save_folder = '/lv_local/home/zivvasilisky/ziv/results/lambdamart_res/ret_res/'
        save_summary_folder = '/lv_local/home/zivvasilisky/ziv/results/lambdamart_res/'
    if '2008' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels.adhoc"
    elif 'ASRC' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents.rel"
    elif 'BOT' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents_fixed.relevance"
    elif 'HERD_CONTROL' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/control.rel"
    elif 'UNITED' in base_feature_filename:
        qrel_filepath = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/united.rel'
    elif 'COMP2020' in base_feature_filename:
        qrel_filepath = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/curr_comp.rel'
    else:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels_cw12.adhoc"

    if snap_chosing_method == 'Months':
        snap_limit_options = [
            # '3M', '6M', '9M', '1Y', '1.5Y',
            snap_choosing_config]
    elif snap_chosing_method == 'SnapNum':
        snap_limit_options = [
            # 3, 5, 7, 10, 15,
            snap_choosing_config]
    else:
        raise Exception("Unknown snap_chosing_method!")

    model_base_filename = base_feature_filename.replace('All_features_with_meta.tsv', '') + 'SNL' + str(
        snapshot_limit) + "_" + retrieval_model + "_By" + snap_chosing_method + '_' + str(snap_choosing_config)

    retrieval_model_addition = ""
    if train_leave_one_out:
        model_base_filename += '_LoO'
        retrieval_model_addition += '_LoO'
    if backward_elimination:
        model_base_filename += '_BElim'
        retrieval_model_addition += '_BElim'
    if snap_num_as_hyper_param:
        model_base_filename += '_SnapLim'
        retrieval_model_addition += '_SnapLim'
    if with_bert_as_feature:
        model_base_filename += '_Bert'
        retrieval_model_addition += '_Bert'
    if feature_for_ablation is not None:
        model_base_filename += '_Ablation'
        retrieval_model_addition += '_' + feature_for_ablation
    if limited_features_list is not None:
        model_base_filename += create_feature_list_shortcut_string(limited_features_list)
        retrieval_model_addition += create_feature_list_shortcut_string(limited_features_list)

    if not os.path.exists(os.path.join(save_folder, model_base_filename)):
        os.mkdir(os.path.join(save_folder, model_base_filename))
    save_folder = os.path.join(save_folder, model_base_filename)

    model_base_filename += '_' + normalize_method
    additional_measures = []
    if calc_ndcg_mrr:
        additional_measures = ['NDCG@1', 'NDCG@3', 'MRR', 'nMRR']
    model_summary_df = pd.DataFrame(
        columns=['FeatureGroup', 'Map', 'P@5', 'P@10'] + additional_measures)
    next_idx = 0
    per_q_res_dict = {}
    feat_group_list_str = ""
    params_df = pd.DataFrame({})
    # for optional_c in optional_c_list:
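    # Five trials: trial_num is threaded down to the train/validation split
    # so each repetition uses a different partition.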
    for trial_num in range(1,6):
        for curr_feat_group in optional_feat_groups_list:
            feat_group_list_str += "__" + curr_feat_group.replace('XXSnap', '')
            if 'XXSnap' in curr_feat_group:
                snap_limit_list = snap_limit_options
            else:
                snap_limit_list = [snap_choosing_config]
            for snap_limit in snap_limit_list:
                if snap_limit is None:
                    feat_group = curr_feat_group
                else:
                    feat_group = curr_feat_group + "_" + str(snap_limit)

                test_res_df, tmp_params_df = run_cv_for_config(
                    base_feature_filename=base_feature_filename,
                    snapshot_limit=snapshot_limit,
                    feature_groupname=feat_group,
                    retrieval_model=retrieval_model + retrieval_model_addition,
                    normalize_method=normalize_method,
                    qrel_filepath=qrel_filepath,
                    snap_chosing_method=snap_chosing_method,
                    train_leave_one_out=train_leave_one_out,
                    snap_calc_limit=snap_limit,
                    backward_elimination=backward_elimination,
                    snap_num_as_hyper_param=snap_num_as_hyper_param,
                    is_new_server=is_new_server,
                    with_bert_as_feature=with_bert_as_feature,
                    feature_for_ablation=feature_for_ablation,
                    limited_features_list=limited_features_list,
                    trial_num=trial_num)

                tmp_params_df['FeatGroup'] = feat_group
                if 'XXSnap' in feat_group:
                    feat_group = feat_group.replace('XXSnap','') + 'By' + snap_chosing_method

                # if next_idx == 0 and feature_for_ablation is None:
                #     curr_res_df = get_trec_prepared_df_form_res_df(
                #         scored_docs_df=test_res_df,
                #         score_colname=retrieval_model + 'Score')
                #     insert_row = [retrieval_model]
                #     curr_file_name = model_base_filename + '_' + retrieval_model + '_' + str(trial_num) + '.txt'
                #     with open(os.path.join(save_folder, curr_file_name), 'w') as f:
                #         f.write(convert_df_to_trec(curr_res_df))
                #
                #     res_dict = get_ranking_effectiveness_for_res_file_per_query(
                #         file_path=save_folder,
                #         filename=curr_file_name,
                #         qrel_filepath=qrel_filepath,
                #         calc_ndcg_mrr=calc_ndcg_mrr)
                #     for measure in ['Map', 'P_5', 'P_10']+additional_measures:
                #         insert_row.append(res_dict['all'][measure])
                #     per_q_res_dict[retrieval_model] = res_dict
                #     model_summary_df.loc[next_idx] = insert_row
                #     next_idx += 1
                #     params_df = tmp_params_df
                # else:
                #     params_df = params_df.append(tmp_params_df, ignore_index=True)

                curr_res_df = get_trec_prepared_df_form_res_df(
                    scored_docs_df=test_res_df,
                    score_colname='ModelScore')
                insert_row = [feat_group.replace('_', '+')]
                curr_file_name = model_base_filename + '_' + feat_group + '_' + str(trial_num) + '.txt'
                if feature_for_ablation is not None:
                    curr_file_name = curr_file_name.replace('_Ablation', '_Abla_' + feature_for_ablation)
                with open(os.path.join(save_folder, curr_file_name), 'w') as f:
                    f.write(convert_df_to_trec(curr_res_df))

                res_dict = get_ranking_effectiveness_for_res_file_per_query(
                    file_path=save_folder,
                    filename=curr_file_name,
                    qrel_filepath=qrel_filepath,
                    calc_ndcg_mrr=calc_ndcg_mrr)
                for measure in ['Map', 'P_5', 'P_10'] + additional_measures:
                    insert_row.append(res_dict['all'][measure])
                per_q_res_dict[feat_group.replace('_', '+')] = res_dict
                model_summary_df.loc[next_idx] = insert_row
                next_idx += 1
Example #8
def train_and_test_model_on_config(
        base_feature_filename,
        snapshot_limit,
        feature_list,
        start_test_q,
        end_test_q,
        feature_groupname,
        normalize_method,
        qrel_filepath,
        snap_chosing_method=None,
        snap_calc_limit=None,
        backward_elimination=False,
        snap_num_as_hyper_param=False,
        is_new_server=False,
        trial_num=0):
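    # One CV fold with extra knobs: optional backward feature elimination and
    # an optional search over the snapshot limit, then a final train on
    # train+validation and scoring of the held-out test fold.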

    base_res_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/lambdamart_res/'
    if is_new_server:
        base_res_folder = '/lv_local/home/zivvasilisky/ziv/results/lambdamart_res/'

    model_inner_folder = base_feature_filename.replace('All_features_', '').replace('with_meta.tsv', '') + 'SNL' + str(snapshot_limit)
    feature_folder = feature_groupname.replace('XXSnap', 'XS')
    # if normalize_relevance == True:
    feature_folder += '_' + normalize_method
    fold_folder = str(start_test_q) + '_' + str(end_test_q) #+ "_" + str(snap_chosing_method)

    for hierarchy_folder in [model_inner_folder, feature_folder, fold_folder]:
        base_res_folder = os.path.join(base_res_folder, hierarchy_folder)
        if not os.path.exists(base_res_folder):
            os.mkdir(base_res_folder)


    best_snap_num = snap_calc_limit

    feat_df = prepare_svmr_model_data(
        base_feature_filename=base_feature_filename,
        snapshot_limit=int(snapshot_limit),
        feature_list=feature_list,
        normalize_method=normalize_method,
        limited_snaps_num=best_snap_num,
        lambdamart=True)

    print("Model Data Prepared...")
    sys.stdout.flush()
    train_df, test_df, valid_df, seed = split_to_train_test(
        start_test_q=start_test_q,
        end_test_q=end_test_q,
        feat_df=feat_df,
        base_feature_filename=base_feature_filename,
        trial_num=trial_num)


    valid_df_cp = valid_df.copy()
    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(train_df, feature_list=feature_list))

    with open(os.path.join(base_res_folder, 'valid.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(valid_df, feature_list=feature_list))

    num_tree_optional_list = [250, 500]
    num_leaf_optional_list = [3, 2, 5]
    best_map = 0.0

    for tree_num in num_tree_optional_list:
        for leaf_num in num_leaf_optional_list:
            print("Running validation tree num: " + str(tree_num)) + " leaf num: " + str(leaf_num)
            model_filename = learn_lambdamart_model(
                train_file=os.path.join(base_res_folder, 'train.dat'),
                models_folder=base_res_folder,
                tree_num=tree_num,
                leaf_num=leaf_num)

            predictions_filename = run_lambdamart_model(
                test_file=os.path.join(base_res_folder, 'valid.dat'),
                model_file=model_filename,
                predictions_folder=base_res_folder)

            predictions = get_predictions_list(predictions_filename)

            valid_df['ModelScore'] = predictions
            valid_df['ModelScore'] = valid_df['ModelScore'].apply(lambda x: float(x))
            curr_res_df = get_trec_prepared_df_form_res_df(
                scored_docs_df=valid_df,
                score_colname='ModelScore')
            curr_file_name = 'Curr_valid_res.txt'
            with open(os.path.join(base_res_folder, curr_file_name), 'w') as f:
                f.write(convert_df_to_trec(curr_res_df))

            res_dict = calc_ndcg_at_x_for_file(
                file_path=base_res_folder,
                filename=curr_file_name,
                qrel_filepath=qrel_filepath)

            if float(res_dict['NDCG@X']) > best_map:
                best_map = float(res_dict['NDCG@X'])
                best_tree_num = tree_num
                best_leaf_num = leaf_num

    if backward_elimination:
        new_feature_list = run_backward_elimination(
            base_res_folder=base_res_folder,
            train_df=train_df,
            valid_df=valid_df,
            feature_list=feature_list,
            tree_num=best_tree_num,
            leaf_num=best_leaf_num,
            qrel_filepath=qrel_filepath,
            curr_map_score=best_map)
    else:
        new_feature_list = feature_list[:]

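    # Optionally treat the snapshot limit as a hyperparameter: rebuild the
    # data for each candidate limit and keep the limit that beats the current
    # best validation score.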
    if snap_num_as_hyper_param and ('XXSnap' in feature_groupname):
        round_num = int(base_feature_filename.split('Round')[1].split('_')[0])
        optional_snap_limit = list(range(2, round_num))
        if len(optional_snap_limit) <= 1:
            best_snap_num = snap_calc_limit
        else:
            optional_snap_limit[-1] = 'All'
            optional_snap_limit = list(reversed(optional_snap_limit))
            curr_map_score = best_map
            tree_num = best_tree_num
            leaf_num = best_leaf_num
            for snap_lim in optional_snap_limit:
                print("Optimizing snap limit: " + str(snap_lim))
                sys.stdout.flush()
                feat_df = prepare_svmr_model_data(
                    base_feature_filename=base_feature_filename,
                    snapshot_limit=int(snapshot_limit),
                    feature_list=new_feature_list,
                    normalize_method=normalize_method,
                    limited_snaps_num=snap_lim,
                    lambdamart=True)

                train_df, test_df, valid_df, seed = split_to_train_test(
                    start_test_q=start_test_q,
                    end_test_q=end_test_q,
                    feat_df=feat_df,
                    base_feature_filename=base_feature_filename,
                    seed=seed)

                res_dict = get_result_for_feature_set(
                    base_res_folder=base_res_folder,
                    train_df=train_df,
                    valid_df=valid_df,
                    curr_feature_list=new_feature_list,
                    tree_num=tree_num,
                    leaf_num=leaf_num,
                    qrel_filepath=qrel_filepath)

                if float(res_dict['NDCG@X']) > curr_map_score:
                    curr_map_score = float(res_dict['NDCG@X'])
                    best_snap_num = snap_lim

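    # Tuning done; fold the validation queries back into the training set
    # for the final model.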
    train_df = train_df.append(valid_df_cp, ignore_index=True)
    train_df.sort_values('QueryNum', inplace=True)

    with open(os.path.join(base_res_folder, 'train.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(train_df, feature_list=new_feature_list))

    best_params_str = 'SnapLim: ' + str(best_snap_num) + '\n' + "TreeNum: " + str(best_tree_num) + '\n' + "LeafNum: " + str(best_leaf_num)
    with open(os.path.join(base_res_folder, 'hyper_params.txt'), 'w') as f:
        f.write(best_params_str)

    with open(os.path.join(base_res_folder, 'test.dat'), 'w') as f:
        f.write(turn_df_to_feature_str_for_model(test_df, feature_list=new_feature_list))

    print("Strating Train : " + model_inner_folder + ' ' + feature_folder + ' ' + fold_folder)
    sys.stdout.flush()
    model_filename = learn_lambdamart_model(
        train_file=os.path.join(base_res_folder, 'train.dat'),
        models_folder=base_res_folder,
        tree_num=best_tree_num,
        leaf_num=best_leaf_num)

    print("Strating Test : " + model_inner_folder + ' ' + feature_folder + ' ' + fold_folder)
    sys.stdout.flush()

    predictions_filename = run_lambdamart_model(
        test_file=os.path.join(base_res_folder, 'test.dat'),
        model_file=model_filename,
        predictions_folder=base_res_folder)

    predictions = get_predictions_list(predictions_filename)

    test_df['ModelScore'] = predictions
    test_df['ModelScore'] = test_df['ModelScore'].apply(lambda x: float(x))

    params_list = [best_tree_num, best_leaf_num]
    hyper_params = ['Tree', 'Leaf']
    if best_snap_num is not None:
        hyper_params.append('SnapLimit')
        params_list.append(best_snap_num)
    params_df = pd.DataFrame(columns=['Fold'] + hyper_params)
    params_df.loc[0] = [str(start_test_q) + '_' + str(end_test_q)] + params_list

    return test_df, params_df
Example #9
def run_grid_search_over_params_for_config(
        base_feature_filename, snapshot_limit, retrieval_model,
        normalize_method, snap_chosing_method, train_leave_one_out,
        feat_group_list, calc_ndcg_mrr, backward_elimination,
        snap_num_as_hyper_param, snap_choosing_config, is_new_server,
        with_bert_as_feature, limited_features_list):

    if feat_group_list is None:
        optional_feat_groups_list = [
            'Static',
            # 'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap_MGXXSnap_RMGXXSnap',
            'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap',
            'Static_MXXSnap_STDXXSnap_MinXXSnap_MaxXXSnap_MGXXSnap',
            'Static_MGXXSnap',
            # 'Static_RMGXXSnap'
        ]
    else:
        optional_feat_groups_list = feat_group_list
    save_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/rank_svm_res/ret_res/'
    save_summary_folder = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/rank_svm_res/'
    if is_new_server:
        save_folder = '/lv_local/home/zivvasilisky/ziv/results/rank_svm_res/ret_res/'
        save_summary_folder = '/lv_local/home/zivvasilisky/ziv/results/rank_svm_res/'

    if '2008' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels.adhoc"
    elif 'ASRC' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents.rel"
    elif 'BOT' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/documents_fixed.relevance"
    elif 'HERD_CONTROL' in base_feature_filename:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/control.rel"
    elif 'UNITED' in base_feature_filename:
        qrel_filepath = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/united.rel'
    elif 'COMP2020' in base_feature_filename:
        qrel_filepath = '/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/curr_comp.rel'
    else:
        qrel_filepath = "/mnt/bi-strg3/v/zivvasilisky/ziv/results/qrels/qrels_cw12.adhoc"

    if snap_chosing_method == 'Months':
        snap_limit_options = [snap_choosing_config]
    else:
        raise Exception("Unknown snap_chosing_method!")
    model_base_filename = base_feature_filename.replace(
        'All_features_with_meta.tsv', ''
    ) + 'SNL' + str(
        snapshot_limit
    ) + "_" + retrieval_model + "_By" + snap_chosing_method + "_" + snap_choosing_config

    retrieval_model_addition = ""
    if train_leave_one_out:
        model_base_filename += '_LoO'
        retrieval_model_addition += '_LoO'
    if backward_elimination:
        model_base_filename += '_BElim'
        retrieval_model_addition += '_BElim'
    if snap_num_as_hyper_param:
        model_base_filename += '_SnapLim'
        retrieval_model_addition += '_SnapLim'
    if with_bert_as_feature:
        model_base_filename += '_Bert'
        retrieval_model_addition += '_Bert'
    if limited_features_list is not None:
        model_base_filename += create_feature_list_shortcut_string(
            limited_features_list)
        retrieval_model_addition += create_feature_list_shortcut_string(
            limited_features_list)

    if not os.path.exists(os.path.join(save_folder, model_base_filename)):
        os.mkdir(os.path.join(save_folder, model_base_filename))
    save_folder = os.path.join(save_folder, model_base_filename)

    model_base_filename += '_' + normalize_method

    additional_measures = []
    if calc_ndcg_mrr:
        additional_measures = ['NDCG@1', 'NDCG@3', 'MRR', 'nMRR']
    model_summary_df = pd.DataFrame(
        columns=['FeatureGroup', 'Map', 'P@5', 'P@10'] + additional_measures)
    next_idx = 0
    per_q_res_dict = {}
    feat_group_list_str = ""
    for curr_feat_group in optional_feat_groups_list:
        feat_group_list_str += "__" + curr_feat_group.replace('XXSnap', '')
        if 'XXSnap' in curr_feat_group:
            snap_limit_list = snap_limit_options
        else:
            snap_limit_list = [snap_choosing_config]
        for snap_limit in snap_limit_list:
            if snap_limit is None:
                feat_group = curr_feat_group
            else:
                feat_group = curr_feat_group + "_" + str(snap_limit)

            test_res_df, tmp_params_df = run_cv_for_config(
                base_feature_filename=base_feature_filename,
                snapshot_limit=snapshot_limit,
                feature_groupname=feat_group,
                retrieval_model=retrieval_model + retrieval_model_addition,
                normalize_method=normalize_method,
                qrel_filepath=qrel_filepath,
                snap_chosing_method=snap_chosing_method,
                train_leave_one_out=train_leave_one_out,
                snap_calc_limit=snap_limit,
                backward_elimination=backward_elimination,
                snap_num_as_hyper_param=snap_num_as_hyper_param,
                is_new_server=is_new_server,
                with_bert_as_feature=with_bert_as_feature,
                limited_features_list=limited_features_list)

            tmp_params_df['FeatGroup'] = feat_group
            if 'XXSnap' in feat_group:
                feat_group = feat_group.replace(
                    'XXSnap', '') + 'By' + snap_chosing_method

            if next_idx == 0:
                curr_res_df = get_trec_prepared_df_form_res_df(
                    scored_docs_df=test_res_df,
                    score_colname=retrieval_model + 'Score')
                insert_row = [retrieval_model]
                curr_file_name = model_base_filename + '_' + retrieval_model + '.txt'
                with open(os.path.join(save_folder, curr_file_name), 'w') as f:
                    f.write(convert_df_to_trec(curr_res_df))

                res_dict = get_ranking_effectiveness_for_res_file_per_query(
                    file_path=save_folder,
                    filename=curr_file_name,
                    qrel_filepath=qrel_filepath,
                    calc_ndcg_mrr=calc_ndcg_mrr)
                for measure in ['Map', 'P_5', 'P_10'] + additional_measures:
                    insert_row.append(res_dict['all'][measure])
                per_q_res_dict[retrieval_model] = res_dict
                model_summary_df.loc[next_idx] = insert_row
                next_idx += 1
                params_df = tmp_params_df
            else:
                params_df = params_df.append(tmp_params_df, ignore_index=True)

            curr_res_df = get_trec_prepared_df_form_res_df(
                scored_docs_df=test_res_df, score_colname='ModelScore')
            insert_row = [feat_group.replace('_', '+')]
            curr_file_name = model_base_filename + '_' + feat_group + '.txt'
            with open(os.path.join(save_folder, curr_file_name), 'w') as f:
                f.write(convert_df_to_trec(curr_res_df))

            res_dict = get_ranking_effectiveness_for_res_file_per_query(
                file_path=save_folder,
                filename=curr_file_name,
                qrel_filepath=qrel_filepath,
                calc_ndcg_mrr=calc_ndcg_mrr)
            for measure in ['Map', 'P_5', 'P_10'] + additional_measures:
                insert_row.append(res_dict['all'][measure])
            per_q_res_dict[feat_group.replace('_', '+')] = res_dict
            model_summary_df.loc[next_idx] = insert_row
            next_idx += 1

    significance_df = create_sinificance_df(per_q_res_dict,
                                            calc_ndcg_mrr=calc_ndcg_mrr)
    model_summary_df = pd.merge(model_summary_df,
                                significance_df,
                                on=['FeatureGroup'],
                                how='inner')
    model_summary_df.to_csv(os.path.join(
        save_summary_folder,
        model_base_filename + feat_group_list_str + '.tsv'),
                            sep='\t',
                            index=False)
    params_df.to_csv(os.path.join(
        save_summary_folder,
        model_base_filename + feat_group_list_str + '_Params.tsv'),
                     sep='\t',
                     index=False)