def main():
    """Run ten object-selection rounds (algorithm 'rf') on the r-10-n-6-4 data.

    The features are passed through a 50-cluster ``FeatureAgglomeration``
    before the rounds start. Every round writes its cross-validation results
    to its own CSV file; the AUC and RMSE scores of all rounds are saved once
    the loop has finished.
    """
    # Input locations and the split between true and false object indexes.
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    # Image id, rating and image index are read from the '-'-separated
    # fields of each file name.
    file_names = os.listdir(data_directory)
    name_fields = [name.split('-') for name in file_names]
    img_ids_vector = [int(fields[0]) for fields in name_fields]
    ratings_vector = [int(fields[-2]) for fields in name_fields]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [fields[3].split('.')[0] for fields in name_fields]

    # Only the ratings and users matrices are used below.
    (ratings_matrix, images_indexes_for_id, ids_indexes,
     users_matrix) = load_data(data_directory, booking_file, users_file,
                               rating_thresholds)

    features = get_features(features_path, name_vector)

    # Reduce the feature dimensionality before selection.
    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    # Arguments that are identical for every round.
    shared_arguments = dict(
        ids=img_ids_vector,
        features=features,
        ratings=ratings_vector,
        users_ratings=ratings_matrix,
        users=users_matrix,
        images_indexes=images_indexes,
        true_objects_indexes=true_objects_indexes,
        false_objects_indexes=false_objects_indexes,
        paths=name_vector,
        z_score=False,
    )
    cv_results_prefix = '../results/cv-generated-data-r-10-n-6-4-rf-fa-'

    scores_auc, scores_rmse = [], []
    for run in range(10):
        cv_results_file = cv_results_prefix + str(run) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(cv_results_file=cv_results_file,
                            **shared_arguments)
        # NOTE(review): the results of these two calls are discarded; they
        # are kept in case evaluate() reports its result as a side effect.
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    # One score file per metric.
    for scores, results_file in (
        (scores_auc, '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'),
        (scores_rmse, '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'),
    ):
        save_scores(scores, results_file)
示例#2
0
# Load the rating and user matrices; only ratings_matrix and users_matrix
# are used further down in this fragment.
(ratings_matrix, images_indexes_for_id, ids_indexes,
 users_matrix) = load_data(data_directory, booking_file, users_file,
                           rating_thresholds)

# divide_texts returns the features together with replacement vectors.
(features, new_ratings_vector, new_categories_vector, new_ids_vector,
 new_paths_vector, text_indexes) = divide_texts(name_vector,
                                                ratings_vector,
                                                categories_vector,
                                                ids_vector,
                                                n=10)

# Continue with the ratings and ids returned by divide_texts.
ratings_vector, ids_vector = new_ratings_vector, new_ids_vector

# Arguments shared by every round.
# NOTE(review): new_paths_vector is computed above but `paths` still receives
# the original name_vector — confirm this is intended.
transform_arguments = dict(
    ids=ids_vector,
    features=features,
    ratings=ratings_vector,
    users_ratings=ratings_matrix,
    users=users_matrix,
    images_indexes=text_indexes,
    true_objects_indexes=true_objects_indexes,
    false_objects_indexes=false_objects_indexes,
    paths=name_vector,
    z_score=False,
)
cv_results_prefix = '../results/cv-generated-data-r-10-n-02-random-'

scores_auc, scores_rmse = [], []
for i in range(10):
    # Each round writes its cross-validation results to its own file.
    cv_results_file = cv_results_prefix + str(i) + '.csv'
    selection = ObjectSelection(show_selection_results=False,
                                selection_algorithm='random')
    selection.transform(cv_results_file=cv_results_file,
                        **transform_arguments)
    score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
    scores_auc.append(score_auc)
    scores_rmse.append(score_rmse)
def main():
    """Run the 'random' object selection on the r-10-n-8-2 data and exit.

    The features are passed through a 50-cluster ``FeatureAgglomeration``,
    the selection is run ``n_runs`` times (each run writing its own
    cross-validation CSV), the AUC and RMSE scores are saved, and the
    interpreter is then terminated.

    Raises:
        SystemExit: always, once the scores have been saved.
    """
    # Local import so the function does not rely on the site-provided
    # ``exit`` helper, which is meant for interactive sessions.
    import sys

    # Parameters
    data_directory = '../data/generated-data-r-10-n-8-2/'
    features_path = '../data/features-generated-data-r-10-n-8-2'
    booking_file = '../data/booking.csv'
    users_file = '../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
    false_objects_indexes = [8, 9]
    n_runs = 1

    # Image id, rating and image index are read from the '-'-separated
    # fields of each file name.
    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    # Only the ratings and users matrices are used below.
    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    # Reduce the feature dimensionality before selection.
    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(n_runs):
        cv_results_file = './results/xxp1-cv-generated-data-r-10-n-8-2-random-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='random')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        # NOTE(review): the results of these two calls are discarded; they
        # are kept in case evaluate() reports its result as a side effect.
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-rmse.csv'
    save_scores(scores_rmse, results_file)

    # The experiment ends here; terminate the interpreter like the original
    # script did.
    sys.exit()
示例#4
0
def make_scribblers():
    """Assemble the ordered list of scribblers.

    The returned list holds, in order: the object collections, the veto and
    selection steps for muons, electrons and photons, the jet veto with its
    overlap removal against the three veto collections, the jet selection,
    and finally the dataset info, certified-lumi, cutflow and MetNoX
    scribblers.
    """
    # JSON scribbler. Produces:
    # - inCertifiedLumiSections [bool]
    json_scribbler = in_certified_lumi_sections(
        os.path.join(
            os.getcwd(),
            "data/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt",
        ))

    # MC-only nanoAOD-tools modules, given as (module path, label).
    mc_only_modules = (
        # PUWeight. Produces:
        # - puWeight [float]
        ("postprocessing.modules.common.puWeightProducer", "puWeight"),
        # lepton SF. Produces:
        # - Muon_effSF
        # - Electron_effSF
        ("postprocessing.modules.common.lepSFProducer", "lepSF"),
        # JEC Uncs. Produces:
        # - Jet_jecUncert*
        ("postprocessing.modules.jme.jecUncertainties", "jecUncert"),
        # JetMET Uncs. Produces:
        # - Jet_pt_smeared
        # - MET_{pt,phi}_smeared
        # - Jet_pt_{jer,jes*,unclustEn}{Up,Down}
        # - MET_{pt,phi}_{jer,jes*,unclustEn}{Up,Down}
        ("postprocessing.modules.jme.jetmetUncertainties",
         "jetmetUncertainties"),
        # BTagSF. Produces:
        # - Jet_btagSF
        # - Jet_btagSF_{up,down}
        ("postprocessing.modules.btv.btagSFProducer", "btagSF"),
    )
    # The wrappers are instantiated, but none of them is currently added to
    # the returned list.
    mc_only_scribblers = [
        NanoaodtoolsModuleWrapper(module_path, label, mc_only=True)
        for module_path, label in mc_only_modules
    ]

    # Object vetoes
    jet_veto = {
        "All": (
            'j: j.pt>40.',
            'j: abs(j.eta)<5.',
            'j: j.puId>=1',  # loose
            'j: j.jetId>=1',  # loose
        )
    }
    muon_veto = {
        "All": (
            'u: u.pt>10.',
            'u: abs(u.eta)<2.4',
            'u: abs(u.dxy)<0.118',
            'u: abs(u.dz)<0.882',
        )
    }
    electron_veto = {
        "All": (
            'e: e.pt>10.',
            'e: abs(e.eta)<2.1',
            'e: abs(e.dxy)<0.5',
            'e: abs(e.dz)<1.0',
            'e: e.miniPFRelIso_all<0.1',
            'e: e.lostHits<=1',
            'e: e.cutBased>=1',  # Veto or higher
        )
    }
    photon_veto = {
        "All": (
            'y: y.pt>25.',
            'y: abs(y.eta)<2.5',
            'y: y.cutBased>=1',  # Loose or higher
        )
    }

    # Object selections
    jet_selection = {"All": ('j: abs(j.eta)<2.4', )}
    muon_selection = {
        "All": (
            'u: u.pt>30.',
            'u: abs(u.eta)<2.1',
            'u: u.tightId',
        )
    }
    electron_selection = {
        "All": (
            'e: e.pt>30.',
            'e: abs(e.eta)<2.1',
            'e: e.cutBased>=4',  # Tight or higher
        )
    }
    photon_selection = {
        "All": (
            'y: y.pt>165.',
            'y: abs(y.eta)<1.45',
            'y: y.cutBased>=3',  # Tight or higher
        )
    }

    # Attributes read for every collection.
    kinematics = ['pt', 'eta', 'phi']

    scribblers = [
        Collection("Jet", attrs=kinematics + ["puId", "jetId"]),
        Collection("Muon",
                   attrs=kinematics +
                   ["dxy", "dz", "miniPFRelIso_all", "tightId", "jetIdx"]),
        Collection("Electron",
                   attrs=kinematics + [
                       "dxy", "dz", "miniPFRelIso_all", "lostHits",
                       "cutBased", "jetIdx"
                   ]),
        Collection("Photon", attrs=kinematics + ["cutBased", "jetIdx"]),
        Collection("Tau", attrs=kinematics),
    ]

    # Muons, electrons and photons: the veto collection is built first and
    # the selection collection is then derived from it.
    veto_then_selection = (
        ("Muon", "MuonVeto", "MuonSelection", muon_veto, muon_selection),
        ("Electron", "ElectronVeto", "ElectronSelection", electron_veto,
         electron_selection),
        ("Photon", "PhotonVeto", "PhotonSelection", photon_veto,
         photon_selection),
    )
    for source, veto_name, selection_name, veto_cfg, selection_cfg in (
            veto_then_selection):
        scribblers.append(
            ObjectSelection(in_obj_name=source,
                            out_obj_name=veto_name,
                            path_cfg=veto_cfg))
        scribblers.append(
            ObjectSelection(in_obj_name=veto_name,
                            out_obj_name=selection_name,
                            path_cfg=selection_cfg))

    # Jets: veto, overlap removal against each veto collection, selection.
    scribblers.append(
        ObjectSelection(in_obj_name="Jet",
                        out_obj_name="JetVeto",
                        path_cfg=jet_veto))
    for reference in ("MuonVeto", "ElectronVeto", "PhotonVeto"):
        scribblers.append(
            OverlapRemoval(
                collection_name="JetVeto",
                ref_collection=reference,
            ))
    scribblers.append(
        ObjectSelection(in_obj_name="JetVeto",
                        out_obj_name="JetSelection",
                        path_cfg=jet_selection))

    scribblers.extend([
        DatasetInfo(),
        json_scribbler,
        cutflowId(),
        MetNoX(),
    ])
    return scribblers