def main():
    """Run ten 'rf' object-selection rounds on the r-10 / n-6-4 generated data.

    Image ids, ratings and image indexes are parsed from the '-'-separated
    file names found in the data directory. The features are reduced to 50
    clusters with FeatureAgglomeration before selection. Each round writes
    its cross-validation results to its own CSV file; the AUC and RMSE
    scores collected over all rounds are saved at the end.
    """
    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    # Split every file name once; the individual fields are picked below.
    file_names = os.listdir(data_directory)
    name_parts = [name.split('-') for name in file_names]
    img_ids_vector = [int(parts[0]) for parts in name_parts]
    ratings_vector = [int(parts[-2]) for parts in name_parts]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [parts[3].split('.')[0] for parts in name_parts]

    # Only the ratings matrix and the users matrix are used further on.
    ratings_matrix, _, _, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    # Reduce the feature dimensionality to 50 agglomerated clusters.
    raw_features = get_features(features_path, name_vector)
    agglomeration = FeatureAgglomeration(n_clusters=50)
    agglomeration.fit(raw_features)
    features = agglomeration.transform(raw_features)

    cv_file_prefix = '../results/cv-generated-data-r-10-n-6-4-rf-fa-'
    auc_scores = []
    rmse_scores = []
    for run in range(10):
        cv_results_file = cv_file_prefix + str(run) + '.csv'
        selector = ObjectSelection(show_selection_results=False,
                                   selection_algorithm='rf')
        selector.transform(ids=img_ids_vector,
                           features=features,
                           ratings=ratings_vector,
                           users_ratings=ratings_matrix,
                           users=users_matrix,
                           cv_results_file=cv_results_file,
                           images_indexes=images_indexes,
                           true_objects_indexes=true_objects_indexes,
                           false_objects_indexes=false_objects_indexes,
                           paths=name_vector,
                           z_score=False)
        # The first two evaluations discard their results; they are kept
        # because evaluate() may report the metric itself (not visible here).
        selector.evaluate(evaluation_metric='auc')
        selector.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        auc, rmse = selector.evaluate(evaluation_metric='auc')
        auc_scores.append(auc)
        rmse_scores.append(rmse)

    # NOTE(review): assumed to run once after all rounds -- confirm.
    save_scores(auc_scores,
                '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv')
    save_scores(rmse_scores,
                '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv')
# Fragment: random object selection on divided texts.
# NOTE(review): data_directory, booking_file, users_file, rating_thresholds,
# name_vector, ratings_vector, categories_vector, ids_vector,
# true_objects_indexes and false_objects_indexes are not defined in this
# fragment; they must be provided by code outside it.
ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
    data_directory, booking_file, users_file, rating_thresholds)

# divide_texts returns six values; new_categories_vector and
# new_paths_vector are not used again in this fragment.
features, new_ratings_vector, new_categories_vector, new_ids_vector, new_paths_vector, text_indexes = divide_texts(
    name_vector, ratings_vector, categories_vector, ids_vector, n=10)

# From here on the ratings and ids refer to the divided texts.
ratings_vector = new_ratings_vector
ids_vector = new_ids_vector

scores_auc = []
scores_rmse = []

# Ten rounds with the 'random' selection algorithm, one CV results file each.
for i in range(10):
    cv_results_file = '../results/cv-generated-data-r-10-n-02-random-' + str(
        i) + '.csv'
    selection = ObjectSelection(show_selection_results=False,
                                selection_algorithm='random')
    # NOTE(review): paths receives the original name_vector although
    # divide_texts also returned new_paths_vector -- confirm this is intended.
    selection.transform(ids=ids_vector,
                        features=features,
                        ratings=ratings_vector,
                        users_ratings=ratings_matrix,
                        users=users_matrix,
                        cv_results_file=cv_results_file,
                        images_indexes=text_indexes,
                        true_objects_indexes=true_objects_indexes,
                        false_objects_indexes=false_objects_indexes,
                        paths=name_vector,
                        z_score=False)
    # evaluate() is unpacked into an (AUC, RMSE) pair here.
    score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
    scores_auc.append(score_auc)
    scores_rmse.append(score_rmse)
def main():
    """Run the 'random' object-selection experiment on the r-10 / n-8-2 data.

    Image ids, ratings and image indexes are parsed from the '-'-separated
    file names found in the data directory. The features are reduced to 50
    clusters with FeatureAgglomeration, a single selection round is run, and
    the collected AUC and RMSE scores are saved.

    The function does not return: it ends the interpreter with
    ``sys.exit()`` once the scores have been saved.

    Raises:
        SystemExit: always, after the scores have been saved.
    """
    # Local import keeps this block self-contained; sys.exit() replaces the
    # site-provided exit() helper, which is meant for interactive sessions.
    import sys

    # Parameters
    data_directory = '../data/generated-data-r-10-n-8-2/'
    features_path = '../data/features-generated-data-r-10-n-8-2'
    booking_file = '../data/booking.csv'
    users_file = '../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
    false_objects_indexes = [8, 9]

    # File names encode the id (first field), the rating (second-to-last
    # field) and the image index (fourth field, up to the first '.').
    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    # Only the ratings matrix and the users matrix are used further on.
    ratings_matrix, _, _, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    # Reduce the feature dimensionality to 50 agglomerated clusters.
    features = get_features(features_path, name_vector)
    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(1):
        cv_results_file = ('./results/xxp1-cv-generated-data-r-10-n-8-2-random-'
                           + str(i) + '.csv')
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='random')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        # The first two evaluations discard their results; they are kept
        # because evaluate() may report the metric itself (not visible here).
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-rmse.csv'
    save_scores(scores_rmse, results_file)

    # The script deliberately stops here, as the original did.
    sys.exit()
def make_scribblers():
    """Build the ordered list of scribblers for the event loop.

    The returned list contains, in order: the object collections, the
    veto/selection chain for muons, electrons and photons, the jet veto with
    overlap removal against the vetoed muons, electrons and photons, the jet
    selection, and finally the dataset info, certified-lumi JSON, cutflow id
    and MetNoX scribblers.

    Returns:
        list: the scribbler instances in the order described above.
    """

    def mc_only_module(module_path, module_name):
        # Every nanoAOD-tools wrapper in this function is MC-only.
        return NanoaodtoolsModuleWrapper(module_path, module_name,
                                         mc_only=True)

    # JSON scribbler. Produces:
    #   - inCertifiedLumiSections [bool]
    json_path = os.path.join(
        os.getcwd(),
        "data/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt")
    json_scribbler = in_certified_lumi_sections(json_path)

    # NOTE: the five wrappers below are constructed but none of them is part
    # of the returned list (lepton SF, JEC uncertainties and b-tag SF are
    # explicitly disabled there).

    # PUWeight. Produces:
    #   - puWeight [float]
    puweight_scribbler = mc_only_module(
        "postprocessing.modules.common.puWeightProducer", "puWeight")

    # Lepton SF. Produces:
    #   - Muon_effSF
    #   - Electron_effSF
    leptonsf_scribbler = mc_only_module(
        "postprocessing.modules.common.lepSFProducer", "lepSF")

    # JEC uncertainties. Produces:
    #   - Jet_jecUncert*
    jecunc_scribbler = mc_only_module(
        "postprocessing.modules.jme.jecUncertainties", "jecUncert")

    # JetMET uncertainties. Produces:
    #   - Jet_pt_smeared
    #   - MET_{pt,phi}_smeared
    #   - Jet_pt_{jer,jes*,unclustEn}{Up,Down}
    #   - MET_{pt,phi}_{jer,jes*,unclustEn}{Up,Down}
    jetmetunc_scribbler = mc_only_module(
        "postprocessing.modules.jme.jetmetUncertainties",
        "jetmetUncertainties")

    # BTagSF. Produces:
    #   - Jet_btagSF
    #   - Jet_btagSF_{up,down}
    btagsf_scribbler = mc_only_module(
        "postprocessing.modules.btv.btagSFProducer", "btagSF")

    # Object vetoes
    jet_veto = {'All': (
        'j: j.pt>40.',
        'j: abs(j.eta)<5.',
        'j: j.puId>=1',  # loose
        'j: j.jetId>=1',  # loose
    )}
    muon_veto = {'All': (
        'u: u.pt>10.',
        'u: abs(u.eta)<2.4',
        'u: abs(u.dxy)<0.118',
        'u: abs(u.dz)<0.882',
    )}
    electron_veto = {'All': (
        'e: e.pt>10.',
        'e: abs(e.eta)<2.1',
        'e: abs(e.dxy)<0.5',
        'e: abs(e.dz)<1.0',
        'e: e.miniPFRelIso_all<0.1',
        'e: e.lostHits<=1',
        'e: e.cutBased>=1',  # Veto or higher
    )}
    photon_veto = {'All': (
        'y: y.pt>25.',
        'y: abs(y.eta)<2.5',
        'y: y.cutBased>=1',  # Loose or higher
    )}

    # Object selections
    jet_selection = {'All': ('j: abs(j.eta)<2.4', )}
    muon_selection = {'All': (
        'u: u.pt>30.',
        'u: abs(u.eta)<2.1',
        'u: u.tightId',
    )}
    electron_selection = {'All': (
        'e: e.pt>30.',
        'e: abs(e.eta)<2.1',
        'e: e.cutBased>=4',  # Tight or higher
    )}
    photon_selection = {'All': (
        'y: y.pt>165.',
        'y: abs(y.eta)<1.45',
        'y: y.cutBased>=3',  # Tight or higher
    )}

    obj_attrs = ['pt', 'eta', 'phi']

    # Input collections and the attributes read from each of them.
    scribblers = [
        Collection("Jet", attrs=obj_attrs + ["puId", "jetId"]),
        Collection("Muon",
                   attrs=obj_attrs +
                   ["dxy", "dz", "miniPFRelIso_all", "tightId", "jetIdx"]),
        Collection("Electron",
                   attrs=obj_attrs + [
                       "dxy", "dz", "miniPFRelIso_all", "lostHits",
                       "cutBased", "jetIdx"
                   ]),
        Collection("Photon", attrs=obj_attrs + ["cutBased", "jetIdx"]),
        Collection("Tau", attrs=obj_attrs),
    ]

    # Veto -> selection chain for leptons and photons, then the jet veto.
    selection_chain = (
        ("Muon", "MuonVeto", muon_veto),
        ("MuonVeto", "MuonSelection", muon_selection),
        ("Electron", "ElectronVeto", electron_veto),
        ("ElectronVeto", "ElectronSelection", electron_selection),
        ("Photon", "PhotonVeto", photon_veto),
        ("PhotonVeto", "PhotonSelection", photon_selection),
        ("Jet", "JetVeto", jet_veto),
    )
    for source_name, target_name, cuts in selection_chain:
        scribblers.append(
            ObjectSelection(in_obj_name=source_name,
                            out_obj_name=target_name,
                            path_cfg=cuts))

    # Overlap removal on the vetoed jets against each vetoed reference
    # collection, before the final jet selection.
    for reference in ("MuonVeto", "ElectronVeto", "PhotonVeto"):
        scribblers.append(
            OverlapRemoval(collection_name="JetVeto",
                           ref_collection=reference))

    scribblers.append(
        ObjectSelection(in_obj_name="JetVeto",
                        out_obj_name="JetSelection",
                        path_cfg=jet_selection))

    scribblers.append(DatasetInfo())
    scribblers.append(json_scribbler)
    scribblers.append(cutflowId())
    scribblers.append(MetNoX())
    # Disabled: leptonsf_scribbler, jecunc_scribbler, btagsf_scribbler

    return scribblers