Example #1
def main(configuration_file):
    """Main function to hierarchically classify new camera trap images.
    
    Parameters
    ----------
    configuration_file : str
        Path to the YAML file containing all input and output paths.
    """

    # Load configuration file
    with open("config.yml") as yaml_config:
        config = load(yaml_config)

    # Check if all folders exist and create folder if needed
    for path in config:
        if not os.path.exists(config[path]):
            os.makedirs(config[path])

    # Step 1: resize images
    ########################
    # Input: images and Agouti export file (observations)
    # Output: resized images in similar folder structure as original images

    resize_images(config["general_folder_path"], config["resized_folder_path"])

    # Step 2: preprocess images
    ###########################
    # Input: resized images and Agouti export (observations + assets + pickupsetup)
    # Output: file containing coordinates of the regions of interest

    preprocessing(config["general_folder_path"], config["resized_folder_path"],
                  config["preprocessing_output_path"])

    # Step 3: extract bottleneck features using the pretrained network ResNet50
    ############################################################################
    # Input: resized images and preprocessing output containing the coordinates of the boxes
    # Output: bottleneck features of all images

    extract_bottleneck_features(config["preprocessing_output_path"],
                                config["bottleneck_features_output_path"],
                                config["resized_folder_path"])

    # Step 4 : run top model to classify the new images
    ##################################################
    # Input: extracted bottleneck features
    # Output: predicted probabilities

    hierarchical_bottleneck_predict(config["bottleneck_features_output_path"],
                                    config["weight_path"],
                                    config["predictions_output_path"])

    # Step 5 : convert output probabilities to hierarchical classification
    ######################################################################
    # Input: predicted probabilities
    # Output: hierarchical classification of the sequences

    hierarchical_predictions_sequences(
        config["predictions_output_path"],
        config["bottleneck_features_output_path"])
Example #2
def main():
    database.connect(config.database)

    if not fsys.islocked(config.resources + "/.initialization.lock"):
        initialization()
        fsys.lock(config.resources + "/.initialization.lock")

    if not fsys.islocked(config.resources + "/.preprocessing.lock"):
        preprocessing()
        fsys.lock(config.resources + "/.preprocessing.lock")

    app = QtWidgets.QApplication([])
    window = Window()
    window.show()
    sys.exit(app_exit(app))
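
fsys is not defined in this snippet; a minimal stand-in consistent with how it is used here (check for, and create, a marker file) might look like this:

import os

def islocked(path):
    # the "lock" is just a marker file on disk
    return os.path.exists(path)

def lock(path):
    # create/touch the marker file so the step is skipped on the next run
    open(path, "a").close()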
Example #3
File: main.py Project: dotXem/GLYFE
def main(dataset, subject, model, params, exp, mode, log, ph, plot):
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # retrieve model's parameters
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f, day_len_f)
    """ MODEL TRAINING & TUNING """
    if search:
        params = find_best_hyperparameters(subject, model_class, params, search, ph_f, train, valid, test)

    raw_results = make_predictions(subject, model_class, params, ph_f, train, valid, test, mode=mode)
    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)

    """ EVALUATION """
    results = ResultsSubject(model, exp, ph, dataset, subject, params=params, results=raw_results)
    printd(results.compute_results())
    if plot:
        results.plot(0)
def main_standard(dataset, subject, model, params, exp, eval_set, ph):
    printd(dataset, subject, model, params, exp, eval_set, ph)

    # retrieve model's parameters
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq
    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f,
                                                day_len_f)
    """ MODEL TRAINING """
    raw_results = make_predictions_pclstm(subject,
                                          model_class,
                                          params,
                                          ph_f,
                                          train,
                                          valid,
                                          test,
                                          scalers,
                                          mode=eval_set)
    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)
    """ EVALUATION """
    ResultsSubject(model,
                   exp,
                   ph,
                   dataset,
                   subject,
                   params=params,
                   results=raw_results).save_raw_results()
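
The // cs.freq divisions above turn horizons expressed in minutes into numbers of samples. Assuming, for illustration, a 5-minute sampling frequency (cs.freq = 5; the same arithmetic appears later as 30 // 5, 180 // 5 and 1440 // 5):

ph_f = 30 // 5         # 30-minute prediction horizon -> 6 samples ahead
hist_f = 180 // 5      # 3-hour history               -> 36 samples per input window
day_len_f = 1440 // 5  # one day                      -> 288 samples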
Example #5
def main_target_training(source_dataset, target_dataset, target_subject, model, params, eval_mode, exp, plot):
    hist_f = params["hist"] // freq
    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)
    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      eval_mode=eval_mode, fit=True, save_model_file=None)

    return evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp, plot,
                      "target_training")
Example #6
def _load_subject_data(self, subject):
    if subject not in list(self.train.keys()):
        train_sbj, valid_sbj, test_sbj, scalers_sbj = preprocessing(self.dataset, subject, self.ph,
                                                                    self.hist, cs.day_len_f)
        self.train[subject] = train_sbj
        self.valid[subject] = valid_sbj
        self.test[subject] = test_sbj
        self.scalers[subject] = scalers_sbj
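
_load_subject_data lazily caches the per-subject splits; the dictionaries it writes to would have to be initialised elsewhere, for example in a hypothetical owner class such as:

class SubjectDataCache:  # hypothetical class name, shown for illustration only
    def __init__(self, dataset, ph, hist):
        self.dataset, self.ph, self.hist = dataset, ph, hist
        # per-subject caches filled on demand by _load_subject_data
        self.train, self.valid, self.test, self.scalers = {}, {}, {}, {}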
Example #7
File: run.py Project: ahashisyuu/SA
def run(args):
    if args.mode == 'prepare':
        preprocessing(args)
    else:
        cls = main_model.get(args.model)
        if cls is None:
            return
        model = MainModel(cls=cls, config=args)
        print('--------------------------------------------------------------')
        print('        Arrangement for this run: %s' % args.arrangement)
        print('--------------------------------------------------------------')
        if args.mode == 'train':
            model.train()
        elif args.mode == 'predict':
            results = model.predict(load_best_model=args.load_best_model)
            save_results(results, args)
        else:
            model.evaluate(load_best_model=args.load_best_model)
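
run expects an object exposing at least mode, model, arrangement and load_best_model; a hypothetical command-line entry point (the flag names are assumptions, not the project's actual CLI) could be:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['prepare', 'train', 'predict', 'evaluate'], default='train')
    parser.add_argument('--model', default=None)
    parser.add_argument('--arrangement', default=None)
    parser.add_argument('--load_best_model', action='store_true')
    run(parser.parse_args())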
Example #8
def main_target_global(source_dataset, target_dataset, target_subject, model, params, weights_exp, eval_mode, exp,
                       plot):
    hist_f = params["hist"] // freq
    weights_file = compute_weights_file(model, source_dataset, target_dataset, target_subject, weights_exp)

    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      weights_file=weights_file, eval_mode=eval_mode, fit=False, save_model_file=None)

    return evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp, plot,
                      "target_global")
def main_cgega_iterative_training(dataset,
                                  subject,
                                  model,
                                  params1,
                                  params2,
                                  exp,
                                  eval_set,
                                  ph,
                                  save_iter=False):
    printd(dataset, subject, model, params1, params2, exp, eval_set, ph)

    # retrieve model's parameters
    params1 = locate_params(params1)
    params2 = locate_params(params2)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params1["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq
    freq_ds = misc.datasets.datasets[dataset]["glucose_freq"]
    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f,
                                                day_len_f)
    """ MODEL TRAINING """
    dir = join(cs.path, "processing", "models", "weights", "cg_ega")
    file = join(dir, exp, model_class.__name__ + "_" + dataset + subject)

    results_test, results_valid_iter = progressive_improvement_clinical_acceptability(
        subject, model_class, params1, params2, ph, freq_ds, train, valid,
        test, scalers, file, eval_set)

    results_test = postprocessing(results_test, scalers, dataset)
    results_valid_iter = postprocessing_all_iter(results_valid_iter, scalers,
                                                 dataset)

    ResultsSubject(model,
                   exp,
                   ph,
                   dataset,
                   subject,
                   params=[params1, params2],
                   results=results_test).save_raw_results()
    if save_iter:
        ResultsSubjectPICA(model,
                           exp,
                           ph,
                           dataset,
                           subject,
                           params=[params1, params2],
                           results=results_valid_iter).save_raw_results()
Example #10
def run(args):
    if args.mode == 'prepare':
        preprocessing('./rawData',
                      './data',
                      need_punct=args.need_punct,
                      char_max_len=args.char_max_len,
                      glove_filename=args.glove_file)
    else:
        # loading preprocessed data
        with open('./data/dataset.pkl', 'rb') as fr, \
             open('./data/embedding_matrix.pkl', 'rb') as fr_embed, \
             open('./data/char2index.json', 'r') as fr_char:
            data = pkl.load(fr)
            embedding_matrix = pkl.load(fr_embed)
            char2index = json.load(fr_char)

        train_samples = [data[k + '.xml'] for k in args.train_list]
        dev_samples = [data[k + '.xml'] for k in args.dev_list]
        test_samples = [data[k + '.xml'] for k in args.test_list]

        all_data = BatchDatasets(args.max_len,
                                 args.char_max_len,
                                 need_shuffle=args.need_shuffle,
                                 batch_size=args.batch_size,
                                 k_fold=args.k_fold,
                                 categories_num=args.categories_num,
                                 train_samples=train_samples,
                                 dev_samples=dev_samples,
                                 test_samples=test_samples)

        model = QCN(embedding_matrix=embedding_matrix,
                    args=args,
                    char_num=len(char2index))

        if args.mode == 'train':
            model.train(all_data, args)
        elif args.mode == 'test':
            model.test(all_data, args)
Example #11
def transform(data):

    count_vect = load_count_vectorizer()
    tfidf_transformer = load_tfidf_transformer()

    preprocessed_data = preprocessing([data],
                                      remove_stopwords=True,
                                      lemmatization=True,
                                      remove_accented=True)

    train_cv = count_vect.transform(preprocessed_data)

    X_train_idf = tfidf_transformer.transform(train_cv)

    return X_train_idf
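
A hypothetical caller would pass a raw document and feed the resulting TF-IDF matrix to whatever classifier was trained alongside the saved vectorizer (load_classifier is assumed, not part of the snippet above):

clf = load_classifier()                 # hypothetical loader, analogous to load_count_vectorizer()
X = transform("raw input text to classify")
print(clf.predict(X))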
Example #12
def main_target_finetuning(source_dataset, target_dataset, target_subject,
                           Model, params, weights_exp, eval_mode, exp, plot):
    hist_f = params["hist"] // freq
    weights_file = compute_weights_file(Model, source_dataset, target_dataset,
                                        target_subject, weights_exp)

    train, valid, test, scalers = preprocessing(target_dataset, target_subject,
                                                ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject,
                                      Model,
                                      params,
                                      ph_f,
                                      train,
                                      valid,
                                      test,
                                      weights_file=weights_file,
                                      tl_mode="target_finetuning",
                                      eval_mode=eval_mode)

    evaluation(raw_results, scalers, source_dataset, target_dataset,
               target_subject, Model, params, exp, plot, "target_finetuning")
Example #13
def end_to_end(source_dataset, target_dataset, target_subject, model, params, weights_exp, eval_mode, exp,
               plot):
    hist_f = params["hist"] // freq
    save_file = compute_weights_file(model, source_dataset, target_dataset, target_subject, weights_exp)

    train_m, valid_m, test_m, scalers_m = preprocessing_source_multi(source_dataset, target_dataset, target_subject,
                                                                     ph_f, hist_f, day_len_f)
    make_predictions_tl(target_subject, model, params, ph_f, train_m, valid_m, test_m,
                        eval_mode=eval_mode, fit=True, save_model_file=save_file)

    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      weights_file=save_file, eval_mode=eval_mode, fit=False, save_model_file=None)

    evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp, plot,
               "target_global")

    raw_results_2 = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                        weights_file=save_file, eval_mode=eval_mode, fit=True,
                                        save_model_file=None)

    return evaluation(raw_results_2, scalers, source_dataset, target_dataset, target_subject, model, params, exp, plot,
                      "target_finetuning")
Example #14
# LightGBM cluster
lgb_cls_train = pd.read_csv(path_train + "lgb_cls_val.csv")
lgb_cls_test = pd.read_csv(path_test + "lgb_cls_test.csv")

# Catboost standard
cat_std_train = pd.read_csv(path_train + "catboost_val.csv")
cat_std_test = pd.read_csv(path_test + "catboost_test.csv")

# XGBoost Incremental
xgb_train = pd.read_csv(path_train + "xgb_inc_val.csv")
xgb_test = pd.read_csv(path_test + "xgb_inc_test.csv")

train = pd.read_csv("../dataset/original/train.csv")
test = pd.read_csv("../dataset/original/x_test.csv")
df = preprocessing(train, test, useTest=False)

df_scope = df[['Date', 'sku', 'scope']].copy()

# Train
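# Note: lgb_std_train (the "LightGBM standard" predictions) is used in the merge below,
# but its pd.read_csv call is not part of this snippet; it is presumably loaded like the frames above.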
prediction_train = cat_std_train.merge(
    lgb_std_train, how='left', on=['Date', 'sku', 'target', 'real_target'])
prediction_train = lgb_cls_train.merge(
    prediction_train, how='left', on=['Date', 'sku', 'target', 'real_target'])
prediction_train = prediction_train.merge(
    xgb_train, how='left', on=['Date', 'sku', 'target', 'real_target'])

prediction_train.Date = pd.to_datetime(prediction_train.Date)
prediction_train = prediction_train.merge(df[['Date', 'sku', 'scope']],
Example #15
def main():

    # Read run argument for configuration
    config_path = sys.argv[1]
    cfg_dict = op_util.get_all_configs(config_path)

    cfg_dict['t_date'] = pd.to_datetime("today").strftime("%Y_%m_%d")

    cfg_dict = op_util.get_te_window_from_cfg(cfg_dict)

    #tdict is a dictionary of objects
    #tmap keeps track of the process stage
    tdict = {}
    tmap = {}

    "General preprocessing"
    tmap['general_preprocessing'] = 1
    tdict[0] = attr_gen_preprocess(cfg_dict['DATA_FILE_DICT'] , {})
    tdict[1] = attr_gen_preprocess({} , cfg_dict)

    gen_preproc = general_preprocess(1, tdict, tmap)
    gen_preproc.run()

    print (gen_preproc.data_plus_meta_[1].data_)

    "Preprocessing"
    tdict = gen_preproc.data_plus_meta_
    tmap = gen_preproc.racks_map_
    tmap['preprocessing'] = 2
    cfg_dict = tdict[1].config_

    tdict[2] = attr_preprocess({},cfg_dict )
    preprocess = preprocessing(2, tdict, tmap)
    preprocess.run()

    print (preprocess.data_plus_meta_[2].data_)

    "Imputation"
    tdict = preprocess.data_plus_meta_
    tmap = preprocess.racks_map_
    tmap['imputation'] = 3
    cfg_dict = tdict[2].config_

    tdict[3] = attr_imputation({},cfg_dict )
    impute = imputation(3, tdict, tmap)
    impute.run()

    print (impute.data_plus_meta_[3].data_)

    "Enrichment"
    tdict = impute.data_plus_meta_
    tmap = impute.racks_map_
    tmap['enrich_data'] = 4
    cfg_dict = tdict[3].config_

    tdict[4] = attr_enrich_data({},cfg_dict )
    enrich = enrich_data(4, tdict, tmap)
    enrich.run()

    print (enrich.data_plus_meta_[4].data_)

    "Splitting"
    tdict = enrich.data_plus_meta_
    tmap = enrich.racks_map_
    tmap['split'] = 5
    cfg_dict = tdict[4].config_

    tdict[5] = attr_split_data({},cfg_dict )
    split = split_data(5, tdict, tmap)
    split.run()

    print (split.data_plus_meta_[5].data_.train_set_dict_, split.data_plus_meta_[5].data_.validate_set_dict_)

    "Sampling"
    tdict = split.data_plus_meta_
    tmap = split.racks_map_
    tmap['sample'] = 6
    cfg_dict = tdict[5].config_

    tdict[6] = attr_sample_data({},cfg_dict )
    sample = sample_data(6, tdict, tmap)
    sample.run()

    print (sample.data_plus_meta_[6].data_.train_set_dict_, sample.data_plus_meta_[6].data_.validate_set_dict_, sample.data_plus_meta_[6].data_.predict_set_dict_)

    "FeatureSelection"
    tdict = sample.data_plus_meta_
    tmap = sample.racks_map_
    tmap['feature_select'] = 7
    cfg_dict = tdict[6].config_

    tdict[7] = attr_feature_select({},cfg_dict )
    select_feature = feature_select(7, tdict, tmap)
    select_feature.run()

    print (select_feature.data_plus_meta_[7].data_.train_set_dict_, select_feature.data_plus_meta_[7].data_.validate_set_dict_, select_feature.data_plus_meta_[7].data_.predict_set_dict_)
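
Each stage block above repeats the same five steps; a hypothetical helper (not part of the original code) makes the pattern explicit:

def run_stage(stage_cls, attr_cls, previous_stage, stage_name, index):
    # take the previous stage's data and stage map, register this stage, and run it
    tdict, tmap = previous_stage.data_plus_meta_, previous_stage.racks_map_
    tmap[stage_name] = index
    cfg_dict = tdict[index - 1].config_
    tdict[index] = attr_cls({}, cfg_dict)
    stage = stage_cls(index, tdict, tmap)
    stage.run()
    return stage

# e.g. impute = run_stage(imputation, attr_imputation, preprocess, 'imputation', 3)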
def main():
    # the features which should be used.
    feature_names = [
        # Features.Face_count,
        # Features.Rot_distance,
        # Features.Face_bb,
        # Features.Face_bb_full_img,
        # Features.Face_bb_quarter_imgs,
        # Features.Face_bb_eighth_imgs,
        # Features.Tilted_edges,
        # Features.Edge_hist_v0,
        # Features.Edge_hist_v1,
        # Features.Edge_hist_v2,
        # Features.Symmetry,
        # Features.Hsv_hist,
         Features.DenseSIFT_L0,
        # Features.DenseSIFT_L1,
        # Features.DenseSIFT_L2,
        # Features.Hog_L0,
        # Features.Hog_L1,
        # Features.Hog_L2,
        # Features.Lbp_L0,
        # Features.Lbp_L1,
        # Features.Lbp_L2,
         Features.Gist,
        # Features.CNN_fc7,
        # Features.CNN_prob
    ]

    runname = 1
    do_preprocessing = False  # use this only at your first run on the dataset
    calc_features = False  # calculates the selected features
    use_second_dev_classification_method = False # True: classifies with second order deviation method
    global dir_root # the root directory of your data
    dir_root = r'C:\Users\Andreas\Desktop\prvc\InterestingnessData2016'

#######################
###STOP EDITING HERE###
#######################

    # root directories for training and test data
    dir_training_data = os.path.join(dir_root, 'devset')
    dir_test_data = os.path.join(dir_root, 'testset')

    # dicts containing path to images as keys and ground truth as values
    img_dirs_training = read_img_dirs_and_gt(dir_training_data)
    img_dirs_test = read_img_dirs(dir_test_data)

    # preprocessing
    if do_preprocessing:
        prvc_preprocessing.preprocessing(img_dirs_training.keys())
        prvc_preprocessing.preprocessing(img_dirs_test)
        print('preprocessing finished.')

    # calculate features
    if calc_features:
        features_train = feature_calculation.calc_features(img_dirs_training.keys(), feature_names)
        features_test = feature_calculation.calc_features(img_dirs_test, feature_names)
        print('feature calculation finished.')

    else:
        # load features from file
        features_train = feature_files.load_features(img_dirs_training.keys(), feature_names)
        features_test = feature_files.load_features(img_dirs_test, feature_names)

    print('features loaded.')

    if Features.Face_bb in feature_names:
        # bring bounding box feature matrices to same shape
        # find matrix with maximal columns and reshape other matrix before concatenating them
        features_train = make_face_bb_equal_col_size(features_train)
        features_test = make_face_bb_equal_col_size(features_test)
        features_train, features_test = make_face_bb_train_test_equal_col_size(features_train, features_test)

    X_trains = gen_feature_matrices_per_feature(features_train)
    X_tests = gen_feature_matrices_per_feature(features_test)

    # scale features (because svm is not scale invariant)
    X_trains_scaled = scale_features(X_trains)
    X_tests_scaled = scale_features(X_tests)

    # generate final feature matrix
    X_train = gen_final_feature_matrix(X_trains)
    X_test = gen_final_feature_matrix(X_tests)

    X_train_scaled = gen_final_feature_matrix(X_trains_scaled)
    X_test_scaled = gen_final_feature_matrix(X_tests_scaled)

    #DEBUG save
    #np.savetxt('C:\Users\Andreas\Desktop\\X_train_fc7.txt.gz', X_train)
    #np.savetxt('C:\Users\Andreas\Desktop\\X_train_fc7.txt.gz_scaled.txt.gz', X_train_scaled)
    #np.savetxt('C:\Users\Andreas\Desktop\\X_test_fc7.txt.gz', X_test)
    #np.savetxt('C:\Users\Andreas\Desktop\\X_test_fc7.txt.gz_scaled.txt.gz', X_test_scaled)

    # get interestingness
    y_train = get_target_vec(img_dirs_training)


    #upsampling of class 'interesting' via SMOTE
    #sm = SMOTE()
    #X_train_upsampled, y_train_upsampled = sm.fit_sample(X_train, y_train)
    #X_train = X_train_upsampled
    #y_train = y_train_upsampled

    
    #
    # train and test svm
    #
    #C = 0.125  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced') #class_weight='balanced'
    #results_1 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_1 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 0.25  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_2 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_2 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 0.5  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_3 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_3 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 1  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_4 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_4 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 2  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_5 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_5 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 4  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_6 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_6 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 8  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_7 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_7 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)
#
    #C = 16  # SVM regularization parameter
    #svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')  # class_weight='balanced'
    #results_8 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled_8 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                         use_second_dev_classification_method)


    # submission_format = gen_submission_format(results_1)
    # save_submission.save_submission(submission_format, 1)
    # submission_format = gen_submission_format(results_scaled_1)
    # save_submission.save_submission(submission_format, 2)
    #
    # submission_format = gen_submission_format(results_2)
    # save_submission.save_submission(submission_format, 3)
    # submission_format = gen_submission_format(results_scaled_2)
    # save_submission.save_submission(submission_format, 4)
    #
    # submission_format = gen_submission_format(results_3)
    # save_submission.save_submission(submission_format, 5)
    # submission_format = gen_submission_format(results_scaled_3)
    # save_submission.save_submission(submission_format, 6)
    #
    # submission_format = gen_submission_format(results_4)
    # save_submission.save_submission(submission_format, 7)
    # submission_format = gen_submission_format(results_scaled_4)
    # save_submission.save_submission(submission_format, 8)
    #
    # submission_format = gen_submission_format(results_5)
    # save_submission.save_submission(submission_format, 9)
    # submission_format = gen_submission_format(results_scaled_5)
    # save_submission.save_submission(submission_format, 10)
    #
    # submission_format = gen_submission_format(results_6)
    # save_submission.save_submission(submission_format, 11)
    # submission_format = gen_submission_format(results_scaled_6)
    # save_submission.save_submission(submission_format, 12)
    #
    # submission_format = gen_submission_format(results_7)
    # save_submission.save_submission(submission_format, 13)
    # submission_format = gen_submission_format(results_scaled_7)
    # save_submission.save_submission(submission_format, 14)
    #
    # submission_format = gen_submission_format(results_8)
    # save_submission.save_submission(submission_format, 15)
    # submission_format = gen_submission_format(results_scaled_8)
    # save_submission.save_submission(submission_format, 16)

    #LAPI Settings for HSVHist + GIST ---MAP should be 0.1714
    #print("svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1 : 10})")
    #svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1 : 10})
    #results = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    #results_scaled = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test, use_second_dev_classification_method)
    
    #svc = svm.SVC(kernel='poly', degree=18, gamma=2)

    #LAPI Settings for DSIFT + GIST ---MAP should be 0.1398
    print("svm.SVC(kernel='poly', degree=3, gamma=32, class_weight={1: 10})")
    svc = svm.SVC(kernel='poly', degree=3, gamma=32, class_weight={1: 10})
    #svc = svm.SVC(kernel='poly', degree=3, gamma=32)
    results = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    results_scaled = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test, use_second_dev_classification_method)

    print("svm.SVC(kernel='poly', degree=3, gamma=32, class_weight='balanced')")
    svc = svm.SVC(kernel='poly', degree=3, gamma=32, class_weight='balanced')
    results_2 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    results_scaled_2 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
                             use_second_dev_classification_method)

    print("save results")
    submission_format = gen_submission_format(results)
    save_submission.save_submission(submission_format, 1)
    submission_format = gen_submission_format(results_scaled)
    save_submission.save_submission(submission_format, 2)

    submission_format = gen_submission_format(results_2)
    save_submission.save_submission(submission_format, 3)
    submission_format = gen_submission_format(results_scaled_2)
    save_submission.save_submission(submission_format, 4)



    '''
    #read ground truth of testset
    img_dirs_test = read_img_dirs_and_gt(dir_test_data)
    y_test = get_target_vec(img_dirs_test)


    
    print('UNSCALED')
    print('LAPI 1:10')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10})
    scores = cross_val_score(svc, X_test, y_test, cv=3, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI c=0.1')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, C=0.1)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI balanced')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight='balanced')
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm balanced')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight='balanced', cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm 1 to 10')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10}, cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm 1 to 10, C=0.25')
    svc = svm.SVC(kernel='poly', C=0.25, degree=18, gamma=2, class_weight={1: 10}, cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    '''
    print("finished.")
Example #17
File: main.py Project: Owaisaaa/pcLSTM
def main(dataset,
         subject,
         Model,
         params,
         ph,
         eval="valid",
         print=True,
         plot=False,
         save=True,
         excel_file=None):
    printd(dataset, subject, Model.__name__)

    file = os.path.join("data", "dynavolt", dataset,
                        dataset + "_subject" + subject + ".csv")
    """ PREPROCESSING """
    train_sets, valid_sets, test_sets, norm_min, norm_max = preprocessing(
        file, misc.hist, ph, misc.freq, misc.cv)

    # TODO REMOVE - one split testing
    # split_number = 7
    # train_sets, valid_sets, test_sets = [train_sets[split_number]], [valid_sets[split_number]], [
    #     test_sets[split_number]]
    # norm_min, norm_max = [norm_min[split_number]],[norm_max[split_number]]
    """ CROSS-VALIDATION """
    results = []
    for i, [train, valid,
            test] in enumerate(zip(train_sets, valid_sets, test_sets)):
        train_x, train_y = train.iloc[:, :-2], train.iloc[:, -2:]
        valid_x, valid_y = valid.iloc[:, :-2], valid.iloc[:, -2:]
        test_x, test_y = test.iloc[:, :-2], test.iloc[:, -2:]

        model = Model(params)
        if Model.__name__ in misc.nn_models:
            model.fit(x_train=train_x,
                      y_train=train_y,
                      x_valid=valid_x,
                      y_valid=valid_y)
        else:
            model.fit(x=train_x, y=train_y)

        if eval == "valid":
            y_true, y_pred = model.predict(x=valid_x, y=valid_y)
        elif eval == "test":
            y_true, y_pred = model.predict(x=test_x, y=test_y)
        results.append(np.c_[y_true, y_pred])
    """ POST-PROCESSING """
    results = postprocessing(results.copy(),
                             hist=misc.hist,
                             ph=misc.ph,
                             freq=misc.freq,
                             min=norm_min,
                             max=norm_max)
    """ EVALUATION """
    res = Results(Model.__name__,
                  misc.ph,
                  dataset,
                  subject,
                  misc.freq,
                  results=np.array(results))
    metrics = res.get_results()
    if print: printd(metrics)
    if save: res.save()
    if plot: res.plot()
    if excel_file is not None:
        res.to_excel(params, len(res.results), file_name=excel_file)
def compute_glucose_distribution(dataset,
                                 train_valid_or_test="train",
                                 plot=False,
                                 save=False,
                                 hypo_hyper_stats=False):
    """ load data"""
    glucose = []
    for subject in misc.datasets.datasets[dataset]["subjects"]:
        glucose_sbj = []
        train, valid, test, scalers = preprocessing(dataset, subject, 30 // 5,
                                                    180 // 5, 1440 // 5)
        if train_valid_or_test == "train":
            set = train
        elif train_valid_or_test == "valid":
            set = valid
        elif train_valid_or_test == "test":
            set = test
        for set_i, scalers_i in zip(set, scalers):
            glucose_sbj.append(set_i.y.values * scalers_i.scale_[-1] +
                               scalers_i.mean_[-1])
        glucose.append(glucose_sbj)
    """ create average subject histograms """
    nbins = 40
    n_sbj = []
    for glucose_sbj in glucose:
        n_split = []
        for glucose_sbj_split in glucose_sbj:
            (n, bins, _) = plt.hist(glucose_sbj_split,
                                    bins=nbins,
                                    range=[0, 400],
                                    density=True,
                                    stacked=True)
            plt.close()
            n_split.append(n)
        n_sbj.append(np.mean(n_split, axis=0))
    """ compute distributions """
    n_arr = np.array(n_sbj) * 400 / nbins
    middle_bins = ((bins[1:] + bins[:-1]) / 2)
    mean = np.mean(n_arr, axis=0)
    std = np.std(n_arr, axis=0)
    """ plot """
    if plot:
        plt.figure()
        plt.plot(middle_bins, mean, color='#CC4F1B')

        plt.fill_between(middle_bins,
                         mean - std,
                         mean + std,
                         alpha=0.5,
                         edgecolor='#CC4F1B',
                         facecolor='#FF9848')
        plt.title(
            "Distribution of glucose samples for the dataset " + dataset + ".")
        plt.xlabel("glucose [mg/dL]")
        plt.ylabel("probability")
    """ save """
    if save:
        df = pd.DataFrame(
            data=np.c_[middle_bins, mean, mean - std, mean + std],
            columns=["middle_bins", "mean", "plus-std", "minus-std"])
        df.to_csv(path.join(
            cs.path, "tmp", "figures_data", "glucose_distribution_" + dataset +
            "_" + train_valid_or_test + ".dat"),
                  index_label="index")
    """ hypo hyper stats """
    if hypo_hyper_stats:
        print(np.sum(mean[np.where(bins <= 70)[0][:-1]]) * 100)
        print(np.sum(mean[np.where(bins >= 180)[0][:-1]]) * 100)

    return n_arr, bins, middle_bins, mean, std
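
The expression set_i.y.values * scalers_i.scale_[-1] + scalers_i.mean_[-1] above undoes the standardisation of the glucose column; assuming the scalers are fitted sklearn StandardScaler objects, it is simply the per-column inverse transform:

import numpy as np
from sklearn.preprocessing import StandardScaler

# minimal sketch: invert a StandardScaler for its last (here only) column
scaler = StandardScaler().fit(np.array([[90.0], [120.0], [180.0]]))
z = scaler.transform(np.array([[150.0]]))
glucose = z * scaler.scale_[-1] + scaler.mean_[-1]  # back to 150.0, same as scaler.inverse_transform(z)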
Example #19
def main(dataset,
         subject,
         model,
         params,
         exp,
         mode,
         log,
         ph,
         plot,
         save=False):
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # retrieve model's parameters
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq
    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f,
                                                day_len_f)
    start = time.time()
    """ MODEL TRAINING & TUNING """
    if search:
        params = find_best_hyperparameters(subject, model_class, params,
                                           search, ph_f, train, valid, test)

    if save:
        dir = os.path.join(cs.path, "processing", "models", "weights",
                           model_class.__name__, exp)
        file = os.path.join(dir,
                            model_class.__name__ + "_" + dataset + subject)
    else:
        file = None

    raw_results = make_predictions(subject,
                                   model_class,
                                   params,
                                   ph_f,
                                   train,
                                   valid,
                                   test,
                                   mode=mode,
                                   save_model_file=file)
    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)
    """ EVALUATION """
    results = ResultsSubject(model,
                             exp,
                             ph,
                             dataset,
                             subject,
                             params=params,
                             results=raw_results)
    printd(results.compute_mean_std_results())
    end = time.time()
    printd("Time elapsed : " + str(end - start) + " seconds")
    if plot:
        results.plot(0)
Example #20
def run_main(model_params,
             useTest=False,
             useScope=True,
             save=False,
             completeCV=False,
             dataAugm=True,
             drop_cols=[],
             cluster=None,
             name='',
             categorical_features=['sku', 'pack', 'brand']):

    abs_path = Path(__file__).absolute().parent
    train_path = os.path.join(abs_path, "dataset/original/train.csv")
    test_path = os.path.join(abs_path, "dataset/original/x_test.csv")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    useTest = useTest
    useScope = useScope
    isEvaluation = False
    useSampleWeights, weights_type = True, 2
    save = save
    completeCV = completeCV
    dataAugm = dataAugm

    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)

    df, categorical_f = add_all_features(df)

    categorical_f = list(set(categorical_features + categorical_f))
    drop_cols = drop_cols
    categorical_f = [x for x in categorical_f if x not in drop_cols]

    df = df.sort_values('Date')

    #   --------------- Model -----------------

    CLUSTER = cluster
    NAME = name

    if NAME == 'lgb_std' or NAME == 'lgb_cls':
        model = LightGBM(**model_params)

    elif NAME == 'catboost':
        model = CatBoost(**model_params)

    print('Start the model ' + NAME)
    model = model
    model_gen = Generator(
        df,
        model,
        categorical_features=categorical_f,
        drop_columns=drop_cols,
        isScope=useScope,
        sample_weights_type=weights_type,
        evaluation=isEvaluation,
        useTest=useTest,
        cluster=CLUSTER,
        name=NAME,
        completeCV=completeCV,
        dataAugmentation=dataAugm,
    )

    model_gen.run_generator(save)
    model_gen.plot_feature_importance()
    print(model_gen.compute_MAPE())
Example #21
'TEICOPLANINA_.MG.',
'TIGECICLINA_.MG.',
'TOBRAMICINA_.MG.',
'TOBRAMICINA_NEB_.MG.',
'VANCOMICINA_.MG.']

# Load data
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test_challenge.csv')

# ------------------------------------------------
# Preprocessing
# ------------------------------------------------

# General preprocessing
df_train_cln = preprocessing(df_train)
df_test_cln = preprocessing(df_test)

# PCA
resulting_features_names = ['PC1_DIAGNOSTIC', 'PC2_DIAGNOSTIC']
pc_diagnosis = PCA_r(df_train_cln, features_diagnostic, 2, resulting_features_names)

resulting_features_names = ['PC1_ANTIBIOTIC', 'PC2_ANTIBIOTIC']
pc_antibiotics = PCA_r(df_train_cln, features_antibiotic, 2, resulting_features_names)

# Adding PCA columns to original dataset
df_train_cln = pd.concat([df_train_cln, pc_diagnosis, pc_antibiotics], axis=1)

# ------------------------------------------------
# Modelling
# ------------------------------------------------
Example #22
def run_xgboost(useTest=False,
                useScope=False,
                completeCV=False,
                dataAugm=False,
                save=True):

    train = pd.read_csv("../dataset/original/train.csv")
    test = pd.read_csv("../dataset/original/x_test.csv")

    useTest = useTest
    useScope = useScope
    isEvaluation = False
    useSampleWeights, weights_type = True, 2
    save = save

    completeCV = completeCV  # To get predictions on the train set, set this to True: it starts from the first week
    # of the train set and predicts every following week, growing the training set incrementally

    dataAugm = dataAugm  # Creates the 2016 data: recommended to set True when completeCV = True, so the algorithm
    # does not train on only the first week of the original train set but on all of 2016 [52 weeks]

    if isEvaluation:
        useTest = False
        useScope = False

    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)

    df, categorical_f = add_all_features(df)
    categorical_f = ['sku', 'pack', 'brand'] + categorical_f

    df = df.sort_values('Date')

    df_scope = df[['Date', 'sku', 'scope']].copy()

    def wmape_train_(y_true, data):
        """
        IMPORTANTE: sortare prima gli elementi del df per ('sku', 'Date'): df.sort_values(['sku','Date']

        Give less importance to previous [in time] values, exponentially
        :param y_true:
        :param y_pred:
        :return:
        """
        # global s
        y_true = np.array(y_true)
        y_pred = data.get_label()

        N = int(y_true.shape[0] / 133)
        weight = np.arange(y_true.shape[0])
        weight = weight % N
        weight = weight / N
        grad = -100 * ((y_true - y_pred) / y_true) * (np.exp(weight))
        hess = 100 / (y_true) * (np.exp(weight))
        return grad, hess

    def ohe_categorical(df, categorical_features):
        for c in categorical_features:
            dummy = pd.get_dummies(df[c], prefix=c)
            df[dummy.columns] = dummy
        return df

    df = ohe_categorical(df, ['cluster', 'heavy_light'])
    cat_cols = ['pack', 'brand', 'scope', 'heavy_light', 'cluster', 'year']
    df = df.drop(cat_cols, axis=1)

    if useTest:
        df = df.sort_values('Date')
        test_dates = df[df.Date >= '2019-06-29']
        test_dates = test_dates.drop_duplicates('Date').Date
        gen = dfs_gen(df, test_dates)
    else:
        train = df[~df.target.isna()]
        if completeCV:
            if dataAugm:
                dates = train[train.Date >= '2016-12-10'].Date.sort_values(
                ).drop_duplicates(keep='first')
            else:
                dates = train.Date.sort_values().drop_duplicates(keep='first')
            val_dates = dates[1:]
        else:
            _, _, val_dates = train_validation_split(train)
        gen = dfs_gen(train, val_dates)

    params = {
        'obj': wmape_train_,
        'learning_rate': 0.1,
        'max_depth': 10,
        # 'min_child_weight': 3,
        # 'tree_method': 'hist'
    }

    #  RUNNING MODEL

    prediction_df = pd.DataFrame()

    feature_importances = []

    prev_df_test = pd.DataFrame()
    drop_target = ['real_target', 'target', 'Date', 'sku']
    xgb_model = None
    for i, (df_train, df_test) in enumerate(gen):
        if i == 0:
            xgb_model = xgb.train(params,
                                  dtrain=xgb.DMatrix(
                                      df_train.drop(drop_target, axis=1),
                                      df_train.target),
                                  num_boost_round=700)

            feature_importances.append(xgb_model.get_fscore())

        else:
            # xgb_model.fit(prev_df_test.drop(drop_target, axis=1), prev_df_test.target, xgb_model='xgb_model_online.model')
            params.update({
                # 'learning_rate': 0.05,
                'updater': 'refresh',
                'process_type': 'update',
                'refresh_leaf': True,
                # 'reg_lambda': 3,  # L2
                # 'reg_alpha': 3,  # L1
                'silent': False,
            })

            xgb_model = xgb.train(params,
                                  dtrain=xgb.DMatrix(
                                      df_train.drop(drop_target, axis=1),
                                      df_train.target),
                                  num_boost_round=400,
                                  xgb_model=xgb_model)

        df_test['prediction'] = xgb_model.predict(
            xgb.DMatrix(df_test.drop(drop_target, axis=1)))
        # print(df_test[['Date', 'sku', 'target', 'prediction']])

        # xgb_model.save_model('xgb_model_online.model')
        prediction_df = pd.concat([
            prediction_df,
            df_test[['Date', 'sku', 'real_target', 'target', 'prediction']]
        ])

        prev_df_test = df_test.drop(['prediction'], axis=1).copy()

    feature_importances.append(xgb_model.get_fscore())

    prediction_df['real_prediction'] = np.expm1(prediction_df.prediction)
    prediction_df = prediction_df.merge(df_scope,
                                        how='left',
                                        on=['Date', 'sku'])

    if not useTest:
        train = df[~df.target.isna()]
        _, _, val_dates = train_validation_split(train)
        mask_val = (prediction_df.Date.isin(val_dates)) & (prediction_df.scope
                                                           == 1)
        print(
            f'MAPE {MAPE(prediction_df[mask_val].real_target, prediction_df[mask_val].real_prediction)}'
        )

    if save:
        if useTest:
            prediction_df.drop('scope', axis=1).to_csv(
                "../dataset/prediction/test/xgb_inc_test.csv", index=False)
        else:
            if completeCV:
                prediction_df.drop('scope', axis=1).to_csv(
                    "../dataset/prediction/val/xgb_inc_val.csv", index=False)

    plt.figure(figsize=(20, 10))

    feat_imp = {
        k: v
        for k, v in sorted(feature_importances[1].items(),
                           key=lambda item: item[1])
    }

    x = list(feat_imp.keys())
    y = list(feat_imp.values())
    plt.barh(x, y)
    plt.show()
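
The loop above grows a single booster across weekly folds: the first fold trains from scratch, later folds refresh the leaf values of the existing trees with process_type='update'. A minimal, self-contained sketch of that pattern on synthetic data:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X1, y1 = rng.random((200, 5)), rng.random(200)
X2, y2 = rng.random((200, 5)), rng.random(200)

# first fold: ordinary training
booster = xgb.train({'max_depth': 4}, xgb.DMatrix(X1, label=y1), num_boost_round=50)

# later folds: keep the tree structure, refresh the leaf values on the new data
update_params = {'max_depth': 4, 'process_type': 'update', 'updater': 'refresh', 'refresh_leaf': True}
booster = xgb.train(update_params, xgb.DMatrix(X2, label=y2), num_boost_round=50, xgb_model=booster)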
Example #23
def main(model,
         useTest,
         useScope,
         save,
         completeCV,
         dataAugm,
         categorical_features=['cluster', 'sku', 'pack', 'brand'],
         drop_cols=[
             'scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster'
         ],
         cluster=None,
         name='',
         useSampleWeights=True,
         weights_type=2,
         isEvaluation=False,
         rand_noise=False):

    train = pd.read_csv("dataset/original/train.csv")
    test = pd.read_csv("dataset/original/x_test.csv")

    useTest = useTest
    useScope = useScope
    isEvaluation = isEvaluation
    useSampleWeights, weights_type = useSampleWeights, weights_type
    save = save

    completeCV = completeCV  # To get predictions on the train set, set this to True: it starts from the first week
    # of the train set and predicts every following week, growing the training set incrementally

    dataAugm = dataAugm  # Creates the 2016 data: recommended to set True when completeCV = True, so the algorithm
    # does not train on only the first week of the original train set but on all of 2016 [52 weeks]
    rand_noise = rand_noise

    if isEvaluation:
        useTest = False
        useScope = False

    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train,
                       test,
                       useTest=useTest,
                       dataAugmentation=dataAugm,
                       rand_noise=rand_noise)

    df, categorical_f = add_all_features(df)
    #categorical_f = ['sku', 'pack', 'brand'] + categorical_f
    categorical_f = categorical_features
    drop_cols = drop_cols

    df = df.sort_values('Date')

    #   --------------- Model -----------------

    #drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']
    categorical_f = [x for x in categorical_f if x not in drop_cols]

    # df = ohe_categorical(df, [c for c in categorical_f if c != 'sku'])    # Use this to one-hot encode the categorical features

    # CLUSTER = [1,2,3]      # Set CLUSTER = None if you do not want to restrict to any cluster
    CLUSTER = cluster
    NAME = name

    print('Start the model ' + NAME)
    model = model
    model_gen = Generator(
        df,
        model,
        categorical_features=categorical_f,
        drop_columns=drop_cols,
        isScope=useScope,
        sample_weights_type=weights_type,
        evaluation=isEvaluation,
        useTest=useTest,
        cluster=CLUSTER,
        name=NAME,
        completeCV=completeCV,
        dataAugmentation=dataAugm,
    )

    model_gen.run_generator(save)

    print(model_gen.compute_MAPE())
Example #24
completeCV = False  # To get predictions on the train set, set this to True: it starts from the first week
# of the train set and predicts every following week, growing the training set incrementally

dataAugm = False  # Creates the 2016 data: recommended to set True when completeCV = True, so the algorithm
# does not train on only the first week of the original train set but on all of 2016 [52 weeks]

if isEvaluation:
    useTest = False
    useScope = False

if completeCV:
    useTest = False
    useScope = False

df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)

df, categorical_f = add_all_features(df)
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

df = df.sort_values('Date')

#   --------------- Model -----------------

drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']
categorical_f = [x for x in categorical_f if x not in drop_cols]

#CLUSTER = [1,2,3]      # Set CLUSTER = None if you do not want to restrict to any cluster
CLUSTER = None
NAME = 'lightgbm'
params = {
Example #25
def run_gte_feature():
    abs_path = Path(__file__).absolute().parent
    train_path = os.path.join(abs_path, "../dataset/original/train.csv")
    test_path = os.path.join(abs_path, "../dataset/original/x_test.csv")

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    df = preprocessing(train, test, useTest=True, dataAugmentation=True)
    # df, categorical_f = add_all_features(df)
    df_cluster = get_cluster()
    df = df.merge(df_cluster, how='left', on='sku')

    def simple_gen(df):
        df = df.sort_values('Date')
        dates = df[df.Date >= '2016-12-10']['Date'].drop_duplicates().values
        dates = dates[1:]
        for d in dates:
            yield df[df.Date < d], df[df.Date == d]

    gen = simple_gen(df)

    group_and_priors = {
        ('pack'): None,
        ('brand'): None,
        ('cluster'): None,
        ('pack', 'brand'): ['gte_pack', 'gte_brand'],
        ('pack', 'cluster'): ['gte_pack', 'gte_cluster'],
        ('brand', 'cluster'): ['gte_brand', 'gte_cluster'],
        ('pack', 'brand', 'cluster'): ['gte_pack_brand', 'gte_pack_cluster'],
    }
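    # Note: ('pack'), ('brand') and ('cluster') above are plain strings, not one-element tuples
    # (no trailing comma); the isinstance(group_cols, str) check below handles that case.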

    df_gte = pd.DataFrame()

    window = 8
    prior_precision = 50
    for t, v in tqdm(gen):
        date = v.Date.drop_duplicates(keep='first')
        features = []
        for group_cols, prior_cols in group_and_priors.items():
            if isinstance(group_cols, str):
                f_name = "gte_" + group_cols
            else:
                f_name = "gte_" + '_'.join(group_cols)
            features.append(f_name)
            gte = GaussianTargetEncoder(group_cols, 'target', prior_cols)

            dates = t.Date.drop_duplicates()
            if len(dates) > window:
                t = t[t.Date.isin(dates[-window:])]

            #print(f'Encoding Train: days < {date} : rows {t.shape[0]}')
            t.loc[:, features[-1]] = gte.fit_transform(
                t, prior_precision=prior_precision, window=window)

            #print(f'Encoding Validation = {date} \n')
            v.loc[:, features[-1]] = gte.transform(
                v, prior_precision=prior_precision)
        df_gte = pd.concat([df_gte, v])

    gte_cols = [x for x in df_gte.columns if 'gte' in x]

    save_path = os.path.join(
        abs_path, f"gte_features_w{window}_prp{prior_precision}.csv")
    df_gte[['Date', 'sku', 'target', 'real_target'] + gte_cols].to_csv(
        save_path, index=False)
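
GaussianTargetEncoder's implementation is not shown here; encoders of this kind typically shrink each group's mean target toward a prior mean, with prior_precision acting as a pseudo-count. A stand-alone sketch of that idea (an assumption about the formula, not the project's exact code):

import pandas as pd

def smoothed_group_mean(df, group_col, target_col, prior_precision=50):
    # blend each group's mean target with the global mean, weighted by group size
    prior = df[target_col].mean()
    stats = df.groupby(group_col)[target_col].agg(['mean', 'count'])
    return (stats['count'] * stats['mean'] + prior_precision * prior) / (stats['count'] + prior_precision)

# e.g. df['gte_pack'] = df['pack'].map(smoothed_group_mean(df, 'pack', 'target'))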