Example #1
def main(targets):
    '''
    Reads the list of targets and runs the corresponding pipeline stages.
    '''

    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        load(**cfg)

        cfg = load_params(CLEAN_PARAMS)
        clean_data(**cfg)

    # make the test target
    if 'test-project' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        load(**cfg)

        cfg = load_params(TEST_CLEAN_PARAMS)
        clean_data(**cfg)

        cfg = load_params(TEST_FEATURE_PARAMS)
        make_features(**cfg)

        cfg = load_params(TEST_MODEL_PARAMS)
        driver(**cfg)

    # make the full data target
    if 'full-project' in targets:
        cfg = load_params(DATA_PARAMS)
        load(**cfg)

        cfg = load_params(CLEAN_PARAMS)
        clean_data(**cfg)

        cfg = load_params(FEATURE_PARAMS)
        make_features(**cfg)

        cfg = load_params(MODEL_PARAMS)
        driver(**cfg)

    # if data is cleaned and just model pipeline is to be run
    if 'model' in targets:
        cfg = load_params(TEST_MODEL_PARAMS)
        driver(**cfg)

    return
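
For context, a target-driven main() like this is normally invoked from the command line. A minimal sketch of such an entry point (the run.py framing is an assumption, not shown in the example):

import sys

if __name__ == '__main__':
    # e.g. `python run.py data` or `python run.py test-project clean`
    main(sys.argv[1:])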
Example #2
def prepare_data(normalize):
    data = features.make_features(normalize).dropna()
    y = data["label"]
    X = data.drop(["label"], axis=1)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    print(X)
    return X, y
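
As a rough usage sketch, the standardized matrix returned by prepare_data could feed any scikit-learn estimator; the split and the model below are illustrative assumptions, not part of the original code:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = prepare_data(normalize=True)
# hold out 20% of rows for a quick sanity check (illustrative choice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))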
Example #3
def train(dataset, labels):
    """ """
    pollutants = ["NO2", "PM10", "PM25"]
    # split the dataset and build the data dict: one frame per pollutant
    ds = dict(zip(pollutants, split_pollutant_dataset(dataset)))
    # build features dict
    f = {}
    for poll in pollutants:
        f[poll] = {}
        f[poll]["X"] = make_features(ds[poll], **features_config[poll])
        f[poll]["Y"] = get_Y(labels, ds[poll])
    # train model for each pollutant
    model_dict = {}
    for poll in pollutants:
        xgb_model = xgb.XGBRegressor(max_depth=6,
                                     n_estimators=200,
                                     reg_lambda=1)
        # train model
        xgb_model.fit(f[poll]["X"], f[poll]["Y"])
        # mse on training set
        y_pred = xgb_model.predict(f[poll]["X"])
        mse = mean_squared_error(f[poll]["Y"], y_pred)
        print("%s: MSE on training set: %.3f" % (poll, mse))
        # store model
        model_dict[poll] = xgb_model
    # return model dict
    return model_dict
Example #4
def main(train_file, val_file, test_file, feature_file):
    # train
    data = import_formatted_data(train_file)
    X, y = split_x_y(data)
    transformed_X = make_features(X)

    with open(feature_file, "r") as f:
        features = [int(x) for x in f.readline().strip().split(" ")]

    selected_X = transformed_X[:, features]

    mod = SGDRegressor(max_iter=5000, penalty="l1")
    mod.fit(selected_X, y)
    train_preds = mod.predict(selected_X)

    val_data = import_formatted_data(val_file)
    val_X, val_y = split_x_y(val_data)
    transformed_val_X = make_features(val_X)
    selected_val_X = transformed_val_X[:, features]
    val_predictions = mod.predict(selected_val_X)

    test_data = import_formatted_data(test_file)
    test_X, test_y = split_x_y(test_data)
    transformed_test_X = make_features(test_X)
    selected_test_X = transformed_test_X[:, features]
    test_predictions = mod.predict(selected_test_X)

    print "Training"
    print "MSE:", mean_squared_error(y, train_preds)
    print "Mean, variance of real y:", mean(y), var(y)
    print "Mean, variance of pred y:", mean(train_preds), var(train_preds)

    print "Validation"
    print "MSE:", mean_squared_error(val_y, val_predictions)
    print "Mean, variance of real y:", mean(val_y), var(val_y)
    print "Mean, variance of Pred y:", mean(val_predictions), var(
        val_predictions)

    print "Test"
    print "MSE:", mean_squared_error(test_y, test_predictions)
    print "Mean, variance of real y:", mean(test_y), var(test_y)
    print "Mean, variance of Pred y:", mean(test_predictions), var(
        test_predictions)
Example #5
def predict(model_dict, dataset):
    """ """
    # split dataset
    NO2_df, PM10_df, PM25_df = split_pollutant_dataset(dataset)
    # build features
    NO2_f = make_features(NO2_df, **features_config["NO2"])
    PM10_f = make_features(PM10_df, **features_config["PM10"])
    PM25_f = make_features(PM25_df, **features_config["PM25"])
    # apply each model
    Y_pred_NO2 = pd.DataFrame(model_dict["NO2"].predict(NO2_f),
                              columns=["TARGET"],
                              index=NO2_f.index)
    Y_pred_PM10 = pd.DataFrame(model_dict["PM10"].predict(PM10_f),
                               columns=["TARGET"],
                               index=PM10_f.index)
    Y_pred_PM25 = pd.DataFrame(model_dict["PM25"].predict(PM25_f),
                               columns=["TARGET"],
                               index=PM25_f.index)
    # concatenate result
    Y_pred = pd.concat([Y_pred_NO2, Y_pred_PM10, Y_pred_PM25], axis=0)
    # return
    return Y_pred
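
Examples #3 and #5 pair naturally: train one model per pollutant, then apply the returned dict to unseen data. A minimal sketch, assuming train_df, labels and test_df follow the schema expected by split_pollutant_dataset:

# hypothetical end-to-end use of train() (example #3) and predict() (above)
model_dict = train(train_df, labels)    # one fitted XGBRegressor per pollutant
Y_pred = predict(model_dict, test_df)   # single-column "TARGET" DataFrame
Y_pred.to_csv("predictions.csv")        # output file name is illustrative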
Example #6
def train_pipeline(training_pipeline_params: TrainingPipelineParams):
    logger.info(f"Start training with params: {training_pipeline_params}")

    load_data(training_pipeline_params.input_data_path,
              training_pipeline_params.input_data_url)
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"Raw data shape: {data.shape}")

    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)
    logger.info(f"Train df shape: {train_df.shape}")
    logger.info(f"Val df shape: {val_df.shape}")

    pipeline = build_transformer(training_pipeline_params.feature_params)
    pipeline.fit(train_df)
    logger.info(f"Transform fitted.")

    train_features = make_features(pipeline, train_df)
    train_target = extract_target(train_df,
                                  training_pipeline_params.feature_params)
    logger.info(f"Train features shape: {train_features.shape}")

    val_features = make_features(pipeline, val_df)
    val_target = extract_target(val_df,
                                training_pipeline_params.feature_params)
    logger.info(f"Val features shape: {train_features.shape}")

    model = get_model(training_pipeline_params.train_params)
    model = train_model(train_features, train_target, model)
    logger.info(f"Model trained.")

    predictions = predict_model(val_features, model)

    metrics = evaluate_model(predictions, val_target)

    path_to_model = save_artifacts(metrics, model, pipeline,
                                   training_pipeline_params)

    return path_to_model, metrics
Example #7
def dataflow(X, y=None, cmd_plot=False):
    '''
    Primary function responsible for predictions and GUI output from a pre-processed file.
    Returns signals used for plotting of features as well as generated summary statistics.
    '''
    epochs = epochs_from_prep(X.copy(),
                              None,
                              settings.EPOCH_LENGTH,
                              settings.OVERLAP_FACTOR,
                              settings.SAMPLE_RATE,
                              filter=False,
                              removal=True)
    epochs = dataset(epochs, shuffle=False, exclude_ptt=False,
                     only_rwa=True).epochs
    epochs = gru(load_graph=True, path=settings.BEST_MODEL).predict(epochs)
    epochs.sort(key=lambda x: x.index_start, reverse=False)
    yhat, timecol = reconstruct(X, epochs)
    full = epochs_from_prep(X,
                            None,
                            settings.EPOCH_LENGTH,
                            settings.OVERLAP_FACTOR,
                            settings.SAMPLE_RATE,
                            filter=False,
                            removal=False)
    full.sort(key=lambda x: x.index_start, reverse=False)
    wake, nrem, rem, illegal = timeseries(full)
    summary = summary_statistics(timecol, yhat, wake, nrem, rem, illegal)
    X, y, mask = make_features(X, y, settings.SAMPLE_RATE, removal=False)
    X = transpose(X)
    ss = X[6].copy().astype(float)
    for i, _ in enumerate(ss):
        if X[7, i]:
            ss[i] = 2.0
        elif X[5, i]:
            ss[i] = 0.0
    data = (X[0] / settings.SAMPLE_RATE,
            [X[1], X[2], X[3], X[4], ss, yhat],
            ['RR', 'RWA', 'PTT', 'PWA', 'Sleep stage', 'Arousals'],
            region(X[5]), region(X[7]),
            None, None,
            int(X[0, -1] / settings.SAMPLE_RATE))
    if cmd_plot:
        d = list(data)
        if y is not None:
            d[1] += [y]
            d[2] += ['y']
            d[2][5] = 'yhat'
        plot_results(*d)
    return data, summary
Example #8
def predict_pipeline(params: PredictionPipelineParams):
    logger.info(f"Start prediction.")

    data = read_data(params.data_path)
    logger.info(f"Data loaded. Raw data shape: {data.shape}")

    pipeline = load_transformer(params.transformer_path)
    logger.info(f"Transformer loaded: {pipeline}")

    model = load_model(params.model_path)
    logger.info(f"Model loaded: {model}")

    test_features = make_features(pipeline, data)
    logger.info(f"Test features shape: {test_features.shape}")

    predictions = predict_model(test_features, model)
    predictions_path = save_prediction(predictions, params.output_path)
    logger.info(f"Predictions saved in {predictions_path}")
Example #9
labels_ph = tf.placeholder(tf.int32, (None))
wav_ph = tf.placeholder(tf.float32, (None, sample_rate))
bg_wavs_ph = tf.placeholder(tf.float32, [None, sample_rate])

keep_prob = tf.placeholder(tf.float32)  # will be 0.5 for training, 1 for test
learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
is_training_ph = tf.placeholder(tf.bool)
use_full_layer = tf.placeholder(tf.bool)
slow_down = tf.placeholder(tf.bool)
# scale_means_ph = tf.placeholder(tf.float32)
# scale_stds_ph = tf.placeholder(tf.float32)

processed_wavs = pp.tf_preprocess(wav_ph, bg_wavs_ph, is_training_ph,
                                  slow_down, extreme=FLAGS.extreme_time)

features = make_features(processed_wavs, is_training_ph, FLAGS.features)

output_neurons = len(all_words) if style == "full" else len(wanted_words)
full_output_neurons = len(all_words)
final_layer, full_final_layer, open_max_layer = make_model(
    FLAGS.model, features, keep_prob, output_neurons, full_output_neurons,
    is_training_ph)

final_layer = tf.cond(use_full_layer, lambda: full_final_layer, lambda: final_layer)

probabilities = tf.nn.softmax(final_layer)

loss_mean = tf.losses.sparse_softmax_cross_entropy(labels=labels_ph, logits=final_layer)
# full_loss_mean = tf.losses.sparse_softmax_cross_entropy(labels=labels_ph,logits=full_final_layer)

total_loss = tf.losses.get_total_loss()

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
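
The graph above stops at the total loss; in TF1 the next step is usually to build the train op under the collected batch-norm update ops. A hedged sketch of that continuation (the optimizer choice is an assumption, the original is not shown):

# hypothetical continuation: training op and a simple accuracy metric
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(learning_rate_ph).minimize(total_loss)

predicted_labels = tf.argmax(final_layer, axis=1, output_type=tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_labels, labels_ph), tf.float32))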
Example #10
File: main.py Project: vpellegrain/Plume
NO2_df, PM10_df, PM25_df = split_pollutant_dataset(df)

# split in train / dev for each pollutant
NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train,
                                     zone_station_dev)
PM10_train, PM10_dev = split_train_dev(PM10_df, zone_station_train,
                                       zone_station_dev)
PM25_train, PM25_dev = split_train_dev(PM25_df, zone_station_train,
                                       zone_station_dev)

# make features and get labels

# NO2
NO2_train_f, NO2_dev_f = make_features(NO2_train,
                                       NO2_dev,
                                       normalize=False,
                                       rolling_mean=True,
                                       deltas=[12])
Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)
X_NO2 = pd.concat([NO2_train_f, NO2_dev_f], axis=0, copy=False)
Y_NO2 = pd.concat([Y_NO2_train, Y_NO2_dev], axis=0, copy=False)
NO2_test_fold = build_test_fold(Y_NO2_train, Y_NO2_dev)

# PM10
PM10_train_f, PM10_dev_f = make_features(PM10_train, PM10_dev)
Y_PM10_train = get_Y(Y, PM10_train)
Y_PM10_dev = get_Y(Y, PM10_dev)
X_PM10 = pd.concat([PM10_train_f, PM10_dev_f], axis=0, copy=False)
Y_PM10 = pd.concat([Y_PM10_train, Y_PM10_dev], axis=0, copy=False)
PM10_test_fold = build_test_fold(Y_PM10_train, Y_PM10_dev)
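
The concatenated X/Y plus a fold vector is the input shape scikit-learn's PredefinedSplit expects (train rows marked -1, dev rows marked 0); assuming build_test_fold follows that convention, a hedged sketch of using it for model selection:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# hypothetical use of the fold vector; whether build_test_fold follows the
# PredefinedSplit convention is an assumption
cv = PredefinedSplit(test_fold=NO2_test_fold)
search = GridSearchCV(RandomForestRegressor(),
                      param_grid={"n_estimators": [100, 200]},
                      scoring="neg_mean_squared_error",
                      cv=cv)
search.fit(X_NO2, Y_NO2)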
Example #11
def train_predict(train,
                  test,
                  Y_train,
                  model_dict=None,
                  output_path=None,
                  pm=False,
                  model="rf"):
    """ """
    pollutants = ["NO2", "PM"] if pm else ["NO2", "PM10", "PM25"]
    print("%i regressor will be trained for each pollutant of %s" %
          (len(pollutants), pollutants))
    # split dataset, build data dict: one frame per pollutant
    train_ds = dict(zip(pollutants, split_pollutant_dataset(train, pm)))
    test_ds = dict(zip(pollutants, split_pollutant_dataset(test, pm)))
    # build features dict
    f = {}
    for poll in pollutants:
        f[poll] = {}
        f[poll]["X_train"], f[poll]["X_test"] = make_features(
            train_ds[poll], dev=test_ds[poll], **features_config[poll])
        if Y_train is not None:
            f[poll]["Y"] = get_Y(Y_train, train_ds[poll])
    # train model for each pollutant
    if model_dict is None:
        model_dict = {}
        for poll in pollutants:
            # shuffle X,Y
            X, Y = shuffle_XY(f[poll]["X_train"], f[poll]["Y"])
            # init model
            if model == "rf":
                reg = RandomForestRegressor(**rf_config)
            else:
                reg = xgb.XGBRegressor(max_depth=6, **xgb_config[poll])
            # train model
            print("Training a %s model on pollutant %s ..." % (model, poll))
            reg.fit(X, Y)
            print("Training done on %s" % poll)
            # store model
            model_dict[poll] = reg
        if output_path is not None:
            print("Saving the dictionnary of models in %s" % output_path)
            with open(output_path, "wb") as fout:
                pickle.dump(model_dict, fout)
    # predict on train set
    preds = []
    for poll in pollutants:
        # mse on training set
        Y_pred_poll = pd.DataFrame(model_dict[poll].predict(
            f[poll]["X_train"]),
                                   columns=["TARGET"],
                                   index=f[poll]["X_train"].index)
        preds.append(Y_pred_poll)
        mse = mean_squared_error(f[poll]["Y"], Y_pred_poll)
        print("%s: MSE on training set: %.3f" % (poll, mse))
    # concat and compute global MSE
    Y_pred = pd.concat(preds, axis=0).sort_index()
    mse = mean_squared_error(Y_train, Y_pred)
    print("GLOBAL MSE on training set: %.3f" % mse)
    # predict on test set
    print("Computing prediction on test data...")
    preds = []
    for poll in pollutants:
        Y_pred_poll = pd.DataFrame(model_dict[poll].predict(f[poll]["X_test"]),
                                   columns=["TARGET"],
                                   index=f[poll]["X_test"].index)
        preds.append(Y_pred_poll)
    # concatenate pred for each pollutant and sort index
    Y_pred = pd.concat(preds, axis=0).sort_index()
    print("Prediction done.")
    #
    return Y_pred
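
A hedged call sketch for train_predict, assuming train_df, test_df and Y_train follow the same schema as the other Plume examples:

# hypothetical call; output path and model choice are illustrative
Y_test_pred = train_predict(train_df, test_df, Y_train,
                            output_path="models.pkl", model="xgb")
Y_test_pred.to_csv("submission.csv")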
Example #12
# training
model.fit(PM25_seq_stat_train,
          Y_seq_PM25_train,
          nb_epoch=20,
          batch_size=32,
          verbose=2,
          validation_data=(PM25_seq_stat_dev, Y_seq_PM25_dev),
          callbacks=[tensordboard_cb])

# compare xgboost

NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train,
                                     zone_station_dev)

NO2_train_f, NO2_dev_f = make_features(NO2_train,
                                       NO2_dev,
                                       rolling_mean=True,
                                       deltas=[24, 36, 48, 96])
Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)

PM25_train, PM25_dev = split_train_dev(PM25_df, zone_station_train,
                                       zone_station_dev)

PM25_train_f, PM25_dev_f = make_features(PM25_train,
                                         PM25_dev,
                                         rolling_mean=True,
                                         deltas=[24, 36, 48, 96])
Y_PM25_train = get_Y(Y, PM25_train)
Y_PM25_dev = get_Y(Y, PM25_dev)

import xgboost as xgb
Example #13
shift_config = {
    "temperature": [8, 14, 20, 96],
    "cloudcover": [2, 5, 48],
    "pressure": [2, 24, 72],
    "windbearingsin": [2, 6],
    "windbearingcos": [6, 6],
    "windspeed": [2, 4]
}

NO2_train, NO2_dev = split_train_dev(NO2_df, zone_station_train, zone_station_dev)


NO2_train_f, NO2_dev_f = make_features(
    NO2_train, NO2_dev,
    rolling_mean=True, roll_mean_conf=roll_mean_conf,
    # shift_config=shift_config,
    temp_dec_freq=12, log=False,
    remove_temporal=True,
    rolling_std=True,
    deltas_std=[24, 48, 96, 120])

Y_NO2_train = get_Y(Y, NO2_train)
Y_NO2_dev = get_Y(Y, NO2_dev)

# xgboost
xgb_model = xgb.XGBRegressor(max_depth=7, n_estimators=200, reg_lambda=1)

xgb_model.fit(NO2_train_f, Y_NO2_train,
              eval_set=[(NO2_dev_f, Y_NO2_dev)],
              eval_metric="rmse")

evaluate_mse(xgb_model, NO2_train_f, NO2_dev_f,
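
shift_config above maps each weather variable to a list of lags (in time steps). make_features itself is not shown, but a minimal sketch of building such lagged columns with pandas could look like this; the column names and the station grouping key are assumptions:

def add_shift_features(df, shift_config, group_col="station_id"):
    """Add a '<col>_lag<k>' column to a time-ordered pandas DataFrame
    for every lag k listed in shift_config."""
    out = df.copy()
    for col, lags in shift_config.items():
        for lag in lags:
            # shift within each station so lags never leak across stations
            out[f"{col}_lag{lag}"] = out.groupby(group_col)[col].shift(lag)
    return out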
Example #14
if __name__ == "__main__":
    if len(argv) != 5:
        print """USAGE: format_data.py <target sequence index> <guide sequence index> <label sequence index>
        Writes four files name raw.tab, train.tab, val.tab, test.tab. Train, valm and test have expanded data with
        features, while raw will just contain the target, guide, and label."""
        exit()
    filename, target_i, guide_i, label_i = argv[1:5]
    target_i, guide_i, label_i = int(target_i), int(guide_i), int(label_i)

    data = import_azimuth_data(filename, target_i, guide_i,
                               label_i)  # Target, guide, label

    mix_data = shuffle_data(data)

    feature_data = make_features(mix_data)

    train, val, test = split_train_val_test(feature_data)

    with open("raw.tab", "w") as f:
        for row in data:
            entry = [str(x) for x in row]
            f.write("\t".join(entry) + "\n")

    with open("train.tab", "w") as f:
        for row in train:
            entry = [str(x) for x in row]
            f.write("\t".join(entry) + "\n")

    with open("val.tab", "w") as f:
        for row in val:
Example #15
import sys
from features import make_features
import scipy.io.wavfile as wav
import numpy as np
from keras.models import model_from_json

folder_name = "5second"

# read audio file and calculate mfcc
audio_file_name = sys.argv[1]
(rate, sig) = wav.read(audio_file_name)  # read the file passed on the command line
mfcc_feat = make_features(sig,
                          rate,
                          winlen=0.1,
                          winstep=0.05,
                          lowfreq=50,
                          highfreq=5000)
mfcc_feat = np.nan_to_num(mfcc_feat)  # guard against NaN values in the MFCC matrix

# create feature
img_rows, img_cols = 401, 13
X_test = np.array([], dtype='float32')
mfcc_feat = mfcc_feat[0:401, :]
image = np.array([mfcc_feat])
if len(image) == 1:
    if len(X_test) == 0:
        X_test = np.array([image])
    else:
        X_test = np.vstack([X_test, np.array([image])])

X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
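
The snippet prepares X_test but stops before the imported model_from_json is used; a hedged continuation, with the architecture and weight file names as assumptions based on folder_name:

# hypothetical continuation: load the serialized Keras model and classify the clip
with open(folder_name + "_model.json") as f:
    model = model_from_json(f.read())
model.load_weights(folder_name + "_weights.h5")

probs = model.predict(X_test)
print("predicted class:", np.argmax(probs, axis=1))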
Example #16
def main():

    all_data = compile_and_normalize()

    mix_data = shuffle_data(all_data)

    feature_data = make_features(mix_data)

    train_data, val_data, test_data = split_train_val_test(feature_data)


    modelfiles = [
        "data/CRISPOR_readFraction_off_target/CRISPOR_readFraction_off_target.joblib",
        "data/Azimuth/Azimuth.joblib",
        "data/Res6tg/Res6tg.joblib",
        "data/Rule_set_1_log2change_on_target/Rule_set_1_log2change_on_target.joblib"
    ]

    featurefiles = [
        "data/CRISPOR_readFraction_off_target/CRISPOR_readFraction_off_target_features.txt",
        "data/Azimuth/Azimuth_features.txt",
        "data/Res6tg/Res6tg_features.txt",
        "data/Rule_set_1_log2change_on_target/Rule_set_1_log2change_on_target_features.txt"
    ]

    ensemble = []

    for modelfile, featurefile in zip(modelfiles, featurefiles):
        print(modelfile)
        print(featurefile)
        model = load(modelfile)

        features = get_features(featurefile)

        ensemble.append((model,features))



    train_x, train_y = split_x_y(train_data)
    val_x, val_y = split_x_y(val_data)
    test_x, test_y = split_x_y(test_data)

    featureselector = SelectFromModel(RandomForestRegressor(), max_features=100)
    regressor = RandomForestRegressor()

    featureselector.fit(train_x, train_y)
    features = featureselector.get_support(indices=True)

    selected_train_x = train_x[:, features]
    selected_val_x = val_x[:, features]
    selected_test_x = test_x[:,features]

    regressor.fit(selected_train_x, train_y)

    train_predictions = regressor.predict(selected_train_x)
    val_predictions = regressor.predict(selected_val_x)
    test_predictions = regressor.predict(selected_test_x)

    train_error = mean_squared_error(train_y, train_predictions)
    validation_error = mean_squared_error(val_y, val_predictions)
    test_error = mean_squared_error(test_y, test_predictions)

    for model, features in ensemble:
        add_train_x = add_model_feature(model, features, train_x)
        add_val_x = add_model_feature(model, features, val_x)
        add_test_x = add_model_feature(model, features, test_x)

    #model, features = select_features_and_model(add_train_x, train_y, add_val_x, val_y,"all_data_toplayer")

    modelfile = "all_data_toplayer.joblib"
    featurefile = "all_data_toplayer_features.txt"
    model = load(modelfile)
    features = get_features(featurefile)

    train_pred = model.predict(add_train_x[:,features])
    val_pred = model.predict(add_val_x[:,features])
    test_pred = model.predict(add_test_x[:,features])

    train_MSE = mean_squared_error(train_y, train_pred)
    val_MSE = mean_squared_error(val_y, val_pred)
    test_MSE = mean_squared_error(test_y, test_pred)

    with open("all_data_top_layer_MSE.txt", "w") as f:
        f.write("ensemble train MSE: " + str(train_MSE) + "\n")
        f.write("ensemble val MSE: " + str(val_MSE) + "\n")
        f.write("ensemble test MSE: " + str(test_MSE) + "\n")
        f.write("train MSE: " + str(train_error) + "\n")
        f.write("val MSE: " + str(validataion_error) + "\n")
        f.write("test MSE: " + str(test_error) + "\n")
Example #17
import features
import parse_scores as parse

score_path = 'score_data/'
profile_path = 'profile_data/'

records = parse.parse_score_file(score_path + 'franco_scores.csv')
features = features.make_features(profile_path +
                                  'data_analysis_random_profiles.json')

record_dict = {}
for record in records:
    record_dict[(record[0], record[1])] = [record[2], None]
for feature in features:
    if feature.id in record_dict.keys():
        record_dict[feature.id][1] = feature

id_dict = {}
for key in record_dict.keys():
    profile_id = key[0]
    if profile_id in id_dict.keys():
        id_dict[profile_id] += [record_dict[key]]
    else:
        id_dict[profile_id] = [record_dict[key]]

id_groups = [[pair for pair in group if pair[1]] for group in id_dict.values()]
data = [pair for pair in record_dict.values() if pair[1]]
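
At this point data holds the [score, feature] pairs whose profiles were matched; a small continuation sketch separating targets from feature objects for later model fitting:

# split matched pairs into scores (targets) and their feature objects
scores = [score for score, feature in data]
matched_features = [feature for score, feature in data]
print("matched records:", len(data))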