Example #1
def main():
    """
    """

    #a = mlfdb.mlfdb()
    a = db.mlfdb()
    io = _io.IO()
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    dst_dataset = options.dataset+'-validation'
    logging.info('Splitting time range {} - {} from {} to {} as validation set'.format(starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'), options.dataset, dst_dataset))

    _, param_names = io.read_parameters(options.parameters, drop=2)
    param_names += ['count_flash', 'precipitation3h', 'precipitation6h']
    meta_columns = ['loc_id', 'time', 'lon', 'lat']

    count = 0
    day_step = 1
    hour_step = 0

    start = starttime
    end = start + timedelta(days=day_step, hours=hour_step)
    if end > endtime: end = endtime

    while end <= endtime:
        logging.info('Processing time range {} - {}'.format(start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

        try:
            data = a.get_rows(options.dataset,
                              starttime=start,
                              endtime=end,
                              rowtype='feature',
                              return_type='pandas',
                              parameters=param_names)

        except ValueError as e:
            print(e)
            start = end
            end = start + timedelta(days=day_step, hours=hour_step)
            continue

        logging.debug(data.iloc[0:3])
        #logging.debug('Features metadata shape: {} | Features shape: {}'.format(f_metadata.shape, f_data.shape))
        logging.info('Processing {} rows...'.format(len(data)))

        if len(data) > 0:
            count += a.add_rows_from_df(df=data,
                                        _type='feature',
                                        dataset=dst_dataset
                                        )

        start = end
        end = start + timedelta(days=day_step, hours=hour_step)

    logging.info('Inserted {} rows into dataset {}'.format(count, dst_dataset))
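
A minimal, self-contained sketch of the day-by-day windowing pattern used above (the helper name and its defaults are illustrative, not part of the project API):

from datetime import datetime, timedelta

def time_chunks(starttime, endtime, days=1, hours=0):
    """Yield consecutive (start, end) windows covering the range [starttime, endtime]."""
    step = timedelta(days=days, hours=hours)
    start = starttime
    while start < endtime:
        end = min(start + step, endtime)
        yield start, end
        start = end

# Usage: iterate one week in one-day windows
for start, end in time_chunks(datetime(2018, 1, 1), datetime(2018, 1, 8)):
    print(start, '-', end)
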
Example #2
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO()

    starttime = dt.datetime.strptime('2010-01-01', "%Y-%m-%d")
    endtime = dt.datetime.strptime('2019-01-01', "%Y-%m-%d")

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  project=options.project,
                  dataset=options.src_dataset,
                  table=options.src_table)
    data = bq.get_rows()


    # data = bq.get_rows(starttime,
    #                    endtime,
    #                    project=options.project,
    #                    dataset=options.src_dataset,
    #                    table=options.src_table)
    logging.info('Data loaded.')

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=False,
                                train_type_column='train_type'
                                )

    #print(data.shape
    data = io.calc_delay_avg(data)
    data = io.classify(data)
    log_class_dist(data.loc[:,'class'])

    print(options.no_balance)
    balanced_data = data
    if not options.no_balance:
        count = data.groupby('class').size().min()
        balanced_data = pd.concat([data.loc[data['class'] == 0].sample(n=count),
                                   data.loc[data['class'] == 1].sample(n=count),
                                   data.loc[data['class'] == 2].sample(n=count),
                                   data.loc[data['class'] == 3].sample(n=count)])
        print(balanced_data.head(5))
        print(balanced_data.groupby('class').size())

    balanced_data.set_index(['time', 'trainstation'], inplace=True)

    logging.info('Saving data...')
    #print(data)
    bq.dataset_to_table(balanced_data, options.dst_dataset, options.dst_table)
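
The balancing branch above downsamples every class to the size of the rarest one, hard-coded for four classes. A generic, hedged sketch of the same idea for any number of classes (the function name is made up for illustration):

import pandas as pd

def downsample_to_smallest(df, class_col='class', random_state=None):
    """Return a copy of df where every class has as many rows as the rarest class."""
    n = df.groupby(class_col).size().min()
    return (df.groupby(class_col, group_keys=False)
              .apply(lambda g: g.sample(n=n, random_state=random_state)))
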
Example #3
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO()

    times = []
    times.append({
        'starttime': dt.datetime.strptime('2014-01-01', "%Y-%m-%d"),
        'endtime': dt.datetime.strptime('2014-02-01', "%Y-%m-%d")
    })

    times.append({
        'starttime': dt.datetime.strptime('2016-06-01', "%Y-%m-%d"),
        'endtime': dt.datetime.strptime('2016-07-01', "%Y-%m-%d")
    })
    times.append({
        'starttime': dt.datetime.strptime('2017-02-01', "%Y-%m-%d"),
        'endtime': dt.datetime.strptime('2017-03-01', "%Y-%m-%d")
    })

    logging.info('Using times: {}'.format(times))

    for t in times:
        start = t['starttime']
        end = t['endtime']

        logging.info('Processing time range {} - {}'.format(
            start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

        logging.info('Reading data...')
        data = bq.get_rows(start,
                           end,
                           parameters=['*'],
                           project=options.project,
                           dataset=options.src_dataset,
                           table=options.src_table)

        #print(data.shape
        data.set_index(['time', 'trainstation'], inplace=True)
        #print(data)
        bq.dataset_to_table(data, options.dst_dataset, options.dst_table)
Example #4
def main():
    """
    Put labels from csv file to db
    """

    io = oi.IO(gs_bucket=options.gs_bucket)
    a = mlfdb.mlfdb(config_filename=options.db_config_file)

    # sc = pyspark.SparkContext("local")
    # sc = pyspark.SparkContext('spark://q2-m.c.trains-197305.internal:7077')
    # SparkSession.builder.config(conf=SparkConf())
    sc = spark.sparkContext
    #conf = pyspark.SparkConf()
    #conf.setMaster('yarn')
    #sc = pyspark.SparkContext(conf)

    # Remove old dataset
    if options.replace:
        logging.info('Removing old dataset...')
        a.remove_dataset(options.dataset, type='label')

    # Get stations
    stations = io.get_train_stations(filename='data/stations.json')

    locations = []
    names = []
    for name, latlon in stations.items():
        names.append(name)

    ids = a.get_locations_by_name(names)

    # Process files
    count = sc.accumulator(0)
    if options.filename is not None:
        files = [options.filename]
    else:
        files = io.get_files_to_process('data', 'csv')

    logging.info('Processing files: {}'.format(','.join(files)))

    sc.parallelize(files).foreach(lambda filename: count.add(
        process_file(filename, options.dataset, ids, a, io)))

    logging.info('Added {} samples to db'.format(count.value))
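
The accumulator-based counting used above, reduced to a minimal standalone PySpark sketch; the real process_file, mlfdb and IO objects are project-specific, so a stand-in is used here:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('label-import').getOrCreate()
sc = spark.sparkContext

def process_file(filename):
    # Stand-in for the real parser/inserter; returns the number of rows added
    return 1

files = ['data/a.csv', 'data/b.csv']  # placeholder file list
count = sc.accumulator(0)
sc.parallelize(files).foreach(lambda f: count.add(process_file(f)))
print('Added {} samples'.format(count.value))
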
Example #5
def main():
    """
    Get forecasted delay for every station
    """

    io = _io.IO()
    viz = _viz.Viz()

    params, _ = io.read_parameters(options.parameters_filename)
    model = io.load_scikit_model(options.model_file)

    print(params)
    print(len(params))
    print(len(model.feature_importances_))
    fname = 'results/manual/rfc_feature_importance.png'
    viz.rfc_feature_importance(model.feature_importances_,
                               fname,
                               feature_names=params[2:] +
                               ['precipitation3h', 'precipitation6h'],
                               fontsize=18)
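
viz.rfc_feature_importance is a project helper that is not shown here; a rough matplotlib equivalent, purely as an assumption of what such a plotting routine could look like:

import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(importances, feature_names, filename, fontsize=18):
    """Horizontal bar chart of feature importances, largest at the top."""
    importances = np.asarray(importances)
    order = np.argsort(importances)
    plt.figure(figsize=(10, 0.4 * len(importances)))
    plt.barh(np.arange(len(importances)), importances[order])
    plt.yticks(np.arange(len(importances)), np.asarray(feature_names)[order], fontsize=fontsize)
    plt.xlabel('Importance', fontsize=fontsize)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
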
Example #6
def main():
    """
    Main program
    """

    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz(io)

    starttime, endtime = io.get_dates(options)
    #save_path = options.save_path+'/'+options.config_name

    logging.info('Using dataset {}.{} and time range {} - {}'.format(
        options.feature_dataset, options.feature_table,
        starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d')))

    all_param_names = list(
        set(options.label_params + options.feature_params +
            options.meta_params))
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Reading data...')
    bq.set_params(batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  only_winters=options.only_winters)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=[],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data['time'].map(lambda x: x.month)
        options.feature_params.append('month')

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))
        scaled_labels = pd.DataFrame(yscaler.fit_transform(labels),
                                     columns=['delay'])

        non_scaled_data = data.loc[:, options.meta_params + ['class']]
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params]),
                                       columns=options.feature_params)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, fname, fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)

    if options.pca:
        logging.info('Doing PCA analysis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        data = pd.concat([non_processed_data, processed_features], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # Divide data to normal and delayed cases
    data_test = data[(data.loc[:, 'class'] >= options.class_limit)]
    data = data[(data.loc[:, 'class'] < options.class_limit)]

    data_train, data_val = train_test_split(data, test_size=0.33)
    data_train_x = data_train.loc[:, options.feature_params].values
    data_train_y = data_train.loc[:, options.label_params].values
    data_val_x = data_val.loc[:, options.feature_params].values
    data_val_y = data_val.loc[:, options.label_params].values

    # Initialization
    logging.info('Building model...')
    model = convlstm.Autoencoder(data_train_x.shape[1]).get_model()

    losses, val_losses, accs, val_accs, steps = [], [], [], [], []

    boardcb = TensorBoard(log_dir=options.log_dir,
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

    logging.info('Data shape: {}'.format(
        data_train.loc[:, options.feature_params].values.shape))

    history = model.fit(data_train_x,
                        data_train_x,
                        validation_data=(data_val_x, data_val_x),
                        epochs=3,
                        callbacks=[boardcb])  #, batch_size=64)

    history_fname = options.save_path + '/history.pkl'
    io.save_keras_model(options.save_file, history_fname, model,
                        history.history)

    # Reconstruction errors
    logging.info('Plotting reconstruction errors...')

    errors = {}
    logging.info('Train:')
    errors = get_reconst_error(model, data_train_x, data_train_y.ravel(),
                               errors, 'Train')

    logging.info('Validation:')
    errors = get_reconst_error(model, data_val_x, data_val_y.ravel(), errors,
                               'Validation')

    logging.info('Test:')
    data_test_x = data_test.loc[:, options.feature_params].values
    data_test_y = data_test.loc[:, options.label_params].values

    errors = get_reconst_error(model, data_test_x, data_test_y.ravel(), errors,
                               'Test')

    for i in np.arange(4):
        fname = options.output_path + '/reconstruction_error_{}.png'.format(i)
        viz.reconstruction_error(errors, desired_class=i, filename=fname)

    fname = options.output_path + '/reconstruction_error_all.png'
    viz.reconstruction_error(errors, filename=fname)
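
get_reconst_error is referenced but not defined in this snippet. One plausible reading is that it computes the per-sample reconstruction MSE of the autoencoder and stores it together with the class labels; a sketch under that assumption:

import numpy as np

def get_reconst_error(model, X, y, errors, name):
    """Store per-sample autoencoder reconstruction MSE and class labels under errors[name]."""
    X_hat = model.predict(X)
    per_sample_mse = np.mean(np.square(X - X_hat), axis=1)
    errors[name] = {'error': per_sample_mse, 'class': y}
    return errors
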
Example #7
def main():
    """
    Main program
    """

    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)

    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    # In classification, always use class as the label param
    options.label_params = ['class']

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Building model...')
    model = convlstm.Classifier().get_model()

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names)

    data = bq.get_rows()

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    log_class_dist(data.loc[:, 'class'])

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()

        non_scaled_data = data.loc[:, options.meta_params + ['class']]
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params]),
                                       columns=options.feature_params)

        data = pd.concat([non_scaled_data, scaled_features], axis=1)

    if options.pca:
        logging.info('Doing PCA analysis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        data = pd.concat([non_processed_data, processed_features], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    data_train, data_test = train_test_split(data, test_size=0.33)

    # Define model
    batch_size = io.get_batch_size(data_train,
                                   options.pad_strategy,
                                   quantile=options.quantile)
    batch_size = 512
    logging.info('Batch size: {}'.format(batch_size))

    # Initialization
    losses, val_losses, accs, val_accs, steps = [], [], [], [], []

    boardcb = TensorBoard(log_dir=options.log_dir,
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

    logging.info('Data shape: {}'.format(
        data_train.loc[:, options.feature_params].values.shape))
    class_weights = class_weight.compute_class_weight(
        'balanced', np.unique(data_train.loc[:, 'class'].values),
        data_train.loc[:, 'class'].values)
    weights = {}
    i = 0
    for w in class_weights:
        weights[i] = w
        i += 1

    logging.info('Class weights: {}'.format(weights))

    data_gen = TimeseriesGenerator(
        data_train.loc[:, options.feature_params].values,
        to_categorical(data_train.loc[:, 'class'].values),
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    data_test_gen = TimeseriesGenerator(
        data_test.loc[:, options.feature_params].values,
        to_categorical(data_test.loc[:, 'class'].values),
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    logging.info('X batch size: {}'.format(data_gen[0][0].shape))
    logging.info('Y batch size: {}'.format(data_gen[0][1].shape))

    history = model.fit_generator(data_gen,
                                  validation_data=data_test_gen,
                                  epochs=3,
                                  class_weight=weights,
                                  callbacks=[boardcb])  #, batch_size=64)

    model_fname = options.save_path + '/model.json'
    weights_fname = options.save_path + '/weights.h5'
    history_fname = options.save_path + '/history.pkl'
    io.save_model(model_fname, weights_fname, history_fname, model,
                  history.history)

    scores = model.evaluate_generator(data_test_gen)
    i = 0
    error_data = {}
    for name in model.metrics_names:
        logging.info('{}: {:.4f}'.format(name, scores[i]))
        error_data[name] = [scores[i]]
        i += 1

    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    pred_proba = model.predict_generator(data_test_gen)
    pred = np.argmax(pred_proba, axis=1)

    log_class_dist(pred)
    #print(history.history)
    fname = options.output_path + '/learning_over_time.png'
    viz.plot_nn_perf(history.history,
                     metrics={
                         '[%]': {
                             'acc': 'Accuracy',
                             'F1': 'F1 Score',
                             'Precision': 'Precision',
                             'Recall': 'Recall'
                         }
                     },
                     filename=fname)
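
log_class_dist is also not shown; a minimal version that logs the per-class counts and shares could look like this (signature assumed from the call sites):

import logging
import numpy as np

def log_class_dist(classes, labels=np.arange(4)):
    """Log the number of samples and the relative share of each class."""
    classes = np.asarray(classes).ravel()
    for label in labels:
        count = int(np.sum(classes == label))
        share = count / max(len(classes), 1)
        logging.info('Class {}: {} samples ({:.1%})'.format(label, count, share))
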
Example #8
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    print('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    rmses, maes, r2s, vars, start_times, end_times, end_times_obj = [], [], [], [], [], [], []

    start = starttime
    end = endtime
    print('Processing time range {} - {}'.format(
        start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

    try:
        print('Reading data...')
        data = bq.get_rows(start,
                           end,
                           loc_col='trainstation',
                           project=options.project,
                           dataset=options.feature_dataset,
                           table=options.feature_table,
                           parameters=all_param_names)
        data = io.filter_train_type(labels_df=data,
                                    train_types=options.train_types,
                                    sum_types=True,
                                    train_type_column='train_type',
                                    location_column='trainstation',
                                    time_column='time',
                                    sum_columns=['delay'],
                                    aggs=aggs)

        if options.y_avg_hours is not None:
            data = io.calc_running_delay_avg(data, options.y_avg_hours)

        data.sort_values(by=['time', 'trainstation'], inplace=True)

        if options.impute:
            print('Imputing missing values...')
            data.drop(columns=['train_type'], inplace=True)
            data = imputer.fit_transform(data)
            data.loc[:, 'train_type'] = None

        if options.model == 'ard' and len(data) > options.n_samples:
            print('Sampling {} values from data...'.format(options.n_samples))
            data = data.sample(options.n_samples)

        #l_data = data.loc[:,options.meta_params + options.label_params]
        #f_data = data.loc[:,options.meta_params + options.feature_params]

    except ValueError as e:
        f_data, l_data = [], []

    #f_data.rename(columns={'trainstation':'loc_name'}, inplace=True)

    #logging.debug('Labels shape: {}'.format(l_data.shape))
    print('Processing {} rows...'.format(len(data)))
    #assert l_data.shape[0] == f_data.shape[0]

    target = data.loc[:, options.label_params].astype(np.float32).values
    #print(f_data.columns)
    #features = f_data.drop(columns=['loc_name', 'time']).astype(np.float32).values
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.33)

    logging.debug('Features shape: {}'.format(X_train.shape))

    n_samples, n_dims = X_train.shape

    if options.normalize:
        print('Normalizing data...')
        print(X_train)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    if options.pca:
        print('Doing PCA analysis for the data...')
        X_train = ipca.fit_transform(X_train)
        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        X_test = ipca.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    print('Training...')
    print(X_train.shape)
    input_dim = X_train.shape[1]
    #k1 = gpflow.kernels.Matern52(input_dim, lengthscales=0.3)
    #k_seasonal = gpflow.kernels.Periodic(input_dim=input_dim, period=2190, name='k_seasonal')
    #k_small = gpflow.kernels.Periodic(input_dim=input_dim, period=120, name='k_small')
    k_weather = gpflow.kernels.RBF(input_dim=input_dim, ARD=True)
    #k_noise = gpflow.kernels.White(input_dim=input_dim)

    #k = k_seasonal + k_weather + k_noise
    k = k_weather
    Z = np.random.rand(150, input_dim)

    if options.cv:
        logging.info('Doing random search for hyper parameters...')

        param_grid = {"length_scale": [0.1, 1, 2], "whiten": [True, False]}

        model = GP(dim=input_dim, Z=Z)

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        sys.exit()
    else:
        model = GP(dim=input_dim, Z=Z)
        model.fit(X_train.astype(np.float64),
                  y_train.reshape((-1, 1)).astype(np.float64))

        model.save(options.save_file)

        print('Training finished')
        print(model.model)

        #    Z_list = options.z_list.split(',')

        #for size in Z_list:

        #    with tf.Session() as sess:
        #custom_config = gpflow.settings.get_settings()
        #custom_config.verbosity.tf_compile_verb = True

        #with gpflow.settings.temp_settings(custom_config), gpflow.session_manager.get_session().as_default():

        #Z = X_train[::5].copy()
        # Z = np.random.rand(int(size), 19)
        # print('Training with inducing points: {}'.format(Z.shape))
        #
        # # model = gpflow.models.SVGP(X_train.astype(np.float64),
        # #                            y_train.reshape((-1,1)).astype(np.float64),
        # #                            kern=k,
        # #                            likelihood=gpflow.likelihoods.Gaussian(),
        # #                            Z=Z,
        # #                            #Z=X_train.copy(),
        # #                            minibatch_size=100,
        # #                            whiten=options.normalize
        # #                            )
        # #                            #model.likelihood.variance = 0.01
        # #
        # # model.compile(session=sess)
        # # opt = gpflow.train.ScipyOptimizer()
        # # opt.minimize(model)
        #
        # model = GP(dim=19,
        #            Z=Z
        #            )
        # model.fit(X_train.astype(np.float64),
        #           y_train.reshape((-1,1)).astype(np.float64))
        #
        # model.save(options.save_file)
        #
        # print('Training finished')
        # print(model.model)

        #fname=options.output_path+'/svga_performance.png'
        #viz.plot_svga(model, fname)

        # k_long_term = 66.0**2 * RBF(length_scale=67.0)
        # k_seasonal = 2.4**2 * RBF(length_scale=90.0)* ExpSineSquared(length_scale=150, periodicity=1.0, periodicity_bounds=(0,10000))
        # k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
        # k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(noise_level=0.19**2)
        # #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        # kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        #
        # model = GaussianProcessRegressor(kernel=kernel_gpml, #alpha=0,
        #                                  optimizer=None, normalize_y=True)

        # Metrics
        y_pred, var = model.predict_f(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        vars.append(var.mean())
        start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times_obj.append(end)

        print('RMSE: {:.2f}'.format(rmse))
        print('MAE: {:.2f}'.format(mae))
        print('Variance: {:.2f}-{:.2f}'.format(var.min(), var.max()))
        print('R2 score: {:.2f}'.format(r2))

    #io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file)
    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'var': vars,
        'r2': r2s
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)
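
Passing the GP wrapper to RandomizedSearchCV only works if it follows the scikit-learn estimator protocol (constructor params exposed via get_params, plus fit/predict). The project's GP class is not included here, so the skeleton below is an assumption with the gpflow internals left out:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class GPLikeWrapper(BaseEstimator, RegressorMixin):
    """Skeleton of a scikit-learn compatible wrapper; the actual sparse-GP backend is omitted."""

    def __init__(self, dim=1, Z=None, length_scale=1.0, whiten=True):
        # Constructor arguments must be stored under the same names for get_params/set_params
        self.dim = dim
        self.Z = Z
        self.length_scale = length_scale
        self.whiten = whiten

    def fit(self, X, y):
        # A real implementation would build and optimise the gpflow model here
        self.y_mean_ = float(np.mean(y))
        return self

    def predict(self, X):
        # Placeholder: the real model would return the GP posterior mean
        return np.full(len(X), self.y_mean_)
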
Example #9
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    # Get params
    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    io._download_from_bucket(options.save_file, options.save_file)
    logging.info('Loading model from {}...'.format(options.save_file))
    predictor = io.load_scikit_model(options.save_file)

    # Init error dicts
    avg_delay = {}
    avg_pred_delay = {}
    station_count = 0
    all_times = set()

    station_rmse = {}
    station_median_abs_err = {}
    station_r2 = {}

    # If stations are given as argument use them, else use all stations
    logging.info('Loading stations from {}...'.format(options.stations_file))
    stationList = io.get_train_stations(options.stations_file)
    if options.stations is not None:
        stations = options.stations.split(',')
    else:
        stations = stationList.keys()

    # Get data
    #stationName = '{} ({})'.format(stationList[station]['name'], station)
    #logging.info('Processing station {}'.format(stationName))

    # Read data and filter desired train types (ic and commuter)
    logging.info('Loading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset='trains_testset',
                       table='features_1',
                       parameters=all_param_names,
                       locations=stations)

    data = io.filter_train_type(labels_df=data,
                                train_types=['K', 'L'],
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['delay'],
                                aggs=aggs)

    assert len(data) > 0, "Empty data"

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    data.sort_values(by=['time', 'trainstation'], inplace=True)
    logging.info('Processing {} rows...'.format(len(data)))

    # Pick times for creating error time series
    all_times = data.loc[:, 'time'].unique()
    #station_count += 1

    # Pick feature and label data from all data
    l_data = data.loc[:, options.meta_params + options.label_params]
    f_data = data.loc[:, options.meta_params + options.feature_params]

    target = l_data['delay'].astype(np.float64).values.ravel()
    features = f_data.drop(columns=['trainstation', 'time']).astype(
        np.float64).values

    # Get data
    logging.info('Predicting...')
    y_pred = predictor.predict(features)

    # Calculate quantiles
    logging.info('Calculating fractiles...')
    groups, avg, pred = io.pred_fractiles(l_data, y_pred, stationList)

    # Go through stations
    for station in stations:

        data = pred.loc[pred['trainstation'] == station, :]
        times = data.loc[:, 'time']

        if len(data) < 1:
            continue

        group = pred.loc[pred['trainstation'] == station, 'group'].values[0]
        stationName = '{} ({} | Group {})'.format(stationList[station]['name'],
                                                  station, group)

        logging.info('Processing station {} (having {} rows)...'.format(
            station, len(data)))

        logging.info('Calculating errors for given station...')
        rmse = math.sqrt(
            metrics.mean_squared_error(data.loc[:, 'delay'],
                                       data.loc[:, 'pred_delay']))
        median_abs_err = metrics.median_absolute_error(
            data.loc[:, 'delay'], data.loc[:, 'pred_delay'])
        r2 = metrics.r2_score(data.loc[:, 'delay'], data.loc[:, 'pred_delay'])

        # Put errors to timeseries
        station_rmse[station] = rmse
        station_median_abs_err[station] = median_abs_err
        station_r2[station] = r2

        logging.info('RMSE for station {}: {}'.format(stationName, rmse))
        logging.info('Median absolute error for station {}: {}'.format(
            stationName, median_abs_err))
        logging.info('R2 score for station {}: {}'.format(stationName, r2))

        # Create csv and upload it to bucket
        times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in times]
        delay_data = {
            'times': times_formatted,
            'delay': data.loc[:, 'delay'].values,
            'predicted delay': data.loc[:, 'pred_delay'].values,
            'low': data.loc[:, 'pred_delay_low'].values,
            'high': data.loc[:, 'pred_delay_high'].values
        }
        fname = '{}/delays_{}.csv'.format(options.vis_path, station)
        io.write_csv(delay_data, fname, fname)

        # Draw visualisation
        fname = '{}/{}.png'.format(options.vis_path, station)
        viz.plot_delay(times, data.loc[:, 'delay'].values,
                       data.loc[:, 'pred_delay'].values,
                       'Delay for station {}'.format(stationName), fname,
                       data.loc[:, 'pred_delay_low'].values,
                       data.loc[:, 'pred_delay_high'].values)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # Save all station related results to csv and upload them to bucket
    fname = '{}/station_rmse.csv'.format(options.vis_path)
    io.dict_to_csv(station_rmse, fname, fname)
    fname = '{}/station_median_absolute_error.csv'.format(options.vis_path)
    io.dict_to_csv(station_median_abs_err, fname, fname)
    fname = '{}/station_r2.csv'.format(options.vis_path)
    io.dict_to_csv(station_r2, fname, fname)

    # Create timeseries of avg actual delay and predicted delay
    all_times = sorted(list(all_times))
    avg_delay = avg.loc[:, 'avg_delay'].dropna().values.ravel()
    avg_pred_delay = avg.loc[:, 'avg_pred_delay'].dropna().values.ravel()

    # Calculate average over all times and stations
    rmse = math.sqrt(metrics.mean_squared_error(avg_delay, avg_pred_delay))
    median_abs_err = metrics.median_absolute_error(avg_delay, avg_pred_delay)
    r2 = metrics.r2_score(avg_delay, avg_pred_delay)

    logging.info('RMSE for average delay over all stations: {}'.format(rmse))
    logging.info(
        'Median absolute error for average delay over all stations: {}'.format(
            median_abs_err))
    logging.info('R2 score for average delay over all stations: {}'.format(r2))

    # Write average data into file
    avg_errors = {
        'rmse': rmse,
        'mae': median_abs_err,
        'r2': r2,
        'nro_of_samples': len(avg_delay)
    }
    fname = '{}/avg_errors.csv'.format(options.vis_path)
    io.dict_to_csv(avg_errors, fname, fname)

    # Create timeseries of average delay and predicted delays over all stations
    all_times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in all_times]
    delay_data = {
        'times': all_times_formatted,
        'delay': avg_delay,
        'predicted delay': avg_pred_delay
    }

    # write csv
    fname = '{}/avg_delays_all_stations.csv'.format(options.vis_path)
    io.write_csv(delay_data, fname, fname)

    for i in np.arange(0, 3):
        fname = '{}/avg_group_{}.png'.format(options.vis_path, (i + 1))
        times = groups[i].index.values
        if len(times) < 2:
            continue

        g_avg_delay = groups[i].loc[:, 'avg_delay'].values.ravel()
        g_avg_pred_delay = groups[i].loc[:, 'avg_pred_delay'].values.ravel()
        g_avg_pred_delay_low = groups[
            i].loc[:, 'avg_pred_delay_low'].values.ravel()
        g_avg_pred_delay_high = groups[
            i].loc[:, 'avg_pred_delay_high'].values.ravel()

        viz.plot_delay(times, g_avg_delay, g_avg_pred_delay,
                       'Average delay for group {}'.format(i + 1), fname,
                       g_avg_pred_delay_low, g_avg_pred_delay_high)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    # visualise
    fname = '{}/avg_all_stations.png'.format(options.vis_path)
    viz.plot_delay(all_times, avg_delay, avg_pred_delay,
                   'Average delay for all stations', fname)
    io._upload_to_bucket(filename=fname, ext_filename=fname)
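
As a side note, the per-station error loop above could also be written with a pandas groupby; a hedged alternative sketch using the same column names:

import numpy as np
import pandas as pd
from sklearn import metrics

def station_errors(pred_df):
    """Per-station RMSE, median absolute error and R2 from a frame with delay/pred_delay columns."""
    def _errs(g):
        return pd.Series({
            'rmse': np.sqrt(metrics.mean_squared_error(g['delay'], g['pred_delay'])),
            'median_abs_err': metrics.median_absolute_error(g['delay'], g['pred_delay']),
            'r2': metrics.r2_score(g['delay'], g['pred_delay'])
        })
    return pred_df.groupby('trainstation').apply(_errs)
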
Example #10
    pass


options = Options()
options.starttime = '2010-01-01'
options.endtime = '2018-01-01'
options.config_filename = 'cnf/rf.ini'
options.config_name = 'all_params_1'
options.stations_file = 'cnf/stations.json'
options.stations = None  #'PSL,OL,TPE,OV,PM,II,KEM,HKI'
options.gs_bucket = 'trains-data'

_config.read(options)

bq = _bq.BQHandler()
io = _io.IO(gs_bucket=options.gs_bucket)
viz = _viz.Viz()

starttime, endtime = io.get_dates(options)

# Get params
all_param_names = options.label_params + options.feature_params + options.meta_params
aggs = io.get_aggs_from_param_names(options.feature_params)

print('Loading stations from {}...'.format(options.stations_file))
stationList = io.get_train_stations(options.stations_file)
if options.stations is not None:
    stations = options.stations.split(',')
else:
    stations = stationList.keys()
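
Options is a project configuration object; a minimal stand-in that supports the attribute assignments above could be as simple as this (illustrative only, the real class presumably reads its values from an .ini file via _config):

class Options(object):
    """Plain attribute container for run configuration."""
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
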
Example #11
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(options.feature_dataset,
                                                                  starttime.strftime('%Y-%m-%d'),
                                                                  endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'bgm':
        model = BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_process",
                                        n_components=options.n_components)
    elif options.model == 'rfc':
        model = RandomForestClassifier(n_jobs=-1)
    else:
        raise Exception('Model not specified or wrong. Add "model: bgm" to config file.')

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    sum_columns = []
    if options.reason_code_table is not None:
        sum_columns = ['count']

    logging.info('Processing time range {} - {}'.format(starttime.strftime('%Y-%m-%d %H:%M'),
                                                        endtime.strftime('%Y-%m-%d %H:%M')))

    logging.info('Reading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset=options.feature_dataset,
                       table=options.feature_table,
                       parameters=all_param_names,
                       reason_code_table=options.reason_code_table)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    logging.info('Processing {} rows...'.format(len(data)))

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data['time'].map(lambda x: x.month)
        options.feature_params.append('month')

    if options.balance:
        logging.info('Balancing data...')
        count = data.groupby('class').size().min()
        data = pd.concat([data.loc[data['class'] == 0].sample(n=count),
                          data.loc[data['class'] == 1].sample(n=count),
                          data.loc[data['class'] == 2].sample(n=count),
                          data.loc[data['class'] == 3].sample(n=count)])

    target = data.loc[:,options.label_params].astype(np.int32).values.ravel()
    features = data.loc[:,options.feature_params].astype(np.float32).values
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.10)

    logging.debug('Features shape: {}'.format(X_train.shape))
    io.log_class_dist(y_train, np.arange(4))

    n_samples, n_dims = X_train.shape

    if options.normalize:
        logging.info('Normalizing data...')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    if options.pca:
        logging.info('Doing PCA analysis for the data...')
        X_train = ipca.fit_transform(X_train)
        fname = options.output_path+'/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        X_test = ipca.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(X_train.shape))

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        if options.model == 'bgm':
            param_grid = {"n_components": [1, 2, 4, 8, 16],
                          "covariance_type": ['full', 'tied', 'diag', 'spherical'],
                          "init_params": ['kmeans', 'random']
                          }
        elif options.model == 'rfc':
            raise("Not implemented. Get back to work!")
        else:
            raise("No param_grid set for given model ({})".format(options.model))

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        fname = options.output_path+'/random_search_cv_results.txt'
        report_cv_results(random_search.cv_results_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        sys.exit()
    else:
        logging.info('Training...')
        model.fit(X_train, y_train)

    # Metrics
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    logging.info('Accuracy: {}'.format(acc))

    io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file)

    # Performance
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    logging.info('Accuracy: {}'.format(acc))
    logging.info('Precision: {}'.format(precision))
    logging.info('Recall: {}'.format(recall))
    logging.info('F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, labels=np.arange(4))

    error_data = {'acc': [acc],
                  'precision': [precision],
                  'recall': [recall],
                  'f1': [f1]}
    fname = '{}/training_time_validation_errors.csv'.format(options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(4), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(4), True, filename=fname)

    # Save models
    io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file)

    if options.normalize:
        fname=options.save_path+'/xscaler.pkl'
        io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    if options.model == 'rfc':
        fname = options.output_path+'/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_, fname, feature_names=options.feature_params)
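
report_cv_results is used but not defined in the snippet; a common way to summarise RandomizedSearchCV results, written here as an assumption of what it might do:

import numpy as np

def report_cv_results(results, filename=None, n_top=3):
    """Summarise the n_top best parameter sets from RandomizedSearchCV.cv_results_."""
    lines = []
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            lines.append('Rank {}: mean {:.3f} (std {:.3f}), params: {}'.format(
                rank,
                results['mean_test_score'][idx],
                results['std_test_score'][idx],
                results['params'][idx]))
    report = '\n'.join(lines)
    if filename is not None:
        with open(filename, 'w') as f:
            f.write(report)
    print(report)
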
Example #12
def main():
    """
    Get data from db and save it as csv
    """

    #a = mlfdb.mlfdb()
    a = _bq.BQHandler()
    io = _io.IO(gs_bucket='trains-data')
    viz = _viz.Viz()

    if not os.path.exists(options.save_path):
        os.makedirs(options.save_path)

    starttime, endtime = io.get_dates(options)

    logging.debug(options.what)
    what = options.what.split(',')
    logging.debug(what)

    all_param_names = [
        'time', 'trainstation', 'train_type', 'train_count', 'total_delay',
        'delay', 'name', 'lat', 'lon'
    ]
    logging.info('Loading classification dataset from db')
    logging.info('Using time range {} - {}'.format(
        starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d')))

    # Read data and filter desired train_types (ic and commuter)
    l_data = a.get_rows(starttime,
                        endtime,
                        loc_col='trainstation',
                        project='trains-197305',
                        dataset='trains_2009_18',
                        table='features',
                        parameters=all_param_names)

    # data = io.filter_train_type(labels_df=data,
    #                             train_types=['K','L'],
    #                             sum_types=True,
    #                             train_type_column='train_type',
    #                             location_column='trainstation',
    #                             time_column='time',
    #                             sum_columns=['delay'],
    #                             aggs=aggs)

    # l_data.rename(columns={0: 'trainstation', 1:'time', 2: 'lon', 3: 'lat', 4: 'train type', 5: 'delay', 6: 'train count', 7: 'total delay'}, inplace=True)

    #l_data.set_index(pd.DatetimeIndex(pd.to_datetime(l_data.loc[:,'time'].astype(int), unit='s')), inplace=True)
    #l_data.set_index('time', drop=False, inplace=True)

    passangers = io.filter_train_type(labels_df=l_data,
                                      train_types=['L', 'K'],
                                      sum_types=True)
    l_data.set_index(pd.to_datetime(l_data.loc[:, 'time']), inplace=True)
    #passangers.set_index(pd.to_datetime(passangers.loc[:,'time']), inplace=True)

    # ################################################################################
    if 'histograms' in what:

        # All delays
        filename = options.save_path + '/hist_all_delays_all.png'
        viz.hist_all_delays(
            l_data.loc[:,
                       ['train_type', 'train_count', 'delay', 'total_delay']],
            filename)

        # Different train types

        for name, t in train_types.items():
            filename = options.save_path + '/hist_all_delays_{}.png'.format(
                name)
            df = l_data[l_data.loc[:, 'train_type'].isin([t])]
            viz.hist_all_delays(df.loc[:, statlist], filename)

        # All passanger trains
        filename = options.save_path + '/hist_all_delays_passanger.png'
        viz.hist_all_delays(passangers.loc[:, statlist], filename)

        # all parameters
        passangers.replace(-99, np.nan, inplace=True)
        delayed_data = passangers[passangers.loc[:, 'delay'] > 50]
        d = {'A': passangers, 'B': delayed_data}
        comp_data = pd.concat(d.values(), axis=1, keys=d.keys())
        filename = options.save_path + '/histograms_compare.png'
        viz.all_hist(comp_data, filename=filename)

    # ################################################################################
    if 'history' in what:

        # Mean delays over time

        # All trains
        filename = options.save_path + '/mean_delays_over_time_all.png'
        s = l_data.groupby(l_data.index)[statlist].mean()
        viz.plot_delays(s, filename)

        # for passanger trains
        filename = options.save_path + '/mean_delays_over_time_passanger.png'
        s = passangers.groupby(passangers.index)[statlist].mean()
        viz.plot_delays(s, filename)

        # for different train_types
        for name, t in train_types.items():
            filename = options.save_path + '/mean_delays_over_time_{}.png'.format(
                name)
            df = l_data[l_data.loc[:, 'train_type'].isin([t])]
            s = df.groupby(df.index)[statlist].mean()
            viz.plot_delays(s, filename)

        # Median delays over time

        # All trains
        filename = options.save_path + '/median_delays_over_time_all.png'
        s = l_data.groupby(l_data.index)[statlist].median()
        viz.plot_delays(s, filename)

        # for passanger trains
        filename = options.save_path + '/median_delays_over_time_passanger.png'
        s = passangers.groupby(passangers.index)[statlist].median()
        viz.plot_delays(s, filename)

        # for different train_types
        for name, t in train_types.items():
            filename = options.save_path + '/median_delays_over_time_{}.png'.format(
                name)
            df = l_data[l_data.loc[:, 'train_type'].isin([t])]
            s = df.groupby(df.index)[statlist].median()
            viz.plot_delays(s, filename)

    # ################################################################################
    if 'heatmap' in what:

        # locs = a.get_locations_by_dataset(options.dataset,
        #                                   starttime=starttime,
        #                                   endtime=endtime,
        #                                   rettype='dict')
        # # Heatmap bad some stations
        #locs = l_data.loc[:, 'trainstation'].unique().values.ravel()
        locs = io.get_train_stations('cnf/stations.json')
        #print(locs)

        if not os.path.exists(options.save_path + '/heatmap'):
            os.makedirs(options.save_path + '/heatmap')

        heatmap_year(l_data, passangers, 2018, locs)

        for year in np.arange(2010, 2019, 1):
            heatmap_year(l_data, passangers, year, locs)

    # ################################################################################
    if 'detailed_heatmap' in what:
        locs = a.get_locations_by_dataset(options.dataset,
                                          starttime=starttime,
                                          endtime=endtime,
                                          rettype='dict')
        # Heatmap bad some stations

        if not os.path.exists(options.save_path + '/detailed_heatmap'):
            os.makedirs(options.save_path + '/detailed_heatmap')

        d = starttime
        while d < endtime:
            heatmap_day(l_data, passangers, d, locs)
            d += dt.timedelta(days=1)
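
heatmap_year and heatmap_day are project helpers not shown here. As a rough, hypothetical illustration, a yearly station-by-day delay heatmap could be built with pandas and matplotlib like this (simplified signature, not the project function):

import matplotlib.pyplot as plt

def delay_heatmap_year(data, year, filename=None):
    """Mean daily delay per station for one year, assuming data is indexed by a DatetimeIndex."""
    df = data[data.index.year == year]
    pivot = df.pivot_table(index='trainstation', columns=df.index.dayofyear,
                           values='delay', aggfunc='mean')
    plt.figure(figsize=(16, 8))
    plt.imshow(pivot.values, aspect='auto', cmap='viridis')
    plt.colorbar(label='Mean delay')
    plt.yticks(range(len(pivot.index)), pivot.index)
    plt.xlabel('Day of year {}'.format(year))
    plt.title('Mean delay per station, {}'.format(year))
    if filename is not None:
        plt.savefig(filename)
    plt.close()
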
Example #13
def main():
    """
    Main program
    """
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    #save_path = options.save_path+'/'+options.config_name

    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  only_winters=options.only_winters)

    data = bq.get_rows()

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        non_scaled_data = data.loc[:, options.meta_params]
        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))

        yscaler.fit(labels)
        scaled_labels = pd.DataFrame(yscaler.transform(labels),
                                     columns=['delay'])
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params].astype(np.float32)),
                                       columns=options.feature_params)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

    if options.pca:
        logging.info('Doing PCA analysis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        data = pd.concat([non_processed_data, processed_features], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    data_train, data_test = train_test_split(data, test_size=0.33)
    X_test, y_test = io.extract_batch(data_test,
                                      options.time_steps,
                                      batch_size=None,
                                      pad_strategy=options.pad_strategy,
                                      quantile=options.quantile,
                                      label_params=options.label_params,
                                      feature_params=options.feature_params)

    # Define model
    batch_size = io.get_batch_size(data_train,
                                   options.pad_strategy,
                                   quantile=options.quantile)
    logging.info('Batch size: {}'.format(batch_size))
    model = LSTM.LSTM(options.time_steps,
                      len(options.feature_params),
                      1,
                      options.n_hidden,
                      options.lr,
                      options.p_drop,
                      batch_size=batch_size)

    # Initialization
    rmses, mses, maes, steps, train_mse = [], [], [], [], []
    saver = tf.train.Saver()
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    summary_writer = tf.summary.FileWriter(options.log_dir,
                                           graph=tf.get_default_graph())

    #tf.summary.scalar('Training MSE', model.loss)
    tf.summary.scalar('Validation_MSE', model.mse)
    tf.summary.scalar('Validation_RMSE', model.rmse)
    tf.summary.scalar('Validation_MAE', model.mae)
    tf.summary.histogram('y_pred_hist', model.y_pred)
    merged_summary_op = tf.summary.merge_all()
    train_summary_op = tf.summary.scalar('Training_MSE', model.loss)

    train_step = 0
    start = 0
    while True:
        # If slow is set, go forward one time step at time,
        # else proceed whole batch size
        if options.slow:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                start=start,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)
        else:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                train_step,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)

        if (len(X_train) < options.time_steps):
            break

        if options.cv:
            logging.info('Doing random search for hyper parameters...')

            param_grid = {
                "C": [0.001, 0.01, 0.1, 1, 10],
                "epsilon": [0.01, 0.1, 0.5],
                "kernel": ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                "degree": [2, 3, 4],
                "shrinking": [True, False],
                "gamma": [0.001, 0.01, 0.1],
                "coef0": [0, 0.1, 1]
            }

            random_search = RandomizedSearchCV(model,
                                               param_distributions=param_grid,
                                               n_iter=int(
                                                   options.n_iter_search),
                                               n_jobs=-1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            report_cv_results(random_search.cv_results_, fname)
            io._upload_to_bucket(filename=fname, ext_filename=fname)
            sys.exit()
        else:
            if train_step == 0:
                logging.info('Training...')

            feed_dict = {model.X: X_train, model.y: y_train}
            _, loss, train_summary = sess.run(
                [model.train_op, model.loss, train_summary_op],
                feed_dict=feed_dict)

            summary_writer.add_summary(train_summary, train_step * batch_size)

        # Metrics
        feed_dict = {model.X: X_test, model.y: y_test}
        #model.cell_init_state: state}

        val_loss, rmse, mse, mae, y_pred, summary = sess.run(
            [
                model.loss, model.rmse, model.mse, model.mae, model.y_pred,
                merged_summary_op
            ],
            feed_dict=feed_dict)

        train_mse.append(loss)
        mses.append(mse)
        rmses.append(rmse)
        maes.append(mae)
        steps.append(train_step)

        summary_writer.add_summary(summary, train_step * batch_size)
        if train_step % 50 == 0:
            logging.info("Step {}:".format(train_step))
            logging.info("Training loss: {:.4f}".format(loss))
            logging.info("Validation MSE: {:.4f}".format(val_loss))
            logging.info('Validation RMSE: {}'.format(rmse))
            logging.info('Validation MAE: {}'.format(mae))
            logging.info('................')
            saver.save(sess, options.save_file)

        train_step += 1
        start += 1
        # <-- while True:

    saver.save(sess, options.save_file)
    if options.normalize:
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)
    io._upload_dir_to_bucket(options.save_path, options.save_path)

    try:
        fname = options.output_path + '/learning_over_time.png'
        metrics = [{
            'metrics': [{
                'values': mses,
                'label': 'Validation MSE'
            }, {
                'values': train_mse,
                'label': 'Train MSE'
            }],
            'y_label':
            'MSE'
        }, {
            'metrics': [{
                'values': rmses,
                'label': 'Validation RMSE'
            }],
            'y_label': 'RMSE'
        }, {
            'metrics': [{
                'values': maes,
                'label': 'Validation MAE'
            }],
            'y_label': 'MAE'
        }]
        viz.plot_learning(metrics, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'steps': steps,
        'mse': mses,
        'rmse': rmses,
        'mae': maes,
        'train_mse': train_mse
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)
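
viz.plot_learning is project-specific; the collected error lists could be plotted with plain matplotlib roughly as below (a sketch that follows the panel structure built above, not the project implementation):

import matplotlib.pyplot as plt

def plot_learning(panels, filename):
    """Plot stacked panels of training/validation metrics over training steps."""
    fig, axes = plt.subplots(len(panels), 1, figsize=(10, 3 * len(panels)),
                             sharex=True, squeeze=False)
    for ax, panel in zip(axes[:, 0], panels):
        for m in panel['metrics']:
            ax.plot(m['values'], label=m['label'])
        ax.set_ylabel(panel['y_label'])
        ax.legend()
    axes[-1, 0].set_xlabel('Training step')
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)
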