Exemplo n.º 1
0
def rgf_state_prediction(state, lookback, horizon, predictors):
    """Fit one ``RGFRegressor`` per forecast step for every city of a state.

    Clusters are read from ``../analysis/clusters_<state>.pkl``. For each
    city that has no metrics file yet, ``horizon`` models are fitted on the
    first 70% of the lagged rows (ordered split, no shuffling) and scored on
    the remaining rows. The metrics are pickled to
    ``saved_models/rgf/<state>/rgf_metrics_<city>.pkl`` and the predictions
    are handed to ``plot_prediction``.

    :param state: identifier used to build the clusters and output paths
    :param lookback: lag count forwarded to ``build_lagged_features``
    :param horizon: number of steps ahead to model (one model per step)
    :param predictors: columns requested from ``get_cluster_data``
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    separator = '-----------------------------------------------------'

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Single source of truth for the path used both to detect a
            # finished city and to store its metrics.
            metrics_path = 'saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                state, city)
            if os.path.isfile(metrics_path):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]

            data = data_full.drop(casos_columns, axis=1)
            # dropna() returns a new frame; the result has to be kept for
            # the call to have any effect.
            data_lag = build_lagged_features(data, lookback).dropna()
            n_rows = len(data_lag)

            # targets[d] is the target series moved (d - 1) rows earlier,
            # without the trailing rows that the shift leaves empty.
            targets = {
                d: data_lag[target].shift(-(d - 1)).iloc[
                    :max(n_rows - (d - 1), 0)]
                for d in range(1, horizon + 1)
            }

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, _, _ = train_test_split(X_data,
                                                     data_lag[target],
                                                     train_size=0.7,
                                                     test_size=0.3,
                                                     shuffle=False)
            n_train = len(X_train)

            city_name = get_city_names([city, 0])[0][1]
            # NaN-filled so that horizons that were never fitted (or rows
            # beyond a shortened target) do not show uninitialised memory.
            preds = np.full((n_rows, horizon), np.nan)
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)

                train_target = targets[d].iloc[:n_train]
                test_target = targets[d].iloc[n_train:]
                try:
                    model.fit(X_train, train_target)
                except ValueError as err:
                    print(separator)
                    print(city, 'ERRO', err)
                    print(separator)
                    # Remaining horizons are skipped; whatever was collected
                    # so far is still saved and plotted below.
                    break

                pred = model.predict(X_data.iloc[:len(targets[d])])
                preds[:len(pred), d - 1] = pred

                pred_m = model.predict(X_test.iloc[:len(test_target)])
                metrics[d] = calculate_metrics(pred_m, test_target)

            metrics.to_pickle(metrics_path)
            plot_prediction(preds, targets[1], city_name, n_train)
            # plt.show()
    return None
Exemplo n.º 2
0
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    """Fit per-city ``LassoLarsCV`` models without using cluster series.

    For every city returned by ``get_cities_from_state`` that has no metrics
    file yet, ``horizon`` models are fitted on the first 70% of the lagged
    rows (ordered split, no shuffling) and scored on the remaining rows.
    Metrics are pickled to
    ``saved_models/lasso_no_cluster/<state>/lasso_metrics_<city>.pkl`` and
    the predictions are handed to ``plot_prediction``.

    :param state: identifier used to build the output paths
    :param lookback: lag count forwarded to ``build_lagged_features``
    :param horizon: number of steps ahead to model (one model per step)
    :param predictors: columns kept from ``combined_data``; must include
        ``'casos'`` and ``'casos_est'``
    :return: None
    """
    # NOTE(review): the city list is always taken for 'Ceará', regardless of
    # ``state``. Left unchanged because the argument format expected by
    # get_cities_from_state is not visible here -- confirm and parameterise.
    cities = list(get_cities_from_state('Ceará'))
    separator = '-----------------------------------------------------'

    for city in cities:
        # The same relative path is used to detect finished cities and to
        # store results, so the skip works on any machine.
        metrics_path = '{}/{}/lasso_metrics_{}.pkl'.format(
            'saved_models/lasso_no_cluster', state, city)
        if os.path.isfile(metrics_path):
            print(city, 'done')
            continue

        data = combined_data(city, DATA_TYPES)[predictors].drop('casos',
                                                                axis=1)

        target = 'casos_est'
        # dropna() returns a new frame; the result has to be kept for the
        # call to have any effect.
        data_lag = build_lagged_features(data, lookback).dropna()
        n_rows = len(data_lag)

        # targets[d] is the target series moved (d - 1) rows earlier,
        # without the trailing rows that the shift leaves empty.
        targets = {
            d: data_lag[target].shift(-(d - 1)).iloc[
                :max(n_rows - (d - 1), 0)]
            for d in range(1, horizon + 1)
        }

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, _, _ = train_test_split(X_data,
                                                 data_lag[target],
                                                 train_size=0.7,
                                                 test_size=0.3,
                                                 shuffle=False)
        n_train = len(X_train)

        city_name = get_city_names([city, 0])[0][1]
        # NaN-filled so that horizons that were never fitted do not show
        # uninitialised memory in the plot.
        preds = np.full((n_rows, horizon), np.nan)
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)

            train_target = targets[d].iloc[:n_train]
            test_target = targets[d].iloc[n_train:]
            try:
                model.fit(X_train, train_target)
            except ValueError as err:
                print(separator)
                print(city, 'ERRO', err)
                print(separator)
                break

            pred = model.predict(X_data.iloc[:len(targets[d])])
            preds[:len(pred), d - 1] = pred

            pred_m = model.predict(X_test.iloc[:len(test_target)])
            metrics[d] = calculate_metrics(pred_m, test_target)

        # Written once instead of after every horizon. Nothing is written
        # when no horizon could be fitted, so the city is retried next run.
        if not metrics.empty:
            metrics.to_pickle(metrics_path)
        plot_prediction(preds,
                        targets[1],
                        city_name,
                        n_train,
                        path='lasso_no_cluster')
        # plt.show()
    return None
Exemplo n.º 3
0
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    """Fit ``LassoLarsCV`` models for one city using its cluster's series.

    ``horizon`` models are fitted on the first 70% of the lagged rows
    (ordered split, no shuffling) and scored on the remaining rows. Metrics
    are pickled to ``saved_models/lasso/<state>/lasso_metrics_<city>.pkl``
    and the predictions are handed to ``plot_prediction``. Nothing is fitted
    or saved when the training part of the target sums to zero.

    :param city: geocode passed to ``get_cluster_data`` and used in the
        target column name and the output file name
    :param state: identifier used to build the clusters and output paths
    :param lookback: lag count forwarded to ``build_lagged_features``
    :param horizon: number of steps ahead to model (one model per step)
    :param predictors: columns requested from ``get_cluster_data``
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=predictors)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]

    # dropna() returns a new frame; the result has to be kept for the call
    # to have any effect.
    data_lag = build_lagged_features(data, lookback).dropna()
    n_rows = len(data_lag)

    # targets[d] is the target series moved (d - 1) rows earlier, without
    # the trailing rows that the shift leaves empty.
    targets = {
        d: data_lag[target].shift(-(d - 1)).iloc[:max(n_rows - (d - 1), 0)]
        for d in range(1, horizon + 1)
    }

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, _ = train_test_split(X_data,
                                                   data_lag[target],
                                                   train_size=0.7,
                                                   test_size=0.3,
                                                   shuffle=False)
    n_train = len(X_train)

    # A training target that sums to zero gives the model nothing to learn.
    if sum(y_train) == 0:
        print('aaaah', city)
        return None

    city_name = get_city_names([city, 0])[0][1]
    # NaN-filled so that horizons that were never fitted do not show
    # uninitialised memory in the plot.
    preds = np.full((n_rows, horizon), np.nan)
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    separator = '-----------------------------------------------------'
    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)

        train_target = targets[d].iloc[:n_train]
        test_target = targets[d].iloc[n_train:]
        try:
            model.fit(X_train, train_target)
        except ValueError as err:
            print(separator)
            print(city, 'ERRO', err)
            print(separator)
            # Remaining horizons are skipped; whatever was collected so far
            # is still saved and plotted below.
            break
        print(city, 'done')

        pred = model.predict(X_data.iloc[:len(targets[d])])
        preds[:len(pred), d - 1] = pred

        pred_m = model.predict(X_test.iloc[:len(test_target)])
        metrics[d] = calculate_metrics(pred_m, test_target)

    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, n_train)
    return None
Exemplo n.º 4
0
def calculate_mape(state, lookback, horizon):
    """Add a ``mean_absolute_scaled_error`` row to saved lasso metrics.

    For every city of every cluster in ``../analysis/clusters_<state>.pkl``
    the pickled metrics frame under
    ``~/Documentos/resultados_infodengue/lasso/<state>/`` is loaded, one
    value per horizon is computed from its ``mean_absolute_error`` row and
    the test part of the shifted target, and the frame is written back to
    the same file. Cities whose file is truncated (``EOFError``) or whose
    frame does not have one column per horizon are reported and skipped.

    NOTE(review): despite the function name, the row written is called
    ``mean_absolute_scaled_error``. The value is the stored MAE divided by
    ``n / (n - 1)`` times the *sum* of absolute one-step differences of the
    test target; confirm whether a mean was intended for the scaling term.

    :param state: identifier used to build the clusters and metrics paths
    :param lookback: lag count forwarded to ``build_lagged_features``
    :param horizon: number of horizons; must match the number of metric
        columns in each saved frame
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    path_template = (
        '~/Documentos/resultados_infodengue/lasso/{}/lasso_metrics_{}.pkl')

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=['alerta'],
                                            cols=['casos_est', 'casos'])

        # Everything below depends only on the cluster, so it is built once
        # rather than once per city.
        casos_est_columns = ['casos_est_{}'.format(i) for i in group]
        casos_columns = ['casos_{}'.format(i) for i in group]
        data = data_full.drop(casos_columns, axis=1)
        # dropna() returns a new frame; the result has to be kept for the
        # call to have any effect.
        data_lag = build_lagged_features(data, lookback).dropna()
        n_rows = len(data_lag)
        X_data = data_lag.drop(casos_est_columns, axis=1)
        # Only the size of the training part is needed; the split mirrors
        # the 70/30 ordered split used when the models were fitted.
        X_train, _ = train_test_split(X_data,
                                      train_size=0.7,
                                      test_size=0.3,
                                      shuffle=False)
        n_train = len(X_train)

        for city in cluster:
            print(city)

            target = 'casos_est_{}'.format(city)
            # targets[d] is the target series moved (d - 1) rows earlier,
            # without the trailing rows that the shift leaves empty.
            targets = {
                d: data_lag[target].shift(-(d - 1)).iloc[
                    :max(n_rows - (d - 1), 0)]
                for d in range(1, horizon + 1)
            }

            metrics_path = path_template.format(state, city)
            try:
                metrics = pd.read_pickle(metrics_path)
            except EOFError:
                print('---------------------------------')
                print('ERROR', 'eof', city)
                print('----------------------------------')
                # Without this, the previous city's frame (or an unbound
                # name) would be used and written to this city's file.
                continue

            if metrics.shape[1] != horizon:
                print('---------------------------------')
                print('ERROR', 'shape', city)
                print('----------------------------------')
                continue

            values = []
            for d in range(1, horizon + 1):
                mae = metrics.loc['mean_absolute_error', d]
                test_target = targets[d].iloc[n_train:].to_numpy()
                n_test = len(test_target)
                if n_test < 2:
                    # No consecutive pair to difference; the scale factor
                    # is undefined.
                    values.append(np.nan)
                    continue

                naive_error = np.abs(np.diff(test_target)).sum()
                factor = (n_test / (n_test - 1)) * naive_error
                values.append(np.nan if factor == 0 else mae / factor)

            metrics.loc['mean_absolute_scaled_error'] = values
            metrics.to_pickle(metrics_path)
    return None