Example #1
 def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type , _label):
     """ SKLearn을 사용해서 Pandas를 Proprocessing
         label은 Preprocessing 하면 안됨
     Args:
       params:
         * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
         * _df_csv_read_ori : pandas dataframe
         * _label
     Returns:
       Preprocessing DataFrame
     """
     if _preprocessing_type is None or _preprocessing_type == 'null':
         logging.info("No Preprocessing")
         result_df = _df_csv_read_ori
     else:
         logging.info("Preprocessing type : {0}".format(_preprocessing_type))
         numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
         for i, v in _df_csv_read_ori.dtypes.items():
             if v in numerics:
                 if i not in _label:
                     #preprocessing_types = ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
                     #_preprocessing_type = ['maxabs_scale']
                     if 'scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'minmax_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.minmax_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'robust_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.robust_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'normalize' in _preprocessing_type:
                         # normalize requires a 2-D array; treat the column as a single sample
                         _df_csv_read_ori[i] = preprocessing.normalize(
                             _df_csv_read_ori[i].fillna(0.0).values.reshape(1, -1))[0]
                     if 'maxabs_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.maxabs_scale(_df_csv_read_ori[i].fillna(0.0))
         result_df = _df_csv_read_ori
     return result_df
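A minimal sketch (on a made-up two-column frame) of what the _preprocessing_type options above map to in scikit-learn; the label column is simply excluded from the scaled slice, mirroring the "i not in _label" check:

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"feat_a": [1.0, 2.0, 100.0], "feat_b": [3.0, 4.0, 5.0], "label": [0, 1, 0]})
feature_cols = [c for c in df.columns if c != "label"]  # leave the label untouched

scaled = df.copy()
# robust_scale centers on the median and scales by the IQR; swap in scale / minmax_scale / maxabs_scale as needed
scaled[feature_cols] = preprocessing.robust_scale(df[feature_cols])
print(scaled)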
Example #2
def parameter_scan(img_id, kind,
                   cols='x y'.split(), only_core=True, do_scale=True,
                   proba_cut=0.9, factor=0.1):
    p4id = markings.TileID(img_id, scope='planet4')
    functions = dict(blotch=p4id.plot_blotches,
                     fan=p4id.plot_fans)
    min_samples_base = round(factor * p4id.n_marked_classifications)
    min_cluster_size_vals = [min_samples_base,
                             round(1.5 * min_samples_base)]
    min_samples_vals = [1,
                        min_samples_base,
                        round(1.5 * min_samples_base)]
    data = p4id.filter_data(kind)
    X = data[cols].to_numpy()
    if do_scale:
        X = robust_scale(X)
    fig, ax = plt.subplots(nrows=len(min_cluster_size_vals),
                           ncols=len(min_samples_vals) + 1)
    axes = ax.flatten()
    for ax, (mcs, ms) in zip(axes,
                             product(min_cluster_size_vals,
                                     min_samples_vals)):
        logger.debug("Running with %i and %i.", mcs, ms)
        if ms > mcs:
            ax.set_title('ms > mcs')
            ax.set_axis_off()
            continue
        # elif ms == mcs and ms == 2 * min_samples_base:
        #     p4id.show_subframe(ax=ax)
        #     continue
        clusterer = HDBScanner(X, mcs, ms, proba_cut=proba_cut,
                               only_core=only_core,
                               metric='manhattan')
        reduced_data = post_processing(kind, data, clusterer)
        plot_results(clusterer, data, p4id, kind, reduced_data, ax=ax)
        ax.set_title('MCS: {}, MS: {}\nn_clusters: {}, averaged: {}'
                     .format(mcs, ms, clusterer.n_clusters,
                             len(reduced_data)),
                     fontsize=6)

        threshold = pd.Series(clusterer.hdbscan.outlier_scores_).quantile(0.9)
        outliers = np.where(clusterer.hdbscan.outlier_scores_ > threshold,
                            True, False)
        ax.scatter(data.loc[outliers, 'x'],
                   data.loc[outliers, 'y'],
                   marker='x', s=15, linewidth=1, c='red', alpha=0.75)
    p4id.show_subframe(ax=axes[-1])
    functions[kind](ax=axes[-2], lw=0.25)
    fig.suptitle("n_class: {}, ncols: {}, factor: {}, scale: {}"
                 .format(p4id.n_marked_classifications, len(cols),
                         factor, do_scale))
    savepath = ("plots/{}/{}_lencols{}_factor{}_scale{}.png"
                .format(kind, img_id, len(cols), factor, do_scale))
    fig.savefig(savepath, dpi=200)
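HDBScanner above is a project-specific wrapper; as a rough, hedged sketch, the underlying outlier cut (flag the top decile of HDBSCAN outlier scores) can be reproduced with the plain hdbscan package on toy data:

import numpy as np
import pandas as pd
import hdbscan

X = np.random.RandomState(0).normal(size=(500, 2))  # stand-in for the (x, y) marking coordinates
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=5, metric='manhattan').fit(X)

threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)  # same 0.9 quantile cut as above
outliers = clusterer.outlier_scores_ > threshold
print("clusters:", clusterer.labels_.max() + 1, "flagged outliers:", int(outliers.sum()))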

    if args.fit:
        #
        # fit the hybrid model
        #

        # prepare input song features and playlist targets at training
        X_fit, Y_fit = shape_data(
            playlists_idx, songs_idx, idx2song, features,
            mode='train', subset=fit_idx
        )

        # preprocess input features if required
        if model.standardize:
            X_fit = prep.robust_scale(X_fit)

        if model.normalize:
            X_fit = prep.normalize(X_fit, norm=model.normalize)

        # fit the classifier
        fit(
            model=model,
            fit_input=X_fit.astype(theano.config.floatX),
            fit_target=Y_fit.astype(np.int8),
            out_dir=out_dir,
            random_state=rng
        )

    if args.test:
        #
    def show_result(self):
        self.hideAll()
        self.verticalLayoutWidget.show()
        self.plainTextResult.show()
        self.resultWidget.show()
        self.resultWidget.horizontalHeader().setStyleSheet(
            "QHeaderView::section {background-color:#D9E5FF;color:#000000;}")
        self.resultWidget.horizontalHeader().setStretchLastSection(True)
        self.resultWidget.verticalHeader().setStyleSheet(
            "QHeaderView::section {background-color:#D9E5FF;color:#000000;}")
        if self.brand == "E":
            self.result = pd.DataFrame(columns=["지점명", "폐점확률"])
            self.df = pd.read_excel("./Data/preprocessing/E/binning.xlsx")
            LR = joblib.load("./model/emart.pk1")

            for i, v in enumerate(np.round(LR.predict_proba(robust_scale(self.df.iloc[:, 2:-1])), 3)):

                if i < 138:
                    self.result.loc[i, "지점명"] = self.df.iloc[i, 1]
                    self.result.loc[i, "폐점확률"] = v[0]
            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)

            self.resultWidget.resizeRowsToContents()

            f = open("./txt_file/emart.txt", "r", encoding="UTF-8")
            lines = f.readlines()
            lines = "".join(lines)
            self.plainTextResult.setPlainText(lines)
            f.close()
        elif self.brand == "ET":
            self.result = pd.DataFrame(columns=["지점명", "전환확률"])
            self.df = pd.read_excel("./Data/preprocessing/ET/binning.xlsx")
            LR = joblib.load("./model/emart_tr.pk1")
            for i, v in enumerate(np.round(LR.predict_proba(robust_scale(self.df.iloc[:, 2:-1])), 3)):

                if i < 22:
                    self.result.loc[i, "지점명"] = self.df.iloc[i, 1]
                    self.result.loc[i, "전환확률"] = v[1]
            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)

            self.resultWidget.resizeRowsToContents()

            f = open("./txt_file/emart_tr.txt", "r", encoding="UTF-8")
            lines = f.readlines()
            lines = "".join(lines)
            self.plainTextResult.setPlainText(lines)
            f.close()

        elif self.brand == "H":
            self.result = pd.DataFrame(columns=["지점명", "폐점확률"])
            self.df = pd.read_excel("./Data/preprocessing/H/binning.xlsx")
            LR = joblib.load("./model/homeplus.pk1")

            for i, v in enumerate(np.round(LR.predict_proba(scale(self.df.iloc[:, 2:-1])), 3)):

                if i < 119:
                    self.result.loc[i, "지점명"] = self.df.iloc[i, 1]
                    self.result.loc[i, "폐점확률"] = v[0]
            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)

            self.resultWidget.resizeRowsToContents()

            f = open("./txt_file/homeplus.txt", "r", encoding="UTF-8")
            lines = f.readlines()
            lines = "".join(lines)
            self.plainTextResult.setPlainText(lines)
            f.close()

        elif self.brand == "HS":
            self.result = pd.DataFrame(columns=["지점명", "전환확률"])
            self.df = pd.read_excel("./Data/preprocessing/HS/binning.xlsx")
            LR = joblib.load("./model/special.pk1")
            for i, v in enumerate(np.round(LR.predict_proba(robust_scale(self.df.iloc[:, 2:-1])), 3)):

                if i < 119:
                    self.result.loc[i, "지점명"] = self.df.iloc[i, 1]
                    self.result.loc[i, "전환확률"] = v[1]
            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)

            self.resultWidget.resizeRowsToContents()

            f = open("./txt_file/homeplus_special.txt", "r", encoding="UTF-8")
            lines = f.readlines()
            lines = "".join(lines)
            self.plainTextResult.setPlainText(lines)
            f.close()

        elif self.brand == "L":
            self.result = pd.DataFrame(columns=["지점명", "폐점확률"])
            self.df = pd.read_excel("./Data/preprocessing/L/binning.xlsx")
            LR = joblib.load("./model/lotte.pk1")

            for i, v in enumerate(LR.predict_proba(self.df.iloc[:, 2:-1])):
                if i < 109:
                    self.result.loc[i, "지점명"] = self.df.iloc[i, 1]
                    self.result.loc[i, "폐점확률"] = np.round(v[0], 3)
            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)

            self.resultWidget.resizeRowsToContents()

            f = open("./txt_file/lotte.txt", "r", encoding="UTF-8")
            lines = f.readlines()
            lines = "".join(lines)
            self.plainTextResult.setPlainText(lines)
            f.close()

        if self.checkBox_2.isChecked():
            self.result.sort_values(by=self.result.keys()[-1], ascending=False, inplace=True)

            columns = list(map(str, self.result.keys()))
            self.result = self.result.astype(str)
            item_count = len(self.result[columns[0]])
            self.resultWidget.setRowCount(item_count)
            self.resultWidget.setColumnCount(len(columns))
            self.resultWidget.setHorizontalHeaderLabels(columns)
            for j in range(item_count):
                row = self.result.iloc[j, :]
                for i in range(len(row)):
                    item = QTableWidgetItem(row[i])
                    item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
                    self.resultWidget.setItem(j, i, item)
        if self.checkBox.isChecked():
            self.result.to_excel("./save_file/result_{}.xlsx".format(self.brand), index=False)

        for i in reversed(range(self.resultLayout.count())):
            self.resultLayout.itemAt(i).widget().setParent(None)
        fig = plt.Figure()
        ax = fig.add_subplot(111)
        a = self.result.sort_values(by=self.result.keys()[-1], ascending=False)
        ax.barh(a.iloc[:5, 0][::-1], a.iloc[:5, 1][::-1].astype(float), 0.4)
        ax.set_xticks(np.arange(0, 1.1, step=0.2))
        ax.xaxis.set_tick_params(labelsize=7)
        ax.yaxis.set_tick_params(labelsize=7, rotation=60)
        ax.set(title="<{} 상위 5개 지점>".format(a.keys()[-1]))

        canvas = FigureCanvas(fig)
        canvas.draw()
        self.resultLayout.addWidget(canvas)
        canvas.show()

        self.checkBox.setChecked(False)
        self.checkBox_2.setChecked(False)
Example #5
# RationalQuadratic = gp.kernels.RationalQuadratic()
regr1 = gp.GaussianProcessRegressor(alpha=1e-5, n_restarts_optimizer=5)
regr2 = gp.GaussianProcessRegressor(alpha=1e-5, n_restarts_optimizer=5)

# Read data
mat = scipy.io.loadmat("Data_and_training_sample.mat")
mask = np.array(mat["mask"], dtype=bool)
t1 = np.array(mat["t1"], dtype=float)
t1 = np.reshape(t1, (-1, t1.shape[-1]))
t2 = np.array(mat["t2"], dtype=float)
t2 = np.reshape(t2, (-1, t2.shape[-1]))
idx = np.where(mask)[0]

# Normalise data
t1 = pre.robust_scale(t1)
t2 = pre.robust_scale(t2)
t1_tr = t1[idx.transpose()]
t2_tr = t2[idx.transpose()]
del mat

regr1.fit(t1_tr, t2_tr)
t1_hat = np.empty((0, t2.shape[1]))
tic = time.time()
for i in range(1 + t1.shape[0] // 31000):
    a = i * 31000
    b = a + 31000
    temp1 = t1[a:b, :]
    t1_hat = np.append(t1_hat, regr1.predict(temp1), axis=0)
    toc = time.time()
    et = toc - tic
Example #6
def extreme_random_byhour(df, evaluate_var='DUMMY_30_DAY'):
    # Log of prices
    # df['PESPANIA'] = np.log(df['PESPANIA'])
    # df['PPORTUGAL'] = np.log(df['PPORTUGAL'])
    del df['PPORTUGAL']

    df['FECHA'] = df['ANIO'].map(str) + '-' + df['MES'].map(
        str) + '-' + df['DIA'].map(str)
    df['FECHA'] = pd.to_datetime(df['FECHA'], format='%Y-%m-%d')

    df['WEEKDAY'] = df['FECHA'].dt.dayofweek

    # df['DUMMY_2010_REGIMEN'] = pd.Series(0, index=df.index)
    # df.loc[df['FECHA'] >= '2010-01-01', 'DUMMY_2010_REGIMEN'] = 1
    # df = df[df['FECHA'] >= '2010-01-01']

    df['ANIO'] = df['ANIO'].map(int)
    df['MES'] = df['MES'].map(int)
    df['DIA'] = df['DIA'].map(int)

    # df = df.groupby(['FECHA']).mean().reset_index()

    del df['HORA']
    del df['FECHA']
    del df['FECHA_HORA']
    del df['DIA']

    # TARGET VARIABLE

    dummy_important = [
        'DUMMY', 'DUMMY_5_DAY', 'DUMMY_10_DAY', 'DUMMY_15_DAY', 'DUMMY_20_DAY',
        'DUMMY_30_DAY'
    ]
    dummy_important.remove(evaluate_var)
    for i in dummy_important:
        del df[i]

    # DIFFERENTIATE

    # DIFFERENCE OF PESPANIA
    '''
    df['PESPANIA'] = df['PESPANIA'] - df['PESPANIA'].shift(1)
    df = df.dropna(axis=0)

    # DIFFERENCE OF REMAINING VARIABLES

    need_differenciation = ['TOTAL_PRODUCCION_POR', 'TOTAL_DEMANDA_POR', 'CICLO_COMBINADO', 'FUEL_PRIMA',
                            'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM', 'TME_MADRID', 'TMAX_MADRID', 'TME_BCN',
                            'TMAX_BCN', 'TMIN_BCN', 'GDP']


    for i in need_differenciation:
        name = 'D_' + str(i)
        df[name] = df[i] - df[i].shift(1)
        del df[i]

    df = df.dropna()
    '''

    # DUMMIES
    dummy_var = ['ANIO', 'MES', 'WEEKDAY']
    for i in dummy_var:
        name = str(i)
        dummy = pd.get_dummies(df[i], prefix=name)
        df = pd.concat([df, dummy], axis=1)
        del dummy
        del df[i]

    # LAGS
    lag_AR = 28
    for i in range(1, lag_AR + 1, 1):
        name = 'PESPANIA_lag_' + str(i)
        df[name] = df['PESPANIA'].shift(i)

    lag_number = 24
    lag_variables = [
        'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES', 'TOTAL_DEMANDA_NAC_ES',
        'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES', 'TOTAL_POT_IND_ES',
        'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR',
        'CARBON NACIONAL', 'CARBON_IMPO', 'CICLO_COMBINADO', 'FUEL_SIN_PRIMA',
        'FUEL_PRIMA', 'REG_ESPECIAL', 'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM'
    ]

    for i in range(1, lag_number, 1):
        for j in lag_variables:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    lag_number = 24
    climaticas = [
        'TME_MADRID', 'TMAX_MADRID', 'TMIN_MADRID', 'PP_MADRID', 'TME_BCN',
        'TMAX_BCN', 'TMIN_BCN', 'PP_BCN'
    ]
    for i in range(1, lag_number + 1, 1):
        for j in climaticas:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    lag_number = 24
    portugal = ['TOTAL_DEMANDA_POR', 'TOTAL_PRODUCCION_POR']
    for i in range(1, lag_number + 1, 1):
        for j in portugal:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    df = df.dropna(how='any', axis=0)

    normal = df[df[evaluate_var] == 0]
    anormal = df[df[evaluate_var] == 1]

    del normal[evaluate_var]
    del anormal[evaluate_var]

    # NORMALIZE
    column_names = normal.columns.values.tolist()
    normal = preprocessing.robust_scale(normal)
    normal = pd.DataFrame(normal, columns=column_names)

    column_names = anormal.columns.values.tolist()
    anormal = preprocessing.robust_scale(anormal)
    anormal = pd.DataFrame(anormal, columns=column_names)

    total_values = len(df.index)
    print('total rows ', total_values)
    anormal_values = len(anormal.index)
    print('anormal rows ', anormal_values)

    proportion = anormal_values / total_values
    print('proportion of anormal ', proportion)

    normalY = normal[['PESPANIA']]
    normalX = normal
    del normalX['PESPANIA']

    anormalY = anormal[['PESPANIA']]
    anormalX = anormal
    del anormalX['PESPANIA']

    names = normalX.columns.values
    fileNames = np.array(names)

    # Only take train/test splits from the normal data, with the test set sized to match the anormal sample
    X_train, X_test, y_train, y_test = train_test_split(normalX,
                                                        normalY,
                                                        test_size=proportion,
                                                        random_state=42)

    nTreeList = range(2000, 2001, 1)
    for iTrees in nTreeList:
        thresholds = np.linspace(0.1, 1.0, 200)

        min_samples_leaf = round(len(X_train.index) * 0.005)
        print('min_samples_leaf ', min_samples_leaf)
        min_samples_split = min_samples_leaf * 10
        print('min_samples_split ', min_samples_split)
        print('iTrees ', iTrees)
        depth = 50
        maxFeat = (round((len(df.columns) / 3)))
        print('Feature Set ', maxFeat)

        fileModel = ensemble.GradientBoostingRegressor(
            learning_rate=0.01,
            n_estimators=500,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=depth,
            verbose=1)

        fileModel1 = ensemble.ExtraTreesRegressor(
            criterion='mse',
            bootstrap=False,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            n_estimators=iTrees,
            max_depth=depth,
            max_features=maxFeat,
            oob_score=False,
            random_state=531,
            verbose=1)

        fileModel2 = ensemble.RandomForestRegressor(
            n_estimators=iTrees,
            max_depth=depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            verbose=1,
            max_features=maxFeat)

        fileModel.fit(X_train.values, y_train.values)
        prediction_normal = fileModel.predict(X_test)

        print('MSE NORMAL ', mean_squared_error(y_test, prediction_normal))
        print('R2 NORMAL ', r2_score(y_test, prediction_normal))

        prediction_normal = pd.DataFrame(prediction_normal, index=y_test.index)
        prediction_normal = pd.concat([y_test, prediction_normal], axis=1)
        prediction_normal.columns = [
            'PESPANIA_REAL_NO_COLUSION', 'PESPANIA_PRED_NO_COLUSION'
        ]
        prediction_normal['DIF_PORC'] = (
            prediction_normal['PESPANIA_REAL_NO_COLUSION'] -
            prediction_normal['PESPANIA_PRED_NO_COLUSION']
        ) / prediction_normal['PESPANIA_PRED_NO_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - NO COLUSION %.5f' %
              prediction_normal['PESPANIA_PRED_NO_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - NO COLUSION %.5f ' %
              prediction_normal['PESPANIA_REAL_NO_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
              prediction_normal['DIF_PORC'].mean() * 100, '%')
        prediction_normal.to_csv('prediction_normal_dia.csv',
                                 sep=';',
                                 index=False)

        prediction_anormal = fileModel.predict(anormalX)

        print('MSE ANORMAL ', mean_squared_error(anormalY, prediction_anormal))
        print('R2 ANORMAL ', r2_score(anormalY, prediction_anormal))

        prediction_anormal = pd.DataFrame(prediction_anormal,
                                          index=anormalY.index)
        prediction_anormal = pd.concat([anormalY, prediction_anormal], axis=1)
        prediction_anormal.columns = [
            'PESPANIA_REAL_COLUSION', 'PESPANIA_PRED_COLUSION'
        ]
        prediction_anormal['DIF_PORC'] = (
            prediction_anormal['PESPANIA_REAL_COLUSION'] -
            prediction_anormal['PESPANIA_PRED_COLUSION']
        ) / prediction_anormal['PESPANIA_PRED_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - COLUSION %.5f' %
              prediction_anormal['PESPANIA_PRED_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - COLUSION %.5f' %
              prediction_anormal['PESPANIA_REAL_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
              prediction_anormal['DIF_PORC'].mean() * 100, '%')
        prediction_anormal.to_csv('prediction_anormal_dia.csv',
                                  sep=';',
                                  index=False)

        fig, ax = plot.subplots()
        sns.regplot(y='PESPANIA_PRED_COLUSION',
                    x='PESPANIA_REAL_COLUSION',
                    data=prediction_anormal,
                    ax=ax,
                    label='COLUSION')
        sns.regplot(y='PESPANIA_PRED_NO_COLUSION',
                    x='PESPANIA_REAL_NO_COLUSION',
                    data=prediction_normal,
                    ax=ax,
                    label='NON-COLUSION')
        diag_line, = ax.plot(ax.get_xlim(),
                             ax.get_ylim(),
                             ls="--",
                             c=".3",
                             label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices using ERF')
        plot.show()

        fig, ax = plot.subplots()
        prediction_anormal = prediction_anormal.reset_index()
        sns.regplot(y='PESPANIA_PRED_COLUSION',
                    x='index',
                    data=prediction_anormal,
                    ax=ax,
                    label='PREDICTED')
        sns.regplot(y='PESPANIA_REAL_COLUSION',
                    x='index',
                    data=prediction_anormal,
                    ax=ax,
                    label='REAL')
        # diag_line, = ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3", label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices')
        plot.show()

        featureImportance = fileModel.feature_importances_

        featureImportance = featureImportance / featureImportance.max()
        sorted_idx = np.argsort(featureImportance)
        fi = featureImportance[sorted_idx]
        fi = fi[-10:]
        barPos = np.arange(sorted_idx.shape[0]) + 0.5
        barPos = barPos[-10:]
        plot.barh(barPos, fi, align='center')
        fileNames = fileNames[sorted_idx]
        fileNames = fileNames[-10:]
        plot.yticks(barPos, fileNames)
        plot.xlabel('Variable Importance')
        plot.show()
        tempDF.index = pd.to_datetime(tempDF.index, yearfirst=True)

        tempDF = tempDF.resample('W').agg(np.nansum)
        tempDF.replace(0, np.nan, inplace=True)

        firstIndex = tempDF.first_valid_index()
        lastIndex = tempDF.last_valid_index()
        tempDFCut = tempDF.loc[firstIndex:lastIndex]

        tempDFCut = tempDFCut.ewm(
            span=np.max([1, len(tempDFCut.index) * ewmParameter])).mean()
        tempDF = tempDF.ewm(
            span=np.max([1, len(tempDFCut.index) * ewmParameter])).mean()

        trajectoriesSet[column] = [
            list(robust_scale(tempDFCut[column].values))
        ]
        trajectoriesSmoothOriginal[column] = list(tempDF[column].values)
        trajectoriesRaw[column] = list(data[column].fillna(0))

    maxLength = max([len(value[0]) for _, value in trajectoriesSet.items()])

    trajectoriesSetProcessed = {}
    for key, value in trajectoriesSet.items():
        value = value[0]

        if len(value) == maxLength:
            trajectoriesSetProcessed[key] = np.array(value).reshape(
                1, len(value))
            continue
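The LAGS blocks in extreme_random_byhour all repeat the same shift-and-rename pattern; a compact sketch of that pattern on a hypothetical frame (column names made up):

import pandas as pd

df = pd.DataFrame({"PESPANIA": range(10), "PRICE_GAS": range(10, 20)})

def add_lags(frame, cols, n_lags):
    # One new column per (variable, lag); rows without a full history are dropped afterwards.
    for col in cols:
        for lag in range(1, n_lags + 1):
            frame[f"{col}_lag_{lag}"] = frame[col].shift(lag)
    return frame.dropna(how="any", axis=0)

lagged = add_lags(df, ["PESPANIA", "PRICE_GAS"], n_lags=3)
print(lagged.shape)  # (7, 8): 3 leading rows lost to the longest lag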
Example #8
def investigate(FEATURES,
                PERFORMANCE,
                random_state=0,
                scale_features='maxabs',
                scale_performance=None,
                images_dir=cfg.finvestig_images_dir,
                data_dir=cfg.finvestig_data_dir,
                results_dir=cfg.finvestig_results_dir):
    '''This function is used to produce a list of the top features as chosen by
	the random forest.  For each feature, the forest calculates a score. To construct
	the most important features, all features with a score above a threshold are
	chosen. Currently, the threshold is set to 2*mean, where the mean is the arithmetic
	mean taken over the various scores.

	:param FEATURES: Features file. This should be in CSV format, with column 0
	         being the instance name and row zero being the names of the
	         features.
	:param PERFORMANCE: Performance file. This should be in CSV format, with column
	         0 being the instance name and row zero being the names of the various solvers.
	:param random_state: Specify the random seed (int) to be used in training the
	         Random Forest. default=0
	:param scale_features: There are various ways to scale the features data. The
	             scaling is done column-wise (i.e. on each feature individually).
	             default='maxabs'.

	             - maxabs = Scale to [-1,1]
	             - scale = Zero mean and unit stdev
	             - minmax = Translate and scale to [0,1]
	             - normalize = Normalize each feature to unit norm
	             - robust = Shift outliers in according to interquartile range
	:param scale_performance: There are various ways to scale the performance data.
	              The scaling is done row-wise (i.e. on each instance individually).
	              default=None.

	              - maxabs = Scale to [-1,1]
	              - scale = Zero mean and unit stdev
	              - minmax = Translate and scale to [0,1]
	              - normalize = Normalize each row to unit norm
	              - default_scale = Add 1000 to each entry, and row-wise divide by default performance
	:param images_dir: Directory to dump images.
	:param data_dir: Directory to dump data.
	:param results_dir: Directory to dump results.
	:return: The filename of every saved output automatically has the input file
	     names used to produce it.
		 
	     - Text 1: Reduced by Random Forest Regressor space.  This is a subset
	       of the original Feature space, with most important features chosen as
	       the subset.  Most important is a heuristic chosen by the Random Forest.
	       Automatically saved to CSV format in data_dir.
	'''

    ###################################################################
    # Section 1A: Grabs Data
    ###################################################################

    stamp = '%s_%s' % (os.path.basename(FEATURES).split('.')[0],
                       os.path.basename(PERFORMANCE).split('.')[0])

    with open("%s" % (FEATURES)) as f:
        reader = csv.reader(f, delimiter=",")
        data_f = list(reader)
    #instances = [os.path.basename(line[0]).split('.')[0] for line in data[1:]]
    features = [line for line in data_f[1:]]
    feature_names = [line for line in data_f[0]]

    with open("%s" % (PERFORMANCE)) as f:
        reader = csv.reader(f, delimiter=",")
        data_p = list(reader)
    performances = [line for line in data_p[1:]]

    ###################################################################
    # Section 1B: Sync up data so that only instances with both
    # feature vectors and performance data is trained on
    ###################################################################
    performances_matched = []
    features_matched = []
    instances_matched = []

    for line in features:
        instance_name = os.path.basename(line[0]).split('.')[0]
        for line in performances:
            if line[0] == instance_name:
                instances_matched.append(instance_name)

    for instance in instances_matched:
        for line in features:
            if instance == os.path.basename(line[0]).split('.')[0]:
                features_matched.append(line)

    for instance in instances_matched:
        for line in performances:
            if instance == line[0]:
                performances_matched.append(line)

    performances_tot = [line[1:] for line in performances_matched]
    features_tot = [
        line[1:-1] for line in features_matched
    ]  #the -1 here removes the empty string coming from feature selection
    # There's some string issue.  The following converts to floats:
    performances_tot = [[float(i) for i in j] for j in performances_tot]
    performances_tot = np.array(performances_tot)

    ###################################################################
    # Section 1C: Scale the feature/performance data
    ###################################################################
    # normalize = scale to unit norm
    # maxabs_scale = scale to [-1,1]
    # scale = zero mean scaled to std one

    if scale_features == 'scale':
        features_tot = preprocessing.scale(features_tot)
    elif scale_features == 'maxabs':
        features_tot = preprocessing.maxabs_scale(features_tot)
    elif scale_features == 'minmax':
        features_tot = preprocessing.minmax_scale(features_tot)
    elif scale_features == 'normalize':
        features_tot = preprocessing.normalize(features_tot)
    elif scale_features == 'robust':
        features_tot = preprocessing.robust_scale(features_tot)

    if scale_performance == 'scale':
        performances_tot = preprocessing.scale(performances_tot, axis=1)
    elif scale_performance == 'maxabs':
        performances_tot = preprocessing.maxabs_scale(performances_tot, axis=1)
    elif scale_performance == 'minmax':
        performances_tot = preprocessing.minmax_scale(performances_tot, axis=1)
    elif scale_performance == 'normalize':
        performances_tot = preprocessing.normalize(performances_tot, axis=1)
    elif scale_performance == 'default_scale':
        performances_tot = [[(float(i) + 1000) / (float(line[0]) + 1000)
                             for i in line] for line in performances_tot]

    performances_tot = np.array(performances_tot)

    ###################################################################
    # Section 2: Find the top features and save reduced feature file to txt
    ###################################################################

    # Train up a Random Forest
    rf_regress = RandomForestRegressor(max_features="sqrt",
                                       random_state=random_state,
                                       max_depth=None,
                                       n_estimators=250,
                                       verbose=0)
    rf_regress.fit(features_tot, performances_tot)

    # Feature Selection
    selector = SelectFromModel(rf_regress, prefit=True, threshold='2*mean')
    Indices = selector.get_support(indices=True)
    top_features = [feature_names[index + 1] for index in Indices]
    np.savetxt('%s/rfr_top_features_%s.txt' % (results_dir, stamp),
               top_features,
               fmt='%s')

    DATA = []
    header = [
        'name',
    ]
    header.extend([i for i in top_features])
    DATA.append(header)

    for line in instances_matched:
        DATA.append([line])

    a = len(instances_matched)
    for j in range(len(data_f[0])):
        for feature in top_features:
            if data_f[0][j] == feature:
                for k in range(a):
                    DATA[k + 1].extend([data_f[k + 1][j]])

    with open('%s/%s_reduced-byRFR.csv' % (data_dir, stamp), 'w') as f:
        writer = csv.writer(f)
        writer.writerows(DATA)
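The feature-ranking step above reduces to SelectFromModel with a '2*mean' importance threshold on the fitted forest; a stripped-down sketch on synthetic data (shapes and names are invented):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 20))  # 200 instances, 20 candidate features
y = 3 * X[:, 0] + X[:, 1] - X[:, 2] + rng.normal(scale=0.1, size=200)

forest = RandomForestRegressor(n_estimators=250, max_features="sqrt", random_state=0).fit(X, y)
selector = SelectFromModel(forest, prefit=True, threshold="2*mean")
kept = selector.get_support(indices=True)  # indices of features whose importance exceeds 2*mean
print("kept feature indices:", kept)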
Example #9
    def fit(self):
        """Extract features from data.

        Returns
        -------
        self : returns an instance of self.
        """
        #######################################################################
        # MAIN PARAMETERS
        #######################################################################

        # Bandpass filter
        freq_broad = (0.4, 30)
        # FFT & bandpower parameters
        win_sec = 5  # = 2 / freq_broad[0]
        sf = self.sf
        win = int(win_sec * sf)
        kwargs_welch = dict(window='hamming', nperseg=win, average='median')
        bands = [(0.4, 1, 'sdelta'), (1, 4, 'fdelta'), (4, 8, 'theta'),
                 (8, 12, 'alpha'), (12, 16, 'sigma'), (16, 30, 'beta')]

        #######################################################################
        # HELPER FUNCTIONS
        #######################################################################

        def nzc(x):
            """Calculate the number of zero-crossings along the last axis."""
            return ((x[..., :-1] * x[..., 1:]) < 0).sum(axis=1)

        def mobility(x):
            """Calculate Hjorth mobility on the last axis."""
            return np.sqrt(np.diff(x, axis=1).var(axis=1) / x.var(axis=1))

        def petrosian(x):
            """Calculate the Petrosian fractal dimension on the last axis."""
            n = x.shape[1]
            ln10 = np.log10(n)
            diff = np.diff(x, axis=1)
            return ln10 / (ln10 + np.log10(n / (n + 0.4 * nzc(diff))))

        #######################################################################
        # CALCULATE FEATURES
        #######################################################################

        features = []

        for i, c in enumerate(self.ch_types):
            # Preprocessing
            # - Filter the data
            dt_filt = filter_data(self.data[i, :],
                                  sf,
                                  l_freq=freq_broad[0],
                                  h_freq=freq_broad[1],
                                  verbose=False)
            # - Extract epochs. Data is now of shape (n_epochs, n_samples).
            times, epochs = sliding_window(dt_filt, sf=sf, window=30)

            # Calculate standard descriptive statistics
            hmob = mobility(epochs)

            feat = {
                'std': np.std(epochs, ddof=1, axis=1),
                'iqr': sp_stats.iqr(epochs, rng=(25, 75), axis=1),
                'skew': sp_stats.skew(epochs, axis=1),
                'kurt': sp_stats.kurtosis(epochs, axis=1),
                'nzc': nzc(epochs),
                'hmob': hmob,
                'hcomp': mobility(np.diff(epochs, axis=1)) / hmob
            }

            # Calculate spectral power features (for EEG + EOG)
            freqs, psd = sp_sig.welch(epochs, sf, **kwargs_welch)
            if c != 'emg':
                bp = bandpower_from_psd_ndarray(psd, freqs, bands=bands)
                for j, (_, _, b) in enumerate(bands):
                    feat[b] = bp[j]

            # Add power ratios for EEG
            if c == 'eeg':
                delta = feat['sdelta'] + feat['fdelta']
                feat['dt'] = delta / feat['theta']
                feat['ds'] = delta / feat['sigma']
                feat['db'] = delta / feat['beta']
                feat['at'] = feat['alpha'] / feat['theta']

            # Add total power
            idx_broad = np.logical_and(freqs >= freq_broad[0],
                                       freqs <= freq_broad[1])
            dx = freqs[1] - freqs[0]
            feat['abspow'] = np.trapz(psd[:, idx_broad], dx=dx)

            # Calculate entropy and fractal dimension features
            feat['perm'] = np.apply_along_axis(ent.perm_entropy,
                                               axis=1,
                                               arr=epochs,
                                               normalize=True)
            feat['higuchi'] = np.apply_along_axis(ent.higuchi_fd,
                                                  axis=1,
                                                  arr=epochs)
            feat['petrosian'] = petrosian(epochs)

            # Convert to dataframe
            feat = pd.DataFrame(feat).add_prefix(c + '_')
            features.append(feat)

        #######################################################################
        # SMOOTHING & NORMALIZATION
        #######################################################################

        # Save features to dataframe
        features = pd.concat(features, axis=1)
        features.index.name = 'epoch'

        # Apply centered rolling average (11 epochs = 5 min 30)
        # Triang: [1/6, 2/6, 3/6, 4/6, 5/6, 6/6 (X), 5/6, 4/6, 3/6, 2/6, 1/6]
        rollc = features.rolling(window=11,
                                 center=True,
                                 min_periods=1,
                                 win_type='triang').mean()
        rollc[rollc.columns] = robust_scale(rollc, quantile_range=(5, 95))
        rollc = rollc.add_suffix('_c5min_norm')

        # Now look at the past 5 minutes
        rollp = features.rolling(window=10, min_periods=1).mean()
        rollp[rollp.columns] = robust_scale(rollp, quantile_range=(5, 95))
        rollp = rollp.add_suffix('_p5min_norm')

        # Add to current set of features
        features = features.join(rollc).join(rollp)

        #######################################################################
        # TEMPORAL + METADATA FEATURES AND EXPORT
        #######################################################################

        # Add temporal features
        features['time_hour'] = times / 3600
        features['time_norm'] = times / times[-1]

        # Add metadata if present
        if self.metadata is not None:
            for c in self.metadata.keys():
                features[c] = self.metadata[c]

        # Downcast float64 to float32 (to reduce size of training datasets)
        cols_float = features.select_dtypes(np.float64).columns.tolist()
        features[cols_float] = features[cols_float].astype(np.float32)
        # Make sure that age and sex are encoded as int
        if 'age' in features.columns:
            features['age'] = features['age'].astype(int)
        if 'male' in features.columns:
            features['male'] = features['male'].astype(int)

        # Sort the column names here (same behavior as lightGBM)
        features.sort_index(axis=1, inplace=True)

        # Add to self
        self._features = features
        self.feature_name_ = self._features.columns.tolist()
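A small, self-contained illustration of the smoothing/normalization step used above: a centered triangular rolling mean followed by robust_scale with a 5-95 quantile range (random stand-in features; win_type='triang' needs scipy):

import numpy as np
import pandas as pd
from sklearn.preprocessing import robust_scale

feats = pd.DataFrame(np.random.RandomState(1).normal(size=(120, 3)),
                     columns=["eeg_std", "eeg_iqr", "eeg_beta"])

rollc = feats.rolling(window=11, center=True, min_periods=1, win_type="triang").mean()
rollc[rollc.columns] = robust_scale(rollc, quantile_range=(5, 95))
rollc = rollc.add_suffix("_c5min_norm")

feats = feats.join(rollc)
print(feats.columns.tolist())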
Example #10
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from numpy import linalg
import pdb

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)

## Scaling the data
data = np.array(robust_scale(data))
scaler = MinMaxScaler()
scaler.fit(data)

print("Data shape: " + str(data.shape))
    
###Remove with KMeans

outlier_remover = KMeans(n_clusters=1)
outlier_remover.fit(data)

cluster_center = np.array(outlier_remover.cluster_centers_[0])
errors = np.sqrt(((data-cluster_center) ** 2).sum(1)).reshape(-1,1)
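The snippet stops right after computing each point's distance to the single cluster center; one plausible (hypothetical) way to finish the KMeans-based removal is to drop the farthest few percent of points:

# Hypothetical continuation: treat the farthest 5% of points as outliers and drop them.
cutoff = np.percentile(errors, 95)
keep = (errors < cutoff).ravel()
data = data[keep]
print("Data shape after outlier removal: " + str(data.shape))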

###Remove with linear regression
Example #11
File: loan_model.py Project: pmack1/ML
mean_list = list(set(columns) - set(['MonthlyIncome', 'NumberOfDependents']))
process.fill_missing(df_train = X, df_test = df_test, mean = mean_list, median = 'MonthlyIncome', mode = ['NumberOfDependents'])

#### if skew above 2 add a log feature
for col in X.columns:
    process.transform(X,col)
    process.transform(df_test,col)


# #### Model
classifier.try_models(X,y, ['LR'])
# classifier.try_models(X,y, ['LR', 'RF', 'SGD', 'DT', 'GB', 'AB'])

#### scale data for KNN model
X_scale = X.copy()
X_scale[X_scale.columns] = preprocessing.robust_scale(X_scale)  # robust_scale returns a new array; assign it back
classifier.try_models(X_scale, y, ['KNN'])

#Separate training and testing datasets
dfX = df[[
    "O3", "NO2", "SO2", "china", "china1", "china2", "Wind velocity(m/s)",
    "Wind direction_NW", "Wind direction_S", "Wind direction_SE"
]]
dfy = df["PM10"]

#split training and testing sets
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(dfX,
                                                            dfy,
                                                            test_size=0.3,
                                                            random_state=0)
dfX_train.shape, dfX_test.shape, dfy_train.shape, dfy_test.shape

#scale
dfX_train = robust_scale(dfX_train)
dfX_test = robust_scale(dfX_test)
dfy_train = robust_scale(dfy_train)
dfy_test = robust_scale(dfy_test)
#print LinearRegression equation E
model = linear_model.LinearRegression().fit(dfX_train, dfy_train)
print("\n<Linear regression equation E - Training Data Set>")
j = 0
for i in dfX.columns:
    print(i, ": E = a(", "%0.5f" % float(model.coef_[j]), ")+",
          "%0.5f" % float(model.intercept_))
    j = j + 1

y_predict = model.predict(dfX_test)
#MSE calculation
print("\nMSE:", mean_squared_error(dfy_test, y_predict))
Example #13
import numpy as np
import pandas as pd
from sklearn.preprocessing import robust_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from scipy.stats import mode
import warnings
from mpl_toolkits.mplot3d import Axes3D

# suppress warning
warnings.simplefilter("ignore")
# load and preprocess data
data = pd.read_csv("./data.csv", sep=';')
x = data.iloc[:, :-1]
y = data.iloc[:, -1].to_numpy()
x = robust_scale(x)
data_num = len(y)
select = np.random.choice(data_num, data_num // 5)
train_x = x[np.delete(np.arange(data_num), select)]
train_y = y[np.delete(np.arange(data_num), select)]
test_x = x[select]
test_y = y[select]

# use multiple methods
classifier1 = KNeighborsClassifier()
classifier2 = MLPClassifier()
classifier3 = SVC()
classifier4 = DecisionTreeClassifier()
classifier1.fit(train_x, train_y)
classifier2.fit(train_x, train_y)
classifier3.fit(train_x, train_y)
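The imports of mode and accuracy_score suggest the example continues with a majority vote over the fitted classifiers; a hedged sketch of that next step (classifier4 would need its own fit call before joining the vote):

votes = np.stack([clf.predict(test_x) for clf in (classifier1, classifier2, classifier3)])
majority = mode(votes, axis=0).mode.ravel()  # element-wise majority vote across classifiers
print("ensemble accuracy:", accuracy_score(test_y, majority))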
Example #14
def robust_scale_data(data, target_cols):
    #scaler = RobustScaler().fit(data[target_cols])
    #data[target_cols] = scaler.transform(data[target_cols])
    data[target_cols] = robust_scale(data[target_cols])
    return data
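The commented-out lines hint at the stateful alternative; a short sketch of that pattern, with hypothetical column names, where the scaler is fitted on training rows only and reused on later data:

import pandas as pd
from sklearn.preprocessing import RobustScaler

train = pd.DataFrame({"income": [10.0, 12.0, 500.0], "age": [20.0, 30.0, 40.0]})
test = pd.DataFrame({"income": [11.0, 13.0], "age": [25.0, 35.0]})
target_cols = ["income", "age"]

scaler = RobustScaler().fit(train[target_cols])           # statistics come from training data only
train[target_cols] = scaler.transform(train[target_cols])
test[target_cols] = scaler.transform(test[target_cols])   # same median/IQR applied to unseen data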
Example #15
def mergeData(file_path_in, file_path_out, is_regression):
    print "Merge data..."

    # Check if directory exist
    for p in file_path_out:
        checkAndCreateDir(p)

    # Merge edsr data
    df_esdr = mergeEsdrData(file_path_in[0])

    # Aggregate ESDR data
    b_hr = 4  # how many hours to look back
    df_esdr = aggregateEsdrData(df_esdr, b_hr)
    idx = df_esdr["EpochTime"].values

    # Aggregate smell data
    b_hr = 4  # how many hours to look back
    f_hr = [-2, 2]  # how many hours to look further
    bin_smell = None if is_regression else [
        10
    ]  # bin smell reports into labels or not
    df_smell, df_smell_raw, bow_smell = aggregateSmellData(
        file_path_in[1], idx, b_hr, f_hr, bin_smell, 3, 5)
    df_bow_smell = pd.DataFrame.from_dict(bow_smell,
                                          orient="index").reset_index()
    df_bow_smell.columns = ["word", "count"]

    # Merge esdr, smell, and tracker data
    df = pd.merge_ordered(df_esdr,
                          df_smell,
                          on="EpochTime",
                          how="outer",
                          fill_method=None)
    df = pd.merge_ordered(df,
                          df_tracker,
                          on="EpochTime",
                          how="outer",
                          fill_method=None)
    df = df.dropna().reset_index(drop=True)

    # Sort by epoch time
    df = df.sort_values("EpochTime")

    # Drop data points before Oct 6th 2016 (the app released date)
    df = df[df["EpochTime"] >= 1475726400].reset_index(drop=True)

    # Compute columns of days of the week and hours of the day
    df_datetime = pd.to_datetime(df["EpochTime"], unit="s")
    df_hd = df_datetime.dt.hour
    df_dw = df_datetime.dt.dayofweek

    # Compute sample weights
    df_w, df_freq = computeSampleWeights(df_smell_raw, df_hd, df_dw)

    # Drop the epochtime column
    df.drop("EpochTime", axis=1, inplace=True)

    # Prevent extreme small values
    df[df < 1e-6] = 0
    df_w[df_w < 1e-6] = 0

    # Transformed data points
    df_tran = pd.DataFrame(preprocessing.robust_scale(df), columns=df.columns)
    df_tran = df_tran.round(6)
    df_tran["NumberOfSmellReports"] = df["NumberOfSmellReports"]

    # Add days of week and hours of day
    df["DayOfWeek"] = df_dw
    df["HourOfDay"] = df_hd
    df_tran["DayOfWeek"] = df_dw
    df_tran["HourOfDay"] = df_hd

    # Write dataframe into a csv file
    df.to_csv(file_path_out[0])
    df_tran.to_csv(file_path_out[1])
    df_w.to_csv(file_path_out[2])
    df.corr().to_csv(file_path_out[3])
    df_freq.to_csv(file_path_out[4])
    df_bow_smell.to_csv(file_path_out[5])
    print "Dataset created at " + file_path_out[0]
    print "Transformed dataset created at " + file_path_out[1]
    print "Sample weights created at " + file_path_out[2]
    print "Original correlations created at " + file_path_out[3]
    print "Frequency of data points created at " + file_path_out[4]
    print "Bag of words for smell description created at " + file_path_out[5]
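preprocessing.robust_scale returns a bare ndarray, so the transformed frame above is rebuilt with the original column names and the raw target is copied back in; the same wrap-and-round pattern in isolation, on a hypothetical frame:

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"pm25": [3.0, 4.5, 120.0],
                   "wind": [1.0, 2.0, 3.0],
                   "NumberOfSmellReports": [0, 2, 5]})
df_tran = pd.DataFrame(preprocessing.robust_scale(df), columns=df.columns).round(6)
df_tran["NumberOfSmellReports"] = df["NumberOfSmellReports"]  # keep the target in its raw units
print(df_tran)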
Example #16
import os, sys
import json, csv
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn import cluster, preprocessing
import numpy as np

"""
argument [path_to_csv]
"""

def kmeans(X):
	k_means = cluster.KMeans(15)
	k_means.fit(X)
	print k_means.labels_[::10]

if __name__ == '__main__':
	# data = np.genfromtxt(sys.argv[1], delimiter=',')
	data = pd.read_csv(sys.argv[1]).fillna('0')
	data = data.to_dict(orient='records')

	vec = DictVectorizer()
	featureMatrix = np.array(vec.fit_transform(data).toarray())

	X_scaled = preprocessing.robust_scale(featureMatrix)
	print "\nDimensions of feature matrix: ", X_scaled.shape

	for i in X_scaled:
		print i 

	kmeans(X_scaled)
	# ocsvm(X_scaled)
Example #17
    def xform_data(self, df):
        """
        Some special handling of the price data. First, we don't want prices to be absolute, since we want the agent
        to learn actions _relative_ to states; that is, states need to be transformed into "relative" somehow. This
        is called a "stationary time series"; it fluctuates around y=0, like visualizing audio rather than a line graph.
        Next, we don't want absolute price changes, since that's still not relative enough (prices change in larger
        amounts when the BTC price is already large - we want to learn the pattern, not the numbers). So the solution
        is percent-changes. Now - making everything a percent-change from its past makes it so you can track that
        field's history, but you lose how it relates to the other fields in its cross-section. So here's what we do.
        Anchor all the price fields to the target (close-price), so they're relative within the cross-section. Then set
        the target to its percent-change over time. Leave the volume stuff alone, we _do_ want that absolute. Then scale
        everything. Crazy, I know; but IMO it makes sense. Hit me if you have a better idea.
        """
        columns = []
        ind_ct = self.hypers.indicators_count
        tables_ = data.get_tables(self.hypers.arbitrage)
        for table in tables_:
            for col in table['cols']:
                name_col = f'{table["name"]}_{col}'
                if name_col == data.target:
                    columns.append(self.diff(df[name_col], True))
                elif col in table['price_cols']:
                    columns.append(df[name_col] / df[data.target])
                else:
                    columns.append(df[name_col])

            # Add extra indicator columns
            ohlcv = table.get('ohlcv', {})
            if ohlcv and ind_ct:
                ind = pd.DataFrame()
                # TA-Lib requires specifically-named columns (OHLCV)
                for k, v in ohlcv.items():
                    ind[k] = df[f'{table["name"]}_{v}']

                # Sort these by effectiveness. I'm no expert, so if this seems off please submit a PR! Later after
                # you've optimized the other hypers, come back here and create a hyper for every indicator you want to
                # try (zoom in on indicators)
                best_indicators = [
                    tlib.MOM,
                    tlib.SMA,
                    # tlib.BBANDS,  # TODO signature different; special handling
                    tlib.RSI,
                    tlib.EMA,
                    tlib.ATR
                ]
                for i in range(ind_ct):
                    columns.append(best_indicators[i](
                        ind, timeperiod=self.hypers.indicators_window) /
                                   df[data.target])

        states = np.column_stack(columns)
        prices = df[data.target].values

        # Remove padding at the start of all data. Indicators are aggregate fns, so don't count until we have
        # that much historical data
        if ind_ct:
            states = states[self.hypers.indicators_window:]
            prices = prices[self.hypers.indicators_window:]

        # Pre-scale all price actions up-front, since they don't change. We'll scale changing values real-time elsewhere
        states = preprocessing.robust_scale(states, quantile_range=(1., 99.))

        # Reducing the dimensionality of our states (OHLCV + indicators + arbitrage => 5 or 6 weights)
        # because TensorForce's memory branch changed Policy Gradient models' batching from timesteps to episodes.
        # This takes of way too much GPU RAM for us, so we had to cut back in quite a few areas (num steps to train
        # per episode, episode batch_size, and especially states:
        if self.cli_args.autoencode:
            ae = AutoEncoder()
            states = ae.fit_transform_tied(states)

        return states, prices
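A toy sketch of the transformation the docstring describes (made-up OHLCV-style columns, with close standing in for data.target and pct_change standing in for the project's diff helper): price columns are anchored to the target, the target becomes a percent change over time, volume stays absolute, and everything is robust-scaled with the same 1-99 quantile range:

import numpy as np
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"open": [100.0, 102.0, 101.0, 105.0],
                   "close": [101.0, 101.5, 104.0, 106.0],
                   "volume": [10.0, 12.0, 9.0, 11.0]})

columns = [
    df["close"].pct_change().fillna(0.0),  # target as percent-change over time
    df["open"] / df["close"],              # other price fields relative to the target
    df["volume"],                          # volume stays absolute
]
states = np.column_stack(columns)
states = preprocessing.robust_scale(states, quantile_range=(1., 99.))
print(states.shape)  # (4, 3)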
Example #18
def standardize(train_X, valid_X, test_X, scaling=1, robust=0):
    # FIXME: standardization process can be performed:
    # 1. in terms of column (input variables)
    # 2. in terms of row (input cases)
    # when different features are in different scales, we need to perform 1
    # when different features are in the same scale, but different cases have extraneous variations, such as exposure, volume, dynamics, etc., we need to perform 2.
    # in the usage of ACE data preprocessing, we need to standardize in terms of cases rather than variables
    # see http://www.faqs.org/faqs/ai-faq/neural-nets/part2/section-16.html for details
    # for a general workable piece of code, we need to implement both kinds of scaling
    ###########################################
    # standardization:
    # for every feature component x_i compute the mean(mu) and std(sigma) w.r.t. the whole training X
    # update x_i <= (x_i - mu) / sigma
    # standardize the validation and test set using the same mu and sigma of the training X's
    # mu = numpy.mean(train_X,axis=0)
    # sigma = numpy.std(train_X,axis=0)
    # print "mu %d"%mu.shape
    # print "sigma %d"%sigma.shape
    ## python will automatically broadcast the row vector to the whole matrix, no worry
    # train_X = (train_X - mu) / sigma
    # valid_X = (valid_X - mu) / sigma
    # test_X = (test_X - mu) / sigma
    ##########################################

    # scaling = -1: not scaling at all;
    # scaling = 0, perform standardization along axis=0 - scaling input variables
    # scaling = 1, perform standardization along axis=1 - scaling input cases
    if scaling == 0:
        '''
        for every feature component x_i compute the mean(mu) and std(sigma) w.r.t. X(:,i)
        update x_i <= (x_i - mu) / sigma
        standardize the validation and test set using the same mu and sigma of the training X's
        both valid and test set need to reuse the scaler of training set
        '''
        if robust == 1:
            scaler = preprocessing.RobustScaler().fit(train_X)
        else:
            scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        valid_X = scaler.transform(valid_X)
        test_X = scaler.transform(test_X)
    elif scaling == 1:
        '''
        for every training case X compute the mean(mu) and std(sigma) w.r.t. X(i,:)
        update X_i <= (X_i - mu) / sigma
        standardize the validation and test set using the same mu and sigma of the training X's
        training, valid and test set are independently scaled, since scaling is performed in terms of cases
        '''
        if robust == 1:
            train_X = preprocessing.robust_scale(train_X, axis=1)
            valid_X = preprocessing.robust_scale(valid_X, axis=1)
            test_X = preprocessing.robust_scale(test_X, axis=1)
        else:
            train_X = preprocessing.scale(train_X, axis=1)
            valid_X = preprocessing.scale(valid_X, axis=1)
            test_X = preprocessing.scale(test_X, axis=1)
    '''
    if scaling == 1:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        valid_X = scaler.transform(valid_X)
        test_X = scaler.transform(test_X)
    elif scaling == 2:
    # [0,1] scaling
        min_max_scaler = preprocessing.MinMaxScaler().fit(train_X)
        train_X = min_max_scaler.transform(train_X)
        valid_X = min_max_scaler.transform(valid_X)
        test_X = min_max_scaler.transform(test_X)
    elif scaling == 3:
        # [-1,1] scaling
        max_abs_scaler = preprocessing.MaxAbsScaler().fit(train_X)
        train_X = max_abs_scaler.transform(train_X)
        valid_X = max_abs_scaler.transform(valid_X)
        test_X = max_abs_scaler.transform(test_X)
    '''
    return train_X, valid_X, test_X
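A hedged usage sketch of standardize on random arrays, showing the two axis choices and the robust switch (it assumes the module-level "from sklearn import preprocessing" that the function itself relies on):

import numpy as np

rng = np.random.RandomState(0)
train_X = rng.normal(size=(100, 8))
valid_X = rng.normal(size=(20, 8))
test_X = rng.normal(size=(20, 8))

# scaling=0: per-variable scaling; one RobustScaler is fit on train and reused for valid/test.
tr0, va0, te0 = standardize(train_X, valid_X, test_X, scaling=0, robust=1)

# scaling=1: per-case scaling; each row is scaled independently, so the three sets share no statistics.
tr1, va1, te1 = standardize(train_X, valid_X, test_X, scaling=1, robust=1)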
Example #19
File: prepro.py Project: clementhtn/nature
 def normalize(self, X):
     return preprocessing.robust_scale(X)
예제 #20
0
def standardize(train_X, valid_X, test_X, scaling=1, robust=0):
    # FIXME: standardization process can be performed:
    # 1. in terms of column (input variables)
    # 2. in terms of row (input cases)
    # when different features are in different scales, we need to perform 1
    # when different features are in the same scale, but different cases have extraneous variations, such as exposure, volume, dynamics, etc., we need to perform 2.
    # in the usage of ACE data preprocessing, we need to standardize in terms of cases rather than variables
    # see http://www.faqs.org/faqs/ai-faq/neural-nets/part2/section-16.html for details
    # for a general workable piece of code, we need to implement both kinds of scaling
    ###########################################
    # standardization:
    # for every feature component x_i compute the mean(mu) and std(sigma) w.r.t. the whole training X
    # update x_i <= (x_i - mu) / sigma
    # standardize the validation and test set using the same mu and sigma of the training X's
    # mu = numpy.mean(train_X,axis=0)
    # sigma = numpy.std(train_X,axis=0)
    # print "mu %d"%mu.shape
    # print "sigma %d"%sigma.shape
    ## python will automatically broadcast the row vector to the whole matrix, no worry
    # train_X = (train_X - mu) / sigma
    # valid_X = (valid_X - mu) / sigma
    # test_X = (test_X - mu) / sigma
    ##########################################
    
    # scaling = -1: not scaling at all;
    # scaling = 0, perform standardization along axis=0 - scaling input variables
    # scaling = 1, perform standardization along axis=1 - scaling input cases
    if scaling == 0:
        '''
        for every feature component x_i compute the mean(mu) and std(sigma) w.r.t. X(:,i)
        update x_i <= (x_i - mu) / sigma
        standardize the validation and test set using the same mu and sigma of the training X's
        both valid and test set need to reuse the scaler of training set
        '''
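        # RobustScaler centers each feature on its median and scales by its IQR, so it is far less sensitive to outliers than StandardScaler (mean / std)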
        if robust == 1:
            scaler = preprocessing.RobustScaler().fit(train_X)
        else:
            scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        valid_X = scaler.transform(valid_X)
        test_X = scaler.transform(test_X)
    elif scaling == 1:
        '''
        for every training case X compute the mean(mu) and std(sigma) w.r.t. X(i,:)
        update X_i <= (X_i - mu) / sigma
        standardize the validation and test set using the same mu and sigma of the training X's
        training, valid and test set are independently scaled, since scaling is performed in terms of cases
        '''
        if robust == 1:
            train_X = preprocessing.robust_scale(train_X, axis=1)
            valid_X = preprocessing.robust_scale(valid_X, axis=1)
            test_X = preprocessing.robust_scale(test_X, axis=1)
        else:
            train_X = preprocessing.scale(train_X, axis=1)
            valid_X = preprocessing.scale(valid_X, axis=1)
            test_X = preprocessing.scale(test_X, axis=1)
    '''
    if scaling == 1:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        valid_X = scaler.transform(valid_X)
        test_X = scaler.transform(test_X)
    elif scaling == 2:
    # [0,1] scaling
        min_max_scaler = preprocessing.MinMaxScaler().fit(train_X)
        train_X = min_max_scaler.transform(train_X)
        valid_X = min_max_scaler.transform(valid_X)
        test_X = min_max_scaler.transform(test_X)
    elif scaling == 3:
        # [-1,1] scaling
        max_abs_scaler = preprocessing.MaxAbsScaler().fit(train_X)
        train_X = max_abs_scaler.transform(train_X)
        valid_X = max_abs_scaler.transform(valid_X)
        test_X = max_abs_scaler.transform(test_X)
    '''
    return train_X, valid_X, test_X
print(X.shape)
print(X2.shape)
raw_data, raw_target = X2, y
print(raw_data.shape)

# In[13]:

train, test, train_t, test_t = train_test_split(X2,
                                                y,
                                                test_size=0.3,
                                                random_state=random_state,
                                                stratify=y)

train = preprocessing.robust_scale(train,
                                   axis=0,
                                   with_centering=True,
                                   with_scaling=True,
                                   quantile_range=(25.0, 75.0),
                                   copy=True)
#
test = preprocessing.robust_scale(test,
                                  axis=0,
                                  with_centering=True,
                                  with_scaling=True,
                                  quantile_range=(25.0, 75.0),
                                  copy=True)
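# NOTE: train and test are robust-scaled independently here; fitting a RobustScaler on the training split and reusing it on the test split would keep test statistics out of the transform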

# In[14]:

learning_rate = 0.1

#tun_range1 = {'n_estimators':range(20,81,10)}
예제 #22
0
        plt.xlabel("Y Test")
        plt.ylabel("Y Predicted")
        plt.xlim(lim_lower, lim_upper)
        plt.ylim(lim_lower, lim_upper)
        plt.plot([lim_lower, lim_upper], [lim_lower, lim_upper], ls="--")
        plt.savefig("%s.svg" % (name), format='svg', dpi=300)
        plt.clf()


# READ DATA SET
df = pd.read_csv("drug_descriptors.txt")

names = [i for i in df.columns[5:]]
#print names

df[names] = robust_scale(df[names])
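# each descriptor column is centered on its median and divided by its interquartile range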
#print df.head(5)
X = df.loc[:, names]

# ANALYZE DESCRIPTOR CORRELATIONS
corr_X = np.corrcoef(X, rowvar=False)  # columns (descriptors) are the variables, rows are the samples
of = open("corr.csv", "w")
of.write(", ")
for i in range(len(names)):
    of.write("%20s, " % (names[i]))
of.write("\n")
for i in range(len(names)):
    of.write("%20s, " % (names[i]))
    for j in range(len(names)):
        of.write("%8.5f, " % (corr_X[i][j]))
    of.write("\n")
예제 #23
0
#page 253: data scaling transforms (minmax_scale)

#minmax_scale(): rescale the data using its minimum and maximum values
ds_minmax_scale = minmax_scale(fit_numeric)
ds_minmax_scale = pd.DataFrame(ds_minmax_scale, columns=fit_numeric.columns)
ds_minmax_scale.head()

#summary statistics
ds_minmax_scale_describe = ds_minmax_scale.describe()
ds_minmax_scale_describe.round(3)

# In[107]:

#page 254
#robust_scale(): data transformation function
ds_robust_scale = robust_scale(fit_numeric)
ds_robust_scale = pd.DataFrame(ds_robust_scale, columns=fit_numeric.columns)
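#robust_scale centers each column on its median and scales by the IQR, so the medians in the summary below are ~0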
ds_robust_scale.head()

#summary statistics
ds_robust_scale_describe = ds_robust_scale.describe()
ds_robust_scale_describe.round(3)

# In[108]:

#page 255
#compare the Scale, Robust, and MinMax scale transforms
ds_rstpulse = pd.DataFrame()
ds_rstpulse["Raw"] = ds_fitness["RSTPULSE"]
ds_rstpulse["Scale"] = ds_scale["RSTPULSE"]
ds_rstpulse["Robust"] = ds_robust_scale["RSTPULSE"]
예제 #24
0
    def __init__(self,
                 ep_len=5000,
                 window=300,
                 arbitrage=False,
                 indicators={}):
        self.ep_len = ep_len
        self.window = window
        self.arbitrage = arbitrage
        self.indicators = indicators

        self.ep_stride = ep_len  # disjoint
        # self.ep_stride = 100  # overlap; shift each episode by x seconds.
        # TODO overlapping stride would cause test/train overlap. Tweak it so train can overlap data, but test gets silo'd

        col_renames = {
            'Timestamp': 'timestamp',
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume_(BTC)': 'volume_btc',
            'Volume_(Currency)': 'volume',
            'Weighted_Price': 'vwap'
        }

        filenames = {
            # 'bitstamp': 'bitstampUSD_1-min_data_2012-01-01_to_2018-06-27.csv',
            'coinbase': 'coinbaseUSD_1-min_data_2014-12-01_to_2018-06-27.csv',
            # 'coincheck': 'coincheckJPY_1-min_data_2014-10-31_to_2018-06-27.csv'
        }
        primary_table = 'coinbase'
        self.target = f"{primary_table}_close"

        df = None
        for table, filename in filenames.items():
            df_ = pd.read_csv(
                path.join(path.dirname(__file__), 'bitcoin-historical-data',
                          filename))
            col_renames_ = {k: f"{table}_{v}" for k, v in col_renames.items()}
            df_ = df_.rename(columns=col_renames_)
            ts = f"{table}_timestamp"
            df_[ts] = pd.to_datetime(df_[ts], unit='s')
            df_ = df_.set_index(ts)
            df = df_ if df is None else df.join(df_)

        # data before 2015 is too quiet to be worth training on; copy() avoids pandas SettingWithCopy warnings
        df = df.loc['2015':].copy()

        df['month'] = df.index.month
        df['day'] = df.index.day
        df['hour'] = df.index.hour

        # TODO drop null rows? (inner join?)
        # TODO arbitrage
        # TODO indicators

        diff_cols = [
            f"{table}_{k}"
            for k in 'open high low close volume_btc volume vwap'.split(' ')
            for table in filenames.keys()
        ]
        df[diff_cols] = df[diff_cols].pct_change()\
            .replace([np.inf, -np.inf], np.nan)\
            .ffill()  # .bfill()?
        df = df.iloc[1:]
        target = df[self.target]  # don't scale price changes; we use that in raw form later
        df = pd.DataFrame(robust_scale(df.values,
                                       quantile_range=(.1, 100 - .1)),
                          columns=df.columns,
                          index=df.index)
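        # quantile_range=(0.1, 99.9): the scale is the spread between the 0.1th and 99.9th percentiles, so only the most extreme ticks are ignored; centering still uses the median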
        df[self.target] = target

        df['cash'], df['value'] = 0., 0.

        self.df = df
예제 #25
0
def ert_by_hour_by_auction(df, evaluate_var):
    # Log of prices
    # df['PESPANIA'] = np.log(df['PESPANIA'])
    # df['PPORTUGAL'] = np.log(df['PPORTUGAL'])
    del df['PPORTUGAL']

    df['FECHA'] = df['ANIO'].map(str) + '-' + df['MES'].map(
        str) + '-' + df['DIA'].map(str)
    df['FECHA'] = pd.to_datetime(df['FECHA'], format='%Y-%m-%d')

    df['WEEKDAY'] = df['FECHA'].dt.dayofweek

    # df['DUMMY_2010_REGIMEN'] = pd.Series(0, index=df.index)
    # df.loc[df['FECHA'] >= '2010-01-01', 'DUMMY_2010_REGIMEN'] = 1
    # df = df[df['FECHA'] >= '2010-01-01']

    df['ANIO'] = df['ANIO'].map(int)
    df['MES'] = df['MES'].map(int)
    df['DIA'] = df['DIA'].map(int)

    # df = df.groupby(['FECHA']).mean().reset_index()

    del df['HORA']
    del df['FECHA_HORA']
    del df['DIA']

    # TARGET VARIABLE

    dummy_important = [
        'DUMMY_5_DAY', 'DUMMY_10_DAY', 'DUMMY_15_DAY', 'DUMMY_20_DAY',
        'DUMMY_30_DAY'
    ]
    dummy_important.remove(evaluate_var)
    for i in dummy_important:
        del df[i]

    # DIFFERENTIATE

    # Difference PESPANIA
    '''
    df['PESPANIA'] = df['PESPANIA'] - df['PESPANIA'].shift(1)
    df = df.dropna(axis=0)

    # Difference the remaining variables

    need_differenciation = ['TOTAL_PRODUCCION_POR', 'TOTAL_DEMANDA_POR', 'CICLO_COMBINADO', 'FUEL_PRIMA',
                            'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM', 'TME_MADRID', 'TMAX_MADRID', 'TME_BCN',
                            'TMAX_BCN', 'TMIN_BCN', 'GDP']


    for i in need_differenciation:
        name = 'D_' + str(i)
        df[name] = df[i] - df[i].shift(1)
        del df[i]

    df = df.dropna()
    '''

    # DUMMIES
    dummy_var = ['ANIO', 'MES', 'WEEKDAY']
    for i in dummy_var:
        name = str(i)
        dummy = pd.get_dummies(df[i], prefix=name)
        df = pd.concat([df, dummy], axis=1)
        del dummy
        del df[i]

    # LAGS
    lag_AR = 24
    for i in range(1, lag_AR + 1, 1):
        name = 'PESPANIA_lag_' + str(i)
        df[name] = df['PESPANIA'].shift(i)

    lag_number = 24
    lag_variables = [
        'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES', 'TOTAL_DEMANDA_NAC_ES',
        'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES', 'TOTAL_POT_IND_ES',
        'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR',
        'CARBON NACIONAL', 'CARBON_IMPO', 'CICLO_COMBINADO', 'FUEL_SIN_PRIMA',
        'FUEL_PRIMA', 'REG_ESPECIAL', 'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM'
    ]

    for i in range(1, lag_number, 1):
        for j in lag_variables:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    lag_number = 24
    climaticas = [
        'TME_MADRID', 'TMAX_MADRID', 'TMIN_MADRID', 'PP_MADRID', 'TME_BCN',
        'TMAX_BCN', 'TMIN_BCN', 'PP_BCN'
    ]
    for i in range(1, lag_number + 1, 1):
        for j in climaticas:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    lag_number = 24
    portugal = ['TOTAL_DEMANDA_POR', 'TOTAL_PRODUCCION_POR']
    for i in range(1, lag_number + 1, 1):
        for j in portugal:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)

    df = df.dropna(how='any', axis=0)

    normal = df[df[evaluate_var] == 0]
    anormal = df[df[evaluate_var] == 1]

    del normal[evaluate_var]
    del anormal[evaluate_var]

    # NORMALIZE
    # scale every column except the date, keeping the original column names and index
    column_names = [c for c in normal.columns if c != 'FECHA']
    normal_scaled = preprocessing.robust_scale(normal.drop('FECHA', axis=1).values)
    normal = pd.concat([pd.DataFrame(normal_scaled, columns=column_names, index=normal.index),
                        normal[['FECHA']]], axis=1)

    column_names = [c for c in anormal.columns if c != 'FECHA']
    anormal_scaled = preprocessing.robust_scale(anormal.drop('FECHA', axis=1).values)
    anormal = pd.concat([pd.DataFrame(anormal_scaled, columns=column_names, index=anormal.index),
                         anormal[['FECHA']]], axis=1)

    total_values = len(df.index)
    print('total rows ', total_values)
    anormal_values = len(anormal.index)
    print('anormal rows ', anormal_values)

    proportion = anormal_values / total_values
    print('proportion of anormal ', proportion)

    normalY = normal[['PESPANIA']]
    normalX = normal
    del normalX['PESPANIA']

    anormalY = anormal[['PESPANIA']]
    anormalX = anormal
    del anormalX['PESPANIA']

    names = normalX.columns.values
    fileNames = np.array(names)

    # Only split the normal data into train/test, with a test size equal to the proportion of anormal samples
    X_train, X_test, y_train, y_test = train_test_split(normalX,
                                                        normalY,
                                                        test_size=proportion,
                                                        random_state=42)

    auction_dates = df.FECHA[df['DUMMY'] == 1].tolist()

    del df['DUMMY']

    days_before = 30

    for auction_date in auction_dates:

        anormalY = df[df['FECHA'] >= auction_date -
                      datetime.timedelta(days=days_before)]
        anormalY = anormalY[anormalY['FECHA'] <= auction_date]
        anormalY = anormalY[['PESPANIA']]

        df_before_auction = df[df['FECHA'] < auction_date -
                               datetime.timedelta(days=days_before)]
        df_before_auction_Y = df_before_auction[['PESPANIA']]
        df_before_auction_X = df_before_auction.drop(['FECHA', 'PESPANIA'],
                                                     axis=1)

        normalY = df[df['FECHA'] >= auction_date]
        normalY = normalY[normalY['FECHA'] <= auction_date +
                          datetime.timedelta(days=days_before)]
        future_dates = normalY.drop(['FECHA', 'PESPANIA'], axis=1)
        normalY = normalY[['PESPANIA']]

        df_before_auction1 = df[df['FECHA'] <= auction_date]
        df_before_auction_Y_1 = df_before_auction1[['PESPANIA']]
        df_before_auction_X_1 = df_before_auction1.drop(['FECHA', 'PESPANIA'],
                                                        axis=1)

        min_samples_leaf = round(len(df_before_auction.index) * 0.005)
        print('min_samples_leaf ', min_samples_leaf)
        min_samples_split = min_samples_leaf * 10
        print('min_samples_split ', min_samples_split)
        iTrees = 100
        print('iTrees ', iTrees)
        depth = 50
        maxFeat = (round((len(df.columns) / 3)))
        print('Feature Set ', maxFeat)

        fileModel = ensemble.GradientBoostingRegressor(
            learning_rate=0.01,
            n_estimators=iTrees,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=depth,
            verbose=1)

        fileModel.fit(df_before_auction_X, df_before_auction_Y)

        fileModel2 = ensemble.GradientBoostingRegressor(
            learning_rate=0.01,
            n_estimators=iTrees,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=depth,
            verbose=1)
        fileModel2.fit(df_before_auction_X_1, df_before_auction_Y_1)

        # Predict the following days for the anormal case
        y_hat = fileModel.predict(future_dates)
        y_hat = pd.DataFrame(y_hat, columns=['yhat'])
        prediction_anormal = y_hat
        anormalY = anormalY.reset_index(drop=True)

        # Predict the following days for the normal case
        y_hat = fileModel2.predict(future_dates)
        y_hat = pd.DataFrame(y_hat, columns=['yhat'])
        prediction_normal = y_hat
        y_test = normalY.reset_index(drop=True)

        print('MSE ANORMAL ', mean_squared_error(anormalY, prediction_anormal))
        print('R2 ANORMAL ', r2_score(anormalY, prediction_anormal))

        # Compare against the true values

        print('MSE NORMAL ', mean_squared_error(y_test, prediction_normal))
        print('R2 NORMAL ', r2_score(y_test, prediction_normal))

        prediction_normal = pd.DataFrame(prediction_normal, index=y_test.index)
        prediction_normal = pd.concat([y_test, prediction_normal], axis=1)
        prediction_normal.columns = [
            'PESPANIA_REAL_NO_COLUSION', 'PESPANIA_PRED_NO_COLUSION'
        ]
        prediction_normal['DIF_PORC'] = (
            prediction_normal['PESPANIA_REAL_NO_COLUSION'] -
            prediction_normal['PESPANIA_PRED_NO_COLUSION']
        ) / prediction_normal['PESPANIA_PRED_NO_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - NO COLUSION %.5f' %
              prediction_normal['PESPANIA_PRED_NO_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - NO COLUSION %.5f ' %
              prediction_normal['PESPANIA_REAL_NO_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
              prediction_normal['DIF_PORC'].mean() * 100, '%')

        prediction_anormal = pd.DataFrame(prediction_anormal,
                                          index=anormalY.index)
        prediction_anormal = pd.concat([anormalY, prediction_anormal], axis=1)
        prediction_anormal.columns = [
            'PESPANIA_REAL_COLUSION', 'PESPANIA_PRED_COLUSION'
        ]
        prediction_anormal['DIF_PORC'] = (
            prediction_anormal['PESPANIA_REAL_COLUSION'] -
            prediction_anormal['PESPANIA_PRED_COLUSION']
        ) / prediction_anormal['PESPANIA_PRED_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - COLUSION %.5f' %
              prediction_anormal['PESPANIA_PRED_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - COLUSION %.5f' %
              prediction_anormal['PESPANIA_REAL_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
              prediction_anormal['DIF_PORC'].mean() * 100, '%')
        # prediction_anormal.to_csv('prediction_anormal_dia.csv', sep=';', index=False)

        fig, ax = plot.subplots()
        prediction_anormal = prediction_anormal.reset_index()
        sns.regplot(y='PESPANIA_PRED_COLUSION',
                    x='index',
                    data=prediction_anormal,
                    ax=ax,
                    label='PREDICTED')
        sns.regplot(y='PESPANIA_REAL_COLUSION',
                    x='index',
                    data=prediction_anormal,
                    ax=ax,
                    label='REAL')
        # diag_line, = ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3", label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices')
        plot.show()

        fig, ax = plot.subplots()
        prediction_normal = prediction_normal.reset_index()
        sns.regplot(y='PESPANIA_PRED_NO_COLUSION',
                    x='index',
                    data=prediction_normal,
                    ax=ax,
                    label='PREDICTED')
        sns.regplot(y='PESPANIA_REAL_NO_COLUSION',
                    x='index',
                    data=prediction_normal,
                    ax=ax,
                    label='REAL')
        # diag_line, = ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3", label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices')
        plot.show()
예제 #26
0
File: views.py Project: ismaleuser/MLAV01
    def svmModel(self, training_filename, test_filename):
        # Load the data from the file
        f = open(test_filename)
        lines = f.readlines()
        f.close()
        # Initialize the data containers
        matriz = []
        classify = []
        # Split the comma-separated values of the first row into a list
        format = list(lines[0].strip().split(','))
        # Iterate over the file lines, starting from the second one
        for line in lines[1:]:
            # Initialize a temporary row vector
            vector = []
            # Split the comma-separated elements of the row into a list
            lista = list(line.split(','))
            # Iterate over the list
            for i in range(len(lista)):
                # Fill the corresponding containers
                if format[i] == 'num':
                    vector.append(float(lista[i]))
                elif format[i] == 'class':
                    classify.append(list(lista[i][:-2]))
            # Append the row to the matrix
            matriz.append(vector)

        # Apply the preprocessing variants
        Scaled = preprocessing.scale(matriz)
        Normalized = preprocessing.normalize(matriz, norm='l2')
        robustScaled = preprocessing.robust_scale(matriz,
                                                  axis=0,
                                                  with_centering=True,
                                                  with_scaling=True,
                                                  quantile_range=(25.0, 75.0),
                                                  copy=True)
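        # NOTE: Scaled, Normalized and robustScaled are computed but never used below; clf.fit() is called on the raw matriz, so the preprocessing has no effect unless one of these arrays is passed instead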
        #        return JsonResponse({"Prueba": list(robustScaled)})

        # Build the model from the selected classifier and the chosen data
        clf = svm.SVC(gamma=0.5, C=100.)
        clf.fit(matriz, classify)

        # Save the model to a file
        joblib.dump(clf, 'modelo.joblib')

        # To classify new data
        # Load the saved model
        clf2 = joblib.load('modelo.joblib')

        # Initialize counters for the results
        numCorrect = 0.0
        numIncorrect = 0.0

        # Apply the classifier to the data
        res = clf2.predict(matriz)
        # Initialize a list to compare results
        Solu = []
        # Convert the result into a list
        resList = list(res)
        # Iterate over the result, appending each element (converted to a list) for comparison
        for j in range(len(resList)):
            Solu.append(list(res[j]))

        # Compare the predictions with the labels used to build the model
        resultado = []
        for i in range(len(resList)):
            if Solu[i] == classify[i]:
                numCorrect += 1
            else:
                numIncorrect += 1

        # Return the results in JSON format
#        resultado.append("%4.2f%% correct" % (numCorrect))
        return JsonResponse({
            "Clasificado": list(res),
            "Original": classify,
            "Correctamente clasificados": numCorrect,
            "Mal clasificados": numIncorrect
        })
예제 #27
0
path3 = os.path.join(current_path, 'dataset/prostate_all_samples_trees.csv')

# Read data sets
#d1 = filter_dataset(Dataset(path1, scale=False, normalize=False, sep=','), 0.25, fdr=True)
#d2 = filter_dataset(Dataset(path2, scale=False, normalize=False, sep=','), 0.10, fdr=false)
d3 = filter_dataset(Dataset(path3, scale=False, normalize=False, sep=','), 0.25, fdr=True)

# Find what is above Mean in each data set... 
# m1 = d1.matrix
# m1 = robust_scale(m1)
# m1[m1 < -0.33] = np.nan
# m1[m1 >  0.33] = np.nan
# m1 = m1 + 10

m3 = d3.matrix
m3 = robust_scale(m3)
i1 = m3 > -0.50
i2 = m3 < 0.50
r = i1 * i2
m3[r] = np.nan
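# values within 0.5 robust-scaled units of the median are masked out; only the stronger deviations are kept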
m3 = m3 

# Join the data set into one matrix
# df1 = DataFrame(m1, index=d1.samples, columns=d1.genes)
#df2 = DataFrame(m2, index=d2.samples, columns=d2.genes)
df3 = DataFrame(m3, index=d3.samples, columns=d3.genes)
result = DataFrame()
#result = result.append(df1)
#result = result.append(df2)
result = result.append(df3)
예제 #28
0
	INPUT: Dataframe and column names in list
	OUTPUT: Dataframe with features (X) and target variable dataframe (y) 
	'''

	X = df.loc[:,cols]
	y = df.loc[:, 'current_market_value']

	return X,y

def scale(x, y):
    '''
    INPUT: Dataframe with features (X) and target variable dataframe (y)
    OUTPUT: Scaled dataframes
    '''

    X = preprocessing.robust_scale(x)
    y = preprocessing.robust_scale(y)

    return X, y

def run_linear_models(X, y):
    '''
    Get an overview of performances of different linear models.
    Linear models: Linear Regression, Ridge, 3x Lasso, ElasticNet
    INPUT: Dataframe with features (X) and target variable dataframe (y)
    OUTPUT: Scores and feature importances of each model
    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    lr = LinearRegression()
예제 #29
0
def scale(X):
    X_np = np.array(X.values)
    X_scaled = preprocessing.robust_scale(X_np)
    return X_scaled
예제 #30
0
def rf_reg_grid_search(df, features, label, param_grid, rand_state, scores,
                       name):
    """This routine calculates the random forest regression on a grid of
    hyper-parameters for the random forest method to test the best
    hyper-parameters. The analysis results of the test will be written out and
    saved.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

            rand_state : integer
            Setting the random state variables to ensure reproducibility

            scores : list of strings
            Setting the score by which the grid search should be evaluated

            name : strings
            Setting the name of the output file for the grid search which
            contains all information about the grid

    """

    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)
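    # NOTE: scaling is fit on the full sample before the train/test split; wrapping RobustScaler and the regressor in a Pipeline inside GridSearchCV would keep each test fold out of the scaling statistics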

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    print "Training sample size: ", X_train.shape
    print "Evaluation sample size: ", X_test.shape

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        reg = GridSearchCV(RandomForestRegressor(random_state=rand_state),
                           param_grid, scoring='%s' % score, cv=5, n_jobs=4)

        reg.fit(X_train, y_train)

        print("Best parameters set found on training set:")
        print()
        print(reg.best_params_)
        print()
        print("Grid scores on training set:")
        print()
        means = reg.cv_results_['mean_test_score']
        stds = reg.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, reg.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
        df = pd.DataFrame(reg.cv_results_)
        df.to_hdf('RF_GS_' + name + '_' + score + '.hdf5', 'data')
        print()
        print("The model is trained on the full development set (80%).")
        print("The scores are computed on the full evaluation set (20%).")
        print()
        y_true, y_pred = y_test, reg.predict(X_test)
        ml_an.evaluate_regression(y_test, y_pred)
        pz_an.evaluate_photoz(y_test, y_pred)
        print()
def RandomForest_Classifier():

    np.set_printoptions(precision=3, suppress=True)
    data_title = 'LungData'

    Data = np.loadtxt(
        '../Lung_Challenge_Features/Lung_Feature_Results_Final.csv',
        delimiter=",",
        skiprows=1,
        dtype="object")

    # Randomly divide the data according to a .5 proportion into train and test
    # In order to get the same random split, make sure that the np.random.seed
    # line is left uncommented.
    # np.random.seed(0)

    TrainData = np.zeros((0, Data.shape[1]))
    TestData = np.zeros((0, Data.shape[1]))

    for endpoint in np.arange(100, 600, 100):
        TempData = Data[endpoint - 100:endpoint]
        random_mask = np.random.rand(len(TempData)) < .5
        TrainData = np.vstack((TrainData, TempData[random_mask]))
        TestData = np.vstack((TestData, TempData[~random_mask]))

    np.random.shuffle(TrainData)
    np.random.shuffle(TestData)

    dimsT = np.shape(TestData)
    dims = np.shape(TrainData)

    yT = TestData[:, 1].astype(float)
    XT = TestData[:, 2:].astype(float)

    y = TrainData[:, 1].astype(float)
    X = TrainData[:, 2:].astype(float)

    # X = normalize(robust_scale(X, axis=1), axis=1)
    # XT = normalize(robust_scale(XT, axis=1), axis=1)

    X = robust_scale(normalize(X, axis=1), axis=1)
    XT = robust_scale(normalize(XT, axis=1), axis=1)
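    # each case (row) is first L2-normalized, then robust-scaled along axis=1, so every sample is standardized independently of the others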

    best = [np.inf, np.inf, np.inf]
    Results = np.zeros((5, 2))
    Predictions = np.zeros((dimsT[0], 2))
    Predictions[:, 0] = yT
    Results[:, 0] = [1, 2, 3, 4, 5]
    Winner = []
    ParameterTuner = 20

    # for i in range(1,ParameterTuner+1):
    for i in [15]:
        # Some different machine learning options to test..

        for k in [[1, RandomForestClassifier(n_estimators=i)]]:
            # for k in [[1, RandomForestClassifier(n_estimators=(int(math.ceil(float(i)/10))), n_jobs=-1)]]:
            # for k in [[1, svm.LinearSVC(C=100,dual=True)]]:
            # for k in [[1, svm.SVC(C=10,kernel="rbf", degree=2)]]:
            # for k in [[1, BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=10), n_jobs=-1, n_estimators=20)]]:
            # for k in [[1, svm.NuSVC(nu=(.3), kernel="rbf", verbose=True, probability=True, tol=1e-6, decision_function_shape='ovr')]]:

            print 'Parameter Value: ' + str(i)

            clf = k[1].fit(X, y)

            XP = clf.predict(X)
            XTP = clf.predict(XT)
            RTP = np.random.choice(yT, dims[0])

            TempResults = np.zeros((5, 2))

            for category in range(1, 6):

                print 'Category: ' + str(category)
                # Assess the accuracy on training and testing data.
                # Also compare against a random model.
                count = [0, 0, 0]
                total = [0, 0, 0]
                actual = [y, yT, yT]
                predicted = [XP, XTP, RTP]
                prediction_labels = ['Training', 'Testing', 'Random']

                for j in range(0, min(dims[0], dimsT[0])):
                    for TestTrainRandom in xrange(3):
                        if actual[TestTrainRandom][j] == category:
                            if str(actual[TestTrainRandom][j]) != str(
                                    predicted[TestTrainRandom][j]):
                                count[TestTrainRandom] += 1
                            total[TestTrainRandom] += 1

                if not 0 in total:
                    for ptype in xrange(3):
                        print prediction_labels[ptype] + ' Error Rate: ' + str(
                            float(count[ptype]) / total[ptype])
                    print ""
                    TempResults[category - 1,
                                k[0]] = float(count[1]) / total[1]

            if np.mean(TempResults[:, k[0]]) < best[k[0]]:
                best[k[0]] = np.mean(TempResults[:, k[0]])
                # Results[k[0]+2] = np.std(TempResults[:,k[0]])
                Results[:, k[0]] = TempResults[:, k[0]]
                Predictions[:, k[0]] = XTP
                Winner += [i]

    Results[:, 1] = 1 - Results[:, 1]
    print 'Best Parameters: '
    print Winner
    print 'Accuracy per Category: '
    print Results

    np.savetxt('Results' + data_title + '.csv',
               Results,
               fmt="%s",
               delimiter=",")
    np.savetxt('Predictions' + data_title + '.csv',
               Predictions,
               fmt="%s",
               delimiter=",")

    print "All done!"
예제 #32
0
def rf_reg_example(df,
                   features,
                   label,
                   params,
                   rand_state,
                   save=False,
                   save_filename=None):
    """This routine calculates an example of the random forest regression tuned
    to photometric redshift estimation. The results will be analyzed with the
    analyis routines/functions provided in ml_eval.py and photoz_analysis.py

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            List of input parameters for the regression

            rand_state : integer
            Setting the random state variables to ensure reproducibility


    """

    # Building test and training sample
    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    # Random Forest Regression
    reg = RandomForestRegressor(**params)

    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)

    feat_importances = reg.feature_importances_

    # Save predicted and test y values for later analysis
    if save:
        if save_filename:
            results = pd.DataFrame(data=np.array([y_pred, y_test]).T,
                                   columns=['y_pred', 'y_test'])
            results.to_csv(save_filename + '.csv', index=False)

        else:
            print "Error: No Filename supplied!"

    # Evaluate regression method
    print "Feature Importances "
    for i in range(len(features)):
        print str(features[i]) + ": " + str(feat_importances[i])
    print "\n"

    ml_an.evaluate_regression(y_test, y_pred)

    pz_an.plot_redshifts(y_test, y_pred)
    pz_an.plot_error_hist(y_test, y_pred)
    plt.show()
예제 #33
0
    features_array_X[index, 5] = out_degree_dictionary[second_node]
    features_array_X[index, 6] = degree_centrality_dictionary[first_node]
    features_array_X[index, 7] = degree_centrality_dictionary[second_node]
    features_array_X[index, 8] = common_in_neighbors_dictionary[(first_node,
                                                                 second_node)]
    features_array_X[index, 9] = common_out_neighbors_dictionary[(first_node,
                                                                  second_node)]
    features_array_X[index, 10] = nodes_core[first_node] + nodes_core[second_node]
    features_array_X[index, 11] = nodes_pagerank[first_node]
    features_array_X[index, 12] = nodes_pagerank[second_node]
    index += 1

features_array_X = robust_scale(features_array_X,
                                axis=0,
                                with_centering=False,
                                with_scaling=True,
                                copy=True)
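# with_centering=False: each feature is only divided by its IQR (no median subtraction), which keeps zero entries at zero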

# Normalize X array per feature
#features_array_X = normalize_array(features_array_X,0)
# Normalize X array per training instance
features_array_X = normalize_array(features_array_X, 1)

print("Starting training...")
rand_forest = RandomForestClassifier(n_estimators=25, min_samples_leaf=50)
rand_forest.fit(features_array_X, Y_train)
print("It's prediction time...")
predictions = rand_forest.predict_proba(features_array_X)

#print("Starting training...")
예제 #34
0
    def load_additional_positionwise_data(self,
                                          class_files,
                                          identifier,
                                          standardize=False):
        """ Add additional numerical features to the network (for each nucleotide in a sequence).

        For every position in an input sequence additional numerical data can be added to
        the network (e.g. ChIP-seq signal, conservation for every nucleotide).
        The data will be added to the input matrix. E.g.: Using sequences of length 200
        over the alphabet "ACGT" results in input matrices of size 4x200. Additional position-wise
        data will be added to these matrices as a new row resulting in matrices of size 5x200.
        
        Input files are text files and must contain as many whitespace-separated values 
        in each line as the sequences are long, e.g.:
        
        '0.679 1.223 -0.296  ...
        '0.961 0.532 0.112   ...
        '0.065 -0.333 -0.256 ...
        '...
        
        The number of provided files must match the fasta files provided to the __init__
        function (e.g. if you provided a list of 3 files to __init__ you must provide a list
        of 3 files here as well) and the number of lines in each file must match the number of
        entries in the corresponding fasta file. If you want to add multiple features simply
        call this function multiple times.

        Input features should be standardized in some way prior to adding them to the
        network, as this tends to improve the predictive performance.

        In the same way network kernels are visualized as sequence motifs after the network
        training (based on the first 4 rows of the input matrices and using the visualize_kernel()
        Model function), the rows corresponding to additional features are summarized
        as line plots as well.

        Parameters
        ----------
        class_files: str or [str]
            A text file (multi-label) or a list of text files (single-label).
        
        identifier: str
            A short feature name (will be shown in kernel output plots).

        standardize: bool
            Scale each column according to the interquartile range.
        """
        if not "positionwise" in dir(self):
            self.positionwise = OrderedDict()
        if identifier in self.positionwise:
            raise RuntimeError(
                "Identifier '{}' already exists.".format(identifier))
        if not isinstance(class_files, list):
            class_files = [class_files]
        len_sequence = self.data[0].shape[0]

        new_data = np.empty((len(self.labels), len_sequence), dtype=np.float32)
        row = 0
        for file_name in class_files:
            handle = io.get_handle(file_name, 'rt')
            for i, line in enumerate(handle):
                try:
                    new_data[row, :] = [float(x) for x in line.split()]
                except ValueError as err:
                    raise RuntimeError(
                        "ValueError: {} (in line {} in {}).".format(
                            err, i + 1, file_name))
                row += 1
            handle.close()
        if row != len(self.labels):
            raise RuntimeError(
                "Amount of additional data ({}) doesn't match number of sequences ({})."
                .format(row, len(self.labels)))
        if standardize:
            from sklearn.preprocessing import robust_scale
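            # axis=0: every sequence position (column) is centered on its median and scaled by its IQR across all samples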
            self.positionwise[identifier] = robust_scale(new_data, axis=0)
            if not "positionwise_unscaled" in dir(self):
                self.positionwise_unscaled = OrderedDict()
            self.positionwise_unscaled[identifier] = new_data
        else:
            self.positionwise[identifier] = new_data
예제 #35
0
def normalize(x, positions):
    num_columns = x.shape[1]
    for i in range(num_columns):
        if i in positions:
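            # the i:i+1 slice keeps a 2-D (n_samples, 1) column so robust_scale treats it as a single feature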
            x[:, i:i + 1] = np.copy(preprocessing.robust_scale(x[:, i:i + 1]))
    return x
예제 #36
0
 def _preprocess_minmax(cls, array: np.ndarray) -> np.ndarray:
     frames, channels, window_size = array.shape
     scaled_array = array.transpose((1, 0, 2)).reshape((channels, -1))
     scaled_array = preprocessing.robust_scale(scaled_array, axis=1)
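     # after per-channel robust scaling, squash each channel into [0, 1] so all channels share the same range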
     return preprocessing.minmax_scale(scaled_array, axis=1).reshape(
         channels, frames, window_size).transpose(1, 0, 2)
### Extract features and labels from dataset for local testing
my_df = my_df[ features_list ]

# fill in NaN values strategy -- test all 3 to gauge impact on accuracy
#my_df = my_df.fillna(0)
#my_df = my_df.fillna( my_df.median() )
my_df = my_df.fillna( my_df.mean() )

my_df_array = np.array( my_df )
old_features_array = my_df_array[ :, 1: ]
values_array = my_df_array[ :, [0] ].astype(int)
values_array = np.ravel(values_array)

# test impact of scaling, use robust_scale due to outlier values
from sklearn import preprocessing
old_features_array_scaled = preprocessing.robust_scale( old_features_array )

# select most important features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFpr, chi2, f_classif

# test values of k from 2-14
selector = SelectKBest(f_classif, k=20)
features_array = selector.fit_transform(old_features_array_scaled, values_array)
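# NOTE: the selector is fit on the full data set before the split below; fitting it on the training split only would keep test labels out of the feature selection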

# split scaled or unscaled data into train and test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in later releases

features_train, features_test, labels_train, labels_test = train_test_split( \
                                features_array, values_array, \
                                test_size = 0.2, random_state=16)
예제 #38
0
#            index = 16 * j + i
#            fig = figs[j]
#            ax = fig.add_subplot(4, 4, i + 1)
#            channels_data[start][index] = pearsonr(s[start], s[index])[0]
#            ax.plot(s[start], ys, 'r')
#            fig.tight_layout()

i=1
x = np.arange(0,3383.766,.001)

data = np.zeros(shape=(61,3383766))

for index, column in enumerate(data3):
	data[index] = np.concatenate((data1[index],data2[index],data3[index],data4[index],data5[index],data6[index]))

data = preprocessing.robust_scale(data, axis=1, with_centering=True, with_scaling=True)
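# axis=1: each of the 61 channels is centered on its own median and scaled by its own IQR over the whole recording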
#eeg = signal.detrend(np.array(data),type='constant')
#data = signal.medfilt(eeg)

#data = eeg-data

#matrix=[]
#for index, row in (enumerate(data)):
#	if(index!=63 and index!=62 and index!=61):
#		print index
#		matrix.append(row[3000000:-1])

#matrix = np.array(matrix)

for index, column in enumerate(data):
    #band = band +