Пример #1
0
def OLStraining(train,test,t_index,y_true):
    """Fit an OLS model on selected training timestamps and score it on test.

    The feature columns are every column of ``train`` that is not listed in
    the module-level ``excl`` collection. Missing values are imputed with the
    training column means (zero when a column has no values at all) and the
    features are standardised before an intercept column is added.

    The test features go through exactly the same transformation, i.e. they
    are imputed and scaled with the statistics computed on the training
    selection, so the fitted coefficients are applied on the scale they were
    estimated on.

    Args:
        train: DataFrame with a ``timestamp`` column, a ``y`` column and the
            feature columns.
        test: DataFrame holding the same feature columns as ``train``.
        t_index: timestamps of ``train`` used for fitting (passed to
            ``Series.isin``).
        y_true: true targets for ``test``, forwarded to ``r_score``.

    Returns:
        None. The regression summary, the prediction/target lengths and the
        ``r_score`` value are printed.
    """
    col = [c for c in train.columns if c not in excl]

    selected = train.timestamp.isin(t_index)
    X = train.loc[selected, col].copy()
    y = train.loc[selected, "y"].copy()

    print("--- selected X shape %s" % (X.shape,))
    print("--- selected y shape %s" % (y.shape,))

    # Impute with the column mean; a column that is entirely NaN has no
    # mean, so it falls back to zero.
    fill_values = X.mean(axis=0)
    X = X.fillna(fill_values).fillna(.0)

    center = X.mean(axis=0)
    scale = X.std(axis=0)
    # A zero (constant column) or undefined (single row) deviation would
    # turn the whole column into NaN/inf; leave such columns unscaled.
    scale = scale.replace(0.0, 1.0).fillna(1.0)

    X = np.array((X - center) / scale)

    # has_constant='add' always appends the intercept column, so the train
    # and test design matrices are guaranteed to have the same width.
    X = sm.add_constant(X, has_constant='add')
    res = sm.OLS(y, X).fit()

    print(res.summary())

    # Apply the *training* imputation values and scaling to the test set.
    Xt = test.loc[:, col].copy()
    Xt = Xt.fillna(fill_values).fillna(.0)
    Xt = np.array((Xt - center) / scale)
    Xt = sm.add_constant(Xt, has_constant='add')

    y_pred = res.predict(Xt)

    print(len(y_pred),len(y_true))

    print(r_score(y_true,y_pred))
Пример #2
0
def testAalysis3(train, test, y_test_true):
    """Fit one model per test id on its recent training rows and print the mean score.

    For every id present in ``test`` the training rows of that id whose
    timestamp lies within the 30 steps preceding the earliest test timestamp
    are selected, the cumulative sum of the target column is fitted with
    ``fitting`` and the fit is scored with ``r_score``. Ids without recent
    training rows, or whose fit ends with a NaN cost, score 0.0.

    The training frame is accessed positionally: column 0 is compared with
    the ids, column 1 with the timestamps, columns 2..109 are used as
    features and column 110 as the target.

    Args:
        train: training DataFrame with ``id`` and ``timestamp`` columns.
        test: test DataFrame with ``id`` and ``timestamp`` columns.
        y_test_true: true test targets; only its shape is reported.

    Returns:
        None. Progress and the mean score are printed.
    """
    df_test = test.copy()
    df_train = train.copy()

    print("test shape %s" % (df_test.shape, ))
    print("y_test_true shape %s" % (y_test_true.shape, ))

    uniq_timestamp = df_test["timestamp"].unique()
    uniq_id = sorted(df_test["id"].unique())

    # "%d" only accepts a single value; join so that any number of test
    # timestamps can be reported.
    print("-- selected features timestamp: %s" %
          ", ".join(str(t) for t in uniq_timestamp))

    # A set gives O(1) membership tests for the "unseen id" check below.
    train_uniq_id = set(df_train["id"].unique())
    new_test_uniq_id = np.array(
        [test_id for test_id in uniq_id if test_id not in train_uniq_id])

    print("test timestamp length %d , test id length %d" %
          (len(uniq_timestamp), len(uniq_id)))
    print("train id length %d" % len(train_uniq_id))

    print("length of new test unique id %d" % len(new_test_uniq_id))

    Xtrain = np.array(df_train)
    print(Xtrain.shape)

    if not uniq_id:
        # Nothing to score; avoids taking min()/mean() of empty data.
        print("-- total counter %d" % 0)
        return

    # Use a scalar cutoff (relative to the earliest test timestamp) so the
    # comparison never broadcasts against an array of several timestamps.
    cutoff = np.min(uniq_timestamp) - 30
    train_ids = Xtrain[:, 0]
    recent_rows = Xtrain[:, 1] > cutoff  # loop invariant, computed once

    scores = {}
    for cnt, idx in enumerate(uniq_id):
        select_one_id = Xtrain[recent_rows & (train_ids == idx), :]

        if cnt % 100 == 0:
            print("-- trainig counter:%d test_id:%d" % (cnt, idx))

        if select_one_id.shape[0] == 0:
            # No recent history for this id: nothing to fit.
            scores[idx] = 0.0
            continue

        data = select_one_id[:, 2:110]
        y = np.cumsum(select_one_id[:, 110].ravel())

        # fitting() is unpacked into three values elsewhere in this file
        # (y_pred, theta, final_cost); take the first and last item so both
        # return layouts are handled.
        fit_result = fitting(data, y)
        y_pred, final_cost = fit_result[0], fit_result[-1]

        if not np.isnan(final_cost):
            scores[idx] = r_score(y, y_pred)
        else:
            scores[idx] = 0.0

    print("-- total counter %d" % len(uniq_id))
    # list() is required because np.mean cannot consume a dict view.
    print(np.mean(list(scores.values())))
Пример #3
0
def testAalysis4(train, test, y_test_true):
    """Fit one model per test id and predict its test target.

    For every id present in ``test`` the training rows of that id whose
    timestamp lies within the 30 steps preceding the earliest test timestamp
    are selected and the cumulative sum of the target column is fitted with
    ``fitting``. When the fit ends with a finite cost, the training features
    (without their first row) are stacked on top of the id's test features,
    passed through ``dataPCA`` and multiplied with the fitted ``theta``; the
    last value of that product is stored as the prediction for the id.
    Ids without recent training rows, or whose fit ends with a NaN cost, get
    a score and a prediction of 0.0.

    Both frames are accessed positionally: column 0 is compared with the
    ids, column 1 of the training frame with the timestamps, columns 2..109
    are used as features and column 110 of the training frame as the target.

    Args:
        train: training DataFrame with ``id`` and ``timestamp`` columns.
        test: test DataFrame with ``id`` and ``timestamp`` columns.
        y_test_true: true test targets; only its shape is reported.

    Returns:
        dict mapping each test id to its predicted value.
    """
    df_test = test.copy()
    df_train = train.copy()

    print("test shape %s" % (df_test.shape, ))
    print("y_test_true shape %s" % (y_test_true.shape, ))

    uniq_timestamp = df_test["timestamp"].unique()
    uniq_id = sorted(df_test["id"].unique())

    # "%d" only accepts a single value; join so that any number of test
    # timestamps can be reported.
    print("-- selected features timestamp: %s" %
          ", ".join(str(t) for t in uniq_timestamp))

    # A set gives O(1) membership tests for the "unseen id" check below.
    train_uniq_id = set(df_train["id"].unique())
    new_test_uniq_id = np.array(
        [test_id for test_id in uniq_id if test_id not in train_uniq_id])

    print("test id length %d" % (len(uniq_id)))
    print("train id length %d" % len(train_uniq_id))

    print("length of new test unique id %d" % len(new_test_uniq_id))

    Xtrain = np.array(df_train)
    Xtest = np.array(df_test)

    print("* training data shape %s" % (Xtrain.shape, ))
    print("* test data shape %s" % (Xtest.shape, ))

    scores = {}
    y_test_pred_dict = {}

    if not uniq_id:
        # Nothing to predict; avoids taking min()/mean() of empty data.
        print("-- total counter %d" % 0)
        return y_test_pred_dict

    # Use a scalar cutoff (relative to the earliest test timestamp) so the
    # comparison never broadcasts against an array of several timestamps.
    cutoff = np.min(uniq_timestamp) - 30
    train_ids = Xtrain[:, 0]
    test_ids = Xtest[:, 0]
    recent_rows = Xtrain[:, 1] > cutoff  # loop invariant, computed once

    for cnt, idx in enumerate(uniq_id):
        select_one_id = Xtrain[recent_rows & (train_ids == idx), :]

        if cnt % 100 == 0:
            print("-- trainig counter:%d test_id:%d" % (cnt, idx))

        if select_one_id.shape[0] == 0:
            # No recent history for this id: nothing to fit, use the same
            # fallback as a failed fit.
            scores[idx] = 0.0
            y_test_pred_dict[idx] = 0.0
            continue

        data = select_one_id[:, 2:110]
        y = np.cumsum(select_one_id[:, 110].ravel())

        #
        #  data model used from PCA analysis 5 dimensions
        #
        y_pred, theta, final_cost = fitting(data, y)

        if not np.isnan(final_cost):
            scores[idx] = r_score(y, y_pred)

            # Slice the same feature columns as for the training rows so
            # the two blocks can always be stacked.
            test_data = Xtest[test_ids == idx, 2:110]

            data_stacked = np.vstack((data[1:, :], test_data))
            Xtt = dataPCA(data_stacked)
            y_test_diff = np.dot(Xtt, theta)

            # The last row of the stacked matrix belongs to the test data.
            y_test_pred_dict[idx] = y_test_diff[-1]
        else:
            scores[idx] = 0.0
            y_test_pred_dict[idx] = 0.0

    print("-- total counter %d" % len(uniq_id))
    # list() is required because np.mean cannot consume a dict view.
    print(np.mean(list(scores.values())))

    return y_test_pred_dict
Пример #4
0
    features = observation.features.copy()

    #    y_hat = gmodel_test.predict2(observation.features.copy())

    target = observation.target

    target['y'] = y_hat

    timestamp = observation.features["timestamp"][0]

    if timestamp % 100 == 0:

        print("Timestamp #{}".format(timestamp))

        y_true = env.temp_test_y

        #y_true = np.exp(y_true)

        score_ = r_score(y_true, y_hat)
        rewards.append(score_)

        print("-- score %.5f" % np.mean(rewards))
        print("-- reward %.5f" % reward)

    # We perform a "step" by making our prediction and getting back an updated "observation":
    observation, reward, done, info = env.step(target)

    if done:
        print("Public score: {}".format(info["public_score"]))
        break
Пример #5
0
def proc1(log, run_episode=False):
    """Build the models on the environment's training data and optionally run an episode.

    The environment is created with ``make()`` and reset; a ``glmModel`` is
    built and an ``ElasticNetCV`` model is fitted through ``fitModel`` on
    three fixed columns. The number of predictions for the first observation
    is printed.

    With the default ``run_episode=False`` the function stops there, which
    is what it has always done. With ``run_episode=True`` it additionally
    steps through the environment until it reports completion, submitting
    the model predictions at every step and logging the running mean of the
    ``r_score`` values collected every 100th timestamp.

    Args:
        log: logger used for the score messages of the episode loop.
        run_episode: when true, run the prediction loop after fitting.

    Returns:
        1 in all cases.
    """
    env = make()

    observation_test = env.reset()

    columns = ['technical_30', 'technical_20', 'fundamental_11']

    train_data = observation_test.train.copy()

    gmodel_test = glmModel(train_data, columns)
    # The result is not used here; the call is kept in case BuildModel has
    # side effects on gmodel_test.
    gmodel_test.BuildModel()

    model_test = fitModel(ElasticNetCV(), train_data, columns)

    prediction_test = model_test.predict(observation_test.features.copy())

    print("No elasticnet observation : %d" % len(prediction_test))

    if not run_episode:
        return 1

    # Collected across the whole episode so that the logged value is a
    # running mean rather than the most recent score only.
    rewards = []

    while True:

        prediction_test = model_test.predict(observation_test.features.copy())

        target_test = observation_test.target
        target_test['y'] = prediction_test

        timestamp_ = observation_test.features["timestamp"][0]

        if timestamp_ % 100 == 0:
            print(timestamp_)

            y_true = env.temp_test_y

            score_ = r_score(y_true, prediction_test)
            rewards.append(score_)

            log.info("score %.5f" % np.mean(rewards))

        observation_test, reward_test, done_test, info_test = env.step(
            target_test)

        if done_test:
            print('Info-test:', info_test['public_score'])
            break

    return 1