Example #1
def read_file():
    file_content = pd.read_csv('train.csv')
    exc_cols = [u'Id', u'Response']
    cols = [c for c in file_content.columns if c not in exc_cols]
    train_datas = file_content.loc[:, cols]
    train_lables = file_content['Response'].values
    
    test_file = pd.read_csv('test.csv')
    test_ids = test_file['Id'].values
    test_datas = test_file.loc[:, [c for c in test_file.columns if c not in [u'Id']]]
    
    # Fill missing values with -1
    test_datas = test_datas.fillna(-1)
    train_datas = train_datas.fillna(-1)
    all_datas = pd.concat([train_datas, test_datas], axis=0) 
    
    # Split the columns into numeric and categorical groups
    categoricalVariables = ["Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5", "Product_Info_6", "Product_Info_7", "Employment_Info_2", "Employment_Info_3", "Employment_Info_5", "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4", "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7", "Insurance_History_1", "Insurance_History_2", "Insurance_History_3", "Insurance_History_4", "Insurance_History_7", "Insurance_History_8", "Insurance_History_9", "Family_Hist_1", "Medical_History_2", "Medical_History_3", "Medical_History_4", "Medical_History_5", "Medical_History_6", "Medical_History_7", "Medical_History_8", "Medical_History_9", "Medical_History_10", "Medical_History_11", "Medical_History_12", "Medical_History_13", "Medical_History_14", "Medical_History_16", "Medical_History_17", "Medical_History_18", "Medical_History_19", "Medical_History_20", "Medical_History_21", "Medical_History_22", "Medical_History_23", "Medical_History_25", "Medical_History_26", "Medical_History_27", "Medical_History_28", "Medical_History_29", "Medical_History_30", "Medical_History_31", "Medical_History_33", "Medical_History_34", "Medical_History_35", "Medical_History_36", "Medical_History_37", "Medical_History_38", "Medical_History_39", "Medical_History_40", "Medical_History_41"]
    all_file_data = all_datas.loc[:, [c for c in all_datas.columns if c not in categoricalVariables]]
    all_file_cate = all_datas.loc[:, [c for c in categoricalVariables]]
 
    # Standardize the numeric columns
    scalar_this = StandardScaler()
    all_file_data = pd.DataFrame(scalar_this.fit_transform(all_file_data),
                                 index=all_file_data.index,
                                 columns=all_file_data.columns)
    
    # Recombine the numeric and categorical columns
    train_datas = pd.concat([all_file_data[:train_datas.shape[0]], all_file_cate[:train_datas.shape[0]]], axis=1)
    test_datas = pd.concat([all_file_data[file_content.shape[0]:], all_file_cate[file_content.shape[0]:]], axis=1)
    
    # Vectorize with a single DictVectorizer so train and test share the same feature space
    vectorizer = DictVectorizer()
    train_datas = vectorizer.fit_transform(train_datas.to_dict(orient='records')).toarray()
    test_datas = vectorizer.transform(test_datas.to_dict(orient='records')).toarray()
    
    return (train_datas, train_lables, test_ids, test_datas)
Example #2
def boston_DBSCAN(class_num=0):
    '''Return cluster class_num of the Boston housing dataset as standardized data columns.
    :parameter
    ----------
    class_num: class number, the cluster label whose rows are returned
    :returns
    ----------
    x_boston: independent variables of cluster class_num in the Boston dataset, standardized, 13 columns
    y_boston: target variable of cluster class_num in the Boston dataset, standardized, 1 column
    '''
    # Load the full dataset
    bostondata = load_boston()
    boston_X = bostondata.data
    boston_y = bostondata.target
    boston_full = np.c_[boston_X, boston_y]
    # Standardize the full dataset
    scale = StandardScaler()
    boston_full = scale.fit_transform(boston_full)
    # Reduce the data to 3 dimensions to make visual parameter tuning easier
    pca = PCA(n_components=3)
    boston_full3 = pca.fit_transform(boston_full)
    # Cluster with DBSCAN
    clt = DBSCAN(eps=0.8, min_samples=5, n_jobs=4)
    label3 = clt.fit_predict(X=boston_full3)
    # Select the requested cluster and split it into features and target
    group_boston = boston_full[label3 == class_num]
    x_boston = group_boston[:, :-1]
    y_boston = group_boston[:, -1]
    return x_boston, y_boston
Example #3
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):

    X = []
    y = []
    sids = []

    with open(infile, "r") as fi:
        fi.readline()
        reader = csv.reader(fi)
        for row in reader:
            sids.append(row[0])
            X.append(row[1:-1])
            y0 = int(row[-1])
            if y0 == 0:
                y0 = -1
            y.append(y0)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

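    # One-hot encode columns 1-3; the categorical_features argument assumes
    # an older scikit-learn release (it was removed in 0.22)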
    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
Example #4
def get_data(args, logger, debug):
    '''Get data.'''
    # Get data:
    train_data, val_data, test_data = _get_data(args, logger)

    debug(f'train size = {len(train_data):,} | val size = {len(val_data):,} |'
          f' test size = {len(test_data):,}')

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(args.data_df)
        debug('Class sizes')
        debug(class_sizes)

    # Scale features:
    if args.features_scaling:
        features_scaler = train_data.normalize_features()
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    # Initialise scaler and scale training targets by subtracting the mean and
    # dividing by the standard deviation (regression only):
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler()
        targets = scaler.fit_transform(train_data.targets())
        train_data.set_targets(targets)
    else:
        scaler = None

    return train_data, val_data, test_data, scaler, features_scaler
Example #5
def NUS_WIDE_load_three_party_data(data_dir,
                                   selected_labels,
                                   neg_label=-1,
                                   n_samples=-1):
    print("# load_three_party_data")
    Xa, Xb, Xc, y = get_labeled_data_with_3_party(
        data_dir=data_dir,
        selected_labels=selected_labels,
        n_samples=n_samples)

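    # Standardize each party's features; note that fit_transform refits the scaler,
    # so Xa, Xb and Xc are scaled independently of one another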
    scale_model = StandardScaler()
    Xa = scale_model.fit_transform(Xa)
    Xb = scale_model.fit_transform(Xb)
    Xc = scale_model.fit_transform(Xc)

    y_ = []
    pos_count = 0
    neg_count = 0
    for i in range(y.shape[0]):
        # treat the first label in y as the positive class and all other labels as the negative class
        if y[i, 0] == 1:
            y_.append(1)
            pos_count += 1
        else:
            y_.append(neg_label)
            neg_count += 1

    print("pos counts:", pos_count)
    print("neg counts:", neg_count)

    y = np.expand_dims(y_, axis=1)

    n_train = int(0.8 * Xa.shape[0])
    Xa_train, Xb_train, Xc_train = Xa[:n_train], Xb[:n_train], Xc[:n_train]
    Xa_test, Xb_test, Xc_test = Xa[n_train:], Xb[n_train:], Xc[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    print("Xa_train.shape:", Xa_train.shape)
    print("Xb_train.shape:", Xb_train.shape)
    print("Xc_train.shape:", Xc_train.shape)
    print("Xa_test.shape:", Xa_test.shape)
    print("Xb_test.shape:", Xb_test.shape)
    print("Xc_test.shape:", Xc_test.shape)
    print("y_train.shape:", y_train.shape)
    print("y_test.shape:", y_test.shape)
    return [Xa_train, Xb_train, Xc_train,
            y_train], [Xa_test, Xb_test, Xc_test, y_test]
Example #6
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

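    # Centering (the default with_mean=True) would densify a sparse matrix,
    # so fitting a default StandardScaler on CSR input must raise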
    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
Example #8
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(
        n_rollout) + 'unroll'

    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name].values()]
                    for name in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

    x = np.concatenate((x_target, x_first, x_speed), axis=1)

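    # Standardize the inputs; the output scaler is fitted on all effort sequences
    # concatenated along time so every sequence is scaled consistently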
    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x,
                                                               y,
                                                               aux_output,
                                                               test_size=0.2)

    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')

    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
Example #9
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
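        # Scale features and targets per location; predictions are mapped back
        # to the original units with inverse_transform below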
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)
        model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=parameters["C"], cache_size=5000), max_samples=parameters["max_samples"],n_estimators=parameters["n_estimators"], verbose=0, n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)
        all_obs.extend(testY)
        all_pred.extend(prediction)
        
    return rmseEval(all_obs, all_pred)[1]
Example #10
def test_scalar():
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    scalar = StandardScaler()
    
    training = pd.read_csv(TRAIN_FEATURES_CSV, nrows=200000)
    test = pd.read_csv(TEST_FEATURES_CSV)
    
    # Standardize each feature column: fit on the training data, apply the same scaling to test
    for column in TOTAL_TRAINING_FEATURE_COLUMNS:
        training[column] = scalar.fit_transform(training[[column]]).ravel()
        test[column] = scalar.transform(test[[column]]).ravel()
Example #11
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)

        layers = []
        for _ in range(0, parameters["hidden_layers"]):
            layers.append(
                Layer(parameters["hidden_type"],
                      units=parameters["hidden_neurons"]))
        layers.append(Layer("Linear"))
        model = Regressor(layers=layers,
                          learning_rate=parameters["learning_rate"],
                          n_iter=parameters["iteration"],
                          random_state=42)

        X = np.array(trainX)
        y = np.array(trainY)

        model.fit(X, y)

        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)

        print("location: " + str(location) + " -> " +
              str(rmseEval(prediction, testY)[1]))

        all_obs.extend(testY)
        all_pred.extend(prediction)

    return rmseEval(all_obs, all_pred)[1]
Example #12
def retrieve_data(undersampling=False, ratio=1, random_state=None):
    ## Getting and reading csv-data files into a pandas dataframe
    path = os.path.dirname(os.path.realpath(__file__))
    file1 = path + "/../data/creditcard_part1.csv"
    file2 = path + "/../data/creditcard_part2.csv"

    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    df = pd.concat((df1, df2), ignore_index=True)

    ## Finding the class balances
    class_counts = df.Class.value_counts()
    num_fraudulent = class_counts[1]
    num_non_fraudulent = class_counts[0]

    ## Splitting the dataset into design matrix X and targets y
    X = df.loc[:, df.columns != 'Class'].values
    y = df.loc[:, df.columns == 'Class'].values.ravel()

    ## Scale the features to zero mean and unit variance
    standard_scaler = StandardScaler()
    X = standard_scaler.fit_transform(X)

    ### Undersampling to fix imbalanced class
    if undersampling:

        if random_state is not None:
            np.random.seed(random_state)

        if ratio > 1:
            raise ValueError("Undersampling ratio can't be larger than one")

        multiplier = int(1.0 / ratio)

        ## Randomized undersampling method
        indices_nonfraud = np.where(y == 0)[0]
        indices_fraud = np.where(y == 1)[0]
        np.random.shuffle(indices_nonfraud)
        indices_nonfraud_under = indices_nonfraud[:multiplier * num_fraudulent]
        indices_under = np.concatenate((indices_fraud, indices_nonfraud_under))
        np.random.shuffle(indices_under)

        ## Using indices from undersampling method to create new balanced dataset
        X_under = X[indices_under]
        y_under = y[indices_under]
    else:
        ## Without undersampling, keep the full dataset
        X_under, y_under = X, y

    ## Splitting the dataset into test and training sets
    X_train, X_test, y_train, y_test = train_test_split(X_under,
                                                        y_under,
                                                        test_size=0.33,
                                                        random_state=4)
    return X_train, X_test, y_train, y_test
Example #13
    def _proccess_input(self, target_pos, target_speed, pos, vel, effort):
        x_target = np.array(target_pos)
        x_first = np.array([pos_[0] for pos_ in pos])
        x_speed = np.array(target_speed).reshape((-1, 1))
        aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

        x = np.concatenate((x_target, x_first, x_speed), axis=1)

        input_scaler = StandardScaler()
        x = input_scaler.fit_transform(x)
        output_scaler = StandardScaler()
        effort_concat = np.concatenate([a for a in effort], axis=0)
        output_scaler.fit(effort_concat)
        effort = [output_scaler.transform(eff) for eff in effort]

        y = pad_sequences(effort, padding='post', value=0.)
        aux_output = pad_sequences(aux_output, padding='post', value=0.)
        x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2)
        return x, x_test, y, y_test, y_aux, y_aux_test
Example #14
    def test_iris(self):
        train_X, test_X, train_y, test_y = data_io.get_iris_train_test()
        print("train_X's shape = %s, train_y's shape = %s" %
              (train_X.shape, train_y.shape))
        print("test_X's shape = %s, test_y's shape = %s" %
              (test_X.shape, test_y.shape))

        print("Applying standard scaling ...")
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        test_X = scaler.transform(test_X)

        # train_X = test_X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        # train_y = test_y = np.array([0, 1, 1, 0])

        # train_X = test_X = np.array([[0], [1]])
        # train_y = test_y = np.array([0, 1])

        layers = [100]
        clf = MLPClassifier(layers,
                            batch_size=train_X.shape[0],
                            n_epochs=100,
                            learning_rate=0.1)
        print("clf: %s" % clf)

        print("Fitting ...")
        clf.fit(train_X, train_y)

        print("Predicting ...")
        pred_y = clf.predict(test_X)
        print("y = %s" % test_y)
        print("pred_y = \n%s" % pred_y)

        # pred_proba_y = clf.predict_proba(test_X)
        # print("pred_proba_y = \n%s" % pred_proba_y)

        accuracy = accuracy_score(test_y, pred_y)
        print("Accuracy = %g%%" % (100 * accuracy))

        self.assertGreaterEqual(accuracy, 0.89)
Example #15
def test_scaler_int():
    # test that scaler converts integer input to floating
    # for both sparse and dense matrices
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

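    # Integer input triggers a data-conversion warning from the scaler,
    # so each fit/transform call is wrapped in catch_warnings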
    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    with warnings.catch_warnings(record=True):
        X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    with warnings.catch_warnings(record=True):
        scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
        X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    with warnings.catch_warnings(record=True):
        scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
        X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0),
        [0., 1.109, 1.856, 21., 1.559], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(
        X_csr_scaled.astype(np.float64))
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
Example #16
def main():
    df = load_train_data()
    logger.info('column hash = %d', utils.column_hash(df))
    df = preprocess.drop_column(df, 'fullVisitorId')
    df = preprocess.drop_column(df, 'sessionId')
    #    debug_info(df)

    y = df['totals_transactionRevenue']
    X = preprocess.drop_column(df, 'totals_transactionRevenue')

    #    X, _, y, _ = utils.split_data(X, y, ratio=0.9, seed=42)

    #    n_classes = 10
    n_models = 100

    y_max = y.max()

    for i in range(n_models):

        X_train, X_test, y_train, y_test = utils.split_data(X, y)

        logger.info('training')

        #         y_train, quants = preprocess.make_class_target(y_train, n_classes)
        #         logger.info('y_train.unique() = %s', y_train.unique())
        #         logger.info('quants = %s', quants)

        #        y_train = preprocess.make_class_target2(y_train, y_max, n_classes)

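        # Fit the scaler on this model's training split; the same scaler is
        # applied to X_test before prediction below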
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)

        logger.info('X_train.shape = %s', X_train.shape)

        #         cumulative = np.cumsum(pca.explained_variance_ratio_)
        #         pylab.plot(cumulative, 'r-')
        #         pylab.show()

        #        model = build_classifier(X_train.shape[1], n_classes)
        model = build_regressor(X_train.shape[1])
        EPOCHS = 100
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=5)
        history = model.fit(X_train,
                            y_train,
                            epochs=EPOCHS,
                            validation_split=0.1,
                            verbose=0,
                            callbacks=[early_stop,
                                       utils.EpochCallback()])

        linear_model = LinearRegression()
        linear_model.fit(X_train, y_train)

        logger.info('predicting')
        logger.info('X_test.shape = %s', X_test.shape)

        X_test = scaler.transform(X_test)

        #        y_classes = model.predict(X_test)
        #        y_pred = postprocess.make_real_predictions(y_classes, quants)
        #        y_pred = postprocess.make_real_predictions2(y_classes, y_max)

        y_pred = model.predict(X_test).flatten()
        y_linear_pred = linear_model.predict(X_test)

        rms = np.sqrt(mean_squared_error(y_test, y_pred))
        linear_rms = np.sqrt(mean_squared_error(y_test, y_linear_pred))
        logger.info('rms = %s', rms)
        logger.info('linear_rms = %s', linear_rms)

        #        save_model(model, i, quants, scaler)
        save_model2(model, linear_model, i, y_max, scaler)


#    plot_history_classifier(history)
    plot_history_regressor(history)

    pylab.figure()
    pylab.scatter(y_pred, y_test, alpha=0.5)
    pylab.xlabel("pred")
    pylab.ylabel("test")

    hist_revenue(y_linear_pred, 'y_linear_pred')
    hist_revenue(y_pred, 'y_pred')
    hist_revenue(y_test, 'y_test')

    pylab.show()
Example #17
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

output = open(OUTPUT_DATA_FILE, 'w')
output.write("location,observation,prediction\n")

for location in locations:
    print(str(location))
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    normalizer_X = StandardScaler()
    trainX = normalizer_X.fit_transform(trainX)
    testX = normalizer_X.transform(testX)
    normalizer_Y = StandardScaler()
    trainY = normalizer_Y.fit_transform(trainY)
    testY = normalizer_Y.transform(testY)
    model = BaggingRegressor(base_estimator=SVR(kernel='rbf',
                                                C=40,
                                                cache_size=5000),
                             max_samples=4200,
                             n_estimators=10,
                             verbose=0,
                             n_jobs=-1)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    prediction = normalizer_Y.inverse_transform(prediction)
    testY = normalizer_Y.inverse_transform(testY)
Example #18
X2 = [[float(x) / 10.0, float(x) / 10.0,
       float(x) / 10.0] for x in range(0, 231)]

layers = []
layers.append(Layer("Rectifier", units=100))
layers.append(Layer("Rectifier", units=100))
layers.append(Layer("Rectifier", units=100))
layers.append(Layer("Linear"))

model = Regressor(layers=layers,
                  learning_rate=0.001,
                  n_iter=5000,
                  random_state=42)

normalizer_X = StandardScaler()
trainX = normalizer_X.fit_transform(X)
trainX2 = normalizer_X.transform(X2)  # reuse the scaler fitted on the training inputs
normalizer_Y = StandardScaler()
trainY = normalizer_Y.fit_transform(Y)

model.fit(np.array(trainX), np.array(trainY))
Y_pred = model.predict(np.array(trainX2))
Y_pred = normalizer_Y.inverse_transform(Y_pred)

Y_pred = [y[0] for y in Y_pred]

print(str(Y_pred))

plot2(Y, Y_pred, OUTPUT_PNG_FILE, "Observed pollution concentration levels",
      "Predicted pollution concentration levels by ANR")
Example #19
    bestPar2 = clf1.best_params_
    writeDict(str(bestPar2), "dict2XGB.txt")
else:
    clf1 = XGBClassifier(**bestParamXGB2)
    clf1.fit(dataTrain, y)

delta = gmtime(time() - t0)
tstr = strftime('%H:%M:%S', delta)
print("Time since beginning:%s" % tstr)

# Scale data
print("Scaling")
scaler = StandardScaler()
imputer = Imputer()
dataTrain = imputer.fit_transform(dataTrain)
dataTrain = scaler.fit_transform(dataTrain)

#Fit classifier that needs imputation
print("Fitting")
if (gridSearchRF):
    clf2.fit(dataTrain, y)
    bestPar2 = clf2.best_params_
    writeDict(str(bestPar2), "dictRF.txt")
else:
    clf2 = XGBClassifier(**bestParamRF)
    clf2.fit(dataTrain, y)

print("Fit completed")

delta = gmtime(time() - t0)
tstr = strftime('%H:%M:%S', delta)
Example #20
plt.ylabel("Amount", size=14)
plt.show()

## Plotting the correlation matrix. (Dataset is already PCA'd)
sb.heatmap(data=df.corr(), cmap="viridis", annot=False)
plt.show()

## There are no categories in the dataset, so no need to do one-hot encoding.

## Splitting the dataset into design matrix X and targets y
X = df.loc[:, df.columns != 'Class'].values
y = df.loc[:, df.columns == 'Class'].values.ravel()

## Scaling the data (Most for Time and Amount)
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

## Randomized undersampling method
indices_nonfraud = np.where(y == 0)[0]
indices_fraud = np.where(y == 1)[0]
np.random.shuffle(indices_nonfraud)
indices_nonfraud_under = indices_nonfraud[:num_fraudulent]
indices_under = np.concatenate((indices_fraud, indices_nonfraud_under))
np.random.shuffle(indices_under)

## Using indices from undersampling method to create new balanced dataset
X_under = X[indices_under]
y_under = y[indices_under]

## Looking at the class balance again, now for undersampled data
plt.bar([0, 1], [len(indices_nonfraud_under), len(indices_fraud)])
Example #21
                  tf.keras.metrics.AUC(name='auc')
              ])
save_best_callback = tf.keras.callbacks.ModelCheckpoint(
    './model-{epoch:02d}-{acc:.2f}.hdf5',
    monitor='acc',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    save_freq=1)
logdir = os.path.join('tflogs',
                      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
tb_train_callback = tf.keras.callbacks.TensorBoard(logdir,
                                                   histogram_freq=1,
                                                   profile_batch=0)

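# Standardize the features; the scaler fitted on X_train is reused on X_test before evaluation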
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model.fit(
    X_train_scaled,
    y_train,
    class_weight=class_weight,
    # batch_size=64,
    validation_split=0.1,
    callbacks=[save_best_callback, tb_train_callback],
    epochs=50)

# model = tf.keras.models.load_model('./model-35-0.88.hdf5')
X_test_scaled = scaler.transform(X_test)
model.evaluate(X_test_scaled, y_test)
# print(np.round(model.predict(X_test)))
Example #22
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch

    seed = 124
    np.random.seed(seed)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name_].values()]
                    for name_ in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    v_first = np.array([vel_[0] for vel_ in vel])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff_.shape[0]).reshape((-1, 1)) for eff_ in effort]

    x = np.concatenate((x_target, x_first, v_first, x_speed), axis=1)

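    # Fit a scaler on all sequences concatenated along time, then scale each
    # sequence with that shared scaler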
    def prepare_time_data(data):
        data_scaler = StandardScaler()
        data_concat = np.concatenate(data, axis=0)
        data_scaler.fit(data_concat)
        new_data = [data_scaler.transform(data_) for data_ in data]

        return data_scaler, new_data

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    effort_scaler, effort = prepare_time_data(effort)
    pos_scaler, pos = prepare_time_data(pos)
    vel_scaler, vel = prepare_time_data(vel)

    torque = pad_sequences(effort, padding='post', value=0., dtype=np.float64)
    pos = pad_sequences(pos, padding='post', value=0., dtype=np.float64)
    vel = pad_sequences(vel, padding='post', value=0., dtype=np.float64)
    aux_output = pad_sequences(aux_output,
                               padding='post',
                               value=0.,
                               dtype=np.float64)
    mask = aux_output[:, :, 0]
    aux_mask = np.ones(aux_output.shape[:2])

    x, x_test, torque, torque_test, pos, pos_test, vel, vel_test, \
    aux, aux_test, mask, mask_test, aux_mask, aux_mask_test = \
        train_test_split(x, torque, pos, vel, aux_output, mask, aux_mask, test_size=0.3, random_state=seed)

    kf = KFold(n_splits=3, shuffle=True, random_state=seed)

    if not os.path.exists('save_model_selection'):
        os.makedirs('save_model_selection')

    for (train_index, cv_index), i in zip(kf.split(x), range(kf.n_splits)):
        widths_gru = [1000]
        depths_gru = [1]
        dropout_fractions = [0.5]
        convolution_layer = [False]
        l2_weights = [1e-3]
        names = [
            'gru:{}-{}_conv:{}_dropout:{}_l2:{}/fold:{}'.format(
                width_, depth_, conv_, drop_, l2_, i)
            for width_, depth_, conv_, drop_, l2_ in zip(
                widths_gru, depths_gru, convolution_layer, dropout_fractions,
                l2_weights)
        ]
        save_names = ['save_model_selection/' + name_ for name_ in names]
        log_names = ['log_model_selection/' + name_ for name_ in names]
        img_names = ['imgs/' + name_ for name_ in names]

        this_x, this_torque, this_pos, this_vel, this_aux, this_mask, this_aux_mask = \
            [a_[train_index] for a_ in [x, torque, pos, vel, aux, mask, aux_mask]]
        this_x_cv, this_torque_cv, this_pos_cv, this_vel_cv, this_aux_cv, this_mask_cv, this_aux_mask_cv = \
            [a_[cv_index] for a_ in [x, torque, pos, vel, aux, mask, aux_mask]]

        for width_gru, depth_gru, dropout_fraction, conv, l2_weight, save_name, log_name, img_name in \
                zip(widths_gru, depths_gru, dropout_fractions, convolution_layer, l2_weights,
                    save_names, log_names, img_names):
            div_torque = np.split(this_torque, 7, axis=2)
            div_torque_cv = np.split(this_torque_cv, 7, axis=2)
            div_torque_test = np.split(torque_test, 7, axis=2)
            model = MyModel(train=[this_x, div_torque + [this_aux]],
                            val=[this_x_cv, div_torque_cv + [this_aux_cv]],
                            test=[x_test, div_torque_test + [aux_test]],
                            train_mask=[this_mask] * 7 + [this_aux_mask],
                            val_mask=[this_mask_cv] * 7 + [this_aux_mask_cv],
                            test_mask=[mask_test] * 7 + [aux_mask_test],
                            max_unroll=n_rollout,
                            save_dir=save_name,
                            log_dir=log_name,
                            img_dir=img_name,
                            width_gru=width_gru,
                            depth_gru=depth_gru,
                            width_dense=50,
                            depth_dense=2,
                            torque_scaler=effort_scaler,
                            conv=conv,
                            dropout_fraction=dropout_fraction,
                            l2_weight=l2_weight)
            if args.train:
                model.fit(nb_epoch=n_epoch, batch_size=512)
            elif args.resume:
                model.resume(nb_epoch=n_epoch, batch_size=512)
            if args.visualization:
                model.load()
                model.save_figs()
Example #24
class SkRanker(Ranker, SkLearner):
    '''
    Basic ranker wrapping scikit-learn functions
    '''
    
    def train(self, dataset_filename, 
              scale=True, 
              feature_selector=None, 
              feature_selection_params={},
              feature_selection_threshold=.25, 
              learning_params={}, 
              optimize=True, 
              optimization_params={}, 
              scorers=['f1_score'],
              attribute_set=None,
              class_name=None,
              metaresults_prefix="./0-",
              **kwargs):
        
        plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf")
        data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name,  **kwargs)
        learner = self.learner
        
        #the class must remember the attribute_set and the class_name in order to reproduce the vectors
        self.attribute_set = attribute_set
        self.class_name = class_name

 
        #scale data to the mean
        if scale:
            log.info("Scaling datasets...")
            log.debug("Data shape before scaling: {}".format(data.shape))
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
            log.debug("Data shape after scaling: {}".format(data.shape))
            log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.std_))

        #avoid any NaNs and Infs that may have occurred due to the scaling
        data = np.nan_to_num(data)
        
        #feature selection
        if isinstance(feature_selection_params, basestring):
            feature_selection_params = eval(feature_selection_params)
        self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) 
        
        #initialize learning method and scoring functions and optimize
        self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers)

        log.info("Data shape before fitting: {}".format(data.shape))

        self.learner.fit(data, labels)
        self.fit = True
        return metadata
    
    def get_model_description(self):
        params = {}
        
        if self.scaler:
            params = self.scaler.get_params(deep=True)
        try: #these are for SVC
            if self.learner.kernel == "rbf":
                params["gamma"] = self.learner.gamma
                params["C"] = self.learner.C
                for i, n_support in enumerate(self.learner.n_support_):
                    params["n_{}".format(i)] = n_support
                log.debug(len(self.learner.dual_coef_))
                return params
            elif self.learner.kernel == "linear":
                coefficients = self.learner.coef_
                att_coefficients = {}
                for attname, coeff in zip(self.attribute_set.get_names_pairwise(), coefficients[0]):
                    att_coefficients[attname] = coeff
                return att_coefficients
        except AttributeError:
            pass
        try: #adaboost etc
            params = self.learner.get_params()
            numeric_params = OrderedDict()
            for key, value in params.iteritems():
                try:
                    value = float(value)
                except ValueError:
                    continue
                numeric_params[key] = value
            return numeric_params
        except:
            pass
        return {}
    
    
    def get_ranked_sentence(self, parallelsentence, critical_attribute="rank_predicted", 
                            new_rank_name="rank_hard", 
                            del_orig_class_att=False, 
                            bidirectional_pairs=False, 
                            ties=True,
                            reconstruct='hard'):
        """
        """
        if type(self.learner) == str:
            if self.classifier:
                self.learner = self.classifier
                # this is to provide backwards compatibility for old models
                # whose classes used different attribute names
                try:
                    self.learner._dual_coef_ = self.learner.dual_coef_
                    self.learner._intercept_ = self.learner.intercept_
                except AttributeError:
                    # it's ok if the model doesn't have these variables
                    pass

                try: # backwards compatibility for old LogisticRegression
                    try_classes = self.learner.classes_
                except AttributeError:
                    self.learner.classes_ = [-1, 1]

        #de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences(bidirectional_pairs=bidirectional_pairs,
                                                                                     class_name=self.class_name,
                                                                                     ties=ties)        
        if len(parallelsentence.get_translations()) == 1:
            log.warning("Parallelsentence has only one target sentence")
            parallelsentence.tgt[0].add_attribute(new_rank_name, 1)
            return parallelsentence, {}
        elif len(parallelsentence.get_translations()) == 0:
            return parallelsentence, {}
        #list that will hold the pairwise parallel sentences including the learner's decision
        classified_pairwise_parallelsentences = []
        resultvector = {}
        
        for pairwise_parallelsentence in pairwise_parallelsentences:
            #convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(pairwise_parallelsentence, attribute_set=self.attribute_set)
            #scale data instance to mean, based on trained scaler
            if self.scaler:
                try:
                    instance = np.nan_to_num(instance)
                    instance = self.scaler.transform(instance)
                except ValueError as e:
                    log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e))
                    #raise ValueError(e)
                    pass
            try:
                if self.featureselector:
                    instance = np.nan_to_num(instance)
                    instance = self.featureselector.transform(instance)
            except AttributeError:
                pass
            log.debug('Instance = {}'.format(instance)) 
            #make sure no NaN or inf appears in the instance
            instance = np.nan_to_num(instance)
            #run learner for this instance
            predicted_value = self.learner.predict(instance)
            try:
                distribution = dict(zip(self.learner.classes_, self.learner.predict_proba(instance)[0]))
            except AttributeError: 
                #if learner does not support per-class probability (e.g. LinearSVC) assign 0.5
                distribution = dict([(cl, 0.5) for cl in self.learner.classes_])
            log.debug("Distribution: {}".format(distribution))
            log.debug("Predicted value: {}".format(predicted_value))
            #even if we have a binary learner, it may be that it cannot decide between two classes
            #for us, this means a tie
            if not bidirectional_pairs and distribution and len(distribution)==2 and float(distribution[1])==0.5:
                predicted_value = 0
                distribution[predicted_value] = 0.5
                
            log.debug("{}, {}, {}".format(pairwise_parallelsentence.get_system_names(), predicted_value, distribution))
            
            
            #gather several metadata from the classification, which may be needed 
            resultvector.update({'systems' : pairwise_parallelsentence.get_system_names(),
                                 'value' : predicted_value,
                                 'distribution': distribution,
                                 'confidence': distribution[int(predicted_value)],
#                                 'instance' : instance,
                                 })
            
            #add the new predicted ranks as attributes of the new pairwise sentence
            pairwise_parallelsentence.add_attributes({"rank_predicted":predicted_value,
                                                       "prob_-1":distribution[-1],
                                                       "prob_1":distribution[1]
                                                       })
            
            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        
        #gather all classified pairwise comparisons of into one parallel sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
        if reconstruct == 'hard':
            log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name))
            ranked_sentence = sentenceset.get_multiranked_sentence(critical_attribute=critical_attribute, 
                                                               new_rank_name=new_rank_name, 
                                                               del_orig_class_att=del_orig_class_att)
        else:
            attribute1 = "prob_-1"
            attribute2 = "prob_1"
            log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name))
            try:
                ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, 
                        critical_attribute, new_rank_name, normalize_ranking=False)
            except:
                raise ValueError("Sentenceset {} from {} caused exception".format(classified_pairwise_parallelsentences, parallelsentence))
        return ranked_sentence, resultvector
Example #25
            svecs.append(svec)
        #END per-student loop

    svecs = numpy.array(svecs)

    #     gmarks = []
    #     for sv in svecs:
    #         if sv[0]==-1:
    #             gmarks.append("D")
    #         elif sv[0]==1:
    #             gmarks.append("O")
    #         else:
    #             gmarks.append(".")

    scaler = StandardScaler()
    svecs = scaler.fit_transform(svecs)
    print(svecs)
    print(m, f, h)
    print("number of students", len(svecs))

    print("gender, avg_atts, avg_hints, avg_succ, avg_diff")
    kmeans = KMeans(n_clusters=3, n_jobs=-1).fit(svecs)
    labs = kmeans.labels_
    centres = kmeans.cluster_centers_
    print(centres)
    print(kmeans.inertia_)

    pca = PCA(n_components=2)
    tvecs = pca.fit_transform(svecs)

    x, y = zip(*tvecs)
Example #26
def train_and_test(alpha,
                   predictors,
                   predictor_params,
                   x_filename,
                   y_filename,
                   n_users,
                   percTest,
                   featureset_to_use,
                   diff_weighting,
                   phi,
                   force_balanced_classes,
                   do_scaling,
                   optimise_predictors,
                   report,
                   conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")

    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()

    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    #print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X,
                                                        all_y,
                                                        test_size=percTest,
                                                        random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:,mask]
    # X_test = X_test[:,mask]

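    # Optionally agglomerate the features down to 100 clusters, then standardize;
    # both transformers are fitted on the training split only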
    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl',
                  'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if (force_balanced_classes):
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  #0.118)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())

        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train,
                                      predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open(
                '../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                    p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | |  |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^|  |) ||  |  \ | | *** ** *\n")
    #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):

        report.write(",".join(
            map(str, (all_X.shape[0], str(p).replace(",", ";").replace(
                "\n", ""), force_balanced_classes, diff_weighting, alpha, phi,
                      do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x,y,yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(
                str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")
        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test,
                                                     y_pred,
                                                     labels=classes,
                                                     average=None,
                                                     warn_for=('precision',
                                                               'recall',
                                                               'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            labels=classes,
            average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix],
                                                   s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred)+"\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
Example #27
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):

    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be a non-empty list of dataframe columns"
        )

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train_features = train[features]
    train_targets = train[target]

    # X_scaler = MinMaxScaler()
    # target_scaler = MinMaxScaler()
    # y_scaler = MinMaxScaler()

    X_scaler = StandardScaler()
    target_scaler = StandardScaler()
    y_scaler = StandardScaler()

    # 'load' is our key target: if it is among the features, fit the y scaler on it;
    # otherwise fit the y scaler on the target column
    if 'load' in features:
        tg = train[['load']]
        y_scaler.fit(tg)
    else:
        tg = train[target]
        ## fit the scaler on the target column
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[target] = target_scaler.fit_transform(train_targets)

    X_scaler.fit(train_features)
    train[features] = X_scaler.transform(train_features)

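    # Wrap the scaled training frame into lagged input tensors
    # (time_step_lag steps of history per feature, horizon steps ahead as target)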
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(
        valid_start_time, time_format) - dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt)
        & (multi_time_series_df.index < test_start_time)]
    valid_features = valid[features]
    valid[features] = X_scaler.transform(valid_features)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test_features = test[features]
    test[features] = X_scaler.transform(test_features)
    test_inputs = TimeSeriesTensor(test,
                                   target=target,
                                   H=horizon,
                                   freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler