Example no. 1
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler(with_mean=False)
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, with_mean=False)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)
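Note: in most of these snippets, Scaler is scikit-learn's standardization transformer under its old name (later StandardScaler), though a few repos alias other scalers to the same name (one passes feature_range, which matches MinMaxScaler). A minimal compatibility sketch for running a test like the one above on a current install, under that StandardScaler assumption:

# Hedged compatibility sketch: assumes Scaler corresponds to today's StandardScaler.
import numpy as np
from sklearn.preprocessing import StandardScaler as Scaler

rng = np.random.RandomState(42)
X = rng.randn(4, 5)
scaler = Scaler(with_mean=False).fit(X)   # scale variance only, keep the mean
X_scaled = scaler.transform(X)
assert np.allclose(scaler.inverse_transform(X_scaled), X)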
Example no. 2
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        reg.fit(X, y)
        reg.predict(X)
        assert_greater(reg.score(X, y), 0.5)
Example no. 3
def run_svm(svc, X):
    X = X.copy()
    scaler = Scaler()
    X = scaler.fit_transform(X)
    y_predict = svc.predict(X)
    
    return y_predict
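Whether refitting a fresh Scaler on the data being classified is intended depends on the caller; if svc was trained on data standardized with a particular scaler, a hedged variant that reuses that fitted scaler (the train_scaler argument here is hypothetical, not part of the original snippet) would look like:

# Hedged sketch: reuse the scaler fitted on the training data instead of
# refitting a new one on the data to be predicted (train_scaler is hypothetical).
def run_svm_with_train_scaler(svc, train_scaler, X):
    X = train_scaler.transform(X.copy())
    return svc.predict(X)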
Example no. 4
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
Example no. 5
def test_scaler_without_copy():
    """Check that Scaler.fit does not change input"""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    X_copy = X.copy()
    Scaler(copy=False).fit(X)
    assert_array_equal(X, X_copy)

    X_csr_copy = X_csr.copy()
    Scaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
Example no. 6
    def _pre_fit(self, X, y):
        random_state = check_random_state(self.random_state)

        if self.scale_y:
            self.y_scaler_ = Scaler(copy=True).fit(y)
            y = self.y_scaler_.transform(y)

        if self.metric == "precomputed":
            self.components_ = None
            n_components = X.shape[1]
        else:
            if self.init_components is None:
                if self.verbose: print("Selecting components...")
                self.components_ = select_components(X, y,
                                                     self.n_components,
                                                     random_state=random_state)
            else:
                self.components_ = self.init_components

            n_components = self.components_.shape[0]


        n_nonzero_coefs = self.n_nonzero_coefs
        if 0 < n_nonzero_coefs and n_nonzero_coefs <= 1:
            n_nonzero_coefs = int(n_nonzero_coefs * n_components)
        n_nonzero_coefs = int(n_nonzero_coefs)

        if n_nonzero_coefs > n_components:
            raise AttributeError("n_nonzero_coefs cannot be bigger than "
                                 "n_components.")

        if self.verbose: print("Computing dictionary...")
        start = time.time()
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.verbose: print("Done in", time.time() - start, "seconds")

        if self.scale:
            if self.verbose: print("Scaling dictionary")
            start = time.time()
            copy = True if self.metric == "precomputed" else False
            self.scaler_ = Scaler(copy=copy)
            K = self.scaler_.fit_transform(K)
            if self.verbose: print("Done in", time.time() - start, "seconds")

        # FIXME: this allocates a lot of intermediary memory
        norms = np.sqrt(np.sum(K ** 2, axis=0))

        return n_nonzero_coefs, K, y, norms
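The FIXME above notes that computing the column norms via K ** 2 materialises a full temporary array; a hedged alternative (a sketch assuming K is a dense 2-D array, not the original author's code) avoids it with einsum:

# Hedged alternative for the FIXME: einsum sums the squared entries of each
# column without materialising the K ** 2 temporary.
import numpy as np

K = np.random.rand(1000, 50)                  # stand-in for the kernel dictionary
norms = np.sqrt(np.einsum("ij,ij->j", K, K))
assert np.allclose(norms, np.sqrt(np.sum(K ** 2, axis=0)))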
Example no. 7
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sp.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
Example no. 8
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sp.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
Example no. 9
def getPreparedData():
    data = getData()
    attributes = data.columns
    # convert categorical attributes to integer codes
    data['school'] = data['school'].apply(school_to_numeric)
    data['sex'] = data['sex'].apply(sex_to_numeric)
    data['address'] = data['address'].apply(address_to_numeric)
    data['famsize'] = data['famsize'].apply(famsize_to_numeric)
    data['Pstatus'] = data['Pstatus'].apply(Pstatus_to_numeric)
    data['Mjob'] = data['Mjob'].apply(job_to_numeric)
    data['Fjob'] = data['Fjob'].apply(job_to_numeric)
    data['reason'] = data['reason'].apply(reason_to_numeric)
    data['guardian'] = data['guardian'].apply(guardian_to_numeric)
    data['schoolsup'] = data['schoolsup'].apply(yesno_to_numeric)
    data['famsup'] = data['famsup'].apply(yesno_to_numeric)
    data['paid'] = data['paid'].apply(yesno_to_numeric)
    data['activities'] = data['activities'].apply(yesno_to_numeric)
    data['nursery'] = data['nursery'].apply(yesno_to_numeric)
    data['higher'] = data['higher'].apply(yesno_to_numeric)
    data['internet'] = data['internet'].apply(yesno_to_numeric)
    data['romantic'] = data['romantic'].apply(yesno_to_numeric)

    # scale all features to the interval [0, 1]
    scaler = Scaler(feature_range=(0, 1)).fit(data)
    data = pd.DataFrame(scaler.transform(data), columns=attributes)
    return np.array(data)
Example no. 10
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in (Scaler(), Normalizer(), Binarizer()):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
Example no. 11
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    Full_data = Full_data.drop(
        columns=['track', 'album_id', 'artist', 'id', 'mode'])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    # pca_transf = PCA(0.8)
    # PCA_data = pca_transf.fit_transform(data_u)
    clusterer = AffinityPropagation(random_state=None, preference=-500)
    # clusterer = HDBSCAN(min_cluster_size=20)
    # clusterer = MeanShift()
    labels = clusterer.fit_predict(data_u)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    # Full_data.sort_values(by='cluster')
    Full_data.to_csv('clustered.csv', index=False)
    # sns.pairplot(Full_data, hue="cluster", palette='YlGnBu')
    # plt.show()
    return n_clusters
Example no. 12
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    key = Full_data['key']
    time_signature = Full_data['time_signature']
    Full_data = Full_data.drop(columns=[
        'track', 'album_id', 'artist', 'artist_id', 'id', 'mode',
        'duration_ms', 'key', 'time_signature'
    ])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=35)
    labels = clusterer.fit_predict(data_u)
    score = silhouette_score(data_u, labels)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    Full_data['key'] = key
    Full_data['time_signature'] = time_signature
    Full_data.to_csv('clustered.csv', index=False)
    return n_clusters, score
Example no. 13
def test_pipeline_methods_preprocessing_svm():
    """Test the various methods of the pipeline (preprocessing + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = Scaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True)

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example no. 14
def data_to_kernels(tr_data, te_data):
    scaler = Scaler(copy=False)
    scaler.fit_transform(tr_data)
    #tr_data, mu, sigma = standardize(tr_data)
    tr_data = power_normalize(tr_data, 0.5)
    tr_data = L2_normalize(tr_data)

    #te_data, _, _ = standardize(te_data, mu, sigma)
    scaler.transform(te_data)
    te_data = power_normalize(te_data, 0.5)
    te_data = L2_normalize(te_data)

    tr_kernel = np.dot(tr_data, tr_data.T)
    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, te_kernel
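The Gram matrices returned above are shaped (n_train, n_train) and (n_test, n_train); a hedged sketch of how such kernels are typically consumed downstream (the classifier and the synthetic data here are assumptions, not part of the original snippet):

# Hedged usage sketch with synthetic stand-in data.
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
tr_data = rng.randn(20, 5)
te_data = rng.randn(5, 5)
tr_labels = rng.randint(0, 2, size=20)        # hypothetical training labels

tr_kernel = np.dot(tr_data, tr_data.T)        # (n_train, n_train)
te_kernel = np.dot(te_data, tr_data.T)        # (n_test, n_train)

clf = SVC(kernel='precomputed').fit(tr_kernel, tr_labels)
te_pred = clf.predict(te_kernel)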
Example no. 15
def dimension(fnames, fp='ECFP', alg='PCA', maximum=int(1e5), ref='GPCR'):
    df = pd.DataFrame()
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname).dropna(subset=['Smiles'])
        sub = sub[sub.VALID == True]
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        if ref not in fname:
            sub = sub[sub.DESIRE == True]
        sub = sub.drop_duplicates(subset='Smiles')
        sub['LABEL'] = i
        df = df.append(sub)

    if fp == 'similarity':
        ref = df[(df.LABEL == 0) & (df.DESIRE == True)]
        refs = Predictor.calc_ecfp(ref.Smiles)
        fps = Predictor.calc_ecfp(df.Smiles)
        from rdkit.Chem import DataStructs
        fps = np.array(
            [DataStructs.BulkTanimotoSimilarity(fp, refs) for fp in fps])
    else:
        fp_alg = Predictor.calc_ecfp if fp == 'ECFP' else Predictor.calc_physchem
        fps = fp_alg(df.Smiles)
    fps = Scaler().fit_transform(fps)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]
    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df
Example no. 16
def utl_scaleImpute(X_data, p_imputeColumns, p_scaleColumns, p_scalers = None):
    X_data = X_data.copy()
    v_imputeColumns = [column for column in p_imputeColumns if column in X_data.columns]
    v_scaleColumns  = [column for column in p_scaleColumns  if column in X_data.columns]
    
    for column in v_imputeColumns:
        v_values = X_data[column].astype(float).values.reshape(-1, 1)
        if np.isnan(v_values).all():
            X_data[column] = -999
        else:            
            try:
                X_data[column] = Imputer(strategy = 'mean', axis = 0).fit_transform(v_values)
            except Exception:
                values = [np.unique(-999 if np.isnan(ll).all() else ll) for ll in v_values.reshape(1, -1)]
                print(column, values)
                raise
    
    if p_scalers is None:
        p_scalers = {}
        for column in v_scaleColumns:
            v_values = X_data[column].value_counts(dropna = True).index.tolist()
            p_scalers[column] = Scaler().fit(np.array(v_values).reshape(-1, 1))
    
    for column in v_scaleColumns:
        X_data[column] = p_scalers[column].transform(X_data[column].values.reshape(-1, 1) )
    
    return X_data, p_scalers
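The Imputer used above is the pre-0.20 scikit-learn class; a hedged migration sketch for current releases (assuming column-wise mean imputation is what was intended) uses SimpleImputer:

# Hedged migration sketch: Imputer(strategy='mean', axis=0) roughly maps to
# SimpleImputer(strategy='mean') on scikit-learn >= 0.20.
import numpy as np
from sklearn.impute import SimpleImputer

v = np.array([[1.0], [np.nan], [3.0]])
filled = SimpleImputer(strategy='mean').fit_transform(v)   # [[1.], [2.], [3.]]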
Example no. 17
    def __init__(self, env):        
        self.env = env 
        
        # sampling environment states in order to featurize them.
        observation_examples = np.array([self.env.observation_space.sample() for x in range(10000)])
        
        # Feature Preprocessing: Normalize to zero mean and unit variance
        # We use a few samples from the observation space to do this
        self.scaler = Scaler()
        self.scaler.fit(observation_examples)
                
        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        self.featurizer = FeatureUnion([
                ("rbf1", RBF(gamma=5.0, n_components=100)),
                ("rbf2", RBF(gamma=2.0, n_components=100)),
                ("rbf3", RBF(gamma=1.0, n_components=100)),
                ("rbf4", RBF(gamma=0.5, n_components=100))
                ])
        self.featurizer.fit(self.scaler.transform(observation_examples))

        # action model for SGD regressor
        self.action_models = []
        self.nA = self.env.action_space.n
        
        for na in range(self.nA):
            model = SGD(learning_rate="constant")
            model.partial_fit([self.__featurize_state(self.env.reset())], [0])
            self.action_models.append(model)
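The helper self.__featurize_state referenced in the loop above is not shown in this snippet; a plausible sketch (an assumption about the original class, not its actual code) consistent with how scaler and featurizer are set up would be:

    def __featurize_state(self, state):
        # Assumed behaviour: standardize one observation, then map it through
        # the RBF feature union; returns a flat feature vector.
        scaled = self.scaler.transform([state])
        return self.featurizer.transform(scaled)[0]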
Example no. 18
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators if issubclass(E,
        RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = Scaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'alpha'):
            reg1.set_params(alpha=0.01)
            reg2.set_params(alpha=0.01)
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)

        # fit
        reg1.fit(X, y)
        pred1 = reg1.predict(X)
        reg2.fit(X, y.astype(float))
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2)
Example no. 19
def load_data(path, label_encoder):
    data = {
        "train_o": [],
        "train_i": [],
        "val_o": [],
        "val_i": [],
        "test_o": [],
        "test_i": []
    }
    for data_type in ["train", "val", "test"]:
        csv_files_path = list(
            paths.list_files(os.path.join(path, data_type), ".csv"))
        for csv_path in tqdm(csv_files_path):
            df = pd.read_csv(csv_path)
            df = df["close"]

            input_data = df.values
            input_data = Scaler().fit_transform(input_data.reshape(
                -1, 1)).flatten()

            output_data = os.path.normpath(csv_path)
            output_data = output_data.split(os.path.sep)[-2]

            data[f"{data_type}_o"].append(label_encoder[output_data])
            data[f"{data_type}_i"].append(input_data)
    for key in data:
        data[key] = np.asarray(data[key])
    return data
Example no. 20
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = Scaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
Example no. 21
def reorder(data):
    F = bm.TSPCF
    scaler = Scaler()
    c_data = data.drop(
        columns=['cluster', 'id', 'artist', 'duration_ms', 'time_signature'])
    c_data = scaler.fit_transform(c_data)
    dist = distance_matrix(c_data, c_data)
    M = 2 * np.max(dist)
    np.fill_diagonal(dist, M)
    L = len(c_data)
    limite = [L * [0], L * [(L - 1)]]
    p = L * [1]
    A, B, C, D = PSO.run(F, 1000, limite, 5, 10, dist, passo=p, permut=True)
    edge_lengths = [dist[-1][0]]
    for i in range(len(A) - 1):
        d = dist[i][i + 1]
        edge_lengths.append(d)
    m_d = max(edge_lengths)
    M = edge_lengths.index(m_d)
    A = np.append(A[M:], A[:M])
    B -= m_d
    try:
        L_range = list(range(len(A)))
        N_order = [list(A).index(val) for val in L_range]
        data['new_order'] = N_order
        data = data.sort_values(by='new_order')
        dataset = data.drop(columns=['new_order'])
        try:
            dataset = data.drop(columns=['Unnamed: 0'])
        except:
            pass
        return dataset
    except:
        return data
Example no. 22
    def process_data(self):
        test = pandas.read_csv("test.csv")
        testMat = test.as_matrix()

        train = pandas.read_csv("train.csv")
        trainMat = train.as_matrix()
        trainResult = trainMat[:, 0]
        trainMat = trainMat[:, 1:]

        # trainInd = np.where(trainResult == 0)[0]
        # how_many = (trainResult == 1).sum() - len(trainInd)
        # np.random.shuffle(trainInd)
        # addedResult = trainResult[trainInd[:how_many],:]
        # addedData = trainMat[trainInd[:how_many],:]
        # trainResult = np.append(trainResult,addedResult)
        # trainMat = np.vstack((trainMat,addedData))

        cv = StratifiedKFold(trainResult, 2)
        # cv = KFold(n=trainResult.shape[0],k=2)
        reduceFeatures = ExtraTreesClassifier(
            compute_importances=True, random_state=1234, n_jobs=self.cpus, n_estimators=1000, criterion="gini"
        )
        reduceFeatures.fit(trainMat, trainResult)
        trainScaler = Scaler()

        self.cv_data = []
        self.cv_data_nonreduced = []
        for train, test in cv:
            X_train, X_test, Y_train, Y_test = (
                trainMat[train, :],
                trainMat[test, :],
                trainResult[train],
                trainResult[test],
            )
            X_train = trainScaler.fit_transform(X_train)
            X_test = trainScaler.transform(X_test)
            self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))
            X_train = reduceFeatures.transform(X_train)
            X_test = reduceFeatures.transform(X_test)
            self.cv_data.append((X_train, X_test, Y_train, Y_test))
        testMat = trainScaler.transform(testMat)
        self.testMat_nonreduced = testMat
        self.testMat = reduceFeatures.transform(testMat)
        allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
        data_handle = open("allData.pkl", "w")
        pickle.dump(allData, data_handle)
        data_handle.close()
Example no. 23
def get_sl_test_data(fileEvents, fileLabels, includedChannels, useMeans=False, parentIndices=None):
    ## declare variables
    X = fileEvents[:, includedChannels].copy()
    scaler = Scaler()
    X = scaler.fit_transform(X)

    #if parentIndices != None:
    #    X = X[parentIndices,:]
    
    #X = (X - X.mean(axis=0)) / X.std(axis=0)

    if useMeans == True:
        clusterIds,X = get_mean_matrix(X,fileLabels)
        #X = (X - X.mean(axis=0)) / X.std(axis=0)
        return clusterIds,X
    
    return X
Example no. 24
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
Example no. 25
 def __init__(self, set_name='AWA1', batch_size=256, **kwargs):
     super().__init__(set_name, batch_size, **kwargs)
     self.content = [
         'feat', 'label', 'label_emb', 's_cls', 'u_cls', 'cls_emb',
         's_cls_emb', 'u_cls_emb'
     ]
     self.parts = ['training', 'seen', 'unseen']
     self.scaler = Scaler()
     self._init_scaler()
Example no. 26
def Classifier():
    scaler = Scaler()
    es = callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=500,
        verbose=0,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    mc = tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor="val_accuracy",
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode="auto",
        save_freq="epoch",
    )
    in_data = 'clustered.csv'
    data = pd.read_csv(in_data)
    data = data.dropna()
    id = data['id']
    data = data.drop(columns=['id'])
    step = 1
    train_label = data['cluster']
    train_data = data.drop(columns=['cluster', 'artist'])
    train_data = train_data.fillna(0)
    train_label = train_label.fillna(0)
    train_data = scaler.fit_transform(train_data)
    n_layers = 2
    model = Sequential()
    model.add(Dense(64, input_dim=len(train_data[0]), activation='relu'))
    for _ in range(n_layers):
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense((1 + np.max(train_label.values)), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(train_data,
              train_label,
              epochs=1000,
              batch_size=50,
              verbose=2,
              callbacks=[es, mc],
              validation_split=0.2,
              shuffle=True)
    model.load_weights('best_model.h5')
    probs = model.predict(train_data)
    predictions = np.argmax(probs, axis=-1)
    P = pd.DataFrame(probs)
    dp = pd.DataFrame(predictions, columns=['predicted_cluster'])
    dp['predicted_prob'] = np.max(probs, axis=-1)
    P.to_csv('Song_Probs.csv', index=False)
    dp.to_csv('Song_preds.csv', index=False)
Example no. 27
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    transformers = [(name, E) for name, E in estimators if issubclass(E,
        TransformerMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    X, y = X[:10], y[:10]
    n_samples, n_features = X.shape
    X = Scaler().fit_transform(X)
    X -= X.min()
    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()

        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more features than we have.
            trans.k = 1

        # fit
        trans.fit(X, y)
        X_pred = trans.fit_transform(X, y=y)
        assert_equal(X_pred.shape[0], n_samples)

        if hasattr(trans, 'transform'):
            X_pred2 = trans.transform(X)
            assert_array_almost_equal(X_pred, X_pred2, 2)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
Example no. 28
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    succeeded = True
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        try:
            if Reg in (_PLS, PLSCanonical, PLSRegression, CCA):
                y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
                y_ = y_.T
            else:
                y_ = y
            reg.fit(X, y_)
            reg.predict(X)

            if Reg not in (PLSCanonical, CCA):  # TODO: find out why
                assert_greater(reg.score(X, y_), 0.5)
        except Exception as e:
            print(reg)
            print(e)
            print()
            succeeded = False

    assert_true(succeeded)
Example no. 29
def test_center_kernel():
    """Test that KernelCenterer is equivalent to Scaler in feature space"""
    X_fit = np.random.random((5, 4))
    scaler = Scaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = np.random.random((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
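The equivalence this test checks for the fit-time (square) kernel matrix follows from the double-centering identity; a hedged reference sketch in plain NumPy (an illustration, not KernelCenterer's actual implementation):

# Hedged sketch of K_c = K - 1n @ K - K @ 1n + 1n @ K @ 1n, with 1n = ones((n, n)) / n.
import numpy as np

def center_train_kernel(K):
    n = K.shape[0]
    one_n = np.ones((n, n)) / n
    return K - one_n @ K - K @ one_n + one_n @ K @ one_n

X = np.random.random((5, 4))
Xc = X - X.mean(axis=0)
assert np.allclose(center_train_kernel(X @ X.T), Xc @ Xc.T)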
Example no. 30
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                if not isinstance(clf, BaseLibSVM):
                    # one-vs-one decision_function of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(clf, "predict_proba"):
            try:
                # predict_proba agrees with predict:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # raises error on malformed input for predict_proba
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
Example no. 31
def single_task(feat, alg='RF', reg=False, is_extra=True):
    df = pd.read_table('data/LIGAND_RAW.tsv').dropna(subset=pair[1:2])
    df = df[df[pair[0]] == feat]
    df = df[pair].set_index(pair[1])
    year = df[pair[-1:]].groupby(pair[1]).min().dropna()
    test = year[year[pair[-1]] > 2015].index
    numery = df[pair[2]].groupby(pair[1]).mean().dropna()

    comments = df[(df.Comment.str.contains('Not Active') == True)]
    inhibits = df[(df.Standard_Type == 'Inhibition')
                  & df.Standard_Relation.isin(['<', '<='])]
    relations = df[df.Standard_Type.isin(['EC50', 'IC50', 'Kd', 'Ki'])
                   & df.Standard_Relation.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary[pair[2]] = 3.99
    binary = binary[pair[2]].groupby(binary.index).first()
    df = numery.append(binary) if is_extra else numery
    if not reg:
        df = (df > th).astype(float)
    df = df.sample(len(df))
    print(feat, len(numery[numery >= th]), len(numery[numery < th]),
          len(binary))

    test_ix = set(df.index).intersection(test)
    test = df.loc[test_ix].dropna()
    data = df.drop(test.index)

    test_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in test.index])
    data_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in data.index])
    out = 'output/single/%s_%s_%s' % (alg, 'REG' if reg else 'CLS', feat)
    if alg != 'RF':
        scaler = Scaler()
        scaler.fit(data_x)
        test_x = scaler.transform(test_x)
        data_x = scaler.transform(data_x)
    else:
        X = np.concatenate([data_x, test_x], axis=0)
        y = np.concatenate([data.values, test.values], axis=0)
        Train_RF(X, y[:, 0], out=out + '.pkg', reg=reg)
    data, test = data.to_frame(name='Label'), test.to_frame(name='Label')
    data['Score'], test['Score'] = cross_validation(data_x,
                                                    data.values,
                                                    test_x,
                                                    test.values,
                                                    alg,
                                                    out,
                                                    reg=reg)
    data.to_csv(out + '.cv.tsv', sep='\t')
    test.to_csv(out + '.ind.tsv', sep='\t')
Example no. 32
 def __init__(self):
     """
     Model initialization
     """
     self.model = Pipeline(
         steps=[('imputer', Imputer(strategy='median')),
                ('feature_union',
                 FeatureUnion(
                     n_jobs=1,
                     transformer_list=[
                         ('band_1_pipe',
                          Pipeline(steps=[
                              ('band_1', FunctionTransformer(self.band1)),
                              ('band_1_standard_scale', Scaler()),
                          ])),
                         ('band_2_pipe',
                          Pipeline(steps=[
                              ('band_2', FunctionTransformer(self.band2)),
                              ('band_2_standard_scale', Scaler()),
                          ])), ('inc_angle',
                                FunctionTransformer(self.angle))
                     ])), ('xgboost', XGBClassifier(n_estimators=100))])
Example no. 33
def sparser(dataset):
    d = dataset.drop(columns=['id', 'artist', 'cluster', 'probs'])
    scaler = Scaler()
    scaled_d = scaler.fit_transform(d)
    # scaled_d = scaled_d[-20:]
    dist = np.triu(spy.distance_matrix(scaled_d, scaled_d))
    scaled_dist = scaler.fit_transform(dist)
    arg = np.max(scaled_dist)
    indexes = np.argwhere(dist > (.25 * arg))
    I = indexes.T
    keep = list(set(I[0]))
    sparse_d = dataset.iloc[keep, :]
    sparse_d = sparse_d.sort_values('probs', ascending=False)
    return sparse_d
Example no. 34
def test_pipeline_methods_scaler_svm():
    """Test the various methods of the pipeline (scaler + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Scaler + SVC
    clf = SVC(probability=True)
    scaler = Scaler()
    pipe = Pipeline([('scaler', scaler), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example no. 35
def test_regressors_int():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators if issubclass(E,
        RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    succeeded = True
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)
Example no. 36
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example no. 37
def run_real_data_experiments(nr_samples,
                              delta,
                              verbose=0,
                              do_scatter_plot=False):

    dataset = Dataset('hollywood2',
                      suffix='.per_slice.delta_%d' % delta,
                      nr_clusters=256)
    samples, _ = dataset.get_data('test')
    nr_samples = np.minimum(len(samples), nr_samples)
    nr_samples = np.maximum(1, nr_samples)

    if verbose > 2:
        print("Loading train data.")
    tr_data, _, _ = load_sample_data(dataset, 'train', pi_derivatives=True)
    scaler = Scaler()
    scaler.fit(tr_data)

    true_values, approx_values = [], []
    for ii in range(nr_samples):
        if verbose > 2:
            sys.stdout.write("%s\r" % samples[ii].movie)
        data, _, _ = load_sample_data(dataset,
                                      str(samples[ii]),
                                      pi_derivatives=True)
        data = scaler.transform(data)
        L2_norm_true, L2_norm_approx = L2_approx(data)
        true_values.append(L2_norm_true)
        approx_values.append(L2_norm_approx)

    if verbose:
        print()
        print_info(true_values, approx_values, verbose)
        print()

    if do_scatter_plot:
        scatter_plot(true_values, approx_values)
Example no. 38
def processData(filename, split, trainingSet, testSet):
    DATASET_PATH = './'
    data_path = os.path.join(DATASET_PATH, filename)
    dataset = pd.read_csv(data_path, header=None)
    dataset.columns = [
        "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
        "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"
    ]

    median_bmi = dataset['BMI'].median()
    dataset['BMI'] = dataset['BMI'].replace(to_replace=0, value=median_bmi)

    median_bloodp = dataset['BloodPressure'].median()
    dataset['BloodPressure'] = dataset['BloodPressure'].replace(
        to_replace=0, value=median_bloodp)

    median_plglcconc = dataset['Glucose'].median()
    dataset['Glucose'] = dataset['Glucose'].replace(to_replace=0,
                                                    value=median_plglcconc)

    median_skinthick = dataset['SkinThickness'].median()
    dataset['SkinThickness'] = dataset['SkinThickness'].replace(
        to_replace=0, value=median_skinthick)

    median_twohourserins = dataset['Insulin'].median()
    dataset['Insulin'] = dataset['Insulin'].replace(to_replace=0,
                                                    value=median_twohourserins)
    # print(dataset.head())
    # print(dataset['BMI'][0])
    scaler = Scaler()
    scaler.fit(dataset)
    scaled_dataset = scaler.transform(dataset)
    # print(dataset['BMI'][0])
    df = pd.DataFrame(data=scaled_dataset)
    # print(df.head())
    # datasetlst = pd.
    # datasetlst = list(dataset)
    datasetlst = df.values.tolist()
    # map(datasetlst,dataset.values)
    # datasetlst = list(dataset.values)
    # print(datasetlst[0][5])
    print(len(datasetlst))
    for x in range(len(datasetlst)):
        for y in range(9):
            datasetlst[x][y] = float(datasetlst[x][y])
        if random.random() < split:
            trainingSet.append(datasetlst[x])
        else:
            testSet.append(datasetlst[x])
Example no. 39
def run_svm_validation(X1,y1,X2,y2,gammaRange=[0.5],cRange=[0.005],useLinear=False):
    #X_train,y_train,X_test,y_test = split_train_test(X1,y1,X2,y2)

    X = np.vstack((X1, X2))
    Y = np.hstack((y1, y2))

    scaler = Scaler()
    X = scaler.fit_transform(X)

    #if useLinear == True:
    #    svc = svm.SVC(kernel='linear')#class_weight={1: 10
    #    #    #    #svc = svm.SVC(kernel='poly',degree=3,C=1.0)
    #    svc.fit(X, Y)
    #    return svc

    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)

    grid = GridSearchCV(SVC(class_weight={1: 100}), param_grid=param_grid, cv=StratifiedKFold(y=Y,k=2))
    grid.fit(X, Y)

    print("The best classifier is: ", grid.best_estimator_)
    return grid.best_estimator_
Example no. 40
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example no. 41
def SVM_fit(X_in, y_in, X_out, gamma, C):    

    M = len(X_in[0])   #Number of features
    seed(time())
    
    #To prevent data snooping, breaks the input set into train, cross validation and test sets, with sizes proportional to 8-1-1
    
    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))

    shuffle(X_in, y_in)
    
    X_test = [X_in[i] for i in test_indices]
    y_test = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
  
    
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and store the mean and standard deviation as inner parameters
    #To avoid data snooping, the normalization statistics are computed on the held-out split only and then applied to the rest of the data
    scaler.fit(X_test, y_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before

    std_test = X_test.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    
    #Removes feature with null variance    
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_test = [[X_test[i][j] for j in f_indices] for i in range(len(X_test))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]

    M = len(f_indices)
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered    
    svc = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False, cache_size=4092, tol=1e-5)   
    svc.fit(X_in, y_in)      
        
    y_out = svc.predict(X_out)
    return y_out
Example no. 42
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = Scaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = Scaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_scaled_back, X)
Example no. 43
def test_scaler():
    """Test scaling of dataset along all axis"""
    # First test with 1D data
    X = np.random.randn(5)
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # Test with 2D data
    X = np.random.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert X_scaled is not X

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is X

    X = np.random.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X
Example no. 44
def tree_train(X_in, y_in, X_out, min_meaningful_features_ratio=1., file_log=None):    
    if file_log:        
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))

    M = len(X_in[0])   #Number of features
    seed(time())
    
    #To prevent data snooping, breaks the input set into train, cross validation and test sets, with sizes proportional to 8-1-1
    
    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
   
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and store the mean and standard deviation as inner parameters
    #To avoid data snooping, the normalization statistics are computed on the held-out split only and then applied to the rest of the data
    scaler.fit(X_scaler, y_scaler)  
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    
    #Removes feature with null variance
    
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
  
    M = len(f_indices)
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered
    best_cv_accuracy = 0.
    best_features_number = M
                
    for features_number in range(int(floor(M * min_meaningful_features_ratio)), M + 1):
    
        
        # kfold = cross_validation.KFold(len(y_in), k=10, shuffle=True)
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        svc = ExtraTreesClassifier(criterion='entropy', max_features=features_number)

                            
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
    
            X_train = array([[X_in[i][j] for j in range(M)] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([[X_in[i][j] for j in range(M)] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]        
            

            svc.fit(X_train, y_train)
            in_accuracy += svc.score(X_train, y_train)
            cv_accuracy += svc.score(X_cv, y_cv)
   
        
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:        
            file_log.writelines('# of features: {}\n'.format(len(X_train[0])))   
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
    
        if (cv_accuracy > best_cv_accuracy):
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy
            
    #Now tests the out of sample error
    if file_log:        
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(1. - best_cv_accuracy, best_features_number))
    
    
    svc = ExtraTreesClassifier(criterion='entropy', max_features=best_features_number)
    svc.fit(X_in, y_in)
    if file_log:        
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scaler, y_scaler)))    
        
    y_out = svc.predict(X_out)
    return y_out
Example no. 45
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):    
    if file_log:        
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0])   #Number of features
    seed(time())
    
    #To prevent data snooping, breaks the input set into train, cross validation and test sets, with sizes proportional to 8-1-1
    
    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    
    
    
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and store the mean and standard deviation as inner parameters
    #To avoid data snooping, the normalization statistics are computed on the held-out split only and then applied to the rest of the data
    scaler.fit(X_scaler, y_scaler)  
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    
    #Remove features with null variance
    
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]   
    
    M = len(X_in[0])
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered
    best_cv_accuracy = 0.
    best_c = 0.

    for c in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)
                            
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
    
            X_train = array([X_in[i][:] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i][:] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]            
            
            lrc.fit(X_train, y_train)
            in_accuracy += lrc.score(X_train, y_train)
            cv_accuracy += lrc.score(X_cv, y_cv)
              
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        
        if file_log:
            file_log.writelines('C: {}\n'.format(c))  
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if (cv_accuracy > best_cv_accuracy):
            best_c = c
            best_cv_accuracy = cv_accuracy
            
    #Now tests the out of sample error
    if file_log:        
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c)) 
    
    lrc = LogisticRegression(C=best_c, tol=1e-5)

    lrc.fit(X_in, y_in)
    if file_log:        
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))     
        
    y_out = lrc.predict(X_out)
    return y_out
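The scale-then-cross-validate pattern above can also be expressed with a Pipeline, which refits the scaler inside every fold and therefore sidesteps the data-snooping concern mentioned in the comments. A minimal sketch under the assumption of a recent scikit-learn (StandardScaler instead of Scaler, model_selection instead of cross_validation); the data and the C grid are illustrative.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, n_features=25, random_state=0)

# The scaler is refit on the training folds only, at every split.
pipe = make_pipeline(StandardScaler(), LogisticRegression(tol=1e-5, max_iter=1000))
grid = GridSearchCV(pipe,
                    {'logisticregression__C': [0.01, 0.1, 1.0, 10.0]},
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, y)
print('best C: %s, Ecv: %.3f' % (grid.best_params_['logisticregression__C'],
                                 1.0 - grid.best_score_))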
Exemplo n.º 46
0
def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):    
    if file_log:        
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0])   #Number of features
    seed(time())
    
    #To prevent data snooping, breaks the input set into train, cross-validation
    #and scale sets, with sizes proportional to 8-1-1
    
    #First puts aside 10% of the data for the tests
    scale_set_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))

#    shuffle(X_in, y_in)
    
    X_scale = [X_in[i] for i in scale_set_indices]
    y_scale = [y_in[i] for i in scale_set_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
        
    #Scale data first
    scaler = Scaler(copy=False)             #WARNING: copy=False => in place modification
    #Normalize the data, storing the mean and standard deviation as inner parameters of the scaler
    #To avoid data snooping, normalization is computed on a separate subset only, and then applied to the rest of the data
    scaler.fit(X_scale, y_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)         #uses the same transformation (same mean_ and std_) fit before
    
    std_test = X_scale.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    
    #Remove features with null variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scale = [[X_scale[i][j] for j in f_indices] for i in range(len(X_scale))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    
    
    if file_log:        
        file_log.writelines('Initial features: {}, Features used: {}\n'.format(M, len(X_in[0])))
    
    M = len(f_indices)
    best_cv_accuracy = 0.
    best_gamma = 0.
    best_c = 0.

     
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered
    for c in cs:
        for g in gammas:
            #Balanced cross validation (keeps the ratio of the two classes as
            #constant as possible across the k folds).
            kfold = cross_validation.StratifiedKFold(y_in, k=10)        
            svc = svm.SVC(kernel='rbf', C=c, gamma=g, verbose=False, cache_size=4092, tol=1e-5)
                                
            in_accuracy = 0.
            cv_accuracy = 0.
            for t_indices, cv_indices in kfold:
        
                X_train = array([X_in[i][:] for i in t_indices])
                y_train = [y_in[i] for i in t_indices]
                X_cv = array([X_in[i][:] for i in cv_indices])
                y_cv = [y_in[i] for i in cv_indices]                
                
                svc.fit(X_train, y_train)
                in_accuracy += svc.score(X_train, y_train)
                cv_accuracy += svc.score(X_cv, y_cv)
            
            in_accuracy /= kfold.k
            cv_accuracy /= kfold.k
            if file_log:        
                file_log.writelines('C:{}, gamma:{}\n'.format(c, g))           
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
    
            if (cv_accuracy > best_cv_accuracy):
                best_gamma = g
                best_c = c
                best_cv_accuracy = cv_accuracy
            
    if file_log:        
        file_log.writelines('\nBEST result: E_cv={}, C={}, gamma={}\n'.format(1. - best_cv_accuracy, best_c, best_gamma))
    
    
    svc = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma, verbose=False, cache_size=4092, tol=1e-5)

    svc.fit(X_in, y_in)
    if file_log:        
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scale, y_scale)))      
        
    y_out = svc.predict(X_out)
#DEBUG:    output = ['{} {:+}\n'.format(id_out[i], int(y_scale[i])) for i in range(len(X_out))]
#DEBUG:    file_log.writelines('------------------------')    
    return y_out
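The nested loops over cs and gammas are a by-hand grid search; the GridSearchCV examples later in this collection do the same thing declaratively. A sketch assuming a recent scikit-learn, with illustrative data and grids:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

param_grid = {'svc__C': 10.0 ** np.arange(-2, 3),
              'svc__gamma': 10.0 ** np.arange(-3, 1)}
grid = GridSearchCV(make_pipeline(StandardScaler(), SVC(kernel='rbf')),
                    param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, y)
print(grid.best_params_)   # cross-validated (C, gamma) pair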
Exemplo n.º 47
0
class KMPBase(BaseEstimator):

    def __init__(self,
                 n_nonzero_coefs=0.3,
                 loss=None,
                 # components (basis functions)
                 init_components=None,
                 n_components=None,
                 check_duplicates=False,
                 scale=False,
                 scale_y=False,
                 # back-fitting
                 n_refit=5,
                 estimator=None,
                 # metric
                 metric="linear", gamma=0.1, coef0=1, degree=4,
                 # validation
                 X_val=None, y_val=None,
                 n_validate=1,
                 epsilon=0,
                 score_func=None,
                 # misc
                 random_state=None, verbose=0, n_jobs=1):
        if n_nonzero_coefs < 0:
            raise AttributeError("n_nonzero_coefs should not be negative.")

        self.n_nonzero_coefs = n_nonzero_coefs
        self.loss = loss
        self.init_components = init_components
        self.n_components = n_components
        self.check_duplicates = check_duplicates
        self.scale = scale
        self.scale_y = scale_y
        self.n_refit = n_refit
        self.estimator = estimator
        self.metric = metric
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.X_val = X_val
        self.y_val = y_val
        self.n_validate = n_validate
        self.epsilon = epsilon
        self.score_func = score_func
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        return {"gamma" : self.gamma,
                "degree" : self.degree,
                "coef0" : self.coef0}

    def _get_estimator(self):
        if self.estimator is None:
            estimator = LinearRegression()
        else:
            estimator = clone(self.estimator)
        estimator.fit_intercept = False
        return estimator

    def _get_loss(self):
        if self.loss == "squared":
            return SquaredLoss()
        else:
            return None

    def _pre_fit(self, X, y):
        random_state = check_random_state(self.random_state)

        if self.scale_y:
            self.y_scaler_ = Scaler(copy=True).fit(y)
            y = self.y_scaler_.transform(y)

        if self.metric == "precomputed":
            self.components_ = None
            n_components = X.shape[1]
        else:
            if self.init_components is None:
                if self.verbose: print "Selecting components..."
                self.components_ = select_components(X, y,
                                                     self.n_components,
                                                     random_state=random_state)
            else:
                self.components_ = self.init_components

            n_components = self.components_.shape[0]


        n_nonzero_coefs = self.n_nonzero_coefs
        # A value of n_nonzero_coefs in (0, 1] is interpreted as a fraction of n_components.
        if 0 < n_nonzero_coefs <= 1:
            n_nonzero_coefs = int(n_nonzero_coefs * n_components)
        n_nonzero_coefs = int(n_nonzero_coefs)

        if n_nonzero_coefs > n_components:
            raise AttributeError("n_nonzero_coefs cannot be bigger than "
                                 "n_components.")

        if self.verbose: print "Computing dictionary..."
        start = time.time()
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.verbose: print "Done in", time.time() - start, "seconds"

        if self.scale:
            if self.verbose: print "Scaling dictionary"
            start = time.time()
            copy = True if self.metric == "precomputed" else False
            self.scaler_ = Scaler(copy=copy)
            K = self.scaler_.fit_transform(K)
            if self.verbose: print "Done in", time.time() - start, "seconds"

        # FIXME: this allocates a lot of intermediary memory
        norms = np.sqrt(np.sum(K ** 2, axis=0))

        return n_nonzero_coefs, K, y, norms

    def _fit_multi(self, K, y, Y, n_nonzero_coefs, norms):
        if self.verbose: print "Starting training..."
        start = time.time()
        coef = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_run_iterator)(self._get_estimator(),
                                       self._get_loss(),
                                       K, Y[:, i], n_nonzero_coefs, norms,
                                       self.n_refit, self.check_duplicates)
                for i in xrange(Y.shape[1]))
        self.coef_ = np.array(coef)
        if self.verbose: print "Done in", time.time() - start, "seconds"

    def _score(self, y_true, y_pred):
        if self.score_func == "auc":
            return auc(y_true, y_pred)
        if hasattr(self, "lb_"):
            y_pred = self.lb_.inverse_transform(y_pred, threshold=0.5)
            if self.score_func is None:
                return np.mean(y_true == y_pred)
            else:
                return self.score_func(y_true, y_pred)
        else:
            # FIXME: no need to ravel y_pred if y_true is 2d!
            return -np.mean((y_true - y_pred.ravel()) ** 2)

    def _fit_multi_with_validation(self, K, y, Y, n_nonzero_coefs, norms):
        iterators = [FitIterator(self._get_estimator(), self._get_loss(),
                                 K, Y[:, i], n_nonzero_coefs, norms,
                                 self.n_refit, self.check_duplicates,
                                 self.verbose)
                     for i in xrange(Y.shape[1])]

        if self.verbose: print "Computing validation dictionary..."
        start = time.time()
        K_val = pairwise_kernels(self.X_val, self.components_,
                                 metric=self.metric,
                                 filter_params=True,
                                 n_jobs=self.n_jobs,
                                 **self._kernel_params())
        if self.verbose: print "Done in", time.time() - start, "seconds"
        if self.scale:
            K_val = self.scaler_.transform(K_val)

        y_val = self.y_val
        if self.scale_y:
            y_val = self.y_scaler_.transform(y_val)


        if self.verbose: print "Starting training..."
        start = time.time()
        best_score = -np.inf
        validation_scores = []
        training_scores = []
        iterations = []

        for i in xrange(1, n_nonzero_coefs + 1):
            iterators = [it.next() for it in iterators]
            #iterators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                    #delayed(_run_iterator)(it) for it in iterators)
            coef = np.array([it.coef_ for it in iterators])
            y_train_pred = np.array([it.y_train_ for it in iterators]).T

            if i % self.n_validate == 0:
                if self.verbose >= 2:
                    print "Validating %d/%d..." % (i, n_nonzero_coefs)

                y_val_pred = np.dot(K_val, coef.T)

                validation_score = self._score(y_val, y_val_pred)
                training_score = self._score(y, y_train_pred)

                if validation_score > best_score:
                    self.coef_ = coef.copy()
                    best_score = np.abs(validation_score)

                validation_scores.append(np.abs(validation_score))
                training_scores.append(np.abs(training_score))
                iterations.append(i)

                if len(iterations) > 2 and self.epsilon > 0:
                    diff = (validation_scores[-1] - validation_scores[-2])
                    diff /= validation_scores[0]
                    if abs(diff) < self.epsilon:
                        if self.verbose:
                            print "Converged at iteration", i
                        break

        self.validation_scores_ = np.array(validation_scores)
        self.training_scores_ = np.array(training_scores)
        self.iterations_ = np.array(iterations)
        self.best_score_ = best_score

        if self.verbose: print "Done in", time.time() - start, "seconds"

    def _fit(self, K, y, Y, n_nonzero_coefs, norms):
        if self.X_val is not None and self.y_val is not None:
            meth = self._fit_multi_with_validation
        else:
            meth = self._fit_multi
        meth(K, y, Y, n_nonzero_coefs, norms)

    def _post_fit(self):
        if self.metric != "precomputed":
            used_basis = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.coef_ = self.coef_[:, used_basis]
            self.components_ = self.components_[used_basis]

    def decision_function(self, X):
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.scale:
            K = self.scaler_.transform(K)

        pred = np.dot(K, self.coef_.T)

        if self.scale_y:
            pred = self.y_scaler_.inverse_transform(pred)

        return pred
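For intuition, the prediction path implemented by decision_function boils down to: build a kernel between the new samples and the stored components, standardize it with the scaler fitted on the training dictionary, and apply the learned coefficients. The following is only an illustrative sketch with made-up shapes (StandardScaler stands in for the older Scaler); it is not part of the class above.

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import StandardScaler   # Scaler in older releases

rng = np.random.RandomState(0)
components = rng.randn(10, 5)   # hypothetical selected basis samples
X_train = rng.randn(40, 5)      # hypothetical training samples
X_new = rng.randn(3, 5)         # samples to score
coef = rng.randn(2, 10)         # hypothetical coefficients (2 outputs x 10 components)

K_train = pairwise_kernels(X_train, components, metric='rbf', gamma=0.1)
scaler = StandardScaler().fit(K_train)             # fitted once, on the training dictionary

K_new = pairwise_kernels(X_new, components, metric='rbf', gamma=0.1)
pred = np.dot(scaler.transform(K_new), coef.T)     # shape (3, 2): one score per output
print(pred.shape)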
Exemplo n.º 48
0
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    transformers = [(name, E) for name, E in estimators if issubclass(E,
        TransformerMixin)]
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
            random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = Scaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()

        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10,
            # which is more features than we have.
            trans.k = 1

        # fit

        if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print trans
            print e
            print
            succeeded = False

        if hasattr(trans, 'transform'):
            if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
                X_pred2 = trans.transform(X, y_)
            else:
                X_pred2 = trans.transform(X)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2 in zip(X_pred, X_pred2):
                    assert_array_almost_equal(x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
    assert_true(succeeded)
Exemplo n.º 49
0
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # The correlation of our design: variables correlated by blocks of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditionning
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # Our design
    X = rng.normal(size=(n_samples, n_features))
    X = np.dot(X, corr)
    # Keep [Wainwright2006] (26c) constant
    X[:n_relevant_features] /= np.abs(
            linalg.svdvals(X[:n_relevant_features])).max()
    X = Scaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    ###########################################################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y,
                                            random_state=42, eps=0.05)
Exemplo n.º 50
0
def normalize(self, data, n=N_COMPONENTS):
    X = np.array(data, dtype='float')
    #X = np.array(X[:, np.std(X, 0) != 0.0], dtype='float')
    scaler = Scaler()
    Xnorm = scaler.fit_transform(X)
    return Xnorm
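What fit_transform does here is plain column-wise standardization. A quick check in numpy (assuming the current StandardScaler name) makes that explicit:

import numpy as np
from sklearn.preprocessing import StandardScaler   # formerly sklearn.preprocessing.Scaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 60.0]])
Xnorm = StandardScaler().fit_transform(X)
# Every column is centered to zero mean and divided by its (population) standard deviation.
assert np.allclose(Xnorm, (X - X.mean(axis=0)) / X.std(axis=0))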
Exemplo n.º 51
0
if folding == "stratified":
    cv = StratifiedKFold(y, k=n_folds)
elif folding == "kfolding":
    cv = KFold(n=y.shape[0], k=n_folds)
elif folding == "leaveoneout":
    n_folds[0] = y.shape[0]
    cv = LeaveOneOut(n=y.shape[0])
else:
    print("unknown crossvalidation method!")


# -- classifier
clf = svm.SVC(kernel="linear", probability=True, C=svm_C)

# -- normalizer
scaler = Scaler()

# -- feature selection
fs = SelectPercentile(f_classif, percentile=fs_n)

print("INITIALIZE RESULTS")
if compute_predict:
    predict = np.zeros([n_splits, n_samples, n_dims, n_dims_tg]) ** np.nan
    predictg = np.zeros([n_splits, n_samplesg, n_dimsg, n_dimsg_tg, n_folds]) ** np.nan
else:
    predict = []
    predictg = []

if compute_probas:
    probas = np.zeros([n_splits, n_samples, n_dims, n_dims_tg, n_classes]) ** np.nan
    probasg = np.zeros([n_splits, n_samplesg, n_dimsg, n_dimsg_tg, n_classes, n_folds]) ** np.nan
Exemplo n.º 52
0
    k = 10
    records = data[:,1:]
    labels = data[:,0]
    n_train = 35000
    #n_val = n - n_train
    n_val = 7000
    trainset = records[:n_train,:]
    trainlabels = labels[:n_train]
    #valset = records[n_train:,:]
    #vallabels = labels[n_train:,:]
    valset = records[n_train:n_train+n_val,:]
    vallabels = labels[n_train:n_train+n_val]
    n,dim = trainset.shape

    # mean centering, stdev normalization and whitening
    scaler = Scaler()
    scaler.fit(trainset)
    trainset = scaler.transform(trainset)
    valset = scaler.transform(valset)
    pca = PCA(n_components=dim,whiten=True)
    pca.fit(trainset)
    trainset = pca.transform(trainset)
    valset = pca.transform(valset)

    config = Train_config()
    config.iterations = 10
    config.nonlinearity = 'tanh'
    config.batchsize = 50
    config.learning_rate = 0.2
    config.momentum = 0.7
    log = open('log.txt','w')
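The preprocessing above (standardize, then PCA with whiten=True) yields components with roughly unit variance, which is what whitening means in practice. A small sketch, assuming current class names:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler   # Scaler in older releases

rng = np.random.RandomState(0)
trainset = rng.randn(100, 5) * np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # unequal variances

scaled = StandardScaler().fit_transform(trainset)
whitened = PCA(n_components=5, whiten=True).fit_transform(scaled)
print(np.var(whitened, axis=0))   # all close to 1 after whitening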
Exemplo n.º 53
0
def main():
	X =[]
	Y=[]
	featuresDB = Base(os.getcwd()+"\\Databases\\features.db")
	featuresDB.open()
	print "features open"
 
	for rec in featuresDB:
		vec = []
		vec.append(rec.f1)
		vec.append(rec.f3)
		vec.append(rec.f4)
		vec.append(rec.f5)
		vec.append(rec.f6)
		vec.append(rec.f7)
		vec.append(rec.f10)
		vec.append(rec.f11)
		vec.append(rec.f12)
		vec.append(rec.f13)
		vec.append(rec.f14)
		vec.append(rec.f15)
		vec.append(rec.f16)
		vec.append(rec.f17)
		vec.append(rec.f18)
		vec.append(rec.f19)
		vec.append(rec.f20)
		vec.append(rec.f21)
		vec.append(rec.f22)
		vec.append(rec.f23)
		X.append(vec)
		Y.append(rec.score)
	print "building classifier"	

	Y = np.array(Y)
	ybar = Y.mean()
	for i in range(len(Y)):
		if Y[i]<ybar: 
			Y[i]=1
		else:
			 Y[i]=2
	scaler = Scaler().fit(X)
	X = scaler.transform(X)
	
	X= np.array(X)
	Y=np.array(Y)

	skf = cross_validation.StratifiedKFold(Y,k=2)
	for train, test in skf:
		X_train, X_test = X[train], X[test]
		y_train, y_test = Y[train], Y[test]

	
	clf = ExtraTreesClassifier(n_estimators=8,max_depth=None,min_split=1,random_state=0,compute_importances=True)
	scores = cross_validation.cross_val_score(clf,X_train,y_train,cv=5)
	
	clf.fit_transform(X_train,y_train)
	print "Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
	print clf.feature_importances_

	y_pred =clf.predict(X_test)
	print	classification_report(y_test,y_pred)
	
	model=(scaler,clf)
	joblib.dump(model,'AestheticModel\\aestheticModel.pkl')
	
	print "Done"
Exemplo n.º 54
0
from datetime import datetime, timedelta
from errorcurves import ErrorCurves
import numpy as np
from sklearn import mixture
import pandas

df = pandas.read_csv('TrainingDataset.csv')
df_test = pandas.read_csv('TestDataset.csv')
ids = df_test.pop('id')

outcomes = list()
train_sets = list()

quants = [i for i in df.columns if 'Q' in i]
df_quants = df[quants]
scaler = Scaler()
scaled = scaler.fit_transform(df_quants.fillna(0))
dpgmm = mixture.DPGMM(n_components = 75)
dpgmm.fit(scaled)
clusters = dpgmm.predict(scaled)
df['clusters'] = clusters

# Parse dates
jan1 = datetime(2000,1,1)

# Drop all rows where response variable == NaN
for i in range(1,13):
	df_i = df[df['Outcome_M'+str(i)]>0]
	outcomes.append(df_i.pop('Outcome_M'+str(i)))
	[df_i.pop(i) for i in df_i.columns if 'Out' in i]
Exemplo n.º 55
0
def load_kernels(
    dataset, tr_norms=['std', 'sqrt', 'L2'], te_norms=['std', 'sqrt', 'L2'],
    analytical_fim=False, pi_derivatives=False, sqrt_nr_descs=False,
    only_train=False, verbose=0, do_plot=False, outfile=None):

    tr_outfile = outfile % "train" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('train')
    tr_data, tr_counts, tr_labels = load_video_data(
        dataset, samples, outfile=tr_outfile, analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives, sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print "Train data: %dx%d" % tr_data.shape

    if do_plot:
        plot_fisher_vector(tr_data[0], 'before')

    scalers = []
    for norm in tr_norms:
        if norm == 'std':
            scaler = Scaler()
            tr_data = scaler.fit_transform(tr_data)
            scalers.append(scaler)
        elif norm == 'sqrt':
            tr_data = power_normalize(tr_data, 0.5)
        elif norm == 'sqrt_cnt':
            tr_data = approximate_signed_sqrt(
                tr_data, tr_counts, pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            tr_data = L2_normalize(tr_data)
        if do_plot:
            plot_fisher_vector(tr_data[0], 'after_%s' % norm)

    tr_kernel = np.dot(tr_data, tr_data.T)

    if only_train:
        return tr_kernel, tr_labels, scalers, tr_data

    te_outfile = outfile % "test" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('test')
    te_data, te_counts, te_labels = load_video_data(
        dataset, samples, outfile=te_outfile, analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives, sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print "Test data: %dx%d" % te_data.shape

    ii = 0
    for norm in te_norms:
        if norm == 'std':
            te_data = scalers[ii].transform(te_data)
            ii += 1
        elif norm == 'sqrt':
            te_data = power_normalize(te_data, 0.5)
        elif norm == 'sqrt_cnt':
            te_data = approximate_signed_sqrt(
                te_data, te_counts, pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            te_data = L2_normalize(te_data)

    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, tr_labels, te_kernel, te_labels
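The 'sqrt' and 'L2' branches correspond to the usual Fisher-vector post-processing: signed square-root (power) normalization followed by per-row L2 normalization. A plain-numpy sketch of what power_normalize and L2_normalize are assumed to compute:

import numpy as np

def power_normalize(data, alpha=0.5):
    # Signed power normalization: sign(x) * |x| ** alpha, element-wise.
    return np.sign(data) * np.abs(data) ** alpha

def L2_normalize(data):
    # Divide each row (one descriptor per row) by its L2 norm.
    norms = np.sqrt(np.sum(data ** 2, axis=1, keepdims=True))
    return data / norms

X = np.array([[4.0, -9.0], [1.0, 0.0]])
print(L2_normalize(power_normalize(X, 0.5)))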
Exemplo n.º 56
0
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import Scaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

iris_dataset = load_iris()

X, Y = iris_dataset.data, iris_dataset.target

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)

# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10. ** np.arange(-5, 5)
gamma_range = 10. ** np.arange(-5, 5)

param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=5))

grid.fit(X, Y)
Exemplo n.º 57
0
if folding == 'stratified':
    cv = StratifiedKFold(y, k=n_folds)
elif folding == 'kfolding':
    cv = KFold(n=y.shape[0], k=n_folds)
elif folding == 'leaveoneout':
    n_folds[0] = y.shape[0]
    cv = LeaveOneOut(n=y.shape[0])
else:
    print("unknown crossvalidation method!")


#-- classifier
clf = svm.SVC(kernel='linear', probability=True, C=svm_C)

#-- normalizer
scaler = Scaler()

#-- feature selection
fs = SelectPercentile(f_classif, percentile=fs_n)
#-- grid search
#parameters = {'svm__C': (1e-6,1e-3, 1e-1, .4)}
#clf       = GridSearchCV(svm, parameters,n_jobs=1)

#-- initialize results
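# Note: np.zeros(shape) ** np.nan is a compact (if obscure) way to allocate NaN-filled arrays,
# since 0.0 ** NaN evaluates to NaN; by contrast, np.empty(shape) ** 0 yields an array of ones.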
predict = np.zeros([n_splits, n_samples, n_dims]) ** np.nan
probas = np.zeros([n_splits, n_samples, n_dims, n_classes]) ** np.nan
predictg = np.zeros([n_splits, n_samplesg, n_dimsg, n_folds]) ** np.nan
probasg = np.zeros([n_splits, n_samplesg, n_dimsg, n_classes, n_folds]) ** np.nan
coef = np.empty([n_splits, n_folds, n_dims, n_classes * (n_classes - 1) / 2, n_features]) ** 0
all_folds = np.zeros([n_splits, n_folds, n_samples]) ** np.nan
y_shfl = np.copy(y)
Exemplo n.º 58
0
iris = load_iris()
X = iris.data
Y = iris.target

# dataset for decision function visualization
X_2d = X[:, :2]
X_2d = X_2d[Y > 0]
Y_2d = Y[Y > 0]
Y_2d -= 1

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the training set and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifier
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)