Example #1
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so use it directly;
    # wrapping its result in FeatureUnion again would fail, since FeatureUnion
    # expects a list of (name, transformer) pairs
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
#   combined = FeatureUnion([('linear_pca', PCA()), ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)  # or: combined.fit_transform(train_data, labels)

    return combined
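A hedged usage sketch for the function above; the random data and shapes are hypothetical placeholders, not taken from the original repository:

import numpy as np

X = np.random.rand(20, 5)   # hypothetical training data
y = np.zeros(20)            # labels are passed through but ignored by these transformers
union = pca_kpca(X, y)
X_new = union.transform(X)  # PCA, TruncatedSVD and KernelPCA outputs, concatenated column-wise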
Example #2
def rbf_kernels(env, n_samples=100000, gamma=[0.01, 0.1], n_components=100):
    """Represent observation samples using RBF-kernels.

    EXAMPLE
    -------
    >>> env = gym.make('MountainCar-v0')
    >>> rbf = rbf_kernels(env, n_components=100)
    >>> sample = env.observation_space.sample().reshape((1, env.observation_space.shape[0]))
    >>> rbf(sample).shape
    (1, 100)
    """
    observation_examples = np.array([env.observation_space.sample() for _ in range(n_samples)])

    # Fit feature scaler
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Fit feature extractor
    features = []
    for g in gamma:
        features.append(('gamma={}'.format(g), RBFSampler(n_components=n_components // len(gamma), gamma=g)))

    features = FeatureUnion(features)
    features.fit(scaler.transform(observation_examples))

    def _rbf_kernels(observation):
        return features.transform(scaler.transform(observation))

    return _rbf_kernels
Example #3
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
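For reference, the step-dropping behaviour tested above also works with stock transformers; note that recent scikit-learn releases spell the sentinel 'drop' rather than None. A minimal sketch under that assumption:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion

X = np.random.rand(10, 4)
y = np.array([0, 1] * 5)  # hypothetical binary labels
fu = FeatureUnion([("pca", PCA(n_components=2)), ("kbest", SelectKBest(k=1))])
fu.set_params(pca="drop")                        # disable the PCA branch
assert fu.fit_transform(X, y).shape == (10, 1)   # only the SelectKBest column remains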
Example #4
 def test_feature_union(self):
     """Tests that combining multiple featurizers works as expected"""
     modules = ["bag-of-words", "entities"]
     modules_list, _ = modules_to_dictionary(modules)
     feature_union = FeatureUnion(modules_list)
     feature_union.fit(texts_entities, outcomes)
     feature_union.transform(["unknown"])
Example #5
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
Example #6
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
Example #7
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
Example #8
def test_feature_union_parallel():
    # test that n_jobs work for FeatureUnion
    X = JUNK_FOOD_DOCS

    fs = FeatureUnion([("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))])

    fs_parallel = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))], n_jobs=2
    )

    fs_parallel2 = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))], n_jobs=2
    )

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())
Example #9
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example #10
def pca(x, y, test_x, n_features=-1):
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    pca = PCA(n_components=n_features)
    selection = SelectKBest(k=n_features // 2)  # k must be an int

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    combined_features.fit(x, y)

    return combined_features.transform(x), combined_features.transform(test_x)
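A quick sketch of calling the helper above on hypothetical random data; with 16 input features the default keeps ceil(sqrt(16)) = 4 PCA components plus the 2 best original features:

import numpy as np

x = np.random.rand(50, 16)       # hypothetical train features
test_x = np.random.rand(10, 16)  # hypothetical test features
y = np.array([0, 1] * 25)        # hypothetical binary labels
x_t, test_t = pca(x, y, test_x)
assert x_t.shape == (50, 6) and test_t.shape == (10, 6)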
Example #11
def taskA(strTweets, stances, X_train, X_test, y_train, y_test):
    le = preprocessing.LabelEncoder()
    count_word = TfidfVectorizer(ngram_range=(1, 3))
    count_char = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))
    vectorizer = FeatureUnion([('word', count_word), ('char', count_char)])
    vectorizer.fit(strTweets)

    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)  # reuse the encoding fitted on the training labels
    X_test = vectorizer.transform(X_test)
    X_train = vectorizer.transform(X_train)

    print("Task A w/o gender: {}".format(
        run_tests(X_train, X_test, y_train, y_test)))
Example #12
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
Example #13
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
Example #14
def test_feature_union_warns_unknown_transformer_weight():
    # Raise an error when transformer_weights contains a key not present in
    # transformer_list
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    transformer_list = [("transf", Transf())]
    # Transformer weights dictionary with incorrect name
    weights = {"transformer": 1}
    expected_msg = ('Attempting to weight transformer "transformer", '
                    "but it is not present in transformer_list.")
    union = FeatureUnion(transformer_list, transformer_weights=weights)
    with pytest.raises(ValueError, match=expected_msg):
        union.fit(X, y)
Example #15
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        #"min_df":5,
        #"max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ("description",
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=36000,
                         **tfidf_para,
                         preprocessor=get_col("description"))),
        ("title_description",
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=200000,
                         **tfidf_para,
                         preprocessor=get_col("title_description"))),
        ("text_feature",
         CountVectorizer(ngram_range=(1, 2),
                         preprocessor=get_col("text_feature"))),
        ("title",
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         preprocessor=get_col("title"))),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))

    tfvocab = vectorizer.get_feature_names()

    df.drop([
        "text_feature", "text_feature_2", "description", "title",
        "title_description"
    ],
            axis=1,
            inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
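The preprocessor=get_col(...) idiom above lets one FeatureUnion vectorize several text columns of the same record dict; a self-contained sketch of the pattern with hypothetical toy columns:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

def get_col(col_name):
    # pull one field out of each record dict before tokenization
    return lambda record: record[col_name]

df = pd.DataFrame({"title": ["red fox", "lazy dog"],
                   "description": ["a quick red fox", "a very lazy dog"]})
union = FeatureUnion([
    ("title", TfidfVectorizer(preprocessor=get_col("title"))),
    ("description", TfidfVectorizer(preprocessor=get_col("description"))),
])
X = union.fit_transform(df.to_dict("records"))  # title and description vocabularies, stacked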
Example #16
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
Example #17
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
            transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())
Example #18
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])

    msg = re.escape("Transformer tr1 (type Transf) does not provide get_feature_names")
    with pytest.raises(AttributeError, match=msg):
        ft.get_feature_names()
Example #19
    def __init__(self, env, n_components=500):
        observation_samples = np.array([env.observation_space.sample() for x in range(10000)])
        scaler = StandardScaler()
        scaler.fit(observation_samples.astype('float'))

        #convert a state to feature representation
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=3.0, n_components=n_components)),
            #("rbf2", RBFSampler(gamma=.5, n_components=n_components)),
        ])

        featurizer.fit(scaler.transform(observation_samples.astype('float')))

        self.scaler = scaler
        self.featurizer = featurizer
Example #20
class RBFAgent(BaseAgent):
    def __init__(self):
        super().__init__("MountainCar-v0")
        self.name = "RBFAgent"
        self.env._max_episode_steps = 300
        self.max_epochs = 1000
        self.alpha = 0.1
        self.gamma = 1.0
        self.epsilon = 0.8

        gammas = [0.5, 1.0, 2.0, 3.5, 5.0]
        self.n_actions = self.env.action_space.n
        self.n_components = 30
        features = [(f'rbf{i}',
                     RBFSampler(gamma=g,
                                n_components=self.n_components,
                                random_state=1)) for i, g in enumerate(gammas)]
        samples = np.array(
            [self.env.observation_space.sample() for _ in range(10000)])

        self.scaler = StandardScaler()
        self.scaler.fit(samples)
        self.featurizer = FeatureUnion(features)
        self.featurizer.fit(self.scaler.transform(samples))

        self.w = np.zeros((self.n_actions, self.n_components * len(features)))

    def featurize(self, state):
        return self.featurizer.transform(self.scaler.transform([state]))

    def Q(self, state, action):
        return state.dot(self.w[action])

    def policy(self, state, epsilon=0):
        A = np.ones(self.n_actions, dtype=float) * epsilon / self.n_actions
        a = np.argmax([self.Q(state, a) for a in range(self.n_actions)])
        A[a] += (1.0 - epsilon)
        return np.random.choice(self.n_actions, p=A)

    def perform_step(self, state, action):
        next_state, reward, done, _ = self.env.step(action)
        next_state = self.featurize(next_state)
        next_action = self.policy(next_state)
        current_q = self.Q(state, action)
        next_q = self.Q(next_state, next_action)
        self.w[action] += self.alpha * (reward + self.gamma * next_q -
                                        current_q).dot(state)
        return next_state, done
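A minimal training-loop sketch for the agent above, assuming the classic gym step/reset API that BaseAgent appears to wrap; the epsilon decay line is a hypothetical addition:

agent = RBFAgent()
for episode in range(agent.max_epochs):
    state = agent.featurize(agent.env.reset())
    done = False
    while not done:
        action = agent.policy(state, agent.epsilon)
        state, done = agent.perform_step(state, action)
    agent.epsilon *= 0.99  # hypothetical decay schedule, not part of the class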
Example #21
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example #22
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example #23
 def fit(self, X, y=None):
     Trans2 = Q2Transformer()
     Trans3 = Q3Transformer()
     Trans4 = Q4Transformer()
     combined_features = FeatureUnion([("Q2", Trans2), ("Q3", Trans3), ("Q4", Trans4)])
     # store the fitted union under its own attribute; assigning it to
     # self.fit would shadow this method
     self.combined_features_ = combined_features.fit(X)
     return self
Example #24
class Featurizer(BaseEstimator, TransformerMixin):
    """constructs a feature union of text and numeric features for each video.
    """
    def __init__(self, *args, **kwargs):
        self.featurizer = FeatureUnion(
            transformer_list=[
                ('text_title',
                 Pipeline([
                     ('selector', ItemSelector(key='title')),
                     ('count_vectorizer', CountVectorizer(*args, **kwargs)),
                 ])),
                ('text_channel_title',
                 Pipeline([
                     ('selector', ItemSelector(key='channel_title')),
                     ('count_vectorizer', CountVectorizer(*args, **kwargs)),
                 ])),
                ('numeric', NumericFeatures()),
            ],
            # weight components in FeatureUnion
            transformer_weights={
                'text_title': 1.0,
                'text_channel_title': 1.0,
                'numeric': 1.0,
            },
        )

    def fit(self, X, y=None):
        return self.featurizer.fit(X)

    def transform(self, X):
        return self.featurizer.transform(X).todense()
Example #25
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)

    # Build estimator from PCA and Univariate selection:

    combined_features = FeatureUnion([("pca", pca),
                                      ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data,
                                       labels).transform(train_data)

    # Classify:
    svm = SVC(kernel="linear")
    svm.fit(X_features, labels)

    # Do grid search over k, n_components and C:

    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
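The double-underscore names in param_grid follow scikit-learn's nested-parameter convention (<step>__<param>); a small self-contained sketch of how to list every name a pipeline of this shape accepts:

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

pipeline = Pipeline([
    ("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])),
    ("svm", SVC(kernel="linear")),
])
# includes features__pca__n_components, features__univ_select__k and svm__C
print(sorted(pipeline.get_params().keys()))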
Example #26
def testSVC(lbda=1.0, n_components=20, kbest=4):
	otto = load_otto()
	X = otto.data
	y = otto.target
	# X = otto.data[:10000, :10]
	# y = otto.target[:10000]

	scaler = StandardScaler().fit(X)
	X = scaler.transform(X)

	pca = PCA(n_components=n_components)
	selection = SelectKBest(k=kbest)

	combined_features = FeatureUnion(
		[("pca", pca), ("univ_select", selection)]
	)
	X_features = combined_features.fit(X, y).transform(X)

	svc = SVC(C=1.0/lbda, kernel='rbf', cache_size=400, probability=True)
	pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])
	trainData = X
	trainTarget = y
	pipe.fit(trainData, trainTarget)
	test_otto = load_testotto()
	testData = test_otto.data
	testData = scaler.transform(testData)
	# save the prediction
	prediction = pipe.predict(testData)  # class labels; proba below holds the probabilities
	proba = pipe.predict_proba(testData)
	save_submission(lbda, proba, prediction)
Example #27
def testSVC(lbda=1.0, n_components=20, kbest=4):
    otto = load_otto()
    X = otto.data
    y = otto.target
    # X = otto.data[:10000, :10]
    # y = otto.target[:10000]

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)

    combined_features = FeatureUnion([("pca", pca),
                                      ("univ_select", selection)])
    X_features = combined_features.fit(X, y).transform(X)

    svc = SVC(C=1.0 / lbda, kernel='rbf', cache_size=400, probability=True)
    pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scaler.transform(testData)
    # save the prediction
    prediction = pipe.predict(testData)  # class labels; proba below holds the probabilities
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
Example #28
	def best_estimator(self, X, y):
		try:
			pca = PCA(n_components=2)
			selection = SelectKBest(k=2)
			combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
			X_features = combined_features.fit(X, y).transform(X)
			regr = linear_model.LassoCV()
			pipeline = Pipeline([("features", combined_features), ("regression", regr)])

			if 'batter' in self.player:
				param_grid = dict(features__pca__n_components=[1, 2, 3],
				                  features__univ_select__k=[1, 2])
			else:
				param_grid = dict(features__pca__n_components=[1, 2,3],
				                  features__univ_select__k=[1,2])

			grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=100)
			grid_search.fit(X, y)
			self.modelled = True
			regr = grid_search
			return regr
		except ValueError as e:
			print(e)
			self.modelled = False
			return None
Example #29
def featureVect(X_train, y, components, feature_para):

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 25),
                                        stop_words="english")
    X_2 = bigram_vectorizer.fit_transform(X_train).toarray()

    vectorizer = TfidfVectorizer(ngram_range=(1, 25), stop_words="english")
    X_2_DFIDF = vectorizer.fit_transform(X_train).toarray()

    X = np.multiply(X_2, X_2_DFIDF)

    # This dataset is way too high-dimensional. Better do PCA:
    # pca = PCA(n_components=400)
    pca = SparsePCA(n_components=components[0])

    # Build estimator from PCA and Univariate selection:
    # ,("dfr",selection_fdr),("fwe",selection_fwe),("fpr",selection_fpr), ("univ_select", selection)
    feature_list = [("pca", pca)]
    feature_list += feature_para

    combined_features = FeatureUnion(feature_list)

    # Use combined features to transform dataset:
    X_features = combined_features.fit(X, y).transform(X)

    select_chi = chi2(X_2, y)

    ind = np.argpartition(select_chi[0], -components[1])[-components[1]:]
    selection_chi2 = X_2[:, ind]

    X_features = np.concatenate((X_features, selection_chi2), axis=1)

    return [X_features, combined_features, bigram_vectorizer, vectorizer, ind]
Example #30
def build_cat_data(category, model, best_params, train_data, dev_data, train_labels, dev_labels, 
                   train_categories, dev_categories):
    
    if model not in ['mlp', 'knn']:
        # reduce # of dimensions
        pca = PCA(n_components=best_params['features__pca__n_components'])
        
        # Select high value original features
        selection = SelectKBest(k=best_params['features__univ_select__k'])
        
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        
        # Use combined features to transform dataset:
        sub_features = combined_features.fit(train_data, train_labels)
        train_reduced = sub_features.transform(train_data)
        dev_reduced = sub_features.transform(dev_data)
    else:
        train_reduced = train_data
        dev_reduced = dev_data
    
    #now, subset out the correct set of data based on category
    idx = list(np.array(np.where(train_categories == category))[0])
    idx_dev = list(np.array(np.where(dev_categories == category))[0])
    cat_train_data = train_reduced.take(idx, axis=0)
    cat_train_labels = train_labels.take(idx, axis=0)  
    cat_dev_data = dev_reduced.take(idx_dev, axis=0)  
    cat_dev_labels = dev_labels.take(idx_dev, axis=0)
    
    return cat_train_data, cat_train_labels, cat_dev_data, cat_dev_labels
Example #31
def testLogistic(lbda=1.0, n_components=20, kbest=4):
	# X = otto.data[:1000, :20]
	# y = otto.target[:1000]
	otto = load_otto()
	X = otto.data[:, :]
	y = otto.target[:]
	# n_components = 20
	# kbest = 4
#	print 'y.shape =', y.shape

	scalar = StandardScaler().fit(X)
	X = scalar.transform(X)

	pca = PCA(n_components=n_components)
	selection = SelectKBest(k=kbest)

	combined_features = FeatureUnion(
		[("pca", pca), ('univ_select', selection)]
	)
	X_features = combined_features.fit(X,y).transform(X)

	logistic = LogisticRegression(C=1.0/lbda)
	pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])
	trainData = X
	trainTarget = y
	pipe.fit(trainData, trainTarget)
	# print trainTarget
	test_otto = load_testotto()
	testData = test_otto.data
	testData = scalar.transform(testData)
	# logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))
	# save the prediction
	prediction = pipe.predict(testData)  # class labels; proba below holds the probabilities
	proba = pipe.predict_proba(testData)
	save_submission(lbda, proba, prediction)
Example #32
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components = 2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k = 1)

    # Build estimator from PCA and Univariate selection:

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)

    # Classify:
    svm = SVC(kernel = "linear")
    svm.fit(X_features, labels)

    # Do grid search over k, n_components and C:

    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components = [1, 2, 3],
                      features__univ_select__k = [1, 2],
                      svm__C = [0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose = 10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
Example #33
	def trainItalianSexClassifier(self):
		#get correct labels from dictionary in trainY and testY
		trainX = self.italianTrainData[0]
		trainY = self.getYlabels(self.italianTrainData[1], 'sex')

		

		combined_features = FeatureUnion([("tfidf", TfidfVectorizer()),
										("ngrams", TfidfVectorizer(ngram_range=(3, 3), analyzer="char")), 
										("counts", CountVectorizer()),
										("latin", Latin()),	
										],transformer_weights={
											'latin': 1,
											'tfidf': 2,
											'ngrams': 2,
											'counts': 1,

        								})
		
		X_features = combined_features.fit(trainX, trainY).transform(trainX)
		classifier = svm.LinearSVC()
		pipeline = Pipeline([("features", combined_features), ("classifier", classifier)])
		pipeline.fit(trainX, trainY)
		
		return pipeline
Example #34
    def best_estimator(self, X, y):
        try:
            pca = PCA(n_components=2)
            selection = SelectKBest(k=2)
            combined_features = FeatureUnion([("pca", pca),
                                              ("univ_select", selection)])
            X_features = combined_features.fit(X, y).transform(X)
            regr = linear_model.LassoCV()
            pipeline = Pipeline([("features", combined_features),
                                 ("regression", regr)])

            if 'batter' in self.player:
                param_grid = dict(features__pca__n_components=[1, 2, 3],
                                  features__univ_select__k=[1, 2])
            else:
                param_grid = dict(features__pca__n_components=[1, 2, 3],
                                  features__univ_select__k=[1, 2])

            grid_search = GridSearchCV(pipeline,
                                       param_grid=param_grid,
                                       verbose=100)
            grid_search.fit(X, y)
            self.modelled = True
            regr = grid_search
            return regr
        except ValueError as e:
            print(e)
            self.modelled = False
            return None
Example #35
def example():
    import numpy as np
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])

    X = [[np.nan, 2], [6, np.nan], [7, 6]]
    print(imp.transform(X))

    ######################################
    from sklearn.datasets import load_iris
    from sklearn.impute import SimpleImputer, MissingIndicator
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    mask = np.random.randint(0, 2, size=X.shape).astype(bool)  # np.bool is deprecated
    X[mask] = np.nan
    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y,
                                                   test_size=100,
                                                   random_state=0)

    transformer = FeatureUnion(
        transformer_list=[('features', SimpleImputer(
            strategy='mean')), ('indicators', MissingIndicator())])
    transformer = transformer.fit(X_train, y_train)
    results = transformer.transform(X_test)
    print(results.shape)

    clf = make_pipeline(transformer, DecisionTreeClassifier())
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(results.shape)
Example #36
def testLogistic(lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    otto = load_otto()
    X = otto.data[:, :]
    y = otto.target[:]
    # n_components = 20
    # kbest = 4
    #	print 'y.shape =', y.shape

    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)

    combined_features = FeatureUnion([("pca", pca),
                                      ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)

    logistic = LogisticRegression(C=1.0 / lbda)
    pipe = Pipeline(steps=[('features',
                            combined_features), ('logistic', logistic)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    # print trainTarget
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scalar.transform(testData)
    # logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))
    # save the prediction
    prediction = pipe.predict(testData)  # class labels; proba below holds the probabilities
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
Example #37
    def best_estimator(self, X, y):
        try:
            pca = PCA(n_components=2)
            selection = SelectKBest(k=2)
            combined_features = FeatureUnion([("pca", pca),
                                              ("univ_select", selection)])
            X_features = combined_features.fit(X, y).transform(X)
            regr = linear_model.LassoCV()
            pipeline = Pipeline([("features", combined_features),
                                 ("regression", regr)])

            if 'batter' in self.player:
                param_grid = dict(features__pca__n_components=[1],
                                  features__univ_select__k=[1])
            else:
                param_grid = dict(features__pca__n_components=[1, 2, 3, 4],
                                  features__univ_select__k=[1, 2, 3, 4])

            grid_search = GridSearchCV(pipeline,
                                       param_grid=param_grid,
                                       verbose=0)
            grid_search.fit(X, y)
            self.modelled = True
            regr = grid_search
            self.R2 = r2_score(
                self.target_matrix, regr.predict(self.feature_matrix)
            )  #Ian: should do R2 on predicted points vs. points on a given day
            return regr
        except ValueError as e:
            print(e)
            self.modelled = False
            return None
Example #38
    def __init__(self, env, batch, n_components=500):
        self.actions = [0, 1, 2]
        # self.scaler = StandardScaler().fit(batch)
        # self.features_extractor = FeatureUnion([
        #     ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
        #     ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
        #     ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
        #     ("rbf4", RBFSampler(gamma=0.1, n_components=1000))
        #     ])
        # self.features_extractor.fit(self.scaler.transform(batch))
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self.scaler = StandardScaler()
        self.scaler.fit(observation_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
        ])
        self.features_extractor = featurizer.fit(
            self.scaler.transform(observation_examples))

        D = len(
            self.features_extractor.transform(self.scaler.transform(batch))[0])
        self.w = np.array(
            [np.random.randn(D) / np.sqrt(D) for a in self.actions])
        self.e = np.zeros((len(self.actions), D))
Example #39
    def __init__(self, env: TimeLimit):
        observation_examples = np.array(
            [env.observation_space.sample() for _ in range(10000)])
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=500)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=500)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=500)),
            ('rbf4', RBFSampler(gamma=0.5, n_components=500)),
        ])

        featurizer.fit(scaler.transform(observation_examples))
        self.scaler = scaler
        self.featurizer = featurizer
Example #40
	def best_estimator(self, X, y):
		try:
			pca = PCA(n_components=2)
			selection = SelectKBest(k=2)
			combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
			X_features = combined_features.fit(X, y).transform(X)
			regr = linear_model.LassoCV()
			pipeline = Pipeline([("features", combined_features), ("regression", regr)])

			if 'batter' in self.player:
				param_grid = dict(features__pca__n_components=[1],
				                  features__univ_select__k=[1])
			else:
				param_grid = dict(features__pca__n_components=[1,2,3,4],
				                  features__univ_select__k=[1,2,3,4])

			grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
			grid_search.fit(X, y)
			self.modelled = True
			regr = grid_search
			self.R2=r2_score(self.target_matrix,regr.predict(self.feature_matrix)) #Ian: should do R2 on predicted points vs. points on a given day
			return regr
		except ValueError as e:
			print(e)
			self.modelled = False
			return None
Example #41
    def fit(self, **kwargs):
        self.feature_list = kwargs.get('feature_list', None)
        k_single = kwargs.get('k_single', 0)
        k_pca = kwargs.get('k_pca', 1)

        self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

        # normalize the data
        scaler = preprocessing.StandardScaler()
        self.scalar_ = scaler.fit(self.train_x)

        # PCA + univariate feature selection
        selection = SelectKBest(k=k_single)
        n_components = int(len(self.feature_names) * k_pca)
        pca = PCA(n_components=n_components)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        self.pca = combined_features.fit(self.train_x, self.train_y)
        self.pca = PCA(n_components=n_components).fit(self.train_x)  # NOTE: overwrites the fitted union above

        self.model = SVC(kernel=self.kernel)
        fit_data = self.train_x.copy()
        fit_data = self.scalar_.transform(fit_data)
        fit_data = self.pca.transform(fit_data)
        self.model.fit(fit_data, self.train_y)

        # evaluate performance on the training set
        self.train_y_pred = self.predict(self.train_x)
        self.train_y = np.array(self.train_y)
        self.train_y_pred = np.array(self.train_y_pred)
        self.train_ev = self.evaluation.evaluate(y_true=self.train_y,
                                                 y_pred=self.train_y_pred,
                                                 threshold=0.5)

        return self
Example #42
class ImputerIndicatorPrim(primitive):
    def __init__(self, random_state=0):
        super(ImputerIndicatorPrim, self).__init__(name='imputerIndicator')
        self.id = 3
        self.hyperparams = []
        self.type = 'data preprocess'
        self.description = "All features will be imputed using SimpleImputer, in order to enable classifiers to work with this data. Additionally, it adds the indicator variables from MissingIndicator."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.imp = FeatureUnion(transformer_list=[('features', SimpleImputer()), ('indicators', MissingIndicator())])
        self.num_cols = None
        self.imp_cols = None
        self.accept_type = 'b'

    def can_accept(self, data):
        return self.can_accept_b(data)

    def is_needed(self, data):
        # data = handle_data(data)
        if data['X'].isnull().any().any():
            return True
        return False

    def fit(self, data):
        data = handle_data(data)
        self.num_cols = data['X']._get_numeric_data().columns
        self.imp.fit(data['X'][self.num_cols])
        self.imp_cols = data['X'][self.num_cols].columns[data['X'][self.num_cols].isnull().any()].tolist()

    def produce(self, data):
        output = handle_data(data)

        cols = self.num_cols.tolist()
        reg_cols = list(set(cols)-set(self.imp_cols))
        # new_cols = ["{}_imp_mean".format(v) for v in list(imp_cols)]
        for i in range(len(cols)):
            if cols[i] in reg_cols:
                continue
            elif cols[i] in self.imp_cols:
                cols[i] = "{}_imp_mean".format(cols[i])
        result = self.imp.transform(output['X'][self.num_cols])
        # extra_cols = list(range(result.shape[1] - len(cols)))
        extra_cols = ["{}_miss_indicator".format(v) for v in self.imp_cols]
        output['X'] = pd.DataFrame(result, columns=cols + extra_cols).reset_index(drop=True).infer_objects()
        output['X'] = output['X'].loc[:, ~output['X'].columns.duplicated()]  # .ix is deprecated
        final_output = {0: output}
        return final_output
Example #43
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example #44
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example #45
def test_feature_union_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example #46
 def __init__(self,env):
     observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
     scaler = StandardScaler() # mean 0, variance 1
     scaler.fit(observation_examples)
     
     # used to convert a state to a featurized representation
     # we use RBF kernels with different variances
     featurizer = FeatureUnion([
             ("rbf1", RBFSampler(gamma=5.0, n_components=500)),# n_components refer to the number exemplers
             ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
             ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
             ("rbf4", RBFSampler(gamma=0.5, n_components=500)),
             ])
     featurizer.fit(scaler.transform(observation_examples))
     
     self.scaler= scaler
     self.featurizer = featurizer
Example #47
    def __init__(self, env, n_components=500):
        examples = np.array(
            [env.observation_space.sample() for x in range(10000)],
            dtype=np.float64)
        scaler = StandardScaler()
        scaler.fit(examples)
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
        ])

        # example_features = featurizer.fit_transform(scaler.transform(examples))
        featurizer.fit(scaler.transform(examples))
        self.scaler = scaler
        self.featurizer = featurizer
Example #48
 def _fit_features_matrix_target_array(self, X: pd.DataFrame):
     """Get features matrix and target array. TODO -  more description helpful."""
     features = self._get_features_matrix_transformer()
     target = self._get_target_array_transformer()
     feat_tar = FeatureUnion(transformer_list=[("features",
                                                features), ("target",
                                                            target)])
     self.fitted_features_and_target_ = feat_tar.fit(X)
Example #49
    def __init__(self, env):
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        # concatenate features from RBF kernels with different scale (gamma) parameters
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=500)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=500)),
        ])
        featurizer.fit(scaler.transform(observation_examples))

        self.scaler = scaler
        self.featurizer = featurizer
Example #50
def make_tfidf(train, test):
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description',
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=100,
                         **tfidf_para,
                         preprocessor=get_col('description'))),
        # ('text_feat', CountVectorizer(
        #     ngram_range=(1, 2),
        #     # max_features=7000,
        #     preprocessor=get_col('text_feat'))),
        ('title',
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         max_features=70,
                         preprocessor=get_col('title')))
    ])
    vectorizer.fit(train)
    ret_df = vectorizer.transform(train)
    feature_names = vectorizer.get_feature_names()
    return ret_df, feature_names


# vectorizer.fit(df.loc[traindex, :].to_dict('records'))
# ready_df = vectorizer.transform(df.to_dict('records'))
# tfvocab = vectorizer.get_feature_names()
#
#
# # get char count
# length_of_words = len(df["len"])
Example #51
def test_feature_stacker_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example #52
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
Example #53
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
            transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1],
                    10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())
Example #54
def get_pca_transformer(train_x, train_y, n_components=-1):
    if n_components == -1:
        n_components = int(np.ceil(np.sqrt(train_x.shape[1])))

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=n_components // 2)  # k must be an int

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    return combined_features.fit(train_x, train_y)
Example #55
    def test_same_result_weight(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], transformer_weights={"words": 10})
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], transformer_weights={"words": 10})

        loc_union.fit(X)
        dist_union.fit(Z)

        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example #56
def ageClassifier(doc, age):
	""" A function that trains an age classifier """
	xTrain = doc
	yTrain = age

	unionOfFeatures = FeatureUnion([
									('normaltfidf', TfidfVectorizer(preprocessor = identity, tokenizer = identity)),
									('bigrams', TfidfVectorizer(preprocessor = identity, tokenizer = identity, ngram_range = (3,3), analyzer = 'char')),
									('counts', CountVectorizer(preprocessor = identity, tokenizer = identity))
									])

	featureFit = unionOfFeatures.fit(xTrain, yTrain).transform(xTrain)
	classifier = Pipeline([('featureunion', unionOfFeatures), ('cls', svm.SVC(kernel='linear', C=1.5))])
	classifier.fit(xTrain, yTrain)
	
	return classifier
Example #57
def old():
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.plot(pca.explained_variance_ratio_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained variance')
    plt.show()

    # initialize selectKBest to pick the best of the ones that occur naturally
    # the feature union does not check whether there is overlap between the estimators
    # so we need to seriously watch out for this...
    selection = SelectKBest(k=1)

    # build a dict with these for pipeline purposes
    combined_features = FeatureUnion([('pca', pca), ('univ_select', selection)])

    # use the combined features to transform the dataset
    X_features = combined_features.fit(X, y).transform(X)

    # initialize the svm
    svm = SVR(kernel="linear")

    # I think I put the scaler into the first set of the pipeline...

    pipeline = Pipeline([('scaler', scaler), ('features', combined_features), ('svm', svm)])

    param_grid = dict(features__pca__n_components=[2, 5, 10],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    #scoring: precision, accuracy, recall,

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)

    scores = grid_search.grid_scores_


    fig, ax = plt.subplots()

    ax.scatter(y, predicted)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
Example #58
def optimize_clf (clf, dataset, feature_list,params):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    scores_arr = []
    pca = PCA()
    selection = SelectKBest(k = 1)
    combined_features = FeatureUnion([("pca",pca),("univ_select",selection)])
    X_features = combined_features.fit(features,labels).transform(features)
    pipeline = Pipeline([("features", combined_features),("clf",clf)])
    pca_range = range(1,len(feature_list))
    params['features__pca__n_components']=pca_range
    k_range = list(range(1, len(feature_list)))  # list() so we can append below
    k_range.append('all')
    params['features__univ_select__k']=k_range
    grid_search = GridSearchCV(pipeline, param_grid= params, scoring = "f1")
    grid_search.fit(features,labels)
    return grid_search.best_estimator_
Example #59
def testLogistic(otto, lbda=1.0, n_components=20, kbest=4):
	# X = otto.data[:1000, :20]
	# y = otto.target[:1000]
	X = otto.data[:, :]
	y = otto.target[:]
	# n_components = 20
	# kbest = 4
#	print 'y.shape =', y.shape

	scalar = StandardScaler().fit(X)
	X = scalar.transform(X)

	pca = PCA(n_components=n_components)
	selection = SelectKBest(k=kbest)

	combined_features = FeatureUnion(
		[("pca", pca), ('univ_select', selection)]
	)
	X_features = combined_features.fit(X,y).transform(X)

	logistic = LogisticRegression(C=1.0/lbda)
	pipes = [
		Pipeline(steps=[('features', combined_features), ('logistic', logistic)])\
		for i in range(5)
	]
	cv = KFold(n=X.shape[0], n_folds=5, shuffle=True)

	threadList = []
	for i,(trainIndex, testIndex) in enumerate(cv):
		pipe = pipes[i]
		trainData = X[trainIndex]
		trainTarget = y[trainIndex]
		# print trainTarget
		testData = X[testIndex]
		testTarget = y[testIndex]
		# pipe.fit(trainData, trainTarget)
		t = otto_thread(
			name=str(i+1), args=(pipe,), kwargs={'algo':'Logistic', 'train':(trainData, trainTarget), 'test':(testData, testTarget)}
		)
		t.start()
		threadList.append(t)

	for t in threadList:
		t.join()
Example #60
def analysis():
    genotype=pandas.read_excel('test.xlsx','data')
    print(genotype.describe())
    # Author: Andreas Mueller <*****@*****.**>
    #
    # License: BSD 3 clause
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    iris = load_iris()

    X, y = iris.data, iris.target

    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)

    # Build estimator from PCA and Univariate selection:

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(X, y).transform(X)

    svm = SVC(kernel="linear")

    # Do grid search over k, n_components and C:

    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)