Example No. 1
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
Example No. 2
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
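The same step-dropping shown in this test also works with stock transformers; a minimal sketch, assuming a recent scikit-learn where the string 'drop' is the supported way to disable a FeatureUnion step (None is deprecated there):

import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.array([[0.0, 1.0], [2.0, 3.0]])
union = FeatureUnion([("scale", StandardScaler()), ("minmax", MinMaxScaler())])
print(union.fit_transform(X).shape)   # (2, 4): both blocks are concatenated

union.set_params(minmax="drop")       # disable the second block
print(union.fit_transform(X).shape)   # (2, 2): only the StandardScaler output remains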
Example No. 3
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
Example No. 4
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
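For reference, transformer_weights simply multiplies each block's output before concatenation; a toy sketch with identity FunctionTransformers (illustrative only, not part of the test above):

import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

X = np.array([[1.0, 2.0]])
fu = FeatureUnion([("a", FunctionTransformer()), ("b", FunctionTransformer())],
                  transformer_weights={"a": 10})
print(fu.fit_transform(X))   # [[10. 20.  1.  2.]] -- block "a" scaled by 10, block "b" untouched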
def train_model(trainset, testset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000, min_df=1, decode_error="ignore")
#   print word_vector
#   print "works fine"
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=1, max_features=2000, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])
    corpus = []
    classes = []
    testclasses = []
    testcorpus = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    for item in testset:
        testcorpus.append(item['text'])
        testclasses.append(item['label'])

#   print "Training instances : ", len(classes)
#   print "Testing instances : ", len(set(classes))

    matrix = vectorizer.fit_transform(corpus)
    # use transform (not fit_transform) so test features align with the training vocabulary
    testmatrix = vectorizer.transform(testcorpus)
#   print "feature count : ", len(vectorizer.get_feature_names())
#   print "training model"
    X = matrix.toarray()
    TX = testmatrix.toarray()
    Ty = numpy.asarray(testclasses)
    y = numpy.asarray(classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9999, test_size=.00001, random_state=0)
    model = LinearSVC(dual=True, loss='l1')
#   model = SVC()
#   model = NuSVC()
#   model = RandomForestClassifier()
    #scores = cross_validation.cross_val_score(model, X, y)
    #print "Accuracy " + str(scores.mean())
#   print y_pred
    y_pred = model.fit(X_train, y_train).predict(TX)
#   y_prob = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
#   print(y_prob)
#   cm = confusion_matrix(y_test, y_pred)
#   cr = classification_report(y_test, y_pred)
#   print cr
#   print(cm)
#   pl.matshow()
#   pl.title('Confusion matrix#')
#   pl.colorbar()
#   pl.ylabel('True label')
#   pl.xlabel('Predicted label')
#   pl.show()
    print accuracy_score(y_pred, Ty)
def make_checkdata(mode="df"):
    
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()
    train_keys = train_gray_data.keys()[:2]
   
    train_inputs = {}
    train_labels = {}
    for i in xrange(len(train_keys)):
        input_ = train_gray_data[train_keys[i]]
        label = labels[train_keys[i]]

        train_inputs.update({train_keys[i]:input_})
        train_labels.update({train_keys[i]:label})
 
    test_keys = test_gray_data.keys()[:2]
    test_inputs = {}
    for i in xrange(len(test_keys)):
        input_ = test_gray_data[test_keys[i]]
        test_inputs.update({test_keys[i]:input_})
        
    train_df = f.make_data_df(train_inputs, train_labels)
    test_df = f.make_test_df(test_inputs) 
    

    if mode == "df":

        train_df = train_df.reset_index()
        test_df = test_df.reset_index()
        
        train_df.columns = ["pngname", "input", "label"]
        test_df.columns = ["pngname", "input"]

        return train_df, train_keys, test_df, test_keys


    elif mode == "feature":

        X_train = fu.fit_transform(train_df)
        X_train = Std.fit_transform(X_train)
        y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
        
        
        
        X_test = fu.fit_transform(test_df)
        X_test = Std.fit_transform(X_test)    
        
        return X_train, y_train, X_test
Example No. 7
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example No. 8
def test_feature_union_parallel():
    # test that n_jobs work for FeatureUnion
    X = JUNK_FOOD_DOCS

    fs = FeatureUnion([("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))])

    fs_parallel = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))], n_jobs=2
    )

    fs_parallel2 = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")), ("chars", CountVectorizer(analyzer="char"))], n_jobs=2
    )

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())
def prediction(train_df, test_df, MODEL):

    print "... start prediction"

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()

    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print "... start y_pred"
    test_X = fu_obj.transform(test_df)

    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
Example No. 10
def train_model(trainset):
	word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2,2), binary = False, max_features= 2000,min_df=1,decode_error="ignore")
#	print word_vector	
	print "works fine"
	char_vector = TfidfVectorizer(ngram_range=(2,3), analyzer="char", binary = False, min_df = 1, max_features = 2000,decode_error= "ignore")
	vectorizer =FeatureUnion([ ("chars", char_vector),("words", word_vector) ])
	corpus = []
	classes = []

	for item in trainset:
		corpus.append(item['text'])
		classes.append(item['label'])

	print "Training instances : ", 0.8*len(classes)
	print "Testing instances : ", 0.2*len(classes) 
	
	matrix = vectorizer.fit_transform(corpus)
	print "feature count : ", len(vectorizer.get_feature_names())
	print "training model"
	X = matrix.toarray()
	y = numpy.asarray(classes)
	model =LinearSVC()
	X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.8,test_size=.2,random_state=0)
	y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
	#y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
	#print y_prob
	#con_matrix = []
	#for row in range(len(y_prob)):
	#	temp = [y_pred[row]]	
	#	for prob in y_prob[row]:
	#		temp.append(prob)
	#	con_matrix.append(temp)
	#for row in con_matrix:
	#	output.write(str(row)+"\n")
	#print y_pred		
	#print y_test
	
	res1=[i for i, j in enumerate(y_pred) if j == 'anonEdited']
	res2=[i for i, j in enumerate(y_test) if j == 'anonEdited']
	reset=[]
	for r in res1:
		if y_test[r] != "anonEdited":
			reset.append(y_test[r])
	for r in res2:
		if y_pred[r] != "anonEdited":
			reset.append(y_pred[r])
	
	
	output=open(sys.argv[2],"w")
	for suspect in reset:
		output.write(str(suspect)+"\n")	
	cm = confusion_matrix(y_test, y_pred)
	print(cm)
	pl.matshow(cm)
	pl.title('Confusion matrix')
	pl.colorbar()
	pl.ylabel('True label')
	pl.xlabel('Predicted label')
	pl.show()
	print accuracy_score(y_pred,y_test)
def convert_testdata(test_gray_data, feature_rule=f.feature_transformer_rule):

    data_df = f.make_test_df(test_gray_data)
    fu = FeatureUnion(transformer_list=feature_rule)
    Std = preprocessing.StandardScaler()

    X_test = fu.fit_transform(data_df)
    #X_test = Std.fit_transform(X_test)

    return X_test
def prediction(train_df, test_df, MODEL):

    print "... start prediction"
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)]
    train_X = fu_obj.fit_transform(train_df)
    train_y = np.log1p(train_df["Sales"]).as_matrix()
    train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    train_dump_df = train_dump_df.dropna(axis=0)
    print train_dump_df.shape
    train_X = train_dump_df[get_split_feature_list(fu_obj)].values
    train_y = train_dump_df["target"].values
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)
    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj))
    print (test_dump_df == 0).sum(axis=0)
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)
    if MODEL == "XGB":
        train_X, valid_X, train_y, valid_y =\
            train_test_split(train_X, train_y, test_size=0.05)
        fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)],
                     "eval_metric": rmspe_xg,
                     "early_stopping_rounds": 100}
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1,
                           fit_params=fit_param)
    else:
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print "... start y_pred"
    y_pred = np.expm1(clf.predict(test_X))
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def set_traindata(df, key):

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    X = fu.fit_transform(df)
    y = np.concatenate(df["label"].apply(lambda x: x.flatten()))

    X = Std.fit_transform(X)

    return (X, y)
def cv_score(train_df, MODEL):
    print "... start cross validation"

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=-1, scoring=rmspe, cv=None)
    print cross_val_score(clf, train_X, train_y, scoring=rmspe, cv=5, n_jobs=3)
def convert_traindata(train_gray_data, labels):

    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    X_train = fu.fit_transform(data_df)
    y_train = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))

    X_train = Std.fit_transform(X_train)

    return X_train, y_train
def get_data():
    '''
    get X, y data

    :rtype: tuple
    '''
    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    X = fu.fit_transform(data_df)
    y = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))

    return (X, y)
Example No. 17
class MuscleClassifier():

	def __init__(self, auto_load=True):
		""" Initializes our MuscleClassifier
			Option to preload it or start from fresh model 
		"""

		#=====[ If auto_load, then we rehydrate our existing models ]=====
		if auto_load:

			self.model = pickle.load(open('modules/pickled/muscle_classifier.p','r'))
			self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p','r'))
			self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p','r'))

		else:

			self.model = BernoulliNB()

	def train(self, muscle_groups, labels):
		""" 
			Vectorizes raw input and trains our classifier 
		"""

		#=====[ Instantiate label encoder to turn text labels into ints ]=====
		self.le = preprocessing.LabelEncoder()

		#=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
		char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3,8), analyzer='char', encoding='utf-8')
		word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1,5), analyzer='word', encoding='utf-8')

		self.vectorizer = FeatureUnion([('char',char_vzr),('word',word_vzr)])

		#=====[ Transform our input and labels ]=====
		X = self.vectorizer.fit_transform(muscle_groups).toarray()
		Y = self.le.fit_transform(labels)

		#=====[ Fit our model and then run inference on training data ]=====
		self.model.fit(X,Y)
		y = self.model.predict(X)

		#=====[ Report Training Accuracy ]=====
		print "Training Accuracy: %f " % (sum(y == Y)/float(len(Y)))

	def predict(self, exercises):
		""" Takes in raw input, vectorizes it, and reports back predicted muscle group """

		X = self.vectorizer.transform(exercises).toarray()
		y = self.model.predict(X)

		return self.le.classes_[y]
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example No. 19
def train_model(trainset):

  # create 2 blocks of features, word and character ngrams, size of 2 (using TF-IDF method)
  # we can also append here multiple other features in general

  word_vector = TfidfVectorizer( analyzer="word" , ngram_range=(2,2), binary = False, max_features= 2000 )
  char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0 , max_features=2000 )

  # our vectors are the feature union of word/char ngrams
  vectorizer = FeatureUnion([  ("chars", char_vector),("words", word_vector)  ] )

  corpus, classes = [], []
    

  for item in trainset:    
    corpus.append( item['text'] )
    classes.append( item['label'] )

  print "num of training instances: ", len(classes)    
  print "num of training classes: ", len(set(classes))

  # fit the model of TF-IDF vectors on the corpus
  matrix = vectorizer.fit_transform(corpus)
 
  print "num of features: " , len(vectorizer.get_feature_names())
  print "training model"
  X = matrix.toarray()
  y = np.asarray(classes)

  print X[0]

  # Here are results of several different models for Law corpus:

  # model  = SVC(kernel='sigmoid') # ->                       0.38
  # model  = KNeighborsClassifier(algorithm = 'kd_tree') # -> 0.41
  # model = AdaBoostClassifier() #->                            0.46
  # model  = RandomForestClassifier() # ->                    0.52
  # model  = LogisticRegression() # ->                        0.65 
  model  = LinearSVC( loss='l1', dual=True) # ->              0.70
  # Results of several different models for Enron corpus:
  # model  = LinearSVC( loss='l1', dual=True) # ->              0.6

  scores = cross_validation.cross_val_score(  estimator = model,
    X = matrix.toarray(), 
        y= np.asarray(classes), cv=10  )

  print "10-fold cross-validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
  def __init__(self, env, n_components=500):
    observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
            ])
    example_features = featurizer.fit_transform(scaler.transform(observation_examples))

    self.dimensions = example_features.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
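As the comment above says, the union just stacks the RBF feature maps side by side, so the output width is the sum of the n_components values; a minimal sketch with toy sizes (not the 500 components used above):

import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.kernel_approximation import RBFSampler

X = np.random.random((5, 4))
featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=1.0, n_components=3)),
        ("rbf2", RBFSampler(gamma=0.5, n_components=3)),
        ])
print(featurizer.fit_transform(X).shape)   # (5, 6): 3 + 3 RBF features per sample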
Example No. 21
 def _build_bow(self, data_train):
     fu_bow = FeatureUnion(
             transformer_list=[
                     ('names', MultiSourceTfidfVectorizer(item_name='names', tokenizer=tokenizer_for_names, lowercase=False)),
                     ('content', MultiSourceTfidfVectorizer(item_name='content', tokenizer=tokenizer, lowercase=False)),
                     # ('title', Pipeline([
                     #         ('selector', ItemSelector(key='title')),
                     #         ('tfidf', TfidfVectorizer(tokenizer=tokenizer, lowercase=False)),
                     #     ])
                     # ),
                 ],
             transformer_weights={
                 'names': 1.0,
                 # 'title': 0.8,
                 'content': 0.6,
                 },
             )
     featured_data = fu_bow.fit_transform(data_train)
     return fu_bow, featured_data
def set_validdata(df, keys):

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    for i in xrange(len(keys)):
        if i == 0:
            valid_df = df[(df["pngname"] == keys[i])]
        else:
            valid_df = pd.concat([valid_df, df[(df["pngname"] == keys[i])]])

    valid_df = valid_df.drop("pngname", axis=1).reset_index()

    X = fu.fit_transform(valid_df)
    y = np.concatenate(valid_df["label"].apply(lambda x: x.flatten()))

    X = Std.fit_transform(X)

    return (X, y)
def validation_model(df, MODEL):

    print "... start validation"

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_df = df[(df["valflag"] != 1)]

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()

    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, cv=None)
    clf.fit(train_X, train_y)
    print clf.grid_scores_
    print clf.best_estimator_
    print clf.best_score_
    print clf.best_params_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)

    val_df = df[(df["valflag"] == 1)]
    test_X = fu_obj.transform(val_df)
    test_y = val_df["Sales"].as_matrix()

    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales_Pred")
    y_sr = pd.Series(test_y, name="Sales")
    res = pd.concat([pred_sr, y_sr], axis=1).rename(index=lambda x: x + 1)
    submissionfile = SUBMISSION + "submission_validation_%s.csv" % MODEL
    res.to_csv(submissionfile)
Example No. 24
def get_features(Xdata, y=None, ncomp=2, kbest=0):
    """Feature selection using PCA or Kbest variance selection"""
    if ncomp > 0 and kbest > 0:
        pca = PCA(n_components=ncomp)
        selection = SelectKBest(f_classif, k=(
            int(kbest) if int(kbest) < Xdata.shape[1] else 'all'))
        combined_features = FeatureUnion(
            [("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit_transform(Xdata, y)

    elif ncomp > 0:
        pca = PCA(n_components=ncomp)
        X_features = pca.fit_transform(Xdata, y)

    elif kbest > 0:
        selection = SelectKBest(k=int(kbest) if int(
            kbest) < Xdata.shape[1] else 'all')
        X_features = selection.fit_transform(Xdata, y)

    return X_features
Example No. 25
    def __init__(self, env, n_components=500):
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
        ])
        example_features = featurizer.fit_transform(
            scaler.transform(observation_examples))

        self.dimensions = example_features.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 26
def preprocessing_data(data):
    num_pipeline = Pipeline([
        ("numetic", DataFrameSelector(["Age", "SibSp", 'Parch', "Fare"])),
        ("imputer", SimpleImputer(strategy='median')),
        # ("std_scaler", StandardScaler()),
    ])
    # num_pipeline.fit_transform(data)
    cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Sex", "Pclass", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False))
    ])
    # cat_pipeline.fit_transform(data)
    preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    x_data = preprocess_pipeline.fit_transform(data)

    return x_data
  def __init__(self, env):
    # observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    # NOTE!! state samples are poor, b/c you get velocities --> infinity
    observation_examples = np.random.random((20000, 4))*2 - 1
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
            ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf4", RBFSampler(gamma=0.1, n_components=1000))
            ])
    feature_examples = featurizer.fit_transform(scaler.transform(observation_examples))

    self.dimensions = feature_examples.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
Example No. 28
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])

    X = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
        "this is a bad post",
        "this is a bad post",
        "i dislike this article",
        "this is not well written",
    ]
    y = np.array([1, 1, 1, 0, 0, 0, 0])

    pipe.fit(X, y)
    assert pipe.predict(X).shape[0] == 7

    preprocess = FeatureUnion([("dense", lang), ("sparse", CountVectorizer())])

    assert preprocess.fit_transform(X).shape[0] == 7
Example No. 29
def test_explain_prediction_feature_union_sparse(newsgroups_train_binary):
    # FeatureUnion with sparse features and text highlighting
    docs, ys, target_names = newsgroups_train_binary
    vec = FeatureUnion([
        ('word', CountVectorizer(stop_words='english')),
        ('char', CountVectorizer(ngram_range=(3, 3))),
    ])
    clf = XGBClassifier(n_estimators=100, max_depth=2, missing=0)
    xs = vec.fit_transform(docs)
    clf.fit(xs, ys)
    res = explain_prediction(clf,
                             'computer graphics in space: a sign of atheism',
                             vec=vec,
                             target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)
    weights = res.targets[0].feature_weights
    pos_features = get_all_features(weights.pos)
    assert 'word__graphics' in pos_features
    assert res.targets[0].weighted_spans
Example No. 30
def pipeline(housing):
    #handing text attributes
    housing_num = housing.drop("ocean_proximity", axis=1)
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', LabelBinarizer()),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)
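Side note on the cat_pipeline above: newer scikit-learn releases reject LabelBinarizer inside a Pipeline because the pipeline calls fit_transform(X, y) with two arguments. A hedged sketch of one common substitution, assuming scikit-learn >= 0.20 where OneHotEncoder accepts string categories directly:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder exposes the usual two-argument fit_transform(X, y=None),
# so it can stand in for LabelBinarizer inside the categorical pipeline.
df = pd.DataFrame({"ocean_proximity": ["INLAND", "NEAR BAY", "INLAND"]})
cat_pipeline = Pipeline([("cat_encoder", OneHotEncoder(sparse=False))])
print(cat_pipeline.fit_transform(df[["ocean_proximity"]]))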
Example No. 31
    def __init__(self, env):
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        ## Standardize the observations so we have mean 0 and variance 1
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        ## Converts the state into a feature representations
        ## We use the rbf kernel with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=500)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=500)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=500)),
            ('rbf4', RBFSampler(gamma=0.5, n_components=500))
        ])
        example_features = featurizer.fit_transform(
            scaler.transform(observation_examples))
        self.dimensions = example_features.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 32
    def __init__(self, env):
        # observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
        # sampling state has issues, velocities --> infinity,
        # so we sample the space randomly instead,
        #  [(-1, 1), (-1, 1),..]
        observation_examples = np.random.random((20000, 4)) * 2 - 1
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf2", RBFSampler(gamma=0.1, n_components=1000)),
            ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf4", RBFSampler(gamma=1.0, n_components=1000))
        ])
        feature_examples = featurizer.fit_transform(scaler.transform(observation_examples))

        self.dimensions = feature_examples.shape[1] # [20000, 4000] -> 4000
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 33
    def __init__(self, env, n_components=500):
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)],
            dtype=np.float64)
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        # concatenate => add columns => return (10000, n_components*4)
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
        ])
        example_features = featurizer.fit_transform(
            scaler.transform(observation_examples))

        self.dimension = example_features.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 34
    def __init__(self, env):

        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=500)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=500))
        ])

        example_features = featurizer.fit_transform(
            scaler.transform(observation_examples))

        self.dimensions = example_features.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
  def __init__(self, env):
    # observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    # NOTE!! state samples are poor, b/c you get velocities --> infinity
    observation_examples = np.random.random((20000, 4))*2 - 1
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
            ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf4", RBFSampler(gamma=0.1, n_components=1000))
            ])
    feature_examples = featurizer.fit_transform(scaler.transform(observation_examples))

    self.dimensions = feature_examples.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
    def __init__(self, n_components=500):
        #lets say [8,-8]
        observation_examples = 8 * (np.random.random((20000, 1)) * 2 - 1)
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
        ])
        example_features = featurizer.fit_transform(
            scaler.transform(observation_examples))

        self.dimensions = example_features.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 37
class Featurizer:
    def __init__(self):
        # To add new features, just add a new pipeline to the feature union
        # The ItemSelector is used to select certain pieces of the input data
        # In this case, we are selecting the plaintext of the input data

        # TODO: Add any new feature transformers or other features to the FeatureUnion
        self.all_features = FeatureUnion([
            ('text_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('text_length', TextLengthTransformer())
            ])),

            ('sentence_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('num_sentence', NumSentenceTransformer())
            ])),

            #Caused minor decrease but could perform well on unseen data
            ('pattern_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('num_patterns', PatternTransformer()),
                ('vect', DictVectorizer())
            ])),

            ('frequency_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer(min_df=50))
            ])),

            ('bigrams_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('bigrams', CountVectorizer(ngram_range=(1,3)))
            ])),
        ])

    def train_feature(self, examples):
        return self.all_features.fit_transform(examples)

    def test_feature(self, examples):
        return self.all_features.transform(examples)
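ItemSelector here is not a scikit-learn class; a minimal sketch of what such a selector typically looks like, assuming the examples are dict-like records keyed by 'text':

from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one field out of dict-like records so downstream vectorizers see plain text."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [item[self.key] for item in X]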
Example No. 38
def preprocess_features(features_to_process, test_set=False):
    # Drop unimportant columns
    features_to_process.drop(['Date', 'Team1_Ball_Possession(%)'], axis=1, inplace=True)

    # Separate categorical columns from numerical columns
    categorical_features_list = [
        'Location', 'Phase', 'Team1', 'Team2', 'Team1_Continent',
        'Team2_Continent', 'Normal_Time'
    ]
    numerical_features = features_to_process.drop(categorical_features_list,
                                                  axis=1,
                                                  inplace=False)
    categorical_features = features_to_process[categorical_features_list].copy(
    )

    # Preprocess features
    numerical_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(numerical_features))),
        ('imputer', Imputer(strategy='median')),
        ('std_scaler', StandardScaler())
    ])

    category_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(categorical_features))),
        ('cat_encoder', cs.OneHotEncoder(drop_invariant=True))
    ])

    if parsed_args.different_values:
        full_pipeline = FeatureUnion(transformer_list=[
            ('num_pipeline', numerical_pipeline),
        ])
    else:
        full_pipeline = FeatureUnion(transformer_list=[(
            'num_pipeline',
            numerical_pipeline), ('cat_pipeline', category_pipeline)])

    prepared_features = pd.DataFrame(
        data=full_pipeline.fit_transform(features_to_process),
        index=np.arange(1, features_to_process.shape[0] + 1))

    return prepared_features
Example No. 39
def get_relevent_questions(input_text):

    questions = Question.objects.all()
    train_data = []
    train_labels = []
    for question in questions:
        train_labels = train_labels + [question.id]
        answers = Answers.objects.filter(question=question.id)
        text = question.content
        for obj in answers:
            text = text + " " + obj.answer
            train_data = train_data + [text]

    Stop = stopwords.words('english')
    train_data = cleaning_text(questions, Stop)

    # Create feature vectors
    ### ---- Put here all features
    dict_vect = HashingVectorizer()
    vectorizer = TfidfVectorizer(min_df=1, max_df=0.9)
    count_vect = CountVectorizer()

    #list_comb = [('tf', vectorizer), ('cnt',count_vect), ('hs',dict_vect) ]
    list_comb = [('tf', vectorizer), ('cnt', count_vect)]
    combined_f = FeatureUnion(list_comb)

    X_train2 = combined_f.fit_transform(train_data)

    # ---------------
    model_svm = svm.SVC(kernel='linear', probability=True)
    model_svm.fit(X_train2, train_labels)
    # -----
    X_test2 = combined_f.transform([input_text])
    proba = model_svm.predict_proba(X_test2)
    predict_proba = np.array(model_svm.classes_)
    proba = np.array(proba)
    inds = proba.argsort()
    predict_proba = predict_proba[inds].tolist()
    #print (classification_report(test_labels, prediction))
    # In[24]:
    return predict_proba[0][:6]
 def __init__(self, env, n_components=100):
     # First we need to sample from the observation_space to tune the RBFs (10k samples)
     observation_examples = np.array(
         [env.observation_space.sample() for x in range(10000)])
     # Define the scaler
     scaler = StandardScaler()
     scaler.fit(observation_examples)
     # Setup RBFsampler to transform the observation space
     featurizer = FeatureUnion([
         ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
         ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
         ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
         ('rbf4', RBFSampler(gamma=0.5, n_components=n_components))
     ])
     # Transform the examples once to record the output dimensionality
     example_features = featurizer.fit_transform(
         scaler.transform(observation_examples))
     # Save all
     self.dimensions = example_features.shape[1]
     self.scaler = scaler
     self.featurizer = featurizer
Example No. 41
class FeatureTransformer:
    def __init__(self, env, n_components=1000):
        observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self._sc = StandardScaler()
        self._sc.fit(observation_examples)

        self._featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
            ('rbf4', RBFSampler(gamma=.5, n_components=n_components)),
        ])
        #do test
        samples = self._featurizer.fit_transform(
            self._sc.transform(observation_examples))
        self.dimensions = samples.shape[1]

    def transform(self, observations):
        scaled_o = self._sc.transform(observations)
        return self._featurizer.transform(scaled_o)
Example No. 42
    def __init__(self, env: TimeLimit):
        observation_examples = np.hstack((np.random.random(
            (20000, 1)) * 4.8 - 2.4, np.random.random(
                (20000, 1)) * 4.0 - 2.0, np.random.random(
                    (20000, 1)) * 0.8 - 0.4, np.random.random(
                        (20000, 1)) * 8.0 - 4.0))
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=0.05, n_components=1000)),
            ('rbf2', RBFSampler(gamma=1.0, n_components=1000)),
            ('rbf3', RBFSampler(gamma=0.5, n_components=1000)),
            ('rbf4', RBFSampler(gamma=0.1, n_components=1000)),
        ])

        examples = featurizer.fit_transform(
            scaler.transform(observation_examples))
        self.scaler = scaler
        self.featurizer = featurizer
        self.dim = examples.shape[1]
Example No. 43
    def __init__(self, env):
        obs_examples = np.random.random((20000, 4))
        print(obs_examples.shape)
        scaler = StandardScaler()
        scaler.fit(obs_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ("cart_position", RBFSampler(gamma=0.02, n_components=500)),
            ("cart_velocity", RBFSampler(gamma=1.0, n_components=500)),
            ("pole_angle", RBFSampler(gamma=0.5, n_components=500)),
            ("pole_velocity", RBFSampler(gamma=0.1, n_components=500))
            ])

        feature_examples = featurizer.fit_transform(scaler.transform(obs_examples))
        print(feature_examples.shape)

        self.dimensions = feature_examples.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
Example No. 44
def refine_data(data=load_housing_data(), train_flag=True):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer, StandardScaler
    from Encoder import DataFrameSelector, CategoricalEncoder
    from sklearn.pipeline import FeatureUnion

    housing = make_sampling_by_median_income(housing=data);
    housing, housing_label = strat_data_by_median_housing(housing=housing, train_flag=train_flag);

    # ocean_proximity is not a numeric feature
    housing_num = housing.drop("ocean_proximity", axis=1);

    num_attribs = list(housing_num);
    cat_attribs = ["ocean_proximity"];

    # Make a pipeline for numerical feature data
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]);

    # Make a pipeline for categorical feature data
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ]);

    # Combine pipelines
    full_pipeline = FeatureUnion(transformer_list=
                                 [("num_pipeline", num_pipeline),
                                  ("cat_pipeline", cat_pipeline),
                                  ]);

    housing_prepared = full_pipeline.fit_transform(housing);
    # print(housing_prepared);
    # print(housing_prepared.shape);

    return housing_prepared, housing_label;
Example No. 45
def prepare_data(data):
    """
            Prepares the data for the machine learning algorithm.
            First we do the feature engineering, then select the relevant features for the
            problem, then impute missing values with the mean, and finally apply feature scaling.
        """
    num_attrs = [
        'BMI',
        'AGE_AT_ADMIT',
        'Gender',
        'Female',
        'PreOpNarcotic',
        'PreOpInsulin',
        'PreOpDMMeds',
        'PreOpBloodThinner',
        'degre_dx',
        'med_cond',
    ]
    cat_attrs = ['RawDx', 'Side']

    #A numerical pipeline for transfoming the numerical features
    num_pipeline = Pipeline([
        ('feature_engineering', PreProcessing()),
        ('selector', DataFrameSelector(num_attrs)),
        ('imputer', Imputer(strategy="mean")),
        ('std_scaler', StandardScaler()),
    ])

    #Categorical pipeline for transforming textual features
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

    #Union both pipelines
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    return full_pipeline.fit_transform(data)
def convert_df_to_X_data(df, categorical_variables, numeric_variables):

    class DataFrame_Col_Selector(BaseEstimator, TransformerMixin):
        def __init__(self, cols):
            self.cols = cols
        def fit(self,x,y=None):
            return self
        def transform(self, df):
            return df[self.cols]

    class DataFrame_Categorical_Converter(BaseEstimator, TransformerMixin):
        def fit(self, x,y=None):
            return self
        def transform(self, df):
            series_list = []
            for c in df.columns:
                temp_series = df[c].astype(str)
                temp_series.name = c + '_str'
                series_list.append(temp_series)
            temp_data = pandas.concat(series_list, axis=1)
            # Convert data to dict to pass to sklearn DictVectorizer
            dict_data = temp_data.to_dict(orient='records')
            return dict_data
        
 
        
    pipeline = FeatureUnion([
            ('categorical_pipeline',Pipeline([
                ('col_selector', DataFrame_Col_Selector(categorical_variables)),
                ('dict_converter', DataFrame_Categorical_Converter()),
                ('dict_vectorizer', DictVectorizer())
            ])),
            ('numeric_pipeline',Pipeline([
                        ('col_selector', DataFrame_Col_Selector(numeric_variables)),
                        ('standard_scaler',StandardScaler(with_mean=True))
                    ]))
        ])


    return pipeline.fit_transform(df), pipeline
def helpfulModelingPipelineLR():
   #load the pickle
   print "Loading pickle..."
   #comments_discussion_df=pd.read_pickle('comments_discussion.p')
   X=pd.read_pickle('X_type.p')

   comments_discussion_df= pd.read_pickle('comments_type_discussion.p')
   #assign the target (session length) to y and convert to int
   y_actual = comments_discussion_df.iloc[:,comments_discussion_df.columns.get_loc("code_numbers"):comments_discussion_df.columns.get_loc("code_numbers")+1].astype(int)

   #scaling the data for feature selection
   X.loc[:,['users_count','comments_count']] = preprocessing.scale(X.loc[:,['users_count','comments_count']])

   X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual, test_size=0.3, random_state=0)
   print y_actual_train.head()

   pca = PCA(n_components=2)

   selection = SelectKBest(k=1)

   # Build estimator from PCA and Univariate selection:
   combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

   # Use combined features to transform dataset:
   X_features = combined_features.fit_transform(X_train, y_actual_train)

   lr = LogisticRegression(C=1e4)

   # Do grid search over k, n_components and C:
   pipeline = Pipeline([("features", combined_features), ("lr", lr)])

   param_grid = dict(features__pca__n_components=[1, 2, 3, 4, 5, 6, 7],
                     features__univ_select__k=[1, 2, 3, 4, 5, 6, 7],
                     lr__C=[0.0001,0.001,0.01, 0.1, 1, 10,100,500,1000,1e4,1e5,1e6])

   grid_search = GridSearchCV(pipeline, param_grid=param_grid,scoring='accuracy', verbose=10)
   grid_search.fit(X_train, y_actual_train['code_numbers'].values) 
   print(grid_search.best_estimator_)
   y_actual.to_pickle('y_actual_type.p')
   pickle.dump(grid_search.best_estimator_, open( "lr_best_estimator.p", "wb" ) )
Example No. 48
def test_feature_union_parallel():
    # test that n_jobs work for FeatureUnion
    X = JUNK_FOOD_DOCS

    fs = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ])

    fs_parallel = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ],
                               n_jobs=2)

    fs_parallel2 = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ],
                                n_jobs=2)

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())
Example No. 49
def preprocess_and_extract(dataset):
    y_dataset = np.log1p(dataset['price'])
    dataset = dataset.drop('price', 1)

    # Slight preprocessing
    dataset = preprocess(dataset)

    # Feature extraction.
    default_preprocessor = CountVectorizer().build_preprocessor()
    def build_preprocessor(field):
        field_idx = list(dataset.columns).index(field)
        # This string casting here is a bad idea, but for the time being produces a working thing.
        # Vectorizers need to be thought through so test set can be used with the train fitted vectorizer.
        return lambda x: default_preprocessor(str(x[field_idx]))

    vectorizer = FeatureUnion([
        ('name', CountVectorizer(
            ngram_range=(1, 2),
            max_features=50000,
            preprocessor=build_preprocessor('name'))),
        ('category_name', CountVectorizer(
            token_pattern='.+',
            preprocessor=build_preprocessor('category_name'))),
        ('brand_name', CountVectorizer(
            token_pattern='.+',
            preprocessor=build_preprocessor('brand_name'))),
        ('shipping', CountVectorizer(
            token_pattern=r'\d+',
            preprocessor=build_preprocessor('shipping'))),
        ('item_condition_id', CountVectorizer(
            token_pattern=r'\d+',
            preprocessor=build_preprocessor('item_condition_id'))),
        ('item_description', TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=100000,
            preprocessor=build_preprocessor('item_description'))),
    ])
    X_dataset = vectorizer.fit_transform(dataset.values)

    return X_dataset, y_dataset, vectorizer
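The comment inside `build_preprocessor` points at the open question of reusing the train-fitted vectorizer; a minimal sketch of that pattern, assuming separate `train_df` and `test_df` frames with the same column layout (the names here are assumptions, not part of the original):

# sketch only: fit on the training frame, reuse the fitted vectorizer on test data
X_train, y_train, vectorizer = preprocess_and_extract(train_df)

test_prepared = preprocess(test_df)            # same preprocessing, no refit
X_test = vectorizer.transform(test_prepared.values)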
Exemplo n.º 50
0
def test_models(X_train, y_train, X_test, y_test, models):

    trained_models = {}

    vectorizer = FeatureUnion([('tfidf_vect', TfidfVectorizer())])

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    for key in models:
        model_name = key
        model = models[key]
        t1 = time.time()
        model.fit(X_train, y_train)
        t2 = time.time()
        predicted_y = model.predict(X_test)
        t3 = time.time()

        calculate_accuracy(y_test, predicted_y, model_name, t2 - t1, t3 - t2)
        trained_models[model_name] = model

    return (trained_models, vectorizer)
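A possible way to call this helper; the model dictionary and the raw-text splits below are assumptions, not part of the original:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# sketch only: hypothetical inputs for test_models
models = {
    'logistic_regression': LogisticRegression(max_iter=1000),
    'multinomial_nb': MultinomialNB(),
}
trained_models, fitted_vectorizer = test_models(
    train_texts, train_labels, test_texts, test_labels, models)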
Exemplo n.º 51
0
    def __init__(self, env, n_components=500):
        observation_examples = [
            env.observation_space.sample() for _ in range(10000)
        ]
        scaler = StandardScaler()
        scaler.fit(observation_examples)

        # Used to convert a state to featurized representation
        # We use RBF kernels with different variances to cover different parts
        # of the space
        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
            ('rbf4', RBFSampler(gamma=0.5, n_components=n_components))
        ])
        featurized = featurizer.fit_transform(
            scaler.transform(observation_examples))

        self.scaler = scaler
        self.featurizer = featurizer
        self.dimension = featurized.shape[1]
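The snippet stores the fitted scaler and featurizer but stops before any transform method; a minimal sketch of the method such a class usually pairs with (assumed, not shown in the source):

    def transform(self, observations):
        # scale raw observations, then map them through the RBF feature union
        scaled = self.scaler.transform(observations)
        return self.featurizer.transform(scaled)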
Exemplo n.º 52
0
def my_transform(data, label, degree, FEATURES=FEATURES):

    # LABEL = "Qw"
    LABEL = label
    PolynomialDegree = degree

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline.fit_transform(data)
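Note that `my_transform` refits the pipeline on every call and returns only the transformed array, so test data cannot be transformed with the training statistics; a hedged usage sketch (the DataFrame name and degree are assumptions, "Qw" is taken from the commented-out default above):

# sketch only: hypothetical call on a training DataFrame
train_prepared = my_transform(train_df, label="Qw", degree=2)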
Exemplo n.º 53
0
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word)])
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word)])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(loc_union.get_feature_names(),
                     dist_union.get_feature_names())
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([("chars", loc_char),
                                      ("words", loc_word)],
                                     n_jobs=2)
        dist_union_par = SparkFeatureUnion([("chars", dist_char),
                                            ("words", dist_word)],
                                           n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Exemplo n.º 54
0
def dump_train():
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    train_df = f.make_data_df(train_gray_data, labels)
    test_df = f.make_test_df(test_gray_data)

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    feature_name_list = [s.split("__")[1] for s in fu.get_feature_names()]
    feature_name_list.append("target")
    train_X = fu.fit_transform(train_df)
    train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
    train_X, train_y = cl.downsampling_data(train_X, train_y, 0.2)
    train_dump = pd.DataFrame(np.c_[train_X, train_y], columns=feature_name_list)
    dump_path = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/train_dump"
    train_dump.to_csv(dump_path + "/train_dump.csv", index=False)
Exemplo n.º 55
0
    def __init__(self, env):
        '''
        We do not sample from the observation space; instead we sample from a
        uniform distribution, because the state space extends to infinity.
        '''
        observation_examples = np.random.random((20000, 4)) * 2 - 2
        scalar = StandardScaler()
        scalar.fit(observation_examples)

        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=0.05, n_components=1000)),
            ('rbf2', RBFSampler(gamma=1.0, n_components=1000)),
            ('rbf3', RBFSampler(gamma=0.5, n_components=1000)),
            ('rbf4', RBFSampler(gamma=0.1, n_components=1000))
        ])
        feature_example = featurizer.fit_transform(
            scalar.transform(observation_examples))

        self.dimension = feature_example.shape[1]
        self.scalar = scalar
        self.featurizer = featurizer
Exemplo n.º 56
0
class Featurizer:
    def __init__(self):
        # To add new features, just add a new pipeline to the feature union
        # The ItemSelector is used to select certain pieces of the input data
        # In this case, we are selecting the plaintext of the input data

        # TODO: Add any new feature transformers or other features to the FeatureUnion
        self.all_features = FeatureUnion([
            ('text_stats',
             Pipeline([('selector', ItemSelector(key='text')),
                       ('text_length', TextLengthTransformer())])),
            ('ngrams',
             Pipeline([('selector', ItemSelector(key='text')),
                       ('n_grams', NGrams())])),
            ('tfidf',
             Pipeline([('selector', ItemSelector(key='text')),
                       ('tfidf', Tfidf())])),
            ('suffix',
             Pipeline([
                 ('selector', ItemSelector(key='text')),
                 ('suffix', Suffixes()),
             ])),
            ('pos_words',
             Pipeline([
                 ('selector', ItemSelector(key='text')),
                 ('positive_words', pos_words()),
             ])),
            ('neg_words',
             Pipeline([
                 ('selector', ItemSelector(key='text')),
                 ('negative_words', neg_words()),
             ])),
        ])

    def train_feature(self, examples):
        return self.all_features.fit_transform(examples)

    def test_feature(self, examples):
        return self.all_features.transform(examples)
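A hedged usage sketch for this class; the example data, labels, and the downstream classifier are assumptions, and `ItemSelector(key='text')` implies the inputs expose a 'text' field:

from sklearn.linear_model import SGDClassifier

# sketch only: fit the feature union on training examples, reuse it on test examples
featurizer = Featurizer()
X_train = featurizer.train_feature(train_examples)
X_test = featurizer.test_feature(test_examples)

clf = SGDClassifier()
clf.fit(X_train, train_labels)
print(clf.score(X_test, test_labels))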
Exemplo n.º 57
0
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])

    pipeline = FeatureUnion([
        (
            "1",
            make_pipeline(
                FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]),
                # other transformations
            )),
        (
            "2",
            make_pipeline(
                FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]),
                # other transformations
            ))
    ])

    X = pipeline.fit_transform(data)
    # FeatureUnion stacks the transformer outputs into a NumPy array in the
    # order the transformers were listed, so index the columns positionally.
    print(X[:, 0].mean())  # sepal length (cm)
    print(X[:, 1].mean())  # sepal width (cm)
Exemplo n.º 58
0
class FeatureTransformer:
    def __init__(self):
        # normally we would use env.observation_space.sample(), but that only
        # makes sense when the sampled states are representative
        observation_samples = np.random.random((20000, 4)) * 2 - 1
        self.sc = StandardScaler()
        self.sc.fit(observation_samples)

        self.feature_union = FeatureUnion([
            ("rbf0", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf1", RBFSampler(gamma=0.1, n_components=1000)),
            ("rbf2", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf3", RBFSampler(gamma=1, n_components=1000))
        ])
        feature_examples = self.feature_union.fit_transform(
            self.sc.transform(observation_samples))

        self.dimensions = feature_examples.shape[1]
        print("feature example: ", feature_examples.shape)

    def transform(self, observations):
        X = self.sc.transform(observations)
        return self.feature_union.transform(X)
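A hedged sketch of how such a featurizer is typically wired into a linear value model; the placeholder state and the zero target below are assumptions, not part of the original:

import numpy as np
from sklearn.linear_model import SGDRegressor

# sketch only: one linear model on top of the 4 x 1000 RBF features
ft = FeatureTransformer()
model = SGDRegressor(learning_rate='constant', eta0=0.01)

state = np.random.random(4) * 2 - 1      # placeholder for an environment observation
features = ft.transform([state])         # shape (1, 4000)
model.partial_fit(features, [0.0])       # assumed bootstrap target
print(model.predict(features))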
Exemplo n.º 59
0
def dataprepare(data):
    cat_attribs = ["ocean_proximity"]
    num_attribs = list(data.drop(cat_attribs, axis=1))

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder()),
    ])

    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

    return full_pipeline.fit_transform(data)
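Because `dataprepare` fits and transforms in a single call, the imputation, scaling, and one-hot statistics come from whichever frame is passed in; a hedged sketch of the split-aware variant, reusing the two sub-pipelines defined above (the `train_set`/`test_set` names are assumptions):

# sketch only: fit the full pipeline on the training frame, reuse it on the test frame
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
train_prepared = full_pipeline.fit_transform(train_set)
test_prepared = full_pipeline.transform(test_set)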
Exemplo n.º 60
0
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('desc_len', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('desc_len'))),
    ('name_len', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('name_len'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=100000,
        preprocessor=build_preprocessor('item_description'))),
])

X = vectorizer.fit_transform(full_df.values)

X_train = X[:n_trains]
Y_train = train_df.target.values.reshape(-1, 1)

X_dev = X[n_trains:n_trains+n_devs]
Y_dev = dev_df.target.values.reshape(-1, 1)

X_test = X[n_trains+n_devs:]
print(X.shape, X_train.shape, X_dev.shape, X_test.shape)

print("Fitting Ridge model on training examples...")
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=1.0,
    max_iter=100, normalize=False, tol=0.05, random_state=1,
)