def test_add_dummy_feature(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.preprocessing.add_dummy_feature()
    expected = pp.add_dummy_feature(iris.data)

    self.assertTrue(isinstance(result, pdml.ModelFrame))
    self.assert_numpy_array_almost_equal(result.data.values, expected)

    result = df.preprocessing.add_dummy_feature(value=2)
    expected = pp.add_dummy_feature(iris.data, value=2)

    self.assertTrue(isinstance(result, pdml.ModelFrame))
    self.assert_numpy_array_almost_equal(result.data.values, expected)
    self.assert_index_equal(result.columns[1:], df.data.columns)

    s = df['sepal length (cm)']
    self.assertTrue(isinstance(s, pdml.ModelSeries))

    result = s.preprocessing.add_dummy_feature()
    expected = pp.add_dummy_feature(iris.data[:, [0]])

    self.assertTrue(isinstance(result, pdml.ModelFrame))
    self.assert_numpy_array_almost_equal(result.values, expected)
    self.assertEqual(result.columns[1], 'sepal length (cm)')
def percep(X_tr, y_tr, X_te):
    # note: n_iter was replaced by max_iter in newer scikit-learn releases
    clf = Perceptron(n_iter=1000)
    X_tr_aug = add_dummy_feature(X_tr)
    X_te_aug = add_dummy_feature(X_te)
    clf.fit(X_tr_aug, y_tr)
    y_pred = clf.predict(X_te_aug)
    return y_pred
def pinv(X_tr, y_tr, X_te):
    # augment the feature space
    X_tr_aug = add_dummy_feature(X_tr)
    X_te_aug = add_dummy_feature(X_te)
    # flip the sign of positive-class rows so a single linear system can be solved
    X_tr_aug[np.where(y_tr == 1)] = -X_tr_aug[np.where(y_tr == 1)]
    b = np.ones((len(X_tr_aug),))
    # minimum-norm least-squares solution via the Moore-Penrose pseudoinverse
    w = np.dot(np.linalg.pinv(X_tr_aug), b)
    indicator = np.dot(X_te_aug, w)
    for i in range(len(indicator)):
        if indicator[i] > 0:
            indicator[i] = 0
        else:
            indicator[i] = 1
    return indicator
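For reference, the two classifiers above could be exercised side by side roughly as follows. This driver is an illustrative sketch only: the synthetic data from make_classification, the train_test_split call, and the accuracy comparison are assumptions, not part of the original code; it also assumes percep() and pinv() (and their imports) are already defined as shown above.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Hypothetical driver: run both classifiers on a synthetic 0/1-labelled problem.
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

print('perceptron accuracy:    %.3f' % np.mean(percep(X_tr, y_tr, X_te) == y_te))
print('pseudoinverse accuracy: %.3f' % np.mean(pinv(X_tr, y_tr, X_te) == y_te))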
def _augment(self, X):
    # for factorization machines, we add a dummy column for each order.
    if self.fit_lower == 'augment':
        k = 2 if self.fit_linear else 1
        for _ in range(self.degree - k):
            X = add_dummy_feature(X, value=1)
    return X
def __init__(self, history, strength_model=None, content_features=None,
             using_delay=True, using_global_difficulty=True,
             using_item_bias=True, debug_mode_on=False):
    """
    Initialize memory model object

    :param pd.DataFrame history: Interaction log data. Must contain the 'tlast'
        column, in addition to the other columns that belong to the dataframe in
        a lentil.datatools.InteractionHistory object. If strength_model is not
        None, then the history should also contain a column named by the
        strength_model (e.g., 'nreps' or 'deck').

    :param str|None strength_model: Corresponds to a column in the history
        dataframe (e.g., 'nreps' or 'deck'), or simply None if memory strength
        is always 1.

    :param dict[str,np.array]|None content_features: A dictionary mapping item
        names to feature vectors. All items should be accounted for.

    :param bool using_delay: True if the delay term is included in the recall
        probability, False otherwise.

    :param bool using_global_difficulty: True if the global bias term should be
        included in the log-linear difficulty model, False otherwise.

    :param bool using_item_bias: True if the item-specific bias term should be
        included in the log-linear difficulty model, False otherwise.

    :param bool debug_mode_on: True if MAP estimation should log progress and
        plot learned difficulty parameters, False otherwise.
    """
    self.history = history[history['module_type'] == datatools.AssessmentInteraction.MODULETYPE]
    self.strength_model = strength_model
    self.using_delay = using_delay
    self.using_global_difficulty = using_global_difficulty
    self.using_item_bias = using_item_bias
    self.debug_mode_on = debug_mode_on

    self.idx_of_module_id = {x: i for i, x in enumerate(self.history['module_id'].unique())}
    self.difficulty = None

    if content_features is None:
        if self.using_global_difficulty:
            content_features = np.ones((len(self.idx_of_module_id), 1))
    else:
        content_features = np.array([content_features[module_id]
                                     for module_id in self.history['module_id'].unique()])
        content_features = preprocessing.scale(content_features)
        if self.using_global_difficulty:
            content_features = preprocessing.add_dummy_feature(content_features)
    self.content_features = content_features

    if self.content_features is None and not self.using_item_bias:
        raise ValueError('The log-linear difficulty model has not been defined!')
def fit(self, X, y): """Fit model according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : regressor Returns self. """ if self.fit_intercept: X = add_dummy_feature(X) n_samples, n_features = X.shape rs = self._get_random_state() self.outputs_2d_ = len(y.shape) == 2 if self.outputs_2d_: Y = y else: Y = y.reshape(-1, 1) Y = np.asfortranarray(Y) n_vectors = Y.shape[1] ds = get_dataset(X) if not self.warm_start or self.coef_ is None: self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64) self.dual_coef_ = np.zeros((n_vectors, n_samples), dtype=np.float64) for i in xrange(n_vectors): _dual_cd_svr(self, self.coef_[i], self.dual_coef_[i], ds, Y[:, i], self.permute, self.C, self.epsilon, self._get_loss(), self.max_iter, rs, self.tol, self.callback, self.n_calls, verbose=self.verbose) if self.fit_intercept: self.intercept_ = self.coef_[:, 0] self.coef_ = self.coef_[:, 1:] return self
def predict(self, X, add_interactions=False, return_labels=False):
    # Add intercept
    X = add_dummy_feature(X)
    hypothesis = self.compute_hypothesis(X)
    if self.loss_fn == "logistic":
        if return_labels:
            labels = [1 if i > 0.5 else 0 for i in hypothesis]
            return labels
        else:
            return hypothesis
    elif self.loss_fn == "squared":
        return hypothesis
def fit(self, X, y):
    start_time = time.time()
    # Add intercept
    X = add_dummy_feature(X)
    # Store gradients
    self.grad_v = np.zeros((self.k, self.features))
    m = len(X[0])
    n = len(X)
    self.initialize_weights(m)
    for _ in xrange(self.epochs):
        # Kind of ugly and slow but all i care about is learning
        X, y = self.shuffle_data(X, y)
        # Create an iterator
        x_iter = self.batch(X, self.batch_size)
        y_iter = self.batch(y, self.batch_size)
        for X_batch, y_batch in izip(x_iter, y_iter):
            hypothesis = self.compute_hypothesis(X_batch)
            # Compute cost for entire dataset
            # cost = self.compute_cost(X, y)
            # print cost
            gradient = self.compute_gradient(X_batch, y_batch, hypothesis)
            self.update_weights(gradient)
    # Save model params for scikit compatibility
    self.intercept_ = self.w[0]
    self.coeff_ = self.w[1:]
    print "elapsed time in training: %f" % (time.time() - start_time)
def test_add_dummy_feature_csr():
    X = sp.csr_matrix([[1, 0], [0, 1], [0, 1]])
    X = add_dummy_feature(X)
    assert_true(sp.isspmatrix_csr(X), X)
    assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_add_dummy_feature():
    X = [[1, 0], [0, 1], [0, 1]]
    X = add_dummy_feature(X)
    assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def _augment(self, X):
    # for polynomial nets, we add a single dummy column
    if self.fit_lower == 'augment':
        X = add_dummy_feature(X, value=1)
    return X
def test_add_dummy_feature_coo():
    X = sp.coo_matrix([[1, 0], [0, 1], [0, 1]])
    X = add_dummy_feature(X)
    assert_true(sp.isspmatrix_coo(X), X)
    assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing


def decision_boundary(w, c):
    b, w1, w2 = w
    y1 = -(w1 * -4 + b) / w2
    y2 = -(w1 * 4 + b) / w2
    plt.plot([-4, 4], [y1, y2], c)
    # y = w1 * x1 + w2 * x2 + b
    # 0 = w1 * x1 + w2 * x2 + b
    # -(w1 * x1 + b) = w2 * x2
    # -(w1 * x1 + b) / w2 = x2


action = np.loadtxt('Data/action.txt')
action = preprocessing.add_dummy_feature(action)
# print(action[:3])

xx = action[:, :-1]
yy = action[:, -1:]
# print(xx.shape, yy.shape)     # (100, 3) (100, 1)

for _, x1, x2, y in action:
    # print(x1, x2)
    plt.plot(x1, x2, 'ro' if y else 'go')

decision_boundary(gradient_descent(xx, yy), 'r')
# decision_boundary(gradient_stochastic_1(xx, yy), 'g')
# decision_boundary(gradient_stochastic_2(xx, yy), 'b')
# decision_boundary(gradient_minibatch(xx, yy), 'y')
# decision_boundary(gradient_minibatch_random_1(xx, yy), 'k')
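The script above calls gradient_descent(), which is not included in this excerpt. Below is a minimal sketch of what such a helper might look like, assuming plain batch gradient descent on a logistic model over the bias-augmented features; this is a hypothetical reconstruction, not the original implementation, and the learning rate and epoch count are arbitrary.

def gradient_descent(x, y, lr=0.01, epochs=10000):
    # x: (n, 3) with the dummy/bias column first; y: (n, 1) with 0/1 labels.
    w = np.zeros((x.shape[1], 1))
    for _ in range(epochs):
        z = np.dot(x, w)                    # linear predictor
        h = 1 / (1 + np.exp(-z))            # sigmoid
        grad = np.dot(x.T, h - y) / len(x)  # mean gradient of the log loss
        w -= lr * grad
    return w.reshape(-1)                    # (b, w1, w2), as decision_boundary() expects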
def load_train_or_test_data(data_file, args, pp=True, pca_obj=None,
                            srp_obj=None, for_test=False):
    print('loading %s...' % data_file)
    if args.pca or args.sparse_random_projection:
        max_dim = 0
        include_offset = False
    else:
        max_dim = args.max_dimension
        include_offset = args.include_offset
    X, y, pp = load_data(data_file, data_file.split('.')[-1],
                         max_dim=max_dim, preprocess=pp,
                         include_offset=include_offset,
                         target_dim=args.target_dim)
    if args.pca and args.max_dimension > 0:
        print('performing PCA')
        if pca_obj is None:
            pca_comps = args.max_dimension
            if args.include_offset:
                pca_comps -= 1
            pca_obj = PCA(n_components=pca_comps).fit(X)
        X = pca_obj.transform(X)
        if args.include_offset:
            X = preprocessing.add_dummy_feature(X)
    if args.sparse_random_projection:
        print('performing sparse random projection')
        if srp_obj is None:
            if args.max_dimension > 0:
                n_components = args.max_dimension
                print(n_components * 10, X.shape[1])
                dense_output = n_components * 10 < X.shape[1]
            else:
                n_components = 'auto'
                dense_output = True  # not sure if this is a good idea...
            srp_obj = SparseRandomProjection(n_components=n_components,
                                             dense_output=dense_output,
                                             random_state=0).fit(X)
        X = srp_obj.transform(X)
        if args.include_offset:
            X = preprocessing.add_dummy_feature(X)
    if sp.issparse(X) and (X.nnz > np.prod(X.shape) / 3.0 or X.shape[1] <= 20):
        print("X is either low-dimensional or not very sparse, so "
              "converting to a numpy array")
        X = X.toarray()
    # Z = sp.diags(y).dot(X)
    num_features = X.shape[1]
    print('%d total training data points of dimension %d' % X.shape)
    if sp.issparse(X):
        print('density =', float(X.nnz) / np.prod(X.shape))
    # split data further, if necessary
    split_size = 1e4 if for_test else 2e5
    num_splits = max(1, int(X.shape[0] / split_size + .5))
    if num_splits > 1:
        print('num splits =', num_splits)
    Xs = sparse_vsplit(X, num_splits)
    ys = sparse_vsplit(y, num_splits)
    return Xs, ys, pp, pca_obj, srp_obj
if __name__ == '__main__':
    # Check user input
    if len(sys.argv) != 3:
        print('ERROR: USAGE: python <input.csv> <output.csv>')
        sys.exit(0)
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]

    # Read in data
    raw_data = np.genfromtxt(input_file, delimiter=',')
    X_raw = raw_data[:, :-1]
    Y = raw_data[:, [-1]]

    # Scale Xs
    X_scaled = preprocessing.scale(X_raw)

    # Add in the intercept column
    X = preprocessing.add_dummy_feature(X_scaled)

    # Run gradient descent for each (learning rate, number of iterations) pair
    # and write the results to a file
    with open(output_file, 'w') as output:
        for alpha in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 0.3]:
            n_iter = 100
            beta = GradientDescent(X, Y, alpha, n_iter)
            output.write(str(alpha) + ',' + str(n_iter) + ',%.3f' % beta[0] +
                         ',%.3f' % beta[1] + ',%.3f' % beta[2] + '\n')
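GradientDescent() is called above but not shown in this excerpt. A minimal sketch under the assumption that it runs batch gradient descent for linear regression on the already intercept-augmented X; this is hypothetical, not the original helper.

def GradientDescent(X, Y, alpha, n_iter):
    # Ordinary least-squares via batch gradient descent; X already contains the
    # intercept column added by add_dummy_feature above.
    beta = np.zeros((X.shape[1], 1))
    n = len(X)
    for _ in range(n_iter):
        grad = np.dot(X.T, np.dot(X, beta) - Y) / n
        beta -= alpha * grad
    return beta.ravel()  # beta[0] is the intercept, followed by the feature weights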
from sklearn import preprocessing
import numpy as np

# Sometimes we need to add an arbitrary feature value that does not exist in the
# data; sklearn's preprocessing module can be used to create such a dummy feature.

x = [[0, 1], [3, 5]]
x2 = preprocessing.add_dummy_feature(x)
# add_dummy_feature ==> if value is omitted, a 1 is prepended to each row.
print(x2)
'''
[[1. 0. 1.]
 [1. 3. 5.]]
'''

x = [[0, 1, 2], [3, 4, 5]]
x2 = preprocessing.add_dummy_feature(x, 9)
print(x2)
'''
[[9. 0. 1. 2.]
 [9. 3. 4. 5.]]
'''

x3 = preprocessing.add_dummy_feature(x2, 7)
print(x3)
'''
[[7. 9. 0. 1. 2.]
 [7. 9. 3. 4. 5.]]
'''
print("Standardizing data") data = [[0, 0], [0, 0], [1, 1], [1, 1]] print("original data") print(data) scaler = StandardScaler() scaler.fit(data) print("Mean of the data") print(scaler.mean_) print("Standardized data") print(scaler.transform(data)) print(" ") le = preprocessing.LabelEncoder() print("Labels:") print(["paris", "paris", "tokyo", "amsterdam"]) le.fit(["paris", "paris", "tokyo", "amsterdam"]) print("Encodings for \n tokyo,amsterdam,paris::") print(le.transform(["tokyo", "amsterdam", "paris"]) ) print("") print("Adding dummy feature") X = [[0,1],[1,1]] print("Data :") print(X) print("adding dummy feature with value 5") X=add_dummy_feature(X,value=5.0) print(X)
    else:
        return -1


# XTrain, XTest, YTrain, YTest = preprocess("../Databases/ionosphere.data", 34, tranform_categorical)

# Preprocessing
print("READING DataBase....")
data = pd.read_csv("../Databases/ionosphere.data", header=None)  # Reading
target = pd.DataFrame(data[34])       # Y
features = data.drop([34], axis=1)    # X

print("Preprocessing Data")
# transform categorical labels to discrete values
y = target.applymap(tranform_categorical)
y.head()

# Add a column of 1's to X
features_p = add_dummy_feature(features)
x = pd.DataFrame(features_p)

sum_accu = 0
repeat = 100
for k in range(0, repeat):
    # Splitting between training and testing
    try:
        from sklearn.model_selection import train_test_split  # sklearn > ...
    except ImportError:
        from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
    XTrain = X_train.values
    XTest = X_test.values
def predict(self, x, p_threshold=0.5):
    if self.fit_intercept and x.shape[1] < len(self.beta):
        x = add_dummy_feature(x)
    odds_threshold = p_threshold / (1.0 - p_threshold)
    return (np.exp(LogisticRegression.xbeta(x, self.beta)) > odds_threshold).astype(int)
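The LogisticRegression.xbeta helper referenced above is not included in this excerpt. A minimal standalone sketch, assuming it simply computes the linear predictor x @ beta (with the intercept carried by the dummy column); this is a hypothetical reconstruction, not the original method.

import numpy as np

def xbeta(x, beta):
    # Linear predictor (log-odds of the positive class); exp() of this gives the odds
    # compared against odds_threshold in predict() above.
    return np.dot(x, beta)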
def load_data(path, file_type, max_data=0, max_dim=0, preprocess=True,
              include_offset=True):
    """Load data from a variety of file types.

    Parameters
    ----------
    path : string
        Data file path.

    file_type : string
        Supported file types are: 'svmlight', 'npy' (with the labels y in the
        rightmost col), 'npz', 'hdf5' (with datasets 'x' and 'y'), and 'csv'
        (with the labels y in the rightmost col)

    max_data : int
        If positive, maximum number of data points to use. If zero or negative,
        all data is used. Default is 0.

    max_dim : int
        If positive, maximum number of features to use. If zero or negative,
        all features are used. Default is 0.

    preprocess : boolean or Transformer, optional
        Flag indicating whether the data should be preprocessed. For sparse
        data, the features are scaled to [-1, 1]. For dense data, the features
        are scaled to have mean zero and variance one. Default is True.

    include_offset : boolean, optional
        Flag indicating that an offset feature should be added. Default is
        True.

    Returns
    -------
    X : array-like matrix, shape=(n_samples, n_features)

    y : int ndarray, shape=(n_samples,)
        Each entry indicates whether each example is negative (-1 value) or
        positive (+1 value)

    pp_obj : None or Transformer
        Transformer object used on data, or None if ``preprocess=False``
    """
    if not isinstance(path, str):
        raise ValueError("'path' must be a string")

    if file_type in ["svmlight", "svm"]:
        X, y = _load_svmlight_data(path)
    else:
        raise ValueError("unsupported file type, %s" % file_type)

    y_vals = set(y)
    if len(y_vals) != 2:
        raise ValueError('Only expected y to take on two values, but instead '
                         'it takes on the values ' + ', '.join(map(str, y_vals)))
    if 1.0 not in y_vals:
        raise ValueError('y does not take on 1.0 as one of its values, but '
                         'instead takes on the values ' + ', '.join(map(str, y_vals)))
    if -1.0 not in y_vals:
        y_vals.remove(1.0)
        print('converting y values of %s to -1.0' % y_vals.pop())
        y[y != 1.0] = -1.0

    if preprocess is False:
        pp_obj = None
    else:
        if preprocess is True:
            if sp.issparse(X):
                pp_obj = preprocessing.MaxAbsScaler(copy=False)
            else:
                pp_obj = preprocessing.StandardScaler(copy=False)
            pp_obj.fit(X)
        else:
            pp_obj = preprocess
        X = pp_obj.transform(X)

    if include_offset:
        X = preprocessing.add_dummy_feature(X)
        X = np.flip(X, -1)  # move the intercept to the last column of the array

    if sp.issparse(X) and (X.nnz > np.prod(X.shape) / 10 or X.shape[1] <= 20):
        print("X is either low-dimensional or not very sparse, so converting "
              "to a numpy array")
        X = X.toarray()
    if isinstance(max_data, int) and max_data > 0 and max_data < X.shape[0]:
        X = X[:max_data, :]
        y = y[:max_data]
    if isinstance(max_dim, int) and max_dim > 0 and max_dim < X.shape[1]:
        X = X[:, :max_dim]

    return X, y, pp_obj
def decision_function(self, X):
    if self.fit_intercept:
        X = add_dummy_feature(X)
    return safe_sparse_dot(X, self.coef_.T)
def add_dummy_feature():
    x = [[0, 1], [2, 3]]
    print(preprocessing.add_dummy_feature(x))