def transform(self, X):
    #print 'getting metadata features'
    features_to_use = ["requester_account_age_in_days_at_request",
                       "requester_days_since_first_post_on_raop_at_request",
                       "requester_number_of_comments_at_request",
                       "requester_number_of_comments_in_raop_at_request",
                       "requester_number_of_posts_at_request",
                       "requester_number_of_posts_on_raop_at_request",
                       "requester_number_of_subreddits_at_request",
                       "requester_upvotes_minus_downvotes_at_request",
                       "requester_upvotes_plus_downvotes_at_request",
                       ]
    utc_difference = (X["unix_timestamp_of_request_utc"] - X["unix_timestamp_of_request"]).as_matrix()
    length_of_post = [len(post) for post in X['request_text_edit_aware']]
    length_of_title = [len(title) for title in X['request_title']]
    timestamps = X["unix_timestamp_of_request"]
    date_times = [datetime.fromtimestamp(ts) for ts in timestamps]
    year = np.array([dt.year for dt in date_times])
    month = np.array([dt.month for dt in date_times])
    enc = OneHotEncoder()
    weekday = np.array([[dt.isocalendar()[2]] for dt in date_times])
    weekday = enc.fit_transform(weekday).toarray()
    hours = np.array([[dt.hour] for dt in date_times])
    hours = enc.fit_transform(hours).toarray()
    # Note: `hours` is computed but not included in the returned feature matrix.
    return np.c_[X[features_to_use].as_matrix(), utc_difference, length_of_title,
                 length_of_post, year, month, weekday]
def transformTestData(self, train_data, test_data):
    #Select the right features for both training and testing data
    X_train, y_train = self.__selectRelevantFeatures(train_data)
    X_test, y_test = self.__selectRelevantFeatures(test_data)
    #Transform categorical variables into integer labels
    martial_le = LabelEncoder()
    occupation_le = LabelEncoder()
    relationship_le = LabelEncoder()
    race_le = LabelEncoder()
    sex_le = LabelEncoder()
    transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]
    for i in range(len(transformers)):
        X_train[:, i] = transformers[i].fit_transform(X_train[:, i])
        X_test[:, i] = transformers[i].transform(X_test[:, i])
    #Dummy code categorical variables
    dummy_code = OneHotEncoder(categorical_features=range(5))
    X_train = dummy_code.fit_transform(X_train).toarray()
    X_test = dummy_code.transform(X_test).toarray()
    #Normalize all features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    #Encode y
    class_le = LabelEncoder()
    y_train = class_le.fit_transform(y_train)
    y_test = class_le.transform(y_test)
    #print class_le.transform(["<=50K", ">50K"])
    return X_train, X_test, y_train, y_test
def load_data():
    # Read file content
    training_file_content = pd.read_csv(TRAINING_FILE_PATH)
    testing_file_content = pd.read_csv(TESTING_FILE_PATH)
    combined_file_content = pd.concat([training_file_content, testing_file_content])

    # Manipulate file content
    X = combined_file_content.drop([ID_COLUMN_NAME, LABEL_COLUMN_NAME], axis=1).as_matrix()
    categorical_features_mask_list = []
    for column_vector in X.T:
        valid_elements_mask = np.logical_not(pd.isnull(column_vector))
        if np.can_cast(type(column_vector[valid_elements_mask][0]), np.float):
            categorical_features_mask_list.append(False)
            min_value = np.min(column_vector[valid_elements_mask])
            column_vector[np.logical_not(valid_elements_mask)] = min_value - 1
        else:
            categorical_features_mask_list.append(True)
            column_vector[np.logical_not(valid_elements_mask)] = "Missing"
            column_vector[:] = perform_categorization(column_vector)
    encoder = OneHotEncoder(categorical_features=categorical_features_mask_list)
    X = encoder.fit_transform(X).toarray()

    # Separate the data set
    Y = combined_file_content[LABEL_COLUMN_NAME].as_matrix()
    ID = combined_file_content[ID_COLUMN_NAME].as_matrix()
    test_data_mask = pd.isnull(Y)
    X_train = X[np.logical_not(test_data_mask)]
    Y_train = Y[np.logical_not(test_data_mask)]
    X_test = X[test_data_mask]
    ID_test = ID[test_data_mask]
    return X_train, Y_train, X_test, ID_test
def get_toy_classification_data(n_samples=100, centers=3, n_features=2, type_data="blobs"):
    # generate 2d classification dataset
    if (type_data == "blobs"):
        X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=n_features)
    elif (type_data == "moons"):
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif (type_data == "circles"):
        X, y = make_circles(n_samples=n_samples, noise=0.05)
    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
    # colors = {0:'red', 1:'blue', 2:'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=None)

    classes = np.unique(y_train)
    if (0):
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)
    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)
    return X_train, y_train, X_test, y_test, classes
def get_coded_data(cases_df, case_ids, coded_feature_names):
    """
    Retrieves the valences corresponding to case_ids, along with coded features, if any.
    Recodes unknown valences to neutral.
    args:
        cases_df: A dataframe containing the case variables.
        case_ids: list of sorted case_ids
        coded_feature_names: list of column names to pull from cases_df (ie 'geniss' or ['geniss','casetyp1'])
    returns:
        valences: np array of valences
        coded_feature_array: np array of coded features
        filtered_cases_df: Dataframe containing the sorted, filtered case variables
    """
    UNKNOWN_VALENCE = 0
    NEUTRAL_VALENCE = 2
    if isinstance(coded_feature_names, str):
        coded_feature_names = [coded_feature_names]
    print "coded_feature_names: ", coded_feature_names
    valences = []
    coded_feature_list = []
    for case_id in case_ids:
        valence = cases_df[cases_df['caseid'] == case_id]['direct1'].values[0]
        if np.isnan(valence) == False:
            valence = int(valence)
        else:
            valence = 2
        if coded_feature_names is not None:
            coded_feature_row = cases_df[cases_df['caseid'] == case_id][coded_feature_names].values[0]
            #clean row
            clean_row = []
            for val in coded_feature_row:
                if val and np.isnan(val) == False:
                    clean_row.append(int(val))
                else:
                    clean_row.append(0)
            assert clean_row[0] >= 0, ""
            coded_feature_list.append(clean_row)
        # Replacing unknown valence variables with neutral scores.
        if valence == UNKNOWN_VALENCE:
            valence = NEUTRAL_VALENCE
        valences.append(valence)
    #one-hot encoding
    if coded_feature_names is not None:
        enc = OneHotEncoder()
        coded_feature_array = enc.fit_transform(np.array(coded_feature_list))
        print "Coded Feature Array shape: ", coded_feature_array.shape
    else:
        coded_feature_array = np.array([])
    #Filter case df
    filtered_case_df = filter_cases_df(cases_df, case_ids)
    return np.array(valences), coded_feature_array, filtered_case_df
def process(discrete, cont):
    # Create discrete and continuous data matrices
    discrete_X = np.array(discrete)
    cont_X = np.array(cont)

    # Impute discrete values
    imp = Imputer(strategy='most_frequent')
    discrete_X = imp.fit_transform(discrete_X)

    # Impute continuous values
    imp_c = Imputer(strategy='mean')
    cont_X = imp_c.fit_transform(cont_X)

    # Discrete basis representation
    enc = OneHotEncoder()
    enc.fit(discrete_X)
    discrete_X = enc.transform(discrete_X).toarray()

    # Continuous scaling
    scaler = StandardScaler()
    scaler.fit(cont_X)
    cont_X = scaler.transform(cont_X)

    # Merge to one array
    X = np.concatenate((discrete_X, cont_X), axis=1)
    return X
def load_bees():
    '''
    helper function to load our data
    '''
    train_fp = "/home/ubuntu/bee_images/train"
    labels = "/home/ubuntu/bee_images"
    train_labels = pd.read_csv(labels + '/' + "train_labels.csv")
    train_labels.set_index('id', inplace=True)
    bee_images = os.listdir(train_fp)
    bee_images = filter(lambda f: f[-3:] == 'jpg', bee_images)
    bee_images = filter(lambda f: f != '1974.jpg', bee_images)
    bees = []
    for i in bee_images:
        im = imread(train_fp + "/" + i, as_grey=False)
        im = resize(im, (48, 48))
        bees.append(im)
    # divide bees by 255 to give it a 0 - 1 scale
    # (255 is the current max val and zero is the min)
    bees = np.array(bees) / 255.0
    Y = train_labels.ix[[int(x.split('.')[0]) for x in bee_images]].values
    onehot = OneHotEncoder(sparse=False, n_values=2)
    Y = onehot.fit_transform(Y)
    bees, Y = gen_data(bees, Y)
    return balance(bees, Y)
def test_one_hot_encoder_not_fitted():
    X = np.array([['a'], ['b']])
    enc = OneHotEncoder(categories=['a', 'b'])
    msg = ("This OneHotEncoder instance is not fitted yet. "
           "Call 'fit' with appropriate arguments before using this method.")
    with pytest.raises(NotFittedError, match=msg):
        enc.transform(X)
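# A hypothetical companion check (not part of the original test suite) showing
# the fitted path for contrast: once fit has run, transform succeeds and the
# learned categories are exposed. Assumes a scikit-learn version where
# `categories_` holds one array per input column.
def test_one_hot_encoder_fitted_sketch():
    X = np.array([['a'], ['b']])
    enc = OneHotEncoder(categories=[['a', 'b']])
    enc.fit(X)
    assert list(enc.categories_[0]) == ['a', 'b']
    assert enc.transform(X).toarray().tolist() == [[1.0, 0.0], [0.0, 1.0]]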
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100, learning_rate=0.1, max_depth=5):
    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=learning_rate, max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)
    """
    #Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)
    """
    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)
    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)
    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))
    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()
    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
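# A minimal self-contained sketch of the same GBM-leaves-to-one-hot trick on
# toy data (an illustration, not part of the original module): every sample is
# mapped to the id of the leaf it reaches in each tree, and those leaf ids are
# then one-hot encoded into sparse indicator features.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X_toy, y_toy = make_classification(n_samples=200, n_features=10, random_state=0)
gbm = GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=0).fit(X_toy, y_toy)
leaves = gbm.apply(X_toy).reshape(X_toy.shape[0], -1)   # (n_samples, n_trees) leaf ids
X_leaf = OneHotEncoder(handle_unknown='ignore').fit_transform(leaves)
print(X_leaf.shape)   # one sparse indicator column per (tree, leaf) pair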
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode all boolean/string (categorical) features,
    and shift/scale integer/float features
    '''
    # X needs to contain only non-negative integers
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1
    # Shift/scale integer and float features to have mean=0, std=1
    ifs = data['ifeatures']
    ffs = data['ffeatures']
    x2 = np.hstack((ifs, ffs))
    if scaler is None:
        scaler = StandardScaler()
        x2 = scaler.fit_transform(x2)
        print "Training features have mean: %s" % scaler.mean_
        print "and standard deviation: %s" % scaler.std_
    else:
        x2 = scaler.transform(x2, copy=False)
    # one-hot encode categorical features
    X = np.hstack((bfs, sfs, x2))
    categorical = np.arange(bfs.shape[1] + sfs.shape[1])
    if enc is None:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print "One-hot encoded features have dimension %d" % X.shape[1]
    else:
        X = enc.transform(X)
    return X, enc, scaler
def modelselect(input_filename, num_test_examples, block_size, n_estimators=100):
    # Perform some model selection to determine good parameters
    # Load data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_train = encoder.transform(forest.apply(X_train))

    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.001,
        average=10 ** 4,
        eta0=0.5,
        class_weight="balanced",
    )
    metric = "f1"
    losses = ["log", "hinge", "modified_huber", "squared_hinge", "perceptron"]
    penalties = ["l2", "l1", "elasticnet"]
    alphas = 10.0 ** numpy.arange(-5, 0)
    learning_rates = ["constant", "optimal", "invscaling"]
    param_grid = [{"alpha": alphas, "loss": losses, "penalty": penalties, "learning_rate": learning_rates}]
    grid_search = GridSearchCV(learner, param_grid, n_jobs=-1, verbose=2, scoring=metric, refit=True)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, grid_search.best_score_)
    return grid_search
class CategoricalExpansion(BaseEstimator, TransformerMixin):
    """
    Uses one hot encoder to expand categorical columns
    Don't use this in a pipeline

    Arguments:
    =========
    threshold: int
        The maximum number of unique values that a column can have
        for it to be considered categorical

    Returns:
    ========
    Sparse matrix of expanded column.
    """
    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        uniques = [(len(x.unique()), x.dtype.kind) for n, x in X.iteritems()]
        self.mask_ = [(x[0] < self.threshold and x[1] == 'i') for x in uniques]
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.mask_])
        return self

    def transform(self, X):
        return self.encoder_.transform(X.loc[:, self.mask_])
class ExpandCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, columns, append=False, only_new=False):
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.append = append
        self.only_new = only_new

    def fit(self, X=None, y=None):
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.columns])
        # Expand the column names
        new_colnames = []
        for i, c in enumerate(self.columns):
            this_map = self.encoder_.active_features_[
                self.encoder_.feature_indices_[i]:self.encoder_.feature_indices_[i + 1]]
            for n in this_map:
                new_colnames.append("{}_{}".format(c, str(n)))
        self.new_colnames_ = new_colnames
        return self

    def transform(self, X):
        new_data = pd.DataFrame(self.encoder_.transform(X.loc[:, self.columns]).toarray(),
                                index=X.index, columns=self.new_colnames_)
        assert new_data.shape[0] == X.shape[0], "Row lengths do not match"
        if self.only_new:
            return new_data
        res = X.copy()
        if not self.append:
            # Remove the unexpanded columns from the data frame
            for c in self.columns:
                res.drop(c, 1, inplace=True)
        return res.join(new_data)
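# Note: `active_features_` and `feature_indices_` belong to the legacy
# integer-only OneHotEncoder and were removed from later scikit-learn
# releases. A rough modern equivalent of the column-naming step above
# (a sketch, assuming a release that exposes `categories_`) reads the
# learned categories directly:
def expanded_column_names(fitted_encoder, columns):
    # One name per (source column, learned category) pair, in encoder order.
    return ["{}_{}".format(c, v)
            for c, cats in zip(columns, fitted_encoder.categories_)
            for v in cats]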
def apply_onehot(self, columns=[]):
    enc = OneHotEncoder()
    enc.fit(self.M[:, columns])
    R = enc.transform(self.M[:, columns]).toarray()
    self.M = np.c_[self.M[:, [x for x in range(self.M.shape[1]) if x not in columns]], R]
    self.class_index -= len([c for c in columns if c < self.class_index])
    return self
def cost(all_thetas, weights, X, y, lamb):
    thetas = unpack_thetas(all_thetas, weights)
    # add column of 1's
    X = X / 255
    a1 = np.insert(X, 0, 1, 1)
    # create a binary index matrix of y data and initialize activation layers
    encoder = OneHotEncoder(sparse=False)
    y_matrix = encoder.fit_transform(y.T)
    act_layers = activation_layers(a1, thetas)
    # cost function created in separate parts
    first = np.multiply(-y_matrix, np.log(act_layers[-1]))
    second = np.multiply(1 - y_matrix, np.log(1 - act_layers[-1]))
    # regularization
    reg_1 = lamb / (2 * len(X))
    reg_2 = 0
    for i in range(len(thetas)):
        reg_2 += np.power(thetas[i][..., 1:], 2).sum()
    J = 1 / len(X) * (first - second).sum() + (reg_1 * reg_2)
    print('Current Cost')
    print(J)
    print('*' * 20)
    return J
def prepare_items_features(user_items_csv, out_dir):
    array = np.loadtxt(user_items_csv, delimiter='|', dtype=np.dtype(np.uint64))
    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)
    columns = np.array([col1, col2, col3, col4]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)
    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1] - 1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            current = np.zeros(encoded.shape[1] - 1)
        else:
            current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))
    array = np.array(rows)
    print(array.shape)
    # let's serialize array
    np.save(os.path.join(out_dir, "user_items"), array)
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:
        # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1])
                  if 'str' in str(type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
def one_hot_encode(train_discrete_features, test_discrete_features):
    """
    Perform one hot encoding on both train and test set. Use this when
    memory is limited; otherwise use scikit-learn's OneHotEncoder directly.

    parameters:
    --------------------------------------------------------
    train_discrete_features: discrete features of training data
    test_discrete_features: discrete features of test data
    """
    m, n = train_discrete_features.shape
    train_encoded_features = lil_matrix((LENGTH_OF_TRAIN, MAX_OF_DIM))
    test_encoded_features = lil_matrix((LENGTH_OF_TEST, MAX_OF_DIM))
    cnt = 0
    for i in range(n):
        print "processing " + str(i) + "th feature..."
        train_column = train_discrete_features[:, i]
        test_column = test_discrete_features[:, i]
        # one hot encode the value in train and test
        encoder = OneHotEncoder(handle_unknown="ignore")
        train_encoded_column = lil_matrix(encoder.fit_transform(np.mat(train_column).T))
        test_encoded_column = lil_matrix(encoder.transform(np.mat(test_column).T))
        # get number of features
        _, num = train_encoded_column.shape
        # put the column into matrix
        for j in range(num):
            train_encoded_features[:, cnt + j] = train_encoded_column[:, j]
            test_encoded_features[:, cnt + j] = test_encoded_column[:, j]
        cnt += num
    return csr_matrix(train_encoded_features[:, 0:cnt]), csr_matrix(test_encoded_features[:, 0:cnt])
def encode_non_numeric(train, test, column):
    # compose full list of options
    options = list(set(list(train[column].unique()) + list(test[column].unique())))
    # encode them with integers
    for i, option in enumerate(options):
        train.loc[:, column] = train.loc[:, column].replace(option, i + 1)
        test.loc[:, column] = test.loc[:, column].replace(option, i + 1)
    # recode into one-hot vectors
    options = list(set(list(train[column].unique()) + list(test[column].unique())))
    enc = OneHotEncoder(sparse=False)
    enc.fit(np.matrix(options).T)
    original_names = dict((i, a) for i, a in enumerate(train.columns.values))
    train = pd.concat([train, pd.DataFrame(enc.transform(np.matrix(train[column]).T))],
                      axis=1, ignore_index=True)
    test = pd.concat([test, pd.DataFrame(enc.transform(np.matrix(test[column]).T))],
                     axis=1, ignore_index=True)
    train = train.rename(columns=original_names)
    test = test.rename(columns=original_names)
    # drop the original of the encoded column
    train = train.drop(column, axis=1)
    test = test.drop(column, axis=1)
    return train, test
class Fileio(object):
    """
    Fileio helper
    """
    def __init__(self, train='../data/train.csv', test='../data/test.csv'):
        # Create a OneHotEncoder
        self.encoder = OneHotEncoder()
        self.trainDF = pd.read_csv(train, usecols=[0])
        self.trainDF['ID'] = map(lambda x: "%s.%06i" % (x[0], x[1]),
                                 zip(['train'] * NUMTRAIN, range(NUMTRAIN)))
        self.testDF = pd.read_csv(test)
        self.testDF['ID'] = map(lambda x: "%s.%06i" % (x[0], x[1]),
                                zip(['test'] * NUMTEST, range(NUMTEST)))

    def encode(self, usecols):
        self.encoder.fit(np.array(self.df.ix[:, usecols], dtype='float'))

    def transformTrain(self, cols, idCol=8):
        """ Transform the training set"""
        x = pd.merge(self.trainDF, self.df.ix[:, [idCol] + cols], how='left', on='ID', sort=False)
        ignore = ['ID', 'ACTION']
        usecols = [c for c in x.columns if c not in ignore]
        return self.encoder.transform(np.array(x.ix[:, usecols], dtype='float')), np.array(x.ACTION)

    def transformTest(self, cols, idCol=8):
        """ Transform the testing set"""
        x = pd.merge(self.testDF.ix[:, ['ID', 'ROLL_CODE']], self.df.ix[:, [idCol] + cols],
                     how='left', on='ID', sort=False)
        ignore = ['ID', 'ROLL_CODE']
        usecols = [c for c in x.columns if c not in ignore]
        return self.encoder.transform(np.array(x.ix[:, usecols], dtype='float'))
def main():
    enc = OneHotEncoder(n_values=[7, 7, 7, 7, 7, 7])
    conn = sqlite3.connect('server.db')
    cursor = conn.cursor()
    all_ = pandas.read_sql_query(
        'SELECT layers.burger, labels.output, layers.layer0, layers.layer1, layers.layer2, '
        'layers.layer3, layers.layer4, layers.layer5 '
        'FROM layers,labels WHERE layers.burger = labels.burger',
        conn, index_col='burger')
    X = all_.drop(['output'], axis=1)
    y = all_['output']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    clf = MLPClassifier(solver='adam', activation='relu', verbose=False,
                        max_iter=10000, tol=1e-9, random_state=1)
    X_train_categoricals = X_train[column_names]
    tX_train_categoricals = enc.fit_transform(X_train_categoricals)
    clf.fit(tX_train_categoricals, y_train.as_matrix().astype(int))
    X_test_categoricals = X_test[column_names]
    # transform (not fit_transform): reuse the encoding learned on the training split
    tX_test_categoricals = enc.transform(X_test_categoricals)
    prediction = clf.predict(tX_test_categoricals)
    print(classification_report(y_test, prediction))
    print_eval(y_test, prediction)
def vectorize_data(df):
    cat_vars = ["UniqueCarrier", "OriginAirportID", "OriginAirportSeqID", "OriginCityMarketID",
                "OriginState", "DestAirportID", "DestAirportSeqID", "DestCityMarketID",
                "DepTimeBlk", "ArrTimeBlk", "DistanceGroup", "DestState"]
    con_vars = ["CRSElapsedTime", "Distance", "CRSDepTime", "CRSArrTime", "WeekDay", "YearDay"]
    df = df.dropna()
    Xenc = OneHotEncoder()
    X1 = Xenc.fit_transform(df[cat_vars].as_matrix())
    X2 = df[con_vars].as_matrix()
    X = sparse.hstack((X1, X2))
    X = X.tocsr()
    y = df["Cancelled"].as_matrix()
    return X, y, Xenc
def load_dataset_from_file(filename, examples_count, is_labeled=True, expand_categorical=True):
    data = open(filename, 'r').readlines()
    # The next two lines verify that the parsing result of the header is what we expect.
    header, _unused = parse_line(data[0], is_labeled, is_header=True)
    assert header == EXPECTED_HEADER
    data_X = []
    data_y = []
    cnt = 0
    for line in data[1:]:
        cnt += 1
        if len(data_X) == examples_count:
            break
        parse_result = get_features(line, is_labeled)
        if parse_result == None:
            continue
        (features, label) = parse_result
        data_X.append(np.array(features))
        data_y.append(label)
        if len(data_X) % 100000 == 0:
            print "Processed %d rows, loaded %d examples." % (cnt, len(data_X))
    cat_X = data_X
    if expand_categorical:
        encoder = OneHotEncoder(categorical_features=list(CATEGORICAL_FEATURES), sparse=False)
        cat_X = encoder.fit_transform(cat_X)
        cat_X = MaxAbsScaler().fit_transform(cat_X)
        print "Feature indices: ", encoder.feature_indices_
        print "Cat_X shape: ", cat_X.shape
    return (data_X, cat_X, np.array(data_y) if is_labeled else None)
def pywfmLocalModel(trainFeature, testFeature, trainLabel, testLabel, trainIndex, testIndex, fm, cvIndex):
    print 'run local: folds: ' + str(cvIndex)
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    model = fm.run(trainIndex_encode, trainLabel, testIndex_encode, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float)
    predict = (predict - np.min(predict)) / (np.max(predict) - np.min(predict))
    return predict
def convert_network(filename, final_filename, var_flag=0):
    '''
    filename : input filename of the csv file
    final_filename : output filename of the .pickle file
    '''
    res = {'x': [], 'y': []}
    with open(filename, 'rb') as csvfile:
        f = csv.reader(csvfile)
        count = 0
        for line in f:
            if count != 0:
                if var_flag == 0:
                    res['x'].append(line[:-2] + [line[-1]])
                    res['y'].append(float(line[-2]))
                else:
                    res['x'].append(line[:-1])
                    res['y'].append(float(line[-1]))
            count += 1
    res['x'] = get_num(res['x'])
    m = len(res['x'][0]) - 1
    enc = OneHotEncoder(categorical_features=range(m), sparse=False)
    enc.fit(res['x'])
    res['x'] = enc.transform(res['x'])
    with open(final_filename, 'wb') as f:
        pickle.dump(res, f)
class CategoricalColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it
    to one hot encodings.
    '''
    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()
        self._encoder = OneHotEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label and encoding
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.fit_transform(handle_none)
        self._encoder.fit(encoded.reshape(-1, 1))
        return self

    def transform(self, X):
        '''
        Transform a column of data into one hot encodings.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.transform(handle_none)
        return self._encoder.transform(encoded.reshape(-1, 1)).todense().astype(np.float32)
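# Hypothetical usage of CategoricalColumn (an illustration, not from the
# original module). Because every value is passed through str() before label
# encoding, a None entry simply becomes the literal category 'None' rather
# than raising an error.
example_col = pd.Series(['red', 'blue', None, 'red'])
example_enc = CategoricalColumn().fit(example_col)
print(example_enc.transform(example_col))  # dense float32, one column per category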
def pywfmPredictModel(trainFeature, testFeature, trainLabel, trainIndex, testIndex, fm):
    print 'run online!'
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    #print trainFeature
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    testLabel = np.zeros((testFeature.shape[0]))
    model = fm.run(trainFeature, trainLabel, testFeature, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float)
    print np.max(predict), np.min(predict)
    #predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))
    return predict
def convert_categorical_to_numeric(state_holiday):
    enc = OneHotEncoder()
    state_holiday[state_holiday == 'a'] = 1
    state_holiday[state_holiday == 'b'] = 2
    state_holiday[state_holiday == 'c'] = 3
    enc.fit(state_holiday)
    return enc.transform(state_holiday).toarray()
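# A modern alternative sketch (assuming `state_holiday` is a 2-D column of the
# string codes '0'/'a'/'b'/'c'): recent OneHotEncoder versions accept strings
# directly, so the manual integer remapping above is unnecessary.
def convert_categorical_to_numeric_direct(state_holiday):
    return OneHotEncoder().fit_transform(state_holiday).toarray()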
def loadData(experiment):
    if experiment.has_key("size"):
        size = experiment["size"]
    else:
        size = 0
    data, label, description, reduce = experiment["dataset"]()
    if size > 0:
        initialReduceBlockSize = np.arange(size, size + 0.2, 0.1)
        testSetPercentage = 0.2
        trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = \
            data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage)
        data = trainDataBlocks[0][0]
        label = trainLabelBlocks[0][0]
    # if required (cancer datasets) perform binary encoding
    if experiment['binary_encode']:
        print "perform binary encode"
        analyze(data, label, "before encode")
        # encode features (one-hot-encoder / dummy coding)
        enc = OneHotEncoder()
        enc.fit(data)
        data = enc.transform(data).toarray()
        analyze(data, label, "after encode")
    return data, label, description, reduce
def _to_one_hot_encoding(labels, dtype=np.float64):
    """Creates a one-hot encoding of the labels."""
    from sklearn.preprocessing import OneHotEncoder
    labels = labels.reshape((labels.shape[0], 1))
    enc = OneHotEncoder(dtype=dtype)
    return enc.fit_transform(labels).toarray()
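# Quick illustration (not from the original source): three integer labels
# expand to three indicator columns, ordered by sorted unique label value.
#   _to_one_hot_encoding(np.array([0, 2, 1]))
#   -> [[1., 0., 0.],
#       [0., 0., 1.],
#       [0., 1., 0.]]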
X = df[c_vars.header_useful].as_matrix()
y = df['click'].as_matrix()
del df

print(str(datetime.now()) + ' Label Encoding Started')
label_encoder = [LabelEncoder() for _ in range(3)]
for i in range(len(label_encoder)):
    label_encoder[i].fit(X[:, i])
    # print(i, c_vars.header_useful[i], label_encoder[i].get_params(deep=True))
    X[:, i] = label_encoder[i].transform(X[:, i])
print(str(datetime.now()) + ' Label Encoding Completed')

print(str(datetime.now()) + ' OHE Started')
ohe = OneHotEncoder(sparse=False)
ohe.fit(X[:, [0, 1, 2, 3, 4]])
# X_ohe = ohe.transform(X[:,[0,1,2,3,4]])
print(str(datetime.now()) + ' OHE Completed')
# X = X[:,[i for i in range(len(c_vars.header_useful)) if i not in [0,1,2,3,4,5]]]
# X = np.hstack((X, X_ohe))

# save the label encoder and the one hot encoding to disk
with open('../analysis_graphs/label_encoder', 'wb') as f:
    pickle.dump(label_encoder, f)
with open('../analysis_graphs/ohe', 'wb') as f:
    pickle.dump(ohe, f)
def test_one_hot_encoder_inverse(sparse_, drop):
    X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    X = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories="auto", drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_,
            handle_unknown="ignore",
            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

        # with an otherwise numerical output, still object if unknown
        X = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_, categories=[[1, 2], [54, 56]], handle_unknown="ignore"
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 0] = None
        exp[:, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Importing the dataset
data = pd.read_csv('50_Startups.csv')
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

percentErrors = (abs(regressor.predict(x_test) - y_test) / y_test) * 100
AveragePercentError = sum(percentErrors) / len(percentErrors)

y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))
print("the accuracy of the model is:", 100 - AveragePercentError)
X_train[:, 1] = sexe_le.fit_transform(X_train[:, 1])
X_test[:, 1] = sexe_le.transform(X_test[:, 1])

# In[35]:

X_train[:, 5] = embark_le.fit_transform(X_train[:, 5])
X_test[:, 5] = embark_le.transform(X_test[:, 5])

# In[37]:

from sklearn.preprocessing import OneHotEncoder
embark_ohe = OneHotEncoder(categorical_features=[5])
X_train = embark_ohe.fit_transform(X_train)
X_test = embark_ohe.transform(X_test)

# In[40]:

X_train = X_train.toarray()
X_test = X_test.toarray()

# In[42]:

X_train = X_train[:, 1:]
X = dataset.iloc[:, -9:-1].values
Y = dataset.iloc[:, -1].values

# Impute NaN values in columns 0 and 7 with their own column means
imptr = Imputer(missing_values="NaN", strategy="mean", axis=0)
X[:, 0:1] = imptr.fit_transform(X[:, 0:1])
X[:, 7:8] = imptr.fit_transform(X[:, 7:8])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labEncr_X = LabelEncoder()
X[:, 2] = labEncr_X.fit_transform(X[:, 2])
X[:, 5] = labEncr_X.fit_transform(X[:, 5])
# One-hot encode both label-encoded columns
onehotEncr = OneHotEncoder(categorical_features=[2, 5])
X = onehotEncr.fit_transform(X).toarray()

# Encode the target value
labEnc_Y = LabelEncoder()
Y = labEnc_Y.fit_transform(Y)

# Training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler

dataset = pd.read_csv("Social_Network_Ads.csv")
print(dataset.shape)
print(dataset.head())

columns_to_encode = ['Gender']
columns_to_scale = ['Age', 'EstimatedSalary']
encoder = OneHotEncoder(sparse=False)
scaler = StandardScaler()
encoded_columns = encoder.fit_transform(dataset[columns_to_encode])
scaled_columns = scaler.fit_transform(dataset[columns_to_scale])
print("shape: ", encoded_columns.shape)
processed_dataset = np.concatenate([encoded_columns, scaled_columns], axis=1)
dataset = pd.concat([pd.DataFrame(processed_dataset), dataset.Purchased], axis=1)
print(dataset.head())
X = dataset.iloc[:, :-1].values
hour_x_train['dteday'] = (hour_x_train['dteday'] - pd.to_datetime('2011-01-01')) / pd.Timedelta('1 days')
hour_x_val['dteday'] = (hour_x_val['dteday'] - pd.to_datetime('2011-01-01')) / pd.Timedelta('1 days')

numeric_features = ['dteday', 'temp', 'atemp', 'hum', 'windspeed']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_features = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])
preprocessor.fit(hour_x_train)
x_train = preprocessor.transform(hour_x_train).todense()
x_val = preprocessor.transform(hour_x_val).todense()
pickle.dump(preprocessor, open('encoder.p', "wb"))  # Save encoder
print('Predictors prepared')

# Prepare targets
y_train = hour_y_train.values.astype(float)
y_val = hour_y_val.values.astype(float)
print('all data prepared')
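# Hypothetical inference-time counterpart (not in the original script): reload
# the pickled ColumnTransformer so that fresh rows receive exactly the same
# imputation, scaling, and one-hot vocabulary as the training data did.
# `hour_x_new` is a placeholder for a new dataframe with the same columns.
with open('encoder.p', 'rb') as f:
    loaded_preprocessor = pickle.load(f)
# x_new = loaded_preprocessor.transform(hour_x_new).todense()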
if __name__ == '__main__':
    files = os.listdir(".")
    files = [i for i in files if i.split(spliter)[-1] == "p"]
    output = [file + spliter + "toPre.fa" for file in files]
    featureSize = 200
    lists = [[files[i], output[i], 200] for i in range(0, len(files))]
    pool = multiprocessing.Pool(int(t))
    d = pool.map(getSeqFragment, lists)
    pool.close()
    pool.join()
    integer_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()
    input_features = []


def getData(file):
    feature_integer_encoder = LabelEncoder()
    input_features = []
    records = SeqIO.parse(file, "fasta")
    l_seq = [str(rec.seq) for rec in records]
    records = SeqIO.parse(file, "fasta")
    l_target = [rec1.id.split("_")[-1] for rec1 in records]
    voc = ["A", "C", "G", "T", "N"]
    feature_integer_encoder.fit(voc)
    sequences = list(filter(None, l_seq))
    for sequence in sequences:
        integer_encoded = feature_integer_encoder.transform(list(sequence))
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - Now let's make the ANN!
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(class_mapping)
print(df)
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
print(df)

# 2nd method
from sklearn.preprocessing import LabelEncoder
# make instance of label encoder
class_le = LabelEncoder()
# convert classlabel to integer
# fit + transform
y = class_le.fit_transform(df['classlabel'].values)
print(df)
print(y)

# extract color, size, price
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
# convert color label to integer
X[:, 0] = color_le.fit_transform(X[:, 0])
print(X)

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0], sparse=True)
print(ohe.fit_transform(X))
print(ohe.fit_transform(X).toarray())

# automatically calculate one-hot vectors
print(pd.get_dummies(df[['price', 'color', 'size']]))
# escape from multicollinearity
print(pd.get_dummies(df[['price', 'color', 'size']], drop_first=True))
import dash_html_components as html
import dash_daq as daq
import flask
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# load our data
mtcars = pd.read_csv('mtcars.csv', dtype={'cyl': str, 'am': np.float64})

# create and fit a one-hot encoder--we'll want to reuse this in the app as well
cyl_enc = OneHotEncoder(categories='auto', sparse=False)
cyl_enc.fit(mtcars['cyl'].values.reshape(-1, 1))

y = mtcars['mpg']
# we need to concatenate the one-hot (dummy) encoded values with
# the values from mtcars
X = np.concatenate((mtcars[['disp', 'qsec', 'am']].values,
                    cyl_enc.transform(mtcars['cyl'].values.reshape(-1, 1))), axis=1)

# fit our regression model
fit = LinearRegression()
fit.fit(X=X, y=y)


def preds(fit, cyl_enc, disp, qsec, am, cyl):
def label_encode(df, col, axis=0):
    if axis == 0:
        labelencoder_X_col = LabelEncoder()
        df[col] = labelencoder_X_col.fit_transform(df[col])
        return df
    elif axis == 1:
        labelencoder_y = LabelEncoder()
        df = labelencoder_y.fit_transform(df)
        return df


for col in ['Generic Group', 'Generic Brand', 'Generic Product Category',
            'Generic Product', 'Variable Group', 'Units']:
    X = label_encode(X, col, axis=0)

onehotencoder = OneHotEncoder(sparse=False)
X = onehotencoder.fit_transform(X)
y = pd.get_dummies(y)

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, PReLU
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

classifier = Sequential()
classifier.add(
data_voiced = []
data_unvoiced = []
for i in range(len(labels_tot)):
    if labels_tot[i] == 1:  # voiced
        data_voiced.append(data_tot[i])
    else:  # unvoiced
        data_unvoiced.append(data_tot[i])

data_length = min(len(data_unvoiced), len(data_voiced))
data_voiced = np.asarray(data_voiced[:data_length])
data_unvoiced = np.asarray(data_unvoiced[:data_length])
data = np.concatenate((data_voiced, data_unvoiced), axis=0)
labels = np.concatenate((np.ones(data_length), np.zeros(data_length)))

# One hot encoding the labels
onehotencoder = OneHotEncoder(categorical_features='all')
labels_encoded = onehotencoder.fit_transform(np.asarray(labels).reshape(-1, 1)).toarray()

training_ratio = 0.75
training_index = int(data_length * training_ratio)
training_data, test_data = np.concatenate(
    (data_voiced[:training_index], data_unvoiced[:training_index])), np.concatenate(
    (data_voiced[training_index:], data_unvoiced[training_index:]))

# Build the neural networks
# The autoencoder
encoding_dim = 200
input_dim = Input(shape=(max_length, ))
encoded = Dense(encoding_dim, activation='relu')(input_dim)
class BaseModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model_cls, model_params={}, table_columns=[]):
        self._model_params = model_params
        self._model_obj = model_cls(**model_params)
        self._features = DEFAULT_FEATURES
        self._cat_features = DEFAULT_CATEGORICAL_FEATURES
        self._cont_features = [f for f in self._features if f not in self._cat_features]
        self._label = 'device_operational_status'
        self._table_columns = table_columns

    def fit(self, X, y):
        self._cat_x_encoder = OneHotEncoder(handle_unknown='ignore').fit(X[self._cat_features])
        self._y_encoder = LabelEncoder().fit(y)
        _X = self._preprocess_X(X)
        _y = self._preprocess_y(y)
        self._model_obj = self._model_obj.fit(X=_X, y=_y)
        return self

    def predict(self, context, X):
        _y = self._predict(X)
        return _y

    def _predict(self, X):
        #TODO: hide
        if len(X.columns) == len(self._table_columns):
            X.columns = self._table_columns
        _X = self._preprocess_X(X)
        _y_num = self._model_obj.predict(_X)
        _y = self._y_encoder.inverse_transform(_y_num)
        return _y

    def load_context(self, context):
        with open(context.artifacts['model'], 'rb') as file:
            self._model_obj = pickle.load(file)

    def get_label_names(self):
        out = self._y_encoder.classes_
        return out

    def _preprocess_X(self, X):
        _X_processed = np.concatenate([
            X[self._cont_features],
            self._cat_x_encoder.transform(X[self._cat_features]).todense()
        ], axis=1)
        return _X_processed

    def _preprocess_y(self, y):
        _y_preprocessed = self._y_encoder.transform(y)
        return _y_preprocessed

    def log_to_mlflow(self):
        with TempDir() as local_artifacts_dir:
            # dumping model
            model_path = local_artifacts_dir.path('model.pkl')
            with open(model_path, 'wb') as m:
                pickle.dump(self._model_obj, m)
            # dumping feature encoder
            cat_encoder_path = local_artifacts_dir.path('cat_encoder.pkl')
            with open(cat_encoder_path, 'wb') as m:
                pickle.dump(self._cat_x_encoder, m)
            # dumping label encoder
            label_encoder_path = local_artifacts_dir.path('label_encoder.pkl')
            with open(label_encoder_path, 'wb') as m:
                pickle.dump(self._y_encoder, m)
            # all of the model subcomponents will need to go here
            artifacts = {
                'model': model_path,
                'cat_encoder': cat_encoder_path,
                'label_encoder': label_encoder_path
            }
            mlflow.pyfunc.log_model(artifact_path='model', python_model=self, artifacts=artifacts)
    else:
        x_test2[i, 6] = 1

unique, counts = np.unique(x_train[:, 6], return_counts=True)

#Categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label1 = LabelEncoder()
x_train[:, 1] = label1.fit_transform(x_train[:, 1])
label2 = LabelEncoder()
x_train[:, 7] = label2.fit_transform(x_train[:, 7])
label3 = LabelEncoder()
x_train[:, 0] = label3.fit_transform(x_train[:, 0])

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    # The column numbers to be transformed (here [7], but it can be a list such as [0, 1, 3])
    [('one_hot_encoder', OneHotEncoder(), [7])],
    remainder='passthrough'  # Leave the rest of the columns untouched
)
x_train = np.array(ct.fit_transform(x_train), dtype=np.float)
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [3])],
    remainder='passthrough'
)
x_train = np.array(ct.fit_transform(x_train), dtype=np.float)
x_train = x_train[:, [1, 2, 4, 5, 6, 7, 8, 9, 10, 11]]

#For test2
label1 = LabelEncoder()
def train_model(train, target, features=train.columns, model=LinearRegression()):
    pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'), model)
    mod = pipe.fit(train, target)
    return mod
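# Hypothetical call (illustration only; `train` and `target` come from the
# surrounding script). handle_unknown='ignore' means a category first seen at
# prediction time encodes as all zeros instead of raising an error.
#   fitted = train_model(train, target)
#   predictions = fitted.predict(train)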
standing_filename = 'NBA/standings.csv'
standings = pd.read_csv(standing_filename, skiprows=[0])
dataset = pd.read_csv('NBA/March.csv', parse_dates=["Date"])
dataset.columns = ["Date", "Start", "Visitor Team", "VisitorPts",
                   "Home Team", "HomePts", "Score Type", "OT", "Notes"]

# Encode team names as integers, then convert them to one-hot (binary) features
encoding.fit(dataset['Home Team'].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

# step 1
won_last = defaultdict(int)
dataset["HomeLastWin"] = False
dataset["VisitorLastWin"] = False
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    dataset.iloc[index] = row
    won_last[home_team] = row["HomeWin"]
train_y = train_data['Survived']
train_x = train_data.drop(['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket'], axis=1)
test_x = pd.read_csv('./data/test.csv')
test_x = test_x.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
total_data = [train_x, test_x]

# Discretize the Sex field and fill missing Embarked values for one-hot encoding
for data in total_data:
    data['Sex'] = data.Sex.map({'male': 0, 'female': 1})
    data['Embarked'] = data['Embarked'].fillna("S")

# Fill nulls in the Age field with the mean
for data in total_data:
    data['Age'] = data.Age.fillna(data['Age'].mean())

enc = OneHotEncoder(sparse=False)
sex_onehot = enc.fit_transform(pd.DataFrame(train_x['Sex']))
train_x["sex_0"] = sex_onehot[:, 0]
train_x["sex_1"] = sex_onehot[:, 1]
train_x = train_x.drop(["Sex"], axis=1)
sex_onehot_test = enc.transform(pd.DataFrame(test_x['Sex']))
test_x["sex_0"] = sex_onehot_test[:, 0]
test_x["sex_1"] = sex_onehot_test[:, 1]
test_x = test_x.drop(["Sex"], axis=1)

Embarked_onehot = OneHotEncoder(sparse=False)
Embarked_onehot_data = Embarked_onehot.fit_transform(pd.DataFrame(train_x['Embarked']))
train_x["Embarked_0"] = Embarked_onehot_data[:, 0]
train_x["Embarked_1"] = Embarked_onehot_data[:, 1]
def make_model(file_name="TD20200309210544.json", column_review="reviewText",
               column_rating="overall", json_balanced=True, have_corpus=True, size=10000):
    # Making a json file with balanced ratings
    if json_balanced == False:
        make_balance_json(r'static/DBAlpha/TrainingDB/Files/' + file_name,
                          column_review, column_rating,
                          "main/files/uniform_json.json", size / 5)
    dataset = read_json('main/files/uniform_json.json', lines=True)
    dataset = dataset[:size]
    # Making corpus, in case corpus doesn't exist
    if have_corpus == False:
        corpus = basic.preprocess_lemm_dataset(dataset, 'review')
        process_corpus.write_corpus(corpus)
    # If corpus exists, read it directly
    else:
        corpus = []
        corpus = process_corpus.read_corpus()
        corpus = corpus[:size]
    # Getting the ratings
    y = dataset.iloc[:size, 0]
    # Maximum words to consider
    TRAINING_VOCAB = 5000
    # Tokenizing the words up to the maximum vocabulary
    tokenizer = Tokenizer(num_words=TRAINING_VOCAB, lower=True, char_level=False)
    # Fitting the corpus to tokenizer
    tokenizer.fit_on_texts(corpus)
    training_sequences = tokenizer.texts_to_sequences(corpus)
    # Getting the encoding dictionary
    vocab_to_int = tokenizer.word_index
    sequence_length = 150
    # Padding to maximum sequence length
    features = pad_sequences(training_sequences, maxlen=sequence_length)
    """
    EMBEDDING_DIM = 300
    # Loading google's words to vect embedding
    print("\nLoading the Google's word2vec \nPlease Wait...")
    word2vec_path = 'resources/GoogleNews-vectors-negative300.bin'
    word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    train_embedding_weights = np.zeros((len(vocab_to_int), EMBEDDING_DIM))
    for word, index in vocab_to_int.items():
        if word in word2vec:
            train_embedding_weights[index, :] = word2vec[word]
        else:
            np.random.rand(EMBEDDING_DIM)
    print(train_embedding_weights.shape)
    """
    # Variables for RNN LSTM
    vocab_size = len(vocab_to_int)
    embedding_dim = 512
    # Training parameters
    batch_size = int(size // 100)
    num_epochs = 30
    # Encoding y data into different categorical columns
    labelencoder_y = LabelEncoder()
    y = labelencoder_y.fit_transform(y)
    y = y.reshape(len(y), 1)
    onehotencoder = OneHotEncoder()
    y = onehotencoder.fit_transform(y).toarray()
    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.20, random_state=0)
    # Initialising the RNN
    model = Sequential()
    # Adding Layers to RNN
    #model.add(Embedding(vocab_size, embedding_dim, weights=[train_embedding_weights], input_length=sequence_length))
    if size > 2000:
        model.add(Embedding(TRAINING_VOCAB, embedding_dim, input_length=sequence_length))
    else:
        model.add(Embedding(TRAINING_VOCAB, size / 10, input_length=sequence_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(units=200, kernel_initializer='uniform', activation='relu'))
    model.add(Dense(5, activation='sigmoid'))
    #rmsprop = optimizers.rmsprop(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fitting the ANN to the Training set
    model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs)
    # Predicting the Test set results over trained model
    y_pred = model.predict(X_test)
    # Getting result in proper format (it is initially probabilistic)
    for i in range(len(y_pred)):
        ind_ = 0
        max_ = y_pred[i][0]
        for j in range(5):
            if y_pred[i][j] > max_:
                max_ = y_pred[i][j]
                ind_ = j
            y_pred[i][j] = 0
        y_pred[i][ind_] = 1
    # Inverse Transforming the categorical encodings on y_pred and y_test
    y_pred = onehotencoder.inverse_transform(y_pred)
    y_test = onehotencoder.inverse_transform(y_test)
    # Measuring the performance
    accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
    # file_name = re.sub(".json", "", file_name)
    with open(r'static/DBAlpha/TrainingDB/Models/TOKEN_' + file_name + ".pkl", 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    model.save(r'static/DBAlpha/TrainingDB/Models/' + file_name + '.h5')
    # Returning the performance parameters
    return accuracy
#handling missing values
#for Age
ds.Age = ds.Age.fillna(ds.Age.median())
dssub.Age = dssub.Age.fillna(dssub.Age.median())
#for Embarked
ds.Embarked = ds.Embarked.fillna('S')
dssub.Embarked = dssub.Embarked.fillna('S')

X_all = np.concatenate((X, X_sub), axis=0)
y = dataset.loc[:, 'Survived'].values

#Handling categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X_all[:, 1] = labelencoder_X.fit_transform(X_all[:, 1])
X_all[:, 5] = labelencoder_X.fit_transform(X_all[:, 5])
onehotencoder = OneHotEncoder(categorical_features=[0, 5])
X_all = onehotencoder.fit_transform(X_all).toarray()
X = X_all[:891, :]
X_sub = X_all[891:, :]

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=1)

from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
model.add(
    Dense(26,
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = OneHotEncoder(sparse=False)


# function to load data from csv
def load_data(filename):
    return pd.read_csv(filename)


def processFlightData(data):
    return data.filter(['Month', 'Day', 'Origin_Airport', 'WeatherDelay'])


# def main():
# loading the smaller 2017 flight data
flight2017 = load_data('flight-delays/fl_samp.csv')
flight2017Processed = processFlightData(flight2017)
# loading the other flight data
flightData = load_data('flight-delays/flight.csv')
flightDateProcessed = processFlightData(flightData)
# Combine the two datasets
frames = [flight2017Processed, flightDateProcessed]
combinedFlightData = pd.concat(frames)
    'num_common_interest5', 'num_common_topic1'
]].values)
train_x = scaler.transform(train[[
    'num_advertise_touser', 'num_common_interest1', 'num_common_interest2',
    'num_common_interest5', 'num_common_topic1'
]].values)
test_x = scaler.transform(test[[
    'num_advertise_touser', 'num_common_interest1', 'num_common_interest2',
    'num_common_interest5', 'num_common_topic1'
]].values)
train_x = np.hstack((train_x, ct_trains))
test_x = np.hstack((test_x, ct_tests))

# One-hot encode the categorical features
enc = OneHotEncoder()
oc_encoder = OneHotEncoder()
for feature in one_hot_feature:
    oc_encoder.fit(data[feature].values.reshape(-1, 1))
    train_a = oc_encoder.transform(train[feature].values.reshape(-1, 1))
    test_a = oc_encoder.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

# Build count-vector features
ct_encoder = CountVectorizer(min_df=0.0009)
for feature in vector_feature:
    ct_encoder.fit(data[feature])
day and month are cyclical in nature, so we can do the following:
"""

train['dy_sin'] = np.sin((train['day'] - 1) * (2. * np.pi / 7))
train['dy_cos'] = np.cos((train['day'] - 1) * (2. * np.pi / 7))
train['mnth_sin'] = np.sin((train['month'] - 1) * (2. * np.pi / 12))
train['mnth_cos'] = np.cos((train['month'] - 1) * (2. * np.pi / 12))
train = train.drop(columns="day")
train = train.drop(columns="month")
train = train.drop(columns="id")
train.head()

"""# Nominal Features (Low Cardinality)"""

column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False), ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']),
    remainder='passthrough')
train_after_low_car_nom = column_trans.fit_transform(train)
pd.DataFrame(train_after_low_car_nom).head()

"""# Nominal Features (High Cardinality)

## Dummy Encoding
"""

train_after_low_car_nom = pd.DataFrame(train_after_low_car_nom)
train_after_high_car_nom = pd.get_dummies(train_after_low_car_nom,
                                          columns=train_after_low_car_nom.columns,
                                          drop_first=True, sparse=True)

"""## Hash Encoding (not used anymore)"""

#hashing_encoder = ce.HashingEncoder(cols=[30, 31, 32, 33, 34])
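# Sanity check of the cyclical encoding above (an illustration, not from the
# original notebook): months 12 and 1 land next to each other on the unit
# circle, which an integer or one-hot month feature would not capture.
import numpy as np
for m in (1, 2, 12):
    angle = (m - 1) * (2. * np.pi / 12)
    print(m, round(np.sin(angle), 3), round(np.cos(angle), 3))
# 1 -> (0.0, 1.0); 2 -> (0.5, 0.866); 12 -> (-0.5, 0.866)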
import numpy as np
from seqlearn.evaluation import bio_f_score
from seqlearn.hmm import MultinomialHMM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_validate

from data import *
from epam_nlp import CustomHMM, get_bio_f1

DATA_PATH = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'processed.tsv'

df = load_data(RAW_DATA_PATH, nrows=1000)
X, y, lengths = get_X_y_lengths(df, cols_to_keep={'token'})

le = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
clf = CustomHMM(y=y)

pipeline = Pipeline([('one_hot', ohe), ('hmm', clf)])
cv = get_cv(lengths=lengths)
res = cross_validate(pipeline, X.reshape(-1, 1), y, cv=cv, n_jobs=1, scoring=get_bio_f1)
print(res)

# cv = get_cv(X, y, lengths)
# i = 1
# scores = []
X5 = X5.reshape(-1, 1)
missingvalues = missingvalues.fit(X5)
X5 = missingvalues.transform(X5)
X6 = X2[:, 0]
X6 = X6.reshape(-1, 1)
X7 = X2[:, 2:4]
X_train = np.concatenate((X1, X6, X5, X7, X3, X4), axis=1)
X_class = X_train[:, 0]
X_class = X_class.reshape(-1, 1)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohe = OneHotEncoder()
X_class = ohe.fit_transform(X_class).toarray()
X_class = X_class[:, 1:3]

X_embark = X_train[:, 6]
X_embark = X_embark.reshape(-1, 1)
missingvalues1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent', verbose=0)
missingvalues1 = missingvalues1.fit(X_embark)
X_embark = missingvalues1.transform(X_embark)
le = LabelEncoder()
ohe1 = OneHotEncoder()
X_embark = le.fit_transform(X_embark)
def onehot(x):
    return np.array(OneHotEncoder().fit_transform(x.values.reshape(-1, 1)).todense())


def format(data):
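# Hypothetical usage of the helper above (x is assumed to be a pandas Series,
# given the .values access): each distinct value becomes one dense column.
#   onehot(pd.Series(['cat', 'dog', 'cat']))
#   -> [[1., 0.],
#       [0., 1.],
#       [1., 0.]]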
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(dataset, labels, test_size=0.25,
                                                      random_state=random_state, shuffle=True)
print("Train dataset shape:", X_train.shape)
print("Train label shape:", y_train.shape)
print("Test dataset shape:", X_valid.shape)
print("Test label shape:", y_valid.shape)
print("Dataset example:")
print(X_train[0, 2].reshape(height, width))
print(X_valid[0, 2].reshape(height, width))

# Label encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
y_train = y_train.reshape(-1, 1)
y_train = ohe.fit_transform(y_train)
y_valid = y_valid.reshape(-1, 1)
y_valid = ohe.transform(y_valid)
with open(results_path + "/ohe", "wb") as file:
    pickle.dump(ohe, file)

# Dataset Normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
def main():
    # Load data and run brief analysis on it
    raw_data = load_data('train.csv')
    quick_analysis(raw_data)
    plt.hist(raw_data['SalePrice'])
    plt.show()

    # View all unique values of categorical features
    non_numeric_cols = raw_data.loc[:, raw_data.dtypes == object]
    for col in non_numeric_cols.columns:
        print(non_numeric_cols[col].value_counts())

    # Analyze correlations between features and the label
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Feature engineering the following:
    # Grade = OverallQual / OverallCond
    # Age = YrSold - YearBuilt
    # RemodAge = YrSold - YearRemodAdd
    # TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    raw_data['Grade'] = raw_data['OverallQual'] / raw_data['OverallCond']
    raw_data['Age'] = raw_data['YrSold'] - raw_data['YearBuilt']
    raw_data['RemodAge'] = raw_data['YrSold'] - raw_data['YearRemodAdd']
    raw_data['TotalSF'] = raw_data['TotalBsmtSF'] + raw_data['1stFlrSF'] + raw_data['2ndFlrSF']

    # Correlation matrix for the new features
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Check correlation of new features with their respective components
    age_correl = corr_matrix['Age'].sort_values(ascending=False)
    print('Age correlations:', age_correl, '\n')
    remod_age_correl = corr_matrix['RemodAge'].sort_values(ascending=False)
    print('RemodAge correlations:', remod_age_correl, '\n')
    grade_correl = corr_matrix['Grade'].sort_values(ascending=False)
    print('Grade correlations:', grade_correl, '\n')
    totalsf_correl = corr_matrix['TotalSF'].sort_values(ascending=False)
    print('TotalSF correlations:', totalsf_correl, '\n')

    # Correlation matrix visualization
    corr_plot(raw_data, 'SalePrice', fig_size=(4, 4))
    corr_plot(raw_data, 'SalePrice', plot_type='hist', fig_size=(4, 4))

    # Change type of columns to reflect their nature.
Concretely, change the YrSold, MoSold, MSZoning and OverallCond features to categorical ones raw_data['YrSold_C'] = raw_data['YrSold'].copy().astype(str) raw_data['MoSold'] = raw_data['MoSold'].astype(str) raw_data['MSZoning'] = raw_data['MSZoning'].astype(str) raw_data['OverallCond_C'] = raw_data['OverallCond'].copy().astype(str) num_cols = [ 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageCars', 'GarageArea', 'FullBath', 'YrSold', ] cat_cols = [ 'MSZoning', 'Street', 'Utilities', 'Neighborhood', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'Heating', 'CentralAir', 'PavedDrive', 'SaleType', 'SaleCondition', 'YrSold_C', 'MoSold', 'OverallCond_C', ] # Create a list of all values that the categorical features can take cat_cols_categs = [raw_data[col].unique() for col in cat_cols] print(cat_cols_categs) # Create the pipeline to process data num_pipeline = Pipeline([ ('feat_sel', FeatureSelector(num_cols, True)), ('Grade', FeatureCreator(['OverallCond', 'OverallQual'], lambda x, y: x / y, as_dataframe=True, feat_name='Grade')), ('Age', FeatureCreator(['YrSold', 'YearBuilt'], lambda x, y: x - y, as_dataframe=True, feat_name='Age')), ('RemodAge', FeatureCreator(['YrSold', 'YearRemodAdd'], lambda x, y: x - y, as_dataframe=True, feat_name='RemodAge')), ('TotalSF', FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], lambda x, y: x + y, as_dataframe=True, feat_name='TotalSF')), ('drop_cat_feat', FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)), ('imputer_mean', Imputer(strategy='mean')), ('std_scaler', RobustScaler()) ]) cat_pipeline = Pipeline([ ('feat_sel', FeatureSelector(cat_cols, True)), ('imputer_most_frequent', CategoricalImputer()), ('encode', OneHotEncoder(categories=cat_cols_categs, sparse=False)), ]) feat_union = FeatureUnion(transformer_list=[ ('num_features', num_pipeline), ('cat_features', cat_pipeline), ]) # Create the train data and labels train_labels = raw_data['SalePrice'].copy() train_feat = feat_union.fit_transform(raw_data) # Check the linear regression model lin_reg = LinearRegression() print('Linear regression best hyperparameters:') final_lr_model = find_best_estimator(lin_reg, [{}], train_feat, train_labels) # Check the decision tree model hyperparams_vals = [ { 'max_features': [6, 10, 12, 16, 18, 20, 24] }, ] dt_reg = DecisionTreeRegressor(random_state=42) print('Decision tree best hyperparameters:') final_dt_model = find_best_estimator(dt_reg, hyperparams_vals, train_feat, train_labels) # Check the random forest model hyperparams_vals = [ { 'n_estimators': [200, 225, 250], 'max_features': [16, 24, 30] }, { 'bootstrap': [False], 'n_estimators': [220, 225], 'max_features': [24, 28] }, ] forest_reg = RandomForestRegressor(n_jobs=-1, random_state=42) print('Random forest best hyperparameters:') final_rf_model = find_best_estimator(forest_reg, hyperparams_vals, train_feat, train_labels) # Check the XGBoost model hyperparams_vals = [ { 'n_estimators': [450, 500, 400], 'max_features': [2, 4, 8], 'max_depth': [3, 4, None] }, ] xgbr_reg = XGBRegressor(learning_rate=0.05, n_threads=-1, random_state=42) print('XGBoost regressor best hyperparameters:') final_xgb_model = find_best_estimator(xgbr_reg, hyperparams_vals, train_feat, train_labels) # Check the SVM model hyperparams_vals = [ { 'kernel': ['linear', 'sigmoid', 'rbf'], 'gamma': ['auto', 'scale'] }, { 'kernel': ['poly'], 'gamma': ['auto', 'scale'], 'degree': [3, 4, 5] }, ] svm_reg = SVR() print('Support vector machine best hyperparameters:') 
    final_svm_model = find_best_estimator(svm_reg, hyperparams_vals,
                                          train_feat, train_labels)

    # Check the ElasticNet model
    hyperparams_vals = [
        {'alpha': [0.0005, 0.005, 0.05, 0.2],
         'l1_ratio': [0.1, 0.25, 0.75, 0.9]},
    ]
    enet_reg = ElasticNet(max_iter=100000000, tol=0.001)
    print('ElasticNet best hyperparameters:')
    final_enet_model = find_best_estimator(enet_reg, hyperparams_vals,
                                           train_feat, train_labels)

    # Check the feature importances for the random forest and XGBoost models
    rf_feat_imp = final_rf_model.feature_importances_
    xgb_feat_imp = final_xgb_model.feature_importances_
    other_feat = ['Grade', 'RemodAge', 'TotalSF']
    all_features = num_cols.copy()
    print(num_cols)
    for cat_values in cat_cols_categs.copy():
        all_features.extend(cat_values)
    all_features.extend(other_feat.copy())
    print('Random forest feature importances:')
    for feat in sorted(zip(rf_feat_imp, all_features), reverse=True):
        print(feat)
    print('\nXGBoost feature importances:')
    for feat in zip(xgb_feat_imp, all_features):
        print(feat)

    # Load and process test data, casting the same columns to categorical.
    # Missing values become np.nan rather than None: Series.replace with
    # value=None is interpreted as forward-filling, not as a replacement.
    test_data = load_data('test.csv')
    test_data['YrSold_C'] = test_data['YrSold'].copy().astype(str).replace(
        'nan', np.nan)
    test_data['MoSold'] = test_data['MoSold'].astype(str).replace('nan', np.nan)
    test_data['MSZoning'] = test_data['MSZoning'].astype(str).replace(
        'nan', np.nan)
    test_data['OverallCond_C'] = test_data['OverallCond'].copy().astype(
        str).replace('nan', np.nan)
    test_feat = feat_union.transform(test_data)

    # Predict using a weighted combination of Random Forest and XGBoost
    rf_predictions = final_rf_model.predict(test_feat)
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = rf_predictions * 0.35 + xgb_predictions * 0.65

    # Save resulting predictions
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()
    print(pred_df)
    pred_df.to_csv('submission_rf_xgb.csv', index=False)

    # Predict using only the XGBoost model
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = xgb_predictions.copy()
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()
    print(pred_df)
    pred_df.to_csv('submission_xgb.csv', index=False)
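# Neither `find_best_estimator` nor the FeatureSelector/FeatureCreator/
# FeatureDropper transformers are defined in this excerpt. Below is a minimal
# sketch of what they could look like; it is a hypothetical reconstruction,
# not the author's actual implementation. FeatureSelector is shown as
# representative, since the creator/dropper transformers follow the same
# BaseEstimator/TransformerMixin pattern, and the scoring metric is assumed.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV


class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select a subset of DataFrame columns inside a Pipeline."""

    def __init__(self, feature_names, as_dataframe=False):
        self.feature_names = feature_names
        self.as_dataframe = as_dataframe

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn from the data

    def transform(self, X):
        selected = X[self.feature_names]
        return selected if self.as_dataframe else selected.values


def find_best_estimator(estimator, param_grid, X, y):
    # Cross-validated grid search; negated MSE is a common choice for this
    # regression task (an assumption, since the actual scoring is not shown)
    search = GridSearchCV(estimator, param_grid, cv=5,
                          scoring='neg_mean_squared_error', n_jobs=-1)
    search.fit(X, y)
    print(search.best_params_)
    return search.best_estimator_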
def data_processing(positive_data_file, negative_data_file):
    # Process the input movie reviews; each line holds the words of one review
    # x has shape [N, max_len, 300]
    neg_dir = negative_data_file
    pos_dir = positive_data_file
    with open(neg_dir, "r", encoding='Windows-1252') as f:
        data = f.read().split('\n')  # read the file line by line
    neg_words = [0] * len(data)
    for d in range(len(data)):
        neg_words[d] = data[d].split(' ')  # split each line on spaces
    print(neg_words[1])
    max_len = 0
    for d in neg_words:
        if len(d) > max_len:
            max_len = len(d)
    print(max_len)
    with open(pos_dir, "r", encoding='Windows-1252') as f:
        data = f.read().split('\n')
    pos_words = [0] * len(data)
    for d in range(len(data)):
        pos_words[d] = data[d].split(' ')
    print(pos_words[1])
    for d in pos_words:
        if len(d) > max_len:
            max_len = len(d)
    print(max_len)

    # Build the word-to-vector lookup table
    vectors_dir = r'rt-polaritydata/test2.w2v'
    with open(vectors_dir, "r", encoding='Windows-1252') as f:
        data = f.read()
    data = data.split('\n')
    i = 0
    word_to_vec = {}
    vec = []
    print(len(data))
    for d in range(len(data)):
        if i:  # skip the first line (the word2vec header)
            dd = data[d].split(' ')
            word = dd[0]
            vecs = dd[1:]
            vec = []
            for v in vecs:
                if v and word != '':  # `is not ''` compares identity, not value
                    vec.append(float(v))
            word_to_vec[word] = vec
        i += 1

    # Map each review's words to vectors, zero-padding up to max_len
    worddim = 300
    null_fill = [0.0] * worddim
    x_neg = np.zeros((len(neg_words), max_len, worddim))
    print(x_neg.shape)
    for line in range(len(neg_words)):
        for word_ind in range(max_len):
            if word_ind >= len(neg_words[line]):
                x_neg[line][word_ind] = np.array(null_fill)
            else:
                if (neg_words[line][word_ind] in word_to_vec
                        and word_to_vec[neg_words[line][word_ind]]):
                    x_neg[line][word_ind] = np.array(
                        word_to_vec[neg_words[line][word_ind]])
                else:
                    x_neg[line][word_ind] = np.array(null_fill)
    x_pos = np.zeros((len(pos_words), max_len, worddim))
    print(x_pos.shape)
    for line in range(len(pos_words)):
        for word_ind in range(max_len):
            if word_ind >= len(pos_words[line]):
                x_pos[line][word_ind] = np.array(null_fill)
            else:
                if (pos_words[line][word_ind] in word_to_vec
                        and word_to_vec[pos_words[line][word_ind]]):
                    x_pos[line][word_ind] = np.array(
                        word_to_vec[pos_words[line][word_ind]])
                else:
                    x_pos[line][word_ind] = np.array(null_fill)

    # x of shape (14012, 447, 300) and y of shape (14012,)
    from sklearn.preprocessing import OneHotEncoder
    y_neg = np.zeros((len(neg_words)))
    y_pos = np.ones((len(pos_words)))
    # Hold out the first tenth of each class for testing, train on the rest;
    # concatenate the positive and negative data. (The original sliced x_pos
    # with len(neg_words) // 10, which would overlap the test split whenever
    # the two classes differ in size.)
    train_data = np.concatenate(
        (x_neg[len(neg_words) // 10:], x_pos[len(pos_words) // 10:]), axis=0)
    train_label = np.concatenate(
        (y_neg[len(neg_words) // 10:], y_pos[len(pos_words) // 10:]), axis=0)
    ohe = OneHotEncoder()
    ohe.fit([[0], [1]])
    train_label = np.array(
        ohe.transform(np.transpose([train_label, ])).toarray())  # one-hot labels
    np.random.seed(231)
    np.random.shuffle(train_data)
    np.random.seed(231)  # same seed keeps data and labels aligned
    np.random.shuffle(train_label)
    test_data = np.concatenate(
        (x_neg[:len(neg_words) // 10], x_pos[:len(pos_words) // 10]), axis=0)
    test_label = np.concatenate(
        (y_neg[:len(neg_words) // 10], y_pos[:len(pos_words) // 10]), axis=0)
    test_label = np.array(
        ohe.transform(np.transpose([test_label, ])).toarray())  # one-hot labels
    np.random.seed(131)
    np.random.shuffle(test_data)
    np.random.seed(131)
    np.random.shuffle(test_label)
    print(test_data.shape)
    print(test_label.shape)
    print(train_data.shape)
    return [train_data, train_label, test_data, test_label]
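# A hedged usage sketch for the function above; the rt-polarity file names are
# assumptions inferred from the `vectors_dir` path inside the function, not
# paths confirmed by the source:
train_data, train_label, test_data, test_label = data_processing(
    'rt-polaritydata/rt-polarity.pos', 'rt-polaritydata/rt-polarity.neg')
print(train_data.shape, train_label.shape)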
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values  # the last column (Profit)

# Data encoding. OneHotEncoder's `categorical_features` argument was removed
# in scikit-learn 0.22; a ColumnTransformer now selects the State column
# (index 3) for dummy coding, which also makes the LabelEncoder step redundant.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('state', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = ct.fit_transform(X)  # dummy columns come first, remaining features follow

# Avoiding the dummy variable trap
X = X[:, 1:]  # drop the first dummy column

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
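# A short evaluation sketch for the fitted model; the original snippet ends
# after constructing the regressor, so everything below is an assumed
# continuation rather than the author's code:
from sklearn.metrics import r2_score

y_pred = regressor.predict(X_test)
print('R^2 on the test set:', r2_score(y_test, y_pred))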