def test_get_dummies_errors():
    """``dd.get_dummies`` raises ``ValueError`` for data that is not categorical."""
    msg = 'data must have category dtype'

    # not Categorical
    # Build the input outside the assertion context so that only the
    # ``get_dummies`` call itself can satisfy the expected error.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)

    with tm.assertRaisesRegexp(ValueError, msg):
        dd.get_dummies(ds)
def test_get_dummies_kwargs():
    """Compare ``dd.get_dummies`` with ``pd.get_dummies`` for several keywords."""
    ser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')

    # prefix / prefix_sep
    expected = pd.get_dummies(ser, prefix='X', prefix_sep='-')
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, prefix='X', prefix_sep='-')
    assert_eq(result, expected)
    tm.assert_index_equal(
        result.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4'])
    )

    # drop_first
    expected = pd.get_dummies(ser, drop_first=True)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, drop_first=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # nan
    ser = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    expected = pd.get_dummies(ser)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # dummy_na
    expected = pd.get_dummies(ser, dummy_na=True)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, dummy_na=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, pd.Index([1, 2, 3, 5, np.nan]))
def test_get_dummies_dtype_raises():
    """Passing ``dtype`` raises a ``ValueError`` whose message matches "0.23.0"."""
    categorical = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
    frame = pd.DataFrame({"A": categorical, "B": [0, 0, 1]})
    dframe = dd.from_pandas(frame, 2)

    with pytest.raises(ValueError) as excinfo:
        dd.get_dummies(dframe, dtype='float64')

    assert excinfo.match("0.23.0")
def test_get_dummies_dtype():
    """``dtype`` is honoured for a dask frame and for a pandas frame input."""
    df = pd.DataFrame({
        "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']),
        "B": [0, 0, 1],
    })
    ddf = dd.from_pandas(df, 2)

    exp = pd.get_dummies(df, dtype='float64')
    res = dd.get_dummies(ddf, dtype='float64')
    assert_eq(exp, res)
    assert res.compute().A_a.dtype == 'float64'

    # dask's get_dummies on a pandas dataframe.
    res_from_pandas = dd.get_dummies(df, dtype='float64')
    assert_eq(res_from_pandas, exp)
    # Check the dtype of the result computed from the pandas input; the
    # previous version re-checked ``res`` and so never verified this path.
    assert res_from_pandas.A_a.dtype == 'float64'
def test_get_dummies_sparse():
    """``sparse=True`` matches pandas for a categorical series and a frame."""
    categorical = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
    ser = pd.Series(categorical)
    dser = dd.from_pandas(ser, 2)

    # Series input
    expected = pd.get_dummies(ser, sparse=True)
    result = dd.get_dummies(dser, sparse=True)
    assert_eq(expected, result)
    assert result.compute().a.dtype == 'uint8'
    assert pd.api.types.is_sparse(result.a.compute())

    # Single-column frame input
    expected = pd.get_dummies(ser.to_frame(name='a'), sparse=True)
    result = dd.get_dummies(dser.to_frame(name='a'), sparse=True)
    assert_eq(expected, result)
    assert pd.api.types.is_sparse(result.a_a.compute())
def test_get_dummies(data):
    """``dd.get_dummies`` on a 2-partition collection matches pandas on ``data``."""
    expected = pd.get_dummies(data)

    dask_data = dd.from_pandas(data, 2)
    result = dd.get_dummies(dask_data)

    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)
def test_get_dummies_sparse_mix():
    """``sparse=True`` on a frame mixing a categorical and an integer column."""
    categorical = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
    frame = pd.DataFrame({"A": categorical, "B": [0, 0, 1]})
    dframe = dd.from_pandas(frame, 2)

    expected = pd.get_dummies(frame, sparse=True)
    result = dd.get_dummies(dframe, sparse=True)

    assert_eq(expected, result)
    assert result.compute().A_a.dtype == 'uint8'
    assert pd.api.types.is_sparse(result.A_a.compute())
def test_get_dummies_object():
    """Object columns are skipped by default and rejected when targeted."""
    frame = pd.DataFrame({
        'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        'b': list('abcdabcd'),
        'c': pd.Categorical(list('abcdabcd')),
    })

    # exclude object columns
    expected = pd.get_dummies(frame, columns=['a', 'c'])
    dframe = dd.from_pandas(frame, 2)
    result = dd.get_dummies(dframe)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # only one of the categorical columns requested
    expected = pd.get_dummies(frame, columns=['a'])
    dframe = dd.from_pandas(frame, 2)
    result = dd.get_dummies(dframe, columns=['a'])
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # cannot target object columns
    msg = 'target columns must have category dtype'
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.get_dummies(dframe, columns=['b'])
def test_get_dummies_errors():
    """``dd.get_dummies`` raises ``NotImplementedError`` for unsupported input."""
    # not Categorical
    # The input is built outside ``pytest.raises`` so that only the
    # ``get_dummies`` call can satisfy the expected exception.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')})
    ddf = dd.from_pandas(df, npartitions=2)
    # Overwrite the metadata with bare 'category' dtypes.
    ddf._meta = make_meta({'x': 'category', 'y': 'category'})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['x', 'y'])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
def test_get_dummies_errors():
    """``dd.get_dummies`` raises ``NotImplementedError`` for unsupported input."""
    # not Categorical
    # The input is built outside ``pytest.raises`` so that only the
    # ``get_dummies`` call can satisfy the expected exception.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)
    with pytest.raises(NotImplementedError):
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({"x": list("abcbc"), "y": list("bcbcb")})
    ddf = dd.from_pandas(df, npartitions=2)
    # Overwrite the metadata with bare "category" dtypes.
    ddf._meta = make_meta({"x": "category", "y": "category"})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["x", "y"])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
def test_get_dummies_sparse_mix():
    """``sparse=True`` on a mixed frame; expected dtype depends on PANDAS_GT_0240."""
    categorical = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
    frame = pd.DataFrame({"A": categorical, "B": [0, 0, 1]})
    dframe = dd.from_pandas(frame, 2)

    expected = pd.get_dummies(frame, sparse=True)
    result = dd.get_dummies(dframe, sparse=True)
    assert_eq(expected, result)

    # The dtype reported for the dummy column differs with the pandas flag.
    exp_dtype = "Sparse[uint8, 0]" if PANDAS_GT_0240 else "uint8"

    assert result.compute().A_a.dtype == exp_dtype
    assert pd.api.types.is_sparse(result.A_a.compute())
def run(self):
    """Read the listings CSV, one-hot encode three columns, and write a CSV.

    Reads ``self.listings_csv_filename`` with dask, keeps a fixed subset of
    columns, dummy-encodes ``property_type``, ``room_type`` and
    ``cancellation_policy``, renames ``id`` to ``listing_id``, computes the
    result, prints its shape and head, and writes it to ``self.output().path``.
    """
    float_columns = (
        'bedrooms',
        'beds',
        'review_scores_accuracy',
        'review_scores_checkin',
        'review_scores_cleanliness',
        'review_scores_communication',
        'review_scores_location',
        'review_scores_rating',
        'review_scores_value',
    )
    dtype = {column: 'float32' for column in float_columns}
    listing = dd.read_csv(self.listings_csv_filename, dtype=dtype)

    use_columns_in_listing = [
        'id',
        'latitude',
        'longitude',
        'property_type',
        'room_type',
        'accommodates',
        'bedrooms',
        'beds',
        'cancellation_policy',
    ]
    listing = listing.loc[:, use_columns_in_listing]

    # property_type, room_type, cancellation_policy
    categorical = ('property_type', 'room_type', 'cancellation_policy')
    listing = listing.categorize(columns=list(categorical))
    listing = dd.get_dummies(listing, columns=list(categorical))

    listing = listing.rename(columns={'id': 'listing_id'})
    computed = listing.compute()

    print(computed.shape)
    print(computed.head())

    with open(self.output().path, "w") as target:
        computed.to_csv(target)
def process_data(X, y=None, test_size=0.20, dummies=False):
    """Split ``X``/``y`` into train and test datasets and draw per-label samples.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Input data; it is passed through ``prepare_dataset`` before use.
    y : array-like, optional
        Labels. When ``None``, an array of ones with one entry per row of
        ``X`` is used.
    test_size : float
        Forwarded to ``train_test_split``.
    dummies : bool
        When true, ``y`` is replaced by ``dd.get_dummies(y)``.

    Returns
    -------
    (train_dataset, test_dataset)
        Two ``Dataset`` objects; ``train_dataset.samples`` holds the stacked
        per-label samples.
    """
    if y is None:
        y = da.ones(X.shape[0])
    # Unique labels are taken before any dummy encoding of ``y``.
    y_uniqs = np.unique(y)
    len_ = X.shape[0]
    X = prepare_dataset(X)
    if dummies:
        y = dd.get_dummies(y)
    shape_ = list(X.shape[1:])

    samples = []
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = []
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    # Select the rows of this chunk carrying the label once,
                    # then pick one of them at random.
                    candidates = xa[ya == y_uniq]
                    pick = random.randint(0, len(candidates) - 1)
                    sample.append([candidates[pick]])
                except Exception:
                    # Best effort: skip chunks from which no sample can be
                    # drawn (e.g. ``randint`` on an empty selection). Unlike a
                    # bare ``except``, this lets KeyboardInterrupt/SystemExit
                    # propagate.
                    continue
                if len(sample) >= 500:
                    break
            samples += sample
    samples = da.vstack(samples)

    # Flatten each row for the split, then restore the per-row shape.
    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
def preprocess(csv_dir, types_path, preproc_path, output_path):
    """
    Perform preprocessing steps on the raw CSV data as prescribed in the
    preprocessors file and store the output to HDF.

    Nothing is recomputed when ``output_path`` already exists.

    :param csv_dir: directory with raw CSV data.
    :param types_path: JSON with type information to correctly parse CSVs.
    :param preproc_path: JSON with preprocessors descriptions.
    :param output_path: destination path for the resulting HDF file with
        processed data.
    :return: None
    """
    print("** Start preprocessing **")
    if not os.path.exists(output_path):
        # Read type & transform configs
        with open(types_path) as jf:
            dtypes = json.load(jf)
        with open(preproc_path) as jf:
            preproc = json.load(jf)

        # The client object is not referenced directly; presumably creating it
        # routes the dask work below to the local cluster -- TODO confirm.
        with LocalCluster() as cluster:
            with Client(cluster):
                df = dd.read_csv(os.path.join(csv_dir, '*.csv'), dtype=dtypes)

                # Apply the configured preprocessors in order.
                for desc in preproc["preprocessors"]:
                    p = create_preprocessor(desc)
                    df = p.apply(df)

                # Convert to known categoricals
                df = dd.get_dummies(df.categorize())
                print("Columns after preprocessing: ", df.columns)
                df.to_hdf(output_path, '/data')
    else:
        print("- file {} already exists, skipping preprocessing".format(output_path))
    print("** Finished preprocessing **")
def make_categorical(
    client: Client,
    n_samples: int,
    n_features: int,
    n_categories: int,
    onehot: bool = False,
) -> Tuple[dd.DataFrame, dd.Series]:
    """Build a categorical dataset with one partition per client worker.

    Each worker is asked (via ``client.submit``) to generate its share of the
    data with ``tm.make_categorical``; the pieces are combined with
    ``dd.from_delayed``.  When ``onehot`` is true the features are passed
    through ``dd.get_dummies`` before being returned.

    Returns the features and the ``"label"`` column as ``(X, y)``.
    """
    workers = _get_client_workers(client)
    n_workers = len(workers)

    def pack(**kwargs: Any) -> dd.DataFrame:
        # Attach the label to the feature frame so both travel together.
        X, y = tm.make_categorical(**kwargs)
        X["label"] = y
        return X

    def local_n_samples(rank: int) -> int:
        # Number of rows requested from the worker at position ``rank``.
        share = n_samples // n_workers
        return min(share, n_samples - rank * share)

    meta = pack(
        n_samples=1,
        n_features=n_features,
        n_categories=n_categories,
        onehot=False,
    )

    futures = [
        client.submit(
            pack,
            n_samples=local_n_samples(rank),
            n_features=n_features,
            n_categories=n_categories,
            onehot=False,
            workers=[worker],
        )
        for rank, worker in enumerate(workers)
    ]

    df = dd.from_delayed(futures, meta=meta)
    y = df["label"]
    X = df[df.columns.difference(["label"])]

    if not onehot:
        return X, y
    return dd.get_dummies(X), y
def test_get_dummies_object():
    """Object columns must be excluded explicitly; otherwise an error is raised."""
    frame = pd.DataFrame({
        'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        'b': list('abcdabcd'),
        'c': pd.Categorical(list('abcdabcd')),
    })
    dframe = dd.from_pandas(frame, 2)

    # Explicitly exclude object columns
    expected = pd.get_dummies(frame, columns=['a', 'c'])
    result = dd.get_dummies(dframe, columns=['a', 'c'])
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # Whole frame, which includes the object column 'b'
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe)

    # The object column as a series
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe.b)

    # The object column targeted by name
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe, columns=['b'])
def weather_cluster(data):
    """
    Add a ``weather_cluster`` column holding a KMeans cluster id computed
    from the weather-related features only.

    The column is assigned on ``data`` itself (only the feature subset used
    for clustering is taken from a copy), and the same object is returned.

    :param data: a dataframe where each row is an hour
    :return: ``data`` with the new categorical column
    """
    print("\tAdding clustering variable based on weather-related features...")

    weather_features = ["weathersit", "temp", "atemp", "hum", "windspeed"]
    subset = data.copy()[weather_features]
    encoded = dd.get_dummies(subset)

    train_part = get_train(encoded)
    holdout_part = get_holdout(encoded)

    # magic numbers, blech
    model = KMeans(n_clusters=5, random_state=SEED).fit(train_part)

    # Labels of the training rows followed by predictions for the holdout rows.
    cluster_ids = da.append(model.labels_, model.predict(holdout_part))
    data["weather_cluster"] = cluster_ids
    data["weather_cluster"] = data["weather_cluster"].astype("category")
    return data
def cluster_variable(data):
    """
    Add a ``cluster`` column holding a KMeans cluster id computed from all
    features except ``cnt``.

    :param data: a dataframe where each row is an hour
    :return: a copy of ``data`` containing the new categorical column
    """
    print("\tAdding cluster variable...")

    data = data.copy()
    encoded = dd.get_dummies(data)

    train_part = get_train(encoded)
    holdout_part = get_holdout(encoded)

    # magic numbers, blech
    model = KMeans(n_clusters=5, random_state=SEED).fit(
        train_part.drop("cnt", axis=1))

    # Labels of the training rows followed by predictions for the holdout rows.
    cluster_ids = da.append(
        model.labels_, model.predict(holdout_part.drop("cnt", axis=1)))
    data["cluster"] = cluster_ids
    data["cluster"] = data["cluster"].astype("category")
    return data
def preprocessing(data):
    """One-hot encode the categorical columns and return a pandas frame.

    ``DayofWeek`` is cast to category on ``data`` itself, the four categorical
    columns are dummy-encoded and computed, the remaining columns (minus
    ``FlightNum`` and ``Diverted``) are computed, and both parts are
    concatenated column-wise. Timing information is printed along the way.
    """
    print('Preprocessing started!')
    start_time = time.time()

    # One-Hot Encoding
    encode_columns = ['UniqueCarrier', 'Origin', 'Dest', 'DayofWeek']
    data['DayofWeek'] = data['DayofWeek'].astype('category')
    categorized = data[encode_columns].categorize()
    data_encoded = dd.get_dummies(categorized).compute()
    print('Data enocded: ', (time.time()-start_time))

    drop_columns = ['UniqueCarrier', 'Origin', 'Dest', 'FlightNum',
                    'Diverted', 'DayofWeek']
    data_reduced = data.drop(drop_columns, axis=1).compute()
    print('Data reduced: ', (time.time() - start_time))

    X = pd.concat([data_reduced, data_encoded], axis=1)
    print('Data concatenated: ', (time.time() - start_time))

    # The duration is taken before the intermediates are released below.
    duration = time.time() - start_time

    # Drop the intermediate frames and collect garbage before returning.
    del data_reduced, data_encoded
    gc.collect()

    print('Duration Preprocessing: ', duration)
    return X
def transform(self, X, y=None):
    """Dummy encode the categorical columns in X

    Parameters
    ----------
    X : pd.DataFrame or dd.DataFrame
    y : ignored

    Returns
    -------
    transformed : pd.DataFrame or dd.DataFrame
        Same type as the input

    Raises
    ------
    ValueError
        If the columns of ``X`` differ from ``self.columns_``.
    TypeError
        If ``X`` is neither a pandas nor a dask DataFrame.
    """
    if not X.columns.equals(self.columns_):
        # Report the same attribute that was compared (``columns_``); the
        # message previously formatted ``self.columns`` instead.
        raise ValueError("Columns of 'X' do not match the training "
                         "columns. Got {!r}, expected {!r}".format(
                             X.columns, self.columns_))
    if isinstance(X, pd.DataFrame):
        return pd.get_dummies(X, drop_first=self.drop_first)
    elif isinstance(X, dd.DataFrame):
        return dd.get_dummies(X, drop_first=self.drop_first)
    else:
        raise TypeError("Unexpected type {}".format(type(X)))
def test_get_dummies_object():
    """Object columns must be excluded explicitly; otherwise an error is raised."""
    columns = {
        'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        'b': list('abcdabcd'),
        'c': pd.Categorical(list('abcdabcd')),
    }
    frame = pd.DataFrame(columns)
    dframe = dd.from_pandas(frame, 2)

    # Explicitly exclude object columns
    expected = pd.get_dummies(frame, columns=['a', 'c'])
    result = dd.get_dummies(dframe, columns=['a', 'c'])
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # Whole frame, which includes the object column 'b'
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe)

    # The object column as a series
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe.b)

    # The object column targeted by name
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe, columns=['b'])
def test_get_dummies_object():
    """Object columns must be excluded explicitly; otherwise an error is raised."""
    columns = {
        "a": pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        "b": list("abcdabcd"),
        "c": pd.Categorical(list("abcdabcd")),
    }
    frame = pd.DataFrame(columns)
    dframe = dd.from_pandas(frame, 2)

    # Explicitly exclude object columns
    expected = pd.get_dummies(frame, columns=["a", "c"])
    result = dd.get_dummies(dframe, columns=["a", "c"])
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # Whole frame, which includes the object column "b"
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe)

    # The object column as a series
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe.b)

    # The object column targeted by name
    with pytest.raises(NotImplementedError):
        dd.get_dummies(dframe, columns=["b"])
def test_get_dummies_kwargs():
    """Compare keyword handling with pandas; ``sparse=True`` must be rejected."""
    ser = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')

    # prefix / prefix_sep
    expected = pd.get_dummies(ser, prefix='X', prefix_sep='-')
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, prefix='X', prefix_sep='-')
    assert_eq(result, expected)
    tm.assert_index_equal(
        result.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4'])
    )

    # drop_first
    expected = pd.get_dummies(ser, drop_first=True)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, drop_first=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # nan
    ser = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    expected = pd.get_dummies(ser)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # dummy_na
    expected = pd.get_dummies(ser, dummy_na=True)
    dser = dd.from_pandas(ser, 2)
    result = dd.get_dummies(dser, dummy_na=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, pd.Index([1, 2, 3, 5, np.nan]))

    # sparse is rejected with an explanatory message
    msg = 'sparse=True is not supported'
    with pytest.raises(NotImplementedError) as excinfo:
        dd.get_dummies(dser, sparse=True)

    assert msg in str(excinfo.value)
def pipeline_casero(data, preprocessing=None, creation=None, reduction=None,
                    selection=None, models=None):
    """
    A homemade pipeline to automate all the steps of data preparation, feature creation, feature selection,
    feature reduction, and outputting a fitted model

    This is not strictly necessary as it does not add more functionality than an sklearn Pipeline, but we
    thought it would be easier to use for our purposes and it has the added benefit of allowing us to
    control the verbosity of the output.

    Every step argument defaults to an empty list (``None`` is treated as "no steps").

    :param data: The data for the pipeline; it must expose a ``columns`` attribute
    :param preprocessing: An iterable containing all the preprocessing steps (functions with signature
        [DataFrame -> DataFrame]
    :param creation: An iterable containing all the feature creation steps (functions with signature
        [DataFrame -> DataFrame]
    :param reduction: An iterable containing all the dimensionality reduction steps (functions with signature
        [DataFrame -> DataFrame]
    :param selection: An iterable containing all the feature selection steps (functions with signature
        [DataFrame -> DataFrame]
    :param models: An array of dicts containing the name for the model ("name"), the sklearn estimator
        ("model"), and the parameters for Grid Search Cross Validation ("params")
    :return: A fitted model that represents the best model out of all the ones in 'models'
    """
    # Replace the ``None`` defaults with fresh lists so that no list object is
    # shared between calls.
    preprocessing = [] if preprocessing is None else preprocessing
    creation = [] if creation is None else creation
    reduction = [] if reduction is None else reduction
    selection = [] if selection is None else selection
    models = [] if models is None else models

    print("Beginning pipeline at {}\n".format(datetime.now()))

    print("Performing preprocessing steps...")
    data = update_df(data, preprocessing)
    print("Preprocessing completed at {}, performed {} steps".format(datetime.now(), len(preprocessing)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    print("Performing feature creation...")
    data = update_df(data, creation)
    print("Feature Creation completed at {}, performed {} steps".format(datetime.now(), len(creation)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    print("Dummifying...")
    data = dd.get_dummies(data)
    print("New Shape of data: {0}\n".format(len(data.columns)))

    print("Performing dimensionality reduction...")
    data = update_df(data, reduction)
    print("Dimensionality reduction completed at {}, performed {} steps".format(datetime.now(), len(reduction)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    train = get_train(data)
    holdout = get_holdout(data)

    print("Performing feature selection...")
    train = update_df(train, selection)
    print("Feature Selection completed at {}, performed {} steps".format(datetime.now(), len(selection)))
    print("New Shape of train: {0}\n".format(len(train.columns)))

    # Keep only the columns that survived feature selection on the train set.
    holdout = holdout[train.columns]

    # The issue here is this assumes the model need the data in the same format, unless we submit pipelines
    # here which seems silly....
    print("Scoring models....")
    best_model = select_best_model(models, train)

    print("Evaluating model on the holdout...")
    final_r2 = r2_score(holdout.cnt, best_model.predict(holdout.drop("cnt", axis=1)))
    print("Final R2: {0}".format(final_r2))

    print("\nPipeline finished! Completed execution at {}. Returning model...".format(datetime.now()))
    return best_model
# --- pandas version: one-hot encode the race column and aggregate by state/year ---
dummies = pd.get_dummies(race_column, prefix='race')
ipums_rejoined = ipums_race_df_sample.join(dummies)
# Binary indicators: Hisp_1 is 1 when Hisp > 0 and Hisp != 9; Educ_1 is 1 when Educ > 9.
ipums_rejoined['Hisp_1'] = ipums_rejoined['Hisp'].apply(lambda x: 1 if (x > 0 ) and (x != 9) else 0)
ipums_rejoined['Educ_1'] = ipums_rejoined['Educ'].apply(lambda x: 1 if x > 9 else 0)
# Mean of each selected column per (State, Year) group.
ipums_grped = ipums_rejoined.groupby(['State', 'Year'])[['Per_Urban', 'Mean_Num_Veh', 'Educ_1', 'Poverty', 'race_1', 'race_2', 'race_3', 'race_4', 'Hisp_1']].mean()

# --- dask version: same steps on the CSV read lazily in 25e6-byte blocks ---
# Only the listed positional columns are read, and they are renamed via `names`.
# NOTE(review): the dtype mapping mentions 'Region', which is not among `names` -- confirm.
ipums_race_df = dd.read_csv('/home/jaala/win-Python/Projects/Abortion/data/ipums_demo_various.csv', \
    names=['Year', 'State', 'Per_Urban', 'Mean_Num_Veh', 'Hisp', 'Race', 'Educ_1', 'Per_Poverty'], \
    usecols=[0, 6, 7, 9, 12, 14, 16, 18], header=0, blocksize=25e6, \
    dtype={'Race': 'category', 'Hisp': 'category', 'State': 'category', 'Region': 'category', 'Per_Urban': 'float64'})
race_column = ipums_race_df['Race']
# categorize() is applied before get_dummies so the categories are known.
dummies = dd.get_dummies(race_column.to_frame().categorize(), prefix='race')
ipums_rejoined = ipums_race_df.join(dummies)
# `meta=('float')` is the plain string 'float' (parentheses without a comma do not make a tuple).
# NOTE(review): 'Hisp' is read with category dtype but compared with integers here -- confirm.
ipums_rejoined['Per_Hisp'] = ipums_rejoined['Hisp'].apply(lambda x: 1 if (x > 0 ) and (x != 9) else 0, meta=('float'))
ipums_rejoined['Perc_w_Bachelors'] = ipums_rejoined['Educ_1'].apply(lambda x: 1 if x > 9 else 0, meta=('float'))
# Unlike the pandas version above, this selection also includes 'race_5'.
ipums_grped = ipums_rejoined.groupby(['State', 'Year'])[['Per_Urban', 'Mean_Num_Veh', 'Perc_w_Bachelors', 'Per_Poverty', 'race_1', 'race_2', 'race_3', 'race_4', 'race_5', 'Per_Hisp']].mean()
# Materialise the grouped means.
ipums_df = ipums_grped.compute()
#df_ipums
# The result of this expression is not assigned (it is only displayed in an interactive session).
ipums_grped['race_1'].describe()
################################################################################
}, blocksize="16 MiB", storage_options={"anon": True}, )  #.head(n=1000)
# (The line above closes a call that starts outside this excerpt.)
print(df.columns)
print(len(df))  # 7,667,792
#storage_options={'key': settings.AWS_ACCESS_KEY_ID,
#                 'secret': settings.AWS_SECRET_ACCESS_KEY})
# Repartition into "10 MiB" partitions and keep the result persisted.
df = df.repartition(partition_size="10 MiB").persist()
# one hot encode the categorical columns
# (categorize() first, then get_dummies on the same column list)
df = df.categorize(categorical_features)
df = dd.get_dummies(df, columns=categorical_features)
# persist so only download once
df = df.persist()
# Feature frame: every column not listed in `output`, with missing values set to 0.
data = df[[c for c in df.columns if c not in output]]
data = data.fillna(0)
# Target: dropoff minus pickup time, converted from seconds to minutes.
durations = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60  # minutes
from dask_ml.model_selection import train_test_split
import dask
# Convert to float32 dask arrays; lengths=True asks dask to compute the chunk lengths.
X = data.to_dask_array(lengths=True).astype("float32")
y = durations.to_dask_array(lengths=True).astype("float32")
def test_get_dummies_errors():
    """``dd.get_dummies`` raises ``NotImplementedError`` for non-categorical data."""
    # not Categorical
    # The input is built outside ``pytest.raises`` so that only the
    # ``get_dummies`` call can satisfy the expected exception.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ds)
#ddf = ddf.drop(["cell_line"],axis=1) print("filling NA") # y = y.compute() with ProgressBar(): for m in ["p.HER2", "p.PLCg2"]: # print (df[m]) train["%s_c" % (m)] = train[m].fillna( train[m].mean()) #, inplace=True ) test["%s_c" % (m)] = test[m].fillna(test[m].mean()) # ddf["%s_c"%(m)] =ddf[m].fillna(ddf[m].mean() )#, inplace=True ) # In[7]: #ddf = dd.get_dummies(ddf.categorize()).persist() train = dd.get_dummies(train.categorize()).persist() test = dd.get_dummies(test.categorize()).persist() # In[8]: #ddf = ddf.drop(["p.HER2","p.PLCg2","cellID","fileID"],axis=1) train = train.drop(["p.HER2", "p.PLCg2", "cellID", "fileID"], axis=1) test = test.drop(["p.HER2", "p.PLCg2", "cellID", "fileID"], axis=1) # In[9]: rounds = {} genes = [ 'b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67', 'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK', 'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.HER2_c', 'p.JNK', 'p.MAP2K3', 'p.MAPKAPK2',
#having a look at the head of the dataset df.head() #finding the null values in the dataset df.isnull().sum().compute() #defining the data and target categorical_variables = df[[ 'Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status' ]] target = df['Purchase'] #creating dummies for the categorical variables data = dd.get_dummies(categorical_variables.categorize()).compute() #converting dataframe to array datanew = data.values #fit the model from dask_ml.linear_model import LinearRegression lr = LinearRegression() lr.fit(datanew, target) #preparing the test data test_categorical = test[[ 'Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status' ]] test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
# In[22]: #Now that we have extracted required derived features from the pickup and dropoff datetime, drop them ddf = ddf.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1) # #### One Hot encoding # In[23]: get_dummy_col = [ "VendorID", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "pickup_hour", "dropoff_hour", "pickup_day", "dropoff_day" ] ddf = ddf.categorize(columns=get_dummy_col) # In[24]: ddf = dpd.get_dummies(ddf, columns=get_dummy_col) print("After one-hot encoding") print(ddf.__class__) print(ddf.shape[0].compute()) # #### Use tip_amount as the target label for training # In[25]: label = ddf[['tip_amount']].compute() ddf = ddf.drop(['tip_amount'], axis=1) ddf = ddf.drop(['total_amount'], axis=1) # #### Transform features and then normalize values # In[26]: from dask_ml.preprocessing import MinMaxScaler min_max_scaler = MinMaxScaler()
# Features are every column except the target genes; targets are the target genes.
X = train_df.drop(target_genes,axis=1)
y = train_df[target_genes]
# In[14]:
print ("cat4egorize")
# Only the "treatment" column is categorized before it is dummy-encoded.
X = X.categorize(columns=["treatment"])
print ("dummies")
my_dummies = dd.get_dummies(X["treatment"])
# Remove the encoded source column and the remaining non-feature columns.
# `my_dummies` is not joined back onto X within this excerpt.
X= X.drop(['treatment', 'cell_line', 'time', 'cellID', 'fileID'],axis=1)
# In[15]:
# y.columns
# In[16]:
# test = my_dummies.compute()
# Names of the columns of `hour` whose dtype is "object".
cat_variables = hour.dtypes[hour.dtypes == "object"].index
cat_variables
# In[29]:
ddf_hour = hour.categorize()
# In[30]:
ddf_hour.head()
# Now we proceed to dummify the selected variables to be able to be used in our models.
# In[31]:
# NOTE(review): `ddf` is used here as the object providing get_dummies
# (presumably a module alias) -- confirm.
hour = ddf.get_dummies(ddf_hour, columns=cat_variables)
# NOTE(review): this prints the column count of `ddf_hour` (the frame before
# dummification), not of the new `hour` -- confirm which was intended.
print("The dataset now contains {} columns.".format(ddf_hour.shape[1]))
# ## Skewness
# Now we will check if there is any skewness in our target variable, if so we will proceed to take the log in order to make it normally distributed.
# In[32]:
plt.subplots(figsize=(15, 6))
sns.distplot(ddf_hour.cnt.compute(), color="red")
plt.title("Distribution of Total Count")
# In[33]:
# Replace the target with log(1 + cnt).
ddf_hour.cnt = np.log1p(ddf_hour.cnt)