# Build the regression matrix from the census-income table: label-encoded
# nominal columns first, integer-valued columns last, then dump it as text.
incomedata_np = incomedata.to_numpy()

# Columns that are cast straight to int further down; every other column
# among the first 41 is treated as nominal and label-encoded.
numeric_cols = [0, 5, 16, 17, 18, 29, 38]

## Get nominal data (np.delete returns a copy, so incomedata_np is untouched)
X = np.delete(incomedata_np[:, 0:41], numeric_cols, axis=1)
## Missing values: '?' is replaced by the most frequent value of its column
X = SimpleImputer(missing_values='?', strategy='most_frequent').fit_transform(X)
## Convert each nominal column to integer codes
enc = preprocessing.LabelEncoder()
# Use the real column count instead of a hard-coded 34, so the loop stays
# correct if the imputer output or the input table has fewer columns.
for i in range(X.shape[1]):
    X[:, i] = enc.fit_transform(X[:, i])
X = X.astype(int)
## Integer-valued columns: impute '?' the same way, then convert to int
Y = incomedata_np[:, numeric_cols]
Y = SimpleImputer(missing_values='?', strategy='most_frequent').fit_transform(Y)
Y = Y.astype(int)
## Get the combined matrix for regression (encoded nominal columns, then Y)
X = np.hstack((X, Y))
## save to text
np.savetxt('incomedata/LRtestData', X)

########################################################################################################################

# Read the census-income file without treating any row as a header, then bind
# its NumPy form to both X and incomedata_np.
incomedata = pd.read_csv('incomedata/census-income.data', header=None)
X, incomedata_np = incomedata.to_numpy(), incomedata.to_numpy()
Exemplo n.º 2
0
# Replace NaN entries of the selected rating columns by the column mean and
# store the resulting matrix.
mat = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(
    df[Ratings_cols].values)
np.save("UCI_dataset_04", mat)

# Yacht hydrodynamics: whitespace-delimited file read without a header row.
# The separator is a raw string so the regex token \s reaches pandas as
# written instead of being parsed as an invalid Python string escape.
np.save("UCI_dataset_05",
        pd.read_csv("yacht_hydrodynamics.data", header=None, sep=r"\s+").values)

# Concrete data: Excel sheet whose first row is the header; stored as floats.
np.save(
    file="UCI_dataset_06",
    arr=pd.read_excel("Concrete_Data.xls", header=0).values.astype(float),
)

# Airfoil self-noise: tab-separated file read without a header row.
np.save(
    file="UCI_dataset_07",
    arr=pd.read_csv("airfoil_self_noise.dat", sep="\t", header=None).values,
)

# Communities data: skip the first five columns, mean-impute the remainder
# and store the result.
mat = pd.read_csv("communities.data", header=None, sep=",").values[:, 5:]
# "?" marks a missing entry. Write NaN directly so the float cast below
# yields NaN for those cells; the former post-cast `mat == None` comparison
# could never match on a float array and has been removed.
mat[mat == "?"] = np.nan
mat = mat.astype(float)
mat = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(mat)
np.save("UCI_dataset_08", mat)

# CASP: keep columns F1..F9 plus RMSD (in that order) and store them as floats.
np.save(
    file="UCI_dataset_09",
    arr=pd.read_csv("CASP.csv", header=0).loc[
        :, ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'RMSD']
    ].values.astype(float),
)

# Relation network: comma-separated file read without a header row; the first
# column is dropped and the rest is stored as floats.
np.save(
    file="UCI_dataset_10",
    arr=pd.read_csv(
        "Relation Network (Directed).data", sep=",", header=None
    ).values[:, 1:].astype(float),
)