def load_swbd_data(data_dir=os.getcwd() + "/data/switchboard"):
    from statsmodels.tools import categorical
    import pandas as pd
    prev_dir = os.getcwd()
    print(data_dir)
    os.chdir(data_dir)
    x_text = list(open(data_dir + "/swbd_utterance.csv", "r").readlines())
    x_text = [s.strip() for s in x_text]
    x_text = [clean_str(sent) for sent in x_text]
    y = pd.read_csv(data_dir + "/swbd_act.csv")
    y = list(open(data_dir + "/swbd_act.csv", "r").readlines())
    a = np.array([s.strip() for s in y])
    y = categorical(a, drop=True)
    #y = y.argmax(1)
    '''
    from scikits.statsmodels.tools import categorical
    In [61]: a = np.array(['a', 'b', 'c', 'a', 'b', 'c'])
    In [62]: b = categorical(a, drop=True)
    In [63]: b.argmax(1)
    Out[63]: array([0, 1, 2, 0, 1, 2])
    '''
    #return [x_text, y]
    return [x_text, y]
def kmeans(request):
    if "header" in request.GET and "n_clusters" in request.GET and "set_id" in request.GET:
        # weight = [float(w) for w in request.GET["weight"].split(",")]
        if "metric" in request.GET:
            metric = request.GET["metric"]
        else:
            metric = "cosine"
        header = request.GET["header"].split(",")
        n_clusters = int(request.GET["n_clusters"])
        cust_set = CustomerSet.objects.filter(id=request.GET["set_id"])
        cust_matrix = numpy.array([])
        dbpk_list = numpy.array([entity.cust.dbpk for entity in cust_set])
        for h in header:
            # Choose header
            cust_column = numpy.array([getattr(entity.cust, h) for entity in cust_set])
            if h in CATEGORICAL_COLUMNS:
                cust_column = categorical(cust_column, drop=True)
            # Stack to matrix
            if cust_matrix.size == 0:
                cust_matrix = cust_column
            else:
                cust_matrix = numpy.column_stack((cust_matrix, cust_column))
        # Normalize
        cust_matrix = scale_linear_by_column(cust_matrix)
        # Weight
        # cust_matrix = numpy.nan_to_num(numpy.multiply(cust_matrix, numpy.array([numpy.array(weight)] * cust_set.count())))
        # Clustering
        kmeans_centres, kmeans_xtoc, kmeans_dist = joker_kmeans.kmeans(
            cust_matrix, joker_kmeans.randomsample(cust_matrix, n_clusters), metric=metric)
        # Output
        result = []
        for i in range(0, len(dbpk_list)):
            # Update cust set configurations
            cust = Customer.objects.get(dbpk=dbpk_list[i])
            cust_set_entity = cust_set.get(cust=cust)
            cust_set_entity.cluster = kmeans_xtoc[i]
            cust_set_entity.cluster_time = datetime.now()
            cust_set_entity.cluster_features = ";".join(header)
            cust_set_entity.cluster_count = n_clusters
            cust_set_entity.cluster_metric = metric
            cust_set_entity.save()
            # Construct response
            entity = {
                "id": cust.id,
                "cluster": kmeans_xtoc[i]
            }
            for h in header:
                entity[h] = cust.__dict__[h]
            result.append(entity)
        return Response(result)
    else:
        return Response(status=status.HTTP_400_BAD_REQUEST)
def load():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (i.e., there is no reference dummy)
    """
    data = _get_data()
    raw_data = categorical(data, col='firm', drop=True)
    ds = du.process_recarray(data, endog_idx=0, stack=False)
    ds.raw_data = raw_data
    return ds
def load_kdd(seed, anomaly_type):
    data = pd.read_csv(os.path.join('data', 'kddcup.data.corrected'), header=None)
    # Columns 1-3 (protocol_type, service, flag) are categorical: expand to
    # dummies, then collapse back to integer codes with argmax.
    for i in [1, 2, 3]:
        a = np.asarray(data.iloc[:, i])
        b = categorical(a, drop=True)
        data.iloc[:, i] = b.argmax(1)
    data.iloc[:, :-1] = normalize(data.iloc[:, :-1], axis=0)
    data_normal = data.iloc[np.where(data.iloc[:, -1] == 'normal.')[0], :-1]
    mask = np.random.choice(np.arange(data_normal.shape[0]), data_normal.shape[0])
    data_normal = data_normal.iloc[mask, :]
    data_anomaly = data.iloc[np.where(data.iloc[:, -1] == anomaly_type)[0], :-1]
    x_train, x_test, t_train, t_test = train_test_split(
        data_normal, [0] * data_normal.shape[0], test_size=0.2, random_state=seed)
    x_test = np.vstack((x_test, data_anomaly))
    t_test = np.hstack((t_test, [1] * data_anomaly.shape[0]))
    print(data_normal.shape)
    print(data_anomaly.shape)
    return [x_train, t_train, x_test, t_test]
def load_pandas():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (i.e., there is no reference dummy)
    """
    from pandas import DataFrame
    from statsmodels.tools import categorical
    data = _get_data()
    raw_data = categorical(data, col='firm', drop=True)
    ds = du.process_recarray_pandas(data, endog_idx=0)
    ds.raw_data = DataFrame(raw_data)
    return ds
imp = sklp.Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit_transform(Dataset)

# PCA
import sklearn.decomposition as skd
pca = skd.PCA(n_components=n, whiten=False)
pca.fit(Dataset)
Dataset_Reduced_Dim = pca.transform(Dataset)

# Train and Test
x_train, x_test, y_train, y_test = sklm.train_test_split(x, y, test_size=0.2)

# Dummy encoding
from statsmodels.tools import categorical
cat_encod = categorical(data, dictnames=False, drop=False)  # may need reshape(-1,1)

### plot ------------------------------------------------------------------
plt.plot(x, y)
plt.title('Training Error by Iteration')
plt.xlabel('Iteration Number')
plt.ylabel('Error')

# plot data in 3D plot
plt.figure()
import sklearn.model_selection as skms
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import cross_val_score
#from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold  # sklearn.cross_validation is deprecated

warnings.filterwarnings('ignore')

#----------------------- Pre-Processing of Timeseries with Pandas -----------------------
dframe = pd.read_csv('Q1Data.csv', header=None)  # header=None because there are no column names
dframe.info()
numdframe = dframe.iloc[:, 1:]
catdframe = dframe.iloc[:, 0]
catdf_encod = categorical(catdframe.values, dictnames=False, drop=True)
numArr = np.asarray(numdframe.values)
catArr = np.asarray(catdf_encod)
Output = numArr[:, 5]
Inp_num = numArr[:, 0:5]
Input = np.concatenate((Inp_num, catArr), axis=1)
Input = np.c_[catArr, Inp_num]
print(Input.shape)

####Q1 (b)########
imp = skp.Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Input_new = imp.fit_transform(Input)

####Q1 (c)######
if e[2] != 'non-event':
    labels.append(e[2])
    features.append(
        featurize(
            np.array(data[[
                'ax', 'ay', 'az', 'g1', 'g2', 'g3', 'gx', 'gy', 'gz',
                'r1', 'r2', 'r3'
            ]]), e, templates))
    raw_data.append(
        np.array(data[['ax', 'ay', 'az']].iloc[e[0]:e[1], ]))
else:
    pass

# Step 4: Classify Features
features = np.array(features)
labels = np.array(labels)
labels_c = categorical(np.array(labels), drop=True)
labels_num = np.argmax(labels_c, axis=1)
features_complete = features[~np.isnan(features).any(axis=1), :]
labels_complete = labels[~np.isnan(features).any(axis=1)]
f_train, f_test, l_train, l_test = train_test_split(
    features_complete, labels_complete, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=250, max_features=.33, oob_score=True)
clf = clf.fit(f_train, l_train)
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
y_predicted = clf.predict(f_test)
""" import seaborn as sns from sklearn.metrics import confusion_matrix from sklearn.ensemble import RandomForestClassifier import matplotlib.pyplot as plt import pandas as pd import numpy as np from statsmodels.tools import categorical from sklearn import tree # ------------------ Loading Dataset --------------------------# dataframe = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data") dataframe = dataframe .drop(dataframe .index[:-1000]) numericdf = dataframe[dataframe .columns[1:9]] categordf = dataframe[dataframe .columns[0]] categordf_en = categorical(categordf.values , drop=True) categordf_en = categordf_en[:, 0:2] numeric_arr = np.asarray(numericdf.values) categor_arr = np.asarray(categordf_en) Output = numeric_arr[:, 7] Input_numeric = numeric_arr[:, 0:6] Input_categor = categor_arr Input = np.concatenate((Input_numeric, Input_categor), axis=1) #---------------------------------------------------------------# RF = RandomForestClassifier(n_estimators=5, random_state=12) RF.fit(Input, Output) Z_RF = RF.predict(Input) CM_RF= confusion_matrix(Output, Z_RF) #--------------------------------------------------------------- DT = tree.DecisionTreeClassifier() DT.fit(Input, Output)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Exercise – Week VIII
Data Programming With Python – Fall / 2017

Multi-class Classification
- Data Pre-processing, SVM, NB, Decision Tree, Random Forest,
  Classification Metrics - Confusion Matrix
"""

### Dummy Coding / Encoding
from statsmodels.tools import categorical
import numpy as np

a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
print(a.reshape(-1, 1))
print(cat_encod)

### Support Vector
from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
svc_linear = svm.SVC(kernel='linear', C=1)
# Classification of data
# Qualitative Data
# Quantitative Data

# Dummy encoding (similar to one-hot encoding)
from statsmodels.tools import categorical
import numpy as np

a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
cat_encod2 = categorical(a, dictnames=False, drop=False)
print(a.reshape(-1, 1))
print(cat_encod)   # drop=True: only the dummy columns
print(cat_encod2)  # drop=False: original variable kept alongside the dummies
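
# A minimal follow-up sketch (not part of the original snippet, reusing `a` and
# `cat_encod` from above): several examples in this collection collapse the
# dummy matrix returned by categorical(..., drop=True) back into integer class
# codes with argmax; the columns are ordered by the sorted unique labels.
codes = cat_encod.argmax(1)
print(codes)  # expected: [0 1 2 0 1 2] for the order Type1, Type2, Type3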
# Divide data into 80% training, 20% testing.
train_indices = list(range(40)) + list(range(50, 90))
test_indices = list(range(40, 50)) + list(range(90, 100))
X_train = X[train_indices]
X_test = X[test_indices]
y_train = Y[train_indices]
y_test = Y[test_indices]

###########################################################################
###### Convert categorical variable to matrix and merge back with training
###### data.

# Fake categorical variable.
catVar = np.array(['a'] * 40 + ['b'] * 40)
catVar = categorical(catVar, drop=True)
X_train = np.concatenate((X_train, catVar), axis=1)

catVar = np.array(['a'] * 10 + ['b'] * 10)
catVar = categorical(catVar, drop=True)
X_test = np.concatenate((X_test, catVar), axis=1)

###########################################################################
# Model and test.
clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=8, n_estimators=50).fit(X_train, y_train)
prob = clf.predict_proba(X_test)[:, 1]  # Only look at P(y==1).
fpr, tpr, thresholds = roc_curve(y_test, prob)
roc_auc_prob = auc(fpr, tpr)
def one_hot_encode(labels):
    # Expand a 1-D array of class labels into a one-hot (dummy) matrix,
    # one column per distinct label.
    labels = categorical(labels, drop=True)
    return labels
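
# A hedged usage sketch (the label values below are illustrative, not from the
# original code): one_hot_encode returns an (n_samples, n_classes) 0/1 matrix.
import numpy as np
from statsmodels.tools import categorical

example_labels = np.array(['cat', 'dog', 'cat', 'bird'])
encoded = one_hot_encode(example_labels)
print(encoded.shape)  # expected: (4, 3)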
        temp.append(tube_specs[key][i])

tube_specs_unique = list(set(temp))
tube_specs_unique.sort()

# dataframe of features
n = df_train_set.shape[0]

# generate month and year columns
X_time = np.zeros((n, 3))
for i in range(n):
    X_time[i, :] = [int(y) for y in df_train_set['quote_date'][i].split('-')]
# Sort the unique values so the column names line up with the dummy columns,
# which categorical() orders by sorted unique value.
cols_year = [str(x) for x in sorted(set(X_time[:, 0]))]
cols_month = [str(x) for x in sorted(set(X_time[:, 1]))]

from statsmodels.tools import categorical
df_year = pd.DataFrame(categorical(X_time[:, 0], drop=True), columns=cols_year)
df_month = pd.DataFrame(categorical(X_time[:, 1], drop=True), columns=cols_month)

# pd.Categorical(...).labels was removed from pandas; .codes is the replacement
df_tube['end_a_1x'] = pd.Categorical(df_tube['end_a_1x']).codes
df_tube['end_a_2x'] = pd.Categorical(df_tube['end_a_2x']).codes
df_tube['end_x_1x'] = pd.Categorical(df_tube['end_x_1x']).codes
df_tube['end_x_2x'] = pd.Categorical(df_tube['end_x_2x']).codes

df_train_set = pd.merge(df_train_set, df_tube, on='tube_assembly_id')
df_train_set.drop('material_id', axis=1, inplace=True)
df_train_set.drop('end_a', axis=1, inplace=True)
df_train_set.drop('end_x', axis=1, inplace=True)

df = pd.DataFrame()
# fill out with zeros, one column for each specs category
    'GB180': [400, 2, 5],
    'GB1440': [400, 2, 5]
}
pars = par_dict[model + str(resolution)]

file_df = pd.read_csv(working_directory + '\\' + fitting_file, header=0)
X = file_df.loc[:, 'MX.5P_up':]
y = file_df['MX.5P_brkpt']
month_labels = [
    'mo-' + str(item) for item in list(sorted(file_df['Month'].unique()))
]
month_list = np.array(file_df['Month'].tolist())
month_cat = categorical(month_list, drop=True)
columns = list(X.columns)
columns.extend(month_labels)
X_cat = np.concatenate((X, month_cat), axis=1)
X = pd.DataFrame(X_cat, columns=columns)

###############################################################################
'prediction data'
###############################################################################
user_df = pd.read_csv(working_directory + '\\' + user_file, header=0)
X_user = user_df.loc[:, :]
month_labels_user = [
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.tools as stools
from scipy import stats
from mpl_toolkits.mplot3d import Axes3D


def to_num(data):
    # Pack a 0/1 dummy row into a single integer (bit b is set when column b is 1).
    return sum([data[b] << b for b in range(len(data))])


raw = pd.read_csv("C:\\Temp\\Random\\Crime_Map.csv")
count = 10000
longi = raw["Longitude"][:count]
lat = raw["Latitude"][:count]
# One-hot encode the offense type, then collapse each row to a single integer code.
encoded = np.array([
    to_num(x)
    for x in stools.categorical(np.array(raw["Offense Type"]), drop=True).astype(int)
])[:count]
kde = stats.gaussian_kde(np.row_stack((longi, lat)))
plt.scatter(longi, lat, c=kde(np.row_stack((longi, lat))))
plt.show()
## 2. Modeling ######################################################################################################

# Focus only on **TRANSFER** and **CASH_OUT** (the types where fraud occurs):
# data slicing and data transformation. Keep only the transaction types of
# interest ('TRANSFER', 'CASH_OUT').
print("## Focus only on **TRANSFER** and **CASH_OUT** (where there are fraud)")
tmpData = raw_data.loc[(raw_data['type'].isin(['TRANSFER', 'CASH_OUT'])), :]

# Data slicing - Drop unnecessary data ('step', 'nameOrig', 'nameDest', 'isFlaggedFraud')
print(
    "## Data slicing - Drop unnecessary data ('step', 'nameOrig', 'nameDest', 'isFlaggedFraud')"
)
# tmpData.drop(['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)
tmpData.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)
tmpData = tmpData.reset_index(drop=True)

# Convert the categorical variable to a numeric variable
a = np.array(tmpData['type'])
b = categorical(a, drop=True)
tmpData['type_num'] = b.argmax(1)
tmpData.drop(['type'], axis=1, inplace=True)

## Plot Correlations of TRANSFER and CASH_OUT transactions and selected features
# print("## Plot Correlations of TRANSFER and CASH_OUT transactions and selected features")
# plotCorrelationHeatmap(tmpData, "TRANSFER and CASH_OUT Correlation")
# plotCorrelationHeatmap(raw_data.loc[(raw_data.type == 'TRANSFER'), :], "TRANSFER Correlation")
# plotCorrelationHeatmap(raw_data.loc[(raw_data.type == 'CASH_OUT'), :], "CASH_OUT Correlation")

## Quickly get the count and the target variable count.
# print("## Plot Transaction count by type")
# ax = tmpData.type.value_counts().plot(kind='bar', title="Transaction count by type", figsize=(6,6))
# for p in ax.patches:
#     ax.annotate(str(format(int(p.get_height()), ',d')), (p.get_x(), p.get_height()*1.01))
# plt.show()
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 18 03:43:20 2018

@author: Ashtami
"""

from statsmodels.tools import categorical
import numpy as np

a = np.array(['Type1', 'Type2', 'Type3', 'Type1', 'Type2', 'Type3'])
cat_encod = categorical(a, dictnames=False, drop=True)
#a.values
print(a.reshape(-1, 1))
print(cat_encod)
import numpy as np
# from keras.utils import np_utils
from statsmodels.tools import categorical

nb_classes = 30

test_X = np.load('test_features.npy')
test_labels = np.load('test_labels.npy')
# print(test_X.shape)
# print(test_labels.shape)

training_X = np.load('train_features.npy')
training_labels = np.load('train_labels.npy')
# print(training_X.shape)
# print(training_labels.shape)

test_Y = categorical(test_labels, drop=True)
training_Y = categorical(training_labels, drop=True)
# print(test_Y.shape)
# print(training_Y.shape)