data.append(items)
data = np.array(data)

# Convert string data to numerical data
# (encode the non-numeric features into numeric form)
label_encoder = []
X_encoded = np.empty(data.shape)  # array with the same shape as data; try print(data.shape) -- probably (time, game played or not)?
for i, item in enumerate(data[0]):  # enumerate yields the index in i and the value in item
    if item.isdigit():
        X_encoded[:, i] = data[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(data[:, i])

X = X_encoded[:, :-1].astype(int)  # features used as model input
y = X_encoded[:, -1].astype(int)   # number of motorcycles (target)

# Split data into training and testing datasets
# (sklearn.model_selection replaces the removed cross_validation module)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=5)

# Extremely Random Forests regressor
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
regressor = ExtraTreesRegressor(**params)
regressor.fit(X_train, y_train)

# Compute the regressor performance on test data
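# The performance computation itself is cut off above. Below is a minimal,
# self-contained sketch of the whole pattern (per-column LabelEncoder for the
# string features, ExtraTreesRegressor on the encoded matrix); the rows and
# column meanings here are made up for illustration, not the original dataset.
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

rows = [['10', 'Mon', 'no', '4'], ['11', 'Tue', 'yes', '7'],
        ['12', 'Wed', 'yes', '9'], ['13', 'Thu', 'no', '3'],
        ['14', 'Fri', 'yes', '8'], ['15', 'Sat', 'no', '2'],
        ['16', 'Sun', 'yes', '6'], ['17', 'Mon', 'no', '5']]
data = np.array(rows)

encoders, X_encoded = [], np.empty(data.shape)
for i, item in enumerate(data[0]):
    if item.isdigit():
        X_encoded[:, i] = data[:, i]          # numeric-looking columns pass through
    else:
        encoders.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = encoders[-1].fit_transform(data[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)
reg = ExtraTreesRegressor(n_estimators=100, max_depth=4, random_state=0).fit(X_train, y_train)
print(reg.score(X_test, y_test))  # R^2 on the held-out quarter of the rows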
          Y_data, eval_set=[(X_test, y_test)], eval_metric='l1',
          early_stopping_rounds=1511)
y_lgbmx1 = lgbmx1.predict(X_test)
trainrms = sqrt(mean_squared_error(y_test, y_lgbmx1))
print("lgbmreg trainrms {}".format(trainrms))

# In[75]:

from sklearn import utils

X_Cdata, X_Ctest, Y_Cdata, Y_Ctest = train_test_split(data, Y, test_size=0.20, random_state=42)
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(Y)
Y_FULL_encoded = lab_enc.transform(Y)
Y_data_encoded = lab_enc.transform(Y_Cdata)
Y_test_encoded = lab_enc.transform(Y_Ctest)
Y_data_encoded.shape
print(utils.multiclass.type_of_target(Y_data_encoded))

lgbmClass1 = lgb.LGBMClassifier(n_estimators=171, num_threads=6, objective='multiclassova')
lgbmClass1.fit(data, Y_FULL_encoded,
Pclass1_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([1]).values[0]
Pclass2_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([2]).values[0]
Pclass3_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([3]).values[0]

# Build the Pclass_Fare_Category feature
test_data['Pclass_Fare_Category'] = test_data.apply(pclass_fare_category,
                                                    args=(Pclass1_mean_fare,
                                                          Pclass2_mean_fare,
                                                          Pclass3_mean_fare),
                                                    axis=1)
pclass_level = preprocessing.LabelEncoder()
# Assign a label to each category
pclass_level.fit(
    np.array([
        'Pclass1_Low', 'Pclass1_High', 'Pclass2_Low', 'Pclass2_High',
        'Pclass3_Low', 'Pclass3_High'
    ]))
# Convert to numeric values
test_data['Pclass_Fare_Category'] = pclass_level.transform(
    test_data['Pclass_Fare_Category'])

# Dummy (one-hot) conversion
pclass_dummies_df = pd.get_dummies(test_data['Pclass_Fare_Category']).rename(
    columns=lambda x: 'Pclass_' + str(x))
test_data = pd.concat([test_data, pclass_dummies_df], axis=1)

# Factorize the Pclass feature
test_data['Pclass'] = pd.factorize(test_data['Pclass'])[0]
def read_xy(PATH):
    dataset = pd.read_csv(PATH)  # read the raw data with pandas
    col = dataset.columns.values.tolist()  # take the header row
    col1 = col[1:]  # take the feature columns
    print(len(col1))  # feature dimensionality
    X_train = np.array(dataset[col1])  # take the data
    y_train = preprocessing.LabelEncoder().fit_transform(
        dataset['class'])  # encode the labels
    print(len(y_train))  # number of encoded labels
    scale = StandardScaler().fit(
        X_train)  # standardize the feature matrix (not needed for probabilistic or tree-based models that do not rely on distance computations)
    X_train = scale.transform(X_train)

    # Feature selection via SelectFromModel, with logistic regression carrying an
    # L1/L2/L1+L2 penalty as the base model.
    # A small C leads to fewer features being selected; with Lasso, the larger
    # alpha is, the fewer features are selected.

    ################### for clf.coef_ of shape 1 * n_features ###################
    '''
    #clf=Lasso(normalize=True,alpha=0.001,max_iter=5000,random_state=0)  # Lasso regression
    #clf = LassoCV()
    #clf=Ridge(normalize=True,alpha=0.001,max_iter=5000,random_state=0)  # ridge regression
    #clf=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.1,max_iter=5000,random_state=0)  # elastic-net regularization
    clf=LinearRegression(normalize=True)
    clf.fit(X_train, y_train)
    #print(clf.coef_)
    importance=np.abs(clf.coef_)
    #print(importance)
    '''

    ################### for clf.coef_ of shape n_classes * n_features ###################
    # The 'newton-cg', 'sag' and 'lbfgs' solvers support only L2 regularization;
    # the 'liblinear' solver supports both L1 and L2 regularization;
    # if dual=True, only the L2 penalty is supported.
    clf = LogisticRegression(penalty='l1', C=0.1, solver='liblinear',
                             random_state=0)  # clf.coef_: n_classes * n_features
    #clf=LogisticRegression(penalty='l2',C=0.1,random_state=0)
    #clf=LR(threshold=0.5, C=0.1)  # threshold is the cutoff on the difference between weight coefficients
    #clf=LinearSVC(penalty='l1',C=0.1,dual=False,random_state=0)
    #clf=LinearSVC(penalty='l2',C=0.1,random_state=0)
    clf.fit(X_train, y_train)
    #print(clf.coef_)

    # Every class has a weight for every attribute; summing the weights of the same
    # attribute across classes gives that dimension's importance score.
    # Method 1:
    importance = np.linalg.norm(clf.coef_, axis=0, ord=1)
    # Method 2:
    #coef=np.abs(clf.coef_)
    #importance=np.sum(coef,axis=0)
    #print(importance)
    mean = np.mean(importance)
    #print(mean)
    #median=np.median(importance)
    #print(median)

    #model=SelectFromModel(clf,prefit=True)
    model = SelectFromModel(clf, prefit=True, threshold=2.0 * mean)
    '''
    model=SelectFromModel(estimator=clf).fit(X_train, y_train)
    importance=model.estimator_.coef_
    threshold=model.threshold_
    print(threshold)
    '''
    # threshold: string or float, optional, default None.
    # Values such as 'median', 'mean' or '1.25 * mean' can be used.
    # If the penalty is set to L1, the threshold used is 1e-5; otherwise 'mean' is the default.
    X_train = model.transform(X_train)
    f_dim = X_train.shape[1]
    print(f_dim)
    y_train = np_utils.to_categorical(y_train)
    return X_train, y_train, f_dim
def prepareData(): target = pd.read_csv("dt-data.txt", names=[ 'Size', 'Occupied', 'Price', 'Music', 'Location', 'VIP', 'Favorite Beer', 'Enjoy' ], skipinitialspace=True, skiprows=[0], index_col=False) target['Size'] = target['Size'].str.replace('\d+:', '') target['Enjoy'] = target['Enjoy'].str.replace(';', '') from sklearn import preprocessing label_processor = preprocessing.LabelEncoder() target.VIP = label_processor.fit_transform(target.VIP) vipKeys = {} for val in target.VIP.unique(): vipKeys[val] = label_processor.inverse_transform(val) target.Enjoy = label_processor.fit_transform(target.Enjoy) target.Size = label_processor.fit_transform(target.Size) sizeKeys = {} for val in target.Size.unique(): sizeKeys[val] = label_processor.inverse_transform(val) target.Occupied = label_processor.fit_transform(target.Occupied) occKeys = {} for val in target.Occupied.unique(): occKeys[val] = label_processor.inverse_transform(val) target.Price = label_processor.fit_transform(target.Price) priceKeys = {} for val in target.Price.unique(): priceKeys[val] = label_processor.inverse_transform(val) target.Music = label_processor.fit_transform(target.Music) musicKeys = {} for val in target.Music.unique(): musicKeys[val] = label_processor.inverse_transform(val) target.Location = label_processor.fit_transform(target.Location) locKeys = {} for val in target.Location.unique(): locKeys[val] = label_processor.inverse_transform(val) target['Favorite Beer'] = label_processor.fit_transform( target['Favorite Beer']) beerKeys = {} for val in target['Favorite Beer'].unique(): beerKeys[val] = label_processor.inverse_transform(val) global inverseKeys inverseKeys = { 'Size': sizeKeys, 'Occupied': occKeys, 'Price': priceKeys, 'Music': musicKeys, 'Location': locKeys, 'VIP': vipKeys, 'Favorite Beer': beerKeys } # -------------------------------------------Tennis Data---------------------------------------------------------------- # target = pd.read_csv("tennis.csv", # names=['outlook', 'temp', 'humidity', 'windy', 'play'], # skipinitialspace=True, skiprows=[0], index_col=False) # from sklearn import preprocessing # label_processor = preprocessing.LabelEncoder() # target.outlook = label_processor.fit_transform(target.outlook) # outlookKeys = {} # for val in target.outlook.unique(): # outlookKeys[val]=label_processor.inverse_transform(val) # target.temp = label_processor.fit_transform(target.temp) # tempKeys = {} # for val in target.temp.unique(): # tempKeys[val] = label_processor.inverse_transform(val) # target.humidity = label_processor.fit_transform(target.humidity) # humKeys = {} # for val in target.humidity.unique(): # humKeys[val] = label_processor.inverse_transform(val) # target.windy = label_processor.fit_transform(target.windy) # winKeys = {} # for val in target.humidity.unique(): # winKeys[val] = label_processor.inverse_transform(val) # target.play = label_processor.fit_transform(target.play) # global inverseKeys # inverseKeys = {"outlook":outlookKeys, "temp":tempKeys, "humidity":humKeys, "windy":winKeys} return target
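# The per-column blocks in prepareData() above all repeat the same pattern
# (fit a LabelEncoder, then record code -> original-value mappings). Below is a
# hedged sketch of the same idea written as a loop, on a small made-up frame
# rather than dt-data.txt; one encoder is kept per column so the mappings stay valid.
import pandas as pd
from sklearn import preprocessing

target = pd.DataFrame({
    'Size': ['Large', 'Small', 'Medium', 'Large'],
    'Occupied': ['High', 'Low', 'Moderate', 'High'],
    'Enjoy': ['Yes', 'No', 'Yes', 'No'],
})

encoders, inverseKeys = {}, {}
for col in target.columns:
    le = preprocessing.LabelEncoder()
    target[col] = le.fit_transform(target[col])
    encoders[col] = le
    # map each integer code back to its original string value
    inverseKeys[col] = {code: value for code, value in enumerate(le.classes_)}

print(target)
print(inverseKeys['Size'])  # e.g. {0: 'Large', 1: 'Medium', 2: 'Small'}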
def fxn():
    # read in the data
    df = pd.read_csv('data.csv')
    # columns to drop
    df = df.drop(['id'], axis=1)
    df.sample(frac=1)

    # gets rid of ? for all columns that need it
    index = []
    count = 0
    for val in range(len(df.iloc[:, 0])):  # .iloc replaces the removed .ix indexer
        flag = False
        for column in df:
            if df[column][val] == '?':
                flag = True
                break
        if flag:
            continue
        if count < 1000:
            index.append(val)
            count += 1
    df = df[df.index.isin(index)]

    # gets all columns which are not ints and integer-encodes them
    obj_df = df.select_dtypes(include=['object']).copy()
    for column in obj_df:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])

    # normalize all points between [0, 1]
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

    # In[589]:

    # make dataset only 1100
    # create 500/500 split between labelled and non-labelled arrays, a 1000-sample semi-supervised set, and a 100-sample validation set
    train, test = np.split(df.sample(frac=1), [int(.8 * len(df))])
    #print(train)
    train = train.values.tolist()
    test = test.values.tolist()

    df_unsupervised = []
    label_nolabels = {}
    for point in train:
        # unlabelled 1000-point data
        df_unsupervised.append(point[1:])
        label_nolabels[tuple(point[1:])] = [point[0]]

    # In[590]:

    #####
    # kmeans_forest 1-10, unsupervised learning adaboosting
    # kmeans1 = KMeans(n_clusters=2).fit(df_unsupervised)
    # #kmeans2 = SpectralClustering(n_clusters=2).fit_predict(df_unsupervised).tolist()
    # kmeans3 = MeanShift().fit(df_unsupervised)
    # #kmeans4 = AgglomerativeClustering(n_clusters=2).fit_predict(df_unsupervised).tolist()
    # kmeans5 = DBSCAN().fit_predict(df_unsupervised).tolist()
    # kmeans6 = GaussianMixture(n_components=2).fit(df_unsupervised)
    # kmeans7 = Birch(n_clusters=2).fit(df_unsupervised)
    # kmeans8 = BayesianGaussianMixture(n_components=2).fit(df_unsupervised)
    # classifiers = [kmeans1, kmeans3, kmeans5, kmeans6, kmeans7, kmeans8]

    # kmeans forest 1-50, unsupervised learning for adaboosting
    # (the fifty identical, independently initialised KMeans fits are written as a
    # list comprehension instead of fifty copy-pasted kmeans1 ... kmeans50 assignments)
    classifiers = [
        KMeans(n_clusters=2, init='random',
               n_init=10).fit(np.asarray(df_unsupervised))
        for _ in range(50)
    ]

    # In[591]:

    # make csv in form of rowNumber, clfNumber,
clf prediction on that row answers = [] for point in range(len(df_unsupervised)): for clf in range(len(classifiers)): answers.append([ point, clf, classifiers[clf].predict([df_unsupervised[point]]) ]) count = 0 f = open("answer_file.csv", "w") f.write('question,worker,answer;\n') for answer in answers: count += 1 f.write( str(answer[0]) + ',' + str(answer[1]) + ',' + str(int(answer[2])) + '\n') f.close() p = open("result_file.csv", "w") p.close() # In[592]: #run VI BP import subprocess subprocess.call([ "python", "run.py", "methods/c_EM/method.py", "answer_file.csv", "result_file.csv", "decision-making" ]) # In[593]: #extract results, get noisy labels and filepath = "result_file.csv" noisy_labels = [] with open(filepath) as fp: for line in fp: questionAnswer = line.split(',') noisy_labels.append(questionAnswer) # In[594]: #assign noisy label to proper row df_noise_x = [] df_noise_y = [] for question in noisy_labels: if question[0].rstrip() == 'question': continue df_noise_x += [df_unsupervised[int(question[0].rstrip())]] df_noise_y.append(int(question[1].rstrip())) count_vi = 0 for el in range(len(df_noise_x)): if label_nolabels[tuple(df_noise_x[el])][0] != df_noise_y[el]: count_vi += 1 print(count_vi, len(df_noise_x)) # In[595]: df_noise_y2 = [] for el in df_noise_y: df_noise_y2.append(int(el)) df_noise = [] for el in range(len(df_noise_x)): new = df_noise_x[el] new.append(df_noise_y2[el]) df_noise.append(new) #need to shuffle the data random.shuffle(df_noise) df_noise_x = [] df_noise_y = [] for row in df_noise: df_noise_x.append(row[:-1]) df_noise_y.append(row[-1:][0]) # In[596]: #run AdaBoost from Sklearn on noisy data bdt2 = AdaBoostClassifier(DecisionTreeClassifier(), algorithm="SAMME", n_estimators=20) bdt2.fit(df_noise_x, df_noise_y) # In[597]: #Ada boosting on noisy data error rate errors = [] count1 = 0 for point in test: est = bdt2.predict([point[:-1]]) true = int(point[-1:][0]) est = int(est[0]) if est == true: errors.append([point[:-1], 0]) else: count1 += 1 errors.append([point[:-1], 1]) # error rate, noisy -> baseline return (count1 / len(test))
def NominalToNumeric(self):
    l_pre = preprocessing.LabelEncoder()
    self.dataset = self.dataset.apply(l_pre.fit_transform)
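# Note that DataFrame.apply(l_pre.fit_transform) refits the single encoder on each
# column in turn, so l_pre only remembers the mapping of the last column it saw.
# A small sketch (on a made-up frame, not the class above) of one common alternative:
# a defaultdict that hands out a separate LabelEncoder per column, so every
# column's mapping can still be inverted later.
from collections import defaultdict

import pandas as pd
from sklearn import preprocessing

dataset = pd.DataFrame({'color': ['red', 'blue', 'red'],
                        'shape': ['box', 'ball', 'ball']})

encoders = defaultdict(preprocessing.LabelEncoder)
encoded = dataset.apply(lambda col: encoders[col.name].fit_transform(col))
restored = encoded.apply(lambda col: encoders[col.name].inverse_transform(col))
print(encoded)
print(restored)  # matches the original frame, column by column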
def process_reference_log(parameters, verbose): output_path = os.path.join(parameters['output_folder'], parameters['event_log']) if not os.path.exists(output_path): os.mkdir( os.path.join(parameters['output_folder'], parameters['event_log'])) else: print("The directory for the event log already exists!") parameters['output_folder'] = output_path reference_log_df = analyzer.load_reference_log(parameters['reference_log']) max_trace_length, n_caseid, n_activity, activities = analyzer.prescriptive_analysis( reference_log_df) parameters['max_trace_length'] = max_trace_length parameters['n_caseid'] = n_caseid parameters['n_activities'] = n_activity parameters['activities'] = activities if verbose > 1: # Print the distribution of the activities and store the plot in the output folder activities_counted = analyzer.get_activity_distribution( reference_log_df, activities) plotting.plot_barchart_from_dictionary( activities_counted, "Activity Distribution Reference Log (" + parameters['event_log'] + ")", "Activity", "Number of Occurrence", save=True, output_file=parameters['output_folder']) # Extract labels (i.e. names of activities that occur as labels) for the data set and encode them reference_y = example_creator.get_label( reference_log_df.groupby('CaseID').agg({'Activity': lambda x: list(x)})) # Calculate imbalance degree imbalance.calculate_imbalance_degree(reference_y) if verbose > 1: # Print the distribution of the labels of the reference log labels_counted = analyzer.get_label_distribution( reference_y, set(reference_y)) plotting.plot_barchart_from_dictionary( labels_counted, "Label Distribution Reference Log (" + parameters['event_log'] + ")", "Label", "Number of Occurrence", save=True, output_file=parameters['output_folder']) # Encode the labels extracted from the reference log and export them to the output folder label_encoder = preprocessing.LabelEncoder() label_encoder.fit(reference_y) support.export_encoding(parameters['output_folder'], label_encoder) le_name_mapping = dict( zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))) parameters['encoding'] = le_name_mapping if verbose > 1: print("Encoding Mapping: ") print(parameters['encoding']) # Encode the the reference training samples reference_y_enc = label_encoder.transform(reference_y) # Depending on the variant calculate a cost matrix if parameters['cost'] == 'COST_SUM' or parameters[ 'cost'] == 'OPTIMIZED_COST' or parameters[ 'cost'] == 'APPROXIMATE_COST': cost_matrix = cm.calculate_cost_matrix(reference_y_enc) print(cost_matrix) else: cost_matrix = None reference_y_enc = np.asarray(reference_y_enc) reference_y_one_hot = np_utils.to_categorical(reference_y_enc, label_encoder.classes_.size) all_labels_enc = set(reference_y_enc) parameters['labels_enc'] = all_labels_enc parameters['labels'] = set(reference_y) return reference_y_one_hot, cost_matrix, parameters
def label_X(X_train, X_dev, X_test):
    le = preprocessing.LabelEncoder()
    X_train = label(le, X_train)
    X_dev = label(le, X_dev)
    X_test = label(le, X_test)
    return X_train, X_dev, X_test
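# The label() helper used above is not shown in this excerpt. A hypothetical
# sketch of one way it could behave: fit the shared encoder the first time it is
# called (on the training split) and only transform afterwards, so dev and test
# reuse the training vocabulary. The name and behaviour here are assumptions.
import numpy as np
from sklearn import preprocessing

def label(le, column):
    # fit_transform on the first call (encoder not yet fitted), transform afterwards
    if not hasattr(le, 'classes_'):
        return le.fit_transform(column)
    return le.transform(column)

le = preprocessing.LabelEncoder()
X_train = label(le, np.array(['cat', 'dog', 'cat']))
X_dev = label(le, np.array(['dog', 'cat']))
X_test = label(le, np.array(['cat']))
print(X_train, X_dev, X_test)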
data1['STARTING_LATITUDE'].fillna(data1['STARTING_LATITUDE'].mean(), inplace=True)
data1['TIMESTAMP'] = pd.to_datetime(data1['TIMESTAMP'])
data1['TIMESTAMP'] = (data1['TIMESTAMP'] -
                      data1['TIMESTAMP'].min()) / np.timedelta64(1, 'D')
data1['STARTING_LONGITUDE'].fillna(data1['STARTING_LONGITUDE'].mean(), inplace=True)
data1['DESTINATION_LATITUDE'].fillna(data1['DESTINATION_LATITUDE'].mean(), inplace=True)
data1['DESTINATION_LONGITUDE'].fillna(data1['DESTINATION_LONGITUDE'].mean(), inplace=True)
data1['TOTAL_LUGGAGE_WEIGHT'].fillna(0.0, inplace=True)
data1['WAIT_TIME'].fillna(0.0, inplace=True)

lbl1 = preprocessing.LabelEncoder()
lbl1.fit(list(data1['VEHICLE_TYPE'].values))
data1['VEHICLE_TYPE'] = lbl1.transform(list(data1['VEHICLE_TYPE'].values))

#data1.hist()
#plt.show()

fancy = data1.corr()
fancy.to_csv('correlation1.csv')

y1 = data1['FARE']
del data1['FARE']
del data1['ID']
X1 = data1

data2['STARTING_LATITUDE'].fillna(data2['STARTING_LATITUDE'].mean(), inplace=True)
def classify(algorithm, fname, input_data, label_name, n_cores, random_state): train_y = np.array(input_data[label_name]) input_data = input_data.drop('ID', axis=1) training_x = input_data.drop(label_name, axis=1) le = preprocessing.LabelEncoder() le.fit(train_y) train_y = le.transform(train_y) cv_metrics = pd.DataFrame() # 10-fold cross validation predicted_n_actual_pd = pd.DataFrame( columns=['ID', 'predicted', 'actual', 'fold']) kf = KFold(n_splits=10, shuffle=True, random_state=random_state) fold = 1 for train, test in kf.split(training_x): # number of train and test instances is based on training_x. train_cv_features, test_cv_features, train_cv_label, test_cv_label = training_x.iloc[ train], training_x.iloc[test], train_y[train], train_y[test] if algorithm == 'GB': temp_classifier = GradientBoostingClassifier(n_estimators=300, random_state=1) elif (algorithm == 'RF'): temp_classifier = RandomForestClassifier(n_estimators=300, random_state=1, n_jobs=n_cores) elif (algorithm == 'M5P'): temp_classifier = ExtraTreesClassifier(n_estimators=300, random_state=1, n_jobs=n_cores) elif (algorithm == 'KNN'): temp_classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=n_cores) elif (algorithm == 'NEURAL'): temp_classifier = MLPClassifier(random_state=1) temp_classifier.fit(train_cv_features, train_cv_label) temp_prediction = temp_classifier.predict(test_cv_features) predicted_n_actual_pd = predicted_n_actual_pd.append(pd.DataFrame({ 'ID': test, 'actual': test_cv_label, 'predicted': temp_prediction, 'fold': fold }), ignore_index=True, sort=True) fold += 1 try: roc_auc = round( roc_auc_score(predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()), 3) except ValueError: roc_auc = 0.0 matthews = round( matthews_corrcoef(predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()), 3) balanced_accuracy = round( balanced_accuracy_score(predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()), 3) f1 = round( f1_score(predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()), 3) try: tn, fp, fn, tp = confusion_matrix( predicted_n_actual_pd['actual'].to_list(), predicted_n_actual_pd['predicted'].to_list()).ravel() except: tn, fp, fn, tp = 0, 0, 0, 0 cv_metrics = cv_metrics.append(pd.DataFrame(np.column_stack(['cv',roc_auc, matthews,\ balanced_accuracy, f1, tn, fp, fn, tp]),\ columns=['type','roc_auc','matthew','bacc','f1','TN','FP','FN','TP']), ignore_index=True, sort=True) cv_metrics = cv_metrics.round(3) cv_metrics = cv_metrics.astype({ 'TP': 'int64', 'TN': 'int64', 'FP': 'int64', 'FN': 'int64' }) cv_metrics = cv_metrics[[ 'type', 'matthew', 'f1', 'bacc', 'roc_auc', 'TP', 'TN', 'FP', 'FN' ]] predicted_n_actual_pd['predicted'] = le.inverse_transform( predicted_n_actual_pd['predicted'].to_list()) predicted_n_actual_pd['actual'] = le.inverse_transform( predicted_n_actual_pd['actual'].to_list()) fname_predicted_n_actual_pd = os.path.join( output_result_dir, 'cv_{}_predited_data.csv'.format(algorithm)) predicted_n_actual_pd['ID'] = predicted_n_actual_pd['ID'] + 1 predicted_n_actual_pd = predicted_n_actual_pd.sort_values(by=['ID']) predicted_n_actual_pd.to_csv(fname_predicted_n_actual_pd, index=False) return cv_metrics
def read_files(tarfname): """Read the training and development data from the sentiment tar file. The returned object contains various fields that store sentiment data, such as: train_data,dev_data: array of documents (array of words) train_fnames,dev_fnames: list of filenames of the doccuments (same length as data) train_labels,dev_labels: the true string label for each document (same length as data) The data is also preprocessed for use with scikit-learn, as: count_vec: CountVectorizer used to process the data (for reapplication on new data) trainX,devX: array of vectors representing Bags of Words, i.e. documents processed through the vectorizer le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication) target_labels: List of labels (same order as used in le) trainy,devy: array of int labels, one for each document """ import tarfile tar = tarfile.open(tarfname, "r:gz") trainname = "train.tsv" devname = "dev.tsv" for member in tar.getmembers(): if 'train.tsv' in member.name: trainname = member.name print("trainname: ", trainname) elif 'dev.tsv' in member.name: devname = member.name class Data: pass sentiment = Data() print("-- train data") sentiment.train_data, sentiment.train_labels = read_tsv(tar, trainname) print(len(sentiment.train_data)) print("-- dev data") sentiment.dev_data, sentiment.dev_labels = read_tsv(tar, devname) print(len(sentiment.dev_data)) print("-- transforming data and labels") ### without any vectorizer sentiment.trainX = sentiment.train_data sentiment.devX = sentiment.dev_data from sklearn import preprocessing sentiment.le = preprocessing.LabelEncoder() sentiment.le.fit(sentiment.train_labels) sentiment.target_labels = sentiment.le.classes_ sentiment.trainy = sentiment.le.transform(sentiment.train_labels) sentiment.devy = sentiment.le.transform(sentiment.dev_labels) ## feature generation sentiment.train_posX, sentiment.train_negX = splitPosNegData( sentiment.trainX, sentiment.trainy) '''tfidf vectorizer''' # sentiment.pos_vec = TfidfVectorizer(ngram_range = (1,2)) # sentiment.pos_vocab = sentiment.pos_vec.fit(sentiment.train_posX).vocabulary_ # # print(sentiment.pos_vocab) # sentiment.neg_vec = TfidfVectorizer(ngram_range = (1, 2)) # sentiment.neg_vocab = sentiment.neg_vec.fit(sentiment.train_negX).vocabulary_ # print(sentiment.neg_vocab) ### get pos, neg vector on train import pickle from sklearn.feature_extraction.text import TfidfVectorizer sentiment.tfidf_vect = TfidfVectorizer(ngram_range=(1, 2)) print("train_data type: ", type(sentiment.train_data)) sentiment.trainX = sentiment.tfidf_vect.fit_transform(sentiment.train_data) sentiment.pos_matrix = sentiment.tfidf_vect.transform(sentiment.train_posX) sentiment.neg_matrix = sentiment.tfidf_vect.transform(sentiment.train_negX) print("feature names:") print(len(sentiment.tfidf_vect.get_feature_names())) output = open('pos_neg_matrix.pkl', 'wb') pickle.dump([ sentiment.pos_matrix, sentiment.neg_matrix, sentiment.tfidf_vect.get_feature_names() ], output) print("dump matrix done...") output.close() sentiment.devX = sentiment.tfidf_vect.transform(sentiment.dev_data) tar.close() return sentiment
df.columns

# In[5]:

X = df.drop(['default payment next month', 'target'], axis=1)

# In[6]:

encoders = {}
X_num = X.copy()
label_cols = ['sex', 'education', 'marriage', 'age', 'target']
for col in X_num.columns.tolist():
    if col in label_cols:
        encoders[col] = preprocessing.LabelEncoder().fit(X_num[col])
        X_num[col] = encoders[col].transform(X_num[col])
X_num = X_num.drop(['sex'], axis=1)
y = df['target'].copy()
s = encoders['sex'].transform(X['sex'])

# ## Create Tensors

# In[7]:

X_tensor = torch.tensor(X_num.values, device=device).double()
noise = torch.randn([X_tensor.shape[0], 5], device=device).double()
X_noised = torch.cat((X_tensor, noise), 1)
s_tensor = torch.tensor(s, device=device).double().unsqueeze(1)
from sklearn import preprocessing, metrics
from sklearn.metrics import average_precision_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv("student-prf.csv", sep=';', header=0)
df = df.apply(preprocessing.LabelEncoder().fit_transform)
df = np.array(df)

selected_column = np.arange(25)
selected_column = np.append(selected_column, [28, 29])
X = df[:400, selected_column]
y = df[:400, 26]

K = np.arange(1, 23, 2)
K = np.append(K, 30)
for i in range(12):
    knn_clf = KNeighborsClassifier(n_neighbors=K[i])
    knn_clf.fit(X, y)
    predicted = knn_clf.predict(df[400:, selected_column])
    expected = df[400:, 26]
    report = metrics.classification_report(expected, predicted)
    print(report)
    print()

# the average precision values are collected manually from the printed reports
precision = [
    0.54, 0.53, 0.53, 0.53, 0.49, 0.50, 0.54, 0.63, 0.63, 0.63, 0.45, 0.45
]
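# Instead of copying the average precision out of the printed reports by hand,
# the same number can be pulled out programmatically. A small self-contained
# sketch with made-up labels (not the student data above): precision_score with
# average='weighted' reproduces the "weighted avg" precision line of classification_report.
from sklearn import metrics

expected = [0, 0, 1, 1, 1, 2, 2]
predicted = [0, 1, 1, 1, 2, 2, 2]

print(metrics.classification_report(expected, predicted))
print(metrics.precision_score(expected, predicted, average='weighted'))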
def load_data(p=100, type="doc2vec"): sources = {'./negative.txt': 'DOC_NEG', './positive.txt': 'DOC_POS'} sentences = LabeledLineSentence(sources) pos_lst = np.genfromtxt('pos_lst', dtype='str') neg_lst = np.genfromtxt('neg_lst', dtype='str') pos_lst, neg_lst = set(pos_lst), set(neg_lst) senword_lst = pos_lst.union(neg_lst) X, y, words, vocab = [], [], [], [] if type == "doc2vec": model = Doc2Vec(min_count=1, window=10, vector_size=p, sample=1e-4, negative=5, workers=8) model.build_vocab(sentences.to_array()) model.train(sentences.sentences_perm(), epochs=20, total_examples=model.corpus_count) for line in sentences.to_array(): sen_tmp = [s for s in line[0] if s in senword_lst] words.append(sen_tmp) vocab.extend(sen_tmp) vocab = list(set(vocab)) prefix_tmp = line[1] X.append(model[prefix_tmp][0]) if 'POS' in prefix_tmp[0]: y.append(1) if 'NEG' in prefix_tmp[0]: y.append(-1) if type == "word2vec": words_word2vec = [] for line in sentences.to_array(): words_word2vec.append(line[0]) model_ug_cbow = Word2Vec(sg=0, size=p, negative=5, window=2, min_count=2, workers=2, alpha=0.065, min_alpha=0.065) model_ug_cbow.build_vocab(words_word2vec) vocab_lst = set(model_ug_cbow.wv.vocab) for epoch in range(30): model_ug_cbow.train(words_word2vec, total_examples=len(words_word2vec), epochs=1) model_ug_cbow.alpha -= 0.002 model_ug_cbow.min_alpha = model_ug_cbow.alpha for line in sentences.to_array(): sen_tmp = set(line[0]) sen_tmp = set(line[0]).intersection(vocab_lst) word_ave = np.array([model_ug_cbow.wv[wd] for wd in sen_tmp]) if len(word_ave) > 0: word_ave = np.mean(word_ave, axis=0) else: word_ave = np.zeros(p) prefix_tmp = line[1] X.append(word_ave) if 'POS' in prefix_tmp[0]: y.append(1) if 'NEG' in prefix_tmp[0]: y.append(-1) if type == "googlenews": googlenews = KeyedVectors.load_word2vec_format( '../GoogleNews-vectors-negative300.bin', binary=True) vocab_lst = set(googlenews.vocab).intersection(senword_lst) for line in sentences.to_array(): sen_tmp = [s for s in line[0] if s in vocab_lst] word_ave = np.array([googlenews[wd] for wd in sen_tmp]) words.append(sen_tmp) vocab.extend(sen_tmp) vocab = list(set(vocab)) # if len(word_ave) > 0: # word_ave = np.mean(word_ave, axis=0) # else: # word_ave = np.zeros(300) prefix_tmp = line[1] # X.append(word_ave) if 'POS' in prefix_tmp[0]: y.append(1) if 'NEG' in prefix_tmp[0]: y.append(-1) le = preprocessing.LabelEncoder() le.fit(vocab) vocab_num = le.transform(vocab) dict_emb = [] for i in range(len(vocab_num)): wd = le.inverse_transform([i]) dict_emb.append(googlenews[wd][0]) dict_emb, y, words, vocab = np.array(dict_emb), np.array(y), np.array( words), np.array(vocab) le_lst = [] for i in range(len(words)): le_lst.append(le.transform(words[i])) le_lst = np.array(le_lst) return dict_emb, y, le_lst, vocab
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# step 2 : downloading the data :
!wget -O drug200.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv

# step 3 : reading data using Pandas data frame
my_data = pd.read_csv("drug200.csv", delimiter=",")
my_data[0:5]

# --> Pre-processing
X = my_data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

from sklearn import preprocessing

le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F', 'M'])
X[:, 1] = le_sex.transform(X[:, 1])

le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = le_BP.transform(X[:, 2])

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL', 'HIGH'])
X[:, 3] = le_Chol.transform(X[:, 3])

X[0:5]
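# The snippet above imports DecisionTreeClassifier but stops before training.
# A hedged, self-contained sketch of the next step under the same encoding scheme,
# using a few made-up rows in place of drug200.csv (the drug labels are invented).
# Note that LabelEncoder sorts its classes, so the integer codes follow alphabetical
# order rather than the order passed to fit().
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

rows = np.array([[23, 'F', 'HIGH', 'HIGH', 25.3, 'drugY'],
                 [47, 'M', 'LOW', 'HIGH', 13.1, 'drugC'],
                 [47, 'M', 'LOW', 'NORMAL', 10.1, 'drugX'],
                 [28, 'F', 'NORMAL', 'HIGH', 7.8, 'drugX']], dtype=object)
X = rows[:, :5].copy()
y = rows[:, 5]

X[:, 1] = preprocessing.LabelEncoder().fit(['F', 'M']).transform(X[:, 1])
X[:, 2] = preprocessing.LabelEncoder().fit(['LOW', 'NORMAL', 'HIGH']).transform(X[:, 2])
X[:, 3] = preprocessing.LabelEncoder().fit(['NORMAL', 'HIGH']).transform(X[:, 3])

tree = DecisionTreeClassifier(criterion='entropy', max_depth=4)
tree.fit(X.astype(float), y)
print(tree.predict([[30, 0, 2, 0, 12.0]]))  # age 30, F, NORMAL BP, HIGH cholesterol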
titanic_test.Fare[titanic_test['Fare'].isnull()] = titanic_test['Fare'].mean()

### Type casting means conversion of object variables to categorical variables.
### If we look at the data type of Sex, as of now it is an object (or sometimes an int);
### it should be converted to a categorical variable.
titanic_test['Sex'] = titanic_test['Sex'].astype('category')
### We can check with the info() method whether Sex changed to categorical or not.
titanic_test.info()

#### Change all categorical variables in the same way.
titanic_test['Pclass'] = titanic_test['Pclass'].astype('category')
titanic_test['Embarked'] = titanic_test['Embarked'].astype('category')

##### Label encoding: all categorical input variables should be converted to
##### numerics because the decision tree algorithm cannot understand categorical variables.
titanic_test1 = titanic_test.copy()
##le = preprocessing.LabelEncoder()
### le is the label encoder already created on the training data; we can reuse that le.
le = preprocessing.LabelEncoder()  # label encoder
titanic_test1.Pclass = le.fit_transform(titanic_test1.Pclass)
titanic_test1.Sex = le.fit_transform(titanic_test1.Sex)
titanic_test1.Embarked = le.fit_transform(titanic_test1.Embarked)

x_test = titanic_test1[['Pclass', 'Sex', 'Embarked', 'Fare']]
#x_train = titanic_train1[['Fare']]
dt = joblib.load("dt_fit2.pkl")
titanic_test1['Survived'] = dt.predict(x_test)
titanic_test1.to_csv("submission.csv", columns=['PassengerId', 'Survived'], index=False)
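# Caveat with the block above: calling fit_transform again on the test frame can
# assign different integer codes than the training run did (for example when a
# category is missing from the test set). A small self-contained sketch of the
# usual remedy -- keep the encoder fitted on the training column and only transform
# at test time. The toy 'Embarked' values are illustrative, not the actual Titanic data.
import pandas as pd
from sklearn import preprocessing

train_col = pd.Series(['S', 'C', 'Q', 'S'])
test_col = pd.Series(['C', 'C', 'S'])     # note: no 'Q' in the test set

le_embarked = preprocessing.LabelEncoder().fit(train_col)
print(le_embarked.transform(train_col))   # [2 0 1 2]
print(le_embarked.transform(test_col))    # [0 0 2] -- same codes as training
print(preprocessing.LabelEncoder().fit_transform(test_col))  # [0 0 1] -- codes shift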
def MLP(name, input_dir, best_dir, output): if not os.path.exists(best_dir): os.makedirs(best_dir) best_dir_dat = "/".join((best_dir, name)) if not os.path.exists(best_dir_dat): os.makedirs(best_dir_dat) colnames = "HType,ABType,dimension,learnFac,margin,constr,LType,MLP_acc,MLP_wF1,MLP_epoch" with open(output, "w") as file: file.write(colnames) file.write("\n") models = sorted(os.listdir(input_dir)) for model in models: modelpath = "/".join((input_dir, model)) files = sorted(os.listdir(modelpath)) # create model subdir to store best MLP models best_subdir = "/".join((best_dir_dat, model)) if not os.path.exists(best_subdir): os.makedirs(best_subdir) for i, file in enumerate(files): print(i) # embedding datasets labelpath = "/".join((modelpath, file)) dataset = pd.read_csv(labelpath, index_col=0) # specify file path to store best MLP model [for later] filepath = best_subdir + "/" + file[:-4] + ".hdf5" ################################################################################ ############################# DATA SPLIT ############################## ################################################################################ lb = preprocessing.LabelBinarizer() lb.fit(list(dataset["class"])) X_train = dataset[dataset["split"] == "LRN"].iloc[:, 1:-2].values y_train = dataset[dataset["split"] == "LRN"].iloc[:, -1].values # get weights first weights = compute_class_weight("balanced", np.unique(y_train), y_train) # then transform y_train = lb.transform(y_train) X_valid = dataset[dataset["split"] == "VLD"].iloc[:, 1:-2].values y_valid = dataset[dataset["split"] == "VLD"].iloc[:, -1].values y_valid = lb.transform(y_valid) X_test = dataset[dataset["split"] == "TST"].iloc[:, 1:-2].values y_test = dataset[dataset["split"] == "TST"].iloc[:, -1].values y_test = lb.transform(y_test) ################################################################################ ############################# CLASSIFIER STRUCTURE ############################## ################################################################################ classifier = Sequential() dim = len(dataset.iloc[0, 1:-2]) nodes = dim * 2 # Hidden layer classifier.add( Dense(nodes, activation="sigmoid", kernel_initializer="uniform", input_dim=dim)) # Output layer classifier.add( Dense(9, activation="softmax", kernel_initializer="uniform")) # compile the model sgd = optimizers.SGD(lr=0.01, decay=0.0, momentum=0.0, nesterov=False) classifier.compile(optimizer=sgd, loss="categorical_crossentropy", metrics=["accuracy"]) ################################################################################ ############################# MODEL FITTING ############################## ################################################################################ # checkpoint best model checkpoint = ModelCheckpoint(filepath, monitor="val_acc", verbose=0, save_best_only=True, mode="auto") # model settings and fit history = classifier.fit(X_train, y_train, validation_data=(X_valid, \ y_valid), epochs=5000, verbose=0, callbacks=[checkpoint], \ class_weight=weights) ################################################################################ ############################# MAKE PREDICTIONS ############################## ################################################################################ #load best model final_model = load_model(filepath) # get accuracy scores = final_model.evaluate(X_test, y_test, verbose=0) # get weighted F1-by-class le = preprocessing.LabelEncoder() le.fit(list(dataset["class"])) y_test2 = dataset[dataset["split"] == 
"TST"].iloc[:, -1].values y_test2 = le.transform(y_test2) y_pred = final_model.predict_classes(X_test, verbose=0) weighted_f1 = f1_score(y_test2, y_pred, average="weighted") # get best epoch acc_history = history.history["val_acc"] best_epoch = acc_history.index(max(acc_history)) + 1 K.clear_session() # destroy TF graph to avoid loop slowing down ################################################################################ ############################# ASSEMBLE W/ CONFIG ############################## ################################################################################ # get model type (H1-4, A/B) modelType = model.split("-")[1] # ["H1A"] HType = modelType[0:2] ABType = modelType[-1] # get dimension filenamesplit = file.split("-") dimension = int([s for s in filenamesplit if "D00" in s][0][1:]) # get learnFac learnFac = int([s for s in filenamesplit if "LF0" in s][0][3:]) # get margin margin = float([s for s in filenamesplit if "LM" in s][0][2:]) # get constraint constr = [s for s in filenamesplit if "_VALUE" in s][0][:-6].lower() # get LType LType = filenamesplit[-1][:2] with open(output, "a") as file: file.write("%s,%s,%d,%d,%.1f,%s,%s,%.17f,%.17f,%d" % (HType, ABType, dimension, learnFac, margin, constr, LType, scores[1], weighted_f1, best_epoch)) file.write("\n")
train['price_doc'] = train['price_doc'] * mult y_train = train["price_doc"] ######################################################################################################### print('Running Model 1...') x_train = train.drop(["id", "timestamp", "price_doc", "average_q_price"], axis=1) #x_test = test.drop(["id", "timestamp", "average_q_price"], axis=1) x_test = test.drop(["id", "timestamp"], axis=1) num_train = len(x_train) x_all = pd.concat([x_train, x_test]) for c in x_all.columns: if x_all[c].dtype == 'object': lbl = preprocessing.LabelEncoder() lbl.fit(list(x_all[c].values)) x_all[c] = lbl.transform(list(x_all[c].values)) x_train = x_all[:num_train] x_test = x_all[num_train:] xgb_params = { 'eta': 0.05, 'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': 1 }
def main(): nRowsRead = 10 # specify 'None' if want to read whole file # KDD training data 125974 df = pd.read_csv('NIDS\KDDAll.txt', delimiter=',', nrows=125974) df = df[[ 'duration', 'protocol_type', 'service', 'src_bytes', 'dst_bytes', 'num_failed_logins', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label' ]] le = preprocessing.LabelEncoder() df = df.apply(le.fit_transform) #print(df.head(10)) x = df.drop('label', axis=1) y = df['label'] train_X, test_X, train_Y, test_Y = train_test_split(x, y, test_size=0.25, random_state=40) #Naive Bayes gnb = GaussianNB() #random forest rf = RandomForestClassifier(n_estimators=100, bootstrap=True) #KNN knn = KNeighborsClassifier(n_neighbors=1) #dtree dt = DecisionTreeClassifier() #voting classifer #hard voting - Majority #vclf =VotingClassifier(estimators=[('gnb', gnb), ('rf', rf), ('knn', knn),('dt',dt)], voting='hard') #vclf = vclf.fit(x,y) #pred_Y = vclf.predict(test_X) #print(classification_report(test_Y,pred_Y)) #CM = confusion_matrix(test_Y,pred_Y) #TN = CM[0][0] #FN = CM[1][0] #TP = CM[1][1] #FP = CM[0][1] #FPR = FP/(FP + TN) #print("FPR:",FPR) #plot_confusion_matrix(vclf,test_X,test_Y) #plt.show() #hard voting - Mean vclf = VotingClassifier(estimators=[('gnb', gnb), ('rf', rf), ('knn', knn), ('dt', dt)], voting='soft') vclf = vclf.fit(x, y) pred_Y = vclf.predict(test_X) print(classification_report(test_Y, pred_Y)) CM = confusion_matrix(test_Y, pred_Y) TN = CM[0][0] FN = CM[1][0] TP = CM[1][1] FP = CM[0][1] FPR = FP / (FP + TN) print("FPR:", FPR) plot_confusion_matrix(vclf, test_X, test_Y) plt.show()
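# In main() above, the VotingClassifier is fitted on the full x, y and then scored
# on test_X, which is drawn from that same data, so the reported metrics are
# optimistic. A minimal self-contained sketch (synthetic data, not the KDD set) of
# the same soft-voting ensemble fitted on the training split only:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=40)
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.25, random_state=40)

vclf = VotingClassifier(estimators=[('gnb', GaussianNB()),
                                    ('rf', RandomForestClassifier(n_estimators=100)),
                                    ('knn', KNeighborsClassifier(n_neighbors=1)),
                                    ('dt', DecisionTreeClassifier())],
                        voting='soft')
vclf.fit(train_X, train_Y)          # fit on the training split only
pred_Y = vclf.predict(test_X)
print(classification_report(test_Y, pred_Y))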
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

salarydata_train.columns
salarydata_test.columns
salarydata_train.shape
salarydata_test.shape
salarydata_train.isnull().sum()
salarydata_test.isnull().sum()
salarydata_train.head()
salarydata_test.head()

salary_columns = ['workclass', 'education', 'maritalstatus', 'occupation',
                  'relationship', 'race', 'sex', 'native']

from sklearn import preprocessing

number = preprocessing.LabelEncoder()
for i in salary_columns:
    salarydata_train[i] = number.fit_transform(salarydata_train[i])
    salarydata_test[i] = number.fit_transform(salarydata_test[i])

colnames = salarydata_train.columns
len(colnames[0:13])
trainX = salarydata_train[colnames[0:13]]
trainY = salarydata_train[colnames[13]]
testX = salarydata_test[colnames[0:13]]
testY = salarydata_test[colnames[13]]

ignb = GaussianNB()  # assumes a normal (Gaussian) distribution of the features
pred_gnb = ignb.fit(trainX, trainY).predict(testX)
confusion_matrix(testY, pred_gnb)
#([[10759, 601],
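# The confusion matrix above is printed raw; a summary number is often handier.
# A small self-contained sketch (made-up labels, not the salary data) showing that
# accuracy_score agrees with the accuracy computed from the confusion-matrix cells:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

testY = [0, 0, 1, 1, 1, 0]
pred = [0, 1, 1, 1, 0, 0]

cm = confusion_matrix(testY, pred)
print(cm)
print(np.trace(cm) / cm.sum())        # correct predictions / all predictions
print(accuracy_score(testY, pred))    # same value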
def create_synthetic_dataset(n_samples=500, cube_size=32, template_size=7, density_min=.1, density_max=.5, proportions=[0.3, 0.7]): ''' Creates a basic 3D texture synthetic dataset. Returns the volumes X and labels y. n_samples: number of samples per class (default 500) cube_size: size of the cubes, i.e. training andtest volumes (default 32) template_size: size of the templates rotated and pasted in the volumes (default 7) density_min: minimum density of patterns (default 0.1) density_max: maximum density of patterns (default 0.5) proportions: proportion of template 1 for the two classes (the proportion of template 2 is 1-p) (default= [0.3,0.7]) ''' np.random.seed(seed=0) # number of classes (only designed for 2 classes here) n_class = 2 # Rotation range rot = 360 range_rot = [0, rot] # Generate empty templates template = np.zeros((2, template_size, template_size, template_size)) # Fill the templates # For now a simple line for t1 template[0, int(template_size / 2) - 1:int(template_size / 2) + 1, int(template_size / 2) - 1:int(template_size / 2) + 1, :] = 1 # And a cross for t2 template[1, int(template_size / 2) - 1:int(template_size / 2) + 1, int(template_size / 2) - 1:int(template_size / 2) + 1, int(template_size / 4):int(3 * template_size / 4) + 1] = 1 template[1, int(template_size / 2) - 1:int(template_size / 2) + 1, int(template_size / 4):int(3 * template_size / 4), int(template_size / 2) - 1:int(template_size / 2) + 1] = 1 # Initialize dataset lists X = [] y = [] for c in range(n_class): for s in range(n_samples): # Generate an empty 64x64x64 cube cube = np.zeros((cube_size, cube_size, cube_size)) # Generate random density density = np.random.uniform(density_min, density_max) # Number of patterns in volume based on the density n_templates = int((cube_size**3) / (template_size**3) * density) # Crop size after rotation: crop_size = int(template_size * np.sqrt(3)) # place the rotated patterns in the cube for t in range(n_templates): # random position position = np.array([ np.random.choice(cube_size), np.random.choice(cube_size), np.random.choice(cube_size) ]) # is it template 1 or 2: template_type = np.random.choice( 2, p=[proportions[c], 1 - proportions[c]]) # Rotate the template 1 or 2 random_angles = [ np.random.uniform(range_rot[0], range_rot[1]) for i in range(3) ] rotated_template = apply_affine_transform_fixed( template[template_type], random_angles) # copy the rotated template in the cube cube = copy_template(cube, rotated_template, position) X.append(cube) y.append(c) X = np.expand_dims(np.asarray(X), axis=-1) y = np.asarray(y) le = preprocessing.LabelEncoder() le.fit(np.unique(y)) y = le.transform(y) return X, y
user_file = "../../Data/user_list.csv" test_pred_file = "test_predictions_xgb_dep14_child18_eta05_round450_seed0_trainseed1234.csv" train = pd.read_csv(train_file) users_list = np.array(pd.read_csv(user_file)["USER_ID_hash"]).astype('str') print train.shape print "Label encomding.." #col_names = ["UserPrefName", "CouponCapsuleText", "CouponGenreName", "CouponLargeAreaName", "CouponSmallAreaName", "CouponKenName"] #train = train.drop(col_names, axis=1) #le_UserIDHash = preprocessing.LabelEncoder() #le_UserIDHash.fit(users_list) #train["USER_ID_hash"] = le_UserIDHash.transform(train["USER_ID_hash"].astype("str")) le_UserPrefName = preprocessing.LabelEncoder() le_UserPrefName.fit(unique_pref_name) train["UserPrefName"] = le_UserPrefName.transform(train["UserPrefName"].astype('str')) le_CouponCapsuleText = preprocessing.LabelEncoder() le_CouponCapsuleText.fit(unique_capsule_text) train["CouponCapsuleText"] = le_CouponCapsuleText.transform(train["CouponCapsuleText"].astype('str')) le_CouponGenreName = preprocessing.LabelEncoder() le_CouponGenreName.fit(unique_genre_name) train["CouponGenreName"] = le_CouponGenreName.transform(train["CouponGenreName"].astype('str')) le_CouponLargeAreaName = preprocessing.LabelEncoder() le_CouponLargeAreaName.fit(unique_large_area_name) train["CouponLargeAreaName"] = le_CouponLargeAreaName.transform(train["CouponLargeAreaName"].astype('str'))
def main(_): x = tf.placeholder(tf.float32, shape=[None, 2352]) y_ = tf.placeholder(tf.float32, shape=[None, 2]) # First Convolution and Pooling Layer conv_weight_1 = weight_variable([5, 5, 3, 31]) conv_bias_1 = bias_variable([31]) x_image = tf.reshape(x, [-1, 28, 28, 3]) conv_1_1 = conv2d(x_image, conv_weight_1) conv_1 = tf.nn.relu(conv2d(x_image, conv_weight_1) + conv_bias_1) pool_1 = max_pool_2x2(conv_1) # Second Convolution and Pooling layer conv_weight_2 = weight_variable([5, 5, 31, 64]) conv_bias_2 = bias_variable([64]) conv_2 = tf.nn.relu(conv2d(pool_1, conv_weight_2) + conv_bias_2) pool_2 = max_pool_2x2(conv_2) # First fully connected layer fc_weight_1 = weight_variable([7 * 7 * 64, 1024]) fc_bias_1 = bias_variable([1024]) pool_2_flat = tf.reshape(pool_2, [-1, 7 * 7 * 64]) fc_1 = tf.nn.relu(tf.matmul(pool_2_flat, fc_weight_1) + fc_bias_1) # A drop out layer keep_prob = tf.placeholder(tf.float32) custom_fc1_drop = tf.nn.dropout(fc_1, keep_prob) # Second custom fully connected layer fc_weights_2 = weight_variable([1024, 2]) fc_bias_2 = bias_variable([2]) fc_2 = tf.matmul(fc_1, fc_weights_2) + fc_bias_2 y_conv = fc_2 cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)) train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) file_list, y_image_label = prepare_data(FLAGS.image_dir) le = preprocessing.LabelEncoder() y_one_hot = tf.one_hot(le.fit_transform(y_image_label), depth=2) x_feed = sess.run(read_image_array(file_list)) y_feed = sess.run(y_one_hot) for i in range(200): if i % 10 == 0: train_accuracy = accuracy.eval(feed_dict={ x: x_feed, y_: y_feed, keep_prob: 1.0 }) print('step %d, training accuracy %g' % (i, train_accuracy)) train_step.run(feed_dict={x: x_feed, y_: y_feed, keep_prob: 0.8}) predicted = tf.argmax(y_conv, 1) if FLAGS.predict_image <> "": x_single_img = sess.run(read_single_image(FLAGS.predict_image)) print( "You got ", le.inverse_transform( sess.run(predicted, feed_dict={x: x_single_img})))
def test_CNN(model,X_train,y_train,X_valid,y_valid,w_id,batch_size,num_epochs,preprocessed=False): num_samples = X_train.shape[0] num_batches = int(np.ceil(num_samples / float(batch_size))) l1 = preprocessing.LabelEncoder() t1 = l1.fit_transform(y_train) l2 = preprocessing.LabelEncoder() t2 = l2.fit_transform(y_valid) num_test_samples = X_valid.shape[0] num_test_batches = int(np.ceil(num_test_samples / float(batch_size))) # setting up lists for handling loss/accuracy train_loss, val_loss = [], [] train_cost, val_cost = [], [] for epoch in range(num_epochs): # Forward -> Backprob -> Update params ## Train correct = 0 model.train() for i in range(num_batches): if i % 10 == 0: print("\n {}, still training...".format(i), end='') idx = range(i * batch_size, np.minimum((i + 1) * batch_size, num_samples)) index = idx[-1]-idx[0]+1 if preprocessed==False: batch_image = np.zeros((index,224,224)) for j in range(index): image_resized = resize(X_train[idx[j]], (224, 224), anti_aliasing=True) batch_image[j,:,:] = image_resized X_batch_tr = Variable(torch.from_numpy(batch_image)) y_batch_tr = Variable(torch.from_numpy(t1[idx]).long()) optimizer.zero_grad() output = model(X_batch_tr.unsqueeze(1).float()) else: X_batch_tr = X_train[idx,:,:,:] y_batch_tr = Variable(torch.from_numpy(t1[idx]).long()) optimizer.zero_grad() output = model(X_batch_tr.float()) batch_loss = criterion(output, y_batch_tr) train_loss.append(batch_loss.data.numpy()) batch_loss.backward() optimizer.step() preds = np.argmax(output.data.numpy(), axis=-1) correct += np.sum(y_batch_tr.data.numpy() == preds) train_acc = correct / float(num_samples) train_cost.append(np.mean(train_loss)) correct2 = 0 model.eval() wrong_guesses = [] wrong_predictions = [] all_predictions = [] for i in range(num_test_batches): if i % 10 == 0: print("\n {}, now validation...".format(i), end='') idx = range(i * batch_size, np.minimum((i + 1) * batch_size, num_test_samples)) index = idx[-1] - idx[0] + 1 if preprocessed==False: batch_image = np.zeros((index,224,224)) for j in range(index): image_resized = resize(X_valid[idx[j]], (224, 224), anti_aliasing=True) batch_image[j,:,:] = image_resized X_batch_v = Variable(torch.from_numpy(batch_image)) y_batch_v = Variable(torch.from_numpy(t2[idx]).long()) output = model(X_batch_v.unsqueeze(1).float()) else: X_batch_v = X_valid[idx,:,:,:] y_batch_v = Variable(torch.from_numpy(t2[idx]).long()) output = model(X_batch_v.float()) batch_loss = criterion(output, y_batch_v) val_loss.append(batch_loss.data.numpy()) preds = np.argmax(output.data.numpy(), axis=-1) eval_preds = y_batch_v.data.numpy() == preds for k in range(index): if eval_preds[k] == False: wrong_guesses.append(w_id[idx[k]]) wrong_predictions.append(preds[k]) else: correct2 += 1 all_predictions.append(preds[k]) val_acc = correct2 / float(num_test_samples) val_cost.append(np.mean(val_loss)) if epoch % 10 == 0: print("\n Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f" % ( epoch + 1, train_cost[-1], train_acc, val_acc)) return train_acc,train_cost,val_acc,val_cost, wrong_guesses, wrong_predictions, all_predictions, model
def get_df(self): if self.name == 'adult': header = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income' ] df = pd.read_csv(self.file, names=header) df = df[df['occupation'] != ' ?'] df = df.reset_index() df['income'] = (df['income'] == ' >50K') col_action = { 'age': 'num', 'workclass': 'ohe', 'fnlwgt': 'del', 'education': 'ohe', 'education-num': 'num', 'marital-status': 'ohe', 'occupation': 'se', 'relationship': 'ohe', 'race': 'ohe', 'sex': 'ohe', 'capital-gain': 'num', 'capital-loss': 'num', 'hours-per-week': 'num', 'native-country': 'ohe', 'income': 'y' } self.clf_type = 'binary_clf' if self.name == 'beer_reviews': df = pd.read_csv(self.file) df.shape df = df.dropna(axis=0, how='any') # print_unique_values(df) col_action = { 'brewery_id': 'del', 'brewery_name': 'del', 'review_time': 'del', 'review_overall': 'del', 'review_aroma': 'num', 'review_appearance': 'num', 'review_profilename': 'del', 'beer_style': 'y', 'review_palate': 'num', 'review_taste': 'num', 'beer_name': 'se', 'beer_abv': 'del', 'beer_beerid': 'del' } self.clf_type = 'multiclass_clf' if self.name == 'midwest_survey': df = pd.read_csv(self.file) # print_unique_values(df) col_action = { 'RespondentID': 'del', 'In your own words, what would you call the part ' + 'of the country you live in now?': 'se', 'Personally identification as a Midwesterner?': 'ohe', 'Illinois in MW?': 'ohe-1', 'Indiana in MW?': 'ohe-1', 'Iowa in MW?': 'ohe-1', 'Kansas in MW?': 'ohe-1', 'Michigan in MW?': 'ohe-1', 'Minnesota in MW?': 'ohe-1', 'Missouri in MW?': 'ohe-1', 'Nebraska in MW?': 'ohe-1', 'North Dakota in MW?': 'ohe-1', 'Ohio in MW?': 'ohe-1', 'South Dakota in MW?': 'ohe-1', 'Wisconsin in MW?': 'ohe-1', 'Arkansas in MW?': 'ohe-1', 'Colorado in MW?': 'ohe-1', 'Kentucky in MW?': 'ohe-1', 'Oklahoma in MW?': 'ohe-1', 'Pennsylvania in MW?': 'ohe-1', 'West Virginia in MW?': 'ohe-1', 'Montana in MW?': 'ohe-1', 'Wyoming in MW?': 'ohe-1', 'ZIP Code': 'del', 'Gender': 'ohe', 'Age': 'ohe', 'Household Income': 'ohe', 'Education': 'ohe', 'Location (Census Region)': 'y' } le = preprocessing.LabelEncoder() ycol = [col for col in col_action if col_action[col] == 'y'] df[ycol] = le.fit_transform(df[ycol[0]].astype(str)) self.clf_type = 'multiclass_clf' if self.name == 'indultos_espana': df = pd.read_csv(self.file) col_action = { 'Fecha BOE': 'del', 'Ministerio': 'ohe-1', 'Ministro': 'ohe', 'Partido en el Gobierno': 'ohe-1', 'Género': 'ohe-1', 'Tribunal': 'ohe', 'Región': 'ohe', 'Fecha Condena': 'del', 'Rol en el delito': 'se', 'Delito': 'se', 'Año Inicio Delito': 'num', 'Año Fin Delito': 'num', 'Tipo de Indulto': 'y', 'Fecha Indulto': 'del', 'Categoría Cod.Penal': 'se', 'Subcategoría Cod.Penal': 'se', 'Fecha BOE.año': 'num', 'Fecha BOE.mes': 'num', 'Fecha BOE.día del mes': 'num', 'Fecha BOE.día de la semana': 'num', 'Fecha Condena.año': 'num', 'Fecha Condena.mes': 'num', 'Fecha Condena.día del mes': 'num', 'Fecha Condena.día de la semana': 'num', 'Fecha Indulto.año': 'num', 'Fecha Indulto.mes': 'num', 'Fecha Indulto.día del mes': 'num', 'Fecha Indulto.día de la semana': 'num' } df['Tipo de Indulto'] = (df['Tipo de Indulto'] == 'indultar') self.clf_type = 'binary_clf' if self.name == 'docs_payments': # Variable names in Dollars for Docs dataset ###################### pi_specialty = ['Physician_Specialty'] drug_nm = ['Name_of_Associated_Covered_Drug_or_Biological1'] # 
'Name_of_Associated_Covered_Drug_or_Biological2', # 'Name_of_Associated_Covered_Drug_or_Biological3', # 'Name_of_Associated_Covered_Drug_or_Biological4', # 'Name_of_Associated_Covered_Drug_or_Biological5'] dev_nm = ['Name_of_Associated_Covered_Device_or_Medical_Supply1'] # 'Name_of_Associated_Covered_Device_or_Medical_Supply2', # 'Name_of_Associated_Covered_Device_or_Medical_Supply3', # 'Name_of_Associated_Covered_Device_or_Medical_Supply4', # 'Name_of_Associated_Covered_Device_or_Medical_Supply5'] corp = [ 'Applicable_Manufacturer_or_Applicable_GPO_Making_' + 'Payment_Name' ] amount = ['Total_Amount_of_Payment_USDollars'] dispute = ['Dispute_Status_for_Publication'] ################################################################### if os.path.exists(self.file): df = pd.read_hdf(self.file) # print('Loading DataFrame from:\n\t%s' % self.file) else: hdf_files = glob.glob(os.path.join(self.path, 'hdf', '*.h5')) hdf_files_ = [] for file_ in hdf_files: if 'RSRCH_PGYR2013' in file_: hdf_files_.append(file_) if 'GNRL_PGYR2013' in file_: hdf_files_.append(file_) dfd_cols = pi_specialty + drug_nm + dev_nm + corp + amount + dispute df_dfd = pd.DataFrame(columns=dfd_cols) for hdf_file in hdf_files_: if 'RSRCH' in hdf_file: with pd.HDFStore(hdf_file) as hdf: for key in hdf.keys(): df = pd.read_hdf(hdf_file, key) df = df[dfd_cols] df['status'] = 'allowed' df = df.drop_duplicates(keep='first') df_dfd = pd.concat([df_dfd, df], ignore_index=True) print('size: %d, %d' % tuple(df_dfd.shape)) unique_vals = {} for col in df_dfd.columns: unique_vals[col] = set(list(df_dfd[col].unique())) for hdf_file in hdf_files_: if 'GNRL' in hdf_file: with pd.HDFStore(hdf_file) as hdf: for key in hdf.keys(): df = pd.read_hdf(hdf_file, key) df = df[dfd_cols] df['status'] = 'disallowed' df = df.drop_duplicates(keep='first') # remove all value thats are not in RSRCH # for col in pi_specialty+drug_nm+dev_nm+corp: # print(col) # s1 = set(list(df[col].unique())) # s2 = unique_vals[col] # df = df.set_index(col).drop(labels=s1-s2) # .reset_index() df_dfd = pd.concat([df_dfd, df], ignore_index=True) print('size: %d, %d' % tuple(df_dfd.shape)) df_dfd = df_dfd.drop_duplicates(keep='first') df_dfd.to_hdf(self.file, 't1') df = df_dfd df['status'] = (df['status'] == 'allowed') # print_unique_values(df) col_action = { pi_specialty[0]: 'del', drug_nm[0]: 'del', dev_nm[0]: 'del', corp[0]: 'se', amount[0]: 'num', dispute[0]: 'ohe-1', 'status': 'y' } self.clf_type = 'binary_clf' if self.name == 'medical_charge': df = pd.read_csv(self.file) # print_unique_values(df) col_action = { 'State': 'ohe', 'Total population': 'del', 'Median age': 'del', '% BachelorsDeg or higher': 'del', 'Unemployment rate': 'del', 'Per capita income': 'del', 'Total households': 'del', 'Average household size': 'del', '% Owner occupied housing': 'del', '% Renter occupied housing': 'del', '% Vacant housing': 'del', 'Median home value': 'del', 'Population growth 2010 to 2015 annual': 'del', 'House hold growth 2010 to 2015 annual': 'del', 'Per capita income growth 2010 to 2015 annual': 'del', '2012 state winner': 'del', 'Medical procedure': 'se', 'Total Discharges': 'del', 'Average Covered Charges': 'num', 'Average Total Payments': 'y' } self.clf_type = 'regression' # opts: 'regression', # 'binary_clf', 'multiclass_clf' if self.name == 'road_safety': files = self.file for filename in files: if filename.split('/')[-1] == '2015_Make_Model.csv': df_mod = pd.read_csv(filename) df_mod['Vehicle_Reference'] = ( df_mod['Vehicle_Reference'].map(str)) df_mod['Vehicle_Index'] = 
                (df_mod['Accident_Index'] + df_mod['Vehicle_Reference'])
                df_mod = df_mod.set_index('Vehicle_Index')
                df_mod = df_mod.dropna(axis=0, how='any', subset=['make'])
        for filename in files:
            if filename.split('/')[-1] == 'Accidents_2015.csv':
                df_acc = pd.read_csv(filename).set_index('Accident_Index')
        for filename in files:
            if filename.split('/')[-1] == 'Vehicles_2015.csv':
                df_veh = pd.read_csv(filename)
                df_veh['Vehicle_Reference'] = (
                    df_veh['Vehicle_Reference'].map(str))
                df_veh['Vehicle_Index'] = (df_veh['Accident_Index'] +
                                           df_veh['Vehicle_Reference'])
                df_veh = df_veh.set_index('Vehicle_Index')
        for filename in files:
            if filename.split('/')[-1] == 'Casualties_2015.csv':
                df_cas = pd.read_csv(filename)
                df_cas['Vehicle_Reference'] = (
                    df_cas['Vehicle_Reference'].map(str))
                df_cas['Vehicle_Index'] = (df_cas['Accident_Index'] +
                                           df_cas['Vehicle_Reference'])
                df_cas = df_cas.set_index('Vehicle_Index')
        df = df_cas.join(df_mod, how='left', lsuffix='_cas', rsuffix='_model')
        df = df.dropna(axis=0, how='any', subset=['make'])
        df = df[df['Sex_of_Driver'] != 3]
        df = df[df['Sex_of_Driver'] != -1]
        df['Sex_of_Driver'] = df['Sex_of_Driver'] - 1
        # print_unique_values(df)
        # col_action = {'Casualty_Severity': 'y',
        #               'Casualty_Class': 'num',
        #               'make': 'ohe',
        #               'model': 'se'}
        col_action = {'Sex_of_Driver': 'y', 'model': 'se', 'make': 'ohe'}
        df = df.dropna(axis=0, how='any', subset=list(col_action.keys()))
        self.clf_type = 'binary_clf'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'
        self.file = self.file[0]

    if self.name == 'consumer_complaints':
        df = pd.read_csv(self.file)
        # print_unique_values(df)
        col_action = {
            'Date received': 'del',
            'Product': 'ohe',
            'Sub-product': 'ohe',
            'Issue': 'ohe',
            'Sub-issue': 'ohe',
            'Consumer complaint narrative': 'se',  # too long
            'Company public response': 'ohe',
            'Company': 'se',
            'State': 'del',
            'ZIP code': 'del',
            'Tags': 'del',
            'Consumer consent provided?': 'del',
            'Submitted via': 'ohe',
            'Date sent to company': 'del',
            'Company response to consumer': 'ohe',
            'Timely response?': 'ohe-1',
            'Consumer disputed?': 'y',
            'Complaint ID': 'del'
        }
        for col in col_action:
            if col_action[col] in ['ohe', 'se']:
                df = df.fillna(value={col: 'nan'})
        df = df.dropna(axis=0, how='any', subset=['Consumer disputed?'])
        df.loc[:, 'Consumer disputed?'] = (df['Consumer disputed?'] == 'Yes')
        self.clf_type = 'binary_clf'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'

    if self.name == 'traffic_violations':
        df = pd.read_csv(self.file)
        # print_unique_values(df)
        col_action = {
            'Date Of Stop': 'del',
            'Time Of Stop': 'del',
            'Agency': 'del',
            'SubAgency': 'del',  # 'ohe'
            'Description': 'se',
            'Location': 'del',
            'Latitude': 'del',
            'Longitude': 'del',
            'Accident': 'del',
            'Belts': 'ohe-1',
            'Personal Injury': 'del',
            'Property Damage': 'ohe-1',
            'Fatal': 'ohe-1',
            'Commercial License': 'ohe-1',
            'HAZMAT': 'ohe',
            'Commercial Vehicle': 'ohe-1',
            'Alcohol': 'ohe-1',
            'Work Zone': 'ohe-1',
            'State': 'del',
            # 'VehicleType': 'del',  # 'ohe'
            'Year': 'num',
            'Make': 'del',
            'Model': 'del',
            'Color': 'del',
            'Violation Type': 'y',
            'Charge': 'del',  # 'y'
            'Article': 'del',  # 'y'
            'Contributed To Accident': 'del',  # 'y'
            'Race': 'ohe',
            'Gender': 'ohe',
            'Driver City': 'del',
            'Driver State': 'del',
            'DL State': 'del',
            'Arrest Type': 'ohe',
            'Geolocation': 'del'
        }
        for col in col_action:
            # fill missing values only for columns that will be encoded
            if col_action[col] in ['ohe', 'se']:
                df = df.fillna(value={col: 'nan'})
        self.clf_type = 'multiclass_clf'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'

    if self.name == 'crime_data':
        df = pd.read_csv(self.file)
        # print_unique_values(df)
        col_action = {
            'DR Number': 'del',
            'Date Reported': 'del',
            'Date Occurred': 'del',
            'Time Occurred': 'del',
            'Area ID': 'del',
            'Area Name': 'del',
            'Reporting District': 'del',
            'Crime Code': 'del',
            'Crime Code Description': 'y',
            'MO Codes': 'del',  # 'se'
            'Victim Age': 'num',
            'Victim Sex': 'ohe',
            'Victim Descent': 'ohe',
            'Premise Code': 'del',
            'Premise Description': 'ohe',
            'Weapon Used Code': 'del',
            'Weapon Description': 'ohe',
            'Status Code': 'del',
            'Status Description': 'del',
            'Crime Code 1': 'del',
            'Crime Code 2': 'del',
            'Crime Code 3': 'del',
            'Crime Code 4': 'del',
            'Address': 'del',
            'Cross Street': 'se',  # 'se'
            'Location ': 'del'
        }
        for col in col_action:
            if col_action[col] in ['ohe', 'se']:
                df = df.fillna(value={col: 'nan'})
        self.clf_type = 'multiclass_clf'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'

    if self.name == 'employee_salaries':
        df = pd.read_csv(self.file)
        col_action = {
            'Full Name': 'del',
            'Gender': 'ohe',
            'Current Annual Salary': 'y',
            '2016 Gross Pay Received': 'del',
            '2016 Overtime Pay': 'del',
            'Department': 'del',
            'Department Name': 'ohe',
            'Division': 'ohe',  # 'se'
            'Assignment Category': 'ohe-1',
            'Employee Position Title': 'se',
            'Underfilled Job Title': 'del',
            'Date First Hired': 'num'
        }
        # strip the leading '$' and parse the salary as a float
        df['Current Annual Salary'] = [
            float(s[1:]) for s in df['Current Annual Salary']
        ]
        # keep only the hire year as a numerical feature
        df['Date First Hired'] = [
            datetime.datetime.strptime(d, '%m/%d/%Y').year
            for d in df['Date First Hired']
        ]
        for col in col_action:
            if col_action[col] in ['ohe', 'se']:
                df = df.fillna(value={col: 'nan'})
        self.clf_type = 'regression'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'

    # add here info about the dataset #####################################
    if self.name == 'new_dataset':
        df = pd.read_csv(self.file)
        col_action = {}
        for col in col_action:
            if col_action[col] in ['ohe', 'se']:
                df = df.fillna(value={col: 'nan'})
        self.clf_type = 'multiclass_clf'  # opts: 'regression',
        # 'binary_clf', 'multiclass_clf'
    #######################################################################

    self.df = df
    # keep only the columns that are actually used ('del' columns are dropped)
    self.col_action = {k: col_action[k]
                       for k in col_action if col_action[k] != 'del'}
    # could arguably live in self.preprocess, but not coherent with the rest
    return self
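# A minimal sketch (assumed for illustration, not part of the class above) of
# how a col_action mapping like the ones built in get_df might be consumed to
# produce a feature matrix and target: 'num' columns are kept numeric,
# 'ohe'/'ohe-1' columns are one-hot encoded (dropping one level for 'ohe-1'),
# 'y' becomes the target, and 'se' columns, presumably meant for a dedicated
# string/similarity encoder, are only label-encoded here as a placeholder.
# The helper name build_xy_from_col_action is hypothetical.
import pandas as pd
from sklearn import preprocessing


def build_xy_from_col_action(df, col_action):
    feature_parts = []
    y = None
    for col, action in col_action.items():
        if action == 'num':
            feature_parts.append(
                pd.to_numeric(df[col], errors='coerce').to_frame(col))
        elif action in ('ohe', 'ohe-1'):
            dummies = pd.get_dummies(df[col].astype(str), prefix=col,
                                     drop_first=(action == 'ohe-1'))
            feature_parts.append(dummies)
        elif action == 'se':
            # placeholder: a real pipeline would use a proper string encoder
            codes = preprocessing.LabelEncoder().fit_transform(
                df[col].astype(str))
            feature_parts.append(pd.DataFrame({col: codes}, index=df.index))
        elif action == 'y':
            y = df[col].values
    X = pd.concat(feature_parts, axis=1)
    return X, y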
# KNN Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

filename = '../../datasets/iris_classification_train.csv'
names = [
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'flower_name'
]
df = read_csv(filename, names=names)

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
df['flower_name'] = label_encoder.fit_transform(df['flower_name'])
df['flower_name'].unique()

array = df.values
inputx = array[:, 0:4]
outputy = array[:, 4]
model = KNeighborsClassifier()
print(model.fit(inputx, outputy))

filename = '../../datasets/iris_classification_test.csv'
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
z = array[:, 0:4]
print("\n", newdataframe, "\n")

# predict once and reuse the result
res = model.predict(z)
print(res, "\n")
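# Since the flower names were label-encoded above, the predictions in res are
# integer codes. A short follow-up (assuming label_encoder is still in scope)
# maps them back to the original class names:
predicted_names = label_encoder.inverse_transform(res.astype(int))
print(predicted_names)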
def encode_text_index(df, name):
    # Label-encode a text column in place and return the original class names.
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_
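# Example usage of the helper above (the DataFrame and the 'species' column
# name are only illustrative assumptions):
import pandas as pd
from sklearn import preprocessing

df_demo = pd.DataFrame(
    {'species': ['setosa', 'versicolor', 'setosa', 'virginica']})
classes = encode_text_index(df_demo, 'species')
print(df_demo['species'].tolist())  # e.g. [0, 1, 0, 2]
print(classes)                      # ['setosa' 'versicolor' 'virginica']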
import os
import numpy as np
import pandas as pd
from pandas import read_csv
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn import preprocessing

data = pd.read_csv("Placement.csv")
data = data[["status", "mba_p", "etest_p", "specialisation", "gender",
             "ssc_p", "ssc_b", "hsc_p", "hsc_b", "hsc_s", "degree_p",
             "degree_t", "workex"]]

# Encode the categorical columns as integer codes.
le = preprocessing.LabelEncoder()
data["gender"] = le.fit_transform(list(data["gender"]))
data["ssc_b"] = le.fit_transform(list(data["ssc_b"]))
data["hsc_b"] = le.fit_transform(list(data["hsc_b"]))
data["hsc_s"] = le.fit_transform(list(data["hsc_s"]))
data["degree_t"] = le.fit_transform(list(data["degree_t"]))
data["workex"] = le.fit_transform(list(data["workex"]))
data["specialisation"] = le.fit_transform(list(data["specialisation"]))
data["status"] = le.fit_transform(list(data["status"]))

predict = "status"
print(data.head())

X = np.array(data.drop([predict], axis=1))  # features: everything except the target
# print(X) to inspect the encoded feature matrix
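# A possible continuation (an assumption; the original script stops after
# building X): build the target vector, hold out a test split, and fit a
# simple linear model on the encoded placement data.
from sklearn.model_selection import train_test_split

y = np.array(data[predict])
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
clf = linear_model.LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print("test accuracy:", clf.score(x_test, y_test))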
def model_selection(X_train, X_test, df_labels): y_train = df_labels.status_group.values # Compare models without optimization models = { "Dumb Model": AlwaysFunctionalClassifier(), "SGD Classifier": SGDClassifier(), "Random Forests": RandomForestClassifier(), "k-Nearest Neighbors": KNeighborsClassifier(), "Softmax Regression": LogisticRegression(multi_class="multinomial", solver="lbfgs"), "SVM": SVC(decision_function_shape="ovr"), "Decission Trees": DecisionTreeClassifier(), "AdaBoost": AdaBoostClassifier(algorithm="SAMME.R"), "Gradient Boost": GradientBoostingClassifier() } results = [] names = [] for k, v in models.items(): cv_scores = cross_val_score(estimator=v, X=X_train, y=y_train, cv=10, n_jobs=1, scoring='accuracy') results.append(cv_scores) names.append(k) print(k) print('CV accuracy: %.3f +/- %.3f' % (np.mean(cv_scores), np.std(cv_scores))) print('----------------') fig = plt.figure(figsize=(16, 12)) fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) plt.boxplot(results) ax.set_xticklabels(names) plt.show() # Let's try to optimize some of this models # Random Forests # Initial performance forest_clf = RandomForestClassifier() cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy") # Random Forests Confusion Matrix y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3) conf_mx = confusion_matrix(y_train, y_train_pred) fig, ax = plt.subplots(figsize=(8, 8)) ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3) for i in range(conf_mx.shape[0]): for j in range(conf_mx.shape[1]): perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%" ax.text(x=j, y=i, s=str(conf_mx[i, j]) + "\n\n" + perc, va='center', ha='center') plt.xlabel('predicted label') plt.ylabel('true label') plt.tight_layout() plt.show() param_grid = [{ 'max_depth': [30, 60], 'n_estimators': [80, 300], 'max_features': [5, 10], 'min_samples_leaf': [1, 10], 'n_jobs': [-1] }] grid_search_rf = GridSearchCV(forest_clf, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1) grid_search_rf.fit(X_train, y_train) cvres = grid_search_rf.cv_results_ for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): print(mean_score, params) print(grid_search_rf.best_params_) cv_results = cross_validate(RandomForestClassifier(**grid_search_rf.best_params_), \ X_train, y_train, cv = 3, scoring="accuracy") print(cv_results['test_score'].mean()) # SGD Classifier # Initial performance sgd_clf = SGDClassifier() cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy") # SGD Confusion Matrix y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3) conf_mx = confusion_matrix(y_train, y_train_pred) fig, ax = plt.subplots(figsize=(8, 8)) ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3) for i in range(conf_mx.shape[0]): for j in range(conf_mx.shape[1]): perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%" ax.text(x=j, y=i, s=str(conf_mx[i, j]) + "\n\n" + perc, va='center', ha='center') plt.xlabel('predicted label') plt.ylabel('true label') plt.tight_layout() plt.show() param_grid = [{ 'penalty': ['none', 'l2', 'l1', 'elasticnet'], 'alpha': [0.00001, 0.0001, 0.001, 0.01], 'loss': ['log'], 'n_jobs': [-1] }] grid_search_sgd = GridSearchCV(sgd_clf, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1) grid_search_sgd.fit(X_train, y_train) cvres = grid_search_sgd.cv_results_ for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): print(mean_score, params) print(grid_search_sgd.best_params_) cv_results = 
cross_validate(SGDClassifier(**grid_search_sgd.best_params_), \ X_train, y_train, cv = 3, scoring="accuracy") print(cv_results['test_score'].mean()) # K Nearest Neighbors # Initial performance knn_clf = KNeighborsClassifier() cross_val_score(knn_clf, X_train, y_train, cv=3, scoring="accuracy") # KNN Confusion Matrix y_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3) conf_mx = confusion_matrix(y_train, y_train_pred) fig, ax = plt.subplots(figsize=(8, 8)) ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3) for i in range(conf_mx.shape[0]): for j in range(conf_mx.shape[1]): perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%" ax.text(x=j, y=i, s=str(conf_mx[i, j]) + "\n\n" + perc, va='center', ha='center') plt.xlabel('predicted label') plt.ylabel('true label') plt.tight_layout() plt.show() param_grid = [{ 'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance'], 'n_jobs': [-1] }] grid_search_knn = GridSearchCV(knn_clf, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1) grid_search_knn.fit(X_train, y_train) cvres = grid_search_knn.cv_results_ for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): print(mean_score, params) print(grid_search_knn.best_params_) cv_results = cross_validate(KNeighborsClassifier(**grid_search_knn.best_params_), \ X_train, y_train, cv = 3, scoring="accuracy") print(cv_results['test_score'].mean()) # Classification with XGBoost param_grid = [{ 'max_depth': [3, 10], 'n_estimators': [80, 300], 'learning_rate': [0.01, 0.1, 0.3] }] gbm = xgb.XGBClassifier() grid_search_xgb = GridSearchCV(gbm, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1) grid_search_xgb.fit(X_train, y_train) cvres = grid_search_xgb.cv_results_ for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): print(mean_score, params) print(grid_search_xgb.best_params_) cv_results = cross_validate(xgb.XGBClassifier(**grid_search_xgb.best_params_), \ X_train, y_train, cv = 3, scoring="accuracy") print(cv_results['test_score'].mean()) # Just a bit better than Random Forests, but the best so far nevertheless. # Ensembling # Let's put together all the models shown above to see if we get a better result. 
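# Before building those ensembles, a small aside: the annotated confusion
# matrix plotted for Random Forests, SGD and KNN above is the same block
# copied three times. A helper like the sketch below (hypothetical, not in the
# original; it assumes matplotlib is imported as plt, as in the surrounding
# code) could replace the three copies.
def plot_confusion(conf_mx):
    # annotated confusion-matrix heatmap, matching the inline plotting above
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j, y=i, s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.show()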
sgd_clf = SGDClassifier(**grid_search_sgd.best_params_) rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_) knn_clf = KNeighborsClassifier(**grid_search_knn.best_params_) log_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=30, n_jobs=-1) # We'll skip SVM as they slow down too much the modelling times # svm_clf = SVC(C= 1, gamma= 0.1, decision_function_shape="ovr", n_jobs=-1) dtr_clf = DecisionTreeClassifier(max_depth=20, min_samples_split=10) ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5) gbrt_clf = GradientBoostingClassifier(max_depth=5, n_estimators=500, learning_rate=0.5) xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_) clfs = [ sgd_clf, rnd_clf, knn_clf, log_clf, dtr_clf, ada_clf, gbrt_clf, xgb_clf ] voting_clf_ens_soft = VotingClassifier(estimators=[ ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]), ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]), ('Decission Trees', clfs[4]), ('AdaBoost', clfs[5]), ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7]) ], voting='soft', n_jobs=-1) voting_clf_ens_soft.fit(X_train, y_train) cv_results = cross_validate(voting_clf_ens_soft, X_train, y_train, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Although slower, it doesn't seem to be a better model than just Random Forests optimized alone, is it probably the soft voting? Let's see voting_clf_ens_hard = VotingClassifier(estimators=[ ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]), ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]), ('Decission Trees', clfs[4]), ('AdaBoost', clfs[5]), ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7]) ], voting='hard', n_jobs=-1) voting_clf_ens_hard.fit(X_train, y_train) cv_results = cross_validate(voting_clf_ens_hard, X_train, y_train, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Doesn't change much. # Stacking # Let's create a new model that decides the final label in a new second layer, taking as input the results of all the previous models. 
print(X_train.shape) idx = np.random.permutation(len(X_train)) # create shuffle index ## split into three sets # training set Xtr = X_train[idx[:33000]] ytr = y_train[idx[:33000]] # validation set Xvl = X_train[idx[33000:46200]] yvl = y_train[idx[33000:46200]] # test set Xts = X_train[idx[46200:]] yts = y_train[idx[46200:]] print(Xtr.shape, Xvl.shape, Xts.shape) for i, clf in enumerate(clfs): clf.fit(Xtr, ytr) print("Fitted {}/{}".format(i + 1, len(clfs))) # run individual classifiers on val set yhat = {} for i, clf in enumerate(clfs): yhat[i] = clf.predict(Xvl) print("Predicted {}/{}".format(i + 1, len(clfs))) # create new training set from predictions # combine the predictions into vectors using a horizontal stacking Xblend = np.c_[[preds for preds in yhat.values()]].T #Transform labels into codes le = preprocessing.LabelEncoder() Xblend = le.fit_transform(Xblend.reshape(13200 * 8)).reshape(13200, 8) # train a random forest classifier on Xblend using yvl for target labels rf_blend = RandomForestClassifier(n_estimators=100, n_jobs=-1) rf_blend.fit(Xblend, yvl) cv_results = cross_validate(rf_blend, Xblend, yvl, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Let's see how this behaves with an unseen dataset # run individual classifiers on test set yhatts = {} for i, clf in enumerate(clfs): yhatts[i] = clf.predict(Xts) print("Predicted {}/{}".format(i + 1, len(clfs))) Xblendts = np.c_[[preds for preds in yhatts.values()]].T Xblendts = le.transform(Xblendts.reshape(13200 * 8)).reshape(13200, 8) cv_results = cross_validate(rf_blend, Xblendts, yts, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Finally, in this exercise, nothing beats Random Forests and XGBoost. # Ensembling RF and XGB rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_) xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_) clfs = [rnd_clf, xgb_clf] voting_clf_ens_rfxgb = VotingClassifier(estimators=[('Random Forests', clfs[0]), ('XGBoost', clfs[1])], voting='soft', n_jobs=-1) voting_clf_ens_rfxgb.fit(X_train, y_train) cv_results = cross_validate(voting_clf_ens_rfxgb, X_train, y_train, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # This is the best result so far! # Stacking RF and XGB # We have to be specially careful here to not overfit the RF classifier. 
idx = np.random.permutation(len(X_train)) # create shuffle index ## split into three sets # training set Xtr = X_train[idx[:33000]] ytr = y_train[idx[:33000]] # validation set Xvl = X_train[idx[33000:46200]] yvl = y_train[idx[33000:46200]] # test set Xts = X_train[idx[46200:]] yts = y_train[idx[46200:]] print(Xtr.shape, Xvl.shape, Xts.shape) for i, clf in enumerate(clfs): clf.fit(Xtr, ytr) print("Fitted {}/{}".format(i + 1, len(clfs))) # run individual classifiers on val set yhat = {} for i, clf in enumerate(clfs): yhat[i] = clf.predict(Xvl) print("Predicted {}/{}".format(i + 1, len(clfs))) # create new training set from predictions # combine the predictions into vectors using a horizontal stacking Xblend = np.c_[[preds for preds in yhat.values()]].T #Transform labels into codes le = preprocessing.LabelEncoder() Xblend = le.fit_transform(Xblend.reshape(13200 * 2)).reshape(13200, 2) # train a random forest classifier on Xblend using yvl for target labels rf_blend = RandomForestClassifier(n_estimators=300, n_jobs=-1) rf_blend.fit(Xblend, yvl) cv_results = cross_validate(rf_blend, Xblend, yvl, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Let's see how this behaves with an unseen dataset # run individual classifiers on test set yhatts = {} for i, clf in enumerate(clfs): yhatts[i] = clf.predict(Xts) print("Predicted {}/{}".format(i + 1, len(clfs))) Xblendts = np.c_[[preds for preds in yhatts.values()]].T Xblendts = le.transform(Xblendts.reshape(13200 * 2)).reshape(13200, 2) cv_results = cross_validate(rf_blend, Xblendts, yts, cv=3, scoring="accuracy") print(cv_results['test_score'].mean()) # Finally, it seems that the best result were obtained with an RF and XGBoost ensemble. Let's use this model to make the final predictions and submission file creation. return voting_clf_ens_rfxgb
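# The blending steps above hard-code the validation/test sizes (13200) and the
# number of base models in the reshape calls. A generalised sketch (an
# assumption, not the original code) that derives those shapes from the data
# instead; blend_predictions and fit_blender are hypothetical helper names.
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier


def blend_predictions(clfs, X):
    # one column of the blend matrix per base model's predictions
    preds = [clf.predict(X) for clf in clfs]
    return np.column_stack(preds)


def fit_blender(clfs, X_valid, y_valid, n_estimators=300):
    Xblend = blend_predictions(clfs, X_valid)
    # encode all predicted labels with one shared vocabulary
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.ravel()).reshape(Xblend.shape)
    blender = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    blender.fit(Xblend, y_valid)
    return blender, le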