import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn import datasets wine = datasets.load_wine() X, y = wine.data, wine.target The code below shows `scikit-learn` implementations of LDA, QDA, and Naive Bayes using the {doc}`wine </content/appendix/data>` dataset. Note that the Naive Bayes implementation assumes *all* variables follow a Normal distribution, unlike the construction in the previous section. from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.naive_bayes import GaussianNB lda = LinearDiscriminantAnalysis() lda.fit(X, y); qda = QuadraticDiscriminantAnalysis() qda.fit(X, y); nb = GaussianNB() nb.fit(X, y); Next, let's check that these `scikit-learn` implementations return the same decision boundaries as our constructions in the previous section. The code to create these graphs is written below. def graph_boundaries(X, model, model_title, n0 = 1000, n1 = 1000, figsize = (7, 5), label_every = 4): # Generate X for plotting d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0) d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1) X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2) # Get class predictions
log_model = lr_model if log == 'nb': from sklearn.naive_bayes import GaussianNB lr_model = GaussianNB() log_model = lr_model if log == 'knn': from sklearn.neighbors import KNeighborsClassifier lr_model = KNeighborsClassifier(n_neighbors=35, weights='distance') log_model = lr_model if log == 'lda': from sklearn.discriminant_analysis import LinearDiscriminantAnalysis lr_model = LinearDiscriminantAnalysis() log_model = lr_model if log == 'qda': from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis lr_model = QuadraticDiscriminantAnalysis() log_model = lr_model # Run CV #if True: # fit_model = log_model.fit(X,y) # pred = fit_model.predict_proba(look)[:,1]#.clip(0.001,.999) # print( " look Gini = ", log_loss(ylook, pred) ) for i, (train_index, test_index) in enumerate(kf.split(X)): # Create data for this fold y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index] X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy() X_test = test_df.copy() print( "\nFold ", i)
Y = whole_data['target'].values X = whole_data.drop('target',axis=1).values X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.33, random_state=1) # now declare some varible for algorithum. list_of_lagorithum = [] num_of_folds = 10 results_of_algo = [] names_of_algo = [] # now check accuracy with some algo without standardization. list_of_lagorithum.append(('RandomForestClassifier ',RandomForestClassifier())) list_of_lagorithum.append(('QuadraticDiscriminantAnalysis ',QuadraticDiscriminantAnalysis())) list_of_lagorithum.append(('LogisticRegression ',LogisticRegression())) list_of_lagorithum.append(('DecisionTree ',DecisionTreeClassifier())) list_of_lagorithum.append(('LinearDiscriminant ',LinearDiscriminantAnalysis())) list_of_lagorithum.append(('Support Vector Machine ',SVC())) list_of_lagorithum.append(('GaussianNB ',GaussianNB())) list_of_lagorithum.append(('BernoulliNB ',BernoulliNB())) list_of_lagorithum.append(('KNeighborsClassifier ',KNeighborsClassifier())) print("\n\n\nAccuracies of algorithm without standardization \n\n") for name, model in list_of_lagorithum: kfold = KFold(n_splits=num_of_folds, random_state=13) startTime = time.time() cv_results = cross_val_score(model, X,Y, cv=kfold, scoring='accuracy') endTime = time.time()
def QDA():
    """Build the QDA model entry: min-max scaling, 14-component PCA, then QDA.

    Returns:
        tuple: ``('QDA', pipeline)`` — a display label paired with an unfitted
        sklearn Pipeline, suitable for insertion into a model registry.
    """
    steps = [
        ('a_preprocess', MinMaxScaler()),
        ('b_reduce', PCA(iterated_power=7, random_state=86, n_components=14)),
        ('c_classify', QuadraticDiscriminantAnalysis()),
    ]
    return ('QDA', Pipeline(steps))
def handle(self, *args, **options): # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from sklearn.cross_validation import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.datasets import make_moons, make_circles, make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis h = .02 # step size in the mesh names = [ "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis", "Quadratic Discriminant Analysis" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB(), LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis() ] X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) from history.tools import normalization, filter_by_mins, create_sample_row from history.models import Price graph = False self.symbol = 'BTC_ETH' self.minutes_back = 100 self.timedelta_back_in_granularity_increments = 0 datasetinputs = 2 gran_options = [1, 5, 15, 30] gran_options = [30, 60, 120, 240] datasets = [] _names = [] for gran in gran_options: self.granularity = gran splice_point = self.minutes_back + self.timedelta_back_in_granularity_increments prices = Price.objects.filter( 
symbol=self.symbol).order_by('-created_on') prices = filter_by_mins(prices, self.granularity) prices = [price.price for price in prices] data = normalization(list(prices[0:splice_point])) data.reverse() price_datasets = [[], []] for i, val in enumerate(data): try: # get NN projection sample = create_sample_row(data, i, datasetinputs) last_price = data[i + datasetinputs - 1] next_price = data[i + datasetinputs] change = next_price - last_price pct_change = change / last_price fee_pct = 0.002 do_buy = -1 if abs(pct_change) < fee_pct and False else ( 1 if change > 0 else 0) price_datasets[0].append(sample) price_datasets[1].append(do_buy) except Exception as e: print(e) datasets.append(price_datasets) _names.append(str(gran)) if graph: figure = plt.figure(figsize=(27, 9)) i = 1 # iterate over datasets for _index in range(0, len(datasets)): ds = datasets[_index] # preprocess dataset, split into training and test part X, y = ds X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # just plot the dataset first if graph: cm = plt.cm.RdBu cm_bright = ListedColormap(['#FF0000', '#0000FF']) ax = plt.subplot(len(datasets), len(classifiers) + 1, i) # Plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) i += 1 # iterate over classifiers for name, clf in zip(names, classifiers): if graph: ax = plt.subplot(len(datasets), len(classifiers) + 1, i) clf.fit(X_train, y_train) score = clf.score(X_test, y_test) # Plot the decision boundary. 
For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. _input = np.c_[xx.ravel(), yy.ravel()] if hasattr(clf, "decision_function"): Z = clf.decision_function(_input) else: Z = clf.predict_proba(_input)[:, 1] print(name, round(score * 100)) # Put the result into a color plot if graph: Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) # Plot also the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) ax.set_title("(" + _names[_index] + ")" + name) text = ('%.2f' % score).lstrip('0') ax.text(xx.max() - .3, yy.min() + .3, text, size=15, horizontalalignment='right') i += 1 stats = {'r': 0, 'w': 0} for ds in datasets: for i in range(0, len(ds[0])): sample = ds[0][i] actual = ds[1][i] prediction = clf.predict(sample) stats['r' if actual == prediction[0] else 'w'] = stats[ 'r' if actual == prediction[0] else 'w'] + 1 print( 'stats', name, stats, round((100.0 * stats['r'] / (stats['r'] + stats['w'])), 2)) if graph: figure.subplots_adjust(left=.02, right=.98) plt.show()
"\n Elapsed time: {}\n-------------------------".format( elapsed_time)) ref_dict["gnb"] = [gnb_dict_avg, "gnb_dict_avg"] ########################################### ######## QDA ########### ########################################### if algorithm.lower() == "qda": start_time = datetime.now() print(" Selected Classifier: Quadratic Discriminant Analysis") with open(log_file_name, "a") as file: file.write( "\n Selected Classifier: Quadratic Discriminant Analysis" ) from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis qda = QuadraticDiscriminantAnalysis().fit(X, y) qda_dict = perform_prediction(model=qda, pred_pos=pred_pos, pred_neg=pred_neg, print_log=print_log) if i == 0: qda_dict_avg = qda_dict else: for k, n in zip(qda_dict_avg.keys(), qda_dict.keys()): qda_dict_avg[k] = float(qda_dict_avg[k]) + float( qda_dict[k]) if save_models.lower() in ["1", "yes", "y", "yeah", "whatever"]: save_model_pickle(qda, root_dir, file_name="qda-{}.pickle".format(i)) elapsed_time = datetime.now() - start_time
plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #define X y X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #ADASYN ada = ADASYN() os_X,os_y = ada.fit_sample(X_train, y_train) #QDA clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True) clf_QDA.fit(os_X, os_y) y_true, y_pred = y_test, clf_QDA.predict(X_test) #F1_score, precision, recall, specifity, G score print "F1_score : %.4g" % metrics.f1_score(y_true, y_pred) print "Recall : %.4g" % metrics.recall_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) print "Precision : %.4g" % metrics.precision_score(y_true, y_pred) #Compute confusion matrix cnf_matrix = confusion_matrix(y_test,y_pred) np.set_printoptions(precision=2) print "Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) print "G score: " , math.sqrt(recall/ specifity)
# Load the orthopedic-patients dataset and shuffle the rows so the
# positional train/test split below is not ordered by class.
raw_data = pd.read_csv("./Biomechanical features of orthopedic patients.csv")
raw_data = raw_data.sample(frac=1).reset_index(drop=True)
# Six biomechanical measurements used as predictors.
inputs = raw_data[[
    'pelvic_incidence', 'pelvic_tilt numeric', 'lumbar_lordosis_angle',
    'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis'
]]
outputs = raw_data[['class']]
# Encode the string labels numerically: Normal -> 1, Abnormal -> 0.
linear_Regression_Dictionary = {"Normal": 1, "Abnormal": 0}
output_linr = outputs.replace({"class": linear_Regression_Dictionary})
# Positional split: the first rows train, the remainder test.
# NOTE(review): .loc slicing is label-inclusive, so row 248 appears in BOTH
# the training and the test set — confirm this one-row overlap is intended.
inputs_train = inputs.loc[:248]
inputs_test = inputs.loc[248:]
outputs_train = output_linr.loc[:248]
outputs_test = output_linr.loc[248:]
# Fit QDA and predict hard class labels for the held-out rows.
qda_classification = QuadraticDiscriminantAnalysis()
qda_classification.fit(inputs_train, outputs_train)
prediction = qda_classification.predict(inputs_test)
# predict() already returns 0/1 labels, so this 0.5 threshold is effectively
# a pass-through; kept as-is for identical behavior.
reverting = lambda x: 1 if (x > 0.5) else 0
finalPrediction = pd.DataFrame(np.array([reverting(xi) for xi in prediction]))
# NOTE(review): MAE/MSE/RMSE are regression metrics applied here to binary
# class labels; accuracy or a confusion matrix may be more informative — confirm.
print('MAE:', metrics.mean_absolute_error(outputs_test, finalPrediction))
print('MSE:', metrics.mean_squared_error(outputs_test, finalPrediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(outputs_test, finalPrediction)))
def run_predict(): master_sdata = [] today = datetime.datetime.today() stocks = collections.OrderedDict([('BP', 0), ('SWN', 0), ('GLD', 0), ('USO', 0), ('^DJI', 0), ('CVX', 0)]) for s in stocks.keys(): sdata, today_df = retrieve_data(s, datetime.datetime(2007, 1, 1), today, lags=5) # Create training data - can change lag if needed lag_train_data = sdata[[ "Lag1 PercChange", "Lag2 PercChange", "Lag3 PercChange", "Lag4 PercChange" ]] today_train_data = today_df[[ "Lag1 PercChange", "Lag2 PercChange", "Lag3 PercChange", "Lag4 PercChange" ]] dir_train_data = sdata["Direction"] today_train_data1 = lag_train_data.append(today_train_data) # Test data start - one year ago test_start_date = datetime.datetime.now() - relativedelta(years=1) lag_train_set = today_train_data1[ today_train_data1.index < test_start_date] lag_test_set = today_train_data1[ today_train_data1.index >= test_start_date] dir_train_set = dir_train_data[dir_train_data.index < test_start_date] dir_test_set = dir_train_data[dir_train_data.index >= test_start_date] #scaler = StandardScaler() #scaler.fit(lag_train_set) #scaler.fit(dir_train_set) #lag_train_set = scaler.transform(lag_train_set) #dir_train_set = scaler.transform(dir_train_set) #lag_test_set = scaler.transform(lag_test_set) #print("LAG_TRAIN") #print_full(lag_train_set) #print("DIR_TRAIN") #print_full(dir_train_set) #print(dir_train_set['continuous']) # Prediction results pred = pd.DataFrame(index=lag_test_set.index) #print("PREDPRED") #print(pred.index) pred["Actual"] = dir_test_set # Running machine learning analysis with the models models = [("SVC", SVC()), ("LR", LogisticRegression(solver='lbfgs', multi_class='multinomial')), ("Forest", RandomForestRegressor(n_estimators=1, n_jobs=-1)), ("LDA", LinearDiscriminantAnalysis()), ("QDA", QuadraticDiscriminantAnalysis()), ("NN", MLPClassifier(algorithm='sgd', alpha=1e-5, learning_rate='adaptive', learning_rate_init=0.0001, hidden_layer_sizes=(5, 8), random_state=3, max_iter=400, 
activation='relu'))] for m in models: run_analysis(m[0], m[1], lag_train_set, dir_train_set, lag_test_set, pred) pred = pred.ix[1:] #print_full(pred) man_date = '2016-4-18' #print("Actual for " + s + " " + str(pred.ix[man_date]["Actual"])) #print("Prediction SVM: " + str(pred.ix[man_date]["SVC"])) #print("Prediction Linear Regression: " + str(pred.ix[man_date]["LR"])) #print("Prediction Linear Discriminant Analysis: " + str(pred.ix[man_date]["LDA"])) #print("Prediction Quad Discriminate Analysis: " + str(pred.ix[man_date]["QDA"])) #print("Prediction Random Forest: " + str(pred.ix[man_date]["Forest"])) #print("Prediction Neural Network: " + str(pred.ix[man_date]["NN"])) stocks[s] = pred.ix[-1]["NN"] master_sdata.append(sdata) return master_sdata, stocks
def classifiers_evaluation(df_res, y):
    """Benchmark many sklearn classifiers under several feature scalers.

    For each preprocessor, the features are transformed once, then every
    classifier is scored with ROC AUC over a 10-split stratified shuffle
    split; the per-classifier scores are averaged and printed.

    :param df_res: feature matrix (indexable by integer positions after
        ``fit_transform``).
    :param y: target labels, indexable by the split indices.
    :return: list of ``[preprocessor_name, log_dataframe]`` pairs, one per
        preprocessor.
    """
    # NOTE(review): these instances are reused (re-fit) across all splits and
    # preprocessors rather than cloned per fold — confirm this is intended.
    classifiers = [
        LinearSVC(), LinearSVR(), KNeighborsClassifier(3), SVC(probability=True),
        NuSVC(), DecisionTreeClassifier(), RandomForestClassifier(),
        AdaBoostClassifier(), GradientBoostingClassifier(), BernoulliNB(),
        GaussianNB(), LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(),
        LogisticRegression(), MLPClassifier(max_iter=600), SGDClassifier(max_iter=600),
        LogisticRegressionCV(max_iter=600)
    ]
    res = list()
    preprocess = [
        preprocessing.QuantileTransformer(), preprocessing.MinMaxScaler(),
        preprocessing.Normalizer(), preprocessing.StandardScaler(),
        preprocessing.RobustScaler(), preprocessing.MaxAbsScaler()
    ]
    for processor in preprocess:
        # Transform the full feature matrix with the current scaler.
        X = processor.fit_transform(df_res)
        log_cols = ["Classifier", "ROC_AUC score"]
        log = pd.DataFrame(columns=log_cols)
        sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        # Accumulates the summed AUC per classifier name across the 10 splits.
        acc_dict = {}
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for clf in classifiers:
                name = clf.__class__.__name__
                clf.fit(X_train, y_train)
                train_predictions = clf.predict(X_test)
                # acc = accuracy_score(y_test, train_predictions)
                # AUC computed from hard label predictions, not probabilities.
                acc = roc_auc_score(y_test, train_predictions)
                if name in acc_dict:
                    acc_dict[name] += acc
                else:
                    acc_dict[name] = acc
        for clf in acc_dict:
            # Average over the splits; 10.0 mirrors n_splits=10 above.
            acc_dict[clf] = acc_dict[clf] / 10.0
            log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
            log = log.append(log_entry)
        print(processor.__class__.__name__)
        print(log)
        res.append([processor.__class__.__name__, log])
    return res
def quadratic_discriminant(x_train, y_train, x_test, y_test):
    """Train and evaluate a QDA classifier via the shared fit helper.

    Delegates to ``__fit_clf_model`` with the label 'quadratic_discriminant'
    and a fresh, default-configured QDA instance.
    """
    return __fit_clf_model(
        'quadratic_discriminant',
        QuadraticDiscriminantAnalysis(),
        x_train, y_train, x_test, y_test,
    )
def performance_analysis(self): """ Analyze and print to stdout the performances of a big list of classifiers, in order to include only the best ones in the final version of RiskInDroid. :return: None. """ # Category of permissions for which to calculate the performances. _cat = 'declared' _k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.seed) # The original list of classifiers taken into consideration, before selecting # only the best ones for RiskInDroid. _all_models = (SVC(kernel='linear', probability=True, random_state=self.seed), GaussianNB(), MultinomialNB(), BernoulliNB(), DecisionTreeClassifier(random_state=self.seed), RandomForestClassifier(random_state=self.seed), AdaBoostClassifier(random_state=self.seed), GradientBoostingClassifier(random_state=self.seed), SGDClassifier(loss='log', random_state=self.seed), LogisticRegression(random_state=self.seed), LogisticRegressionCV(random_state=self.seed), KNeighborsClassifier(), LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), MLPClassifier(random_state=self.seed)) _training_sets = list(self.get_training_vectors_3_sets()) for model in _all_models: print('\n\n\nAnalysis of ' + model.__class__.__name__ + ':') # Goodware and malware scores for the current model. _malware_scores = numpy.array([]) _goodware_scores = numpy.array([]) # Correctly predicted targets for the current model. _ok_targets = numpy.array([]) # We analyze the 3 training sets for each model. for (index, current_set) in enumerate(_training_sets): # current_set[0] = application set # current_set[1] = application targets # Goodware and malware scores for the current set. _loc_m_scores = numpy.array([]) _loc_g_scores = numpy.array([]) # Correctly predicted targets for the current set. _loc_ok_targets = numpy.array([]) # The analysis is done using 10-cross fold validation. 
for train_index, test_index in _k_fold.split( current_set[0][_cat], current_set[1]): _train_data = numpy.array(current_set[0][_cat]) _train_targets = numpy.array(current_set[1]) model.fit(_train_data[train_index], _train_targets[train_index]) # Correctly predicted targets for the current fold. _fold_ok_targets = 0 for loc_index in test_index: proba = list( zip( model.classes_, model.predict_proba([_train_data[loc_index] ])[0])) # The malware probability is considered as the risk value. if proba[0][0] == b'malware': _result = proba[0] else: _result = proba[1] # We consider only correct predictions for calculating the mean # and the standard deviation. _true_target = _train_targets[loc_index] # If the current app under test is a malware. if _result[1] >= 0.5: # If the prediction is correct. if _result[0] == _true_target: _fold_ok_targets += 1 _loc_m_scores = numpy.append( _loc_m_scores, _result[1]) # If the current app under test is not a malware. else: # If the prediction is correct. if _result[0] != _true_target: _fold_ok_targets += 1 _loc_g_scores = numpy.append( _loc_g_scores, _result[1]) _loc_ok_targets = numpy.append( _loc_ok_targets, _fold_ok_targets / len(test_index)) print(' set_{0}:'.format(index + 1)) print(' accuracy: {0:.2f}'.format( _loc_ok_targets.mean() * 100)) print(' malware mean: {0:.2f}'.format( _loc_m_scores.mean() * 100)) print(' malware std_dev: {0:.2f}'.format( _loc_m_scores.std() * 100)) print(' goodware mean: {0:.2f}'.format( _loc_g_scores.mean() * 100)) print(' goodware std_dev: {0:.2f}'.format( _loc_g_scores.std() * 100)) _ok_targets = numpy.append(_ok_targets, _loc_ok_targets) _malware_scores = numpy.append(_malware_scores, _loc_m_scores) _goodware_scores = numpy.append(_goodware_scores, _loc_g_scores) print(' total:') print(' accuracy: {0:.2f}'.format(_ok_targets.mean() * 100)) print(' malware mean: {0:.2f}'.format( _malware_scores.mean() * 100)) print(' malware std_dev: {0:.2f}'.format( _malware_scores.std() * 100)) print(' goodware 
mean: {0:.2f}'.format( _goodware_scores.mean() * 100)) print(' goodware std_dev: {0:.2f}'.format( _goodware_scores.std() * 100))
# Header for Features without Labels features = [str(i) for i in range(1, 1583)] # Standarize the DATA X = df.loc[:, features].values Y = df.loc[:, 'label'].values X = StandardScaler().fit_transform(X) # PCA # n : Number of principal components n = 90 pca = PCA(n_components=n) X = pca.fit_transform(X) #Split data to train and test X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2) #Create a Quadratic Discriminant Analysis instance classifier = QuadraticDiscriminantAnalysis() #Fit the classifier classifier.fit(X_train, Y_train) #Calculate the score (Accuracy) score = classifier.score(X_test, Y_test) #Printing the score print(score)
def classify_through_discriminant_analysis(classification_data=None):
    """Run the generic classification pipeline with a QDA classifier.

    :param classification_data: dict of data for ``general_classifier``;
        defaults to an empty dict when omitted.
    :return: whatever ``general_classifier`` returns for the fitted QDA.
    """
    # BUG FIX: the original used a mutable default argument (={}), which is
    # shared across every call; use a None sentinel instead. Callers that
    # passed no argument still get an empty dict, so behavior is unchanged.
    if classification_data is None:
        classification_data = {}
    clf = QuadraticDiscriminantAnalysis()
    return general_classifier(classification_data, clf)
############################################################################### # 3. Create train and test set # ############################################################################### X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1000) ############################################################################### # 4. Classifiers # ############################################################################### # Create list of tuples with classifier label and classifier object classifiers = {} classifiers.update({"LDA": LinearDiscriminantAnalysis()}) classifiers.update({"QDA": QuadraticDiscriminantAnalysis()}) classifiers.update({"AdaBoost": AdaBoostClassifier()}) classifiers.update({"Bagging": BaggingClassifier()}) classifiers.update({"Extra Trees Ensemble": ExtraTreesClassifier()}) classifiers.update({"Gradient Boosting": GradientBoostingClassifier()}) classifiers.update({"Random Forest": RandomForestClassifier()}) classifiers.update({"Ridge": RidgeClassifier()}) classifiers.update({"SGD": SGDClassifier()}) classifiers.update({"BNB": BernoulliNB()}) classifiers.update({"GNB": GaussianNB()}) classifiers.update({"KNN": KNeighborsClassifier()}) classifiers.update({"MLP": MLPClassifier()}) classifiers.update({"LSVC": LinearSVC()}) classifiers.update({"NuSVC": NuSVC()}) classifiers.update({"SVC": SVC()}) classifiers.update({"DTC": DecisionTreeClassifier()})
def learn(self, fname, file_data=None): csvfile = None if file_data: # base64 and gzipped file data = base64.b64decode(file_data) # data = zlib.decompress(data, 16 + zlib.MAX_WBITS) data = gzip.decompress(data) csvfile = StringIO(data.decode('utf-8')) else: csvfile = open(fname, 'r') t = time.time() # load CSV file self.header = [] rows = [] naming_num = 0 # with open(fname, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=',') for i, row in enumerate(reader): self.logger.debug(row) if i == 0: self.header = row else: for j, val in enumerate(row): if j == 0: # this is a name of the location if val not in self.naming['from']: self.naming['from'][val] = naming_num self.naming['to'][naming_num] = val naming_num += 1 row[j] = self.naming['from'][val] continue if val == '': row[j] = 0 continue try: row[j] = float(val) except: self.logger.error("problem parsing value " + str(val)) rows.append(row) csvfile.close() # first column in row is the classification, Y y = numpy.zeros(len(rows)) x = numpy.zeros((len(rows), len(rows[0]) - 1)) # shuffle it up for training record_range = list(range(len(rows))) shuffle(record_range) for i in record_range: y[i] = rows[i][0] x[i, :] = numpy.array(rows[i][1:]) names = [ "Nearest Neighbors", "Linear SVM", "RBF SVM", # "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "QDA" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025, probability=True), SVC(gamma=2, C=1, probability=True), # GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis() ] self.algorithms = {} # split_for_learning = int(0.70 * len(y)) for name, clf in zip(names, classifiers): t2 = time.time() self.logger.debug("learning {}".format(name)) try: self.algorithms[name] = self.train(clf, x, y) # score = 
self.algorithms[name].score(x,y) # logger.debug(name, score) self.logger.debug("learned {}, {:d} ms".format( name, int(1000 * (t2 - time.time())))) except Exception as e: self.logger.error("{} {}".format(name, str(e))) self.logger.debug("{:d} ms".format(int(1000 * (t - time.time()))))
def analysis_results(options): """ Analyzes the results of the comparisons """ # Start marker for time measure start = time.time() print("\n\t\t------------------------------------------------------------------------------------------------------------------------\n") print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n") print("\t\t------------------------------------------------------------------------------------------------------------------------\n") # Get the script path main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) toolbox_dir = os.path.join(main_path, 'diana/toolbox') # Check the directory of the profiles, comparisons and analysis data_dir = os.path.join(options.workspace, "profiles") check_directory(data_dir) results_dir = os.path.join(options.workspace, "comparisons") check_directory(results_dir) analysis_dir = os.path.join(options.workspace, "analysis") check_directory(analysis_dir) # Get the list of thresholds to create the profiles if options.threshold_list and fileExist(options.threshold_list): threshold_list = get_values_from_threshold_file(options.threshold_list) else: threshold_list = [1, 5, 10, 20, 50] # Do we consider Side Effects/ATC? 
if options.consider_se: consider_se = True else: consider_se = False # Get the names of the columns columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se) #-----------------------------------------------------# # PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME # #-----------------------------------------------------# pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl') pair2comb = cPickle.load(open(pair2comb_file)) ddi = sum(1 for x in pair2comb.values() if x == 1) non_ddi = sum(1 for x in pair2comb.values() if x == 0) print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi)) print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi)) output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv') if not fileExist(output_dataframe): # Create a data frame to store the results df = pd.DataFrame(columns=columns) # Obtain all the results subfolders of the results main folder results_dir_list = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))] for comparison in results_dir_list: drug_id1, drug_id2 = comparison.split('---') comparison_dir = os.path.join(results_dir, comparison) results_table = os.path.join(comparison_dir, 'results_table.tsv') # Add the Comb field (if it is drug combination or not) drug1 = drug_id1.split('_')[0].upper() drug2 = drug_id2.split('_')[0].upper() comparison_without_id = '{}---{}'.format(drug1, drug2) if comparison_without_id in pair2comb: combination_field = pair2comb[comparison_without_id] else: print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id)) print(pair2comb) sys.exit(10) if not fileExist(results_table): print('The comparison {} has not been executed properly!\n'.format(comparison)) sys.exit(10) results = get_results_from_table(results_table, columns, combination_field) df2 = pd.DataFrame([results], columns=columns, index=[comparison]) # Add the information to the main data frame df = df.append(df2) # Output the Pandas 
dataframe in a CSV file df.to_csv(output_dataframe) else: df = pd.read_csv(output_dataframe, index_col=0) #---------------------------# # REMOVE MISSING VALUES # #---------------------------# # Replace the None values in dcstructure by nan if 'None' in df['dcstructure']: df = df.replace(to_replace={'dcstructure':{'None':np.nan}}) # Remove the nan values in dcstructure df = df.dropna() # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc)) print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc)) #---------------------------# # IDENTIFY ME-TOO DRUGS # #---------------------------# me_too_dir = os.path.join(analysis_dir, 'me_too_drugs') create_directory(me_too_dir) me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv') me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv') me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl') me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl') if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file): df_struc = df[['dcstructure']] df_struc = df_struc.astype(float) me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table) cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w')) cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w')) else: me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file)) me_too_drug_comb_pairs = cPickle.load(open(me_too_drug_comb_pairs_file)) # Process me-too drug combination pairs me_too_drug_combinations = set() drug_pair_to_me_too_times = {} for pair in me_too_drug_comb_pairs: drug_comb1, 
drug_comb2 = pair.split('___') me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2])) drug_pair_to_me_too_times.setdefault(drug_comb1, 0) drug_pair_to_me_too_times.setdefault(drug_comb2, 0) drug_pair_to_me_too_times[drug_comb1] += 1 drug_pair_to_me_too_times[drug_comb2] += 1 removed_drug_pairs = set() for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs: continue if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]: removed_drug_pairs.add(drug_comb1) else: removed_drug_pairs.add(drug_comb2) # Remove the drug pairs which appear in me-too pairs of drug pairs more times df = df.loc[~df.index.isin(list(removed_drug_pairs))] # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc)) print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc)) #-------------------------------------# # EVALUATE PERFORMANCE BY TARGETS # #-------------------------------------# img_dir = os.path.join(analysis_dir, 'figures') create_directory(img_dir) fig_format = 'png' tables_dir = os.path.join(analysis_dir, 'tables') create_directory(tables_dir) # Number of targets num_targets = [[1],[2],[3,4,5,6],[7]] # Names of the methods if consider_se: if options.different_atc: types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcse', 'random'] types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcse'] # Without random!! 
#types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcSE', 'Random'] types_analysis_labels = [ 'Target', 'PPI','Structure', 'Side Effects', 'Random'] else: types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random'] types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse'] # Without random!! #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcATC', 'dcSE', 'Random'] types_analysis_labels = [ 'Target', 'PPI','Structure', 'ATC', 'Side Effects', 'Random'] else: types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random'] types_analysis2 = ['dctargets', 'dcguild', 'dcstructure'] # Without random!! types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'Random'] types_analysis_labels = [ 'Target', 'PPI','Structure', 'Random'] # Machine learning parameters repetitions = 25 # Number of repetititons n_fold = 2 # Number of folds min_num_dc_group = 10 greater_or_smaller = 'greater' classifier = 'SVC best 1' classifiers = { 'KNeighbors' : KNeighborsClassifier(3), 'SVC' : SVC(probability=True), 'SVC linear' : SVC(kernel="linear", C=0.025), 'SVC rbf' : SVC(gamma=2, C=1), 'DecisionTree' : DecisionTreeClassifier(max_depth=5), 'RandomForest' : RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 'MLP' : MLPClassifier(alpha=1), 'AdaBoost' : AdaBoostClassifier(), 'GaussianNB' : GaussianNB(), 'QuadraticDiscr.' 
: QuadraticDiscriminantAnalysis(), 'SVC best 1' : SVC(kernel="rbf", gamma=0.01, C=100, probability=True), 'SVC best 2' : SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True) } if options.pca: pca_str = '_withPCA' else: pca_str = '_withoutPCA' # Plot of distributions of AUC plot_auc_distribution = os.path.join(img_dir, 'numtargets_auc_distribution_ranges{}.{}'.format(pca_str, fig_format)) # Plot of accuracy/sensitivity name acc_sens_dctargets = os.path.join(img_dir, 'numtargets_accsens_dctargets_ranges{}.{}'.format(pca_str, fig_format)) acc_sens_dcguild = os.path.join(img_dir, 'numtargets_accsens_dcguild_ranges{}.{}'.format(pca_str, fig_format)) acc_sens_dcstructure = os.path.join(img_dir, 'numtargets_accsens_dcstructure_ranges{}.{}'.format(pca_str, fig_format)) acc_sens_dcatc = os.path.join(img_dir, 'numtargets_accsens_dcatc_ranges{}.{}'.format(pca_str, fig_format)) acc_sens_dcse = os.path.join(img_dir, 'numtargets_accsens_dcse_ranges{}.{}'.format(pca_str, fig_format)) # Results table results_table = os.path.join(tables_dir, 'numtargets_auc_table_ranges{}.txt'.format(pca_str)) # Accuracy/Sensitivity results table prec_rec_table = os.path.join(tables_dir, 'numtargets_accsens_table_ranges{}.txt'.format(pca_str)) # File with results of Mann Whitney tests mannwhitney_file = os.path.join(tables_dir, 'numtargets_mannwhitney_ranges{}.txt'.format(pca_str)) # Get the targets file drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl') drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file)) # Get the DIANA IDs file diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl') diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file)) analysis_results = {} # Defining the dictionary that will store the results if consider_se: dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se) else: dct_columns, dcg_columns, dcs_columns 
= diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se) for range_tar in num_targets: selected_rows = [] for index, row in df.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() if len(range_tar) == 1: # If it is the first of the range if range_tar == num_targets[0]: if len(drugbank_to_targets[drug1]) <= range_tar[0] and len(drugbank_to_targets[drug2]) <= range_tar[0]: selected_rows.append(index) # If it is the last of the range elif range_tar == num_targets[len(num_targets)-1]: if len(drugbank_to_targets[drug1]) >= range_tar[0] and len(drugbank_to_targets[drug2]) >= range_tar[0]: selected_rows.append(index) # If it is in the middle of the range else: if len(drugbank_to_targets[drug1]) == range_tar[0] and len(drugbank_to_targets[drug2]) == range_tar[0]: selected_rows.append(index) else: if len(drugbank_to_targets[drug1]) in range_tar and len(drugbank_to_targets[drug2]) in range_tar: selected_rows.append(index) df_tar = df.ix[selected_rows] dc_data = df_tar[df_tar['combination'] == 1] num_dc = len(dc_data.index) print('Num drug combinations: {}'.format(num_dc)) if consider_se: list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns] ] else: list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['random', columns] ] for method, columns_method in list_methods: print('Evaluating {} targets with method {}\n'.format(range_tar,method)) #------------------------------------------------------------------# # SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA # #------------------------------------------------------------------# if options.pca: variance_cut_off = 0.01 num_components = 0 df_method = df_tar[columns_method] df_raw = df_method.drop('combination', axis=1) raw_columns = 
copy.copy(columns_method) raw_columns.remove('combination') pca = PCA(n_components=None) pca.fit(df_raw) values_trans = pca.transform(df_raw) explained_variance = pca.explained_variance_ratio_ for column, var in sorted(zip(raw_columns, explained_variance), key=lambda x: x[1], reverse=True): #print(column, var) if var > variance_cut_off: num_components += 1 if num_components < len(raw_columns): print('Number of features:\t{}\n'.format(len(raw_columns))) print('Reduction to {} components\n'.format(num_components)) pca = PCA(n_components=num_components) pca.fit(df_raw) values_trans = pca.transform(df_raw) indexes = df_method.index.values df_trans = pd.DataFrame.from_records(values_trans, index=indexes) df_comb = df_method[['combination']] df_new = pd.concat([df_trans, df_comb], axis=1) df_method = df_new else: # Manually introduced features guild_thresholds = [1, 5] rank_scoring = ['spearman', 'dot_product'] list_scoring = ['jaccard'] if method == 'Combination' or method == 'random': selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se) else: selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(method, guild_thresholds, rank_scoring, list_scoring) # Remove ATC columns if different ATC if options.different_atc and consider_se: selected_columns = [col for col in selected_columns if col not in dcatc_columns or col == 'combination'] print('Selected columns: {}\n'.format(', '.join(selected_columns))) print('Number of selected features: {}\n'.format(len(selected_columns)-1)) # We take away the combinations column # Define the new table with the selected columns df_method = df_tar[selected_columns] dc_data = df_method[df_method['combination'] == 1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) #------------------------------------------------------------------# dc_data = df_method[df_method['combination'] == 
1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Building {} repetition groups of {} (same) DC and {} (different) non-DC'.format(repetitions,num_dc,num_dc)) ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times mean_aucs = [] # Here we will store the means of AUCs from the cross-validations std_aucs = [] # Here we will store the standard deviations of the AUCs from the cross-validations all_aucs = [] # Here we will store ALL the AUCs all_probs = [] # Here we store all the probabilities and labels num_repetitions=0 for ndc_data_equal in ndc_repetitions: num_repetitions+=1 num_items_group = int( float(num_dc) / float(n_fold) ) # Calculate the number of items in each group of the cross-validation if num_repetitions == 1: print('Building {} fold groups of {} DC and {} non-DC x {} repetitions'.format(n_fold,num_items_group,num_items_group, repetitions)) dc_groups = diana_analysis.obtain_n_groups_of_k_length(dc_data, n_fold, num_items_group, me_too_drug_combinations) # Defining the drug combination groups in each cross-validation step ndc_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations) # Defining the non-drug combination groups in each cross-validation step merged_groups = [pd.concat([x,y]) for x,y in zip(dc_groups, ndc_groups)] if method == 'random': #mean, var, std, list_auc = run_nfold_crossvalidation_random(n_fold, merged_groups, classifiers[classifier]) mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(n_fold, merged_groups, classifiers[classifier]) else: mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(n_fold, merged_groups, classifiers[classifier]) mean_aucs.append(mean) std_aucs.append(std) all_aucs = 
all_aucs + list_auc all_probs = all_probs + list_prob final_mean = np.mean(all_aucs) #final_mean = np.mean(mean_aucs) std = np.std(all_aucs) mean_std = np.mean(std_aucs) std_means = np.std(mean_aucs) print('FINAL MEAN: {}'.format(final_mean)) print('STD: {}\n'.format(std)) #print('MEAN of STD: {}'.format(mean_std)) # Store the distribution of AUCs in the dictionary analysis_results.setdefault(range_tar[0], {}) analysis_results[range_tar[0]].setdefault(method, {}) analysis_results[range_tar[0]][method]['all_aucs'] = all_aucs analysis_results[range_tar[0]][method]['all_probs'] = all_probs analysis_results[range_tar[0]][method]['mean'] = final_mean analysis_results[range_tar[0]][method]['std'] = std analysis_results[range_tar[0]][method]['num_dc'] = num_dc #------------------------------------# # PLOT PRECISION VS. SENSITIVITY # #------------------------------------# analysis_results = plot_precision_sensitivity(analysis_results, 'dctargets', num_targets, acc_sens_dctargets) analysis_results = plot_precision_sensitivity(analysis_results, 'dcguild', num_targets, acc_sens_dcguild) analysis_results = plot_precision_sensitivity(analysis_results, 'dcstructure', num_targets, acc_sens_dcstructure) if consider_se: analysis_results = plot_precision_sensitivity(analysis_results, 'dcatc', num_targets, acc_sens_dcatc) analysis_results = plot_precision_sensitivity(analysis_results, 'dcse', num_targets, acc_sens_dcse) #----------------------------------------------------# # PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS # #----------------------------------------------------# plot_auc_distributions(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se) #--------------------------------------------------------# # TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS # #--------------------------------------------------------# with open(results_table, 'w') as results_table_fd: # Header 
results_table_fd.write(' ') for method in types_analysis_labels: results_table_fd.write('\t{}\t \t '.format(method)) results_table_fd.write('\n') for num in num_targets: results_table_fd.write('{}'.format(num)) for method in types_analysis: mean = analysis_results[num[0]][method]['mean'] std = analysis_results[num[0]][method]['std'] num_dc = analysis_results[num[0]][method]['num_dc'] results_table_fd.write('\t{}\t{}\t{}'.format(mean, std, num_dc)) results_table_fd.write('\n') #----------------------------------------# # TABLE OF PRECISION VS. SENSITIVITY # #----------------------------------------# with open(prec_rec_table, 'w') as prec_rec_table_fd: # Header prec_rec_table_fd.write(' ') for method in types_analysis2: prec_rec_table_fd.write('\t{}\t '.format(method)) prec_rec_table_fd.write('\n') for num in num_targets: prec_rec_table_fd.write('{}'.format(num)) for method in types_analysis2: cut_off = analysis_results[num[0]][method]['cut_off'] value = analysis_results[num[0]][method]['value'] prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value)) prec_rec_table_fd.write('\n') #-------------------------------------------------------------------# # TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U # #-------------------------------------------------------------------# with open(mannwhitney_file, 'w') as mannwhitney_fd: mann_results = {} mannwhitney_fd.write(' \t ') for method in types_analysis_labels: mannwhitney_fd.write('\t{}'.format(method)) mannwhitney_fd.write('\n') # Perform the comparisons for num in num_targets: mann_results.setdefault(num[0], {}) for method1 in types_analysis: mann_results[num[0]].setdefault(method1, {}) for method2 in types_analysis: if method1 == method2: mann_results[num[0]][method1][method2] = '-' else: method1_dist = analysis_results[num[0]][method1]['all_aucs'] method2_dist = analysis_results[num[0]][method2]['all_aucs'] stat, pval = scipy.stats.mannwhitneyu(method1_dist, method2_dist) 
mann_results[num[0]][method1][method2] = [stat, pval] # Write the table of crossings for num in num_targets: for method1 in types_analysis: mannwhitney_fd.write('{}\t{}'.format(num[0], method1)) for method2 in types_analysis: if method1 == method2: mannwhitney_fd.write('\t-') else: stat, pval = mann_results[num[0]][method1][method2] mannwhitney_fd.write('\t{}, {:.2e}'.format(stat,pval)) mannwhitney_fd.write('\n') # End marker for time end = time.time() print('\n DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60)) return
def classiferCompare(X_train, X_test, Y_train, Y_test):
    """Train a suite of classifiers and report metrics plus ROC subplots.

    Args:
        X_train, X_test: feature matrices.
        Y_train, Y_test: binary label vectors.

    Side effects:
        Prints 5-fold CV score, test accuracy, precision, recall, F1 and a
        classification report per model, then shows one ROC subplot per model.
    """
    # BUG FIX: the original `names` list contained an extra "XGBoost" entry
    # with no matching estimator in `classifiers`; `zip` silently shifted
    # every subsequent label by one (and dropped the last model's report).
    names = [
        "KNeighborsClassifier",
        "Linear SVM",
        "Decision Tree",
        "Stochastic Gradient Descent",
        "Gaussian Process",
        "LDA",
        "QDA",
        "Random Forest",
        "GaussianNB",
        "AdaBoost",
        "LogisticRegression(L1)",
        "LogisticRegression(L2)",
    ]
    classifiers = [
        KNeighborsClassifier(3),
        LinearSVC(C=1, penalty='l1', loss='squared_hinge', dual=False),
        DecisionTreeClassifier(),
        SGDClassifier(loss="perceptron", penalty="l2"),
        GaussianProcessClassifier(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        RandomForestClassifier(n_estimators=200, max_features=15),
        GaussianNB(),
        AdaBoostClassifier(),
        LogisticRegression(penalty='l1'),
        LogisticRegression(penalty='l2'),
    ]

    figure = plt.figure(figsize=(27, 9))
    print("Start training!")
    plot_number = 1
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, Y_train)
        # NOTE(review): cross_val_score refits the estimator internally, so
        # the fit above only matters for the held-out test metrics below.
        train_score = cross_val_score(clf, X_train, Y_train, cv=5)
        test_score = clf.score(X_test, Y_test)
        Y_pred = clf.predict(X_test)
        precision = precision_score(Y_test, Y_pred)
        recall = recall_score(Y_test, Y_pred)
        f1 = f1_score(Y_test, Y_pred)
        print("***", name, "***")
        print("Train Score:", train_score.mean())
        print("Test Score:", test_score)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score", f1)
        print(classification_report(Y_test, Y_pred))
        # Plot ROC curve.
        # NOTE(review): the curve is computed from hard predictions rather
        # than probability scores, so it effectively has one threshold.
        # BUG FIX: subplot geometry must be an int — `len(classifiers) / 2`
        # is a float on Python 3 and is rejected by current matplotlib.
        ax = plt.subplot(4, len(classifiers) // 2, plot_number)
        fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, color='darkorange',
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.title(name)
        plt.xlabel('False positive rate', fontsize=12)
        plt.ylabel('True positive rate', fontsize=12)
        plt.legend(loc="lower right")
        plot_number += 1
    plt.subplots_adjust(hspace=0.5)
    plt.show()
def main():
    """Orchestrate data analysis.

    Loads `config.json` for input paths and column roles, engineers a
    `blood_per_donation` feature, cross-validates a suite of classifiers on
    the training data, then trains one hard-coded model on all the data and
    writes predictions for the test set.
    """
    # Load configuration file which describes the problem
    with open('config.json') as data_file:
        config = json.load(data_file)

    # Load data.
    # NOTE(review): `DataFrame.ix` was removed in pandas 1.0 — this code only
    # runs on old pandas. Migrating needs `.loc` (labels) or `.iloc`
    # (positions); whether `factor_columns`/`feature_columns` hold labels or
    # positions cannot be told from here — check config.json before porting.
    train_df = pd.read_csv(config['input']['train'])
    test_df = pd.read_csv(config['input']['test'])
    for factor_column in config['input']['factor_columns']:
        train_df.ix[:, factor_column] = (
            train_df.ix[:, factor_column].astype('category'))
    train_x = train_df.ix[:, config['input']['feature_columns']]
    test_x = test_df.ix[:, config['input']['feature_columns']]
    test_ids = test_df.ix[:, 0]
    train_y = train_df.ix[:, config['input']['label_column']]

    # Add new colum: volume donated per donation (col 3 / col 2, positional),
    # then drop the raw volume column it was derived from.
    train_x['blood_per_donation'] = train_x.ix[:, 3] / train_x.ix[:, 2]
    del train_x['Total Volume Donated (c.c.)']
    test_x['blood_per_donation'] = test_x.ix[:, 3] / test_x.ix[:, 2]
    del test_x['Total Volume Donated (c.c.)']

    # Simple statistics
    print(train_x.describe(include='all'))
    print(train_y.describe(include='all'))
    print("# Class 1: %i \t\t # class 0: %i" %
          (sum(train_y), len(train_y) - sum(train_y)))

    # It's easier to work with numpy.
    # NOTE(review): `.as_matrix()` was removed in pandas 1.0; the modern
    # equivalent is `.to_numpy()`.
    train_x_orig = train_x.as_matrix()
    train_y_orig = train_y.as_matrix()

    # Shuffle data
    perm = np.random.permutation(len(train_y_orig))
    train_x_orig = train_x_orig[perm]
    train_y_orig = train_y_orig[perm]

    # Get classifiers.
    # (A large block of commented-out RBM / TensorFlow-NN pipeline
    # experiments was removed here for readability.)
    classifiers = [
        ('Logistic Regression (C=1)', LogisticRegression(C=1)),
        ('Logistic Regression (C=1000)', LogisticRegression(C=10000)),
        ('SVM, adj.', SVC(probability=True,
                          kernel="rbf",
                          C=2.8,
                          gamma=.0073,
                          cache_size=200)),
        ('k nn (k=3)', KNeighborsClassifier(3)),
        ('k nn (k=5)', KNeighborsClassifier(5)),
        ('k nn (k=7)', KNeighborsClassifier(7)),
        ('k nn (k=21)', KNeighborsClassifier(21)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=50, n_jobs=10)),
        ('Random Forest 2', RandomForestClassifier(max_depth=5,
                                                   n_estimators=10,
                                                   max_features=1,
                                                   n_jobs=10)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis())
    ]

    # 5-fold cross-validation of every classifier.
    kf = KFold(n_splits=5)
    i = 0
    for clf_name, clf in classifiers:
        print("-" * 80)
        print("Name: %s (%i)" % (clf_name, i))
        score_estimates = []
        for train_ids, val_ids in kf.split(train_x_orig):
            # Split labeled data into training and validation.
            # NOTE(review): these rebindings shadow the earlier DataFrames
            # `train_x`/`train_y` with numpy fold slices.
            train_x = train_x_orig[train_ids]
            train_y = train_y_orig[train_ids]
            val_x = train_x_orig[val_ids]
            val_y = train_y_orig[val_ids]
            # Train classifier
            clf.fit(train_x, train_y)
            # Estimate loss from the positive-class probability.
            val_pred = clf.predict_proba(val_x)[:, 1]
            score_estimates.append(calculate_score(val_y, val_pred))
            print("Estimated score: %0.4f" % score_estimates[-1])
        print("Average estimated score: %0.4f" %
              np.array(score_estimates).mean())
        i += 1
    print("#" * 80)

    # Train classifier on complete data.
    # NOTE(review): index 13 is a magic number — with the list above it
    # selects ('LDA', LinearDiscriminantAnalysis()); it silently changes if
    # entries are added/removed. Consider selecting by name instead.
    clf_name, clf = classifiers[13]
    print("Train %s on complete data and generated %s" %
          (clf_name, config['output']))
    clf.fit(train_x_orig, train_y_orig)

    # Predict and write output
    test_predicted = clf.predict_proba(test_x)[:, 1]
    write_solution(test_ids, test_predicted, config['output'])
def preprocess(X, y, X_val, test_data, verbose=True, scale=True,
               autoencoder=True, qda=True, knn=False, xgb=False):
    """Preprocess the data by adding features and scaling it.

    For each enabled method, a model is fit on the training data and the
    same learned transformation is applied to the validation and test data.

    Args:
        X (numpy ndarray): Training data
        y (numpy ndarray): Training labels
        X_val (numpy ndarray): Validation data
        test_data (numpy ndarray): Test data for submission
        verbose (bool): log level
        scale (bool): standard-scale the data (scaler fit on X only)
        autoencoder (bool): append autoencoder hidden-layer features
        qda (bool): append a Quadratic Discriminant Analysis P(class 1) feature
        knn (bool): append a k-nearest-neighbours P(class 1) feature
        xgb (bool): append an XGBoost P(class 1) feature

    Returns:
        Tuple (X, y, X_val, test_data) with the selected feature columns
        appended and, if requested, scaled. With every flag off the inputs
        are returned unchanged.
    """
    if autoencoder:
        if verbose:
            print("## Autoencoder")
            print("### Train...", end=" ", flush=True)
            ae = train_autoencoder(X, size=32, epochs=20, verbose=1)
        else:
            ae = train_autoencoder(X, size=32, epochs=20, verbose=0)
        if verbose:
            print("done.")
            print("### Evaluate...", end=" ", flush=True)
        ae.eval()
        # Use the first hidden layer's activations as extra feature columns.
        X_ae = ae.layer1(Variable(torch.Tensor(X))).data
        X = np.c_[X, X_ae]
        X_val_ae = ae.layer1(Variable(torch.Tensor(X_val))).data
        X_val = np.c_[X_val, X_val_ae]
        test_data_ae = ae.layer1(Variable(torch.Tensor(test_data))).data
        test_data = np.c_[test_data, test_data_ae]
        if verbose:
            print("done.")
    if qda:
        if verbose:
            print("## Quadratic Discriminant Analysis...", end=" ", flush=True)
        qdaclf = QuadraticDiscriminantAnalysis(reg_param=0.02)
        qdaclf.fit(X, y)
        # Append P(class 1) as a single feature column.
        X = np.c_[X, qdaclf.predict_proba(X)[:, 1]]
        X_val = np.c_[X_val, qdaclf.predict_proba(X_val)[:, 1]]
        test_data = np.c_[test_data, qdaclf.predict_proba(test_data)[:, 1]]
        if verbose:
            print("done.")
    if knn:
        # BUG FIX: the knn and xgb branches previously printed their progress
        # messages unconditionally, ignoring `verbose` unlike every other
        # branch; they are now gated consistently.
        if verbose:
            print("## K-Nearest Neighbours...", end=" ", flush=True)
        knnclf = KNeighborsClassifier(n_neighbors=10, p=2, n_jobs=-1)
        knnclf.fit(X, y)
        X = np.c_[X, knnclf.predict_proba(X)[:, 1]]
        X_val = np.c_[X_val, knnclf.predict_proba(X_val)[:, 1]]
        test_data = np.c_[test_data, knnclf.predict_proba(test_data)[:, 1]]
        if verbose:
            print("done.")
    if xgb:
        if verbose:
            print("## XGBoost...", end=" ", flush=True)
        xgbclf = XGBClassifier(max_depth=3, learning_rate=0.1,
                               n_estimators=1000, gamma=10,
                               min_child_weight=10,
                               objective='binary:logistic', n_jobs=4)
        xgbclf.fit(X, y)
        X = np.c_[X, xgbclf.predict_proba(X)[:, 1]]
        X_val = np.c_[X_val, xgbclf.predict_proba(X_val)[:, 1]]
        test_data = np.c_[test_data, xgbclf.predict_proba(test_data)[:, 1]]
        if verbose:
            print("done.")
    if scale:
        if verbose:
            print("## Scaling...", end=" ", flush=True)
        # Fit on training data only; reuse the same transform elsewhere.
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        X_val = scaler.transform(X_val)
        test_data = scaler.transform(test_data)
        if verbose:
            print("done.")
    return X, y, X_val, test_data
def main():
    """Fit a suite of classifiers on Numerai data and upload predictions.

    Downloads the current dataset if absent, fits all classifiers, uploads
    'legit' and 'mix' submissions concurrently, then verifies concordance:
    every legit submission must pass, and (by the logic below) no mix
    submission may pass. Exits 0 on success, 1 on any concordance failure.
    """
    # when running on circleci, set the vars in the project settings
    public_id = os.environ.get('NUMERAPI_PUBLIC_ID', '')
    secret_key = os.environ.get('NUMERAPI_SECRET_KEY', '')
    if not os.path.exists(test_csv):
        os.makedirs(test_csv)
    napi = NumeraiApiWrapper(public_id=public_id, secret_key=secret_key)
    if not os.path.exists(DATA_SET_PATH):
        logger.info("Downloading the current dataset...")
        os.makedirs(DATA_SET_PATH)
        # Download + unzip, then flatten train/tournament files one level up.
        napi.download_current_dataset(dest_path=DATA_SET_PATH,
                                      dest_filename=DATA_SET_FILE + '.zip',
                                      unzip=True)
        shutil.move(os.path.join(DATA_SET_PATH, DATA_SET_FILE, TRAIN_FILE),
                    os.path.join(DATA_SET_PATH, TRAIN_FILE))
        shutil.move(os.path.join(DATA_SET_PATH, DATA_SET_FILE, TOURN_FILE),
                    os.path.join(DATA_SET_PATH, TOURN_FILE))
    else:
        logger.info("Found old data to use.")
    training_data = pd.read_csv('%s/%s' % (DATA_SET_PATH, TRAIN_FILE),
                                header=0)
    tournament_data = pd.read_csv('%s/%s' % (DATA_SET_PATH, TOURN_FILE),
                                  header=0)
    napi.set_data(tournament_data, training_data)
    # Feature columns are identified by name substring.
    features = [f for f in list(training_data) if "feature" in f]
    features = features[:len(features) // 2]  # just use half, speed things up a bit
    X, Y = training_data[features], training_data[
        "target_bernie"]  # hardcode to target bernie for now
    x_prediction = tournament_data[features]
    ids = tournament_data["id"]
    clfs = [
        RandomForestClassifier(n_estimators=15, max_features=1, max_depth=2,
                               n_jobs=1, criterion='entropy',
                               random_state=42),
        XGBClassifier(learning_rate=0.1, subsample=0.4, max_depth=2,
                      n_estimators=20, nthread=1, seed=42),
        DecisionTreeClassifier(max_depth=5, random_state=42),
        MLPClassifier(alpha=1, hidden_layer_sizes=(25, 25), random_state=42),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(tol=1.0e-3),
        # last item can have multiple jobs since it may be the last to be processed so we have an extra core
        LogisticRegression(n_jobs=2, solver='sag', C=1, tol=1e-2,
                           random_state=42, max_iter=50)
    ]
    before = time.time()
    fit_all(clfs, X, Y)
    logger.info('all clfs fit() took %.2fs' % (time.time() - before))
    # Kick off both upload batches; each call returns futures to await below.
    before = time.time()
    uploads_wait_for_legit = predict_and_upload_legit(napi, clfs,
                                                      x_prediction, ids)
    logger.info('all legit clfs predict_proba() took %.2fs' %
                (time.time() - before))
    before = time.time()
    uploads_wait_for_mix = predict_and_upload_mix(napi, clfs,
                                                  tournament_data,
                                                  x_prediction, ids)
    logger.info('all mix clfs predict_proba() took %.2fs' %
                (time.time() - before))
    legit_submission_ids = list()
    mix_submission_ids = list()
    # Collect submission ids as the concurrent uploads complete.
    before = time.time()
    for f in futures.as_completed(uploads_wait_for_legit):
        legit_submission_ids.append(f.result())
    logger.info('await legit uploads took %.2fs' % (time.time() - before))
    before = time.time()
    for f in futures.as_completed(uploads_wait_for_mix):
        mix_submission_ids.append(f.result())
    logger.info('await mix uploads took %.2fs' % (time.time() - before))
    # All legit submissions are expected to pass concordance.
    n_passed_concordance = get_concordance(napi, legit_submission_ids)
    if len(n_passed_concordance) != len(clfs):
        logger.error('legit passed concordance %s/%s' %
                     (len(n_passed_concordance), len(clfs)))
        sys.exit(1)
    else:
        logger.info('all legit tests passed!')
    # Mix submissions are expected to FAIL concordance: any that passes is
    # treated as an error. NOTE(review): presumably this is a negative test
    # of the concordance checker itself — confirm with the wrapper's docs.
    n_passed_concordance = get_concordance(napi, mix_submission_ids)
    if len(n_passed_concordance) > 0:
        logger.error('mix passed concordance %s/%s' %
                     (len(n_passed_concordance), len(clfs)))
        sys.exit(1)
    else:
        logger.info('all mix tests passed!')
    sys.exit(0)
def create_csv_score_YES_NO(scaler_, abbr_scaler):
    """Repeatedly evaluate a scaler+QDA pipeline and persist scores as CSV.

    Runs 30 stratified train/test splits of the module-level `public_data` /
    `public_labels`, fits a scaler -> QDA pipeline on each split, and records
    train/test accuracy plus the weighted one-vs-rest ROC AUC. Per-split
    values and their mean/std are written to
    `score_{name}_{abbr_scaler}_YES_NO.csv` under the module-level `folder`.

    Args:
        scaler_: unfitted sklearn transformer used as the pipeline scaler.
        abbr_scaler: short scaler tag used in the output file name.

    NOTE(review): relies on module-level globals `public_data`,
    `public_labels`, `encoder`, `name` and `folder` — confirm they are
    defined before calling. Splits use no fixed random_state, so results
    differ between runs.
    """
    tot_train_score = []
    tot_test_score = []
    tot_weighted_ovr = []
    for i in range(1, 31):
        # Fresh stratified train/test split for each repetition.
        X_train, X_test, y_train, y_test = train_test_split(
            public_data, public_labels, test_size=0.3,
            stratify=public_labels)
        # Encode string labels as integers (refit each split is harmless:
        # the encoder sees the same label set every time).
        train_labels_encoded = encoder.fit_transform(y_train)
        test_labels_encoded = encoder.transform(y_test)
        scaler = scaler_
        clf = QuadraticDiscriminantAnalysis()
        steps = [('scaler', scaler), ('red_dim', None), ('clf', clf)]
        pipeline = Pipeline(steps)
        pipeline.fit(X_train, train_labels_encoded)
        tot_train_score.append(pipeline.score(X_train, train_labels_encoded))
        tot_test_score.append(pipeline.score(X_test, test_labels_encoded))
        # Weighted one-vs-rest ROC AUC from class probabilities.
        y_scores = pipeline.predict_proba(X_test)
        weighted_ovr = roc_auc_score(test_labels_encoded, y_scores,
                                     average='weighted', multi_class='ovr')
        tot_weighted_ovr.append(weighted_ovr)
        # (Removed: an unused `summary = pipeline.named_steps` local and a
        # dead per-split classification_report/DataFrame whose only consumer
        # was commented-out CSV export code.)

    # Mean and std across the 30 repetitions.
    mean_train_score = np.mean(tot_train_score)
    mean_test_score = np.mean(tot_test_score)
    mean_weighted_ovr = np.mean(tot_weighted_ovr)
    std_train_score = np.std(tot_train_score)
    std_test_score = np.std(tot_test_score)
    std_weighted_ovr = np.std(tot_weighted_ovr)

    # pandas can convert a list of lists to a dataframe: each inner list is
    # one row, so transpose afterwards to get one column per metric.
    df = pd.DataFrame([
        tot_train_score, [mean_train_score], [std_train_score],
        tot_test_score, [mean_test_score], [std_test_score],
        tot_weighted_ovr, [mean_weighted_ovr], [std_weighted_ovr],
        [scaler]
    ])
    df = df.transpose()
    fieldnames = [
        'train_accuracy', 'train_accuracy_MEAN', 'train_accuracy_STD',
        'test_accuracy', 'test_accuracy_MEAN', 'test_accuracy_STD',
        'roc_auc_score_weighted_ovr', 'roc_auc_score_weighted_ovr_MEAN',
        'roc_auc_score_weighted_ovr_STD', 'SCALER'
    ]

    # Create the output folder (if needed) and save, without the index and
    # with the metric names as header.
    import os
    outname = f'score_{name}_{str(abbr_scaler)}_YES_NO.csv'
    outdir = f'/home/users/ubaldi/TESI_PA/result_score/Public/{folder}/'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    fullname = os.path.join(outdir, outname)
    df.to_csv(fullname, index=False, header=fieldnames)
"Nearest Neighbors", "Linear SVC", "RBF SVC", "Gaussian Process", "Decision Tree", "Random Forest", "Multilayer Perceptron", "AdaBoost", "Naive Bayes", "QDA", "XGBoost", "Logistic Regression" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), GaussianProcessClassifier(1.0 * RBF(1.0)), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(n_estimators=100, max_features='auto'), MLPClassifier(alpha=1, max_iter=int(1e8)), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis(), XGBClassifier, LogisticRegression() ] selectors = [ reliefF.reliefF, fisher_score.fisher_score, gini_index.gini_index, chi_square.chi_square, JMI.jmi, CIFE.cife, DISR.disr, MIM.mim, CMIM.cmim, ICAP.icap, MRMR.mrmr, MIFS.mifs ] selectornames_short = [ "RELF", "FSCR", "GINI", "CHSQ", "JMI", "CIFE", "DISR", "MIM", "CMIM", "ICAP", "MRMR", "MIFS" ] # class boundary list
def predefined_ops():
    '''Return a dict mapping a short alias to a pre-configured
    (non-default) operator instance, grouped here by pipeline stage.
    '''
    # Missing-value / dtype cleaning variants.
    cleaners = {
        'clean':
        Cleaner(dtype_filter='not_datetime', na1='null', na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent',
                na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    # Categorical / WOE encoders.
    encoders = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),
        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }
    # Re-samplers: controlled under-sampling plus outlier-rejection samplers.
    resamplers = {
        # over_sampling
        # under sampling controlled methods
        'runder': RandomUnderSampler(),
        'nearmiss': NearMiss(version=3),
        'pcart': InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={'method': 'IsolationForest',
                                 'contamination': 0.1}),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor',
                                 'contamination': 0.1}),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope',
                                 'contamination': 0.1}),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={'method': 'OneClassSVM',
                                 'contamination': 0.1}),
    }
    # Feature scaling / normalisation.
    scalers = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm
        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # Feature construction / decomposition / kernel approximation.
    constructors = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # Model-based feature selection.
    model_selectors = {
        'fwoe': SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd': SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),
        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2),
            step=0.1),
        'fRFExgb': RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection.
    univariate_selectors = {
        'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf': GenericUnivariateSelect(f_classif, 'percentile', 25),
    }
    # Importance estimators.
    importance_estimators = {
        "impXGB":
        XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2,
                      n_estimators=50),
        "impRF": ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }
    # Merge the groups in the same precedence order as before:
    # clean, encode, scale, feature construction, model selection,
    # univariate selection, resampling, importance.
    return {
        **cleaners, **encoders, **scalers, **constructors,
        **model_selectors, **univariate_selectors, **resamplers,
        **importance_estimators,
    }
# Confusion matrix + ROC for the tuned classifier and for the ensemble.
plot_CM_and_ROC_curve(clf, x_std, y_train, x_test_std, y_test)

#Ensemble Model
plot_CM_and_ROC_curve(('Ensemble model', eclf), x_std, y_train, x_test_std, y_test)

# (name, estimator) pairs to run K-Fold Cross Validation on.
clfrs = []
clfrs.append(('Logistic Regression', LogisticRegression(random_state=42)))
clfrs.append(('Naive Bayes', GaussianNB()))
#classifiers.append(('KNN', KNeighborsClassifier()))#This one takes a very long time to run!
#classifiers.append(('SVM', SVC(random_state=42, probability=True))) #This one takes a very long time to run!
clfrs.append(('Decision Tree', DecisionTreeClassifier(random_state=42)))
clfrs.append(('Random Forest', RandomForestClassifier(random_state=42)))
clfrs.append(('LDA', LinearDiscriminantAnalysis()))
clfrs.append(('QDA', QuadraticDiscriminantAnalysis()))
clfrs.append(('Ensemble Model', eclf))

# Validate every model with 20-fold CV. This is slow on a large dataset.
for classifier in clfrs:
    clf = classifier[1]
    clf.fit(x_train, y_train)
    training_score = cross_val_score(clf, x_train, y_train, cv=20)
    # BUG FIX: the mean was rounded to 2 decimals *before* scaling to a
    # percentage (`round(m, 2) * 100`), so every score printed as a whole
    # percent with float artifacts (e.g. 71.00000000000001). Scale first,
    # then round.
    print("Classifiers: ", classifier[0], "has a cross validation score of",
          round(training_score.mean() * 100, 2), "% accuracy score")

# Bar-plot inputs: hard-coded cross-validation accuracies per model
# (the original comment said "F1 score", but these are accuracies).
train_accuracies = [0.71, 0.61, 1.00, 1.00, .72, 0.68, 0.86]
models = ['Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'LDA', 'QDA', 'Ensemble']
# NOTE(review): tail of a plotting helper whose `def` lies in an earlier chunk.
splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the single shared LDA covariance ellipse for each class mean."""
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    """Draw one per-class covariance ellipse for QDA."""
    plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')


for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')

# BUG FIX: the implicitly-concatenated title pieces were missing a space,
# rendering "...Quadratic DiscriminantAnalysis".
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
             'Analysis')
plt.show()
def train_models(model_name, epoch=5, batch_size=100):
    """Train the estimator selected by `model_name` on the order-prediction
    features, looping over mini-batches for `epoch` passes, periodically
    logging test-set MSE / CV scores and dumping joblib checkpoints.

    NOTE(review): a *fresh* estimator is constructed and fitted for every
    mini-batch, so each checkpoint reflects only the last batch it saw --
    confirm this is intended rather than incremental (warm-start) training.

    Parameters:
        model_name: short code choosing the estimator ('svc', 'svr',
            'lasso', 'logistic', 'mlpr', 'rf', 'adaboost', 'gbr', 'qda',
            'lda', 'n_n', 'gnb', 'bnb', 'dcc', 'dcr', 'RAN', 'adar';
            anything else falls through to SGDRegressor).
        epoch: number of full passes over the training set.
        batch_size: mini-batch size fed to `minibatches`.
    """
    log.info("current model:{}".format(model_name))
    # Load positive / negative feature tables; impute NaN with the column
    # median and replace +inf with 100.
    pos = pd.read_csv(
        "Order_predicts/datasets/results/train/action_pos_features.csv")
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv(
        "Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    # Combine, shuffle, and split off the label column.
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    del data['id']
    y = data['label']
    del data['label']
    # Standardise the features and persist the scaled matrix for reuse.
    scaler = preprocessing.StandardScaler().fit(data)
    X = scaler.transform(data)
    pd.DataFrame(X).to_csv("Order_predicts/datasets/results/scale_x.csv",
                           index=None)
    # NOTE(review): `data_scaled` re-scales the already-standardised X and
    # is only used for the shape log below; training uses X directly.
    data_scaled = preprocessing.scale(X)
    log.info("data shape: {}".format(data_scaled.shape))
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.1,
                                                        random_state=0)
    log.info("{}, {}".format(X_train.shape, X_test.shape))
    i = 0  # global mini-batch counter across all epochs
    for e in range(epoch):
        for train_x, train_y in minibatches(X_train, y_train,
                                            batch_size=batch_size,
                                            shuffle=False):
            # Build the requested estimator with every default spelled out
            # explicitly. Class weights in svc/logistic/rf favour label 1.
            if model_name == 'svc':
                clf_weights = svm.SVC(C=1.0, kernel='rbf', degree=3,
                                      gamma='auto', coef0=0.0,
                                      shrinking=True, probability=False,
                                      tol=1e-3, cache_size=200,
                                      class_weight={1: 10}, verbose=False,
                                      max_iter=-1,
                                      decision_function_shape='ovr',
                                      random_state=0)
            elif model_name == 'svr':
                clf_weights = svm.SVR(kernel='rbf', degree=3, gamma='auto',
                                      coef0=0.0, tol=1e-3, C=1.0,
                                      epsilon=0.1, shrinking=True,
                                      cache_size=200, verbose=False,
                                      max_iter=-1)
            elif model_name == 'lasso':
                clf_weights = Lasso(alpha=1.0, fit_intercept=True,
                                    normalize=False, precompute=False,
                                    copy_X=True, max_iter=1000, tol=1e-4,
                                    warm_start=False, positive=False,
                                    random_state=0, selection='cyclic')
            elif model_name == 'logistic':
                clf_weights = LogisticRegression(penalty='l2', dual=False,
                                                 tol=1e-4, C=1.0,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 class_weight={
                                                     0: 0.1,
                                                     1: 0.9
                                                 },
                                                 random_state=0,
                                                 solver='newton-cg',
                                                 max_iter=100,
                                                 multi_class='ovr',
                                                 verbose=0,
                                                 warm_start=False, n_jobs=1)
            elif model_name == 'mlpr':
                # learning_rate: {'constant', 'invscaling', 'adaptive'}
                clf_weights = MLPRegressor(hidden_layer_sizes=(100, ),
                                           activation="logistic",
                                           solver='adam', alpha=0.0001,
                                           batch_size='auto',
                                           learning_rate="constant",
                                           learning_rate_init=0.001,
                                           power_t=0.5, max_iter=200,
                                           shuffle=True, random_state=0,
                                           tol=1e-4, verbose=False,
                                           warm_start=False, momentum=0.9,
                                           nesterovs_momentum=True,
                                           early_stopping=False,
                                           validation_fraction=0.1,
                                           beta_1=0.9, beta_2=0.999,
                                           epsilon=1e-8)
            elif model_name == 'rf':
                clf_weights = RandomForestClassifier(
                    n_estimators=20, criterion="entropy", max_depth=None,
                    min_samples_split=2, min_samples_leaf=1,
                    min_weight_fraction_leaf=0., max_features="auto",
                    max_leaf_nodes=None, min_impurity_decrease=0.,
                    min_impurity_split=None, bootstrap=True,
                    oob_score=False, n_jobs=1, random_state=0, verbose=0,
                    warm_start=False, class_weight={
                        0: 0.1,
                        1: 0.9
                    })
            elif model_name == 'adaboost':
                # Base learner: the same weighted random forest as 'rf'.
                base_estimator = RandomForestClassifier(
                    n_estimators=20, criterion="entropy", max_depth=None,
                    min_samples_split=2, min_samples_leaf=1,
                    min_weight_fraction_leaf=0., max_features="auto",
                    max_leaf_nodes=None, min_impurity_decrease=0.,
                    min_impurity_split=None, bootstrap=True,
                    oob_score=False, n_jobs=1, random_state=0, verbose=0,
                    warm_start=False, class_weight={
                        0: 0.1,
                        1: 0.9
                    })
                # NOTE(review): `base_estimator1` is built but never passed
                # to AdaBoostClassifier below -- dead code, confirm intent.
                base_estimator1 = LogisticRegression(penalty='l2',
                                                     dual=False, tol=1e-4,
                                                     C=1.0,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     class_weight={
                                                         0: 0.1,
                                                         1: 0.9
                                                     },
                                                     random_state=0,
                                                     solver='newton-cg',
                                                     max_iter=100,
                                                     multi_class='ovr',
                                                     verbose=0,
                                                     warm_start=False,
                                                     n_jobs=1)
                clf_weights = AdaBoostClassifier(
                    base_estimator=base_estimator, n_estimators=50,
                    learning_rate=0.6666, algorithm='SAMME.R',
                    random_state=0)
            elif model_name == 'gbr':
                clf_weights = GradientBoostingRegressor(
                    loss='ls', learning_rate=0.1, n_estimators=100,
                    subsample=1.0, criterion='friedman_mse',
                    min_samples_split=2, min_samples_leaf=1,
                    min_weight_fraction_leaf=0., max_depth=3,
                    min_impurity_decrease=0., min_impurity_split=None,
                    init=None, random_state=0, max_features=None,
                    alpha=0.9, verbose=0, max_leaf_nodes=None,
                    warm_start=False, presort='auto')
            elif model_name == 'qda':
                clf_weights = QuadraticDiscriminantAnalysis(
                    priors=None, reg_param=0., store_covariance=False,
                    tol=1.0e-4, store_covariances=None)
            elif model_name == 'lda':
                clf_weights = LinearDiscriminantAnalysis(
                    solver='svd', shrinkage=None, priors=None,
                    n_components=None, store_covariance=False, tol=1e-4)
            elif model_name == 'n_n':
                # NOTE(review): NearestNeighbors is unsupervised -- its
                # fit() ignores train_y and it has no score(); the logging
                # branches below would fail for this choice. Verify.
                clf_weights = NearestNeighbors(n_neighbors=5, radius=1.0,
                                               algorithm='auto',
                                               leaf_size=30,
                                               metric='minkowski', p=2,
                                               metric_params=None, n_jobs=1)
            elif model_name == 'gnb':
                clf_weights = GaussianNB(priors=None)
            elif model_name == 'bnb':
                clf_weights = BernoulliNB(alpha=1.0, binarize=.0,
                                          fit_prior=True, class_prior=None)
            elif model_name == 'dcc':
                clf_weights = DecisionTreeClassifier(
                    criterion="gini", splitter="best", max_depth=None,
                    min_samples_split=2, min_samples_leaf=1,
                    min_weight_fraction_leaf=0., max_features=None,
                    random_state=0, max_leaf_nodes=None,
                    min_impurity_decrease=0., min_impurity_split=None,
                    class_weight=None, presort=False)
            elif model_name == 'dcr':
                clf_weights = DecisionTreeRegressor(
                    criterion="mse", splitter="best", max_depth=None,
                    min_samples_split=2, min_samples_leaf=1,
                    min_weight_fraction_leaf=0., max_features=None,
                    random_state=0, max_leaf_nodes=None,
                    min_impurity_decrease=0., min_impurity_split=None,
                    presort=False)
            elif model_name == 'RAN':
                base_estimator = LinearRegression()
                clf_weights = RANSACRegressor(base_estimator=base_estimator,
                                              min_samples=None,
                                              residual_threshold=None,
                                              is_data_valid=None,
                                              is_model_valid=None,
                                              max_trials=100,
                                              max_skips=np.inf,
                                              stop_n_inliers=np.inf,
                                              stop_score=np.inf,
                                              stop_probability=0.99,
                                              residual_metric=None,
                                              loss='absolute_loss',
                                              random_state=0)
            elif model_name == 'adar':
                clf_weights = AdaBoostRegressor(base_estimator=None,
                                                n_estimators=50,
                                                learning_rate=1.,
                                                loss='linear',
                                                random_state=None)
            else:  # model_name == 'SGDR':
                clf_weights = SGDRegressor(loss="squared_loss",
                                           penalty="l2", alpha=0.0001,
                                           l1_ratio=0.15,
                                           fit_intercept=True,
                                           max_iter=None, tol=None,
                                           shuffle=True, verbose=0,
                                           epsilon=0.1, random_state=None,
                                           learning_rate="invscaling",
                                           eta0=0.01, power_t=0.25,
                                           warm_start=False, average=False,
                                           n_iter=None)
            # Fit the freshly-built estimator on this mini-batch only.
            clf_weights.fit(train_x, train_y)
            i += 1
            # Every 20 batches: log held-out MSE and mean CV score, then
            # checkpoint the model to disk.
            if i % 20 == 0:
                mse = mean_squared_error(y_test,
                                         clf_weights.predict(X_test))
                log.info("均方误差:{}".format(mse))
                avgscores = cross_val_score(clf_weights, train_x,
                                            train_y).mean()
                log.info("{}/{} 训练集得分平均值: {}".format(e, i, avgscores))
                model_path = os.path.join(
                    "Order_predicts/datasets/results/models",
                    '{}'.format(model_name))
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                joblib.dump(
                    clf_weights,
                    os.path.join(model_path, "{}_{}.model".format(e, i)))
                log.info(" Save ")
            # Every 50 batches: log the estimator's score on the test set.
            if i % 50 == 0:
                scores = clf_weights.score(X_test, y_test)
                log.info("验证得分: {}".format(scores))
# NOTE(review): the lines up to the second `break` are the tail of a
# nested cross-validation loop whose headers lie in an earlier chunk;
# their exact nesting cannot be confirmed from this view.
print(confusion_matrix(y_test_folds1, y_pred1))
i = i + 1
if i == 3:
    break
break
# =============================================================================
# [[42538 462]
#  [ 2369 255]]
# not good enough yet
# =============================================================================

#### QuadraticDiscriminantAnalysis
qda_clf = QuadraticDiscriminantAnalysis()
# NOTE(review): `random_state` has no effect on StratifiedKFold unless
# shuffle=True, and scikit-learn >= 0.24 raises a ValueError for this
# combination -- confirm the intended sklearn version.
skfolds = StratifiedKFold(n_splits=20, random_state=77)
skfolds1 = StratifiedKFold(n_splits=3, random_state=77)

# Outer CV: 20 stratified folds over the training set.
for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(qda_clf)  # fresh, unfitted copy per fold
    x_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    x_test_folds = x_train[test_index]
    y_test_folds = y_train[test_index]
    # Inner CV: 3 stratified folds over the outer fold's held-out part.
    for train_index1, test_index1 in skfolds1.split(x_test_folds,
                                                    y_test_folds):
        clone_clf = clone(qda_clf)
        x_train_folds1 = x_test_folds[train_index1]
        y_train_folds1 = y_test_folds[train_index1]
        x_test_folds1 = x_test_folds[test_index1]
clf.fit(X_train, Y_train) print clf.best_params_ #for score in clf.grid_scores_: # print score # AdaBoost print print "AdaBoost" rate = clf.best_params_['learning_rate'] classifier = AdaBoostClassifier(learning_rate=rate) classifier.fit(X_train, Y_train) acc = classifier.score(X_test, Y_test) print "Accuracy:", acc print print "Compare models" for classifier in [ DecisionTreeClassifier(), LogisticRegression(), LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), KNeighborsClassifier(), AdaBoostClassifier(), BaggingClassifier(), RandomForestClassifier(), ]: print str(classifier)[:str(classifier).find('(')], classifier.fit(X_train, Y_train) pred = classifier.predict(X_test) acc = metrics.accuracy_score(Y_test, pred) print "Accuracy:", acc
# Precision/recall at the remaining two thresholds, then report all three.
recall_2 = recall(predict_2, t)
recall_3 = recall(predict_3, t)
print('P(C = 1|x) = 0.05, precision: {0}, recall: {1}'.format(
    precision_1, recall_1))
print('P(C = 1|x) = 0.5, precision: {0}, recall: {1}'.format(
    precision_2, recall_2))
print('P(C = 1|x) = 0.6, precision: {0}, recall: {1}'.format(
    precision_3, recall_3))

#4
#a
# Generate an anisotropic two-class dataset and fit QDA to it.
X, t = gen_data(mu0=(1, 1), mu1=(2, 2), cov0=0, cov1=-0.9, N0=1000, N1=500)
X_repeat, t_repeat = X, t  ## used in 4e
clf = QuadraticDiscriminantAnalysis()
model = clf.fit(X, t)
accuracy4a = clf.score(X, t)
print('accuracy: ' + str(accuracy4a))

# Scatter the data (red = class 0, blue = class 1) over the QDA contours.
fig_4a = plt.figure()
fig_4a.suptitle('Question 4(a): Decision boundary and contours')
X, y = X.T  # unpack the two feature columns into separate 1-D arrays
colors = ['red' if label == 0 else 'blue' for label in t]
plt.scatter(X, y, c=colors, s=2)
bonnerlib2.dfContour(clf)