def find_possible_coalitions_generative(XY_train, XY_test, prob_matrix):
    """Grow candidate coalitions greedily from every seed party.

    Starting from each party, repeatedly adds the outside party with the
    strongest mutual confusion (per prob_matrix) until every party is in the
    coalition; each intermediate coalition covering >= 51% of the test samples
    is recorded.  Diagnostics (variance / opposition distance) are printed,
    but the returned coalition is a hand-picked constant ("Picked manually").

    NOTE(review): indexing `parties[party2]` and `prob_matrix[party1][party2]`
    assumes labels are consecutive ints 0..n-1 — TODO confirm.
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_test, y_test = split_label_from_data(XY_test)
    labels = set(y_train)
    possible_coalitions = set()
    for label in labels:
        coalition = [label]
        coalition_size = 0
        # Keep growing until the coalition holds every party; the size check in
        # the body records each qualifying intermediate coalition along the way.
        while coalition_size < 0.51 or len(coalition) < len(labels):
            parties = np.zeros(len(labels))
            for party1 in coalition:
                for party2 in labels - set(coalition):
                    # Mutual confusion score between an inside and an outside party.
                    parties[party2] += prob_matrix[party1][party2] + prob_matrix[party2][party1]
            coalition.append(np.argmax(parties))
            # Fraction of test samples whose label is in the coalition.
            coalition_size = X_test[y_test.isin(coalition)].shape[0] / X_test.shape[0]
            if coalition_size >= 0.51:
                possible_coalitions.add(tuple(sorted(coalition)))
    coalitions_variance = calculate_variance(XY_train, possible_coalitions)
    coalitions_opposition_distance = calculate_oppo_coali_dist(XY_train, possible_coalitions)
    # Print the 10 lowest-variance and 10 most-distant candidates as diagnostics.
    sorted_variance = sorted(coalitions_variance.items(), key=lambda x: x[1])
    print(f'Sum variances of each coalition : {sorted_variance[:10]}')
    sorted_dists = sorted(coalitions_opposition_distance.items(), key=lambda x: x[1], reverse=True)
    print(f'Dists from opposition of each coalition : {sorted_dists[:10]}')
    # Picked manually
    best_coalition = (3, 4, 5, 6, 8, 9, 11, 12)
    return best_coalition
def find_best_coalition_generative(XY_train, XY_val, XY_test_pred):
    """Pick the better of two generative classifiers by 5-fold CV, build a
    party-confusion probability matrix with it, and derive the coalition.

    Returns the chosen coalition as a sorted list of party indices.
    """
    candidate_models = [GaussianNB(), LinearDiscriminantAnalysis()]
    XY_trainVal = pd.concat([XY_train, XY_val])
    X_trainVal, y_trainVal = split_label_from_data(XY_trainVal)
    X_train, y_train = split_label_from_data(XY_train)
    X_val, y_val = split_label_from_data(XY_val)

    # Cross-validated model selection over the combined train+val split.
    best_model, best_model_score = None, 0
    for candidate in candidate_models:
        cv_score = np.average(cross_val_score(candidate, X_trainVal, y_trainVal, cv=5))
        if cv_score > best_model_score:
            best_model, best_model_score = candidate, cv_score
    print(f'best model: = {best_model}, with score: {best_model_score}')

    prob_matrix = calc_prob_matrix(XY_train, XY_trainVal, best_model)
    coalition = find_possible_coalitions_generative(XY_trainVal, XY_test_pred, prob_matrix)
    print(f'Generative model best Coalition : {sorted(list(coalition))}')
    return sorted(list(coalition))
def find_possible_coalitions(XY_train, XY_val, clusters_params):
    """Collect candidate coalitions from clustering models.

    A party joins a cluster's coalition when more than 85% of its validation
    samples fall into that cluster; the coalition is kept when its parties
    together cover at least 51% of the validation set.
    Returns a set of sorted tuples of party labels.
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_val, y_val = split_label_from_data(XY_val)
    n_val_sample = len(y_val)
    possible_coalitions = set()
    for model, params in clusters_params.items():
        clusterer = model(**params)
        clusterer.fit(X_train)
        clusters_pred = clusterer.predict(X_val)
        for cluster_id in set(clusters_pred):
            cluster_labels = y_val[clusters_pred == cluster_id]
            members = []
            members_size = 0
            for party in set(cluster_labels):
                party_total = np.sum(y_val == party)
                party_in_cluster = np.sum(cluster_labels == party)
                # Party is "concentrated" in this cluster -> coalition member.
                if party_in_cluster / party_total > 0.85:
                    members.append(party)
                    members_size += party_total
            if members_size / n_val_sample >= 0.51:
                possible_coalitions.add(tuple(sorted(members)))
    return possible_coalitions
def main():
    """End-to-end coalition pipeline: predict test-set votes, search for a
    coalition via clustering and via a generative model, then analyse the
    features that drive the results."""
    XY_train = pd.read_csv('train_transformed.csv', index_col=0, header=0)
    XY_val = pd.read_csv('val_transformed.csv', index_col=0, header=0)
    XY_test = pd.read_csv('test_transformed.csv', index_col=0, header=0)
    X_train, y_train = split_label_from_data(XY_train)
    X_test, y_test = split_label_from_data(XY_test)

    # Predicting model
    predictor = RandomForestClassifier(n_estimators=220, min_samples_split=10)
    predictor.fit(X_train, y_train)
    y_test_pred = predictor.predict(X_test)
    XY_test_pred = insert_label_to_data(X_test, y_test_pred)

    # Finding best coalition using clustering models
    clustring_coalition = find_best_coalition_cluster(XY_train, XY_val, XY_test_pred)
    print(f'Cluster coalition is : {[classes[i] for i in clustring_coalition]}')

    # Finding best coalition using a generative model
    generative_coalition = find_best_coalition_generative(XY_train, XY_val, XY_test_pred)
    print(f'Generative coalition is : {[classes[i] for i in generative_coalition]}')

    # Feature analysis on the combined train+val data.
    leading_features_for_each_party(pd.concat([XY_train, XY_val]))
    find_group_factors(pd.concat([XY_train, XY_val]), XY_test_pred, clustring_coalition)
    find_factors_to_change_results(XY_train, XY_val, XY_test_pred)
def train_and_evaluate(classifiers_params_dict: dict,
                       XY_train: pd.DataFrame,
                       XY_val: pd.DataFrame,
                       score_function=accuracy_of_clf):
    """Fit each classifier class with its params on the train split and score
    it on the validation split; returns {classifier_class: score}.

    NOTE(review): another `train_and_evaluate` defined later in this module
    shadows this one at import time — confirm which is intended.
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_val, y_val = split_label_from_data(XY_val)
    scores = dict()
    for classifier, params in classifiers_params_dict.items():
        fitted = classifier(**params)
        fitted.fit(X_train, y_train)
        scores[classifier] = score_function(fitted, X_val, y_val)
    return scores
def leading_features_for_each_party(df):
    """For every party, train a one-vs-all ExtraTrees classifier and report
    (print + horizontal-bar plot) which features best separate that party
    from all the others; the top feature is highlighted in blue."""
    X, y = split_label_from_data(df)
    features = X.columns
    for current_class in range(len(set(y))):
        binary_target = one_vs_all(y, current_class)
        forest = ExtraTreesClassifier(n_estimators=50)
        forest.fit(X, binary_target)
        importances = forest.feature_importances_
        ranking = np.argsort(importances)[::-1]

        # Print the feature ranking
        print("Feature ranking:")
        for f in range(X.shape[1]):
            print("%d. feature %d (%f)" % (f + 1, ranking[f], importances[ranking[f]]))

        # Highlight the single most important feature.
        colors = ['r'] * len(features)
        colors[ranking[0]] = 'b'

        # Plot the feature importance of the forest
        plt.figure()
        plt.title(f"Feature importance of {classes[current_class]} vs all")
        plt.barh(range(X.shape[1]), importances, orientation='horizontal',
                 color=colors, align="center", tick_label=features)
        plt.show()
def find_group_factors(XY_train, XY_test, coalition):
    """Characterise what separates the coalition from the opposition.

    Trains a decision tree on a binary relabeling (1 = coalition member,
    -1 = opposition), exports it, prints the coalition's size / variance /
    opposition distance on the test set, then repeats after a few hand-picked
    feature manipulations to see how they shift the coalition.

    Fix vs original: the label Series returned by split_label_from_data was
    relabeled in place (`y_train[mask] = -1`), which risks writing back into
    the caller's DataFrame; the relabeling is now done on a fresh Series.
    """
    X_train, y_train = split_label_from_data(XY_train)
    # Non-mutating binary relabel: coalition -> 1, everyone else -> -1.
    y_train = pd.Series(np.where(y_train.isin(coalition), 1, -1),
                        index=y_train.index)
    tree = DecisionTreeClassifier(random_state=0, min_samples_split=3)
    tree.fit(X_train, y_train)
    export_graph_tree(tree, ['Opposition', 'Coalition'],
                      'Coallition-Opposition-tree')

    # Diagnostics (computed but only printed when uncommented).
    coalition_variances = X_train[y_train == 1].var(axis=0).sort_values(
        ascending=False)
    # print(coalition_variances)
    coalition_dists = abs(X_train[y_train == 1].mean(axis=0)
                          - X_train[y_train == -1].mean(axis=0)).sort_values()
    # print(coalition_dists)

    X_test, y_test = split_label_from_data(XY_test)
    y_pred = tree.predict(X_test)
    coalition_size = (y_pred == 1).sum() / len(y_pred)
    print(f'Original coalition size is : {coalition_size}')
    coalition_variance = calculate_variance(XY_test, [coalition])[coalition]
    coalition_dist = calculate_oppo_coali_dist(XY_test, [coalition])[coalition]
    print(
        f'Original coalition variance : {coalition_variance}\nOriginal coalition opposition dist : {coalition_dist}'
    )

    # Hand-picked manipulations: scale two features, then shift one.
    factors = {
        'Political_interest_Total_Score': (1.5, mul),
        'Number_of_differnt_parties_voted_for': (0.5, mul)
    }
    XY_test_transformed = change_set_by_factors(XY_test, factors)
    factors = {'Political_interest_Total_Score': (-1.5, add)}
    XY_test_transformed = change_set_by_factors(XY_test_transformed, factors)

    X_test_new, _ = split_label_from_data(XY_test_transformed)
    y_pred = tree.predict(X_test_new)
    coalition_size = (y_pred == 1).sum() / len(y_pred)
    print(f'New coalition size is : {coalition_size}')
    coalition_variance = calculate_variance(XY_test_transformed,
                                            [coalition])[coalition]
    coalition_dist = calculate_oppo_coali_dist(XY_test_transformed,
                                               [coalition])[coalition]
    print(
        f'New coalition variance : {coalition_variance}\nNew coalition opposition dist : {coalition_dist}'
    )
def calculate_variance(df, possible_coalitions):
    """Sum of per-feature variances over each coalition's samples.

    Returns {coalition: total variance} for every candidate coalition.
    """
    X, y = split_label_from_data(df)
    return {
        coalition: X.loc[y.isin(coalition), :].var(axis=0).sum()
        for coalition in possible_coalitions
    }
def calc_prob_matrix(XY_train, XY_trainVal, model):
    """Fit `model` on the train split and build a |labels| x |labels| matrix
    whose entry [i][j] is the mean predicted probability of label j over the
    train+val samples whose true label is i.

    NOTE(review): indexing `prob_matrix[label]` assumes labels are consecutive
    ints 0..n-1 — confirm against the label encoding.
    """
    X_fit, y_fit = split_label_from_data(XY_train)
    model.fit(X_fit, y_fit)

    X_all, y_all = split_label_from_data(XY_trainVal)
    labels = set(y_all)
    samples_by_label = {label: X_all[y_all == label] for label in labels}

    prob_matrix = np.zeros((len(labels), len(labels)))
    for label1 in labels:
        pred_prob = model.predict_proba(samples_by_label[label1])
        for label2 in labels:
            prob_matrix[label1][label2] = sum(pred_prob[:, label2])
        # Normalise the row by the number of samples of label1 -> mean prob.
        prob_matrix[label1] = prob_matrix[label1] / len(samples_by_label[label1])
    return prob_matrix
def find_best_cluster_model_params(XY_train, classifiers_params_dict):
    """Pick the best hyper-parameters for each clustering model.

    For every model class, grid-searches one or two hyper-parameters (plus an
    optional fixed 'random_state') using 3-fold CV scored by
    `clustering_score`, and returns {model_class: {param: best_value}}.

    Fix vs original: the per-model params dict is copied before 'random_state'
    is popped, so the caller's search grid is no longer mutated.
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    new_params_dict = dict()
    for classifier, params in classifiers_params_dict.items():
        # Shallow copy: popping 'random_state' must not mutate caller's dict.
        params = dict(params)
        new_params_dict[classifier] = dict()
        random_state = False
        if 'random_state' in params.keys():
            random_state = params['random_state']
            params.pop('random_state')
        if len(params) == 2:
            # Exhaustive grid over the two parameter value lists.
            param_name_1 = list(params.keys())[0]
            values_1 = params[param_name_1]
            param_name_2 = list(params.keys())[1]
            values_2 = params[param_name_2]
            best_score = -1
            best_values = None
            for value_1 in values_1:
                for value_2 in values_2:
                    params_dict = {param_name_1: value_1,
                                   param_name_2: value_2}
                    if random_state:
                        params_dict['random_state'] = random_state
                    # Manual K-fold: sum validation scores over the folds.
                    score = 0
                    for k, (train_index, val_index) in enumerate(kf.split(X_train)):
                        clf = classifier(**params_dict)
                        clf.fit(X_train[train_index], y_train[train_index])
                        # train_score = clustering_score(clf, X_train[train_index], y_train[train_index])
                        score += clustering_score(clf, X_train[val_index],
                                                  y_train[val_index])
                    if score > best_score:
                        best_values = {param_name_1: value_1,
                                       param_name_2: value_2}
                        best_score = score
        elif len(params) == 1:
            param_name_1 = list(params.keys())[0]
            values_1 = params[param_name_1]
            best_score = -1
            best_values = None
            for value_1 in values_1:
                params_dict = {param_name_1: value_1}
                if random_state:
                    params_dict['random_state'] = random_state
                clf = classifier(**params_dict)
                score = np.mean(cross_val_score(clf, X_train, y_train,
                                                scoring=clustering_score,
                                                cv=n_splits))
                if score > best_score:
                    best_values = {param_name_1: value_1}
                    best_score = score
        else:
            # No tunable params (beyond random_state): nothing to record.
            continue
        for param_name in params.keys():
            new_params_dict[classifier][param_name] = best_values[param_name]
    return new_params_dict
def calculate_oppo_coali_dist(df, possible_coalitions):
    """L1 distance between the coalition's and the opposition's feature-mean
    vectors, for every candidate coalition.

    Returns {coalition: distance}.
    """
    X, y = split_label_from_data(df)
    coalitions_dists = dict()
    for coalition in possible_coalitions:
        in_coalition = y.isin(coalition).to_numpy()
        mean_gap = (X.loc[in_coalition, :].mean(axis=0)
                    - X.loc[~in_coalition, :].mean(axis=0))
        coalitions_dists[coalition] = np.linalg.norm(mean_gap, 1)
    return coalitions_dists
def train_and_evaluate(classifiers_params_dict: dict, XY_train: pd.DataFrame, XY_val: pd.DataFrame, score_function=accuracy_of_clf):
    """Fit each classifier on the train split, score it on the validation
    split, then additionally score hard- and soft-voting ensembles built from
    all the fitted classifiers.

    Returns a dict mapping classifier class (or 'VotingClassifier.hard' /
    'VotingClassifier.soft' strings) to its validation score.

    NOTE(review): this redefinition shadows an earlier `train_and_evaluate`
    in this module — confirm which is intended.
    NOTE(review): soft voting presumably requires every estimator to expose
    predict_proba — verify the classifier list (e.g. SVC needs
    probability=True).
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_val, y_val = split_label_from_data(XY_val)
    results = dict()
    clfs = []
    for classifier, params in classifiers_params_dict.items():
        clf = classifier(**params)
        # Collect (name, estimator) pairs for the voting ensembles below.
        clfs.append((clf.__class__.__name__, clf))
        clf.fit(X_train, y_train)
        score = score_function(clf, X_val, y_val)
        results[classifier] = score
    hardVoting = VotingClassifier(estimators=clfs, voting='hard')
    softVoting = VotingClassifier(estimators=clfs, voting='soft')
    for clf in [hardVoting, softVoting]:
        clf.fit(X_train, y_train)
        score = score_function(clf, X_val, y_val)
        # Keyed by e.g. 'VotingClassifier.hard' to distinguish the two modes.
        results[str(clf.__class__.__name__ + '.' + clf.voting)] = score
    return results
if accuracy > best_accuracy: best_accuracy = accuracy best_feature = [feature] elif accuracy == best_accuracy: # if there are multiple features with same accuracy we save them best_feature.append(feature) # In case no improvement best_feature will be empty if len(best_feature): chosen_feature = np.random.choice(best_feature) features.append(chosen_feature) features_left.remove(chosen_feature) chosen_features = [features_name[index] for index in sorted(features)] return chosen_features if __name__ == '__main__': XY_train = pd.read_csv('train_transformed.csv', index_col=0, header=0) XY_val = pd.read_csv('val_transformed.csv', index_col=0, header=0) XY_test = pd.read_csv('test_transformed.csv', index_col=0, header=0) X_train, y_train = split_label_from_data(XY_train) X_val, y_val = split_label_from_data(XY_val) X_test, y_test = split_label_from_data(XY_test) knn = KNeighborsClassifier() forest = RandomForestClassifier() selected_features = SFS(knn, X_train, y_train, X_val, y_val) print(f'KNN SFS - features : {selected_features}') selected_features = SFS(forest, X_train, y_train, X_val, y_val) print(f'Forest SFS - features : {selected_features}')
def train_best_classifier(XY_train: pd.DataFrame, classifier, params):
    """Instantiate `classifier` with `params`, fit it on the full training
    data, and return the fitted estimator."""
    features, labels = split_label_from_data(XY_train)
    model = classifier(**params)
    model.fit(features, labels)
    return model
def find_factors_to_change_results(XY_train, XY_val, XY_test):
    """Fit a decision tree on train+val and export its graph, to inspect the
    features that drive the final prediction.

    NOTE(review): the test split is computed but unused here — confirm intent.
    """
    x_train, y_train = split_label_from_data(pd.concat([XY_train, XY_val]))
    x_test, y_test = split_label_from_data(XY_test)
    tree = DecisionTreeClassifier(random_state=0, min_samples_split=3)
    tree.fit(x_train, y_train)
    export_graph_tree(tree, classes, 'fourth_prediction-tree')
def find_best_params_CV(XY_train: pd.DataFrame, classifiers_params_dict: dict, scoring='accuracy'):
    """Pick the best hyper-parameters for each classifier via cross-validation.

    For every classifier class, grid-searches one or two hyper-parameters
    (plus an optional fixed 'random_state') with 5-fold `cross_val_score`
    under `scoring`, and returns {classifier_class: {param: best_value}}.

    Fix vs original: the per-classifier params dict is copied before
    'random_state' is popped, so the caller's search grid is no longer
    mutated.
    """
    X_train, y_train = split_label_from_data(XY_train)
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    n_splits = 5
    new_params_dict = dict()
    for classifier, params in classifiers_params_dict.items():
        # Shallow copy: popping 'random_state' must not mutate caller's dict.
        params = dict(params)
        new_params_dict[classifier] = dict()
        random_state = False
        if 'random_state' in params.keys():
            random_state = params['random_state']
            params.pop('random_state')
        if len(params) == 2:
            # Exhaustive grid over the two parameter value lists.
            param_name_1 = list(params.keys())[0]
            values_1 = params[param_name_1]
            param_name_2 = list(params.keys())[1]
            values_2 = params[param_name_2]
            best_score = -1
            best_values = None
            for value_1 in values_1:
                for value_2 in values_2:
                    params_dict = {
                        param_name_1: value_1,
                        param_name_2: value_2
                    }
                    if random_state:
                        params_dict['random_state'] = random_state
                    clf = classifier(**params_dict)
                    score = np.mean(
                        cross_val_score(clf, X_train, y_train,
                                        scoring=scoring, cv=n_splits))
                    if score > best_score:
                        best_values = {
                            param_name_1: value_1,
                            param_name_2: value_2
                        }
                        best_score = score
        elif len(params) == 1:
            param_name_1 = list(params.keys())[0]
            values_1 = params[param_name_1]
            best_score = -1
            best_values = None
            for value_1 in values_1:
                params_dict = {param_name_1: value_1}
                if random_state:
                    params_dict['random_state'] = random_state
                clf = classifier(**params_dict)
                score = np.mean(
                    cross_val_score(clf, X_train, y_train,
                                    scoring=scoring, cv=n_splits))
                if score > best_score:
                    best_values = {param_name_1: value_1}
                    best_score = score
        else:
            # No tunable params (beyond random_state): nothing to record.
            continue
        for param_name in params.keys():
            new_params_dict[classifier][param_name] = best_values[param_name]
    return new_params_dict
def main():
    """Model-selection + prediction pipeline: pick classifier hyper-params,
    train the chosen model on train+val, predict the test set, and report
    accuracy, vote division, plots, transportation lists and a decision tree.
    """
    # This automate is not related to the bonus, the bonus is implemented in different file
    automate_model_selection = False
    find_best_params = False
    XY_train = pd.read_csv('train_transformed.csv', index_col=0, header=0)
    XY_val = pd.read_csv('val_transformed.csv', index_col=0, header=0)
    XY_test = pd.read_csv('test_transformed.csv', index_col=0, header=0)
    if find_best_params:
        # Full grid search over each classifier's hyper-parameters.
        classifiers_params_dict = {
            RandomForestClassifier: {
                'n_estimators': list(range(60, 400, 40)),
                'min_samples_split': list(range(2, 20, 3)),
                'random_state': 2
            },
            KNeighborsClassifier: {
                'n_neighbors': list(range(1, 10))
            },
            SVC: {
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
            },
            DecisionTreeClassifier: {
                'min_samples_split': list(range(2, 20, 2))
            },
            GaussianNB: {}
        }
        classifiers_params_dict = find_best_params_CV(XY_train, classifiers_params_dict)
    else:
        # Previously-found best params, hard-coded to skip the search.
        classifiers_params_dict = {
            RandomForestClassifier: {
                'n_estimators': 220,
                'min_samples_split': 10
            },
            KNeighborsClassifier: {
                'n_neighbors': 3
            },
            SVC: {
                'kernel': 'rbf'
            },
            DecisionTreeClassifier: {
                'min_samples_split': 4
            },
            GaussianNB: {}
        }
    print(f'Classifiers best params : \n{classifiers_params_dict}')
    results = train_and_evaluate(classifiers_params_dict, XY_train, XY_val)
    # used to pick best model manually
    best_clf = pick_best_classifier(
        results) if automate_model_selection else RandomForestClassifier
    # Retrain the chosen classifier on train+val combined.
    XY_train_new = pd.concat([XY_train, XY_val])
    clf = train_best_classifier(XY_train_new, best_clf,
                                classifiers_params_dict[best_clf])
    X_train_new, y_train_new = split_label_from_data(XY_train_new)
    # First prediction
    X_test, y_test = split_label_from_data(XY_test)
    y_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train_new)
    accuracy = accuracy_score(y_test, y_pred)
    train_accuracy = accuracy_score(y_train_new, y_train_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(clf, X_test, y_test, display_labels=classes,
                          xticks_rotation='vertical', values_format='.3g')
    plt.show()
    print(f'Test Error : {1 - accuracy}, Train Error : {1 - train_accuracy}')
    print(f'Confusion Matrix : \n{conf_mat}')
    # Raw vote counts per party name on the test predictions.
    division_of_voters = {
        classes[party]: list(y_pred).count(party)
        for party in set(y_pred)
    }
    party_with_majority = max(division_of_voters, key=division_of_voters.get)
    print(
        f'Party that will win majority of votes (in relation to Test set) : {party_with_majority}'
    )
    n_voters = len(X_test)
    # Convert counts to percentages (3 decimal places) in place.
    division_of_voters.update((key, round(value * 100 / n_voters, 3))
                              for key, value in division_of_voters.items())
    print(f'Division of voters : \n{division_of_voters}')
    bins = np.linspace(0, 12, 26)
    # Map numeric labels to party names for the plots / CSV.
    y_test = [classes[y] for y in y_test]
    y_pred = [classes[y] for y in y_pred]
    pd.DataFrame(y_pred).to_csv('y_pred_of_test_set.csv')
    plt.hist([y_test, y_pred], bins, label=['Real votes', 'Prediction'])
    plt.xticks(range(0, 13, 1), rotation='vertical')
    plt.legend(loc='upper right')
    plt.title('Prediction - Real comparison')
    plt.show()
    # Pie chart of vote shares; one slice pulled out for emphasis.
    explode = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0)
    plt.pie(division_of_voters.values(), explode=explode,
            labels=division_of_voters.keys(), autopct='%1.1f%%', shadow=True,
            startangle=0)
    plt.title('Division of voters')
    plt.axis('equal')
    plt.show()
    # Voters whose top predicted-probability exceeds the threshold.
    threshold = 0.6
    indices_list = createTransportationLists(clf, X_test, threshold)
    pd.DataFrame(indices_list).set_axis(classes).to_csv('indices_list.csv')
    decision_tree = DecisionTreeClassifier(min_samples_split=6)
    decision_tree.fit(X_train_new, y_train_new)
    export_graph_tree(decision_tree)
    check_factors(clf, X_test)