def user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads):
    """
    Classify the users that have no annotation, using the annotated users as training data.

    Inputs:  - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - user_label_matrix: A user-to-label matrix in scipy sparse matrix format.
             - annotated_user_ids: The annotated users; the values are used directly as row indices into
                                   features and user_label_matrix.
             - node_to_id: A node to Twitter id map as a python dictionary. Only its length is used here,
                           as the total number of users.
             - number_of_threads: Forwarded unchanged to model_fit and meta_model_fit.

    Output:  - prediction: The output of the classification in scipy sparse matrix format.
    """
    # Every row index that is not annotated belongs to the set that must be classified.
    every_user_id = np.arange(len(node_to_id), dtype=int)
    non_annotated_user_ids = np.setdiff1d(every_user_id, annotated_user_ids)

    features = normalize_columns(features)

    # Split the rows into the annotated (train) and non-annotated (test) parts.
    train_features = features[annotated_user_ids, :]
    test_features = features[non_annotated_user_ids, :]
    train_labels = user_label_matrix[annotated_user_ids, :]

    # The weighting is computed from the training rows and labels and returned for both row sets.
    train_features, test_features = chi2_psnr_community_weighting(train_features,
                                                                  test_features,
                                                                  train_labels)
    print("Performed community weighting.")

    # Both fitting helpers receive the same hyper-parameters.
    shared_fit_options = {"svm_hardness": 10.0,
                          "fit_intercept": True,
                          "number_of_threads": number_of_threads}
    model = model_fit(train_features,
                      train_labels,
                      classifier_type="LogisticRegression",
                      **shared_fit_options)
    meta_model = meta_model_fit(train_features,
                                train_labels,
                                regressor_type="LinearSVR",
                                **shared_fit_options)
    print("Classification model has been trained.")

    # Score the annotated users and classify the remaining ones.
    y_train_pred_proba = weigh_users(train_features,
                                     model,
                                     classifier_type="LogisticRegression")
    y_test_pred = classify_users(test_features,
                                 model,
                                 classifier_type="LogisticRegression",
                                 meta_model=meta_model,
                                 upper_cutoff=20)

    return form_prediction_matrix(y_train_pred_proba,
                                  y_test_pred,
                                  user_label_matrix,
                                  annotated_user_ids,
                                  non_annotated_user_ids)
def user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads):
    """
    Classify the users that have no annotation and merge the result with the known annotations.

    Inputs:  - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - user_label_matrix: A user-to-label matrix in scipy sparse matrix format.
             - annotated_user_ids: The annotated users; the values are used directly as row indices into
                                   features and user_label_matrix.
             - node_to_id: A node to Twitter id map as a python dictionary. Only its length is used here,
                           as the total number of users.
             - number_of_threads: Forwarded unchanged to model_fit.

    Output:  - prediction: The output of the classification in scipy sparse matrix format. Rows of annotated
                           users are copied from user_label_matrix; the other rows hold the classifier output.
    """
    # Every row index that is not annotated belongs to the set that must be classified.
    every_user_id = np.arange(len(node_to_id), dtype=int)
    non_annotated_user_ids = np.setdiff1d(every_user_id, annotated_user_ids)

    features = normalize_columns(features)

    # Split the rows into the annotated (train) and non-annotated (test) parts.
    train_features = features[annotated_user_ids, :]
    test_features = features[non_annotated_user_ids, :]
    train_labels = user_label_matrix[annotated_user_ids, :]

    # The weighting is computed from the training rows and labels and returned for both row sets.
    train_features, test_features = chi2_psnr_community_weighting(train_features,
                                                                  test_features,
                                                                  train_labels)
    print("Performed community weighting.")

    # NOTE: earlier revisions carried classifier_type="RandomForest" as a commented-out alternative
    # for both the fitting and the classification call.
    model = model_fit(train_features,
                      train_labels,
                      svm_hardness=10.0,
                      fit_intercept=True,
                      number_of_threads=number_of_threads,
                      classifier_type="LogisticRegression")
    print("Classification model has been trained.")

    # Start from an all-zero matrix with the same shape as the label matrix.
    prediction = spsp.csr_matrix(user_label_matrix.shape, dtype=np.float64)

    predicted_labels = classify_users(test_features,
                                      model,
                                      classifier_type="LogisticRegression")
    print("Classification on new data has been performed.")

    # Predicted rows first, then the known annotations for the training rows.
    prediction[non_annotated_user_ids, :] = spsp.csr_matrix(predicted_labels)
    prediction[annotated_user_ids, :] = user_label_matrix[annotated_user_ids, :]

    # Drop any explicitly stored zeros that the row assignments introduced.
    prediction.eliminate_zeros()

    return prediction
def run_experiment(dataset_name,
                   dataset_folder,
                   feature_extraction_method_name,
                   percentages,
                   trial_num,
                   thread_num,
                   feature_extraction_parameters,
                   classifier_parameters):
    """
    Run a multi-label node classification experiment and print macro/micro F1 statistics.

    For every training percentage, trial_num train/test folds are drawn, a one-vs-rest linear SVM is
    trained on the training rows and evaluated on the test rows.

    Inputs:  - dataset_name: One of "snow2014", "flickr", "youtube", "politicsuk".
             - dataset_folder: Passed to the dataset reader that matches dataset_name.
             - feature_extraction_method_name: Passed to feature_extraction.
             - percentages: An iterable of labelling percentages; each value is passed to generate_folds.
             - trial_num: The number of folds (trials) evaluated per percentage.
             - thread_num: Passed to feature_extraction and used as n_jobs of the one-vs-rest classifier.
             - feature_extraction_parameters: A dictionary passed to feature_extraction. Its
                                              "community_weighting" entry must be None, "chi2" or "ivf".
             - classifier_parameters: A dictionary with the entries "C" and "fit_intercept" for LinearSVC.

    Output:  - None. Timings and the mean/std of macro and micro F1 are printed for every percentage.

    Raises:  - RuntimeError: If dataset_name or the community weighting selection is not recognised.
    """
    # Select the reader; "flickr" and "youtube" share the same reader.
    if dataset_name == "snow2014":
        read_dataset = read_snow2014graph_data
    elif dataset_name in ("flickr", "youtube"):
        read_dataset = read_asu_data
    elif dataset_name == "politicsuk":
        read_dataset = read_insight_data
    else:
        print("Invalid dataset name.")
        raise RuntimeError("Invalid dataset name.")

    adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_dataset(dataset_folder)
    print("Graphs and labels read.")

    feature_matrix,\
        feature_extraction_elapsed_time = feature_extraction(adjacency_matrix,
                                                             feature_extraction_method_name,
                                                             thread_num,
                                                             feature_extraction_parameters)
    print("Feature extraction elapsed time: ", feature_extraction_elapsed_time)

    # Both supported weighting schemes start from column-normalized features.
    weighting_scheme = feature_extraction_parameters["community_weighting"]
    if weighting_scheme is not None:
        if weighting_scheme in ("chi2", "ivf"):
            feature_matrix = normalize_columns(feature_matrix)
        else:
            print("Invalid community weighting selection.")
            raise RuntimeError("Invalid community weighting selection.")

    C = classifier_parameters["C"]
    fit_intercept = classifier_parameters["fit_intercept"]

    for percentage in percentages:
        # Initialize the metric storage arrays to zero.
        # np.float64 is used because the np.float alias no longer exists in current NumPy releases.
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)

        folds = generate_folds(node_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)

        for trial in range(trial_num):
            train, test = next(folds)
            ############################################################################################################
            # Separate train and test sets
            ############################################################################################################
            X_train, X_test, y_train, y_test = feature_matrix[train, :],\
                                               feature_matrix[test, :],\
                                               node_label_matrix[train, :],\
                                               node_label_matrix[test, :]

            if issparse(feature_matrix):
                # NOTE(review): the L2 fall-back is taken to pair with the "chi2" test, i.e. it runs for
                # sparse features under any other weighting selection - confirm the intended nesting.
                if weighting_scheme == "chi2":
                    # The community weights are derived from the training rows only.
                    contingency_matrix = chi2_contingency_matrix(X_train, y_train)
                    community_weights = peak_snr_weight_aggregation(contingency_matrix)

                    X_train, X_test = community_weighting(X_train, X_test, community_weights)
                else:
                    X_train = normalize(X_train, norm="l2")
                    X_test = normalize(X_test, norm="l2")

            ############################################################################################################
            # Train model
            ############################################################################################################
            # Train classifier.
            start_time = time.time()
            model = OneVsRestClassifier(svm.LinearSVC(C=C,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=fit_intercept),
                                        n_jobs=thread_num)

            model.fit(X_train, y_train)
            hypothesis_training_time = time.time() - start_time
            print('Model fitting time: ', hypothesis_training_time)

            ############################################################################################################
            # Make predictions
            ############################################################################################################
            start_time = time.time()
            y_pred = model.decision_function(X_test)
            prediction_time = time.time() - start_time
            print('Prediction time: ', prediction_time)

            ############################################################################################################
            # Calculate measures
            ############################################################################################################
            y_pred = evaluation.form_node_label_prediction_matrix(y_pred, y_test)

            # Entries 4 and 5 of the returned measures are stored as macro and micro F1.
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

        ################################################################################################################
        # Experiment results
        ################################################################################################################
        print(percentage)
        print('\n')
        print('Macro F1 average: ', np.mean(macro_F1))
        print('Micro F1 average: ', np.mean(micro_F1))
        print('Macro F1     std: ', np.std(macro_F1))
        print('Micro F1     std: ', np.std(micro_F1))
def run_prototype(snow_tweets_folder, prototype_output_folder, restart_probability, number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
           - Reads the user graph from "graph.tsv" and the user labels from "user_label_matrix.tsv",
             both located in snow_tweets_folder.
           - Extracts graph-based features using the ARCTE algorithm and normalizes their columns.
           - For labelling percentages 1 to 10, with 10 trials each, weighs the features using the
             training labels, trains a one-vs-rest linear SVM and evaluates it on the test users.
           - Writes the mean/std of the evaluation measures to "F1_average_scores.txt" in
             prototype_output_folder.

    Inputs:  - snow_tweets_folder: The folder that contains "graph.tsv" and "user_label_matrix.tsv".
             - prototype_output_folder: The folder in which the results file is written.
             - restart_probability: Passed to arcte as its second argument.
             - number_of_threads: The number of threads for arcte and the classifier. If None, the value
                                  returned by get_threads_number() is used.

    Output:  - None. The results are handed to write_results.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path,
                                             separator='\t')

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path,
                                                                                            '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    # NOTE(review): 0.00001 is the third positional argument of arcte - presumably an approximation
    # tolerance; confirm against the arcte signature.
    features = arcte(adjacency_matrix,
                     restart_probability,
                     0.00001,
                     number_of_threads=number_of_threads)

    features = normalize_columns(features)

    # The builtin int and np.float64 are used below because the np.int and np.float aliases no longer
    # exist in current NumPy releases.
    percentages = np.arange(1, 11, dtype=int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    # Position of every scalar measure inside the sequence returned by evaluation.calculate_measures.
    measure_index = {"macro_recall": 0,
                     "micro_recall": 1,
                     "macro_precision": 2,
                     "micro_precision": 3,
                     "macro_F1": 4,
                     "micro_F1": 5}

    # One mean and one standard deviation per percentage, for every scalar measure.
    mean_scores = {name: np.zeros(percentages.size, dtype=np.float64) for name in measure_index}
    std_scores = {name: np.zeros(percentages.size, dtype=np.float64) for name in measure_index}

    # Per-category F1, averaged over the trials of each percentage.
    F1 = np.zeros((percentages.size, number_of_categories), dtype=np.float64)

    for p, percentage in enumerate(percentages):
        # Initialize the metric storage arrays to zero
        trial_scores = {name: np.zeros(trial_num, dtype=np.float64) for name in measure_index}
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=np.float64)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)
        for trial in range(trial_num):
            train, test = next(folds)
            ############################################################################################################
            # Separate train and test sets
            ############################################################################################################
            X_train, X_test, y_train, y_test = features[train, :],\
                                               features[test, :],\
                                               user_label_matrix[train, :],\
                                               user_label_matrix[test, :]

            # The community weights are derived from the training rows only.
            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            ############################################################################################################
            # Train model
            ############################################################################################################
            # Train classifier
            model = OneVsRestClassifier(svm.LinearSVC(C=1,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=True),
                                        n_jobs=number_of_threads)

            model.fit(X_train, y_train)

            ############################################################################################################
            # Make predictions
            ############################################################################################################
            y_pred = model.decision_function(X_test)

            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            ############################################################################################################
            # Calculate measures
            ############################################################################################################
            measures = evaluation.calculate_measures(y_pred, y_test)

            for name, index in measure_index.items():
                trial_scores[name][trial] = measures[index]

            # Entry 6 holds one F1 value per category.
            trial_F1[trial, :] = measures[6]

        # Aggregate the trials of this percentage.
        for name in measure_index:
            mean_scores[name][p] = np.mean(trial_scores[name])
            std_scores[name][p] = np.std(trial_scores[name])

        F1[p, :] = np.mean(trial_F1, axis=0)

    # The order below (precision, recall, F1; macro before micro; per-category F1 last) is the layout
    # that is handed to write_results.
    ordered_names = ("macro_precision",
                     "micro_precision",
                     "macro_recall",
                     "micro_recall",
                     "macro_F1",
                     "micro_F1")
    measure_list = [(mean_scores[name], std_scores[name]) for name in ordered_names]
    measure_list.append(F1)

    write_results(measure_list,
                  os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))
def run_experiment(dataset_name,
                   dataset_folder,
                   feature_extraction_method_name,
                   percentages,
                   trial_num,
                   thread_num,
                   feature_extraction_parameters,
                   classifier_parameters):
    """
    Run a multi-label node classification experiment and print macro/micro F1 statistics.

    For every training percentage, trial_num train/test folds are drawn, a one-vs-rest linear SVM is
    trained on the training rows and evaluated on the test rows.

    Inputs:  - dataset_name: One of "snow2014", "flickr", "youtube", "politicsuk".
             - dataset_folder: Passed to the dataset reader that matches dataset_name.
             - feature_extraction_method_name: Passed to feature_extraction.
             - percentages: An iterable of labelling percentages; each value is passed to generate_folds.
             - trial_num: The number of folds (trials) evaluated per percentage.
             - thread_num: Passed to feature_extraction and used as n_jobs of the one-vs-rest classifier.
             - feature_extraction_parameters: A dictionary passed to feature_extraction. Its
                                              "community_weighting" entry must be None, "chi2" or "ivf".
             - classifier_parameters: A dictionary with the entries "C" and "fit_intercept" for LinearSVC.

    Output:  - None. Timings and the mean/std of macro and micro F1 are printed for every percentage.

    Raises:  - RuntimeError: If dataset_name or the community weighting selection is not recognised.
    """
    # Select the reader; "flickr" and "youtube" share the same reader.
    if dataset_name == "snow2014":
        read_dataset = read_snow2014graph_data
    elif dataset_name in ("flickr", "youtube"):
        read_dataset = read_asu_data
    elif dataset_name == "politicsuk":
        read_dataset = read_insight_data
    else:
        print("Invalid dataset name.")
        raise RuntimeError("Invalid dataset name.")

    adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_dataset(dataset_folder)
    print("Graphs and labels read.")

    feature_matrix,\
        feature_extraction_elapsed_time = feature_extraction(adjacency_matrix,
                                                             feature_extraction_method_name,
                                                             thread_num,
                                                             feature_extraction_parameters)
    print("Feature extraction elapsed time: ", feature_extraction_elapsed_time)

    # Both supported weighting schemes start from column-normalized features.
    weighting_scheme = feature_extraction_parameters["community_weighting"]
    if weighting_scheme is not None:
        if weighting_scheme in ("chi2", "ivf"):
            feature_matrix = normalize_columns(feature_matrix)
        else:
            print("Invalid community weighting selection.")
            raise RuntimeError("Invalid community weighting selection.")

    C = classifier_parameters["C"]
    fit_intercept = classifier_parameters["fit_intercept"]

    for percentage in percentages:
        # Initialize the metric storage arrays to zero.
        # np.float64 is used because the np.float alias no longer exists in current NumPy releases.
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)

        folds = generate_folds(node_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)

        for trial in range(trial_num):
            train, test = next(folds)
            ############################################################################################################
            # Separate train and test sets
            ############################################################################################################
            X_train, X_test, y_train, y_test = feature_matrix[train, :],\
                                               feature_matrix[test, :],\
                                               node_label_matrix[train, :],\
                                               node_label_matrix[test, :]

            if issparse(feature_matrix):
                # NOTE(review): the L2 fall-back is taken to pair with the "chi2" test, i.e. it runs for
                # sparse features under any other weighting selection - confirm the intended nesting.
                if weighting_scheme == "chi2":
                    # The community weights are derived from the training rows only.
                    contingency_matrix = chi2_contingency_matrix(X_train, y_train)
                    community_weights = peak_snr_weight_aggregation(contingency_matrix)

                    X_train, X_test = community_weighting(X_train, X_test, community_weights)
                else:
                    X_train = normalize(X_train, norm="l2")
                    X_test = normalize(X_test, norm="l2")

            ############################################################################################################
            # Train model
            ############################################################################################################
            # Train classifier.
            start_time = time.time()
            model = OneVsRestClassifier(svm.LinearSVC(C=C,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=fit_intercept),
                                        n_jobs=thread_num)

            model.fit(X_train, y_train)
            hypothesis_training_time = time.time() - start_time
            print('Model fitting time: ', hypothesis_training_time)

            ############################################################################################################
            # Make predictions
            ############################################################################################################
            start_time = time.time()
            y_pred = model.decision_function(X_test)
            prediction_time = time.time() - start_time
            print('Prediction time: ', prediction_time)

            ############################################################################################################
            # Calculate measures
            ############################################################################################################
            y_pred = evaluation.form_node_label_prediction_matrix(y_pred, y_test)

            # Entries 4 and 5 of the returned measures are stored as macro and micro F1.
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

        ################################################################################################################
        # Experiment results
        ################################################################################################################
        print(percentage)
        print('\n')
        print('Macro F1 average: ', np.mean(macro_F1))
        print('Micro F1 average: ', np.mean(micro_F1))
        print('Macro F1     std: ', np.std(macro_F1))
        print('Micro F1     std: ', np.std(micro_F1))