def user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads):
    """
    Perform user classification.

    Inputs:  - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - user_label_matrix: A user-to-label matrix in scipy sparse matrix format.
             - annotated_user_ids: An array of node indices corresponding to the annotated users.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - number_of_threads: The number of parallel threads to use for model fitting.

    Output:  - prediction: The output of the classification in scipy sparse matrix format.
    """
    non_annotated_user_ids = np.setdiff1d(np.arange(len(node_to_id), dtype=int), annotated_user_ids)

    features = normalize_columns(features)

    X_train = features[annotated_user_ids, :]
    X_test = features[non_annotated_user_ids, :]
    y_train = user_label_matrix[annotated_user_ids, :]

    X_train, X_test = chi2_psnr_community_weighting(X_train, X_test, y_train)
    # X_train = normalize(X_train, norm="l2")
    # X_test = normalize(X_test, norm="l2")
    print("Performed community weighting.")

    model = model_fit(X_train,
                      y_train,
                      svm_hardness=10.0,
                      fit_intercept=True,
                      number_of_threads=number_of_threads,
                      classifier_type="LogisticRegression")

    meta_model = meta_model_fit(X_train,
                                y_train,
                                svm_hardness=10.0,
                                fit_intercept=True,
                                number_of_threads=number_of_threads,
                                regressor_type="LinearSVR")
    print("Classification model has been trained.")

    y_train_pred_proba = weigh_users(X_train,
                                     model,
                                     classifier_type="LogisticRegression")
    y_test_pred = classify_users(X_test,
                                 model,
                                 classifier_type="LogisticRegression",
                                 meta_model=meta_model,
                                 upper_cutoff=20)

    prediction = form_prediction_matrix(y_train_pred_proba,
                                        y_test_pred,
                                        user_label_matrix,
                                        annotated_user_ids,
                                        non_annotated_user_ids)

    return prediction
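The helper form_prediction_matrix is not shown in this listing; the second variant below inlines equivalent stitching logic. Here is a minimal, self-contained sketch consistent with that logic (how y_train_pred_proba is consumed is not visible here, so the sketch simply keeps the ground-truth rows for annotated users):

import numpy as np
import scipy.sparse as spsp

def form_prediction_matrix(y_train_pred_proba, y_test_pred,
                           user_label_matrix, annotated_user_ids,
                           non_annotated_user_ids):
    # Sketch only: combine ground-truth label rows for annotated users with
    # predicted rows for everyone else. Build in LIL format (efficient for
    # row assignment), then convert to CSR for the caller.
    prediction = spsp.lil_matrix(user_label_matrix.shape, dtype=np.float64)
    prediction[annotated_user_ids, :] = user_label_matrix[annotated_user_ids, :]
    prediction[non_annotated_user_ids, :] = spsp.csr_matrix(y_test_pred)
    prediction = prediction.tocsr()
    prediction.eliminate_zeros()
    return prediction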
def user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads):
    """
    Perform user classification.

    Inputs:  - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - user_label_matrix: A user-to-label matrix in scipy sparse matrix format.
             - annotated_user_ids: An array of node indices corresponding to the annotated users.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - number_of_threads: The number of parallel threads to use for model fitting.

    Output:  - prediction: The output of the classification in scipy sparse matrix format.
    """
    non_annotated_user_ids = np.setdiff1d(np.arange(len(node_to_id), dtype=int), annotated_user_ids)

    features = normalize_columns(features)

    X_train = features[annotated_user_ids, :]
    X_test = features[non_annotated_user_ids, :]
    y_train = user_label_matrix[annotated_user_ids, :]

    X_train, X_test = chi2_psnr_community_weighting(X_train, X_test, y_train)
    print("Performed community weighting.")

    model = model_fit(X_train,
                      y_train,
                      svm_hardness=10.0,
                      fit_intercept=True,
                      number_of_threads=number_of_threads,
                      classifier_type="LogisticRegression")
                      # Alternative: classifier_type="RandomForest"
    print("Classification model has been trained.")
    # Build the prediction matrix in LIL format; row assignment into CSR triggers a SparseEfficiencyWarning.
    prediction = spsp.lil_matrix(user_label_matrix.shape, dtype=np.float64)
    y_pred = classify_users(X_test,
                            model,
                            classifier_type="LogisticRegression")
                            # Alternative: classifier_type="RandomForest"
    print("Classification on new data has been performed.")
    y_pred = spsp.csr_matrix(y_pred)
    prediction[non_annotated_user_ids, :] = y_pred
    prediction[annotated_user_ids, :] = user_label_matrix[annotated_user_ids, :]
    prediction = prediction.tocsr()
    prediction.eliminate_zeros()

    return prediction
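A hedged usage sketch for either variant (the availability of user_classification and its project-local helpers is an assumption; the synthetic inputs only illustrate the expected shapes and types):

import numpy as np
import scipy.sparse as spsp

# Hypothetical driver: assumes user_classification and its helpers
# (model_fit, classify_users, ...) are importable from this codebase.
n_users, n_features, n_labels = 100, 50, 5
features = spsp.random(n_users, n_features, density=0.1, format="csr")
user_label_matrix = spsp.random(n_users, n_labels, density=0.2, format="csr")
annotated_user_ids = np.arange(30)  # first 30 nodes carry labels
node_to_id = {node: 1000 + node for node in range(n_users)}  # placeholder Twitter ids

prediction = user_classification(features, user_label_matrix,
                                 annotated_user_ids, node_to_id,
                                 number_of_threads=4)
print(prediction.shape)  # (100, 5)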
Example #3
def run_experiment(dataset_name,
                   dataset_folder,
                   feature_extraction_method_name,
                   percentages,
                   trial_num,
                   thread_num,
                   feature_extraction_parameters,
                   classifier_parameters):
    if dataset_name == "snow2014":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_snow2014graph_data(dataset_folder)
    elif dataset_name == "flickr":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_asu_data(dataset_folder)
    elif dataset_name == "youtube":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_asu_data(dataset_folder)
    elif dataset_name == "politicsuk":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_insight_data(dataset_folder)
    else:
        raise ValueError("Invalid dataset name: " + dataset_name)
    print("Graphs and labels read.")

    feature_matrix,\
    feature_extraction_elapsed_time = feature_extraction(adjacency_matrix,
                                                         feature_extraction_method_name,
                                                         thread_num,
                                                         feature_extraction_parameters)
    print("Feature extraction elapsed time: ", feature_extraction_elapsed_time)
    if feature_extraction_parameters["community_weighting"] is None:
        pass
    elif feature_extraction_parameters["community_weighting"] in ("chi2", "ivf"):
        feature_matrix = normalize_columns(feature_matrix)
    else:
        raise ValueError("Invalid community weighting selection.")

    C = classifier_parameters["C"]
    fit_intercept = classifier_parameters["fit_intercept"]

    for p in np.arange(percentages.size):
        percentage = percentages[p]

        # Initialize the metric storage arrays to zero
        macro_F1 = np.zeros(trial_num, dtype=np.float64)  # np.float was removed in NumPy 1.24
        micro_F1 = np.zeros(trial_num, dtype=np.float64)

        folds = generate_folds(node_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)

        for trial in np.arange(trial_num):
            train, test = next(folds)
            ########################################################################################################
            # Separate train and test sets
            ########################################################################################################
            X_train, X_test, y_train, y_test = feature_matrix[train, :],\
                                               feature_matrix[test, :],\
                                               node_label_matrix[train, :],\
                                               node_label_matrix[test, :]

            if issparse(feature_matrix):
                if feature_extraction_parameters["community_weighting"] == "chi2":
                    contingency_matrix = chi2_contingency_matrix(X_train, y_train)
                    community_weights = peak_snr_weight_aggregation(contingency_matrix)

                    X_train, X_test = community_weighting(X_train, X_test, community_weights)
                else:
                    X_train = normalize(X_train, norm="l2")
                    X_test = normalize(X_test, norm="l2")

            ############################################################################################################
            # Train model
            ############################################################################################################
            # Train classifier.
            start_time = time.time()
            model = OneVsRestClassifier(svm.LinearSVC(C=C,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=fit_intercept),
                                        n_jobs=thread_num)

            model.fit(X_train, y_train)
            hypothesis_training_time = time.time() - start_time
            print('Model fitting time: ', hypothesis_training_time)

            ############################################################################################################
            # Make predictions
            ############################################################################################################
            start_time = time.time()
            y_pred = model.decision_function(X_test)
            prediction_time = time.time() - start_time
            print('Prediction time: ', prediction_time)

            ############################################################################################################
            # Calculate measures
            ############################################################################################################
            y_pred = evaluation.form_node_label_prediction_matrix(y_pred, y_test)

            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

            # print('Trial ', trial+1, ':')
            # print(' Macro-F1:        ', macro_F1[trial])
            # print(' Micro-F1:        ', micro_F1[trial])
            # print('\n')

        ################################################################################################################
        # Experiment results
        ################################################################################################################
        print('Percentage: ', percentage)
        print('\n')
        print('Macro F1        average: ', np.mean(macro_F1))
        print('Micro F1        average: ', np.mean(micro_F1))
        print('Macro F1            std: ', np.std(macro_F1))
        print('Micro F1            std: ', np.std(micro_F1))
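The chi2_contingency_matrix, peak_snr_weight_aggregation, and community_weighting helpers are project-local and not defined in this listing. A rough, self-contained approximation of the column re-weighting step (the chi-squared scoring uses scikit-learn; the max-over-labels aggregation is a stand-in for the peak-SNR aggregation and is an assumption):

import numpy as np
import scipy.sparse as spsp
from sklearn.feature_selection import chi2

def chi2_community_weighting(X_train, X_test, y_train):
    # Score every (non-negative) feature column against each label column
    # with chi2, aggregate across labels by taking the peak score, and
    # rescale the columns of both splits by the normalized scores.
    y_dense = y_train.toarray() if spsp.issparse(y_train) else np.asarray(y_train)
    scores = np.zeros(X_train.shape[1])
    for label in range(y_dense.shape[1]):
        label_scores, _ = chi2(X_train, y_dense[:, label])
        scores = np.maximum(scores, np.nan_to_num(label_scores))
    weights = spsp.diags(scores / (scores.max() + 1e-12))
    return X_train @ weights, X_test @ weights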
def run_prototype(snow_tweets_folder,
                  prototype_output_folder,
                  restart_probability,
                  number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
           - Reads a set of tweets from a local folder.
           - Forms graphs and text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path,
                                             separator='\t')
    number_of_nodes = adjacency_matrix.shape[0]

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path,
                                                                                            '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    features = arcte(adjacency_matrix,
                     restart_probability,
                     0.00001,
                     number_of_threads=number_of_threads)

    features = normalize_columns(features)

    percentages = np.arange(1, 11, dtype=int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    mean_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    F1 = np.zeros((percentages.size, number_of_categories), dtype=np.float64)
    for p in np.arange(percentages.size):
        percentage = percentages[p]
        # Initialize the metric storage arrays to zero
        macro_precision = np.zeros(trial_num, dtype=np.float64)
        micro_precision = np.zeros(trial_num, dtype=np.float64)
        macro_recall = np.zeros(trial_num, dtype=np.float64)
        micro_recall = np.zeros(trial_num, dtype=np.float64)
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=np.float64)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)
        for trial in np.arange(trial_num):
            train, test = next(folds)
            ########################################################################################################
            # Separate train and test sets
            ########################################################################################################
            X_train, X_test, y_train, y_test = features[train, :],\
                                               features[test, :],\
                                               user_label_matrix[train, :],\
                                               user_label_matrix[test, :]

            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            ####################################################################################################
            # Train model
            ####################################################################################################
            # Train classifier
            model = OneVsRestClassifier(svm.LinearSVC(C=1,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=True),
                                        n_jobs=number_of_threads)

            model.fit(X_train, y_train)
            ####################################################################################################
            # Make predictions
            ####################################################################################################
            y_pred = model.decision_function(X_test)

            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            ########################################################################################################
            # Calculate measures
            ########################################################################################################
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_recall[trial] = measures[0]
            micro_recall[trial] = measures[1]

            macro_precision[trial] = measures[2]
            micro_precision[trial] = measures[3]

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

            trial_F1[trial, :] = measures[6]

        mean_macro_precision[p] = np.mean(macro_precision)
        std_macro_precision[p] = np.std(macro_precision)
        mean_micro_precision[p] = np.mean(micro_precision)
        std_micro_precision[p] = np.std(micro_precision)
        mean_macro_recall[p] = np.mean(macro_recall)
        std_macro_recall[p] = np.std(macro_recall)
        mean_micro_recall[p] = np.mean(micro_recall)
        std_micro_recall[p] = np.std(micro_recall)
        mean_macro_F1[p] = np.mean(macro_F1)
        std_macro_F1[p] = np.std(macro_F1)
        mean_micro_F1[p] = np.mean(micro_F1)
        std_micro_F1[p] = np.std(micro_F1)
        F1[p, :] = np.mean(trial_F1, axis=0)

    measure_list = [(mean_macro_precision, std_macro_precision),
                    (mean_micro_precision, std_micro_precision),
                    (mean_macro_recall, std_macro_recall),
                    (mean_micro_recall, std_micro_recall),
                    (mean_macro_F1, std_macro_F1),
                    (mean_micro_F1, std_micro_F1),
                    F1]

    write_results(measure_list,
                  os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))
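A hedged invocation sketch for the prototype (all paths are placeholders; get_threads_number is project-local):

# Hypothetical driver for the prototype run above.
run_prototype(snow_tweets_folder="/data/snow_tweets",
              prototype_output_folder="/data/prototype_output",
              restart_probability=0.1,  # placeholder value
              number_of_threads=None)  # None -> auto-detected via get_threads_number()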