def get_top_vectorizer_dict(dataframe,
                            metric_name='accuracy',
                            top_n=1,
                            classifier=None):
    """Tally which vectorizers land in the top ``top_n`` and by what margin.

    For every (data file, classifier) pair a ranking is obtained from
    ``get_top_vectorizer_dict_given_data_classifier_metric``. The first
    element of each ranking entry is used as the vectorizer key.

    Args:
        dataframe: Results table; its ``data_filename`` column (and
            ``classifier_name`` column when ``classifier`` is None) is read
            through ``dataframeutils.get_unique_values_from_dataframe``.
        metric_name: Metric name forwarded to the ranking helper.
        top_n: Number of leading ranking entries counted per pair.
        classifier: When given, only this classifier is considered;
            otherwise every unique classifier name in ``dataframe`` is used.

    Returns:
        A ``(counts, margins)`` tuple. ``counts`` maps a vectorizer key to
        the number of pairs in which it appeared among the first ``top_n``
        ranking entries. ``margins`` starts at 0.0 for every entry of
        ``commonconstants.VECTORIZER_LIST`` and accumulates, for the first
        entry of each ranking, the value of ``get_margin_value``.

    Raises:
        KeyError: If the first-ranked key is not present in
            ``commonconstants.VECTORIZER_LIST``.
    """
    filenames = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'data_filename')
    margin_totals = dict.fromkeys(commonconstants.VECTORIZER_LIST, 0.0)

    if classifier is not None:
        classifiers = [classifier]
    else:
        classifiers = dataframeutils.get_unique_values_from_dataframe(
            dataframe, 'classifier_name')

    appearance_counts = {}
    for filename in filenames:
        for clf_name in classifiers:
            ranking = get_top_vectorizer_dict_given_data_classifier_metric(
                dataframe, filename, clf_name, metric_name)

            # A margin needs at least two entries; it is always credited to
            # the first entry and is computed on the untruncated ranking.
            if len(ranking) >= 2:
                margin = get_margin_value(ranking)
                leader = ranking[0][0]
                margin_totals[leader] += margin

            for entry in ranking[:top_n]:
                key = entry[0]
                appearance_counts[key] = appearance_counts.get(key, 0) + 1
    return appearance_counts, margin_totals
# Example #2
# 0
def get_global_metadata_visualization(less_than_value=None,
                                      greater_than_value=None):
    """Plot one histogram per dataset-metadata column.

    The dataframe is obtained from
    ``dataframeutils.get_dataframe_size_filter`` using the two bounds, and
    five histograms are drawn with
    ``visualizationutils.plot_histogram_chart``: rows, average sentence
    length, class imbalance, class labels, and tokens.

    Args:
        less_than_value: Forwarded unchanged to the size filter.
        greater_than_value: Forwarded unchanged to the size filter.

    Returns:
        None. The function's effect is the plotting calls.
    """
    dataframe = dataframeutils.get_dataframe_size_filter(
        less_than_value=less_than_value, greater_than_value=greater_than_value)

    def unique_of(column):
        # Shorthand for the unique values of a single column.
        return dataframeutils.get_unique_values_from_dataframe(
            dataframe, column_name=column)

    # (values, title, x-axis label) for each chart. All values are gathered
    # before any plotting starts.
    chart_specs = [
        (unique_of('num_rows'),
         'Distribution of rows', 'Number of Rows'),
        (unique_of('average_sentence_length'),
         'Distribution of Average Sentence Length', 'Average Sentence Length'),
        (unique_of('imbalance_measure'),
         'Distribution of Class Imbalance', 'Class Imbalance'),
        # Unlike the other columns, this one is taken in full rather than
        # reduced to unique values.
        (list(dataframe['num_class_labels'].values),
         'Distribution of Class Labels', 'Number of Class Labels'),
        (unique_of('num_tokens'),
         'Distribution of Tokens', 'Number of Tokens'),
    ]

    for values, title, x_label in chart_specs:
        target_file = get_filename_to_save([str(title)])
        visualizationutils.plot_histogram_chart(
            values,
            title_name=title,
            x_label=x_label,
            y_label='Count',
            filename_to_save=target_file)
def get_top_classifier_dict(dataframe,
                            metric_name='accuracy',
                            top_n=1,
                            vectorizer=None):
    """Count how often each classifier appears in the top ``top_n``.

    For every (data file, vectorizer) pair a ranking is obtained from
    ``get_top_classifier_dict_given_data_vectorizer_metric``. The first
    element of each of its leading ``top_n`` entries is tallied.

    Args:
        dataframe: Results table; its ``data_filename`` column (and
            ``vectorizer`` column when ``vectorizer`` is None) is read
            through ``dataframeutils.get_unique_values_from_dataframe``.
        metric_name: Metric name forwarded to the ranking helper.
        top_n: Number of leading ranking entries counted per pair.
        vectorizer: When given, only this vectorizer is considered;
            otherwise every unique vectorizer in ``dataframe`` is used.

    Returns:
        Dict mapping a classifier key to the number of pairs in which it
        appeared among the first ``top_n`` ranking entries.
    """
    filenames = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'data_filename')
    if vectorizer is not None:
        vectorizers = [vectorizer]
    else:
        vectorizers = dataframeutils.get_unique_values_from_dataframe(
            dataframe, 'vectorizer')

    appearance_counts = {}
    for filename in filenames:
        for vec_name in vectorizers:
            ranking = get_top_classifier_dict_given_data_vectorizer_metric(
                dataframe, filename, vec_name, metric_name)
            for entry in ranking[:top_n]:
                key = entry[0]
                appearance_counts[key] = appearance_counts.get(key, 0) + 1
    return appearance_counts
def get_top_vectorizer_given_category(dataframe,
                                      metric_name='accuracy',
                                      top_n=1,
                                      category_name_list=None,
                                      classifier=None):
    """Run ``get_top_vectorizer_dict`` separately for each category.

    Args:
        dataframe: Results table with a ``category_folder_name`` column.
        metric_name: Forwarded to ``get_top_vectorizer_dict``.
        top_n: Forwarded to ``get_top_vectorizer_dict``.
        category_name_list: Optional collection of category names; when
            given, categories not contained in it are skipped.
        classifier: Forwarded to ``get_top_vectorizer_dict``.

    Returns:
        Dict mapping each category name to whatever
        ``get_top_vectorizer_dict`` returns for that category's rows.
        NOTE(review): as written in this file that return value is a
        ``(counts, margins)`` tuple rather than a single dict - confirm
        callers expect the tuple.
    """
    selected = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'category_folder_name')
    if category_name_list is not None:
        # Lazy filter: membership is checked as the categories are consumed.
        selected = (name for name in selected if name in category_name_list)

    return {
        category: get_top_vectorizer_dict(
            dataframe[dataframe['category_folder_name'] == category],
            metric_name=metric_name,
            top_n=top_n,
            classifier=classifier)
        for category in selected
    }