def get_top_vectorizer_dict(dataframe,
                            metric_name='accuracy',
                            top_n=1,
                            classifier=None):
    """Count top-ranked vectorizers and accumulate the leaders' margins.

    For every (data file, classifier) pair a ranking is requested from
    ``get_top_vectorizer_dict_given_data_classifier_metric``. The first
    element of each ranking entry is used as the vectorizer key.
    NOTE(review): the ranking is presumably ordered best-first -- confirm
    against that helper.

    Args:
        dataframe: Results table; its ``data_filename`` column (and
            ``classifier_name`` when ``classifier`` is None) is read through
            ``dataframeutils.get_unique_values_from_dataframe``.
        metric_name: Metric name forwarded to the ranking helper.
        top_n: Number of leading ranking entries counted per pair.
        classifier: When given, only this classifier is considered;
            otherwise every unique ``classifier_name`` in ``dataframe``.

    Returns:
        A ``(top_vectorizer_dict, vectorizer_margin_dict)`` tuple. The first
        maps a vectorizer to the number of times it appeared within the
        first ``top_n`` ranking entries. The second maps every entry of
        ``commonconstants.VECTORIZER_LIST`` to the sum of
        ``get_margin_value`` results for rankings it led (0.0 if none).

    Raises:
        KeyError: If the first-ranked vectorizer of a ranking with at least
            two entries is not in ``commonconstants.VECTORIZER_LIST``.
    """
    datasets = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'data_filename')

    # Every known vectorizer starts with a zero margin total.
    margin_totals = {name: 0.0 for name in commonconstants.VECTORIZER_LIST}

    if classifier is not None:
        classifiers = [classifier]
    else:
        classifiers = dataframeutils.get_unique_values_from_dataframe(
            dataframe, 'classifier_name')

    top_counts = {}
    for dataset in datasets:
        for classifier_name in classifiers:
            ranking = get_top_vectorizer_dict_given_data_classifier_metric(
                dataframe, dataset, classifier_name, metric_name)

            # A margin is only computed when the ranking has a runner-up.
            if len(ranking) >= 2:
                margin = get_margin_value(ranking)
                leader = ranking[0][0]
                margin_totals[leader] = margin_totals[leader] + margin

            for entry in ranking[:top_n]:
                vectorizer = entry[0]
                top_counts[vectorizer] = top_counts.get(vectorizer, 0) + 1

    return top_counts, margin_totals
def get_global_metadata_visualization(less_than_value=None,
                                      greater_than_value=None):
    """Plot one histogram per dataset metadata column.

    The dataframe is obtained from
    ``dataframeutils.get_dataframe_size_filter`` using the two bounds, and
    five histograms are drawn through
    ``visualizationutils.plot_histogram_chart``: number of rows, average
    sentence length, class imbalance, number of class labels and number of
    tokens. Each chart is saved under the name returned by
    ``get_filename_to_save`` for its title.

    Args:
        less_than_value: Forwarded unchanged to ``get_dataframe_size_filter``.
        greater_than_value: Forwarded unchanged to
            ``get_dataframe_size_filter``.

    Returns:
        None. The function is used for its plotting side effects.
    """
    dataframe = dataframeutils.get_dataframe_size_filter(
        less_than_value=less_than_value,
        greater_than_value=greater_than_value)

    def unique_values(column):
        # Shared lookup for the columns that are reduced to unique values.
        return dataframeutils.get_unique_values_from_dataframe(
            dataframe, column_name=column)

    # All inputs are gathered before anything is plotted.
    row_counts = unique_values('num_rows')
    sentence_lengths = unique_values('average_sentence_length')
    imbalance_values = unique_values('imbalance_measure')
    # Unlike the other columns, class-label counts are taken as-is
    # (no unique-value reduction).
    class_label_counts = list(dataframe['num_class_labels'].values)
    token_counts = unique_values('num_tokens')

    # (values, chart title, x-axis label) for each histogram.
    charts = (
        (row_counts, 'Distribution of rows', 'Number of Rows'),
        (sentence_lengths, 'Distribution of Average Sentence Length',
         'Average Sentence Length'),
        (imbalance_values, 'Distribution of Class Imbalance',
         'Class Imbalance'),
        (class_label_counts, 'Distribution of Class Labels',
         'Number of Class Labels'),
        (token_counts, 'Distribution of Tokens', 'Number of Tokens'),
    )

    for values, title, x_label in charts:
        target_file = get_filename_to_save([str(title)])
        visualizationutils.plot_histogram_chart(
            values,
            title_name=title,
            x_label=x_label,
            y_label='Count',
            filename_to_save=target_file)
def get_top_classifier_dict(dataframe,
                            metric_name='accuracy',
                            top_n=1,
                            vectorizer=None):
    """Count how often each classifier lands in the first ``top_n`` ranks.

    For every (data file, vectorizer) pair a ranking is requested from
    ``get_top_classifier_dict_given_data_vectorizer_metric``. The first
    element of each ranking entry is used as the classifier key.
    NOTE(review): the ranking is presumably ordered best-first -- confirm
    against that helper.

    Args:
        dataframe: Results table; its ``data_filename`` column (and
            ``vectorizer`` column when ``vectorizer`` is None) is read
            through ``dataframeutils.get_unique_values_from_dataframe``.
        metric_name: Metric name forwarded to the ranking helper.
        top_n: Number of leading ranking entries counted per pair.
        vectorizer: When given, only this vectorizer is considered;
            otherwise every unique ``vectorizer`` value in ``dataframe``.

    Returns:
        A dict mapping a classifier to the number of times it appeared
        within the first ``top_n`` ranking entries.
    """
    datasets = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'data_filename')

    if vectorizer is not None:
        vectorizers = [vectorizer]
    else:
        vectorizers = dataframeutils.get_unique_values_from_dataframe(
            dataframe, 'vectorizer')

    top_counts = {}
    for dataset in datasets:
        for vectorizer_name in vectorizers:
            ranking = get_top_classifier_dict_given_data_vectorizer_metric(
                dataframe, dataset, vectorizer_name, metric_name)
            for entry in ranking[:top_n]:
                classifier_key = entry[0]
                top_counts[classifier_key] = (
                    top_counts.get(classifier_key, 0) + 1)

    return top_counts
def get_top_vectorizer_given_category(dataframe,
                                      metric_name='accuracy',
                                      top_n=1,
                                      category_name_list=None,
                                      classifier=None):
    """Run ``get_top_vectorizer_dict`` separately for each category.

    Categories are the unique values of the ``category_folder_name``
    column, optionally restricted to those also present in
    ``category_name_list``.

    Args:
        dataframe: Results table containing a ``category_folder_name``
            column.
        metric_name: Forwarded to ``get_top_vectorizer_dict``.
        top_n: Forwarded to ``get_top_vectorizer_dict``.
        category_name_list: Optional collection of category names to keep.
            ``None`` keeps every category found in ``dataframe``.
        classifier: Forwarded to ``get_top_vectorizer_dict``.

    Returns:
        A dict mapping each category name to whatever
        ``get_top_vectorizer_dict`` returns for the rows of that category.
        In this file that is a ``(counts, margins)`` tuple rather than a
        single dict.
    """
    categories = dataframeutils.get_unique_values_from_dataframe(
        dataframe, 'category_folder_name')
    if category_name_list is not None:
        categories = [
            name for name in categories if name in category_name_list
        ]

    per_category = {}
    for category in categories:
        # Restrict the table to the rows belonging to this category.
        category_rows = dataframe[
            dataframe['category_folder_name'] == category]
        per_category[category] = get_top_vectorizer_dict(
            category_rows,
            metric_name=metric_name,
            top_n=top_n,
            classifier=classifier)

    return per_category