def plot(matrix, name=None):
    """Draw a category-vs-category similarity matrix as a scatter "bubble" plot.

    matrix -- nested mapping {cat1: {cat2: similarity_value}}; values scale
              marker size (assumes values are in [0, 1] -- TODO confirm).
    name   -- optional output path; when given, the figure is saved there
              instead of (only) being drawn on the current pyplot figure.

    Color coding per row: blue = diagonal (self-similarity), red = the most
    similar other category in the row, green = everything else.
    """
    # plt.xkcd()
    plt.clf()
    # Map each category name to its axis position, preserving the order
    # returned by similarity.get_category_dict().
    indexes = OrderedDict([(cat, index) for index, cat in enumerate(similarity.get_category_dict())])
    size = 3000  # marker-area scale factor for plt.scatter's `s` argument
    # fig, ax = plt.subplots()
    # ax.xaxis.set_ticks_position('top')
    # plt.tick_params(top=True, bottom=False)
    xoptions = {'rotation': -45, 'horizontalalignment': 'right', 'rotation_mode': 'anchor', 'size': 'x-small'}
    yoptions = {'rotation': -45, 'horizontalalignment': 'right', 'rotation_mode': 'anchor', 'size': 'x-small'}
    plt.xticks(range(0, len(indexes)), indexes.keys(), **xoptions)
    plt.yticks(range(0, len(indexes)), indexes.keys(), **yoptions)
    # Move the x tick marks to the top edge, then re-apply the tick labels.
    # NOTE(review): the repeated xticks call looks intentional (re-applying
    # labels after moving the ticks), but could be redundant -- verify.
    plt.gcf().axes[0].xaxis.set_ticks_position('top')
    plt.xticks(range(0, len(indexes)), indexes.keys(), **xoptions)
    for cat1 in matrix:
        # Most-similar other category in this row (largest value wins).
        mcat = max(matrix[cat1], key=lambda x: matrix[cat1][x])
        for cat2, val in matrix[cat1].items():
            color = 'g'
            if cat2 == mcat:
                color = 'r'
            if cat1 == cat2:
                # Diagonal overrides the "row maximum" color.
                color = 'b'
            plt.scatter(indexes[cat2], indexes[cat1], c=color, s=val * size, alpha=0.5)
    plt.grid(True)
    plt.subplots_adjust(left=0.20, right=0.95, top=0.80, bottom=0.05)
    # plt.show()
    plt.gcf().set_size_inches(10, 10)
    if name:
        # presumably ensures the output directory exists -- verify iotools.make_dir
        iotools.make_dir(name)
        plt.savefig(name, dpi=300)  # save as png
def get_similarity_old_matrix():
    """Build the full category-by-category similarity matrix using the
    old-keyword metric (df_old_keywords_list + sf_simple).

    Returns an OrderedDict of OrderedDicts:
    {row_category: {column_category: similarity_value}}, with categories
    in the order supplied by similarity.get_category_dict().
    """
    metric_args = [similarity.metrics.df_old_keywords_list, similarity.metrics.sf_simple]
    categories = similarity.get_category_dict()
    matrix = OrderedDict()
    for row_cat, row_datasets in categories.items():
        matrix[row_cat] = OrderedDict(
            (col_cat, similarity.metrics.group_group_similarity(row_datasets, col_datasets, *metric_args))
            for col_cat, col_datasets in categories.items()
        )
    return matrix
def get_similarity_new_matrix_weighted():
    """Build the full category-by-category similarity matrix using the
    weighted new-keyword metric (df_new_keywords_list_weighted + sf_weight).

    Returns an OrderedDict of OrderedDicts:
    {row_category: {column_category: similarity_value}}, with categories
    in the order supplied by similarity.get_category_dict().
    """
    metric_args = [similarity.metrics.df_new_keywords_list_weighted, similarity.metrics.sf_weight]
    categories = similarity.get_category_dict()
    matrix = OrderedDict()
    for row_cat, row_datasets in categories.items():
        matrix[row_cat] = OrderedDict(
            (col_cat, similarity.metrics.group_group_similarity(row_datasets, col_datasets, *metric_args))
            for col_cat, col_datasets in categories.items()
        )
    return matrix
def similarity_list_groups(): category_dict = similarity.get_category_dict() for name1, dlist1 in category_dict.items(): for name2, dlist2 in category_dict.items(): group_similarity2 = group_group_similarity(dlist1, dlist2, df_old_keywords_list, sf_simple) group_similarity = group_group_similarity(dlist1, dlist2, df_new_keywords_list, sf_simple) if name1 == name2: print "%s\t%s\n\t\t%s\t(old: %%%4.2f new: %%%4.2f)" % ( name1, name2, len(dlist1), group_similarity2 * 100, group_similarity * 100)
def minmax_similarity_items(): option_dict = {"new": [df_new_keywords_list, sf_simple], "old": [df_old_keywords_list, sf_simple]} category_dict = similarity.get_category_dict() for key, options in option_dict.items(): print key print print for name1, dlist1 in category_dict.items(): for name2, dlist2 in category_dict.items(): max_val = group_similarity(dlist1, dlist2, max, *options) min_val = group_similarity(dlist1, dlist2, min, *options) if name1 == name2: print "%s\t%s" % (len(dlist1), len(dlist2)) print "%s\t%s\n\t\min: %%%4.2f max: %%%4.2f\n" % (name1, name2, min_val, max_val)
def similarity_items(): option_dict = {"new": [df_new_keywords_list, sf_simple], "old": [df_old_keywords_list, sf_simple]} category_dict = similarity.get_category_dict() error = {} for key, options in option_dict.items(): error[key] = [] for name, dataset in iotools.load_datasets_dict().items(): cat = dataset['category'] self_sim = item_group_similarity(dataset, category_dict[cat], *options) out_sim, cat2 = max( [(item_group_similarity(dataset, dlist, *options), cat2) for cat2, dlist in category_dict.items() if cat2 != cat], key=lambda x: x[0]) if self_sim < out_sim: error[key].append(name) print "%s\t\t(%s)\n\t\tself: %%%4.2f out: %%%4.2f\t\t(%s)\n" % (name, cat, self_sim, out_sim, cat2) shared = len(set(error['new']).intersection(set(error['old']))) distinct = len(error['new']) + len(error['old']) - shared print shared, distinct
def get_dataset_compatibility(keyword_list, use_numbering_for_key=False, new_keywords=True):
    """Score *keyword_list* against every dataset category.

    keyword_list          -- the item whose compatibility is measured.
    use_numbering_for_key -- when True, key the result by category index
                             (order of similarity.get_categories()) instead
                             of category name.
    new_keywords          -- when True, compare against each dataset's
                             extracted keywords (the 'all' bucket of
                             iotools.load_dataset_keywords_dict); otherwise
                             against the dataset's declared 'keywords'.

    Returns a dict mapping category name (or index) -> similarity value.
    """
    metric_args = [df_simple_list, sf_simple]
    similarity_dict = {}
    for cat_name, datasets in similarity.get_category_dict().items():
        if new_keywords:
            member_keywords = [iotools.load_dataset_keywords_dict(ds['name'])['all'] for ds in datasets]
        else:
            member_keywords = [ds['keywords'] for ds in datasets]
        similarity_dict[cat_name] = item_group_similarity(keyword_list, member_keywords, *metric_args)
    if not use_numbering_for_key:
        return similarity_dict
    # Re-key by category position instead of category name.
    return dict(
        (cid, similarity_dict[cat]) for cid, cat in enumerate(similarity.get_categories())
    )
# NOTE(review): the indented lines below are the tail of a validator function
# whose `def` line is above this chunk (outside view); they are reproduced
# unchanged at function-body indentation.  It appears to compare formats
# mentioned in a long description against the dataset's declared formats --
# confirm against the full function.
    # Debug dump of the compared values.
    print dataset_format
    print '---'
    print formats
    print '---'
    print text
    print '---'
    print
    # `formats` / `dataset_format` behave as sets here (set difference) --
    # TODO confirm their construction in the unseen part of the function.
    if len(formats-dataset_format) > 0:
        return False, "'%s' format(s) might mentions in long description, but not in formats (%s)" % (
            ", ".join(formats-dataset_format), ", ".join(dataset_format))
    # Validator contract: (ok, message-or-None).
    return True, None


# Module-level cache of the category -> dataset-list mapping, shared by the
# checks in this file.
category_dict = get_category_dict()


def categories_conflicts(dataset):
    """Validate that *dataset* is most similar to its own category.

    dataset -- a dataset record with at least a 'category' key.

    Returns (True, None) when the dataset's own category scores highest
    under the new-keyword metric (or when its category has only itself as
    a member); otherwise (False, explanation-string).
    """
    global category_dict  # read-only access; `global` is not required for reads
    current_cat = dataset['category']
    options = [df_new_keywords_list, sf_simple]
    # Similarity of this dataset to every category's member list.
    similarity_dict = dict([
        (cat, item_group_similarity(dataset, dlist, *options))
        for cat, dlist in category_dict.items()
    ])
    # (category, value) pair with the highest similarity.
    max_cat = max(similarity_dict.items(), key=lambda x: x[1])
    # A singleton category is exempt: with only the dataset itself as a
    # member, another category can legitimately score higher.
    if max_cat[0] != current_cat and len(category_dict[current_cat]) > 1:
        return False, "Dataset is more similar to '%s' category (%4.2f%%) than its own category, '%s' (%4.2f%%)" % (
            max_cat[0], max_cat[1] * 100, current_cat, similarity_dict[current_cat] * 100
        )
    return True, None
def get_all_datasets():
    """Render the page listing every dataset grouped by category."""
    dataset_dict = similarity.get_category_dict()
    return render_template('all-datasets.html', dataset_dict=dataset_dict)
def get_all_datasets_with_keywords():
    """Render the page listing every dataset together with its keywords."""
    context = {
        'dataset_dict': similarity.get_category_dict(),
        'keywords': iotools.load_dataset_keywords_dict(),
    }
    return render_template('datasets-with-keywords.html', **context)