Exemplo n.º 1
0
def plot(matrix, name=None):
    """Draw a category-by-category similarity matrix as a bubble chart.

    Each category from ``similarity.get_category_dict()`` gets one slot on
    both axes, in the dict's iteration order.  For every ``matrix[cat1][cat2]``
    value a circle is drawn at column ``cat2`` / row ``cat1`` whose area is
    the value times 3000.  Circles are coloured:

    * blue  -- the diagonal (``cat1 == cat2``),
    * red   -- the entry with the highest value in its row,
    * green -- everything else.

    The diagonal check wins when an entry is both on the diagonal and the
    row maximum.

    Args:
        matrix: mapping ``{cat1: {cat2: value}}``; every category used must
            be a key of ``similarity.get_category_dict()``.
        name: output path.  When truthy, ``iotools.make_dir(name)`` is called
            and the figure is saved there at 300 dpi; otherwise nothing is
            written.
    """
    plt.clf()
    slot = OrderedDict((cat, pos) for pos, cat in enumerate(similarity.get_category_dict()))
    bubble_scale = 3000

    tick_style = {
        'rotation': -45,
        'horizontalalignment': 'right',
        'rotation_mode': 'anchor',
        'size': 'x-small',
    }
    tick_positions = range(len(slot))
    tick_labels = slot.keys()

    plt.xticks(tick_positions, tick_labels, **tick_style)
    plt.yticks(tick_positions, tick_labels, **tick_style)
    plt.gcf().axes[0].xaxis.set_ticks_position('top')
    # NOTE(review): the x tick labels are styled a second time after the ticks
    # are moved to the top -- presumably because the move switches to labels
    # that have not been styled yet; confirm before removing either call.
    plt.xticks(tick_positions, tick_labels, **tick_style)

    for row_cat in matrix:
        row = matrix[row_cat]
        strongest = max(row, key=lambda other: row[other])
        for col_cat, value in row.items():
            if row_cat == col_cat:
                shade = 'b'
            elif col_cat == strongest:
                shade = 'r'
            else:
                shade = 'g'
            plt.scatter(slot[col_cat], slot[row_cat], c=shade, s=value * bubble_scale, alpha=0.5)

    plt.grid(True)
    plt.subplots_adjust(left=0.20, right=0.95, top=0.80, bottom=0.05)
    plt.gcf().set_size_inches(10, 10)

    if not name:
        return
    iotools.make_dir(name)
    plt.savefig(name, dpi=300)  # save as png
Exemplo n.º 2
0
def get_similarity_old_matrix():
    """Build the pairwise category similarity matrix with the "old" keyword options.

    Every ordered pair of categories from ``similarity.get_category_dict()``
    is scored by ``similarity.metrics.group_group_similarity``, which receives
    the two dataset lists followed by ``df_old_keywords_list`` and
    ``sf_simple``.

    Returns:
        OrderedDict ``{cat1: OrderedDict({cat2: similarity})}``, with rows and
        columns in the iteration order of the category dict.
    """
    metrics = similarity.metrics
    extra_args = (metrics.df_old_keywords_list, metrics.sf_simple)
    groups = similarity.get_category_dict()

    matrix = OrderedDict()
    for row_cat, row_datasets in groups.items():
        matrix[row_cat] = OrderedDict(
            (col_cat, metrics.group_group_similarity(row_datasets, col_datasets, *extra_args))
            for col_cat, col_datasets in groups.items()
        )
    return matrix
Exemplo n.º 3
0
def get_similarity_new_matrix_weighted():
    """Build the pairwise category similarity matrix with the weighted "new" options.

    Every ordered pair of categories from ``similarity.get_category_dict()``
    is scored by ``similarity.metrics.group_group_similarity``, which receives
    the two dataset lists followed by ``df_new_keywords_list_weighted`` and
    ``sf_weight``.

    Returns:
        OrderedDict ``{cat1: OrderedDict({cat2: similarity})}``, with rows and
        columns in the iteration order of the category dict.
    """
    metrics = similarity.metrics
    extra_args = (metrics.df_new_keywords_list_weighted, metrics.sf_weight)
    groups = similarity.get_category_dict()

    def score_row(row_datasets):
        # One matrix row: similarity of `row_datasets` against every category.
        return OrderedDict(
            (col_cat, metrics.group_group_similarity(row_datasets, col_datasets, *extra_args))
            for col_cat, col_datasets in groups.items()
        )

    matrix = OrderedDict()
    for row_cat, row_datasets in groups.items():
        matrix[row_cat] = score_row(row_datasets)
    return matrix
Exemplo n.º 4
0
def similarity_list_groups():
    """Print each category's self-similarity under the old and new keyword options.

    For every category in ``similarity.get_category_dict()`` the category's
    dataset list is compared with itself through ``group_group_similarity``,
    once with ``df_old_keywords_list`` and once with ``df_new_keywords_list``
    (both with ``sf_simple``).  One report is printed per category: the
    category name (twice), the number of datasets, and both similarities
    multiplied by 100.

    Only the diagonal of the category-by-category comparison was ever
    printed, so only the diagonal is computed; the printed output is the same
    as a full pairwise sweep filtered on ``name1 == name2``.
    """
    for name, dlist in similarity.get_category_dict().items():
        old_similarity = group_group_similarity(dlist, dlist, df_old_keywords_list,
                                                sf_simple)
        new_similarity = group_group_similarity(dlist, dlist, df_new_keywords_list,
                                                sf_simple)
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement and as the Python 3 print function.
        print("%s\t%s\n\t\t%s\t(old: %%%4.2f new: %%%4.2f)" % (
            name, name, len(dlist), old_similarity * 100, new_similarity * 100))
Exemplo n.º 5
0
def minmax_similarity_items():
    """Print the min and max within-category similarity for each option set.

    Two option sets are reported, keyed ``"new"`` and ``"old"``.  For each
    one the key is printed followed by two blank lines, then for every
    category in ``similarity.get_category_dict()`` the category's dataset
    list is compared with itself via ``group_similarity`` using ``max`` and
    ``min`` as the third argument.  Each category report shows the list
    length (twice), the category name (twice) and the two values.

    Only same-category pairs were ever printed, so only those are computed;
    the printed output matches a full pairwise sweep filtered on
    ``name1 == name2``.
    """
    option_dict = {"new": [df_new_keywords_list, sf_simple], "old": [df_old_keywords_list, sf_simple]}
    category_dict = similarity.get_category_dict()
    for key, options in option_dict.items():
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement and as the Python 3 print function;
        # print("") emits the blank line a bare `print` statement produced.
        print(key)
        print("")
        print("")
        for name, dlist in category_dict.items():
            max_val = group_similarity(dlist, dlist, max, *options)
            min_val = group_similarity(dlist, dlist, min, *options)
            print("%s\t%s" % (len(dlist), len(dlist)))
            # The original format contained "\m", an unrecognised escape that
            # printed a literal backslash; it is now a tab, matching the
            # "\n\t\t" layout of the sibling reports.
            # NOTE(review): the values are printed after a literal '%' without
            # scaling, whereas similarity_list_groups multiplies by 100 --
            # confirm which is intended.
            print("%s\t%s\n\t\tmin: %%%4.2f max: %%%4.2f\n" % (name, name, min_val, max_val))
Exemplo n.º 6
0
def similarity_items():
    """Report datasets that score higher against a foreign category than their own.

    For each option set (``"new"`` and ``"old"``) every dataset from
    ``iotools.load_datasets_dict()`` is scored with ``item_group_similarity``
    against its own category (``dataset['category']``) and against every
    other category in ``similarity.get_category_dict()``.  When the best
    foreign score is strictly greater than the own-category score, the
    dataset name is recorded and a report line is printed.

    Finally two numbers are printed: how many dataset names were flagged
    under both option sets, and how many were flagged under at least one.

    Raises:
        KeyError: if a dataset's category is missing from the category dict.
    """
    option_dict = {"new": [df_new_keywords_list, sf_simple], "old": [df_old_keywords_list, sf_simple]}
    category_dict = similarity.get_category_dict()
    # Loaded once: the dataset dict does not depend on the option set.
    datasets = iotools.load_datasets_dict()
    error = {}
    for key, options in option_dict.items():
        error[key] = []
        for name, dataset in datasets.items():
            cat = dataset['category']
            self_sim = item_group_similarity(dataset, category_dict[cat], *options)
            rivals = [(item_group_similarity(dataset, dlist, *options), other_cat)
                      for other_cat, dlist in category_dict.items() if other_cat != cat]
            if not rivals:
                # Only one category exists, so there is nothing to compare
                # against (max() of an empty list would raise ValueError).
                continue
            out_sim, cat2 = max(rivals, key=lambda pair: pair[0])

            if self_sim < out_sim:
                error[key].append(name)
                # NOTE(review): the scores are printed after a literal '%'
                # without scaling, whereas categories_conflicts multiplies the
                # same kind of score by 100 -- confirm which is intended.
                print("%s\t\t(%s)\n\t\tself: %%%4.2f out: %%%4.2f\t\t(%s)\n" % (
                    name, cat, self_sim, out_sim, cat2))
    shared = len(set(error['new']).intersection(error['old']))
    # Size of the union of both error lists (each name counted once).
    distinct = len(error['new']) + len(error['old']) - shared
    print("%s %s" % (shared, distinct))
Exemplo n.º 7
0
def get_dataset_compatibility(keyword_list, use_numbering_for_key=False, new_keywords=True):
    """Score a keyword list against every category.

    For each category in ``similarity.get_category_dict()`` the keywords of
    its datasets are collected and passed, together with ``keyword_list``,
    to ``item_group_similarity`` (with ``df_simple_list`` and ``sf_simple``).

    Args:
        keyword_list: the keywords to score against each category.
        use_numbering_for_key: when true, the result is keyed by each
            category's position in ``similarity.get_categories()`` instead of
            by category name.
        new_keywords: when true, a dataset's keywords are the ``'all'`` entry
            of ``iotools.load_dataset_keywords_dict(dataset['name'])``;
            otherwise ``dataset['keywords']`` is used.

    Returns:
        dict mapping category name (or category number) to its similarity
        value.
    """
    metric_args = (df_simple_list, sf_simple)

    def keywords_of(dataset):
        # Pick the keyword source selected by `new_keywords`.
        if new_keywords:
            return iotools.load_dataset_keywords_dict(dataset['name'])['all']
        return dataset['keywords']

    scores = {}
    for category, members in similarity.get_category_dict().items():
        member_keywords = [keywords_of(member) for member in members]
        scores[category] = item_group_similarity(keyword_list, member_keywords, *metric_args)

    if not use_numbering_for_key:
        return scores

    return dict(
        (number, scores[category])
        for number, category in enumerate(similarity.get_categories())
    )
Exemplo n.º 8
0
        print dataset_format
        print '---'
        print formats
        print '---'
        print text
        print '---'
        print

    if len(formats-dataset_format) > 0:
        return False, "'%s' format(s) might mentions in long description, but not in formats (%s)" % (
            ", ".join(formats-dataset_format), ", ".join(dataset_format))

    return True, None


# Category dict, fetched once at import time and read by
# categories_conflicts() on every call.
category_dict = get_category_dict()


def categories_conflicts(dataset):
    """Check whether a dataset fits another category better than its own.

    The dataset is scored with ``item_group_similarity`` (using
    ``df_new_keywords_list`` and ``sf_simple``) against every category in the
    module-level ``category_dict``.  A conflict is reported when some other
    category scores strictly higher than the dataset's own category
    (``dataset['category']``) and the own category holds more than one
    dataset.

    A tie between the own category and another one is not a conflict.
    Comparing only the name of the top-scoring category would report ties
    depending on dict iteration order, with a message claiming "more
    similar" for equal scores.

    Args:
        dataset: mapping with at least a ``'category'`` key.

    Returns:
        ``(True, None)`` when there is no conflict, otherwise
        ``(False, message)`` with a human-readable explanation.

    Raises:
        KeyError: if the dataset's category is not in ``category_dict``.
    """
    current_cat = dataset['category']
    options = [df_new_keywords_list, sf_simple]
    similarity_dict = dict(
        (cat, item_group_similarity(dataset, dlist, *options))
        for cat, dlist in category_dict.items()
    )

    own_similarity = similarity_dict[current_cat]
    best_cat, best_similarity = max(similarity_dict.items(), key=lambda x: x[1])

    # Strictly greater implies best_cat != current_cat.
    if best_similarity > own_similarity and len(category_dict[current_cat]) > 1:
        return False, "Dataset is more similar to '%s' category (%4.2f%%) than its own category, '%s' (%4.2f%%)" % (
            best_cat, best_similarity * 100, current_cat, own_similarity * 100)
    return True, None
Exemplo n.º 9
0
def get_all_datasets():
    """Render 'all-datasets.html' with the category dict as ``dataset_dict``."""
    return render_template(
        'all-datasets.html',
        dataset_dict=similarity.get_category_dict(),
    )
Exemplo n.º 10
0
def get_all_datasets_with_keywords():
    """Render 'datasets-with-keywords.html' with the category dict and keywords.

    The template receives ``dataset_dict`` from
    ``similarity.get_category_dict()`` and ``keywords`` from
    ``iotools.load_dataset_keywords_dict()``.
    """
    return render_template(
        'datasets-with-keywords.html',
        dataset_dict=similarity.get_category_dict(),
        keywords=iotools.load_dataset_keywords_dict(),
    )