def calculate_and_write_edge_weightings_for_synsets(synset_filenames_dict, file_name):
  max_co_occurrence = calculate_max_co_occurrence(synset_filenames_dict)
  edge_weightings_for_synsets = dict()
  how_many_added = 0
  how_many_done = 0
  # Number of unordered synset pairs that will actually be processed below.
  how_many_to_do = len(synset_filenames_dict) * (len(synset_filenames_dict) - 1) / 2
  # Truncate the output file; the batches below are appended to it.
  write_edge_weightings_to_file(dict(), file_name)
  for synset1, filenames1 in synset_filenames_dict.iteritems():
    for synset2, filenames2 in synset_filenames_dict.iteritems():
      if synset1 < synset2:
        how_many_done += 1
        # if (synset1.name, synset2.name) not in similarity_histogram:
        similarity = wn.synset(synset1).lch_similarity(wn.synset(synset2))
        co_occurrence = len(set(filenames1).intersection(set(filenames2)))
        # Use float division, otherwise the normalization truncates to 0 or 1 in Python 2.
        normalized_co_occurrence = float(co_occurrence) / max_co_occurrence
        if similarity < 2.0:
          similarity = 0
        if normalized_co_occurrence < 0.4:
          normalized_co_occurrence = 0
        edge_weighting = similarity + 4 * normalized_co_occurrence
        if edge_weighting != 0:
          edge_weightings_for_synsets[(synset1, synset2)] = edge_weighting
          how_many_added += 1
        # Flush to file in batches so the dict does not grow without bound.
        if how_many_added > 1000:
          print_status("Done with " + str(how_many_done) + " of " + str(how_many_to_do) + "\n")
          write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=True)
          edge_weightings_for_synsets = dict()
          how_many_added = 0
  write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=True)
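# calculate_max_co_occurrence() is used above but not part of this excerpt. A minimal
# sketch of what it presumably does -- find the largest number of files shared by any
# pair of synsets -- could look like this (an assumption, not the original helper):
def calculate_max_co_occurrence(synset_filenames_dict):
  max_co_occurrence = 0
  for synset1, filenames1 in synset_filenames_dict.iteritems():
    for synset2, filenames2 in synset_filenames_dict.iteritems():
      if synset1 < synset2:
        co_occurrence = len(set(filenames1).intersection(set(filenames2)))
        max_co_occurrence = max(max_co_occurrence, co_occurrence)
  return max_co_occurrence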
def keyword_clustering_via_mcl(synset_filenames_dict):
  config = ConfigParser.SafeConfigParser()
  config.read('../config.cfg')
  edge_weightings_filename = config.get('Filenames for Pickles', 'edge_weightings_filename')
  calculate_and_write_edge_weightings_for_synsets(synset_filenames_dict, edge_weightings_filename)
  print_status("Done calculating edge weightings and writing them to file.\n")
  print_status("Starting MCL clustering...")
  mcl_clustering(edge_weightings_filename)
  print "Done."
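# Example call (illustrative only; synset names follow the wn.synset() convention and
# the filenames are hypothetical):
#
#   keyword_clustering_via_mcl({'dog.n.01': ['img_001.jpg', 'img_002.jpg'],
#                               'cat.n.01': ['img_002.jpg', 'img_003.jpg']})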
def write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=False):
  print_status("Writing edge weightings to MCL-readable file.")
  if append_to_file:
    output_file = open(file_name, 'a')
  else:
    output_file = open(file_name, 'w')
  for (synset1, synset2), edge_weighting in edge_weightings_for_synsets.iteritems():
    output_file.write(str(synset1) + ' ' + str(synset2) + ' ' + str(edge_weighting) + '\n')
  output_file.close()
  print "Done."
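# The file written above is the label-based "ABC" graph format that the mcl program can
# read (one "node node weight" edge per line), e.g. with illustrative values:
#
#   dog.n.01 cat.n.01 2.9444
#   dog.n.01 puppy.n.01 6.3373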
def calculate_average_distance(parsed_result_tree, image_id_tuples, id_type, verbose=False):
  distances = []
  for image_id_1, image_id_2 in image_id_tuples:
    if verbose:
      print_status("Checking for ids %s and %s (%s)... " % (image_id_1, image_id_2, id_type))
    distance = find_closest_match_to_nodes(parsed_result_tree, image_id_1, image_id_2)
    if distance != float('inf'):
      distances.append(distance)
      if verbose:
        sys.stdout.write("distance is: %s\n" % distance)
    else:
      if verbose:
        sys.stdout.write("one image could not be found!\n")
  # Note: raises ZeroDivisionError if none of the id tuples could be located in the tree.
  return float(sum(distances)) / len(distances)
def main(args):
  print_status("Checking images against testset:\n")

  # Loading preprocessed features on startup
  print_status("Loading visual_features from file... ")
  visual_features = general_helpers.load_visual_features()
  print "Done."
  print_status("Loading cluster_for_synsets from mcl_clusters file... ")
  cluster_for_synsets = general_helpers.load_cluster_for_synsets()
  print "Done."
  print_status("Loading keywords_for_pictures from file... ")
  keywords_for_pictures = general_helpers.load_keywords_for_pictures()
  print "Done."
  print_status("Loading cluster_representatives from file... ")
  cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
  print "Done loading preprocessed data."

  print_status("Checking images against testset:\n")
  print_status("Retrieving clusters... \n")
  # image_tree = get_searchtrees_with_filenames("food", use_meronyms=False, minimal_node_size=1)
  image_tree = pipeline.get_clusters("food", use_meronyms=False,
                                     visual_clustering_threshold=10000,
                                     mcl_clustering_threshold=15,
                                     minimal_mcl_cluster_size=10,
                                     minimal_node_size=15,
                                     visual_features=visual_features,
                                     cluster_for_synsets=cluster_for_synsets,
                                     keywords_for_pictures=keywords_for_pictures,
                                     cluster_representatives=cluster_representatives)

  sys.stdout.write("Collecting images from tree... \n")
  result_ids = recursively_collect_images(image_tree)

  sys.stdout.write("Loading testset from database... \n")
  testset_positive_ids, testset_negative_ids = retrieveTestsetResults(args.database_file)

  sys.stdout.write("Comparing result images to testset... \n")
  result_size = len(result_ids)
  testset_positive_size = len(testset_positive_ids)
  testset_negative_size = len(testset_negative_ids)

  true_positives = 0
  false_positives = 0
  for result_id in result_ids:
    if result_id in testset_positive_ids:
      true_positives += 1
      testset_positive_ids.remove(result_id)
    if result_id in testset_negative_ids:
      false_positives += 1
      testset_negative_ids.remove(result_id)
  false_negatives = len(testset_positive_ids)

  precision = float(true_positives) / (true_positives + false_positives)
  recall = float(true_positives) / (true_positives + false_negatives)

  sys.stdout.write("Done:\n\n")
  sys.stdout.write("Testset size: %d\n\n" % (testset_positive_size + testset_negative_size))
  sys.stdout.write("Result size: %d\n" % result_size)
  sys.stdout.write("Real positives: %d\n\n" % testset_positive_size)
  sys.stdout.write("True Positives: %d\n" % true_positives)
  sys.stdout.write("True Negatives: ???\n")
  sys.stdout.write("False Positives: %d\n" % false_positives)
  sys.stdout.write("False Negatives: %d\n\n" % false_negatives)
  sys.stdout.write("Precision: %f (tp / (tp + fp))\n" % precision)
  sys.stdout.write("Recall: %f (tp / (tp + fn))\n" % recall)
  sys.stdout.write("F-Measure: %f (2 * (p * r / (p + r)))\n" %
                   (2 * (precision * recall) / (precision + recall)))
def main(args):
  # Loading preprocessed features on startup
  print_status("Loading visual_features from file... ")
  visual_features = general_helpers.load_visual_features()
  print "Done."
  print_status("Loading cluster_for_synsets from mcl_clusters file... ")
  cluster_for_synsets = general_helpers.load_cluster_for_synsets()
  print "Done."
  print_status("Loading keywords_for_pictures from file... ")
  keywords_for_pictures = general_helpers.load_keywords_for_pictures()
  print "Done."
  print_status("Loading cluster_representatives from file... ")
  cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
  print "Done loading preprocessed data."

  print_status("Checking images against testset:\n")
  print_status("Retrieving clusters... \n")
  pipeline_result = pipeline.get_clusters("food", use_meronyms=False,
                                          visual_clustering_threshold=100000,
                                          mcl_clustering_threshold=10,
                                          minimal_mcl_cluster_size=1,
                                          minimal_node_size=10,
                                          visual_features=visual_features,
                                          cluster_for_synsets=cluster_for_synsets,
                                          keywords_for_pictures=keywords_for_pictures,
                                          cluster_representatives=cluster_representatives)
  # pipeline_result = pickle.load(open('image_tree.pickle', 'r'))

  print_status("Parsing result tree into a more accessible format...")
  parsed_result_tree = parse_result_tree(pipeline_result)

  print_status("Loading testset from database... \n")
  same_object_ids, same_object_same_context_ids, not_similar_ids = retrieveTestsetResults(args.database_file)

  print_status("Comparing result images to testset... \n")
  average_same_object_distance = calculate_average_distance(parsed_result_tree, same_object_ids,
                                                            "same object", verbose=True)
  average_same_context_distance = calculate_average_distance(parsed_result_tree, same_object_same_context_ids,
                                                             "same context", verbose=True)
  average_not_similar_distance = calculate_average_distance(parsed_result_tree, not_similar_ids,
                                                            "not_similar", verbose=True)

  print_status("Done!\n")
  sys.stdout.write("Average distance for same object is %s with closeness %s \n" %
                   (average_same_object_distance, 1.0 / average_same_object_distance))
  sys.stdout.write("Average distance for same context is %s with closeness %s \n" %
                   (average_same_context_distance, 1.0 / average_same_context_distance))
  sys.stdout.write("Average distance for not similar is %s with closeness %s \n" %
                   (average_not_similar_distance, 1.0 / average_not_similar_distance))
  sys.stdout.write("Distance %s \n" %
                   (1.0 / average_same_object_distance - 1.0 / average_not_similar_distance))
def main(args):
  # Loading preprocessed features on startup
  print_status("Loading visual_features from file... ")
  visual_features = general_helpers.load_visual_features()
  print "Done."
  print_status("Loading cluster_for_synsets from mcl_clusters file... ")
  cluster_for_synsets = general_helpers.load_cluster_for_synsets()
  print "Done."
  print_status("Loading keywords_for_pictures from file... ")
  keywords_for_pictures = general_helpers.load_keywords_for_pictures()
  print "Done."
  print_status("Loading cluster_representatives from file... ")
  cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
  print "Done loading preprocessed data."

  print_status("Checking images against testset:\n")
  print_status("Retrieving clusters... \n")
  pipeline_result = pipeline.get_clusters("food", use_meronyms=False,
                                          visual_clustering_threshold=100000,
                                          mcl_clustering_threshold=4,
                                          minimal_mcl_cluster_size=6,
                                          minimal_node_size=4,
                                          visual_features=visual_features,
                                          cluster_for_synsets=cluster_for_synsets,
                                          keywords_for_pictures=keywords_for_pictures,
                                          cluster_representatives=cluster_representatives)
  # # Comment in to load preprocessed pipeline_result for dev mode
  # pipeline_result = pickle.load(open('image_tree.pickle', 'r'))

  annotated_food_dict = json.load(open(args.food_id_file, 'r'))

  print_status("Flattening result tree... \n")
  flattened_mcl_tree = flatten_result_tree(pipeline_result, annotated_food_dict, size_from_id=0, size_to_id=-1)
  image_counter = len(flattened_mcl_tree.subclusters[0]['subcluster'])

  print_status("Loading visual_features from file... \n")
  visual_features = general_helpers.load_visual_features()

  true_positives_total = []
  false_negatives_total = []
  true_negatives_total = []
  false_positives_total = []

  for i in range(0, 10):
    print_status("Calculating visual clusters (%d x)... \n" % i)
    visually_clustered_result = combined_clustering.cluster_visually(copy.deepcopy(flattened_mcl_tree),
                                                                     visual_clustering_threshold=4,
                                                                     visual_features=visual_features)

    print_status("Converting visual clusters to a simpler structure... \n")
    visual_clusters = []
    for visual_cluster in visually_clustered_result.subclusters[0]['subcluster']:
      visual_clusters.append(set([image_tuple[0].split('\\')[-1].split('.')[0]
                                  for image_tuple in visual_cluster]))
    print_status("Done clustering %d images into %d visual clusters. \n" % (image_counter, len(visual_clusters)))

    # # Comment in to load preprocessed visual_clusters for dev mode
    # visual_clusters = pickle.load(open('visual_clusters.pickle', 'r'))

    print_status("Loading testset from database... \n")
    visually_similar_tuples, visually_different_tuples = retrieveTestsetResults(args.database_file)

    print_status("Comparing clusters to testset... \n")
    true_negatives = 0
    false_positives = 0
    true_positives = 0
    false_negatives = 0

    print_status("Starting with visually similar tuples... \n")
    for id_tuple in visually_similar_tuples:
      if both_ids_are_found(id_tuple, visual_clusters):
        if one_cluster_contains_both_ids(id_tuple, visual_clusters):
          true_negatives += 1
        else:
          false_positives += 1

    print_status("Now checking different image tuples... \n")
    for id_tuple in visually_different_tuples:
      if both_ids_are_found(id_tuple, visual_clusters):
        if one_cluster_contains_both_ids(id_tuple, visual_clusters):
          false_negatives += 1
        else:
          true_positives += 1

    true_positives_total.append(true_positives)
    false_negatives_total.append(false_negatives)
    true_negatives_total.append(true_negatives)
    false_positives_total.append(false_positives)

  average_true_positives = float(sum(true_positives_total)) / len(true_positives_total)
  average_false_negatives = float(sum(false_negatives_total)) / len(false_negatives_total)
  average_true_negatives = float(sum(true_negatives_total)) / len(true_negatives_total)
  average_false_positives = float(sum(false_positives_total)) / len(false_positives_total)

  precision = average_true_positives / (average_true_positives + average_false_positives)
  recall = average_true_positives / (average_true_positives + average_false_negatives)

  print_status("Done!\n\n")
  sys.stdout.write("Testset contains %5d visually similar image tuples \n" % len(visually_similar_tuples))
  sys.stdout.write("And there are %5d visually different image tuples \n\n" % len(visually_different_tuples))
  sys.stdout.write("Average true positives: %f \n" % average_true_positives)
  sys.stdout.write("Average false negatives: %f \n" % average_false_negatives)
  sys.stdout.write("Average true negatives: %f \n" % average_true_negatives)
  sys.stdout.write("Average false positives: %f \n\n" % average_false_positives)
  sys.stdout.write("Precision: %f (tp / (tp + fp))\n" % precision)
  sys.stdout.write("Recall: %f (tp / (tp + fn))\n" % recall)
  sys.stdout.write("F-Measure: %f (2 * (p * r / (p + r)))\n" %
                   (2 * (precision * recall) / (precision + recall)))
import sys
import string

from flask import Flask, url_for, Response
from flask import render_template
from flask_assets import Environment, Bundle

# import own modules
from clustering.pipeline import get_clusters
from helpers.general_helpers import print_status, load_visual_features, load_cluster_for_synsets, load_keywords_for_pictures
from helpers.general_helpers import load_cluster_representatives

app = Flask(__name__)
assets = Environment(app)

# Loading preprocessed features on startup
print_status("Loading cluster_for_synsets from mcl_clusters file... ")
cluster_for_synsets = load_cluster_for_synsets()
print "Done."
print_status("Loading keywords_for_pictures from file... ")
keywords_for_pictures = load_keywords_for_pictures()
print "Done."
print_status("Loading cluster_representatives from file... ")
cluster_representatives = load_cluster_representatives(how_many_per_cluster=6)
print "Done.\n\n"

bufferedSearches = {}

print_status("Server is ready!\n\n")


@app.route("/")
def hello():
  return render_template('index.html')
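# The excerpt above only defines the index route; a development server could be started
# with Flask's built-in runner, for example (host and port are placeholders):
#
#   if __name__ == "__main__":
#     app.run(host="0.0.0.0", port=5000, debug=True)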