import sys
import ConfigParser
from nltk.corpus import wordnet as wn
from helpers.general_helpers import print_status

def calculate_and_write_edge_weightings_for_synsets(synset_filenames_dict, file_name):
  max_co_occurrence = calculate_max_co_occurrence(synset_filenames_dict)
  edge_weightings_for_synsets = dict()
  how_many_added = 0
  how_many_done = 0
  # only unordered pairs with synset1 < synset2 are visited below, so halve n*(n-1)
  how_many_to_do = len(synset_filenames_dict.keys()) * (len(synset_filenames_dict.keys()) - 1) / 2
  write_edge_weightings_to_file(dict(), file_name)  # truncate any previous output file

  for synset1, filenames1 in synset_filenames_dict.iteritems():
    for synset2, filenames2 in synset_filenames_dict.iteritems():
      if synset1 < synset2:
        how_many_done += 1
        similarity = wn.synset(synset1).lch_similarity(wn.synset(synset2))
        co_occurrence = len(set(filenames1).intersection(set(filenames2)))
        # float division: in Python 2, int / int would truncate to 0 here
        normalized_co_occurrence = float(co_occurrence) / max_co_occurrence
        if similarity < 2.0:
          similarity = 0
        if normalized_co_occurrence < 0.4:
          normalized_co_occurrence = 0
        edge_weighting = similarity + 4 * normalized_co_occurrence
        if edge_weighting != 0:
          edge_weightings_for_synsets[(synset1, synset2)] = edge_weighting
          how_many_added += 1
        if how_many_added > 1000:
          print_status("Done with " + str(how_many_done) + " of " + str(how_many_to_do) + "\n")
          write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=True)
          edge_weightings_for_synsets = dict()
          how_many_added = 0
  write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=True)
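
# Hedged sketch: calculate_max_co_occurrence is called above but not shown in
# this listing. Assuming it returns the largest number of files shared by any
# synset pair, a minimal implementation could look like this:
def calculate_max_co_occurrence(synset_filenames_dict):
  max_co_occurrence = 1  # start at 1 so fully disjoint data cannot cause division by zero
  for synset1, filenames1 in synset_filenames_dict.iteritems():
    for synset2, filenames2 in synset_filenames_dict.iteritems():
      if synset1 < synset2:
        co_occurrence = len(set(filenames1).intersection(set(filenames2)))
        if co_occurrence > max_co_occurrence:
          max_co_occurrence = co_occurrence
  return max_co_occurrence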
def keyword_clustering_via_mcl(synset_filenames_dict):
  config = ConfigParser.SafeConfigParser()
  config.read('../config.cfg')
  edge_weightings_filename = config.get('Filenames for Pickles', 'edge_weightings_filename')

  calculate_and_write_edge_weightings_for_synsets(synset_filenames_dict, edge_weightings_filename)
  print_status("Done calculation of edge weightings and writing to file.\n")

  print_status("Start MCL Clustering...")
  mcl_clustering(edge_weightings_filename)
  print "Done."
def write_edge_weightings_to_file(edge_weightings_for_synsets, file_name, append_to_file=False):
  print_status("Writing edge weightings to mcl readable file.")
  mode = 'a' if append_to_file else 'w'
  with open(file_name, mode) as output_file:
    for (synset1, synset2), edge_weighting in edge_weightings_for_synsets.iteritems():
      output_file.write(str(synset1) + ' ' + str(synset2) + ' ' + str(edge_weighting) + '\n')
  print "Done."
def calculate_average_distance(parsed_result_tree, image_id_tuples, id_type, verbose=False):
  distances = []
  for image_id_1, image_id_2 in image_id_tuples:
    if verbose: print_status("Checking for ids %s and %s (%s)... " % (image_id_1, image_id_2, id_type))
    distance = find_closest_match_to_nodes(parsed_result_tree, image_id_1, image_id_2)
    if distance != float('inf'):
      distances.append(distance)
      if verbose: sys.stdout.write("distance is: %s\n" % distance)
    else:
      if verbose: sys.stdout.write("one image could not be found!\n")
  if not distances:
    return float('inf')  # nothing matched; avoid dividing by zero below
  return float(sum(distances)) / len(distances)
def main(args):
    print_status("Checking images against testset:\n")

    print_status("Loading visual_features from file... ")
    visual_features = general_helpers.load_visual_features()
    print "Done."
    print_status("Loading cluster_for_synsets from mcl_clusters file... ")
    cluster_for_synsets = general_helpers.load_cluster_for_synsets()
    print "Done."
    print_status("Loading keywords_for_pictures from file... ")
    keywords_for_pictures = general_helpers.load_keywords_for_pictures()
    print "Done."
    print_status("Loading cluster_representatives from file... ")
    cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
    print "Done loading preprocessed data."

    print_status("Checking images against testset:\n")
    print_status("Retrieving clusters... \n")
    # image_tree = get_searchtrees_with_filenames("food", use_meronyms=False, minimal_node_size=1)
    image_tree = pipeline.get_clusters(
        "food",
        use_meronyms=False,
        visual_clustering_threshold=10000,
        mcl_clustering_threshold=15,
        minimal_mcl_cluster_size=10,
        minimal_node_size=15,
        visual_features=visual_features,
        cluster_for_synsets=cluster_for_synsets,
        keywords_for_pictures=keywords_for_pictures,
        cluster_representatives=cluster_representatives,
    )

    sys.stdout.write("Collecting images from tree... \n")
    result_ids = recursively_collect_images(image_tree)

    sys.stdout.write("Loading testset from database... \n")
    testset_positive_ids, testset_negative_ids = retrieveTestsetResults(args.database_file)

    sys.stdout.write("Comparing result images to testset... \n")

    result_size = len(result_ids)
    testset_positive_size = len(testset_positive_ids)
    testset_negative_size = len(testset_negative_ids)

    true_positives = 0
    false_positives = 0

    for result_id in result_ids:
        if result_id in testset_positive_ids:
            true_positives += 1
            testset_positive_ids.remove(result_id)
        if result_id in testset_negative_ids:
            false_positives += 1
            testset_negative_ids.remove(result_id)

    false_negatives = len(testset_positive_ids)

    precision = float(true_positives) / (true_positives + false_positives)
    recall = float(true_positives) / (true_positives + false_negatives)

    sys.stdout.write("Done:\n\n")

    sys.stdout.write("Testset size:    %d\n\n" % (testset_positive_size + testset_negative_size))
    sys.stdout.write("Result size:     %d\n" % result_size)
    sys.stdout.write("Real positives:  %d\n\n" % testset_positive_size)
    sys.stdout.write("True Positives:  %d\n" % true_positives)
    sys.stdout.write("True Negatives:  ???\n")
    sys.stdout.write("False Positives: %d\n" % false_positives)
    sys.stdout.write("False Negatives: %d\n\n" % false_negatives)
    sys.stdout.write("Precision:       %f (tp / (tp + fp))\n" % precision)
    sys.stdout.write("Recall:          %f (tp / (tp + fn))\n" % recall)
    sys.stdout.write(
        "F-Measure:       %f (2 * (p * r / (p + r)))\n"
        % (2 * (float(precision) * float(recall)) / (precision + recall))
    )
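
# Note: the precision/recall/F-measure expressions above raise a
# ZeroDivisionError when nothing is retrieved. A hedged guard (hypothetical
# helper, not part of the original script):
def safe_ratio(numerator, denominator):
  return float(numerator) / denominator if denominator else 0.0
# e.g.: precision = safe_ratio(true_positives, true_positives + false_positives)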
def main(args):
  # Loading preprocessed features on startup
  print_status("Loading visual_features from file... ")
  visual_features = general_helpers.load_visual_features()
  print "Done."
  print_status("Loading cluster_for_synsets from mcl_clusters file... ")
  cluster_for_synsets = general_helpers.load_cluster_for_synsets()
  print "Done."
  print_status("Loading keywords_for_pictures from file... ")
  keywords_for_pictures = general_helpers.load_keywords_for_pictures()
  print "Done."
  print_status("Loading cluster_representatives from file... ")
  cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
  print "Done loading preprocessed data."

  print_status("Checking images against testset:\n")
  print_status("Retrieving clusters... \n")
  pipeline_result = pipeline.get_clusters("food", use_meronyms=False,
                                     visual_clustering_threshold=100000,
                                     mcl_clustering_threshold=10,
                                     minimal_mcl_cluster_size=1,
                                     minimal_node_size=10,
                                     visual_features=visual_features,
                                     cluster_for_synsets=cluster_for_synsets,
                                     keywords_for_pictures=keywords_for_pictures,
                                     cluster_representatives=cluster_representatives)
  # pipeline_result = pickle.load(open('image_tree.pickle', 'r'))

  print_status("Parsing result tree to easier accessible format...")
  parsed_result_tree = parse_result_tree(pipeline_result)

  print_status("Loading testset from database... \n")
  same_object_ids, same_object_same_context_ids, not_similar_ids = retrieveTestsetResults(args.database_file)


  print_status("Comparing result images to testset... \n")

  average_same_object_distance  = calculate_average_distance(parsed_result_tree, same_object_ids, "same object", verbose=True)
  average_same_context_distance = calculate_average_distance(parsed_result_tree, same_object_same_context_ids, "same context", verbose=True)
  average_not_similar_distance  = calculate_average_distance(parsed_result_tree, not_similar_ids, "not similar", verbose=True)

  print_status("Done!\n")
  sys.stdout.write("Average distance for same object  is %s with closeness %s \n" % (average_same_object_distance, float(1)/average_same_object_distance))
  sys.stdout.write("Average distance for same context is %s with closeness %s \n" % (average_same_context_distance, float(1)/average_same_context_distance))
  sys.stdout.write("Average distance for not similar  is %s with closeness %s \n" % (average_not_similar_distance, float(1)/average_not_similar_distance))
  sys.stdout.write("Distance %s \n" % (float(1)/average_same_object_distance - float(1)/average_not_similar_distance))
def main(args):
  # Loading preprocessed features on startup
  print_status("Loading visual_features from file... ")
  visual_features = general_helpers.load_visual_features()
  print "Done."
  print_status("Loading cluster_for_synsets from mcl_clusters file... ")
  cluster_for_synsets = general_helpers.load_cluster_for_synsets()
  print "Done."
  print_status("Loading keywords_for_pictures from file... ")
  keywords_for_pictures = general_helpers.load_keywords_for_pictures()
  print "Done."
  print_status("Loading cluster_representatives from file... ")
  cluster_representatives = general_helpers.load_cluster_representatives(how_many_per_cluster=6)
  print "Done loading preprocessed data."

  print_status("Checking images against testset:\n")
  print_status("Retrieving clusters... \n")
  pipeline_result = pipeline.get_clusters("food", use_meronyms=False,
                                     visual_clustering_threshold=100000,
                                     mcl_clustering_threshold=4,
                                     minimal_mcl_cluster_size=6,
                                     minimal_node_size=4,
                                     visual_features=visual_features,
                                     cluster_for_synsets=cluster_for_synsets,
                                     keywords_for_pictures=keywords_for_pictures,
                                     cluster_representatives=cluster_representatives)


  # # Comment in to load preprocessed pipeline_result for dev mode
  # pipeline_result = pickle.load(open('image_tree.pickle', 'r'))

  annotated_food_dict = json.load(open(args.food_id_file, 'r'))

  print_status("Flattening result tree... \n")
  flattened_mcl_tree = flatten_result_tree(pipeline_result, annotated_food_dict, size_from_id=0, size_to_id=-1)
  image_counter = len(flattened_mcl_tree.subclusters[0]['subcluster'])

  print_status("Loading visual_features from file... \n")
  visual_features = general_helpers.load_visual_features()

  true_positives_total  = []
  false_negatives_total = []
  true_negatives_total  = []
  false_positives_total = []

  for i in range(0, 10):
    print_status("Calculating visual clusters (%d x)... \n" % i)
    visually_clustered_result = combined_clustering.cluster_visually(copy.deepcopy(flattened_mcl_tree),
                                                                     visual_clustering_threshold=4,
                                                                     visual_features=visual_features)
  
    print_status("Convert visual clusters to simpler structure... \n")
    visual_clusters = []
    for visual_cluster in visually_clustered_result.subclusters[0]['subcluster']:
      visual_clusters.append(set([image_tuple[0].split('\\')[-1].split('.')[0] for image_tuple in visual_cluster]))
  
    print_status("Done clustering %d images into %d visual clusters. \n" % (image_counter, len(visual_clusters)))
  
    # # Comment in to load preprocessed visual_clusters for dev mode
    # visual_clusters = pickle.load(open('visual_clusters.pickle', 'r'))
  
    print_status("Loading testset from database... \n")
    visually_similar_tuples, visually_different_tuples = retrieveTestsetResults(args.database_file)
  
    print_status("Comparing clusters to testset... \n")

    true_negatives  = 0
    false_positives = 0
    true_positives  = 0
    false_negatives = 0

    print_status("Starting with visually similar tuples... \n")
    for id_tuple in visually_similar_tuples:
      if both_ids_are_found(id_tuple, visual_clusters):
        if one_cluster_contains_both_ids(id_tuple, visual_clusters):
          true_negatives += 1
        else:
          false_positives += 1
  
    print_status("Now checking different image tuples... \n")
    for id_tuple in visually_different_tuples:
      if both_ids_are_found(id_tuple, visual_clusters):
        if one_cluster_contains_both_ids(id_tuple, visual_clusters):
          false_negatives += 1
        else:
          true_positives += 1

    true_positives_total.append(true_positives)
    false_negatives_total.append(false_negatives)
    true_negatives_total.append(true_negatives)
    false_positives_total.append(false_positives)

  average_true_positives  = float(sum(true_positives_total))  / len(true_positives_total)
  average_false_negatives = float(sum(false_negatives_total)) / len(false_negatives_total)
  average_true_negatives  = float(sum(true_negatives_total))  / len(true_negatives_total)
  average_false_positives = float(sum(false_positives_total)) / len(false_positives_total)

  precision = float(average_true_positives) / (average_true_positives + average_false_positives)
  recall    = float(average_true_positives) / (average_true_positives + average_false_negatives)

  print_status("Done!\n\n")
  sys.stdout.write("Testset contains %5d visually similar   image tuples \n" % len(visually_similar_tuples))
  sys.stdout.write("And there are    %5d visually different image tuples \n\n" % len(visually_different_tuples))

  sys.stdout.write("Average true  positives: %f \n"   % average_true_positives)
  sys.stdout.write("Average false negatives: %f \n"   % average_false_negatives)
  sys.stdout.write("Average true  negatives: %f \n"   % average_true_negatives)
  sys.stdout.write("Average false positives: %f \n\n" % average_false_positives)

  sys.stdout.write("Precision: %f (tp / (tp + fp))\n" % precision)
  sys.stdout.write("Recall:    %f (tp / (tp + fn))\n" % recall)
  sys.stdout.write("F-Measure: %f (2 * (p * r / (p + r)))\n" % (2 * (float(precision) * float(recall)) / (precision + recall)))
import sys
from flask import Flask, url_for, Response, render_template
from flask_assets import Environment, Bundle
import string

# import own modules
from clustering.pipeline import get_clusters
from helpers.general_helpers import print_status, load_visual_features, load_cluster_for_synsets, load_keywords_for_pictures
from helpers.general_helpers import load_cluster_representatives

app = Flask(__name__)
assets = Environment(app)

# Loading preprocessed features on startup
print_status("Loading cluster_for_synsets from mcl_clusters file... ")
cluster_for_synsets = load_cluster_for_synsets()
print "Done."
print_status("Loading keywords_for_pictures from file... ")
keywords_for_pictures = load_keywords_for_pictures()
print "Done."
print_status("Loading cluster_representatives from file... ")
cluster_representatives = load_cluster_representatives(how_many_per_cluster=6)
print "Done.\n\n"
bufferedSearches = {}

print_status("Server is ready!\n\n")

@app.route("/")
def hello():
  return render_template('index.html')
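
# Hedged sketch: a search endpoint wired to the preloaded data and the
# bufferedSearches cache above. The route, its thresholds, and the template
# variable are assumptions for illustration, not part of the original app.
@app.route("/search/<keyword>")
def search(keyword):
  keyword = string.lower(keyword)
  if keyword not in bufferedSearches:
    # the original app may also pass visual_features and other tuning values
    bufferedSearches[keyword] = get_clusters(keyword,
                                             use_meronyms=False,
                                             mcl_clustering_threshold=10,
                                             minimal_node_size=10,
                                             cluster_for_synsets=cluster_for_synsets,
                                             keywords_for_pictures=keywords_for_pictures,
                                             cluster_representatives=cluster_representatives)
  return render_template('index.html', clusters=bufferedSearches[keyword])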