def plot_tradeoffs(db, allres, allstats, code_size, recall_rank):
    """Plot accuracy vs. speed curves for all methods in a code-size range.

    Methods that contribute at least one Pareto-optimal operating point
    (as decided by ``extract_pareto_optimal``) are drawn with
    ``plot_subset``; the remaining ones are drawn as thin gray lines and
    listed under "omitted:" in the x-axis label.

    Args:
        db: database name, only used in the plot title.
        allres: mapping method name -> result array. Entries whose
            ``ndim`` is not 2 are ignored. Column ``recall_idx`` holds the
            accuracy measure and column ``len(ranks)`` holds the time
            (plotted as ``1000 / time``; presumably ms per query --
            TODO confirm against the producer of ``allres``).
        allstats: mapping method name -> stats dict. The first entry must
            provide ``"d"``, ``"n_threads"`` and ``"ranks"``.
        code_size: ``0`` for any code size, a non-zero int for an exact
            code size, or a ``(low, high)`` tuple for an inclusive range.
        recall_rank: rank to plot; must be present in ``stats["ranks"]``.

    Returns:
        ``(selected_methods, not_selected)``. ``not_selected`` is ordered
        from lowest to highest topline accuracy.

    Raises:
        TypeError: if ``code_size`` is neither an int nor a tuple.
        ValueError: if ``recall_rank`` is not in the stats' ranks.
    """
    stat0 = next(iter(allstats.values()))
    d = stat0["d"]
    n_threads = stat0["n_threads"]
    recall_idx = stat0["ranks"].index(recall_rank)
    # times come after the perf measure
    times_idx = len(stat0["ranks"])

    # Normalize code_size to an inclusive [size_lo, size_hi] interval.
    if isinstance(code_size, int):
        if code_size == 0:
            size_lo, size_hi = 0, 1e50
            code_size_name = "any code size"
        else:
            size_lo = size_hi = code_size
            code_size_name = "code_size=%d" % code_size
    elif isinstance(code_size, tuple):
        code_size_name = "code_size in [%d, %d]" % code_size
        size_lo, size_hi = code_size
    else:
        # An assert would be stripped under -O and surface later as an
        # unrelated NameError, so fail explicitly instead.
        raise TypeError(
            "code_size must be an int or a (low, high) tuple, got %r"
            % (code_size,))

    names_maxperf = []
    for k in sorted(allres):
        v = allres[k]
        if v.ndim != 2:
            continue
        if not size_lo <= unitsize(d, k) <= size_hi:
            continue
        # the last row is used as the method's topline accuracy
        names_maxperf.append((v[-1, recall_idx], k))

    # sort from lowest to highest topline accuracy
    names_maxperf.sort()
    names = [name for _, name in names_maxperf]

    selected_methods, _ = extract_pareto_optimal(
        allres, names, recall_idx, times_idx)

    # Derive the complement from `names` (not from a set difference) so the
    # printed list, the axis label and the return value are deterministic.
    selected = set(selected_methods)
    not_selected = [name for name in names if name not in selected]

    print("methods without an optimal OP: ", not_selected)

    pyplot.title('database ' + db + ' ' + code_size_name)

    # grayed out lines; every name here already passed the ndim and
    # code-size filters above, so no re-check is needed
    for k in not_selected:
        v = allres[k]
        if 'PQ' in k:
            linestyle = ':'
        elif 'SQ4' in k:
            linestyle = '-.'
        elif 'SQ8' in k:
            linestyle = '--'
        else:
            linestyle = '-'
        pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx],
                        label=None,
                        linestyle=linestyle,
                        marker='o' if 'HNSW' in k else '+',
                        color='#cccccc', linewidth=0.2)

    plot_subset(allres, allstats, selected_methods, recall_idx, times_idx)

    om = _format_omitted(not_selected)

    # overrides the x/y labels set by plot_subset
    pyplot.xlabel('1-recall at %d %s' % (recall_rank, om))
    pyplot.ylabel('QPS (%d threads)' % n_threads)
    pyplot.legend()
    pyplot.grid()
    return selected_methods, not_selected


def _format_omitted(not_selected, max_width=80):
    """Build the wrapped "omitted: ..." suffix appended to the x-axis label.

    Returns an empty string when nothing was omitted. A line break is
    inserted before a name once the current line exceeds ``max_width``
    characters.
    """
    if not not_selected:
        return ''
    parts = ['\nomitted:']
    nc = len(parts[0])
    for m in not_selected:
        if nc > max_width:
            parts.append('\n')
            nc = 0
        parts.append(' ' + m)
        nc += len(m) + 1
    return ''.join(parts)
def plot_subset(allres, allstats, selected_methods, recall_idx, times_idx=3,
                report=("overhead", "build time")):
    """Plot one accuracy vs. speed curve per selected method.

    Each curve is labelled with the method name plus the optional pieces
    of information requested in ``report``. The axis labels are taken from
    the stats of the last plotted method.

    Args:
        allres: mapping method name -> 2-D result array; column
            ``recall_idx`` is plotted on x and ``1000 / column times_idx``
            on a log-scaled y axis.
        allstats: mapping method name -> stats dict. Reads ``"d"``,
            ``"nb"``, ``"ranks"``, ``"measure"``, ``"n_threads"`` and,
            when present, ``"index_size"``, ``"tables_size"`` and
            ``"add_time"`` (presumably seconds -- TODO confirm).
        selected_methods: iterable of method names to plot.
        recall_idx: column of the accuracy measure in the result arrays.
        times_idx: column of the timings in the result arrays.
        report: container of label parts to include; recognized values
            are ``"code_size"``, ``"overhead"`` and ``"build time"``.

    Raises:
        ValueError: if the stats' ``"measure"`` is neither ``"recall"``
            nor ``"inter"``.
    """
    id_size = 8  # 64 bit
    stats = None

    # important methods
    for k in selected_methods:
        v = allres[k]
        stats = allstats[k]
        d = stats["d"]
        dbsize = stats["nb"]

        if "index_size" in stats and "tables_size" in stats:
            tot_size = stats['index_size'] + stats['tables_size']
        else:
            tot_size = -1  # don't know what the index size is

        addt = ''
        if 'add_time' in stats:
            add_time = stats['add_time']
            whole_sec = int(add_time)
            if add_time > 7200:
                # long builds are reported as hours + minutes
                hours, minutes = divmod(whole_sec // 60, 60)
                addt = ', %dh%02d' % (hours, minutes)
            else:
                minutes, seconds = divmod(whole_sec, 60)
                addt = ', %dm%02d' % (minutes, seconds)

        code_size = unitsize(d, k)

        label = k
        if "code_size" in report:
            label += " %d bytes" % code_size

        # size of an index storing nothing but the codes and the ids
        tight_size = (code_size + id_size) * dbsize

        if tot_size >= 0 and "overhead" in report:
            if tot_size > 10 * tight_size:
                label += " overhead x%.1f" % (tot_size / tight_size)
            else:
                label += " overhead+%.1f%%" % (
                    tot_size / tight_size * 100 - 100)

        if "build time" in report:
            label += " " + addt

        if 'Refine' in k or 'RFlat' in k:
            linestyle = ':'
        elif 'SQ' in k:
            linestyle = '-.'
        else:
            linestyle = '-'

        print(k, linestyle)
        pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx],
                        label=label,
                        linestyle=linestyle,
                        marker='o' if '4fs' in k else '+')

    if stats is None:
        # Nothing was plotted, so there are no stats to label the axes
        # with (previously this raised UnboundLocalError).
        return

    recall_rank = stats["ranks"][recall_idx]
    measure = stats["measure"]
    if measure == "recall":
        pyplot.xlabel('1-recall at %d' % recall_rank)
    elif measure == "inter":
        pyplot.xlabel('inter @ %d' % recall_rank)
    else:
        raise ValueError("unknown measure %r" % (measure,))
    pyplot.ylabel('QPS (%d threads)' % stats["n_threads"])