def dissimilarity_obj_fun(prioritized_tcs, tcs_minhashes):
    """Score a prioritization by how distant each test is from earlier ones.

    INPUT
    (list)prioritized_tcs: test case ids; visited from the LAST element to
        the first (the same order the former ``pop()`` loop used)
    (dict)tcs_minhashes: test case id -> signature, as accepted by
        ``lsh.jDistanceEstimate``

    OUTPUT
    (number) sum, over every visited test but the first, of its mean
        distance to the tests visited before it, divided by the number of
        tests. Returns 0.0 for an empty list.

    The input list is left untouched; earlier revisions emptied it as a
    side effect of popping.
    """
    if not prioritized_tcs:
        # Nothing to compare; previously this raised IndexError on pop().
        return 0.0

    # Work on a reversed copy so the caller's ordering survives the call.
    visit_order = list(reversed(prioritized_tcs))

    checked_tcs = [visit_order[0]]
    total_dist = 0
    for current_tc in visit_order[1:]:
        current_sig = tcs_minhashes[current_tc]
        # Mean distance between this test and every test visited so far.
        acc_dist = 0
        for tc in checked_tcs:
            acc_dist += lsh.jDistanceEstimate(current_sig, tcs_minhashes[tc])
        total_dist += acc_dist / len(checked_tcs)
        # Later tests are compared against this one as well.
        checked_tcs.append(current_tc)

    # NOTE(review): the divisor is the number of tests, not the number of
    # averaged terms (tests - 1); kept as in the original.
    return total_dist / len(checked_tcs)
def pw_fn(candidates, tcs_minhashes, selected_tcs_minhash, prioritized_tcs,
          tcs, n, times):
    """Pick the candidate farthest from the selected set and record it.

    INPUT
    (collection)candidates: ids eligible for selection in this step
    (dict)tcs_minhashes: id -> signature of the tests not yet selected;
        the chosen id is deleted from it
    (list)selected_tcs_minhash: cumulative signature of the tests chosen so
        far; updated in place with the element-wise minimum
    (list)prioritized_tcs: output ordering; the chosen id is appended
    (set)tcs: remaining ids; the chosen id is removed
    (int)n: number of signature positions to merge
    times: unused by this function; kept for interface compatibility

    OUTPUT
    None; all results are delivered by mutating the arguments.
    """
    # Start from a random candidate so there is always a selection, then
    # replace it with any candidate whose estimated distance is strictly
    # larger (the first maximum in dict iteration order wins ties).
    selected_tc, max_dist = random.choice(tuple(candidates)), -1
    for candidate in tcs_minhashes:
        if candidate in candidates:
            dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                         tcs_minhashes[candidate])
            if dist > max_dist:
                selected_tc, max_dist = candidate, dist

    # Fold the winner's signature into the cumulative one.
    # `range` replaces the Python 2-only `xrange`, which is a NameError on
    # Python 3.
    winner_sig = tcs_minhashes[selected_tc]
    for i in range(n):
        if winner_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = winner_sig[i]

    prioritized_tcs.append(selected_tc)
    tcs -= {selected_tc}
    del tcs_minhashes[selected_tc]
def fast_pw(input_file, r, b, bbox=False, k=5, memory=False, B=0):
    """Prioritize a test suite by repeatedly picking the most distant test.

    INPUT
    (str)input_file: path of input file
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: True if BB prioritization
    (int)k: k-shingle size (for BB prioritization)
    (bool)memory: if True keep signature in memory and do not store them to
        file
    (int)B: budget, i.e. maximum number of tests to select; 0 (default)
        means the whole suite

    OUTPUT
    (float)mh_time: time spent generating the minhash signatures (read back
        from the ``*_sigtime.txt`` cache when the signature file exists)
    (float)ptime: time spent in the prioritization itself
    (list)P: prioritized test suite
    """
    n = r * b  # number of hash functions

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {tc[0]: lsh.tcMinhashing(tc, hashes)
                         for tc in test_suite.items()}
        mh_time = time.perf_counter() - mh_t
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        # NOTE(review): split(".")[0] cuts at the FIRST dot of the path, so
        # "./dir/x.txt" maps to "_sigtime.txt"; kept so existing cache
        # files are still found.
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The cache holds the repr() of a float written above.
                # Parse it with float() instead of the former eval() so a
                # tampered cache file cannot execute code.
                mh_time = float(fin.read().replace("\n", ""))

        tcs_minhashes, _ = loadSignatures(sigfile)

    ptime_start = time.perf_counter()
    tcs = set(tcs_minhashes.keys())

    # budget B modification
    if B == 0:
        B = len(tcs)

    BASE = 0.5
    # The LSH buckets are rebuilt whenever the pool shrinks below SIZE.
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Index 0 is a placeholder; real selections start at index 1 and the
    # placeholder is sliced off on return.
    prioritized_tcs = [0]

    # First TC
    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    tcs -= {first_tc}
    del tcs_minhashes[first_tc]

    iteration, total = 0, float(len(tcs_minhashes))
    # The budget is checked BEFORE each selection. The former check ran only
    # after appending, so B == 1 still returned two tests.
    while tcs_minhashes and len(prioritized_tcs) - 1 < B:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write(" Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            sys.stdout.flush()

        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        # Candidates are the remaining tests that LSH does NOT report as
        # similar to what has been selected so far.
        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                     b, r, n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if len(candidates) == 0:
            # Everything left looks similar: reset the cumulative signature
            # and retry, falling back to the whole pool if still empty.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                         b, r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if len(candidates) == 0:
                candidates = tcs_minhashes.keys()

        # Random initial pick, replaced by any strictly more distant
        # candidate.
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate in tcs_minhashes:
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                             tcs_minhashes[candidate])
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        # Merge the winner into the cumulative signature (element-wise min).
        selected_sig = tcs_minhashes[selected_tc]
        for i in range(n):
            if selected_sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = selected_sig[i]

        prioritized_tcs.append(selected_tc)
        tcs -= {selected_tc}
        del tcs_minhashes[selected_tc]

    ptime = time.perf_counter() - ptime_start

    # Count the input lines with the file closed deterministically.
    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    # NOTE(review): the slice keeps at most max_ts_size - 1 entries, so when
    # every line is a test case the last selection is dropped. Kept as in
    # the original; confirm this is intended.
    return mh_time, ptime, prioritized_tcs[1:max_ts_size]
def fast_pw(input_file, wBoxFile, r, b, bbox=False, k=5, memory=False):
    """Select distant tests until the suite's full coverage is reached.

    NOTE(review): this definition shares its name with an earlier
    ``fast_pw`` in this file and replaces it once the module is loaded.

    INPUT
    (str)input_file: path of input file
    (str)wBoxFile: path handed to ``loadCoverage``; the result is used as a
        mapping test id -> coverage supporting ``|``, ``-`` and ``len``
        (presumably sets - confirm against ``loadCoverage``)
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: True if BB prioritization
    (int)k: k-shingle size (for BB prioritization)
    (bool)memory: if True keep signature in memory and do not store them to
        file

    OUTPUT
    (float)mh_time: time spent generating the minhash signatures (read back
        from the ``*_sigtime.txt`` cache when the signature file exists)
    (float)coverage load time: time spent inside ``loadCoverage``
    (float)ptime: time spent in the selection (in file mode this includes
        loading the signatures)
    (list)P: selected tests, in selection order
    """
    # Python 3 has no builtin reduce; a function-scope import keeps this
    # block self-contained and also works on Python 2.6+.
    from functools import reduce

    n = r * b  # number of hash functions

    # time.clock() was removed in Python 3.8; perf_counter() is what the
    # sibling fast_pw in this file already uses.
    tC0 = time.perf_counter()
    C = loadCoverage(wBoxFile)
    tC1 = time.perf_counter()
    # Union of every test's coverage: the target the loop must reach.
    maxCov = reduce(lambda x, y: x | y, C.values())

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {tc[0]: lsh.tcMinhashing(tc, hashes)
                         for tc in test_suite.items()}
        mh_time = time.perf_counter() - mh_t
        ptime_start = time.perf_counter()
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The cache holds the repr() of a float written above.
                # Parse it with float() instead of the former eval() so a
                # tampered cache file cannot execute code.
                mh_time = float(fin.read().replace("\n", ""))

        ptime_start = time.perf_counter()
        tcs_minhashes, _ = loadSignatures(sigfile)

    tcs = set(tcs_minhashes.keys())

    BASE = 0.5
    # The LSH buckets are rebuilt whenever the pool shrinks below SIZE.
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Index 0 is a placeholder; it is sliced off on return.
    prioritized_tcs = [0]

    def _drop_fully_covered(covered):
        # Reduce every test's coverage to what is still uncovered and
        # retire tests that can no longer contribute anything new.
        for tc in C.keys():
            C[tc] = C[tc] - covered
            if tc in tcs and len(C[tc]) == 0:
                tcs.discard(tc)
                del tcs_minhashes[tc]

    # First TC
    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    cov = C[first_tc]
    _drop_fully_covered(cov)

    iteration, total = 0, float(len(tcs_minhashes))
    # Stop at full coverage. The extra emptiness check avoids an IndexError
    # from random.choice when the coverage map mentions entities that no
    # remaining test can reach.
    while cov != maxCov and tcs_minhashes:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write(" Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            sys.stdout.flush()

        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        # Candidates are the remaining tests that LSH does NOT report as
        # similar to what has been selected so far.
        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                     b, r, n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if len(candidates) == 0:
            # Everything left looks similar: reset the cumulative signature
            # and retry, falling back to the whole pool if still empty.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                         b, r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if len(candidates) == 0:
                candidates = tcs_minhashes.keys()

        # Random initial pick, replaced by any strictly more distant
        # candidate.
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate in tcs_minhashes:
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                             tcs_minhashes[candidate])
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        # Merge the winner into the cumulative signature (element-wise min).
        selected_sig = tcs_minhashes[selected_tc]
        for i in range(n):
            if selected_sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = selected_sig[i]

        prioritized_tcs.append(selected_tc)
        cov = cov | C[selected_tc]
        _drop_fully_covered(cov)

    ptime = time.perf_counter() - ptime_start

    # Count the input lines with the file closed deterministically.
    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    return mh_time, tC1 - tC0, ptime, prioritized_tcs[1:max_ts_size]