def benchmark_lshensemble(threshold, num_perm, num_part, m, index_data, query_data):
    print("Building LSH Ensemble index")
    (minhashes, indexed_sets, keys) = index_data
    lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                             num_part=num_part, m=m)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in zip(keys, minhashes[num_perm], indexed_sets))
    print("Querying")
    (minhashes, sets, keys) = query_data
    probe_times = []
    process_times = []
    results = []
    for qs, minhash in zip(sets, minhashes[num_perm]):
        # Record probing time
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        probe_times.append(time.perf_counter() - start)
        # Record post-processing time.
        start = time.perf_counter()
        [_compute_containment(qs, indexed_sets[key]) for key in result]
        process_times.append(time.perf_counter() - start)
        results.append(result)
        sys.stdout.write("\rQueried {} sets".format(len(results)))
    sys.stdout.write("\n")
    return results, probe_times, process_times
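The benchmark functions in this section call a _compute_containment helper that is not shown; a minimal sketch, assuming the indexed and query sets are plain Python sets of strings:

def _compute_containment(q, x):
    # Containment of query set q in indexed set x: |q intersect x| / |q|.
    if len(q) == 0:
        return 0.0
    return float(len(q & x)) / float(len(q))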
def load_LSH(lib_profiles, mode=MODE.SCALABLE, repackage=False, processes=None):
    """Load library profiles into an LSH object.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection
            mode, either MODE.ACCURATE or MODE.SCALABLE. See the paper for details.
        repackage (bool, optional): Defaults to False. Whether LibID should
            consider class repackaging. Enable this only if class repackaging is
            known to have been applied.
        processes (int, optional): Defaults to None. The number of processes to
            use. If None, the number returned by cpu_count() is used.
    """
    global LSH, LIB_RELATIONSHIP_GRAPHS

    weights = (0.5, 0.5) if repackage else (0.1, 0.9)
    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD, num_perm=LSH_PERM_NUM,
                             num_part=32, weights=weights)

    (minhash_list, LIB_RELATIONSHIP_GRAPHS) = profiler.parallel_load_libs_profile(
        lib_profiles=lib_profiles,
        mode=mode,
        repackage=repackage,
        processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    start_time = time.time()
    LSH.index(minhash_list)
    end_time = time.time()

    LOGGER.info("LSH indexed. Duration: %fs", end_time - start_time)
def cluster_obs_group(self, candidates):
    ensemble = MinHashLSHEnsemble(threshold=0.95, num_perm=128)
    ensemble.index((c,) + self.hashes[c]
                   for c in candidates
                   if self.hashes[c][1] > 0)

    clusters = []
    while candidates:
        rep = candidates.pop()
        clus = [rep]
        h, l = self.hashes[rep]
        if l == 0:
            # An empty representative will cause division by
            # zero in ensemble.query(); instead, group it with
            # all the other empty candidates, and no others.
            for other in list(candidates):
                if self.hashes[other][1] == 0:
                    clus.append(other)
                    candidates.discard(other)
        else:
            for other in ensemble.query(h, l):
                if other in candidates:
                    clus.append(other)
                    candidates.discard(other)
        clusters.append(clus)
    return clusters
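The clustering above assumes self.hashes maps each candidate to a (MinHash, set_size) pair; a minimal sketch of how such a map might be built (the token source is an assumption, not part of the original code):

from datasketch import MinHash

def build_hashes(observations, num_perm=128):
    # observations: assumed dict of candidate id -> iterable of string tokens
    hashes = {}
    for cand, tokens in observations.items():
        tokens = set(tokens)
        mh = MinHash(num_perm=num_perm)
        for tok in tokens:
            mh.update(tok.encode('utf8'))
        hashes[cand] = (mh, len(tokens))
    return hashes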
def __init__(self,
             value_format: str = 'h',
             threshold: float = 0.8,
             num_perm: int = 128,
             num_part: int = 32,
             tokenizer: Tokenizer = Tokenizer('zh')):
    self.value_format = value_format
    self.threshold = threshold
    self.num_perm = num_perm
    self.num_part = num_part
    self.tokenizer = tokenizer
    self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                  num_perm=self.num_perm)
    self.record_dawg = dawg.RecordDAWG(self.value_format)
def benchmark_lshensemble(threshold, num_perm, num_part, l, index_data, query_data):
    print("Building LSH Ensemble index")
    lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                             num_part=num_part, l=l)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in zip(index_data.keys,
                                         index_data.minhashes[num_perm],
                                         index_data.sets))
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; use a monotonic counter instead.
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_containment(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1], reverse=True))
    return times, results
def exec_comparison_heuristic(self, source_func: BDFunction,
                              target_func: BDFunction) -> bool:
    # Populate the attribute values
    source_hash = self.loaded_attributes['FunctionMinHashLSH'].extract_attribute(
        source_func)
    target_hash = self.loaded_attributes['FunctionMinHashLSH'].extract_attribute(
        target_func)

    # Create an LSH Ensemble index with threshold and number of partition
    # settings.
    lshensemble = MinHashLSHEnsemble(
        threshold=Configuration.MINHASH_LSH_ENSEMBLE_THRESHOLD,
        num_perm=Configuration.MINHASH_PERMUTATIONS,
        num_part=Configuration.MINHASH_LSH_ENSEMBLE_PARTITIONS)

    # Index both functions. The exact token-set sizes are not available here,
    # so the MinHash cardinality estimates are used as an approximation.
    lshensemble.index([
        ("source_function", source_hash, max(1, int(source_hash.count()))),
        ("target_function", target_hash, max(1, int(target_hash.count()))),
    ])

    if source_hash == target_hash:
        return True
    return False
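The heuristic above returns True only when the two MinHashes are exactly equal. As a hedged alternative sketch (not from the original code; the 0.9 cut-off is illustrative), the estimated Jaccard similarity of the two MinHashes could be thresholded instead:

def minhash_similarity_heuristic(source_hash, target_hash, cutoff=0.9):
    # datasketch MinHash/LeanMinHash objects expose jaccard() for a similarity estimate.
    return source_hash.jaccard(target_hash) >= cutoff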
def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
    self.hash_type = hash_type
    self.hash_func = hash_func
    self.hash_bits = bits
    self.hashfunc = sha1_hash32
    if self.hash_bits in {32, "32", None}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash32
        elif self.hash_func == "xxhash":
            self.hashfunc = xxhash.xxh32
        else:
            # "hash32", "default":
            self.hashfunc = sha1_hash32
    elif self.hash_bits in {64, "64"}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash64
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash64
        elif self.hash_func == "xxhash":
            self.hashfunc = xxhash.xxh64
        else:
            self.hashfunc = sha1_hash64
    elif self.hash_bits in {128, "128"}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash128
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash128
        else:
            raise ValueError("Unsupported hash function for the requested bit width")
    else:
        raise ValueError("Unsupported hash bit width")

    if not params:
        params = {}

    """
    If only Redis is used as the storage backend, configure:
        storage_config = {
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379},
        }

    To insert a large number of MinHashes in sequence, use an insertion session;
    it reduces the number of network calls during bulk insertion:
        data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
        with lsh.insertion_session() as session:
            for key, minhash in data_list:
                session.insert(key, minhash)
    Note that querying the LSH object while an insertion session is open may
    give inconsistent results.

    MinHash LSH also supports a Cassandra cluster as the storage layer. Using
    long-term storage for the LSH covers all use cases where the application
    needs to update the LSH object continuously (for example, when documents
    are clustered incrementally with MinHashLSH). The Cassandra storage option
    can be configured as follows:
        storage_config = {
            'type': 'cassandra',
            'cassandra': {
                'seeds': ['127.0.0.1'],
                'keyspace': 'lsh_test',
                'replication': {
                    'class': 'SimpleStrategy',
                    'replication_factor': '1',
                },
                'drop_keyspace': False,
                'drop_tables': False,
            }
        }
    The seeds parameter lists the seed nodes that can be contacted to connect to
    the Cassandra cluster. The keyspace and replication options specify the
    parameters used when creating the keyspace if it does not yet exist. To
    force (re)creation of the tables or the keyspace (and therefore drop the
    existing ones), set drop_tables and drop_keyspace to True. As with Redis,
    an insertion session is recommended to reduce the number of network calls
    during bulk insertion.

    +------------------ Connecting to an existing MinHash LSH ------------------+
    If your LSH uses an external storage layer (such as Redis), it can be shared
    across multiple processes. There are two ways to do this.

    The recommended way is pickling. The MinHash LSH object is serializable, so
    you can call pickle:
        import pickle

        # Create your LSH object
        lsh = ...
        # Serialize the LSH
        data = pickle.dumps(lsh)
        # Now you can pass it as an argument to a forked process or simply save
        # it in external storage.
        # In a different process, deserialize the LSH
        lsh = pickle.loads(data)
    With pickle, everything the LSH needs, such as its parameter settings, is
    kept in one place.

    Alternatively, you can specify a basename in the storage config when the LSH
    is first created. For example:
        # For Redis.
        lsh = MinHashLSH(
            threshold=0.5, num_perm=128, storage_config={
                'type': 'redis',
                'basename': b'unique_name_6ac4fg',
                'redis': {'host': 'localhost', 'port': 6379},
            }
        )

        # For Cassandra.
        lsh = MinHashLSH(
            threshold=0.5, num_perm=128, storage_config={
                'type': 'cassandra',
                'basename': b'unique_name',
                'cassandra': {
                    'seeds': ['127.0.0.1'],
                    'keyspace': 'lsh_test',
                    'replication': {
                        'class': 'SimpleStrategy',
                        'replication_factor': '1',
                    },
                    'drop_keyspace': False,
                    'drop_tables': False,
                }
            }
        )
    The basename is used to generate key prefixes that uniquely identify the
    data associated with this LSH in the storage layer, so a new LSH object
    created with the same basename will reuse the underlying data of the old
    one. If no basename is given, MinHash LSH generates a random string as the
    basename, and collisions are extremely unlikely.

    For more detailed usage see the documentation:
    http://ekzhu.com/datasketch/lsh.html
    """

    if self.hash_type in {"minhash", "MinHash"}:
        # Mainly for Jaccard similarity: estimates the Jaccard similarity between
        # sets of arbitrary size in linear time, using a small fixed amount of memory.
        self.hash = MinHash(
            num_perm=params.get("num_perm", 128),  # int, optional; ignored if hashvalues is not None.
                                                   # Number of random permutation functions; controls precision.
            seed=params.get("seed", 1),            # random seed, optional
            hashfunc=self.hashfunc,                # optional; the hash function applied to inputs passed to
                                                   # update(); must return an integer encodable in 32 bits
            hashobj=params.get("hashobj", None),   # deprecated; replaced by hashfunc
            hashvalues=params.get("hashvalues", None),      # optional; array or list
            permutations=params.get("permutations", None))  # optional; pass another MinHash's existing
                                                            # permutation state for fast initialisation
    elif self.hash_type in {"weightedminhashlsh", "mhlsh", "WeightedMinHashLSH",
                            "wmhlsh", "MinHashLSH"}:
        # Weighted MinHash locality-sensitive hashing.
        # WeightedMinHashLSH() is equivalent to MinHashLSH: weighted Jaccard similarity queries.
        # Top-k queries are not supported, but MinHashLSHForest supports top-k.
        self.hash = MinHashLSH(
            threshold=params.get("threshold", 0.9),     # Jaccard similarity threshold
            num_perm=params.get("num_perm", 128),       # number of permutation functions;
                                                        # the sample size for the weighted MinHash
            weights=params.get("weights", (0.5, 0.5)),  # tuple, optional; weights for optimizing the threshold
            params=params.get("params", None),          # tuple, optional; number and size of bands
            storage_config=params.get("storage_config", None),  # storage configuration
            prepickle=params.get("prepickle", None))    # store keys in pickled form by default
    elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
        # A MinHash variant that uses less memory than MinHash.
        self.hash = LeanMinHash(minhash=params.get("minhash", None),
                                seed=params.get("seed", None),
                                hashvalues=params.get("hashvalues", None))
    elif self.hash_type in {"MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"}:
        self.hash = MinHashLSHForest(num_perm=params.get("num_perm", 128),
                                     l=params.get("l", 8))
    elif self.hash_type in {"MinHashLSHEnsemble", "mhlshe", "MHLSHE"}:
        # MinHash LSH using a different measure: containment instead of Jaccard similarity.
        self.hash = MinHashLSHEnsemble(
            threshold=params.get("threshold", 0.9),
            num_perm=params.get("num_perm", 128),
            num_part=params.get("num_part", 16),
            # m=params.get("m", 8),
            weights=params.get("weights", (0.5, 0.5)),
            storage_config=params.get("storage_config", None),
            prepickle=params.get("prepickle", None))
    elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
        # Same interface as HyperLogLog.
        # HyperLogLog estimates the cardinality of a dataset (the number of distinct
        # values) in a single pass, using a small, fixed amount of memory.
        self.hash = HyperLogLog(
            p=params.get("p", 8),  # controls precision, analogous to num_perm in MinHash
            reg=params.get("reg", None),
            hashfunc=params.get("hashfunc", sha1_hash32),  # hash function used internally
            hashobj=params.get("hashobj", None))           # optional array or list; superseded by hashfunc
    elif self.hash_type in {"hyperloglogplusplus", "HyperLogLogPlusPlus",
                            "HyperLogLog++", "hyperloglog++", "HLLPP", "hllpp",
                            "HLL++", "hll++"}:
        # Same interface as HyperLogLog.
        self.hash = HyperLogLogPlusPlus(
            p=params.get("p", 8),
            reg=params.get("reg", None),
            hashfunc=params.get("hashfunc", sha1_hash64),  # uses a 64-bit hash function
            hashobj=params.get("hashobj", None))
    else:
        raise ValueError("Please choose a valid hash object type")
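A hedged usage sketch of this factory-style wrapper, assuming the enclosing class is named HashFactory (the class name is not visible in the snippet):

# Assumption: the __init__ above belongs to a class named HashFactory.
factory = HashFactory(hash_type="MinHashLSHEnsemble", bits=32, hash_func="mmh3",
                      params={"threshold": 0.8, "num_perm": 128, "num_part": 32})
lshensemble = factory.hash  # the underlying datasketch MinHashLSHEnsemble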
"rocket", "bike", "scooter", "motorcyle", "SUV", "jet", "horse" ]) # Create MinHash objects m1 = MinHash(num_perm=128) m2 = MinHash(num_perm=128) m3 = MinHash(num_perm=128) for d in set1: m1.update(d.encode('utf8')) for d in set2: m2.update(d.encode('utf8')) for d in set3: m3.update(d.encode('utf8')) # Create an LSH Ensemble index with a threshold lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128) # Index takes an iterable of (key, minhash, size) lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))]) # Check for membership using the key print("m2" in lshensemble) print("m3" in lshensemble) # Using m1 as the query, get an result iterator print("Sets with containment > 0.2:") for key in lshensemble.query(m1, len(set1)): print(key) from datasketch import HyperLogLog, HyperLogLogPlusPlus
def main(corpus: str, mode: str, lsh_file: str, minhash_file: str, num_perm: int,
         shingles: int, threshold: float, n_jobs: int, output_dir: str):
    if mode != 'query':
        assert not lsh_file
    if mode == 'minhash-only':
        assert not minhash_file

    print("Making output dir:", output_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir()

    if minhash_file:
        print('Loading MinHashes from disk:', minhash_file)
        start = time.time()
        with open(minhash_file, 'rb') as f:
            cached_minhashes = pickle.load(f)
        print("Done loading MinHashes, time elapsed (sec):", time.time() - start)

        corpus_len = len(cached_minhashes)
        minhash_iter = starmap(lambda k, v: (k, v, None), cached_minhashes.items())
        minhashes = None  # Set to None to disable saving minhashes again
    else:
        if corpus == 'webtext':
            corpus_len = 8_282_020
            corpus_iter = make_corpus_iter(DATA_DIR / 'webtext_detokenized')
        elif corpus == 'openwebtext':
            corpus_len = 8_013_769
            corpus_iter = make_corpus_iter(DATA_DIR / 'openwebtext_shards')
        else:
            raise RuntimeError
        print("Using", n_jobs, "processes for MinHashing")

        minhashes = {}
        minhash_iter = parallel_create_minhashes(corpus_iter,
                                                 shingles=shingles,
                                                 num_perm=num_perm,
                                                 n_jobs=n_jobs)

    print("Starting...")
    if mode == 'lsh' or mode == 'lsh-ensemble':
        if mode == 'lsh':
            lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
            with lsh.insertion_session() as session:
                for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                               desc='Making MinHashLSH'):
                    if minhashes:
                        minhashes[key] = minhash
                    session.insert(key, minhash,
                                   check_duplication=False)  # All keys are unique doc ids
        else:
            assert mode == 'lsh-ensemble'
            lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                                     num_part=16)  # TODO: try 32
            lsh.index(tqdm(minhash_iter, total=corpus_len,
                           desc='Making MinHashLSHEnsemble'))

        # Save LSH
        print("Saving LSH...")
        start = time.time()
        with open(output_dir / 'lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f)
        print("Done saving LSH, time elapsed (sec):", time.time() - start)
    elif mode == 'query':
        print('Loading LSH:', lsh_file)
        start = time.time()
        with open(lsh_file, 'rb') as f:
            lsh = pickle.load(f)
            assert isinstance(lsh, MinHashLSH) and lsh.h == num_perm
        print("Done loading LSH, time elapsed (sec):", time.time() - start)

        duplicates_file = output_dir / 'duplicates.jsonl'
        print("Writing duplicates to", duplicates_file)
        with open(duplicates_file, 'a') as f:
            for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                           desc='Querying MinHashLSH'):
                if minhashes:
                    minhashes[key] = minhash
                duplicates = lsh.query(minhash)
                if duplicates:
                    json.dump({key: duplicates}, f)
                    f.write('\n')
    elif mode == 'minhash-only':
        assert minhashes is not None
        for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                       desc='MinHashing'):
            minhashes[key] = minhash
    else:
        raise RuntimeError

    # Save MinHashes
    if not minhash_file:
        print("Saving MinHashes...")
        start = time.time()
        with open(output_dir / 'minhashes.pkl', 'wb') as f:
            pickle.dump(minhashes, f)
        print("Done saving MinHashes, time elapsed (sec):", time.time() - start)
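parallel_create_minhashes is not shown in this snippet; a minimal single-process sketch of the per-document work it is assumed to do (word-level k-shingles hashed into a MinHash; the whitespace tokenisation is an assumption):

from datasketch import MinHash

def create_minhash(doc_id, text, shingles=5, num_perm=128):
    # Assumed per-document step: hash word-level k-shingles into a MinHash.
    words = text.split()
    shingle_set = {' '.join(words[i:i + shingles])
                   for i in range(max(1, len(words) - shingles + 1))}
    mh = MinHash(num_perm=num_perm)
    for sh in shingle_set:
        mh.update(sh.encode('utf8'))
    return doc_id, mh, len(shingle_set)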
class Sto:
    def __init__(self,
                 value_format: str = 'h',
                 threshold: float = 0.8,
                 num_perm: int = 128,
                 num_part: int = 32,
                 tokenizer: Tokenizer = Tokenizer('zh')):
        self.value_format = value_format
        self.threshold = threshold
        self.num_perm = num_perm
        self.num_part = num_part
        self.tokenizer = tokenizer
        self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                      num_perm=self.num_perm)
        self.record_dawg = dawg.RecordDAWG(self.value_format)

    def __check_get_store_path(self, data_path: Path):
        pnlp.check_dir(data_path)
        lsh_path = Path(data_path) / "lsh.pickle"
        dawg_path = Path(data_path) / "values.dawg"
        return lsh_path, dawg_path

    def load(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        if lsh_path.exists():
            self.lsh = load_pickle(lsh_path)
        else:
            raise ValueError("lsh pickle: {} does not exist.".format(lsh_path))
        if dawg_path.exists():
            self.record_dawg.load(str(dawg_path))
        else:
            raise ValueError("dawg file: {} does not exist.".format(dawg_path))

    def store(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        dump_pickle(lsh_path, self.lsh)
        self.record_dawg.save(dawg_path)

    def __check_value_format(self, val: tuple):
        if len(val) != len(self.value_format):
            raise ValueError(
                "value format {} does not match the value {}".format(
                    self.value_format, val))

    def add(self, text_list: List[str], value_list: List[tuple]):
        len_text = len(text_list)
        len_value = len(value_list)
        assert len_text == len_value
        data = {}
        entries = []
        for i, text in enumerate(text_list):
            entry = self.text_to_lsh_entry(text)
            key = entry[0]
            if key in data:
                continue
            value = value_list[i]
            self.__check_value_format(value)
            data[key] = value
            entries.append(entry)
        self.lsh.index(entries)
        self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())

    def query(self, text: str):
        key, mh, length = self.text_to_lsh_entry(text)
        if key in self.record_dawg:
            return self.record_dawg.get(key)[0]
        for sim_key in self.lsh.query(mh, length):
            return self.record_dawg.get(sim_key)[0]
        else:
            return None

    def text_to_lsh_entry(self, text: str):
        words = self.tokenizer(text)
        bigrams = list(ngrams(words, 1, 2))
        wset = set(bigrams)
        mh = MinHash(num_perm=self.num_perm)
        for w in wset:
            mh.update(w.encode('utf8'))
        unicode_hash = hashlib.sha1(text.encode("utf8")).hexdigest()
        return (unicode_hash, mh, len(wset))

    def __getitem__(self, key: str):
        return self.query(key)

    def __setitem__(self, key: str, value: tuple):
        raise NotImplementedError

    def __contains__(self, key: str):
        if self.query(key):
            return True
        return False

    def __len__(self):
        return len(self.record_dawg.keys())
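text_to_lsh_entry relies on an ngrams(words, 1, 2) helper that is not shown; a minimal sketch, assuming it yields every n-gram with min_n <= n <= max_n joined into a single string:

from typing import Iterable, List

def ngrams(words: List[str], min_n: int, max_n: int) -> Iterable[str]:
    # Assumed behaviour: emit all n-grams for n in [min_n, max_n], joined so
    # each n-gram can be encoded and fed to MinHash.update().
    for n in range(min_n, max_n + 1):
        for i in range(len(words) - n + 1):
            yield ''.join(words[i:i + n])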
def main():
    ss = int(sys.argv[1])
    filelist = sys.argv[2]
    GB = 1024 * 1024 * 1024
    t0 = time()
    if '--profile' in sys.argv or '--timing' in sys.argv:
        # save time for profiling test
        types, labels = defaultdict(set), defaultdict(set)
    else:
        print('reading typeslabels.pkl ..', file=sys.stderr)
        f = open('typeslabels.pkl', 'rb')
        types, labels = pickle.load(f)
        f.close()
    t1 = time()
    print('labels in dbpedia and wikidata:', len(types), file=sys.stderr)
    print('labels read in %.1f seconds or %.1f minutes' % (t1 - t0, (t1 - t0) / 60),
          file=sys.stderr)
    # sys.getsizeof does not work correctly, and pympler is WAY too slow
    # print('sizeof types %.3f GB sizeof labels %.3f GB' %
    #       (sys.getsizeof(types)/GB, sys.getsizeof(labels)/GB,), file=sys.stderr)

    files = [f.strip() for f in open(filelist).readlines()]

    # process all files instead of sample?
    if ss == 0:
        ss = len(files)

    nrows = np.zeros(ss)
    ncols = np.zeros(ss)
    cols = {}
    svals = 0
    ne = 0
    rels = {}
    titles = {}
    typcnt = defaultdict(int)
    usecnt = defaultdict(int)
    vcnt = defaultdict(int)
    typsets = {}
    tabs = {}
    nval = {}
    stats = {}
    hdr = 0
    rtyp = defaultdict(int)  # consistent row types
    # pat = re.compile("^[0-9\.,/Q_-]*$")

    print('reading', ss, 'out of total', len(files), 'files from', filelist,
          file=sys.stderr)

    # random.seed(4713)
    sam = random.sample(files, ss)

    # sqlite defaults to autocommit, even when scripted.
    # We need one big transaction; otherwise autocommit turns each statement
    # into a separate transaction, which is VERY slow.
    fout.write("begin transaction;\n")
    fout.write('DROP TABLE IF EXISTS val;\n')
    fout.write('DROP TABLE IF EXISTS tab;\n')
    fout.write('DROP TABLE IF EXISTS col;\n')
    fout.write('DROP TABLE IF EXISTS err;\n')
    fout.write('DROP TABLE IF EXISTS sel;\n')
    fout.write('DROP TABLE IF EXISTS sub;\n')
    fout.write('DROP TABLE IF EXISTS vcnt;\n')
    fout.write('DROP TABLE IF EXISTS lsh;\n')
    fout.write('DROP TABLE IF EXISTS cnts;\n')
    fout.write('CREATE TABLE val (desc varchar, nvals int, wtyp int, frac float);\n')
    fout.write('CREATE TABLE vcnt (val varchar, nv int, nt int, typs varchar);\n')
    fout.write('CREATE TABLE tab (id int, rows int, cols int, head int, fn varchar, src varchar);\n')
    fout.write('CREATE TABLE col (tab int, col int, typ varchar(255), frac float, cov float);\n')
    fout.write('CREATE TABLE err (typ varchar, msg varchar, fn varchar, col int, src varchar);\n')
    fout.write('CREATE TABLE sel (tab int, col int, nval int, ndist int, sel int);\n')
    fout.write('CREATE TABLE sub (l int, j int, k int, i int, comtyp int, fracsubset float, nchild int, nparent int, fnchild varchar, fnparent varchar);\n')
    fout.write('CREATE TABLE lsh (k int, i int, l int, j int);\n')
    fout.write('CREATE TABLE cnts (desc varchar, cnt int);\n')

    # file for writing types of columns
    tsfile = open('typesets.csv', 'w', encoding='utf8', errors='ignore')
    tsfile.write('filename,column,fraction,type\n')

    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('CSV Files:', ss))

    ne = 0
    nallcols = 0  # number of all columns

    # go through the files in the sample
    for k in range(ss):
        fn = sam[k]
        src = 'obd'  # fn[6:9]
        # if src in ['kag', 'obd']: ftyp = 'csv'
        # else: ftyp = 'json'
        res = readf(fn, 'csv')  # ftyp)
        if res == -1:
            ne += 1
            continue  # error reading file
        cols, h, nrows, ncols = res
        nallcols += len(cols)
        # kaggle filenames contain ' !!!
fout.write("insert into tab values (%d, %d, %d, %d, '%s', '%s');\n" % (k, nrows, ncols, h, cleanfn(fn), src)) nval[k] = { } # lengths of columns as lists, after removing ignored elements tabs[k] = {} # sets of col values, after removing ignored typsets[k] = {} stats[k] = {} ign = ('null', 'true', 'false', 't', 'f', 'yes', 'no', 'y', 'n', 'none', 'na', 'n/a', 'nan', 'n.a.', 'male', 'female', 'm', 'f', 'e') for i in range(len(cols)): # remove numeric and null-like lst = [ x.strip() for x in cols[i] if len(x.strip()) > 0 and not isnumber(x.strip()) and not x.strip().lower() in ign ] if len(lst) == 0: err('col', 'all num or null', fn, i) continue # log value counts for x in lst: vcnt[x] += 1 # only need list length and set for finding reference cols, not whole list: save lots of memory nval[k][i] = len(lst) tabs[k][i] = set(lst) # selectivity sel = len(tabs[k][i]) / len(lst) fout.write( "insert into sel (tab, col, nval, ndist, sel) values(%d, %d, %d, %d, %f);\n" % (k, i, len(lst), len(tabs[k][i]), sel)) # None types tsets = [types[x] for x in lst] ts = set.union(*tsets) typsets[k][i] = set() # all stats useless, performance worse if '--stats' in sys.argv: stats[k][i] = {} lens = [len(x) for x in slst] stats[k][i]['lmin'] = min(lens) stats[k][i]['lmax'] = max(lens) stats[k][i]['case'] = getcase(slst) stats[k][i]['alph'] = getalph(slst) for t in ts: # for more than one known value of type t: fraction of column values that are of type t if len(labels[t]) < 2: continue f = sum([int(t in s) for s in tsets]) / len(lst) # type coverage: fraction of col values of type t in relation to all known values of this type tset = set([x for x in lst if t in types[x]]) # col vals of type t cov = len(tset) / len(labels[t]) fout.write( "insert into col (tab, col, typ, frac, cov) values (%d, %d, '%s', %f, %f);\n" % (k, i, t.replace("'", "").replace('"', ''), f, cov)) if f >= 0.5: typsets[k][i].add(t) if t != 'www.w3.org/2002/07/owl#Thing': # column i in file fn has type t for at least fraction f of elements (set) tsfile.write("%s,%d,%.3f,%s\n" % (fn, i, f, t)) typsets[k][i].discard('www.w3.org/2002/07/owl#Thing' ) # messes up results. 
    t2 = time()
    print('files read in %.1f seconds or %.1f minutes' % (t2 - t1, (t2 - t1) / 60),
          file=sys.stderr)

    # some statistics on columns and values (in the sets): number of columns,
    # number of values, length of values
    nc = sum([len(tabs[k]) for k in tabs])
    nv = sum([len(tabs[k][i]) for k in tabs for i in tabs[k]])
    lv = sum([len(s) for k in tabs for i in tabs[k] for s in tabs[k][i]])
    print('number of tables: %d cols: %d avg col size (set): %.1f avg item len: %.1f' %
          (len(tabs), nc, nv / nc, lv / nv))

    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('Import Errors:', ne))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Tables:', len(tabs)))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Columns:', nallcols))

    # print("brk:", brk)  # no gain
    # print('sizeof tabs %.3f GB sizeof typsets %.3f GB' %
    #       (sys.getsizeof(tabs)/GB, sys.getsizeof(typsets)/GB,), file=sys.stderr)

    # value and type counts, most frequent only
    for k in sorted(vcnt, key=vcnt.get, reverse=True)[:100]:
        fout.write("insert into vcnt (val, nv, nt, typs) values ('%s', %d, %d, '%s');\n" %
                   (k.replace("'", ""), vcnt[k], len(types[k]),
                    ', '.join(list(types[k])[:5])))

    # overall statistics on values and types
    nv, wt = (sum([vcnt[k] for k in vcnt]),
              sum([vcnt[k] * int(len(types[k]) > 0) for k in vcnt]))
    fout.write("insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n" %
               ('Values', nv, wt, wt / nv))
    uv, wt = len(vcnt), sum([int(len(types[k]) > 0) for k in vcnt])
    fout.write("insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n" %
               ('Unique', uv, wt, wt / uv))

    # if '--refs' in sys.argv:
    print("finding reference columns..", file=sys.stderr)
    num_cpus = 8  # psutil.cpu_count(logical=False)  # weird problems with that
    print("cpus:", num_cpus)

    print("init ray..", file=sys.stderr)
    # Starting Ray with .. GiB memory available for workers and up to .. GiB for objects.
    # ray.init(memory=<bytes>, object_store_memory=<bytes>).
    ray.init(num_cpus=num_cpus, memory=9 * 1024 * 1024 * 1024,
             object_store_memory=45 * 1024 * 1024 * 1024)

    print("put data into shared mem..", file=sys.stderr)
    tabs_id = ray.put(tabs)
    nval_id = ray.put(nval)
    stats_id = ray.put(stats)
    typsets_id = ray.put(typsets)
    sam_id = ray.put(sam)

    print("start parallel..", file=sys.stderr)
    # split the task by assigning lists of keys in tabs to check; this will
    # block until all are ready.
    sql = ray.get([
        refcols.remote(tabs_id, nval_id, stats_id, typsets_id, sam_id, tx)
        for tx in np.array_split(list(tabs.keys()), num_cpus)
    ])
    print("parallel section done.", file=sys.stderr)

    # write to file here, NOT in individual tasks
    for s in sql:
        fout.write(s)

    t3 = time()
    print('references done in %.1f seconds or %.1f minutes' % (t3 - t2, (t3 - t2) / 60),
          file=sys.stderr)

    #
    if '--lsh' in sys.argv:
        # locality sensitive hashing for MinHashes of column value sets
        print('build hashes..', file=sys.stderr)
        # lsh = MinHashLSH(threshold=0.75, num_perm=128)
        lsh = MinHashLSHEnsemble(threshold=0.9, num_perm=128, num_part=32)
        mh = {}
        for k in tabs:
            mh[k] = {}
            for i in tabs[k]:
                mh[k][i] = MinHash(num_perm=128)
                for d in tabs[k][i]:
                    mh[k][i].update(d.encode('utf8'))
                # lsh.insert((k, i), mh[k][i])
        lsh.index([((k, i), mh[k][i], len(tabs[k][i]))
                   for k in tabs for i in tabs[k]])

        t4 = time()
        print('hashes built in %.1f seconds or %.1f minutes' % (t4 - t3, (t4 - t3) / 60),
              file=sys.stderr)

        # similar cols according to lsh
        print("query lsh..")
        for k in tabs:
            for i in tabs[k]:
                # for l, j in lsh.query(mh[k][i]):
                for l, j in lsh.query(mh[k][i], len(tabs[k][i])):
                    if l == k:
                        continue  # same table
                    # l, j is the parent!
                    # min size for ref tab
                    if len(tabs[l][j]) < 10:
                        continue
                    # selectivity must be 1
                    if len(tabs[l][j]) != nval[l][j]:
                        continue
                    if len(tabs[k][i]) < 10 and len(tabs[l][j]) < 10:
                        print("TABS", k, i, tabs[k][i])
                        print("SIMT", l, j, tabs[l][j])
                    fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" %
                               (l, j, k, i))
    else:
        t4 = time()
        # need content for query?
        # fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" % (0, 0, 0, 0))

    t5 = time()
    print('hash queries done in %.1f seconds or %.1f minutes' % (t5 - t4, (t5 - t4) / 60),
          file=sys.stderr)

    #
    fout.write("commit;\n")
    fout.close()
    tsfile.close()

    print('total run time %.1f seconds or %.1f minutes' % (t5 - t0, (t5 - t0) / 60),
          file=sys.stderr)
    sys.exit(0)