def inner(query_features):
    """Lazily yield every fingerprint derived from ``query_features``.

    ``query_features`` is iterated as a collection of collections: each
    inner item is passed to ``shingle_extraction.extract_shingles`` and the
    result is passed to ``fingerprint.get_fingerprints``. The fingerprints
    are yielded one at a time in the order they are produced; duplicates
    are not removed.
    """
    for feature_group in query_features:
        for single_feature in feature_group:
            extracted = shingle_extraction.extract_shingles(single_feature)
            for value in fingerprint.get_fingerprints(extracted):
                yield value
def process_shingles(shingles, record_data_vector, wl_it):
    """Add one ``(shingle_id, 1)`` pair per shingle to ``record_data_vector``.

    The function reads several names that are neither parameters nor locals
    (``accumulate_wl_shingles``, ``fingerprints``, ``shingle_id_map``,
    ``wl_state`` and the ``fingerprint`` module); they must be provided by
    an enclosing or module scope that is not shown here.

    Two modes, selected by the outer ``fingerprints`` flag:

    * flag falsy: each shingle not yet in ``shingle_id_map`` is assigned
      the current value of a counter stored in ``wl_state``, and the
      counter is then incremented. The counter key is ``"next_shingle_id"``
      when ``accumulate_wl_shingles`` is truthy, otherwise it is specific
      to ``wl_it``.
    * flag truthy: the ids are the distinct values returned by
      ``fingerprint.get_fingerprints(shingles, size=24)``.

    ``record_data_vector`` is updated in place; nothing is returned.
    """
    if accumulate_wl_shingles:
        counter_key = "next_shingle_id"
    else:
        counter_key = "wl_{0}_next_shingle_id".format(wl_it)

    if fingerprints:
        # Fingerprint mode: ids come straight from the fingerprint values.
        hashed_ids = set(fingerprint.get_fingerprints(shingles, size=24))
        record_data_vector |= set((hashed_id, 1) for hashed_id in hashed_ids)
        return

    # Id-map mode: hand out the next free id to shingles not seen before.
    for current in shingles:
        if current not in shingle_id_map:
            shingle_id_map[current] = wl_state[counter_key]
            wl_state[counter_key] += 1
        record_data_vector.add((shingle_id_map[current], 1))
def build_with_w_shingles(self, w_shingle_lists, initial_sparse_matrix=None):
    """Populate ``self.sparse_matrix`` from per-record w-shingle collections.

    Records are numbered 0, 1, 2, ... in iteration order. For every
    fingerprint returned by ``fingerprint.get_fingerprints`` for a record's
    shingles, the record's column index is added to the set stored under
    that fingerprint in ``self.sparse_matrix``.

    Args:
        w_shingle_lists: iterable of 3-item entries; only the middle item
            (the record's w-shingles) is used.
        initial_sparse_matrix: optional mapping to extend. When given, it
            is used directly (not copied), so it is mutated in place and
            becomes ``self.sparse_matrix``. When omitted, a fresh empty
            dict is created for this call.
    """
    # A literal ``{}`` default is created once at definition time. Because
    # the mapping is stored on ``self`` and mutated below, every call that
    # omitted the argument would share (and keep growing) the same dict.
    if initial_sparse_matrix is None:
        initial_sparse_matrix = {}
    self.sparse_matrix = initial_sparse_matrix

    for i, (_, record_w_shingles, _) in enumerate(w_shingle_lists):
        if self.print_progress:
            # Single pre-formatted argument: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("Ch.Mat.: Processing column {0} of {1}".format(i, self.cols_count))
        for fp in fingerprint.get_fingerprints(record_w_shingles):
            self.sparse_matrix.setdefault(fp, set()).add(i)
def build(self, feature_lists):
    """Rebuild ``self.sparse_matrix`` from per-record feature collections.

    ``self.sparse_matrix`` is reset to an empty dict. Records are numbered
    0, 1, 2, ... in iteration order. Each feature of a record is passed to
    ``shingle_extraction.extract_shingles`` and the result to
    ``fingerprint.get_fingerprints``; the record's column index is added to
    the set stored under every fingerprint obtained.

    Args:
        feature_lists: iterable of 3-item entries; only the middle item
            (the record's features) is used.
    """
    self.sparse_matrix = {}
    for i, (_, record_features, _) in enumerate(feature_lists):
        if self.print_progress:
            # Single pre-formatted argument: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("Ch.Mat.: Processing column {0} of {1}".format(i, self.cols_count))
        for feature in record_features:
            shingles = shingle_extraction.extract_shingles(feature)
            for fp in fingerprint.get_fingerprints(shingles):
                # ``setdefault`` replaces the ``has_key`` check-then-insert
                # (``dict.has_key`` no longer exists in Python 3).
                self.sparse_matrix.setdefault(fp, set()).add(i)
def compute_column_fingerprints(self, record_graphs):
    """Return the sorted, de-duplicated fingerprints for ``record_graphs``.

    Features are first collected from every graph with
    ``feature_extraction.extract_features``, which is given
    ``self.wl_iterations`` and the current ``self.wl_state`` and whose
    second return value replaces ``self.wl_state``. Only after all graphs
    have been processed is each feature turned into shingles and then into
    fingerprints.

    Raises:
        AssertionError: if ``self.wl_state`` is falsy.
    """
    assert self.wl_state

    # Phase 1: gather features from all graphs, threading the state through.
    collected = []
    for graph in record_graphs:
        extracted, self.wl_state = feature_extraction.extract_features(
            graph, self.wl_iterations, self.wl_state)
        collected.extend(extracted)

    # Phase 2: fingerprint every feature and keep the distinct values.
    distinct = set()
    for item in collected:
        item_shingles = shingle_extraction.extract_shingles(item)
        distinct.update(set(fingerprint.get_fingerprints(item_shingles)))
    return sorted(distinct)