def _check_statement_distillation(num_stmts):
    db = _get_loaded_db(num_stmts)
    assert db is not None, "Test was broken. Got None instead of db instance."
    stmts = db_util.distill_stmts(db, get_full_stmts=True)
    assert len(stmts), "Got zero statements."
    assert isinstance(list(stmts)[0], Statement), type(list(stmts)[0])
    stmt_ids = db_util.distill_stmts(db)
    assert len(stmts) == len(stmt_ids), \
        "stmts: %d, stmt_ids: %d" % (len(stmts), len(stmt_ids))
    assert isinstance(list(stmt_ids)[0], int), type(list(stmt_ids)[0])
    stmts_p = db_util.distill_stmts(db, num_procs=2)
    assert len(stmts_p) == len(stmt_ids)
    stmt_ids_p = db_util.distill_stmts(db, num_procs=2)
    assert stmt_ids_p == stmt_ids
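# A minimal sketch of how the distillation check above might be wired into an
# actual test entry point; the test name and the statement count are
# illustrative assumptions, not taken from the source.
def test_statement_distillation_small():
    _check_statement_distillation(100)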
def _check_preassembly_with_database(num_stmts, batch_size):
    db = _get_loaded_db(num_stmts)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_uuids = {raw_stmt.uuid for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end - start)

    # There should be fewer preassembled statements than raw statements.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the links between raw and preassembled statements.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_uuids = {ru.raw_stmt_uuid for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    assert len(all_link_uuids - all_raw_uuids) == 0
    assert all([pa_stmt.mk_hash in all_link_mk_hashes
                for pa_stmt in pa_stmt_list])

    # Check that support links were found.
    num_support_links = db.filter_query(db.PASupportLinks).count()
    assert num_support_links

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Now test the set of preassembled (pa) statements from the database
    # against what we get from old-fashioned preassembly (opa).
    raw_stmts = db_util.distill_stmts(db, get_full_stmts=True)
    _check_against_opa_stmts(raw_stmts, pa_stmts)
def _check_db_pa_supplement(num_stmts, batch_size, init_batch_size=None,
                            split=0.8):
    if not init_batch_size:
        init_batch_size = batch_size
    db = _get_loaded_db(num_stmts, batch_size=init_batch_size, split=split,
                        with_init_corpus=True)

    # Run the incremental (supplement) preassembly.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    print("Beginning supplement...")
    pa_manager.supplement_corpus(db)
    end = datetime.now()
    print("Duration of incremental update:", end - start)

    # Compare the result against old-fashioned preassembly of everything.
    raw_stmts = db_util.distill_stmts(db, get_full_stmts=True)
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    _check_against_opa_stmts(raw_stmts, pa_stmts)
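# Sketches of test entry points for the two checks above; the test names,
# statement counts, and batch sizes are illustrative assumptions.
def test_db_preassembly_small():
    _check_preassembly_with_database(200, 40)


def test_db_incremental_preassembly_small():
    _check_db_pa_supplement(200, 40)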
def supplement_corpus(self, db, continuing=False):
    """Update the table of preassembled statements.

    This method will take any new raw statements that have not yet been
    incorporated into the preassembled table, and use them to augment the
    preassembled table.

    The resulting updated table is indistinguishable from the result you
    would achieve if you had simply re-run preassembly on _all_ the raw
    statements.
    """
    self.__tag = 'supplement'
    last_update = self._get_latest_updatetime(db)
    self._log("Latest update was: %s" % last_update)

    # Get the new statements...
    self._log("Loading info about the existing state of preassembly. "
              "(This may take a little time)")
    new_ids = self._get_new_stmt_ids(db)

    # If we are continuing, check for support links that were already found.
    if continuing:
        self._log("Getting pre-existing links...")
        db_existing_links = db.select_all([
            db.PASupportLinks.supported_mk_hash,
            db.PASupportLinks.supporting_mk_hash
            ])
        existing_links = {tuple(res) for res in db_existing_links}
        self._log("Found %d existing links." % len(existing_links))
    else:
        existing_links = set()

    # Weed out exact duplicates.
    stmt_ids = distill_stmts(db, num_procs=self.n_proc)
    new_stmt_ids = new_ids & stmt_ids
    self._log("There are %d new distilled raw statement ids."
              % len(new_stmt_ids))
    new_stmts = ((sid, _stmt_from_json(s_json)) for sid, s_json
                 in db.select_all([db.RawStatements.id,
                                   db.RawStatements.json],
                                  db.RawStatements.id.in_(new_stmt_ids),
                                  yield_per=self.batch_size))

    # Get the set of new unique statements and link to any new evidence.
    old_mk_set = {mk for mk, in db.select_all(db.PAStatements.mk_hash)}
    self._log("Found %d old pa statements." % len(old_mk_set))
    new_mk_set = self._get_unique_statements(db, new_stmts,
                                             len(new_stmt_ids), old_mk_set)
    self._log("Found %d new pa statements." % len(new_mk_set))

    # Now find the new support links that need to be added.
    new_support_links = set()
    for npa_batch in self._pa_batch_iter(db, in_mks=new_mk_set):
        some_support_links = set()

        # Compare internally.
        some_support_links |= self._get_support_links(npa_batch)

        # Compare against the other new batch statements.
        diff_new_mks = new_mk_set - {s.get_hash(shallow=True)
                                     for s in npa_batch}
        for diff_npa_batch in self._pa_batch_iter(db, in_mks=diff_new_mks):
            split_idx = len(npa_batch)
            full_list = npa_batch + diff_npa_batch
            some_support_links |= \
                self._get_support_links(full_list, split_idx=split_idx,
                                        poolsize=self.n_proc)

        # Compare against the existing statements.
        for opa_batch in self._pa_batch_iter(db, in_mks=old_mk_set):
            split_idx = len(npa_batch)
            full_list = npa_batch + opa_batch
            some_support_links |= \
                self._get_support_links(full_list, split_idx=split_idx,
                                        poolsize=self.n_proc)

        new_support_links |= (some_support_links - existing_links)

        # There are generally few support links compared to the number of
        # statements, so it doesn't make sense to copy every time, but for
        # long preassembly, this allows for better failure recovery.
        if len(new_support_links) >= self.batch_size:
            self._log("Copying batch of %d support links into db."
                      % len(new_support_links))
            db.copy('pa_support_links', new_support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))
            existing_links |= new_support_links
            new_support_links = set()

    # Insert any remaining support links.
    if new_support_links:
        self._log("Copying final batch of %d support links into db."
                  % len(new_support_links))
        db.copy('pa_support_links', new_support_links,
                ('supported_mk_hash', 'supporting_mk_hash'))
        existing_links |= new_support_links

    return True
def create_corpus(self, db, continuing=False):
    """Initialize the table of preassembled statements.

    This method will find the set of unique knowledge represented in the
    table of raw statements, and it will populate the table of preassembled
    statements (PAStatements/pa_statements), while maintaining links between
    the raw statements and their unique (pa) counterparts. Furthermore, the
    refinement/support relationships between unique statements will be found
    and recorded in the PASupportLinks/pa_support_links table.

    For more detail on preassembly, see indra/preassembler/__init__.py
    """
    self.__tag = 'create'

    # Get the statements.
    stmt_ids = distill_stmts(db, num_procs=self.n_proc)
    if continuing:
        self._log("Getting set of statements already de-duplicated...")
        checked_raw_stmt_ids, pa_stmt_hashes = \
            zip(*db.select_all([db.RawUniqueLinks.raw_stmt_uuid,
                                db.RawUniqueLinks.pa_stmt_mk_hash]))
        stmt_ids -= set(checked_raw_stmt_ids)
        done_pa_ids = set(pa_stmt_hashes)
        self._log("Found %d preassembled statements already done."
                  % len(done_pa_ids))
    else:
        done_pa_ids = set()
    stmts = ((sid, _stmt_from_json(s_json)) for sid, s_json
             in db.select_all([db.RawStatements.id, db.RawStatements.json],
                              db.RawStatements.id.in_(stmt_ids),
                              yield_per=self.batch_size))
    self._log("Found %d statements in all." % len(stmt_ids))

    # Get the set of unique statements.
    if stmt_ids:
        self._get_unique_statements(db, stmts, len(stmt_ids), done_pa_ids)

    # If we are continuing, check for support links that were already found.
    if continuing:
        self._log("Getting pre-existing links...")
        db_existing_links = db.select_all([
            db.PASupportLinks.supported_mk_hash,
            db.PASupportLinks.supporting_mk_hash
            ])
        existing_links = {tuple(res) for res in db_existing_links}
        self._log("Found %d existing links." % len(existing_links))
    else:
        existing_links = set()

    # Now get the support links between all batches.
    support_links = set()
    for outer_batch in self._pa_batch_iter(db):
        # Get internal support links.
        some_support_links = self._get_support_links(outer_batch,
                                                     poolsize=self.n_proc)
        outer_mk_hashes = {s.get_hash(shallow=True) for s in outer_batch}

        # Get links with all other batches.
        for inner_batch in self._pa_batch_iter(db, ex_mks=outer_mk_hashes):
            split_idx = len(inner_batch)
            full_list = inner_batch + outer_batch
            some_support_links |= \
                self._get_support_links(full_list, split_idx=split_idx,
                                        poolsize=self.n_proc)

        # Add all the new support links.
        support_links |= (some_support_links - existing_links)

        # There are generally few support links compared to the number of
        # statements, so it doesn't make sense to copy every time, but for
        # long preassembly, this allows for better failure recovery.
        if len(support_links) >= self.batch_size:
            self._log("Copying batch of %d support links into db."
                      % len(support_links))
            db.copy('pa_support_links', support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))
            existing_links |= support_links
            support_links = set()

    # Insert any remaining support links.
    if support_links:
        self._log("Copying final batch of %d support links into db."
                  % len(support_links))
        db.copy('pa_support_links', support_links,
                ('supported_mk_hash', 'supporting_mk_hash'))

    return True
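# A minimal usage sketch for the two methods above, assuming they belong to
# the PreassemblyManager class instantiated in the tests; the wrapper name
# and batch size are illustrative assumptions.
def _example_initial_then_incremental(db):
    pa_manager = PreassemblyManager(batch_size=10000)
    # One-time initialization of the pa_statements table.
    pa_manager.create_corpus(db)
    # Later, after new raw statements have been loaded, fold them in without
    # redoing preassembly from scratch.
    pa_manager.supplement_corpus(db)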
def supplement_corpus(self, db, continuing=False):
    """Update the table of preassembled statements.

    This method will take any new raw statements that have not yet been
    incorporated into the preassembled table, and use them to augment the
    preassembled table.

    The resulting updated table is indistinguishable from the result you
    would achieve if you had simply re-run preassembly on _all_ the raw
    statements.
    """
    pickle_stashes = []
    self.__tag = 'supplement'

    last_update = self._get_latest_updatetime(db)
    self._log("Latest update was: %s" % last_update)

    # Get the new statements...
    self._log("Loading info about the existing state of preassembly. "
              "(This may take a little time)")
    new_id_stash = 'new_ids.pkl'
    pickle_stashes.append(new_id_stash)
    if continuing and path.exists(new_id_stash):
        with open(new_id_stash, 'rb') as f:
            new_ids = pickle.load(f)
    else:
        new_ids = self._get_new_stmt_ids(db)

        # Stash the new ids in case we need to pick up where we left off.
        with open(new_id_stash, 'wb') as f:
            pickle.dump(new_ids, f)

    # Weed out exact duplicates.
    dist_stash = 'stmt_ids.pkl'
    pickle_stashes.append(dist_stash)
    if continuing and path.exists(dist_stash):
        with open(dist_stash, 'rb') as f:
            stmt_ids = pickle.load(f)
    else:
        stmt_ids = distill_stmts(db, num_procs=self.n_proc,
                                 get_full_stmts=False)
        with open(dist_stash, 'wb') as f:
            pickle.dump(stmt_ids, f)
    new_stmt_ids = new_ids & stmt_ids

    # Get the set of new unique statements and link to any new evidence.
    old_mk_set = {mk for mk, in db.select_all(db.PAStatements.mk_hash)}
    self._log("Found %d old pa statements." % len(old_mk_set))
    new_mk_stash = 'new_mk_set.pkl'
    pickle_stashes.append(new_mk_stash)
    if continuing and path.exists(new_mk_stash):
        with open(new_mk_stash, 'rb') as f:
            new_mk_set = pickle.load(f)
    else:
        new_mk_set = self._get_unique_statements(db, new_stmt_ids,
                                                 len(new_stmt_ids),
                                                 old_mk_set)
        with open(new_mk_stash, 'wb') as f:
            pickle.dump(new_mk_set, f)

    # If we are continuing, check for support links that were already found.
    support_link_stash = 'new_support_links.pkl'
    pickle_stashes.append(support_link_stash)
    if continuing and path.exists(support_link_stash):
        with open(support_link_stash, 'rb') as f:
            existing_links = pickle.load(f)
        self._log("Found %d existing links." % len(existing_links))
    else:
        existing_links = set()
    self._log("Found %d new pa statements." % len(new_mk_set))

    # Now find the new support links that need to be added.
    new_support_links = set()
    new_stmt_iter = self._pa_batch_iter(db, in_mks=new_mk_set)
    try:
        for i, npa_batch in enumerate(new_stmt_iter):
            # Compare internally.
            self._log("Getting support for new pa batch %d." % i)
            some_support_links = self._get_support_links(npa_batch)

            # Compare against the other new batch statements.
            diff_new_mks = new_mk_set - {shash(s) for s in npa_batch}
            other_new_stmt_iter = self._pa_batch_iter(db,
                                                      in_mks=diff_new_mks)
            for j, diff_npa_batch in enumerate(other_new_stmt_iter):
                split_idx = len(npa_batch)
                full_list = npa_batch + diff_npa_batch
                self._log("Comparing %d to batch %d of other new "
                          "statements." % (i, j))
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            # Compare against the existing statements.
            old_stmt_iter = self._pa_batch_iter(db, in_mks=old_mk_set)
            for k, opa_batch in enumerate(old_stmt_iter):
                split_idx = len(npa_batch)
                full_list = npa_batch + opa_batch
                self._log("Comparing %d to batch %d of old statements."
                          % (i, k))
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            new_support_links |= (some_support_links - existing_links)

            # There are generally few support links compared to the number
            # of statements, so it doesn't make sense to copy every time,
            # but for long preassembly, this allows for better failure
            # recovery.
            if len(new_support_links) >= self.batch_size:
                self._log("Copying batch of %d support links into db."
                          % len(new_support_links))
                db.copy('pa_support_links', new_support_links,
                        ('supported_mk_hash', 'supporting_mk_hash'))
                existing_links |= new_support_links
                new_support_links = set()

        # Insert any remaining support links.
        if new_support_links:
            self._log("Copying final batch of %d support links into db."
                      % len(new_support_links))
            db.copy('pa_support_links', new_support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))
            existing_links |= new_support_links
    except Exception:
        logger.info("Stashing support links found so far.")
        if new_support_links:
            with open(support_link_stash, 'wb') as f:
                pickle.dump(existing_links, f)
        raise

    # Remove all the caches so they can't be picked up accidentally later.
    for cache in pickle_stashes:
        if path.exists(cache):
            remove(cache)

    return True
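# Sketch of the recovery pattern the pickle stashes above enable: if a
# supplement run dies partway through, a second call with continuing=True
# reloads new_ids.pkl, stmt_ids.pkl, new_mk_set.pkl, and
# new_support_links.pkl instead of recomputing them. The wrapper name,
# batch size, and retry-once strategy are illustrative assumptions.
def _example_resumable_supplement(db):
    pa_manager = PreassemblyManager(batch_size=10000)
    try:
        pa_manager.supplement_corpus(db)
    except Exception:
        # The stashes remain on disk, so the retry picks up where it left off.
        pa_manager.supplement_corpus(db, continuing=True)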
def create_corpus(self, db, continuing=False):
    """Initialize the table of preassembled statements.

    This method will find the set of unique knowledge represented in the
    table of raw statements, and it will populate the table of preassembled
    statements (PAStatements/pa_statements), while maintaining links between
    the raw statements and their unique (pa) counterparts. Furthermore, the
    refinement/support relationships between unique statements will be found
    and recorded in the PASupportLinks/pa_support_links table.

    For more detail on preassembly, see indra/preassembler/__init__.py
    """
    self.__tag = 'create'

    # Get filtered statement ID's.
    sid_cache_fname = path.join(HERE, 'stmt_id_cache.pkl')
    if continuing and path.exists(sid_cache_fname):
        with open(sid_cache_fname, 'rb') as f:
            stmt_ids = pickle.load(f)
    else:
        # Get the statement ids.
        stmt_ids = distill_stmts(db, num_procs=self.n_proc)
        with open(sid_cache_fname, 'wb') as f:
            pickle.dump(stmt_ids, f)

    # Handle the possibility we're picking up after an earlier job...
    done_pa_ids = set()
    if continuing:
        self._log("Getting set of statements already de-duplicated...")
        link_resp = db.select_all([db.RawUniqueLinks.raw_stmt_id,
                                   db.RawUniqueLinks.pa_stmt_mk_hash])
        if link_resp:
            checked_raw_stmt_ids, pa_stmt_hashes = zip(*link_resp)
            stmt_ids -= set(checked_raw_stmt_ids)
            done_pa_ids = set(pa_stmt_hashes)
            self._log("Found %d preassembled statements already done."
                      % len(done_pa_ids))

    # Get the set of unique statements.
    self._get_unique_statements(db, stmt_ids, len(stmt_ids), done_pa_ids)

    # If we are continuing, check for support links that were already found.
    if continuing:
        self._log("Getting pre-existing links...")
        db_existing_links = db.select_all([
            db.PASupportLinks.supported_mk_hash,
            db.PASupportLinks.supporting_mk_hash
            ])
        existing_links = {tuple(res) for res in db_existing_links}
        self._log("Found %d existing links." % len(existing_links))
    else:
        existing_links = set()

    # Now get the support links between all batches.
    support_links = set()
    for i, outer_batch in enumerate(self._pa_batch_iter(db)):
        # Get internal support links.
        self._log('Getting internal support links outer batch %d.' % i)
        some_support_links = self._get_support_links(outer_batch,
                                                     poolsize=self.n_proc)
        outer_mk_hashes = {shash(s) for s in outer_batch}

        # Get links with all other batches.
        ib_iter = self._pa_batch_iter(db, ex_mks=outer_mk_hashes)
        for j, inner_batch in enumerate(ib_iter):
            split_idx = len(inner_batch)
            full_list = inner_batch + outer_batch
            self._log('Getting support compared to other batch %d of outer '
                      'batch %d.' % (j, i))
            some_support_links |= \
                self._get_support_links(full_list, split_idx=split_idx,
                                        poolsize=self.n_proc)

        # Add all the new support links.
        support_links |= (some_support_links - existing_links)

        # There are generally few support links compared to the number of
        # statements, so it doesn't make sense to copy every time, but for
        # long preassembly, this allows for better failure recovery.
        if len(support_links) >= self.batch_size:
            self._log("Copying batch of %d support links into db."
                      % len(support_links))
            db.copy('pa_support_links', support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))
            existing_links |= support_links
            support_links = set()

    # Insert any remaining support links.
    if support_links:
        self._log("Copying final batch of %d support links into db."
                  % len(support_links))
        db.copy('pa_support_links', support_links,
                ('supported_mk_hash', 'supporting_mk_hash'))

    # Delete the pickle cache.
    if path.exists(sid_cache_fname):
        remove(sid_cache_fname)

    return True
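# Illustration of the batched copy pattern used in both methods above:
# support links accumulate as (supported_mk_hash, supporting_mk_hash) tuples
# and are flushed to the pa_support_links table whenever the set grows past
# batch_size. This standalone flush helper is a hedged sketch for clarity,
# not part of the class.
def _example_flush_support_links(db, support_links, batch_size):
    if len(support_links) >= batch_size:
        db.copy('pa_support_links', support_links,
                ('supported_mk_hash', 'supporting_mk_hash'))
        support_links = set()
    return support_links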