def _check_preassembly_with_database(num_stmts, batch_size):
    """Create a preassembled corpus in a test database and validate it.

    Loads a test database via ``get_pa_loaded_db``, runs
    ``PreassemblyManager.create_corpus`` on it, and then asserts that the
    resulting pa statements, evidence links, and support links are
    consistent. Finally the pa statements are compared against
    old-fashioned preassembly (opa) via ``_check_against_opa_stmts``.

    Parameters
    ----------
    num_stmts :
        Passed to ``get_pa_loaded_db`` when building the test database.
    batch_size :
        Passed to ``PreassemblyManager`` as its ``batch_size``.

    Raises
    ------
    AssertionError
        If any of the consistency checks fails.
    """
    db = get_pa_loaded_db(num_stmts)

    # Now test the set of preassembled (pa) statements from the database
    # against what we get from old-fashioned preassembly (opa).
    opa_inp_stmts = _get_opa_input_stmts(db)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_ids = {raw_stmt.id for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size,
                                       print_logs=True)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end - start)

    # Make sure the number of pa statements is within reasonable bounds.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the evidence links.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_ids = {ru.raw_stmt_id for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Every linked raw statement id must be a known raw statement id.
    # (Compared by value: an identity check against the literal 0 is
    # implementation-dependent.)
    assert all_link_ids <= all_raw_ids
    # Every pa statement must have at least one evidence link.
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # Check the support links.
    sup_links = db.select_all([
        db.PASupportLinks.supporting_mk_hash,
        db.PASupportLinks.supported_mk_hash
    ])
    assert sup_links
    assert not any(link[0] == link[1] for link in sup_links), \
        "Found self-support in the database."

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), \
        (len(pa_stmts), len(pa_stmt_list))

    # No statement may appear among its own supports / supported_by.
    self_supporting = [
        shash(s) for s in pa_stmts
        if shash(s) in {shash(s_) for s_ in s.supported_by + s.supports}
    ]
    assert not self_supporting, \
        "Found self-support in constructed pa statement objects."

    _check_against_opa_stmts(db, opa_inp_stmts, pa_stmts)
def _get_opa_input_stmts(db):
    """Gather the raw statements used as input for old-style preassembly.

    Combines the filtered reading statements (from ``db_util``) with the
    raw statements whose ``reading_id`` is NULL, prints how many statements
    were collected, and returns the combined collection.

    Parameters
    ----------
    db :
        Database handle; must expose ``RawStatements`` and be accepted by
        the ``db_util`` / ``db_client`` helpers.

    Returns
    -------
    The union (``|``) of the filtered reading statements and the set of
    statements that have no reading.
    """
    # Statements that came out of readings, after filtering.
    nested_reading_stmts = db_util.get_reading_stmt_dict(
        db, get_full_stmts=True
    )
    from_readings, _ = db_util.get_filtered_rdg_stmts(
        nested_reading_stmts, get_full_stmts=True
    )

    # Statements that are not attached to any reading.
    no_reading_clause = db.RawStatements.reading_id.is_(None)
    without_reading = db_client.get_statements(
        [no_reading_clause], preassembled=False, db=db
    )

    combined = from_readings | set(without_reading)
    print("Got %d statements for opa." % len(combined))
    return combined
def test_get_statements():
    """Exercise ``dbc.get_statements`` against a prepped test database.

    Checks that:
    * all raw statements can be retrieved,
    * statements from readings carry PMIDs for which ``pubc`` returns
      metadata,
    * a filter clause excluding one uuid returns one fewer statement,
    * with ``fix_refs=False`` some SPARSER statements have a ``None`` PMID.

    Raises
    ------
    AssertionError
        If any of the above expectations is not met.
    """
    num_stmts = 10000
    db, _ = _get_prepped_db(num_stmts)

    # Test getting all statements
    stmts = dbc.get_statements([], preassembled=False, db=db)
    assert len(stmts) == num_stmts, len(stmts)

    stmts = dbc.get_statements([db.RawStatements.reading_id.isnot(None)],
                               preassembled=False, db=db)
    # Cap the sample size at the population size: random.sample raises
    # ValueError when asked for more items than are available, which would
    # hide the real assertion failures below.
    sampled = random.sample(stmts, min(100, len(stmts)))
    pmids = {s.evidence[0].pmid for s in sampled}
    assert pmids
    assert pmids != {None}
    known_pmids = pmids - {None}
    md_list = pubc.get_metadata_for_ids(list(known_pmids))
    assert len(md_list) == len(known_pmids), \
        (len(md_list), len(known_pmids))

    # Test getting some statements
    stmt_uuid = stmts[0].uuid
    stmts = dbc.get_statements([db.RawStatements.uuid != stmt_uuid],
                               preassembled=False, db=db)
    assert len(stmts) == num_stmts - 1, len(stmts)

    # Test getting statements without fix refs.
    stmts = dbc.get_statements([
        db.RawStatements.reading_id.isnot(None),
        db.RawStatements.reading_id == db.Reading.id,
        db.Reading.reader == 'SPARSER'
    ], preassembled=False, fix_refs=False, db=db)
    assert 0 < len(stmts) < num_stmts, len(stmts)
    # Same cap as above; the preceding assert only guarantees one statement.
    sampled = random.sample(stmts, min(200, len(stmts)))
    pmids = {s.evidence[0].pmid for s in sampled}
    assert None in pmids, pmids
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8, n_proc=1):
    """Run an incremental corpus supplement and compare it against opa.

    Builds a ``PreassemblyManager``, loads a test database with it via
    ``_get_loaded_db``, runs ``supplement_corpus`` and then checks the
    resulting preassembled statements against old-style preassembly with
    ``_check_against_opa_stmts``.

    Parameters
    ----------
    num_stmts :
        Passed to ``_get_loaded_db`` when building the test database.
    batch_size :
        Passed to ``PreassemblyManager`` as its ``batch_size``.
    split :
        Passed to ``_get_loaded_db`` as ``split`` (default 0.8).
    n_proc :
        Passed to ``PreassemblyManager`` as ``n_proc`` (default 1).
    """
    pa_manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                       print_logs=True)
    db = _get_loaded_db(num_stmts, split=split, pam=pa_manager)
    opa_inp_stmts = _get_opa_input_stmts(db)

    # NOTE(review): presumably this pause separates the timestamps of the
    # initial load from those of the supplement -- confirm.
    print('sleeping...')
    sleep(5)

    print("Beginning supplement...")
    # Start the clock only after the pause so the reported duration covers
    # the supplement itself rather than the fixed sleep.
    start = datetime.now()
    pa_manager.supplement_corpus(db)
    end = datetime.now()
    print("Duration of incremental update:", end - start)

    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    _check_against_opa_stmts(db, opa_inp_stmts, pa_stmts)