Exemplo n.º 1
0
def test_get_statements():
    """Exercise ``dbc.get_statements`` for raw statements on a prepped db.

    The test checks, in order, that:

    * an empty clause list returns all ``num_stmts`` raw statements;
    * statements that have a reading id carry non-None pmids on their
      first evidence, and the metadata lookup returns one entry per pmid;
    * excluding a single uuid returns exactly one statement fewer;
    * with ``fix_refs=False`` the SPARSER reading statements are a strict,
      non-empty subset and at least one sampled pmid is None.
    """
    num_stmts = 10000
    # Upper bound on how many statements are spot-checked for pmids.
    max_sample = 200
    db = _get_prepped_db(num_stmts)

    # Test getting all statements
    stmts = dbc.get_statements([], preassembled=False, db=db)
    assert len(stmts) == num_stmts, len(stmts)

    stmts = dbc.get_statements([db.RawStatements.reading_id.isnot(None)],
                               preassembled=False,
                               db=db)
    # Cap the sample at the population size: random.sample raises ValueError
    # when asked for more items than exist, which would hide the real
    # assertion failure behind an unrelated error.
    sampled = random.sample(stmts, min(max_sample, len(stmts)))
    pmids = {s.evidence[0].pmid for s in sampled}
    assert pmids
    assert None not in pmids
    md_list = pubc.get_metadata_for_ids(list(pmids))
    assert len(md_list) == len(pmids), (len(md_list), len(pmids))

    # Test getting some statements
    stmt_uuid = stmts[0].uuid
    stmts = dbc.get_statements([db.RawStatements.uuid != stmt_uuid],
                               preassembled=False,
                               db=db)
    assert len(stmts) == num_stmts - 1, len(stmts)

    # Test getting statements without fix refs.
    sparser_clauses = [
        db.RawStatements.reading_id.isnot(None),
        db.RawStatements.reading_id == db.Reading.id,
        db.Reading.reader == 'SPARSER',
    ]
    stmts = dbc.get_statements(sparser_clauses,
                               preassembled=False,
                               fix_refs=False,
                               db=db)
    assert 0 < len(stmts) < num_stmts, len(stmts)
    # Same cap as above; the SPARSER subset may hold fewer than max_sample.
    sampled = random.sample(stmts, min(max_sample, len(stmts)))
    pmids = {s.evidence[0].pmid for s in sampled}
    assert None in pmids, pmids
Exemplo n.º 2
0
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8):
    """Run an incremental corpus supplement and compare against opa.

    A database is loaded with `num_stmts` statements (passing `split` and
    ``with_init_corpus=True`` to the loader), a `PreassemblyManager` with
    the given `batch_size` supplements the corpus, and the elapsed time is
    printed. The raw and preassembled statements are then read back from
    the database and handed to `_check_against_opa_stmts`.
    """
    db = _get_loaded_db(num_stmts, split=split, with_init_corpus=True)

    # The timed window covers the manager construction and the supplement.
    began = datetime.now()
    manager = pm.PreassemblyManager(batch_size=batch_size)
    print("Beginning supplement...")
    manager.supplement_corpus(db)
    elapsed = datetime.now() - began
    print("Duration of incremental update:", elapsed)

    # Fetch the raw statements first, then the preassembled ones.
    _check_against_opa_stmts(
        db,
        db_client.get_statements([], preassembled=False, db=db),
        db_client.get_statements([], preassembled=True, db=db,
                                 with_support=True),
    )
Exemplo n.º 3
0
def _get_opa_input_stmts(db):
    """Collect the statements used as input for old-fashioned preassembly.

    The result is the union of two groups: the filtered reading statements
    produced by the `db_util` helpers (with duplicates ignored), and every
    raw statement in `db` whose ``reading_id`` is NULL.

    Parameters
    ----------
    db :
        The database handle to read statements from.

    Returns
    -------
    set
        The combined statements; the count is also printed.
    """
    stmt_nd = db_util._get_reading_statement_dict(db, get_full_stmts=True)
    reading_stmts, _, _ = db_util._get_filtered_rdg_statements(
        stmt_nd, get_full_stmts=True, ignore_duplicates=True)

    # Use the explicit column-operator form of the NULL test rather than
    # `== None`; this mirrors the `.isnot(None)` calls made on the same
    # column elsewhere in this file (presumably a SQLAlchemy column).
    db_stmts = db_client.get_statements(
        [db.RawStatements.reading_id.is_(None)], preassembled=False, db=db)

    stmts = reading_stmts | set(db_stmts)
    print("Got %d statements for opa." % len(stmts))
    return stmts
Exemplo n.º 4
0
def _check_preassembly_with_database(num_stmts, batch_size, n_proc=1):
    """Create a preassembled corpus in a test database and validate it.

    A database is loaded with `num_stmts` statements, a
    `PreassemblyManager` (configured with `batch_size` and `n_proc`) builds
    the corpus, and the resulting tables are checked:

    * there are some pa statements, but fewer than raw statements;
    * every raw-unique link points at an existing raw statement, and every
      pa statement is referenced by at least one link;
    * support links exist and none links a statement to itself;
    * the pa statements loaded through `db_client` match the table in
      number and contain no self-support;
    * the pa statements agree with old-fashioned preassembly (opa).
    """
    db = _get_loaded_db(num_stmts)

    # Collect the input for old-fashioned preassembly (opa) up front; the
    # database's preassembled (pa) statements are compared to it at the end.
    opa_inp_stmts = _get_opa_input_stmts(db)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_ids = {raw_stmt.id for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                       print_logs=True)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end-start)

    # Make sure the number of pa statements is within reasonable bounds.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the evidence links.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_ids = {ru.raw_stmt_id for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Compare by value: `is 0` tests object identity, which only happens to
    # work through CPython's small-int cache and emits a SyntaxWarning on
    # Python 3.8+.
    orphan_link_ids = all_link_ids - all_raw_ids
    assert not orphan_link_ids, orphan_link_ids
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # Check the support links.
    sup_links = db.select_all([db.PASupportLinks.supporting_mk_hash,
                               db.PASupportLinks.supported_mk_hash])
    assert sup_links
    assert not any(link[0] == link[1] for link in sup_links),\
        "Found self-support in the database."

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # No constructed statement may appear among its own supports or
    # supported-by statements.
    for s in pa_stmts:
        related_hashes = {shash(s_) for s_ in s.supported_by + s.supports}
        assert shash(s) not in related_hashes,\
            "Found self-support in constructed pa statement objects."

    _check_against_opa_stmts(db, opa_inp_stmts, pa_stmts)
Exemplo n.º 5
0
def _check_preassembly_with_database(num_stmts, batch_size):
    """Create a preassembled corpus in a test database and validate it.

    A database is loaded with `num_stmts` statements, a
    `PreassemblyManager` with the given `batch_size` builds the corpus, and
    the resulting tables are checked: the pa statement count is positive
    but below the raw count, every raw-unique link points at an existing
    raw statement, every pa statement is referenced by a link, and support
    links exist. Finally the pa statements loaded through `db_client` are
    compared with old-fashioned preassembly (opa) of the raw statements.
    """
    db = _get_loaded_db(num_stmts)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_ids = {raw_stmt.id for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end-start)

    # Make sure the number of pa statements is within reasonable bounds.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the evidence links.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_ids = {ru.raw_stmt_id for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Compare by value: `is 0` tests object identity, which only happens to
    # work through CPython's small-int cache and emits a SyntaxWarning on
    # Python 3.8+.
    orphan_link_ids = all_link_ids - all_raw_ids
    assert not orphan_link_ids, orphan_link_ids
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # Check that some support links were created.
    num_support_links = db.filter_query(db.PASupportLinks).count()
    assert num_support_links

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Now test the set of preassembled (pa) statements from the database against
    # what we get from old-fashioned preassembly (opa).
    raw_stmts = db_client.get_statements([], preassembled=False, db=db)
    _check_against_opa_stmts(db, raw_stmts, pa_stmts)
Exemplo n.º 6
0
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8, n_proc=1):
    """Run an incremental corpus supplement and compare against opa.

    A `PreassemblyManager` (configured with `batch_size`, `n_proc` and log
    printing) is created first and passed to the database loader along
    with `num_stmts` and `split`. The opa input statements are collected,
    the manager supplements the corpus while being timed, and the
    preassembled statements read back from the database are handed to
    `_check_against_opa_stmts`.
    """
    manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                    print_logs=True)
    db = _get_loaded_db(num_stmts, split=split, pam=manager)
    opa_inputs = _get_opa_input_stmts(db)

    # Only the supplement step itself is timed.
    began = datetime.now()
    print("Beginning supplement...")
    manager.supplement_corpus(db)
    elapsed = datetime.now() - began
    print("Duration of incremental update:", elapsed)

    preassembled = db_client.get_statements(
        [], preassembled=True, db=db, with_support=True)
    _check_against_opa_stmts(db, opa_inputs, preassembled)
Exemplo n.º 7
0
def preassemble_db_stmts(db, num_proc, *clauses):
    """Run pre-assembly on a set of statements in the database.

    The statements selected by `clauses` are fetched from `db` (with
    ``do_stmt_count=False``), passed to `process_statements` using
    `num_proc` as the pool size, and the resulting unique statements are
    inserted into `db`.

    Returns the unique statements together with the match-key maps, as
    produced by `process_statements`.
    """
    selected = get_statements(clauses, db=db, do_stmt_count=False)
    deduplicated, key_maps = process_statements(selected, poolsize=num_proc)

    # Store the unique statements before handing the results back.
    insert_pa_stmts(db, deduplicated)
    return deduplicated, key_maps