예제 #1
0
def _check_statement_distillation(num_stmts):
    """Check that statement distillation is self-consistent.

    A test database is loaded with ``_get_loaded_db(num_stmts)`` and
    ``db_util.distill_stmts`` is run in four modes: full statements, ids only,
    and both of those again with ``num_procs=2``. The results are required to
    agree in size (and, for the ids, in value).

    Parameters
    ----------
    num_stmts : int
        Passed through to ``_get_loaded_db`` to build the test database.
    """
    db = _get_loaded_db(num_stmts)
    assert db is not None, "Test was broken. Got None instead of db insance."

    # Single-process run returning full statement objects.
    stmts = db_util.distill_stmts(db, get_full_stmts=True)
    assert len(stmts), "Got zero statements."
    first_stmt = next(iter(stmts))
    assert isinstance(first_stmt, Statement), type(first_stmt)

    # Single-process run returning only ids.
    stmt_ids = db_util.distill_stmts(db)
    assert len(stmts) == len(stmt_ids), \
        "stmts: %d, stmt_ids: %d" % (len(stmts), len(stmt_ids))
    first_id = next(iter(stmt_ids))
    assert isinstance(first_id, int), type(first_id)

    # Multi-process run returning full statements. The original made the same
    # ids-only call twice here, so the parallel full-statement path was never
    # exercised.
    stmts_p = db_util.distill_stmts(db, num_procs=2, get_full_stmts=True)
    assert len(stmts_p) == len(stmt_ids)

    # Multi-process run returning only ids must match the serial result.
    stmt_ids_p = db_util.distill_stmts(db, num_procs=2)
    assert stmt_ids_p == stmt_ids
예제 #2
0
def _check_preassembly_with_database(num_stmts, batch_size):
    """Run corpus creation on a test database and check the resulting tables.

    Parameters
    ----------
    num_stmts : int
        Passed through to ``_get_loaded_db`` to build the test database.
    batch_size : int
        Batch size handed to ``pm.PreassemblyManager``.
    """
    db = _get_loaded_db(num_stmts)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_uuids = {raw_stmt.uuid for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end - start)

    # There must be some pa statements, but fewer than there are raw ones.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Every link must point at a known raw statement, and every pa statement
    # must be the target of at least one link.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_uuids = {ru.raw_stmt_uuid for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Compare by value: `is 0` tests object identity, which is an
    # implementation detail for ints and a SyntaxWarning on Python >= 3.8.
    assert not (all_link_uuids - all_raw_uuids)
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)
    num_support_links = db.filter_query(db.PASupportLinks).count()
    assert num_support_links

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([],
                                        preassembled=True,
                                        db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Now test the set of preassembled (pa) statements from the database against
    # what we get from old-fashioned preassembly (opa).
    raw_stmts = db_util.distill_stmts(db, get_full_stmts=True)
    _check_against_opa_stmts(raw_stmts, pa_stmts)
예제 #3
0
def _check_db_pa_supplement(num_stmts,
                            batch_size,
                            init_batch_size=None,
                            split=0.8):
    """Supplement an initialized test corpus and compare with plain preassembly.

    A test database with an initial corpus is loaded through
    ``_get_loaded_db``, ``supplement_corpus`` is run (and timed) on it, and the
    preassembled statements read back from the database are handed, together
    with the distilled raw statements, to ``_check_against_opa_stmts``.

    Parameters
    ----------
    num_stmts : int
        Passed through to ``_get_loaded_db``.
    batch_size : int
        Batch size for the ``pm.PreassemblyManager`` doing the supplement.
    init_batch_size : int or None
        Batch size passed to ``_get_loaded_db``; any falsy value means
        "use ``batch_size``".
    split : float
        Passed through to ``_get_loaded_db``.
    """
    # A falsy initial batch size falls back to the supplement batch size.
    init_batch_size = init_batch_size or batch_size
    db = _get_loaded_db(num_stmts, batch_size=init_batch_size, split=split,
                        with_init_corpus=True)

    # Time the incremental update, including construction of the manager.
    t_begin = datetime.now()
    manager = pm.PreassemblyManager(batch_size=batch_size)
    print("Beginning supplement...")
    manager.supplement_corpus(db)
    t_finish = datetime.now()
    print("Duration of incremental update:", t_finish - t_begin)

    # Compare what is now in the database with old-fashioned preassembly.
    distilled_raw = db_util.distill_stmts(db, get_full_stmts=True)
    db_pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                           with_support=True)
    _check_against_opa_stmts(distilled_raw, db_pa_stmts)
예제 #4
0
    def supplement_corpus(self, db, continuing=False):
        """Update the table of preassembled statements.

        This method will take any new raw statements that have not yet been
        incorporated into the preassembled table, and use them to augment the
        preassembled table.

        The resulting updated table is indistinguishable from the result you
        would achieve if you had simply re-run preassembly on _all_ the
        raw statements.

        Parameters
        ----------
        db : object
            Database handle; this method uses its ``select_all`` and ``copy``
            methods and its ``RawStatements``, ``PAStatements`` and
            ``PASupportLinks`` tables.
        continuing : bool
            If True, the support links already stored in the database are
            loaded first and are not copied again. Default is False.

        Returns
        -------
        bool
            True once the update has run to completion.
        """
        self.__tag = 'supplement'
        last_update = self._get_latest_updatetime(db)
        self._log("Latest update was: %s" % last_update)

        # Get the new statements...
        self._log("Loading info about the existing state of preassembly. "
                  "(This may take a little time)")
        new_ids = self._get_new_stmt_ids(db)

        # If we are continuing, check for support links that were already found.
        if continuing:
            self._log("Getting pre-existing links...")
            # The column order must match the (supported, supporting) order of
            # the tuples copied into pa_support_links below, otherwise the set
            # difference against the newly found links never removes anything.
            db_existing_links = db.select_all([
                db.PASupportLinks.supported_mk_hash,
                db.PASupportLinks.supporting_mk_hash
            ])
            existing_links = {tuple(res) for res in db_existing_links}
            self._log("Found %d existing links." % len(existing_links))
        else:
            existing_links = set()

        # Weed out exact duplicates.
        stmt_ids = distill_stmts(db, num_procs=self.n_proc)
        new_stmt_ids = new_ids & stmt_ids
        self._log("There are %d new distilled raw statement ids." %
                  len(new_stmt_ids))
        # Lazily rebuild statements from their JSON, batch_size rows at a time.
        new_stmts = ((sid, _stmt_from_json(s_json))
                     for sid, s_json in db.select_all(
                         [db.RawStatements.id, db.RawStatements.json],
                         db.RawStatements.id.in_(new_stmt_ids),
                         yield_per=self.batch_size))

        # Get the set of new unique statements and link to any new evidence.
        old_mk_set = {mk for mk, in db.select_all(db.PAStatements.mk_hash)}
        self._log("Found %d old pa statements." % len(old_mk_set))
        new_mk_set = self._get_unique_statements(db, new_stmts,
                                                 len(new_stmt_ids), old_mk_set)
        self._log("Found %d new pa statements." % len(new_mk_set))

        # Now find the new support links that need to be added.
        new_support_links = set()
        for npa_batch in self._pa_batch_iter(db, in_mks=new_mk_set):
            some_support_links = set()

            # Compare internally
            some_support_links |= self._get_support_links(npa_batch)

            # Compare against the other new batch statements.
            diff_new_mks = new_mk_set - {
                s.get_hash(shallow=True)
                for s in npa_batch
            }
            for diff_npa_batch in self._pa_batch_iter(db, in_mks=diff_new_mks):
                split_idx = len(npa_batch)
                full_list = npa_batch + diff_npa_batch
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            # Compare against the existing statements.
            for opa_batch in self._pa_batch_iter(db, in_mks=old_mk_set):
                split_idx = len(npa_batch)
                full_list = npa_batch + opa_batch
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            # Only keep links that have not been stored already.
            new_support_links |= (some_support_links - existing_links)

            # There are generally few support links compared to the number of
            # statements, so it doesn't make sense to copy every time, but for
            # long preassembly, this allows for better failure recovery.
            if len(new_support_links) >= self.batch_size:
                self._log("Copying batch of %d support links into db." %
                          len(new_support_links))
                db.copy('pa_support_links', new_support_links,
                        ('supported_mk_hash', 'supporting_mk_hash'))
                existing_links |= new_support_links
                new_support_links = set()

        # Insert any remaining support links.
        if new_support_links:
            self._log("Copying batch final of %d support links into db." %
                      len(new_support_links))
            db.copy('pa_support_links', new_support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))
            existing_links |= new_support_links

        return True
예제 #5
0
    def create_corpus(self, db, continuing=False):
        """Initialize the table of preassembled statements.

        This method will find the set of unique knowledge represented in the
        table of raw statements, and it will populate the table of preassembled
        statements (PAStatements/pa_statements), while maintaining links between
        the raw statements and their unique (pa) counterparts. Furthermore, the
        refinement/support relationships between unique statements will be found
        and recorded in the PASupportLinks/pa_support_links table.

        For more detail on preassembly, see indra/preassembler/__init__.py

        Parameters
        ----------
        db : object
            Database handle; this method uses its ``select_all`` and ``copy``
            methods and its ``RawStatements``, ``RawUniqueLinks`` and
            ``PASupportLinks`` tables.
        continuing : bool
            If True, raw statements that already have a unique link and support
            links already stored in the database are skipped. Default is False.

        Returns
        -------
        bool
            True once the corpus has been created.
        """
        self.__tag = 'create'
        # Get the statements
        stmt_ids = distill_stmts(db, num_procs=self.n_proc)
        done_pa_ids = set()
        if continuing:
            self._log("Getting set of statements already de-duplicated...")
            link_resp = db.select_all([db.RawUniqueLinks.raw_stmt_uuid,
                                       db.RawUniqueLinks.pa_stmt_mk_hash])
            # With no links yet, zip(*[]) yields nothing and the two-name
            # unpacking would raise ValueError, so only unpack real results.
            if link_resp:
                checked_raw_stmt_ids, pa_stmt_hashes = zip(*link_resp)
                # NOTE(review): stmt_ids is matched against RawStatements.id
                # below, but the values removed here come from raw_stmt_uuid -
                # confirm that these are actually comparable.
                stmt_ids -= set(checked_raw_stmt_ids)
                done_pa_ids = set(pa_stmt_hashes)
            self._log("Found %d preassembled statements already done." %
                      len(done_pa_ids))
        # Lazily rebuild statements from their JSON, batch_size rows at a time.
        stmts = ((sid, _stmt_from_json(s_json)) for sid, s_json in
                 db.select_all([db.RawStatements.id, db.RawStatements.json],
                               db.RawStatements.id.in_(stmt_ids),
                               yield_per=self.batch_size))
        self._log("Found %d statements in all." % len(stmt_ids))

        # Get the set of unique statements
        if stmt_ids:
            self._get_unique_statements(db, stmts, len(stmt_ids), done_pa_ids)

        # If we are continuing, check for support links that were already found.
        if continuing:
            self._log("Getting pre-existing links...")
            # The column order must match the (supported, supporting) order of
            # the tuples copied into pa_support_links below, otherwise the set
            # difference against the newly found links never removes anything.
            db_existing_links = db.select_all([
                db.PASupportLinks.supported_mk_hash,
                db.PASupportLinks.supporting_mk_hash
            ])
            existing_links = {tuple(res) for res in db_existing_links}
            self._log("Found %d existing links." % len(existing_links))
        else:
            existing_links = set()

        # Now get the support links between all batches.
        support_links = set()
        for outer_batch in self._pa_batch_iter(db):
            # Get internal support links
            some_support_links = self._get_support_links(outer_batch,
                                                         poolsize=self.n_proc)
            outer_mk_hashes = {s.get_hash(shallow=True) for s in outer_batch}

            # Get links with all other batches
            for inner_batch in self._pa_batch_iter(db, ex_mks=outer_mk_hashes):
                split_idx = len(inner_batch)
                full_list = inner_batch + outer_batch
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            # Add all the new support links
            support_links |= (some_support_links - existing_links)

            # There are generally few support links compared to the number of
            # statements, so it doesn't make sense to copy every time, but for
            # long preassembly, this allows for better failure recovery.
            if len(support_links) >= self.batch_size:
                self._log("Copying batch of %d support links into db." %
                          len(support_links))
                db.copy('pa_support_links', support_links,
                        ('supported_mk_hash', 'supporting_mk_hash'))
                existing_links |= support_links
                support_links = set()

        # Insert any remaining support links.
        if support_links:
            self._log("Copying final batch of %d support links into db." %
                      len(support_links))
            db.copy('pa_support_links', support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))

        return True
예제 #6
0
    def supplement_corpus(self, db, continuing=False):
        """Update the table of preassembled statements.

        This method will take any new raw statements that have not yet been
        incorporated into the preassembled table, and use them to augment the
        preassembled table.

        The resulting updated table is indistinguishable from the result you
        would achieve if you had simply re-run preassembly on _all_ the
        raw statements.

        Intermediate results are stashed in pickle files so that a failed run
        can be resumed; the stashes are removed after a successful run.

        Parameters
        ----------
        db : object
            Database handle; this method uses its ``select_all`` and ``copy``
            methods and its ``PAStatements`` table.
        continuing : bool
            If True, any stash file left behind by an earlier run is loaded
            instead of recomputing the corresponding result. Default is False.

        Returns
        -------
        bool
            True once the update has run to completion.

        Raises
        ------
        Exception
            Anything raised while finding or copying support links is
            re-raised after the links copied so far have been stashed.
        """
        # NOTE: the stash files use relative paths, so they live in the current
        # working directory, and they are unpickled on a continuing run. Only
        # continue from a directory whose contents you trust.
        pickle_stashes = []
        self.__tag = 'supplement'
        last_update = self._get_latest_updatetime(db)
        self._log("Latest update was: %s" % last_update)

        # Get the new statements...
        self._log("Loading info about the existing state of preassembly. "
                  "(This may take a little time)")
        new_id_stash = 'new_ids.pkl'
        pickle_stashes.append(new_id_stash)
        if continuing and path.exists(new_id_stash):
            with open(new_id_stash, 'rb') as f:
                new_ids = pickle.load(f)
        else:
            new_ids = self._get_new_stmt_ids(db)

            # Stash the new ids in case we need to pick up where we left off.
            with open(new_id_stash, 'wb') as f:
                pickle.dump(new_ids, f)

        # Weed out exact duplicates.
        dist_stash = 'stmt_ids.pkl'
        pickle_stashes.append(dist_stash)
        if continuing and path.exists(dist_stash):
            with open(dist_stash, 'rb') as f:
                stmt_ids = pickle.load(f)
        else:
            stmt_ids = distill_stmts(db,
                                     num_procs=self.n_proc,
                                     get_full_stmts=False)
            with open(dist_stash, 'wb') as f:
                pickle.dump(stmt_ids, f)

        new_stmt_ids = new_ids & stmt_ids

        # Get the set of new unique statements and link to any new evidence.
        old_mk_set = {mk for mk, in db.select_all(db.PAStatements.mk_hash)}
        self._log("Found %d old pa statements." % len(old_mk_set))
        new_mk_stash = 'new_mk_set.pkl'
        pickle_stashes.append(new_mk_stash)
        if continuing and path.exists(new_mk_stash):
            with open(new_mk_stash, 'rb') as f:
                new_mk_set = pickle.load(f)
        else:
            new_mk_set = self._get_unique_statements(db, new_stmt_ids,
                                                     len(new_stmt_ids),
                                                     old_mk_set)
            with open(new_mk_stash, 'wb') as f:
                pickle.dump(new_mk_set, f)

        # If we are continuing, check for support links that were already found.
        support_link_stash = 'new_support_links.pkl'
        pickle_stashes.append(support_link_stash)
        if continuing and path.exists(support_link_stash):
            with open(support_link_stash, 'rb') as f:
                existing_links = pickle.load(f)
            self._log("Found %d existing links." % len(existing_links))
        else:
            existing_links = set()

        self._log("Found %d new pa statements." % len(new_mk_set))

        # Now find the new support links that need to be added.
        new_support_links = set()
        new_stmt_iter = self._pa_batch_iter(db, in_mks=new_mk_set)
        try:
            for i, npa_batch in enumerate(new_stmt_iter):

                # Compare internally
                self._log("Getting support for new pa batch %d." % i)
                some_support_links = self._get_support_links(npa_batch)

                # Compare against the other new batch statements.
                diff_new_mks = new_mk_set - {shash(s) for s in npa_batch}
                other_new_stmt_iter = self._pa_batch_iter(db,
                                                          in_mks=diff_new_mks)
                for j, diff_npa_batch in enumerate(other_new_stmt_iter):
                    split_idx = len(npa_batch)
                    full_list = npa_batch + diff_npa_batch
                    self._log("Comparing %d to batch %d of other new "
                              "statements." % (i, j))
                    some_support_links |= \
                        self._get_support_links(full_list, split_idx=split_idx,
                                                poolsize=self.n_proc)

                # Compare against the existing statements.
                old_stmt_iter = self._pa_batch_iter(db, in_mks=old_mk_set)
                for k, opa_batch in enumerate(old_stmt_iter):
                    split_idx = len(npa_batch)
                    full_list = npa_batch + opa_batch
                    self._log(
                        "Comparing %d to batch of %d of old statements." %
                        (i, k))
                    some_support_links |= \
                        self._get_support_links(full_list, split_idx=split_idx,
                                                poolsize=self.n_proc)

                # Only keep links that have not been stored already.
                new_support_links |= (some_support_links - existing_links)

                # There are generally few support links compared to the number
                # of statements, so it doesn't make sense to copy every time,
                # but for long preassembly, this allows for better failure
                # recovery.
                if len(new_support_links) >= self.batch_size:
                    self._log("Copying batch of %d support links into db." %
                              len(new_support_links))
                    db.copy('pa_support_links', new_support_links,
                            ('supported_mk_hash', 'supporting_mk_hash'))
                    existing_links |= new_support_links
                    new_support_links = set()

            # Insert any remaining support links.
            if new_support_links:
                self._log("Copying batch final of %d support links into db." %
                          len(new_support_links))
                db.copy('pa_support_links', new_support_links,
                        ('supported_mk_hash', 'supporting_mk_hash'))
                existing_links |= new_support_links
        except Exception:
            logger.info("Stashing support links found so far.")
            # The stash records the links that have already been copied, so
            # whether to write it depends on existing_links. The pending
            # new_support_links set is empty right after every batch copy, and
            # testing it would skip the stash and let a continuing run copy
            # the same links again.
            if existing_links:
                with open(support_link_stash, 'wb') as f:
                    pickle.dump(existing_links, f)
            raise

        # Remove all the caches so they can't be picked up accidentally later.
        for cache in pickle_stashes:
            if path.exists(cache):
                remove(cache)

        return True
예제 #7
0
    def create_corpus(self, db, continuing=False):
        """Initialize the table of preassembled statements.

        This method will find the set of unique knowledge represented in the
        table of raw statements, and it will populate the table of preassembled
        statements (PAStatements/pa_statements), while maintaining links between
        the raw statements and their unique (pa) counterparts. Furthermore, the
        refinement/support relationships between unique statements will be found
        and recorded in the PASupportLinks/pa_support_links table.

        For more detail on preassembly, see indra/preassembler/__init__.py

        Parameters
        ----------
        db : object
            Database handle; this method uses its ``select_all`` and ``copy``
            methods and its ``RawUniqueLinks`` and ``PASupportLinks`` tables.
        continuing : bool
            If True, a cached set of distilled statement ids is reused when
            present, and raw statements and support links already recorded in
            the database are skipped. Default is False.

        Returns
        -------
        bool
            True once the corpus has been created.
        """
        self.__tag = 'create'

        # Get filtered statement ID's.
        # NOTE: the cache is unpickled on a continuing run; it lives next to
        # this module (HERE), so that directory must be trusted.
        sid_cache_fname = path.join(HERE, 'stmt_id_cache.pkl')
        if continuing and path.exists(sid_cache_fname):
            with open(sid_cache_fname, 'rb') as f:
                stmt_ids = pickle.load(f)
        else:
            # Get the statement ids.
            stmt_ids = distill_stmts(db, num_procs=self.n_proc)
            with open(sid_cache_fname, 'wb') as f:
                pickle.dump(stmt_ids, f)

        # Handle the possibility we're picking up after an earlier job...
        done_pa_ids = set()
        if continuing:
            self._log("Getting set of statements already de-duplicated...")
            link_resp = db.select_all([
                db.RawUniqueLinks.raw_stmt_id,
                db.RawUniqueLinks.pa_stmt_mk_hash
            ])
            # Unpacking zip(*[]) would fail, so only split real results. The
            # response fetched above is reused rather than queried again.
            if link_resp:
                checked_raw_stmt_ids, pa_stmt_hashes = zip(*link_resp)
                stmt_ids -= set(checked_raw_stmt_ids)
                done_pa_ids = set(pa_stmt_hashes)
                self._log("Found %d preassembled statements already done." %
                          len(done_pa_ids))

        # Get the set of unique statements
        self._get_unique_statements(db, stmt_ids, len(stmt_ids), done_pa_ids)

        # If we are continuing, check for support links that were already found.
        if continuing:
            self._log("Getting pre-existing links...")
            # The column order must match the (supported, supporting) order of
            # the tuples copied into pa_support_links below, otherwise the set
            # difference against the newly found links never removes anything.
            db_existing_links = db.select_all([
                db.PASupportLinks.supported_mk_hash,
                db.PASupportLinks.supporting_mk_hash
            ])
            existing_links = {tuple(res) for res in db_existing_links}
            self._log("Found %d existing links." % len(existing_links))
        else:
            existing_links = set()

        # Now get the support links between all batches.
        support_links = set()
        for i, outer_batch in enumerate(self._pa_batch_iter(db)):
            # Get internal support links
            self._log('Getting internal support links outer batch %d.' % i)
            some_support_links = self._get_support_links(outer_batch,
                                                         poolsize=self.n_proc)
            outer_mk_hashes = {shash(s) for s in outer_batch}

            # Get links with all other batches
            ib_iter = self._pa_batch_iter(db, ex_mks=outer_mk_hashes)
            for j, inner_batch in enumerate(ib_iter):
                split_idx = len(inner_batch)
                full_list = inner_batch + outer_batch
                self._log('Getting support compared to other batch %d of outer'
                          'batch %d.' % (j, i))
                some_support_links |= \
                    self._get_support_links(full_list, split_idx=split_idx,
                                            poolsize=self.n_proc)

            # Add all the new support links
            support_links |= (some_support_links - existing_links)

            # There are generally few support links compared to the number of
            # statements, so it doesn't make sense to copy every time, but for
            # long preassembly, this allows for better failure recovery.
            if len(support_links) >= self.batch_size:
                self._log("Copying batch of %d support links into db." %
                          len(support_links))
                db.copy('pa_support_links', support_links,
                        ('supported_mk_hash', 'supporting_mk_hash'))
                existing_links |= support_links
                support_links = set()

        # Insert any remaining support links.
        if support_links:
            self._log("Copying final batch of %d support links into db." %
                      len(support_links))
            db.copy('pa_support_links', support_links,
                    ('supported_mk_hash', 'supporting_mk_hash'))

        # Delete the pickle cache
        if path.exists(sid_cache_fname):
            remove(sid_cache_fname)

        return True