Пример #1
0
def test_estimate_pairwise_similarity_long():
    """
    Exercise estimate_pairwise_similarity on a table large enough that the
    results must be written in several batch inserts of 500 values each,
    and check the effect of the N parameter.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)

        def estimate(**options):
            # Every call in this test targets the same file, table and
            # generator; only the keyword options vary.
            return parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', **options)

        def similarity_frame():
            # Current contents of the t_similarity table as a dataframe.
            return cursor_to_df(bdb.execute('SELECT * FROM t_similarity'))

        with tempfile.NamedTemporaryFile() as csv_file:
            # n = 40 -> 40**2 -> 1600 rows total
            csv_file.write(_bigger_csv_data(40))
            csv_file.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', csv_file.name, header=True, create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # N = 0 is expected to leave an empty result behind.
        estimate(N=0)
        assert similarity_frame().shape == (0, 0)

        # Any other N is expected to yield N**2 rows of three columns.
        for sample_size in (1, 2, 10, 20, 40):
            estimate(N=sample_size, overwrite=True)
            expected_shape = (sample_size ** 2, 3)
            assert similarity_frame().shape == expected_shape

        # N too high should fail
        with pytest.raises(BLE):
            estimate(N=41, overwrite=True)

        # Results are sorted and re-indexed before being compared with the
        # plain BQL estimate.
        parallel_sim = similarity_frame().sort_values(
            by=['rowid0', 'rowid1'])
        parallel_sim.index = range(len(parallel_sim))

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
Пример #2
0
  def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    The query is passed through `self.interpret_query`, logged, and then
    executed on `self.bdb` inside a savepoint; the resulting cursor is
    converted to a dataframe with `bdbcontrib.cursor_to_df`.

    Returns the dataframe on success. If executing the query or converting
    the cursor raises an ordinary exception, it is reported through
    `self.logger.exception` and None is returned. KeyboardInterrupt,
    SystemExit and other non-`Exception` signals propagate to the caller.

    help_for_query'''
    self.check_representation()
    query_string = self.interpret_query(query_string)
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
      try:
        res = self.bdb.execute(query_string, bindings)
        # A cursor without a description cannot be turned into a dataframe.
        assert res is not None and res.description is not None
        # Pass the values as logger arguments (like the info call above)
        # instead of formatting the message eagerly.
        self.logger.debug("BQL [%s] [%r] has returned a cursor.",
                          query_string, bindings)
        df = bdbcontrib.cursor_to_df(res)
        self.logger.debug("BQL [%s] [%r] has created a dataframe.",
                          query_string, bindings)
        return df
      except Exception:
        # Best-effort reporting: ordinary failures are logged rather than
        # raised. Catching only Exception lets KeyboardInterrupt and
        # SystemExit reach the caller instead of being swallowed.
        self.logger.exception("")
        return None
Пример #3
0
  def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    Before execution, every `%g` and then every `%t` that appears at the
    start of the query or directly after whitespace is replaced with the
    BQL-quoted `self.generator_name` and `self.name` respectively. The
    query is then logged and executed on `self.bdb` inside a savepoint,
    and the resulting cursor is converted to a dataframe with
    `bdbcontrib.cursor_to_df`.

    Returns the dataframe on success. If executing the query or converting
    the cursor raises an ordinary exception, it is reported through
    `self.logger.exception` and None is returned. KeyboardInterrupt,
    SystemExit and other non-`Exception` signals propagate to the caller.

    help_for_query'''
    self.check_representation()
    quoted_table = bayeslite.bql_quote_name(self.name)
    quoted_generator = bayeslite.bql_quote_name(self.generator_name)
    # Callable replacements insert the quoted names verbatim. A plain
    # replacement string would have its backslashes interpreted by re.sub
    # as escapes or group references, corrupting names that contain them.
    query_string = re.sub(r'(^|(?<=\s))%g\b',
                          lambda _match: quoted_generator,
                          query_string)
    query_string = re.sub(r'(^|(?<=\s))%t\b',
                          lambda _match: quoted_table,
                          query_string)
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
      try:
        res = self.bdb.execute(query_string, bindings)
        # A cursor without a description cannot be turned into a dataframe.
        assert res is not None and res.description is not None
        # Pass the values as logger arguments (like the info call above)
        # instead of formatting the message eagerly.
        self.logger.debug("BQL [%s] [%r] has returned a cursor.",
                          query_string, bindings)
        df = bdbcontrib.cursor_to_df(res)
        self.logger.debug("BQL [%s] [%r] has created a dataframe.",
                          query_string, bindings)
        return df
      except Exception:
        # Best-effort reporting: ordinary failures are logged rather than
        # raised. Catching only Exception lets KeyboardInterrupt and
        # SystemExit reach the caller instead of being swallowed.
        self.logger.exception("")
        return None
Пример #4
0
def test_estimate_pairwise_similarity():
    """
    Check the basic behaviour of estimate_pairwise_similarity and compare
    its output with the existing BQL pairwise similarity estimate.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)

        def estimate(**options):
            # Shorthand for a call against the populated database, table
            # 't' and generator 't_cc'; only the keyword options vary.
            return parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', **options)

        def sorted_frame(select_sql):
            # Results may be returned out of order, so sort the rows and
            # renumber the index before any comparison.
            frame = cursor_to_df(bdb.execute(select_sql)).sort_values(
                by=['rowid0', 'rowid1'])
            frame.index = range(frame.shape[0])
            return frame

        with tempfile.NamedTemporaryFile() as csv_file:
            csv_file.write(test_utils.csv_data)
            csv_file.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', csv_file.name, header=True, create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        estimate()

        # A bad core count is rejected.
        with pytest.raises(BLE):
            estimate(cores=0)

        # t_similarity already exists, so a second run without the
        # overwrite flag is rejected.
        with pytest.raises(SQLError):
            estimate()

        # Unknown table and model names are rejected.
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 'foo', 'foo_cc'
            )

        # Pointing at a freshly created temporary file instead of the
        # populated bdb_file is rejected.
        with tempfile.NamedTemporaryFile() as does_not_exist, \
                pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                does_not_exist.name, 't', 't_cc'
            )

        # With the overwrite flag set the existing table may be replaced.
        estimate(overwrite=True)

        # A different output table name may be given.
        estimate(sim_table='t_similarity_2')

        parallel_sim = sorted_frame('SELECT * FROM t_similarity')
        parallel_sim_2 = sorted_frame('SELECT * FROM t_similarity_2')

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)

        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)