def initialize(self):
    """Open the backing bdb, load the data source, and ensure a generator.

    Idempotent: if the bdb is already open, only re-checks the internal
    representation invariants and returns.  Otherwise opens the bdb at
    ``self.bdb_path``, creates table ``self.name`` from either the pandas
    dataframe or the csv path (dataframe takes precedence), and creates a
    default crosscat generator when the bdb has none.
    """
    if self.bdb:
        self.check_representation()
        return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
        # Prefer an in-memory dataframe over a csv file as the source.
        if self.df is not None:
            bayeslite.read_pandas.bayesdb_read_pandas_df(
                self.bdb, self.name, self.df,
                create=True, ifnotexists=True)
        elif self.csv_path:
            bayeslite.bayesdb_read_csv_file(
                self.bdb, self.name, self.csv_path,
                header=True, create=True, ifnotexists=True)
        else:
            # No data source and no matching table: report what the bdb
            # does contain so the caller can correct the population name.
            tables = self.list_tables()
            metamodels = self.list_metamodels()
            if len(tables) + len(metamodels) == 0:
                raise BLE(ValueError(
                    "No data sources specified, and an empty bdb."))
            else:
                raise BLE(ValueError(
                    "The name of the population must be the same"
                    " as a table in the bdb, one of: " +
                    ", ".join(tables) +
                    "\nNote also that the bdb has the following"
                    " metamodels defined: " + ", ".join(metamodels)))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
        # Sanity-check that the table is non-empty before modeling it.
        size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
        assert 0 < size
        self.query('''
            CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat(
                GUESS(*) )''')
    self.check_representation()
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    # Wizard mode allows GUESS(*) in the CREATE GENERATOR below.
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')
        # test N = 0
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', N=0
        )
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).shape == (0, 0)
        # test other values of N
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=N, overwrite=True
            )
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=41, overwrite=True
            )
        # Sort in pandas (not SQL) and reset the index so the frame
        # compares equal to the BQL estimate below.
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        parallel_sim.index = range(parallel_sim.shape[0])
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name,
                header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')
        # test N = 0
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc',
            N=0)
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0)
        # test other values of N
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc',
                N=N, overwrite=True)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc',
                N=41, overwrite=True)
        # Sort in SQL and reset the numeric index so the frame compares
        # equal to the standard BQL estimate below.
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim.index = range(parallel_sim.shape[0])
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_cardinality(data, cols, cardinalities_expected):
    """Load `data` into a table and check cardinality() results.

    Each result row must be a (column-name, distinct-count) pair, and the
    counts, in order, must equal `cardinalities_expected`.
    """
    known_columns = ('id', 'one', 'two', 'three', 'four')
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            observed_counts = []
            for pair in cards:
                # Every row is a (name, count) pair over a known column.
                assert len(pair) == 2
                assert pair[0] in known_columns
                observed_counts.append(pair[1])
            assert observed_counts == cardinalities_expected
def test_cardinality(data, cols, cardinalities_expected):
    """Load `data` into a table and check cardinality() distinct counts.

    `cards` is indexed by 'name' and 'distinct_count' columns; when `cols`
    is given, results must come back in the same order as `cols`.
    """
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            for col, count, expected_count in zip(
                    cards['name'], cards['distinct_count'],
                    cardinalities_expected):
                assert expected_count == count
                assert col in ('id', 'one', 'two', 'three', 'four')
                if cols is not None:
                    # NOTE(review): pop(0) mutates the parametrized `cols`
                    # list in place -- assumes each parameter set is fresh
                    # per test invocation; verify fixture scope.
                    expected_col = cols.pop(0)
                    assert expected_col == col
            # zip() truncates silently, so also pin the total row count.
            assert len(cards) == len(cardinalities_expected)
def test_cardinality(data, cols, cardinalities_expected):
    """Load `data` into a table and verify cardinality() distinct counts,
    including result ordering when an explicit column list is given."""
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            rows = zip(cards['name'], cards['distinct_count'],
                       cardinalities_expected)
            for name, actual, expected in rows:
                assert actual == expected
                assert name in ('id', 'one', 'two', 'three', 'four')
                if cols is not None:
                    # Results must follow the order of the requested columns.
                    assert name == cols.pop(0)
            # zip() truncates, so pin the total row count as well.
            assert len(cardinalities_expected) == len(cards)
def test_nullify_no_missing(data, value, num_nulls_expected):
    """Check that nullify() turns `value` into SQL NULL in every column.

    `num_nulls_expected` gives the expected NULL counts for columns
    one, two, three, four -- in that order.
    """
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
            bql_utils.nullify(bdb, 't', value)
            # One loop instead of four copy-pasted query/assert stanzas.
            # Column names are compile-time constants, so interpolating
            # them into the SQL text is safe here.
            for column, expected in zip(
                    ('one', 'two', 'three', 'four'), num_nulls_expected):
                c = bdb.execute(
                    'SELECT COUNT(*) FROM t WHERE %s IS NULL;' % (column,))
                assert c.fetchvalue() == expected
def initialize(self):
    """Open the backing bdb, load the data source, and ensure a generator.

    Idempotent: returns immediately if the bdb is already open.  Loads
    table ``self.name`` from the pandas dataframe or csv path (dataframe
    takes precedence) and creates a default crosscat generator when the
    bdb has none.

    Raises:
        BLE: wrapping ValueError, when no data source is available.
    """
    if self.bdb:
        return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
        if self.df is not None:
            bayeslite.read_pandas.bayesdb_read_pandas_df(
                self.bdb, self.name, self.df,
                create=True, ifnotexists=True)
        elif self.csv_path:
            bayeslite.bayesdb_read_csv_file(
                self.bdb, self.name, self.csv_path,
                header=True, create=True, ifnotexists=True)
        else:
            raise BLE(ValueError(
                "No data sources specified, and an empty bdb."))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
        # BUG FIX: .ix is a pandas indexer and must be subscripted, not
        # called -- `.ix(0, 0)` raised TypeError.  Use [0, 0] as the
        # sibling implementations of initialize() do.
        size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
        assert 0 < size
        self.query('''
            CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat(
                GUESS(*) )''')
def initialize(self):
    """Open the backing bdb, load the data source, and ensure a generator.

    Idempotent: if the bdb is already open, only re-checks representation
    invariants.  Otherwise opens ``self.bdb_path``, creates table
    ``self.name`` from the dataframe or csv (dataframe preferred), and
    creates a default crosscat generator when the bdb has none.
    """
    if self.bdb:
        self.check_representation()
        return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
        # Prefer an in-memory dataframe over a csv file as the source.
        if self.df is not None:
            bayeslite.read_pandas.bayesdb_read_pandas_df(self.bdb,
                self.name, self.df, create=True, ifnotexists=True)
        elif self.csv_path:
            bayeslite.bayesdb_read_csv_file(self.bdb,
                self.name, self.csv_path,
                header=True, create=True, ifnotexists=True)
        else:
            # Tell the caller what the bdb does contain so the population
            # name can be corrected.
            tables = self.list_tables()
            metamodels = self.list_metamodels()
            if len(tables) + len(metamodels) == 0:
                raise BLE(
                    ValueError(
                        "No data sources specified, and an empty bdb."))
            else:
                raise BLE(
                    ValueError(
                        "The name of the population must be the same"
                        " as a table in the bdb, one of: " +
                        ", ".join(tables) +
                        "\nNote also that the bdb has the following"
                        " metamodels defined: " + ", ".join(metamodels)))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
        # Sanity-check the table is non-empty before modeling it.
        size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
        assert 0 < size
        self.query('''
            CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat(
                GUESS(*) )''')
    self.check_representation()
def initialize(self):
    """Open the backing bdb, load the data source, and ensure a generator.

    Idempotent: returns immediately if the bdb is already open.  Briefly
    enables BAYESDB_WIZARD_MODE so that GUESS(*) is permitted in the
    CREATE GENERATOR statement, restoring the previous value afterwards.

    Raises:
        ValueError: when no data source is available and the bdb is empty.
    """
    if self.bdb:
        return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
        if self.df is not None:
            bayeslite.read_pandas.bayesdb_read_pandas_df(
                self.bdb, self.name, self.df,
                create=True, ifnotexists=True)
        elif self.csv_path:
            bayeslite.bayesdb_read_csv_file(
                self.bdb, self.name, self.csv_path,
                header=True, create=True, ifnotexists=True)
        else:
            raise ValueError("No data sources specified, and an empty bdb.")
    # BUG FIX: .ix is a pandas indexer and must be subscripted, not
    # called -- `.ix(0, 0)` raised TypeError.
    size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
    assert 0 < size
    # Save/restore the wizard-mode flag; os.environ.get replaces the
    # hand-rolled membership check, and try/finally restores the flag
    # even if the CREATE GENERATOR query raises.
    old_wizmode = os.environ.get("BAYESDB_WIZARD_MODE", "")
    os.environ["BAYESDB_WIZARD_MODE"] = "1"
    try:
        self.query('''
            CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat(
                GUESS(*) )''')
    finally:
        os.environ["BAYESDB_WIZARD_MODE"] = old_wizmode
def test_read_csv():
    """Exercise bayesdb_read_csv{,_file} flag validation and data loading.

    First drives every invalid header/create/ifnotexists combination and
    expects ValueError; then loads the same data repeatedly to check
    append semantics (each successful load appends one copy of `data`).
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # First valid load: create the table from headered data.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1, 2, 3, 'foo', 'bar', u'nan', u'', u'quagga'),
            (4, 5, 6, 'baz', 'quux', 42.0, u'', u'eland'),
            (7, 8, 6, 'zot', 'mumble', 87.0, u'zoot', u'caribou'),
        ]
        # Header-only input with ifnotexists=True is a no-op on data.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        # All columns are created with the generic NUMERIC affinity.
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless append into the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Headered append into the existing table.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Same append behavior through the file-path entry point.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
PATH_EXAMPLES = os.path.dirname(PATH_KEPLER) PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites') PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv') composer = Composer() composer.register_foreign_predictor(keplers_law.KeplersLaw) composer.register_foreign_predictor(random_forest.RandomForest) if os.path.exists(os.path.join(outdir, 'kepler.bdb')): os.remove(os.path.join(outdir, 'kepler.bdb')) bdb = bayeslite.bayesdb_open(os.path.join(outdir, 'kepler.bdb')) bayeslite.bayesdb_register_metamodel(bdb, composer) bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV, header=True, create=True) bdbcontrib.query( bdb, ''' CREATE GENERATOR sat_kepler FOR satellites USING composer( default ( Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL, Users CATEGORICAL, Purpose CATEGORICAL, Class_of_Orbit CATEGORICAL, Perigee_km NUMERICAL, Apogee_km NUMERICAL, Eccentricity NUMERICAL, Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL, Power_watts NUMERICAL, Date_of_Launch NUMERICAL, Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL, Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
def test_read_csv():
    """Exercise bayesdb_read_csv{,_file} flag validation and data loading.

    Extended variant: additionally checks that a corrupt header (empty
    column name) raises IOError, and covers the BQL
    ``CREATE TABLE ... FROM <csv-file>`` syntax.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.
            # (Dropping the first character of the header yields an
            # empty leading column name.)
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # First valid load: create the table from headered data.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1, 2, 3, 'foo', 'bar', u'nan', u'', u'quagga'),
            (4, 5, 6, 'baz', 'quux', 42.0, u'', u'eland'),
            (7, 8, 6, 'zot', 'mumble', 87.0, u'zoot', u'caribou'),
        ]
        # Header-only input with ifnotexists=True is a no-op on data.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        # All columns are created with the generic NUMERIC affinity.
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless append into the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Headered append into the existing table.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Same append behavior through the file-path entry point.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
        # Test the BQL CREATE TABLE FROM <csv-file> syntax.
        f = StringIO.StringIO(csv_hdrdata)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name,))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data
        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b', ',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(bdb, 't3', temp.name,
                    header=True, create=True)
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build a satellites .bdb with crosscat models, snapshotting as it goes.

    Loads satellites.csv into a new bdb under `out_dir`, creates and
    analyzes a crosscat generator for `num_iters` iterations, copying the
    bdb file and writing a metadata report every `checkpoint_freq`
    iterations, and finishes with a diagnostics plot and a symlink to the
    final bdb.
    """
    then = time.time()
    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        # All artifacts share the date-and-user file stamp.
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')
    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)
    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        # Log each BQL statement before running it.
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
        header=True, create=True, ifnotexists=True)
    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
            AND type_of_orbit = 'NaN'
    ''')
    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)
    # create the crosscat generator using
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')
    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))
    cur_iter_ct = 0

    def snapshot():
        # Copy the current bdb aside and write a metadata report for it.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write provenance metadata (checksum, host, versions) to `f`.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
            % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
            % (bayeslite.__version__, crosscat.__version__,
               bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False,
               plot_file_name=None):
        # Hash the saved bdb in 64 KiB chunks and emit its metadata file,
        # appending this script's own source for full reproducibility.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Flush before cat appends to the same path out-of-process.
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))
        if echo:
            record_metadata(sys.stdout, saved_file_name,
                sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
            'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file, echo=True,
            plot_file_name=plot_file_name)

    # Snapshot before any analysis, then after every checkpoint_freq
    # iterations until num_iters have run.
    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT'
            % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()
    final_report()
    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb"
        % (out_dir, filestamp))
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_bql_utils.csv_data)
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')
        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')
        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', cores=0)
        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 'foo', 'foo_cc')
        # Should complain if bdb_file doesn't exist
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    does_not_exist.name, 't', 't_cc')
        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', overwrite=True)
        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', sim_table='t_similarity_2')
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim_2 = cursor_to_df(
            bdb.execute(
                'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1'))
        # Results may be returned out of order. So we sort the values,
        # as above, and we reorder the numeric index
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])
        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_composer_integration__ci_slow():
    """End-to-end integration test of the composer metamodel.

    Builds one composer generator over the satellites data with crosscat
    defaults plus three foreign predictors, then sequentially exercises
    INITIALIZE/ANALYZE, dependence probability, SIMULATE, value
    probability, mutual information, and predict confidence.
    """
    # But currently difficult to separate these tests into smaller tests
    # because of their sequential nature. We will still test all internal
    # functions with different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(
        multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_Orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL,
                Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL,
                Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL,
                Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km, Eccentricity,
                        Period_minutes, Launch_Mass_kg, Power_watts,
                        Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                        Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------
    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2
    # -------------------
    # TEST ANALYZE MODELS
    # -------------------
    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------
    # Special 0/1 regimes.
    # Local with a INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch WITH
            Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Apogee_km FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Power_watts FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent DEPENDENT with local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with foreign sharing common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km,
    # Eccentricity)
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Unknown [0,1] regimes.
    # Foreign with a local of unknown relation with parents.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------
    # Crash tests for various code paths. Quality of simulations ignored.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000,
            Perigee_km = 980 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1
            GIVEN Dry_Mass_kg = 2894, Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1
            GIVEN Apogee_km = 38000 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    assert [s[0] for s in curs] == [102] * 2
    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------
    # Crash tests for various code path. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800
            GIVEN (Perigee_km = 980, Launch_Mass_kg = 890)
            FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020
            GIVEN (Apogee_km = 38000, Eccentricity = 0.03)
            FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09
            GIVEN (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436)
            FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be less than 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric')
            FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.
    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------
    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------
    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    bdb.close()
def test_create_generator_schema():
    """Exercise the composer's CREATE GENERATOR schema parsing: well-formed
    schemas succeed and malformed or unsupported ones raise."""
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Using crosscat and default to specify models should work.
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL
            ),
            crosscat (
                Anticipated_Lifetime NUMERICAL,
                Contractor CATEGORICAL
            )
        );''')
    # The composer registers an internal crosscat generator named <gen>_cc.
    assert bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    # IGNORE and GUESS(*) are forbidden and should crash.
    with pytest.raises(AttributeError):
        bdb.execute('''
            CREATE GENERATOR t2 FOR satellites USING composer(
                default (
                    GUESS(*),
                    Country_of_Operator IGNORE,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                )
            );''')
    # Test unregistered foreign predictor.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t3 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Unregistered foreign predictor should crash.
    # NOTE(review): the t4 case below is identical to the t3 case above --
    # presumably one of the two was meant to differ; confirm against history.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t4 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Registered foreign predictor should work.
    composer.register_foreign_predictor(random_forest.RandomForest)
    bdb.execute('''
        CREATE GENERATOR t5 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Eccentricity NUMERICAL
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner
            )
        );''')
    # Wrong stattype in predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km RADIAL GIVEN Operator_Owner
                )
            );''')
    # Missing GIVEN keyword should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL, Operator_Owner
                )
            );''')
    # Missing conditions in random forest conditions should crash.
    # (Operator_Owner is never declared anywhere in this schema.)
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Test duplicate declarations.
    # (Class_of_orbit appears in both the default and random_forest sections.)
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Class_of_orbit CATEGORICAL GIVEN Operator_Owner
                )
            );''')
    # Arbitrary DAG with foreign predictors.
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    bdb.execute('''
        CREATE GENERATOR t8 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner, Users
            ),
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Apogee_km, Users, Perigee_km
            )
        );''')
    # Duplicate declarations in foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t9 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL,
                    Eccentricity NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Perigee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # MML for default models should work.
    bdb.execute('''
        CREATE GENERATOR t10 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Apogee_km NUMERICAL
            )
            random_forest (
                Perigee_km NUMERICAL GIVEN Purpose
            )
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
            )
            DEPENDENT(Apogee_km, Perigee_km, Purpose),
            INDEPENDENT(Country_of_Operator, Purpose)
        );''')
    # MML for foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t11 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Operator_Owner CATEGORICAL,
                    Users CATEGORICAL,
                    Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Apogee_km NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
                )
                DEPENDENT(Apogee_km, Eccentricity, Country_of_Operator),
                INDEPENDENT(Perigee_km, Purpose)
            );''')
    # Test full generator.
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t12 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL,
                Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL,
                Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL,
                Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL GIVEN Apogee_km, Perigee_km,
                    Eccentricity, Period_minutes, Launch_Mass_kg, Power_watts,
                    Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL GIVEN Dry_Mass_kg, Power_watts,
                    Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            INDEPENDENT(Country_of_Operator, longitude_radians_of_geo)
        );''')
    bdb.close()
def test_composer_integration__ci_slow():
    """Sequential end-to-end test of the composer metamodel: initialize,
    analyze, then query dependence, simulation, logpdf, mutual information
    and predict-confidence regimes against one shared generator."""
    # NOTE(review): the opening sentence of this comment appears truncated in
    # the source -- an introductory clause seems to be missing before "But".
    # But currently difficult to separate these tests into smaller tests
    # because of their sequential nature. We will still test all internal
    # functions with different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL,
                Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL,
                Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL,
                Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL GIVEN Apogee_km, Perigee_km,
                    Eccentricity, Period_minutes, Launch_Mass_kg, Power_watts,
                    Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL GIVEN Dry_Mass_kg, Power_watts,
                    Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------
    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2
    # -------------------
    # TEST ANALYZE MODELS
    # -------------------
    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------
    # Special 0/1 regimes.
    # Local with a INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch WITH
            Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Power_watts FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent DEPENDENT with local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with foreign sharing common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity)
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Unknown [0,1] regimes.
    # Foreign with a local of unknown relation with parents.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------
    # Crash tests for various code paths. Quality of simulations ignored.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1
            GIVEN Apogee_km = 1000, Perigee_km = 980 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1
            GIVEN Dry_Mass_kg = 2894, Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1
            GIVEN Dry_Mass_kg = 2894, Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1
            GIVEN Apogee_km = 38000 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    # Conditioned column is echoed back unchanged in every simulated row.
    assert [s[0] for s in curs] == [102] * 2
    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------
    # Crash tests for various code path. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800
            GIVEN (Perigee_km = 980, Launch_Mass_kg = 890)
            FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020
            FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020
            GIVEN (Apogee_km = 38000, Eccentricity = 0.03)
            FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09
            GIVEN (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436)
            FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be less than 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar'
            FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric')
            FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.
    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------
    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------
    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    bdb.close()
def test_drop_generator():
    """Verify DROP GENERATOR purges all composer bookkeeping-table rows."""
    # NOTE(review): this module contains a second, identical definition of
    # test_drop_generator further down; under pytest only the last
    # definition in the module is collected -- confirm which is intended.
    bdb = bayeslite.bayesdb_open()
    # Initialize the database
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL,
                Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL,
                Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL,
                Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL GIVEN Apogee_km, Perigee_km,
                    Eccentricity, Period_minutes, Launch_Mass_kg, Power_watts,
                    Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL GIVEN Dry_Mass_kg, Power_watts,
                    Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    # Resolve the numeric generator id before dropping; lookups below key on it.
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
    # Composer internal bookkeeping tables that DROP GENERATOR must purge.
    schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
    ]
    # Iterate through tables before dropping.
    # .next() (Python 2 cursor idiom) asserts each table has at least one row.
    for _, name in schema:
        bdb.sql_execute(
            '''
            SELECT * FROM {} WHERE generator_id=?
            '''.format(quote(name)), (generator_id, )).next()
    # Drop generator and ensure table lookups with generator_id throw error.
    bdb.execute('DROP GENERATOR t1')
    for _, name in schema:
        with pytest.raises(StopIteration):
            bdb.sql_execute(
                '''
                SELECT * FROM {} WHERE generator_id=?
                '''.format(quote(name)), (generator_id, )).next()
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1')
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    bdb.close()
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed): then = time.time() timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d') user = subprocess.check_output(["whoami"]).strip() host = subprocess.check_output(["hostname"]).strip() filestamp = '-' + timestamp + '-' + user def out_file_name(base, ext): return out_dir + '/' + base + filestamp + ext csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv') bdb_file = out_file_name('satellites', '.bdb') # so we can build bdb models os.environ['BAYESDB_WIZARD_MODE'] = '1' if not os.path.isdir(out_dir): os.makedirs(out_dir) if os.path.exists(bdb_file): print 'Error: File', bdb_file, 'already exists. Please remove it.' sys.exit(1) # create database mapped to filesystem log('opening bdb on disk: %s' % bdb_file) bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False) def execute(bql): log("executing %s" % bql) bdb.execute(bql) # read csv into table log('reading data from %s' % csv_file) bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file, header=True, create=True, ifnotexists=True) # Add a "not applicable" orbit sub-type log('adding "not applicable" orbit sub-type') bdb.sql_execute('''UPDATE satellites SET type_of_orbit = 'N/A' WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO') AND type_of_orbit = 'NaN' ''') # nullify "NaN" log('nullifying NaN') bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN') # register crosscat metamodel cc = ccme.MultiprocessingEngine(seed=seed) ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc) bayeslite.bayesdb_register_metamodel(bdb, ccmm) # create the crosscat generator using execute(''' CREATE GENERATOR satellites_cc FOR satellites USING crosscat ( GUESS(*), name IGNORE, Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL, Users CATEGORICAL, Purpose CATEGORICAL, Class_of_Orbit CATEGORICAL, Type_of_Orbit CATEGORICAL, Perigee_km NUMERICAL, Apogee_km NUMERICAL, Eccentricity NUMERICAL, Period_minutes NUMERICAL, 
Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL, Power_watts NUMERICAL, Date_of_Launch NUMERICAL, Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL, Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL, Launch_Vehicle CATEGORICAL, Source_Used_for_Orbital_Data CATEGORICAL, longitude_radians_of_geo NUMERICAL, Inclination_radians NUMERICAL ) ''') execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models, )) cur_iter_ct = 0 def snapshot(): log('vacuuming') bdb.sql_execute('vacuum') cur_infix = '-%dm-%di' % (num_models, cur_iter_ct) save_file_name = out_file_name('satellites', cur_infix + '.bdb') meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt') log('recording snapshot ' + save_file_name) os.system("cp %s %s" % (bdb_file, save_file_name)) report(save_file_name, meta_file_name) def record_metadata(f, saved_file_name, sha_sum, total_time, plot_file_name=None): f.write("DB file " + saved_file_name + "\n") f.write(sha_sum) f.write("built from " + csv_file + "\n") f.write("by %s@%s\n" % (user, host)) f.write("at seed %s\n" % seed) f.write("in %3.2f seconds\n" % total_time) f.write("with %s models analyzed for %s iterations\n" % (num_models, num_iters)) f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n" % (bayeslite.__version__, crosscat.__version__, bdbcontrib.__version__)) if plot_file_name is not None: f.write("diagnostics recorded to %s\n" % plot_file_name) f.flush() def report(saved_file_name, metadata_file, echo=False, plot_file_name=None): sha256 = hashlib.sha256() with open(saved_file_name, 'rb') as fd: for chunk in iter(lambda: fd.read(65536), ''): sha256.update(chunk) sha_sum = sha256.hexdigest() + '\n' total_time = time.time() - then with open(metadata_file, 'w') as fd: record_metadata(fd, saved_file_name, sha_sum, total_time, plot_file_name) fd.write('using script ') fd.write('-' * 57) fd.write('\n') fd.flush() os.system("cat %s >> %s" % (__file__, metadata_file)) if echo: record_metadata(sys.stdout, 
saved_file_name, sha_sum, total_time, plot_file_name) def final_report(): # create a diagnostics plot plot_file_name = out_file_name('satellites', '-logscores.pdf') log('writing diagnostic plot to %s' % plot_file_name) _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics( bdb, 'logscore', 'satellites_cc') plt.savefig(plot_file_name) final_metadata_file = out_file_name('satellites', '-meta.txt') report(bdb_file, final_metadata_file, echo=True, plot_file_name=plot_file_name) snapshot() while cur_iter_ct < num_iters: execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' % checkpoint_freq) cur_iter_ct += checkpoint_freq snapshot() final_report() log('closing bdb %s' % bdb_file) bdb.close() os.system("cd %s && ln -s satellites%s.bdb satellites.bdb" % (out_dir, filestamp))
def test_mml_csv():
    """Check MML type guessing, JSON serialization, MML statement rendering,
    and schema validation against the fixture tests/mml.csv."""
    # NOTE(review): this module contains a second definition of test_mml_csv
    # further down; under pytest only the last definition is collected.
    with bayeslite.bayesdb_open() as bdb:
        bayeslite.bayesdb_read_csv_file(
            bdb, 't', 'tests/mml.csv', header=True, create=True)
        guesses = mml_utils.guess_types(bdb, 't')
        # Testing these strings is going to be brittle, but I don't have a
        # great answer.
        assert guesses == ({
            'col1': ('IGNORE', 'Column is constant'),
            'col2': ('CATEGORICAL', 'Only 5 distinct values'),
            'col3': ('IGNORE', 'Column is constant'),
            'col4': ('NUMERICAL', 'Contains exclusively numbers (24 of them).'),
            'col5': ('CATEGORICAL', 'Only 2 distinct values'),
            'col6': ('NUMERICAL', 'Contains exclusively numbers (25 of them).')})
        # The JSON form carries the same (stattype, reason) info per column.
        mml_json = mml_utils.to_json(guesses)
        assert mml_json == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {'stattype': 'IGNORE', 'reason': 'Column is constant'},
                'col2': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 5 distinct values'},
                'col3': {'stattype': 'IGNORE', 'reason': 'Column is constant'},
                'col4': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (24 of them).'},
                'col5': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 2 distinct values'},
                'col6': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (25 of them).'}
            }}
        # IGNOREd columns are omitted from the generated MML statement.
        mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator')
        assert mml_statement == (
            'CREATE GENERATOR "generator" FOR "table" '
            'USING crosscat( '
            '"col6" NUMERICAL,"col4" NUMERICAL,'
            '"col5" CATEGORICAL,"col2" CATEGORICAL);')
        # col6's values are constructed in such a way as to break crosscat.
        # See https://github.com/probcomp/bayeslite/issues/284
        # On validation the column should be ignored
        mod_schema = mml_utils.validate_schema(bdb, 't', mml_json)
        assert mod_schema == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {'stattype': 'IGNORE', 'reason': 'Column is constant'},
                'col2': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 5 distinct values'},
                'col3': {'stattype': 'IGNORE', 'reason': 'Column is constant'},
                'col4': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (24 of them).'},
                'col5': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 2 distinct values'},
                'col6': {'stattype': 'IGNORE', 'guessed': 'NUMERICAL',
                         'reason': 'Caused ANALYZE to error'}}}
def test_drop_generator():
    """DROP GENERATOR must purge every composer bookkeeping table."""
    bdb = bayeslite.bayesdb_open()
    # Load the satellites data and wire up a fully featured composer.
    bayeslite.bayesdb_read_csv_file(
        bdb, 'satellites', PATH_SATELLITES_CSV, header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    for predictor in (random_forest.RandomForest,
                      multiple_regression.MultipleRegression,
                      keplers_law.KeplersLaw):
        composer.register_foreign_predictor(predictor)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Operator_Owner CATEGORICAL,
                Users CATEGORICAL,
                Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL,
                Perigee_km NUMERICAL,
                Apogee_km NUMERICAL,
                Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL,
                Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL,
                Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL,
                Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL GIVEN Apogee_km, Perigee_km,
                    Eccentricity, Period_minutes, Launch_Mass_kg, Power_watts,
                    Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL GIVEN Dry_Mass_kg, Power_watts,
                    Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    # Resolve the numeric id now; all bookkeeping lookups key on it.
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
    internal_tables = [
        'bayesdb_composer_cc_id',
        'bayesdb_composer_column_owner',
        'bayesdb_composer_column_toposort',
        'bayesdb_composer_column_parents',
        'bayesdb_composer_column_foreign_predictor',
    ]

    def lookup(table_name):
        # Rows for this generator in one composer bookkeeping table.
        return bdb.sql_execute('''
            SELECT * FROM {} WHERE generator_id=?
        '''.format(quote(table_name)), (generator_id,))

    # Before the drop, every bookkeeping table has at least one row
    # (.next() is the Python 2 cursor idiom; it raises if empty).
    for table_name in internal_tables:
        lookup(table_name).next()
    bdb.execute('DROP GENERATOR t1')
    # After the drop, the same lookups must all come back empty.
    for table_name in internal_tables:
        with pytest.raises(StopIteration):
            lookup(table_name).next()
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1')
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    bdb.close()
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_utils.csv_data)
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')
        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')
        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc'
        )
        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', cores=0
            )
        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc'
            )
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 'foo', 'foo_cc'
            )
        # Should complain if bdb_file doesn't exist
        # NOTE(review): NamedTemporaryFile actually creates the file, so
        # this exercises "file is not a populated bdb" rather than a
        # literally missing path -- confirm the intended case.
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    does_not_exist.name, 't', 't_cc'
                )
        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', overwrite=True
        )
        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', sim_table='t_similarity_2'
        )
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        parallel_sim_2 = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity_2')
        ).sort_values(by=['rowid0', 'rowid1'])
        # Results may be returned out of order. So we sort the values,
        # as above, and we reorder the numeric index
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])
        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_mml_csv():
    """Check mml_utils type guessing, JSON/MML conversion, and validation
    against the tests/mml.csv fixture."""
    with bayeslite.bayesdb_open() as bdb:
        bayeslite.bayesdb_read_csv_file(bdb, 't', 'tests/mml.csv',
                                        header=True, create=True)

        # Expected per-column (stattype, reason) guesses.  Testing these
        # strings is going to be brittle, but I don't have a great answer.
        expected_guesses = {
            'col1': ('IGNORE', 'Column is constant'),
            'col2': ('CATEGORICAL', 'Only 5 distinct values'),
            'col3': ('IGNORE', 'Column is constant'),
            'col4': ('NUMERICAL',
                     'Contains exclusively numbers (24 of them).'),
            'col5': ('CATEGORICAL', 'Only 2 distinct values'),
            'col6': ('NUMERICAL',
                     'Contains exclusively numbers (25 of them).'),
        }
        guessed = mml_utils.guess_types(bdb, 't')
        assert guessed == expected_guesses

        # to_json should carry every guess over verbatim into a crosscat
        # schema document.
        expected_json = {
            'metamodel': 'crosscat',
            'columns': {
                column: {'stattype': stattype, 'reason': reason}
                for column, (stattype, reason) in expected_guesses.items()
            },
        }
        schema_json = mml_utils.to_json(guessed)
        assert schema_json == expected_json

        statement = mml_utils.to_mml(schema_json, 'table', 'generator')
        assert statement == ('CREATE GENERATOR "generator" FOR "table" '
                             'USING crosscat( '
                             '"col6" NUMERICAL,"col4" NUMERICAL,'
                             '"col5" CATEGORICAL,"col2" CATEGORICAL);')

        # col6's values are constructed in such a way as to break crosscat.
        # See https://github.com/probcomp/bayeslite/issues/284
        # On validation the column should be ignored; everything else is
        # expected to pass through unchanged.
        expected_validated = dict(expected_json)
        expected_validated['columns'] = dict(expected_json['columns'])
        expected_validated['columns']['col6'] = {
            'stattype': 'IGNORE',
            'guessed': 'NUMERICAL',
            'reason': 'Caused ANALYZE to error',
        }
        validated = mml_utils.validate_schema(bdb, 't', schema_json)
        assert validated == expected_validated
# NOTE(review): script fragment (satellites / Kepler composer example),
# collapsed onto a single physical line and truncated partway through the
# CREATE GENERATOR column list -- the closing parenthesis and closing
# triple-quote of the BQL string are missing, and `outdir` is defined
# elsewhere.  Because the whole fragment now begins with '#', it is inert
# comment text as-is.  Left byte-identical; recover the full statement from
# the original example before editing this code.
# Find the satellites file. PATH_KEPLER = os.path.dirname(os.path.abspath(__file__)) PATH_EXAMPLES = os.path.dirname(PATH_KEPLER) PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites') PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv') composer = Composer() composer.register_foreign_predictor(keplers_law.KeplersLaw) composer.register_foreign_predictor(random_forest.RandomForest) if os.path.exists(os.path.join(outdir, 'kepler.bdb')): os.remove(os.path.join(outdir, 'kepler.bdb')) bdb = bayeslite.bayesdb_open(os.path.join(outdir, 'kepler.bdb')) bayeslite.bayesdb_register_metamodel(bdb, composer) bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV, header=True, create=True) bdbcontrib.query(bdb, ''' CREATE GENERATOR sat_kepler FOR satellites USING composer( default ( Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL, Users CATEGORICAL, Purpose CATEGORICAL, Class_of_Orbit CATEGORICAL, Perigee_km NUMERICAL, Apogee_km NUMERICAL, Eccentricity NUMERICAL, Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL, Power_watts NUMERICAL, Date_of_Launch NUMERICAL, Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL, Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL, Launch_Vehicle CATEGORICAL, Source_Used_for_Orbital_Data CATEGORICAL, longitude_radians_of_geo NUMERICAL, Inclination_radians NUMERICAL
def test_read_csv():
    """Exercise bayesdb_read_csv / bayesdb_read_csv_file flag handling.

    Covers: flag combinations that must raise ValueError, headers with
    empty column names (IOError), creating vs. appending to a table, the
    inferred column schema recorded in sqlite_master, reading from a file,
    and the BQL CREATE TABLE ... FROM <csv-file> syntax.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.  Dropping the
            # first character of the header leaves an empty first column
            # name.
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # A well-formed header+data read should succeed and create t.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1,2,3,'foo','bar',u'nan',u'',u'quagga'),
            (4,5,6,'baz','quux',42.0,u'',u'eland'),
            (7,8,6,'zot','mumble',87.0,u'zoot',u'caribou'),
        ]
        # Header-only input with ifnotexists=True is a no-op on an existing
        # table.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless data appends to the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Header+data with create=False also appends.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Same append path, but via a real file on disk.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name,
                header=True, create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
        # Test the BQL CREATE TABLE FROM <csv-file> syntax.  (A dead
        # `f = StringIO.StringIO(csv_hdrdata)` assignment formerly sat
        # here; `f` was immediately rebound below without being read.)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name,))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data
        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b', ',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't3', temp.name, header=True, create=True)