def refresh_data_samples(name, num_tables=50, organization='default'):
    """Refresh table samples by data source name."""
    # name: data source name, compared with == against DataSource.name.
    # num_tables: forwarded unchanged to refresh_samples().
    # organization: slug used to look up the owning organization.
    #
    # Only the lookup is guarded. Keeping refresh_samples() outside the try
    # ensures a NoResultFound raised while refreshing is not misreported as
    # a missing data source.
    try:
        org = models.Organization.get_by_slug(organization)
        data_source = models.DataSource.query.filter(
            models.DataSource.name == name,
            models.DataSource.org == org).one()
    except NoResultFound:
        print("Couldn't find data source named: {}".format(name))
        # Same exit status as exit(1), without depending on the helper that
        # the site module injects for interactive sessions.
        raise SystemExit(1)

    print("Refreshing samples for data source: {} (id={})".format(
        name, data_source.id))
    refresh_samples(data_source.id, num_tables)
def test_refresh_samples_applied_to_one_data_source(self):
    # Two data sources both have sampling switched on and a schema loaded,
    # but samples are refreshed for the first one only.
    target_source = self.factory.create_data_source()
    other_source = self.factory.create_data_source()
    both_sources = (target_source, other_source)

    for source in both_sources:
        source.query_runner.configuration['samples'] = True
    for source in both_sources:
        refresh_schema(source.id)

    refresh_samples(target_source.id, 50)

    # The number of rows carrying a sample timestamp must equal the size
    # of the default schema fixture.
    sampled_rows = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None))
    sampled_count = sampled_rows.count()
    self.assertEqual(sampled_count, len(self.default_schema_return_value))
def test_refresh_samples_refreshes(self):
    # A second refresh_samples() call must bump sample_updated_at again
    # once the stored timestamps have been pushed into the past.
    column_name = 'new_column'
    table_total = 5
    checkpoint = utils.utcnow()

    self.patched_get_schema.return_value = [
        {
            'name': 'table{}'.format(index),
            'columns': [column_name],
            'metadata': [{
                'name': column_name,
                'type': self.COLUMN_TYPE,
            }]
        }
        for index in range(table_total)
    ]

    self.factory.data_source.query_runner.configuration['samples'] = True
    source_id = self.factory.data_source.id
    refresh_schema(source_id)
    refresh_samples(source_id, 50)

    # Every one of the 5 tables should now carry a sample timestamp newer
    # than the checkpoint taken at the start of the test.
    sampled = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None))
    self.assertEqual(sampled.count(), table_total)
    self.assertTrue(sampled.first().sample_updated_at > checkpoint)

    # Age the stored timestamps by 30 days and persist the change.
    sampled.update({
        'sample_updated_at': utils.utcnow() - datetime.timedelta(days=30)
    })
    models.db.session.commit()

    # Refresh again and check the timestamp is newer than a new checkpoint.
    checkpoint = utils.utcnow()
    refresh_samples(source_id, 50)

    resampled = TableMetadata.query.filter(
        TableMetadata.sample_updated_at.isnot(None))
    self.assertTrue(resampled.first().sample_updated_at > checkpoint)
def test_refresh_samples_rate_limits(self):
    # With 105 tables and a limit of 50 per call, the tables still lacking
    # a sample timestamp should shrink from 55 to 5 to 0 over three calls.
    column_name = 'new_column'
    table_total = 105

    self.patched_get_schema.return_value = [
        {
            'name': 'table{}'.format(index),
            'columns': [column_name],
            'metadata': [{
                'name': column_name,
                'type': self.COLUMN_TYPE,
            }]
        }
        for index in range(table_total)
    ]

    def unsampled_rows():
        # Rows that have not received a sample timestamp yet.
        return TableMetadata.query.filter(
            TableMetadata.sample_updated_at.is_(None)).all()

    self.factory.data_source.query_runner.configuration['samples'] = True
    source_id = self.factory.data_source.id
    refresh_schema(source_id)
    refresh_samples(source_id, 50)

    # All 105 tables are recorded.
    self.assertEqual(TableMetadata.query.count(), table_total)

    # First call handled 50 tables, leaving 55 untouched.
    self.assertEqual(len(unsampled_rows()), 55)

    # Second call handles 50 more, leaving 5.
    refresh_samples(source_id, 50)
    self.assertEqual(len(unsampled_rows()), 5)

    # Third call finishes the remainder.
    refresh_samples(source_id, 50)
    self.assertEqual(len(unsampled_rows()), 0)