def get_db():
    """Return the request-scoped Database, creating it on first use.

    The Database instance is cached on Flask's ``g`` object, so every
    call within the same request reuses the same connection; a fresh
    one is built per request from the app's ``SQL_CONF`` settings.
    """
    if 'db' not in g:
        conf = current_app.config['SQL_CONF']
        database = Database(conf.dialect, conf.database, conf.username,
                            conf.password, conf.host, conf.port,
                            conf.query)
        # Make sure the ORM mappers are wired up before handing out sessions.
        check_db_mappers(database)
        database.session = database.get_session()
        g.db = database
    return g.db
def enter_datarun(sql_config, run_config, aws_config=None, upload_data=False,
                  run_per_partition=False):
    """Generate a datarun, including a dataset if necessary.

    Args:
        sql_config: object with all attributes necessary to initialize a
            Database (dialect, database, username, password, host, port,
            query).
        run_config: all attributes necessary to initialize a Datarun,
            including Dataset info if the dataset has not already been
            created (``dataset_id`` may be None).
        aws_config: all attributes necessary to connect to an S3 bucket;
            forwarded to enter_dataset when a new dataset is created.
        upload_data: whether to store processed data in the cloud.
        run_per_partition: if True, create one Datarun per hyperpartition
            instead of a single Datarun covering all of them. This setting
            is useful for debugging.

    Returns:
        list of datarun IDs when run_per_partition is True, otherwise the
        ID of the single generated datarun.
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config,
                                upload_data=upload_data)
    else:
        dataset = db.get_dataset(run_config.dataset_id)

    # create hyperpartitions for the new datarun
    print
    print 'creating hyperpartitions...'
    session = db.get_session()
    method_and_parts = []
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(METHODS_MAP[m])
        method_hyperparitions = method.get_hyperpartitions()
        for method_hyperparition in method_hyperparitions:
            # remember the method key alongside each of its partitions so the
            # Hyperpartition rows below can record which method they belong to
            method_and_parts.append((m, method_hyperparition))
        print 'method', m, 'has', len(method_hyperparitions), 'hyperpartitions'

    # create and save datarun to database
    print
    print 'creating datarun...'

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        # single-datarun mode: one Datarun row shared by every hyperpartition
        datarun = create_datarun(db, session, dataset, run_config)
        session.commit()

    for method, part in method_and_parts:
        # if necessary, create a new datarun for each hyperpartition.
        # This setting is useful for debugging.
        if run_per_partition:
            datarun = create_datarun(db, session, dataset, run_config)
            # commit so datarun.id is populated before it's used below
            session.commit()
            run_ids.append(datarun.id)

        # NOTE(review): if run_per_partition is True and method_and_parts is
        # empty (no methods configured), `datarun` is never bound and the
        # summary/return below would raise NameError — confirm callers always
        # supply at least one method.
        hp = db.Hyperpartition(datarun_id=datarun.id,
                               method=method,
                               tunables=part.tunables,
                               constants=part.constants,
                               categoricals=part.categoricals,
                               status=PartitionStatus.INCOMPLETE)
        session.add(hp)
        session.commit()

    # print a human-readable summary of what was created
    print
    print '========== Summary =========='
    print 'Dataset ID:', dataset.id
    print 'Training data:', dataset.train_path
    print 'Test data:', (dataset.test_path or '(None)')
    if run_per_partition:
        print 'Datarun IDs:', ', '.join(map(str, run_ids))
    else:
        print 'Datarun ID:', datarun.id
    print 'Hyperpartition selection strategy:', datarun.selector
    print 'Parameter tuning strategy:', datarun.tuner
    print 'Budget: %d (%s)' % (datarun.budget, datarun.budget_type)
    print

    # run_ids is non-empty only in run_per_partition mode; otherwise fall
    # back to the single datarun's id
    return run_ids or datarun.id