def data_from_config(config_file):
    """Load the trial history described by a config file and convert it.

    Every ``Trial`` stored in the trials database referenced by
    ``config_file`` is reduced to a
    ``[parameters, test_scores, status, elapsed]`` record, and the records
    are handed to ``get_data`` together with the configured search space.

    Parameters
    ----------
    config_file
        Passed straight to ``Config``.

    Returns
    -------
    tuple
        The tuple returned by ``get_data`` (features, scores, elapsed time
        in seconds, etc.) with the search space appended as the final
        element.
    """
    cfg = Config(config_file)
    db_session = cfg.trials()
    space = cfg.search_space()

    # One row per stored trial, in the order the query returns them.
    records = []
    for trial in db_session.query(Trial).all():
        records.append(
            [trial.parameters, trial.test_scores, trial.status, trial.elapsed]
        )

    extracted = get_data(records, space)
    return extracted + (space, )
print('usage: sample_db.py config.yaml sample_size n_samples') exit(1) inp_file = argv[1] num = int(argv[2]) # TOTAL size of samples to use e.g. 100 iter = int(argv[3]) # Number of splits e.g. 5 # This will give 5 splits of 20 samples each.100 if num % iter != 0: print('sample_size not strictly divisible by n_samples') exit(1) # Get original database and history config1 = Config(inp_file) df1 = config1.trial_results() hist1 = config1.trials().query(Trial).all() # Main loop for name, group in df1.groupby('project_name'): # Sample the group sample = group.sample(num, random_state=42) cv = KFold(n_splits=iter, random_state=42) all_keep = sample['id'].values for i, (_, test_idx) in enumerate(cv.split(all_keep)): keep = all_keep[test_idx] db2 = make_session('sqlite:///osprey-trials-{0}-{1}.db'.format(int(num/iter), i), project_name=name) # Get the relevant trial objects from original db