Example #1
def upload_files(share_name,
                 share_path,
                 local_path,
                 username=None,
                 overwrite=True,
                 show_progress=True):
    '''
    uploads files specified by local_path to the datastore path formed by:
        share_name/username/share_path.

        :param share_name: the XT share name where files will be stored (usually one of: data, models, trajectories)
        :param share_path: the path where file(s) will be stored on the share (e.g., "maze" or "procgen")
        :param local_path: the path to the local files to be uploaded
        :param username: the username associated with the data on the share (if None, will use OS username)
        :param overwrite: if False, existing files will not be overwritten (not yet supported)
        :param show_progress: if True, progress messages will be printed 
    '''
    if username is None:
        username = utils.get_username()

    share_path = os.path.join(username, share_path)
    share_path = share_path.replace("\\", "/")

    # use XT to prevent interactive authentication (which will fail for remote runs)
    xt_run = Run()
    results = xt_run.upload_files_to_share(share_name,
                                           share_path,
                                           local_path,
                                           show_feedback=show_progress)
    return results
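
A call sketch for upload_files above (the share and path names are illustrative; the function itself prepends the OS username to share_path before uploading):

# e.g. uploads ./datasets/maze/* to <data share>/<username>/maze on the XT store
results = upload_files(share_name="data",
                       share_path="maze",
                       local_path="./datasets/maze/*")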
Example #2
    def init_xt_run(self, logging, tb_path, args):
        # init xtlib
        self.run = None

        if args.xtlib and (os.getenv("XT_RUN_NAME") or tb_path):
            # access to the XTLib API
            from xtlib.run import Run as XTRun

            # create an instance of XTRunLog to log info for current run
            print("---> tb_path=", tb_path)
            self.run = XTRun(xt_logging=logging, aml_logging=logging,
                             checkpoints_enabled=logging, tensorboard_path=tb_path)

            #utils.debug_break()

            if args.tag_job:
                self.run.tag_job({"plotted_metric": "test_acc"})

            # if "call search API" test was specified and if we are running under XT
            if args.search_api and self.run.run_name:
                fn_sweeps = os.path.join(file_utils.get_my_file_dir(__file__), "miniSweeps.yaml")
                sweeps = file_utils.load_yaml(fn_sweeps)
                hp_space_dict = sweeps[constants.HPARAMS_DIR]
                print("hp_space_dict=", hp_space_dict)
                search_type = "random"

                hp_set = self.run.get_next_hp_set_in_search(hp_space_dict, search_type=search_type)
                print("hp_set=", hp_set)

                # apply to args
                for name, value in hp_set.items():
                    setattr(args, name, value)
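
A minimal call sketch for init_xt_run above, assuming it is a method of the Trainer class from Example #9 and that args is the parsed command line providing the flags it reads (xtlib, tag_job, search_api):

import argparse

trainer = Trainer()  # the Trainer class from Example #9
args = argparse.Namespace(xtlib=True, tag_job=False, search_api=False)
trainer.init_xt_run(logging=True, tb_path=None, args=args)
# with XT_RUN_NAME unset and tb_path=None, self.run stays None and XT logging is skipped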
Example #3
 def __init__(self):
     self.file_paths = []
     self.stage_num = 0
     self.last_metric = None
     self.last_y_axis_name = None
     self.last_x_axis_name = None
     self.last_x_axis_value = None
     self.last_stage_num = None
     # XT related
     self.xt_run_name = os.getenv("XT_RUN_NAME", None)
     self.xt_run = None
     if self.xt_run_name:
         from xtlib.run import Run as XTRun
         self.xt_run = XTRun()
Example #4
 def __init__(self):
     """
     Preprocesses the runspec before the call to yaml.load().
     Manages communication with XT regarding hyperparameters.
     """
     self.uploaded_hp_config_filename = 'uploaded_hp_config.yaml'
     self.downloaded_hp_config_filename = 'downloaded_hp_config.yaml'
     self.xt_run_name = os.getenv("XT_RUN_NAME")
     self.xt_run = None
     self.in_hp_search = False
     self.randint_in_spec = False
     if self.xt_run_name:
         from xtlib.run import Run as XTRun
         self.xt_run = XTRun()
         if os.path.isfile(self.downloaded_hp_config_filename):
             self.in_hp_search = True
     self.hparams = []
Example #5
 def __init__(self):
     self.file_paths = []
     self.stage_num = 0
     self.last_metric = None
     self.last_y_axis_name = None
     self.last_x_axis_name = None
     self.last_x_axis_value = None
     self.last_stage_num = None
     # XT related
     self.xt_run_name = os.getenv("XT_RUN_NAME", None)
     self.xt_run = None
     if self.xt_run_name:
         from xtlib.run import Run as XTRun
         self.xt_run = XTRun()
     self.estimate_record = {}
     self.mse_record = {}
     self.all_estimators = ['PDIS', 'WPDIS', 'MB-K', 'LSTD', 'LSTDQ', 'TDREG-K', 'MWL', 'MSWL', 'MQL', 'DualDICE', \
         'TDREG-N', 'FQE', 'MB-N', 'W-Regression', 'FL', 'On_Policy', 'Behavior']
     for estimator in self.all_estimators:
         self.estimate_record[estimator] = []
         self.mse_record[estimator] = []
Example #6
class NexusLogger():
    '''
        Centralizes various forms of low-frequency output, such as occasional metric reports.
        Not intended for high-frequency logging (multiple calls per second throughout a run).
    '''
    def __init__(self):
        self.file_paths = []
        self.stage_num = 0
        self.last_metric = None
        self.last_y_axis_name = None
        self.last_x_axis_name = None
        self.last_x_axis_value = None
        self.last_stage_num = None
        # XT related
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()

    def add_output_file(self, file_path):
        # Add one console output file.
        file_path = os.path.abspath(file_path)
        self.file_paths.append(file_path)
        utils.ensure_dir_exists(file=file_path)
        with open(file_path, 'w'):
            pass  # create (or truncate) the file

    def write_line(self, line):
        # Write one line to stdout and all console files.
        print(line)
        for path in self.file_paths:
            with open(path, 'a') as output_file:
                output_file.write(line + '\n')

    def write_and_condense_metrics(self, total_seconds, x_axis_name,
                                   x_axis_value, saved, metrics, tf_writer):
        '''
            Outputs the given metric values for the last reporting period and condenses the metric.
        '''
        hours = total_seconds / 3600
        self.last_x_axis_name = x_axis_name
        self.last_x_axis_value = x_axis_value
        self.last_stage_num = self.stage_num

        # Report one line.
        sz = "{:7.3f} hrs  {:12,d} {}".format(hours, x_axis_value, x_axis_name)

        # Write one line of formatted metrics.
        for metric in metrics:
            sz_format = '      {} {{}}'.format(metric.formatting_string)
            sz += sz_format.format(metric.aggregate_value, metric.short_name)
        if saved:
            sz += "    SAVED"
        self.write_line(sz)

        if self.xt_run:
            # Log metrics to XT
            xt_metrics = {}
            xt_metrics["hrs"] = hours
            xt_metrics[x_axis_name] = x_axis_value
            for metric in metrics:
                xt_metrics[metric.short_name] = metric.aggregate_value
            self.xt_run.log_metrics(data_dict=xt_metrics,
                                    step_name=x_axis_name,
                                    stage='s{}'.format(self.stage_num))

        if tf_writer:
            # Log metrics to tensorboard.
            for metric in metrics:
                tf_writer.add_scalar(metric.long_name, metric.aggregate_value,
                                     x_axis_value)
            tf_writer.flush()

        # Condense the metrics
        for metric in metrics:
            metric.condense_values()

    def summarize_stage(self, metric):
        '''
            Outputs the metric value for the entire processing stage.
        '''
        # Condense any values accumulated since the last report.
        metric.condense_values()
        sz_format = 'Stage summary (mean {{}}):  {}'.format(metric.formatting_string)
        self.write_line(sz_format.format(metric.long_name, metric.lifetime_value))
        self.last_metric = metric
        self.last_y_axis_name = metric.short_name
        return metric.lifetime_value

    def finish_run(self, in_hp_search):
        '''
            Outputs the final stage's summary metric as hpmax (used for hyperparameter tuning).
        '''
        if self.last_metric:
            # Log hpmax.
            explanation = 'Objective that would be maximized by hyperparameter tuning (hpmax):'
            hpmax = self.last_metric.lifetime_value
            if not self.last_metric.higher_is_better:
                hpmax = -hpmax
            if self.xt_run:
                # Log hpmax to XT
                xt_metrics = {}
                xt_metrics[self.last_x_axis_name] = self.last_x_axis_value
                xt_metrics['hpmax'] = hpmax
                self.xt_run.log_metrics(data_dict=xt_metrics,
                                        step_name=self.last_x_axis_name)
                self.xt_run.tag_job({
                    'plotted_metric':
                    's{}-{}'.format(self.last_stage_num, self.last_y_axis_name)
                })
                # self.xt_run.tag_job({'primary_metric': 'hpmax'})  # To override xt_config.yaml's default of 'hpmax'.
                # self.xt_run.tag_job({'step_name': 'iters'})  # To override xt_config.yaml's default of 'iters'.
                if in_hp_search:
                    explanation = 'Objective being maximized by hyperparameter tuning (hpmax):'
            sz_format = '{}  {}\n'.format(explanation,
                                          self.last_metric.formatting_string)
            self.write_line(sz_format.format(hpmax))
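
A usage sketch for NexusLogger above. The real Metric class is not shown here, so FakeMetric below is a hypothetical stand-in exposing only the attributes the logger reads; the sketch also assumes XT_RUN_NAME is unset, so no XT run is created.

class FakeMetric:
    short_name = "acc"
    long_name = "accuracy"
    formatting_string = "{:6.4f}"
    aggregate_value = 0.93
    lifetime_value = 0.91
    higher_is_better = True

    def condense_values(self):
        pass  # a real metric would fold per-period values into lifetime stats

logger = NexusLogger()
logger.write_and_condense_metrics(total_seconds=3600, x_axis_name="iters",
                                  x_axis_value=10000, saved=False,
                                  metrics=[FakeMetric()], tf_writer=None)
logger.summarize_stage(FakeMetric())
logger.finish_run(in_hp_search=False)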
Example #7
class OPELogger():
    '''
        Centralizes various forms of low-frequency output, such as occasional metric reports.
        Not intended for high-frequency logging (multiple calls per second throughout a run).
    '''
    def __init__(self):
        self.file_paths = []
        self.stage_num = 0
        self.last_metric = None
        self.last_y_axis_name = None
        self.last_x_axis_name = None
        self.last_x_axis_value = None
        self.last_stage_num = None
        # XT related
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()
        self.estimate_record = {}
        self.mse_record = {}
        self.all_estimators = ['PDIS', 'WPDIS', 'MB-K', 'LSTD', 'LSTDQ', 'TDREG-K', 'MWL', 'MSWL', 'MQL', 'DualDICE', \
            'TDREG-N', 'FQE', 'MB-N', 'W-Regression', 'FL', 'On_Policy', 'Behavior']
        for estimator in self.all_estimators:
            self.estimate_record[estimator] = []
            self.mse_record[estimator] = []
        # self.estimate_record['On_Policy'] = []
        # self.estimate_record['LSTDQ'] = []
        # self.mse_record['LSTDQ'] = []
        # self.estimate_record['MB'] = []
        # self.mse_record['MB'] = []
        # self.estimate_record['TDREG_Neural'] = []
        # self.mse_record['TDREG_Neural'] = []
        # self.estimate_record['FQE'] = []
        # self.mse_record['FQE'] = []


    def add_output_file(self, file_path):
        # Add one console output file.
        file_path = os.path.abspath(file_path)
        self.file_paths.append(file_path)
        utils.ensure_dir_exists(file=file_path)
        with open(file_path, 'w'):
            pass  # create (or truncate) the file

    def write_line(self, line):
        # Write one line to stdout and all console files.
        print(line)
        for path in self.file_paths:
            with open(path, 'a') as output_file:
                output_file.write(line + '\n')

    def write_ope_metrics(self, dataset_seed, metrics, result):
        # report one line
        formatting_string = '{:6.4f}'
        sz = "Dataset {} - Relative Error:".format(dataset_seed)
        for estimator, error in metrics.items():
            sz_format = ' {{}}: {}'.format(formatting_string)
            sz += sz_format.format(estimator, error)
        self.write_line(sz)
        self.estimate_record['On_Policy'].append(result['On_Policy'])
        self.estimate_record['Behavior'].append(result['Behavior'])
        for estimator, error in metrics.items():
            self.estimate_record[estimator].append(result[estimator])
            self.mse_record[estimator].append(error)
        # summary_metrics = {}
        # summary_metrics['On_Policy'] = sum(self.estimate_record['On_Policy']) / len(self.estimate_record['On_Policy'])
        # summary_metrics['LSTDQ'] = sum(self.estimate_record['LSTDQ']) / len(self.estimate_record['LSTDQ'])
        # summary_metrics['squared_error'] = sum(self.mse_record['LSTDQ']) / len(self.mse_record['LSTDQ'])
        # print(summary_metrics)

        # if self.xt_run:
        #     xt_metrics = {}
        #     xt_metrics["True Val"] = result['On_Policy']
        #     # xt_metrics[x_axis_name] = x_axis_value
        #     for estimator, error in metrics.items():
        #         xt_metrics[estimator] = result[estimator]
        #         xt_metrics['squared_error'] = error
        #         # xt_metrics[estimator] = error
        #     # self.xt_run.log_metrics(data_dict=xt_metrics, step_name="Dataset")
        #     self.xt_run.log_metrics(data_dict=xt_metrics)


    def write_and_condense_metrics(self, total_seconds, x_axis_name, x_axis_value, saved, metrics, tf_writer):
        '''
            Outputs the given metric values for the last reporting period and condenses the metric.
        '''
        hours = total_seconds / 3600
        self.last_x_axis_name = x_axis_name
        self.last_x_axis_value = x_axis_value
        self.last_stage_num = self.stage_num

        # Report one line.
        sz = "{:7.3f} hrs  {:12,d} {}".format(hours, x_axis_value, x_axis_name)

        # Write one line of formatted metrics.
        for metric in metrics:
            sz_format = '      {} {{}}'.format(metric.formatting_string)
            sz += sz_format.format(metric.aggregate_value, metric.short_name)
        if saved:
            sz += "    SAVED"
        self.write_line(sz)

        if self.xt_run:
            # Log metrics to XT
            xt_metrics = {}
            xt_metrics["hrs"] = hours
            xt_metrics[x_axis_name] = x_axis_value
            for metric in metrics:
                xt_metrics[metric.short_name] = metric.aggregate_value
            self.xt_run.log_metrics(data_dict=xt_metrics, step_name=x_axis_name, stage='s{}'.format(self.stage_num))

        if tf_writer:
            # Log metrics to tensorboard.
            for metric in metrics:
                tf_writer.add_scalar(metric.long_name, metric.aggregate_value, x_axis_value)
            tf_writer.flush()

        # Condense the metrics
        for metric in metrics:
            metric.condense_values()

    def summarize_stage(self, metric):
        '''
            Outputs the metric value for the entire processing stage.
        '''
        metric.condense_values() # Condense any values accumulated since the last report.
        sz_format = 'Stage summary (mean {{}}):  {}'.format(metric.formatting_string)
        self.write_line(sz_format.format(metric.long_name, metric.lifetime_value))
        self.last_metric = metric
        self.last_y_axis_name = metric.short_name
        return metric.lifetime_value

    def finish_run(self, in_hp_search):
        '''
            Outputs the final stage's summary metric as hpmax (used for hyperparameter tuning).
        '''
        summary_metrics = {}
        summary_metrics['On_Policy'] = sum(self.estimate_record['On_Policy']) / len(self.estimate_record['On_Policy'])
        summary_metrics['Behavior'] = sum(self.estimate_record['Behavior']) / len(self.estimate_record['Behavior'])
        # summary_metrics['MB'] = sum(self.estimate_record['MB']) / len(self.estimate_record['MB'])
        # summary_metrics['squared_error'] = sum(self.mse_record['MB']) / len(self.mse_record['MB'])
        # summary_metrics['TDREG_Neural'] = sum(self.estimate_record['TDREG_Neural']) / len(self.estimate_record['TDREG_Neural'])
        # summary_metrics['squared_error'] = sum(self.mse_record['TDREG_Neural']) / len(self.mse_record['TDREG_Neural'])
        for estimator in self.all_estimators:
            if estimator != 'On_Policy' and estimator != 'Behavior' and len(self.estimate_record[estimator]) > 0:
                summary_metrics[estimator] = sum(self.estimate_record[estimator]) / len(self.estimate_record[estimator])
                summary_metrics[estimator+'_se'] = sum(self.mse_record[estimator]) / len(self.mse_record[estimator])
        # summary_metrics['FQE'] = sum(self.estimate_record['FQE']) / len(self.estimate_record['FQE'])
        # summary_metrics['squared_error'] = sum(self.mse_record['FQE']) / len(self.mse_record['FQE'])

        # print(summary_metrics)

        if self.xt_run:
            self.xt_run.log_metrics(data_dict=summary_metrics)
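
A call sketch for OPELogger.write_ope_metrics above (the numbers are illustrative). Expected shapes: metrics maps estimator name to its relative error, and result maps estimator name to its value estimate and must also contain 'On_Policy' and 'Behavior' entries; estimator names must appear in all_estimators.

logger = OPELogger()  # assumes XT_RUN_NAME is unset, so no XT run is created
metrics = {"FQE": 0.042, "MB-N": 0.105}                 # estimator -> relative error
result = {"FQE": 0.93, "MB-N": 0.88,
          "On_Policy": 0.95, "Behavior": 0.71}          # estimator -> value estimate
logger.write_ope_metrics(dataset_seed=0, metrics=metrics, result=result)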
Example #8
    def runner(self, concurrent_index, job_id, delay, duration, child_count,
               reports, search_type):
        ws_name = "quick-test"
        exper_name = "qtexper"

        fn = "code/miniSweeps.yaml"
        yd = file_utils.load_yaml(fn)
        hd = yd[constants.HPARAM_DIST]

        # simulate a controller for each concurrent runner
        hparam_search = HParamSearch()

        for index in range(child_count):
            # create a new RUN record
            run_name = self.store.start_run(ws_name,
                                            exper_name=exper_name,
                                            is_parent=False,
                                            job_id=job_id,
                                            node_index=0,
                                            search_type=search_type,
                                            search_style="dynamic")

            os.environ["XT_RUN_NAME"] = run_name
            os.environ["XT_WORKSPACE_NAME"] = ws_name
            os.environ["XT_EXPERIMENT_NAME"] = exper_name

            fake_context = cmd_core.build_mock_context(self.config, job_id,
                                                       ws_name, exper_name,
                                                       run_name)
            metric_name = fake_context.primary_metric

            xt_run = Run(self.config, self.store, supress_normal_output=True)
            xt_run.direct_run = True
            xt_run.context = fake_context

            #print("  starting: concurrent_index={}, child_index={}".format(concurrent_index, index))
            # delay start
            sleep_time = delay * random.random()
            time.sleep(sleep_time)

            hp_set = xt_run.get_next_hp_set_in_search(
                hd, search_type, hparam_search=hparam_search)
            self._assert("channels1" in hp_set)

            # log HPARAMS
            xt_run.log_hparams(hp_set)

            for i in range(reports):
                run_time = (duration / reports) * random.random()
                time.sleep(run_time)

                # log METRICS
                fake_metric = random.random()
                md = {"epoch": 1 + i, "acc": fake_metric}
                xt_run.log_metrics(md, step_name="epoch", stage="test")

            # mark the run as completed
            xt_run.close()
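
A hypothetical invocation of the runner test method above, from within the test class that defines it (all values are illustrative):

self.runner(concurrent_index=0, job_id="job1234", delay=2.0, duration=10.0,
            child_count=3, reports=5, search_type="random")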
Example #9
class Trainer():
    def __init__(self):
        pass

    def train(self, args, model, device, optimizer, epoch):
        model.train()
        total_correct = 0
        total = 0
        steps = 0

        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

            # compute train-acc
            pred = output.argmax(
                dim=1,
                keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            total_correct += correct
            total += len(data)
            steps += 1

        return loss.item(), total_correct / total, steps, len(
            data), loss, total_correct, total

    def test(self, args, model, device):
        test_loader = self.test_loader

        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(
                    output, target,
                    reduction='sum').item()  # sum up batch loss
                pred = output.argmax(
                    dim=1,
                    keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        test_acc = correct / len(test_loader.dataset)

        print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, len(test_loader.dataset),
                     100. * test_acc))

        return test_loss, test_acc

    def get_dataset(self, data_dir, train, auto_download):
        ds = datasets.MNIST(
            data_dir,
            train=train,
            download=auto_download,
            transform=transforms.Compose([
                # PIL transforms
                #transforms.Resize(22),
                #transforms.Resize(28),
                #transforms.RandomCrop(28),
                #transforms.RandomHorizontalFlip(),
                #transforms.RandomRotation(3, resample=PIL.Image.BILINEAR),
                # TENSOR transforms
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, )),

                # requires pytorch 1.2
                #transforms.RandomErasing(p=.25, value="random"),
            ]))
        return ds

    def sample_mnist(self, data_dir, train, rand, percent, auto_download):

        # get MNIST data
        ds = self.get_dataset(data_dir, train, auto_download)

        # support previous torchvision version as well as current  (AML workaround)
        if hasattr(ds, "data"):
            data_attr = "data"
            target_attr = "targets"
        elif train:
            data_attr = "train_data"
            target_attr = "train_labels"
        else:
            data_attr = "test_data"
            target_attr = "test_labels"

        # extract data and targets
        data = getattr(ds, data_attr)
        targets = getattr(ds, target_attr)

        count = len(data)
        indexes = list(range(count))

        rand.shuffle(indexes)

        samples = int(count * percent)
        indexes = indexes[0:samples]

        # update data
        setattr(ds, data_attr, data[indexes])

        # update targets
        setattr(ds, target_attr, targets[indexes])

        which = "TRAIN" if train else "TEST"
        print("Sampled " + which + " data: ", len(data), ", targets=",
              len(targets))
        return ds

    def save_model(self, model, fn):
        # ensure output dir exists
        dir = os.path.dirname(fn)
        if not os.path.exists(dir):
            os.makedirs(dir)

        torch.save(model.state_dict(), fn)

    def text_log(self, msg):
        with open(self.fn_text_log, "a") as outfile:
            outfile.write(msg + "\n")

    def log_stats_and_test(self, epoch, steps, data_len, loss, total_correct,
                           total, model, device, checkpoint_freq, run,
                           train_loss, train_acc, args):

        msg = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f}'.format(
            epoch, steps * data_len, len(self.train_loader.dataset),
            100. * steps / len(self.train_loader), loss.item(),
            total_correct / total)

        # print to console
        print(msg)

        # log to simple text logger
        self.text_log(msg)

        if checkpoint_freq and run and run.store:
            if self.checkpoint_units == "epochs" and epoch % checkpoint_freq == 0:
                cp_now = True
            elif self.checkpoint_units == "mins" and (
                    time.time() - self.last_checkpoint > checkpoint_freq * 60):
                cp_now = True
            else:
                cp_now = False

            if cp_now:
                self.checkpoint_count += 1
                print("checkpointing model (#{})\n".format(self.checkpoint_count))

                self.save_model(model, self.fn_checkpoint)
                run.set_checkpoint({"epoch": epoch}, self.fn_checkpoint)
                self.last_checkpoint = time.time()

        if run:
            # log TRAINING stats
            run.log_metrics(
                {
                    "epoch": epoch,
                    "loss": train_loss,
                    "acc": train_acc
                },
                step_name="epoch",
                stage="train")

            # log EVAL/TEST stats half as often
            if (epoch / args.log_interval) % 2 == 0:
                test_loss, test_acc = self.test_model_and_log_metrics(
                    run, model, device, epoch, args)

                # early stopping
                if math.isnan(test_loss):
                    run.log_event("early_stopping", {"reason": "loss_is_nan"})
                    # exit without error
                    sys.exit(0)

    def test_model_and_log_metrics(self, run, model, device, epoch, args):
        # TEST the model
        test_loss, test_acc = self.test(args, model, device)

        # log TEST METRICS
        #print("test_loss=", test_loss, ", test_acc=", test_acc)
        run.log_metrics({
            "epoch": epoch,
            "loss": test_loss,
            "acc": test_acc
        },
                        step_name="epoch",
                        stage="test")

        return test_loss, test_acc

    def train_test_loop(self, run, model, device, optimizer, start_epoch,
                        checkpoint_freq, args):

        total_steps = 0
        start = time.time()
        print("train_test_loop: start_epoch={} end_epoch={}\n".format(
            start_epoch, args.epochs + 1))

        for epoch in range(start_epoch, args.epochs + 1):

            # train an epoch
            train_loss, train_acc, steps, data_len,  loss, total_correct, total = \
                self.train(args, model, device, optimizer, epoch)

            total_steps += steps

            if epoch % args.log_interval == 0:
                elapsed = time.time() - start
                #print("{} epoch(s) training took: {:.2f} secs".format(args.log_interval, elapsed))

                self.log_stats_and_test(epoch, steps, data_len, loss,
                                        total_correct, total, model, device,
                                        checkpoint_freq, run, train_loss,
                                        train_acc, args)

                start = time.time()

    def init_xt_run(self, logging, tb_path, args):
        # init xtlib
        self.run = None

        if args.xtlib and (os.getenv("XT_RUN_NAME") or tb_path):
            # access to the XTLib API
            from xtlib.run import Run as XTRun

            # create an instance of XTRunLog to log info for current run
            print("---> tb_path=", tb_path)
            self.run = XTRun(xt_logging=logging,
                             aml_logging=logging,
                             checkpoints_enabled=logging,
                             tensorboard_path=tb_path)

            #utils.debug_break()

            if args.tag_job:
                self.run.tag_job({"plotted_metric": "test_acc"})

            # if "call search API" test was specified and if we are running under XT
            if args.search_api and self.run.run_name:
                fn_sweeps = os.path.join(file_utils.get_my_file_dir(__file__),
                                         "miniSweeps.yaml")
                sweeps = file_utils.load_yaml(fn_sweeps)
                hp_space_dict = sweeps[constants.HPARAMS_DIR]
                print("hp_space_dict=", hp_space_dict)
                search_type = "random"

                hp_set = self.run.get_next_hp_set_in_search(hp_space_dict,
                                                            search_type=search_type)
                print("hp_set=", hp_set)

                # apply to args
                for name, value in hp_set.items():
                    setattr(args, name, value)

    def init_datasets(self, data_dir, use_cuda, args):
        kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

        # load subset of training and test data
        ds_train = self.sample_mnist(data_dir, True, self.rand,
                                     args.train_percent, args.auto_download)
        ds_test = self.sample_mnist(data_dir, False, self.rand,
                                    args.test_percent, args.auto_download)

        if args.distributed:
            # Partition dataset among workers using DistributedSampler
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                ds_train, num_replicas=hvd.size(), rank=hvd.rank())
            shuffle = False
        else:
            train_sampler = None
            shuffle = True

        print("loading TRAIN data...")
        self.train_loader = torch.utils.data.DataLoader(
            ds_train,
            batch_size=args.batch_size,
            shuffle=shuffle,
            sampler=train_sampler,
            **kwargs)

        print("loading TEST data...")
        self.test_loader = torch.utils.data.DataLoader(
            ds_test, batch_size=args.test_batch_size, shuffle=True, **kwargs)

    def init_cuda(self, args):
        #---- CUDA init ----
        cuda_avail = torch.cuda.is_available()
        use_cuda = cuda_avail and args.cuda
        gpu_count = torch.cuda.device_count()

        if use_cuda and not args.parallel:
            torch.cuda.set_device(args.gpu)

        print("  cuda_avail={}, GPU count={}, use_cuda={}, gpu={} ---".format(
            cuda_avail, gpu_count, use_cuda, args.gpu))

        if use_cuda and not cuda_avail:
            # if we cannot find a GPU, consider that a hard error (used to detect problems with seeing Philly GPUs)
            errors.env_error("CUDA not available on this platform")

        if args.distributed:
            # Initialize Horovod
            global hvd
            import horovod.torch as hvd

            hvd.init()
            # Pin GPU to be used to process local rank (one GPU per process)
            print("  distributed: rank={}, size={}".format(
                hvd.rank(), hvd.size()))
            device = torch.device("cuda:" + str(hvd.local_rank()))

            # only log HPARAMS and METRICS for job if running as rank 0
            logging = (hvd.rank() == 0)
        else:
            device = torch.device("cuda" if use_cuda else "cpu")
            logging = True

        return use_cuda, device, logging

    def init_dirs(self, args):
        # set mnt_output_dir (using environment variable setting from xt)
        mnt_output_dir = os.getenv("XT_OUTPUT_MNT", "output")
        mnt_output_dir = os.path.expanduser(mnt_output_dir)
        file_utils.ensure_dir_exists(mnt_output_dir)
        print("writing mnt_output to: " + mnt_output_dir)

        # set local_output_dir (using environment variable setting from xt)
        local_output_dir = "output"
        file_utils.ensure_dir_exists(local_output_dir)
        print("writing local_output to: " + local_output_dir)

        # set data_dir (allowing overridden by environment variable)
        data_dir = os.getenv("XT_DATA_DIR", args.data)
        data_dir = os.path.expanduser(data_dir)
        file_utils.ensure_dir_exists(data_dir)
        print("getting data from: " + data_dir)

        fn_test = data_dir + "/MNIST/processed/test.pt"
        exists = os.path.exists(fn_test)
        print("fn_test={}, exists={}".format(fn_test, exists))

        fn_train = data_dir + "/MNIST/processed/training.pt"
        exists = os.path.exists(fn_train)
        print("fn_train={}, exists={}".format(fn_train, exists))

        if args.download_only:
            print("miniMnist (ensuring data is downloaded)")
            self.get_dataset(data_dir, True, True)
            self.get_dataset(data_dir, False, True)

        return mnt_output_dir, local_output_dir, data_dir

    def print_settings(self, args):
        print("--- miniMnist settings ---")
        print("  command-line args:", sys.argv)

        if args.env_vars:
            print("  env vars:")
            keys = list(os.environ.keys())
            keys.sort()

            for key in keys:
                value = os.environ[key]
                if len(value) > 100:
                    value = value[0:100] + "..."
                print("    {}: {}".format(key, value))

        print("  cwd: " + os.getcwd())
        print("  python: " + sys.version.replace("\n", " "))
        print("  torch.__version__=", torch.__version__)

        # bug workaround: torchvision version 0.4.2 is missing the "__version__" attribute
        if hasattr(torchvision, "__version__"):
            print("  torchvision: " + str(torchvision.__version__))
        else:
            print("  dir(torchvision)=", dir(torchvision))

        in_docker = os.path.exists(".dockerenv") or os.getenv("XT_IN_DOCKER")
        print("  in_docker: " + str(in_docker))

        if args.xtlib:
            import xtlib
            print("  xtlib: " + str(xtlib.__version__))

    def init_model(self, device, args):
        use_cnn = True
        if use_cnn:
            print("created CNN model...")
            model = SimpleCNN(num_mid_conv=args.mid_conv,
                              channels1=args.channels1,
                              channels2=args.channels2,
                              kernel_size=args.kernel_size,
                              mlp_units=args.mlp_units)
        else:
            print("created MLP model...")
            model = MLP()

        gpu_count = torch.cuda.device_count()

        if args.parallel and gpu_count > 1:
            model = nn.DataParallel(model)
            print("using PARALLEL training with {} GPUs".format(gpu_count))
        elif args.parallel:
            print(
                "PARALLEL requested but only found {} GPUs".format(gpu_count))
        else:
            print("using single GPU; gpu_count=", gpu_count)
        model.to(device)

        return model

    def init_random_seeds(self, args):
        #---- random seeds ----
        if args.seed == 0:
            args.seed = int(time.time())
        self.rand = random.Random(args.seed)
        self.fn_checkpoint = "checkpoints/mnist_cnn.pt"
        torch.manual_seed(args.seed)

    def init_stuff(self):
        args = self.args

        mnt_output_dir, local_output_dir, data_dir = self.init_dirs(args)

        self.print_settings(args)
        self.init_random_seeds(args)

        use_cuda, device, logging = self.init_cuda(args)

        print("-------------")

        tb_path = mnt_output_dir if args.tensorboard else None
        self.init_xt_run(logging, tb_path, args)

        self.init_datasets(data_dir, use_cuda, args)

        model = self.init_model(device, args)

        return model, device, mnt_output_dir, local_output_dir

    def apply_runset_file(self, args, fn):
        #utils.debug_break()

        fn = os.path.abspath(fn)
        with open(fn, "rt") as infile:
            yd = yaml.safe_load(infile)

        if not constants.HPARAM_RUNSET in yd:
            errors.internal_error(
                "found runset file without {} property: {}".format(
                    constants.HPARAM_RUNSET, fn))

        print("applying runset file to args: {}".format(fn))

        hd = yd[constants.HPARAM_RUNSET]

        for prop, val in hd.items():
            prop = prop.replace("-", "_")
            setattr(args, prop, val)

    def run(self):

        print("args=", sys.argv)
        self.args = parse_cmdline_args()
        args = self.args

        fn_runset = "runset.yaml"
        if os.path.exists(fn_runset):
            self.apply_runset_file(args, fn_runset)

        model, device, mnt_output_dir, local_output_dir = self.init_stuff()

        start_epoch = 1
        run = self.run

        if args.raise_error:
            #errors.internal_error("Raising an intentional error")
            # try a different type of error
            abc.foo = 1

        # collect hyperparameters (logged to xt when running under XT)
        hp_dict = {
            "seed": args.seed,
            "batch-size": args.batch_size,
            "epochs": args.epochs,
            "lr": args.lr,
            "momentum": args.momentum,
            "channels1": args.channels1,
            "channels2": args.channels2,
            "kernel_size": args.kernel_size,
            "mlp-units": args.mlp_units,
            "weight-decay": args.weight_decay,
            "optimizer": args.optimizer,
            "mid-conv": args.mid_conv,
            "gpu": args.gpu,
            "log-interval": args.log_interval
        }

        if run:
            run.log_hparams(hp_dict)

        if args.cuda:
            # if on linux, show GPU info
            if os.name != "nt":
                os.system("nvidia-smi")

        # print hyperparameters
        print("hyperparameters:", hp_dict)
        print()

        # see if we are resuming a preempted run
        if run and run.resume_name:
            print("resuming from run=", run.resume_name)
            dd = run.get_checkpoint(self.fn_checkpoint)
            if dd and dd["epoch"]:
                model.load_state_dict(torch.load(self.fn_checkpoint))
                start_epoch = 1 + dd["epoch"]

        if args.optimizer == "sgd":
            #print("using SGD optimizer")
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
        else:
            #print("using Adam optimizer")
            optimizer = optim.Adam(model.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)

        if args.distributed:
            optimizer = hvd.DistributedOptimizer(
                optimizer, named_parameters=model.named_parameters())

            # Broadcast parameters from rank 0 to all other processes.
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

        checkpoint_freq = 0
        self.checkpoint_units = ""
        self.last_checkpoint = time.time()
        self.checkpoint_count = 0

        # force a ML app error to kill the app
        #x = foo/bar

        # parse checkpoint arg
        #print("args.checkpoint=", args.checkpoint, ", type(args.checkpoint)", type(args.checkpoint))

        if False:  # args.checkpoint:
            if isinstance(args.checkpoint, (int, float)):
                checkpoint_freq = int(args.checkpoint)
                self.checkpoint_units = "epochs"
            elif isinstance(args.checkpoint, str):
                parts = args.checkpoint.split(' ')
                if len(parts) == 2:
                    checkpoint_freq, self.checkpoint_units = parts
                    checkpoint_freq = float(checkpoint_freq)
                    self.checkpoint_units = self.checkpoint_units.strip().lower()
                else:
                    checkpoint_freq = float(args.checkpoint)
                    self.checkpoint_units = "epochs"

        model_dir = os.getenv("XT_MODEL_DIR", "models/miniMnist")
        fn_model = model_dir + "/mnist_cnn.pt"
        self.fn_text_log = mnt_output_dir + "/text_log.txt"

        if args.eval_model:
            # load model and evaluate it
            print("loading existing MODEL and evaluating it, fn=", fn_model)
            exists = os.path.exists(fn_model)
            print("model exists=", exists)

            model.load_state_dict(torch.load(fn_model))
            print("model loaded!")

            # just test model
            self.test_model_and_log_metrics(run,
                                            model,
                                            device,
                                            epoch=1,
                                            args=args)
        else:
            self.train_test_loop(run,
                                 model,
                                 device,
                                 optimizer,
                                 start_epoch,
                                 checkpoint_freq,
                                 args=args)

        if (args.save_model):
            file_utils.ensure_dir_exists(model_dir)
            self.save_model(model, fn_model)

        # always save a copy of model in the AFTER FILES
        self.save_model(model, "output/mnist_cnn.pt")

        if args.clear_checkpoint_at_end:
            if checkpoint_freq and run and run.store:
                run.clear_checkpoint()

        # create a file to be captured in OUTPUT FILES
        fn_app_log = os.path.join(local_output_dir, "miniMnist_log.txt")
        with open(fn_app_log, "wt") as outfile:
            outfile.write("This is a log for miniMnist app\n")
            outfile.write("miniMnist app completed\n")

        # create a file to be ignored in OUTPUT FILES
        fn_app_log = os.path.join(local_output_dir, "test.junk")
        with open(fn_app_log, "wt") as outfile:
            outfile.write(
                "This is a file that should be omitted from AFTER upload\n")
            outfile.write("end of junk file\n")

        if run:
            # ensure we close all logging
            run.close()
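
A minimal launch sketch for the Trainer above; parse_cmdline_args, SimpleCNN/MLP, and the imported utility modules are assumed to be defined elsewhere in the same script:

if __name__ == "__main__":
    trainer = Trainer()
    trainer.run()  # parses args, applies runset.yaml if present, then trains or evaluates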
Example #10
class HyperparameterHandler():
    def __init__(self):
        """
        Preprocesses the runspec before the call to yaml.load().
        Manages communication with XT regarding hyperparameters.
        """
        self.uploaded_hp_config_filename = 'uploaded_hp_config.yaml'
        self.downloaded_hp_config_filename = 'downloaded_hp_config.yaml'
        self.xt_run_name = os.getenv("XT_RUN_NAME")
        self.xt_run = None
        self.in_hp_search = False
        self.randint_in_spec = False
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()
            if os.path.isfile(self.downloaded_hp_config_filename):
                self.in_hp_search = True
        self.hparams = []

    def split_spec(self, run_spec_file):
        # Read the spec into 3 sections.
        pre_hp_section = []
        hp_section = []
        post_hp_section = []
        current_section = pre_hp_section
        for line in run_spec_file:
            if current_section == pre_hp_section:
                # Look for the start of the hp section.
                if line.startswith('hyperparameters:'):
                    current_section = hp_section
            elif current_section == hp_section:
                # Look for the end of the hp section.
                if line[0] not in ' -#\n\r':
                    current_section = post_hp_section
            else:
                assert current_section == post_hp_section
            # Append this line to the current section.
            current_section.append(line)
        return pre_hp_section, hp_section, post_hp_section

    def preprocess(self, run_spec_file):
        """ Modifies the hyperparameter section of a runspec before yaml.load() is called on it. """

        # Read the spec into 3 sections.
        pre_hp_section, hp_section, post_hp_section = self.split_spec(run_spec_file)

        # Modify the HP section, if present.
        if len(hp_section) > 0:
            self.hparams = self.parse_hp_section(hp_section)
            if self.in_hp_search:
                self.read_hp_config_file()
            else:
                for hp in self.hparams:
                    hp.choose_value(self.in_hp_search)
            parsed_hp_section = ['hyperparameters:\n']
            for hp in self.hparams:
                parsed_hp_section += hp.format_chosen_value()
            parsed_hp_section.append('\n')
        else:
            parsed_hp_section = []

        # Reassemble the modified runspec.
        spec_str = ''.join(pre_hp_section + parsed_hp_section + post_hp_section)

        # Check for randint.
        self.randint_in_spec = 'randint' in spec_str

        # Return the modified runspec.
        return spec_str

    def parse_hp_section(self, hp_section_in):
        """
        Parses the hyperparameters section of a runspec.
        Returns a list of Hparam objects. For example...
        Input string hp_section_in:
            hyperparameters:
              - name: &rscale
                  ordered_tuning_values: [2, 4, 8, 16, 32]
                  tuned_value: 32
              - name: &units
                  ordered_tuning_values: [128, 192, 256, 384, 512]
                  tuned_value: 384
        Output returned:
            List of Hparam objects:
                hp[0].name = 'rscale'
                     .values = [2, 4, 8, 16, 32]
                     .tuned_value = 32
                hp[1].name = 'units'
                     .values = [128, 192, 256, 384, 512]
                     .tuned_value = 384
        """
        hparams = []
        name_line = ''
        values_line = ''
        i = 0
        for full_line in hp_section_in:
            line = full_line.strip()
            if line.startswith('hyperparameters:') or (len(line) == 0) or (line[0] == '#'):
                continue
            if i == 0:
                if line.startswith('- name:'):
                    name_line = line
                    i = 1
                else:
                    raise SyntaxError('First line of a hyperparameter definition must start with "- name:"\n=====> {}'.format(line))
            elif i == 1:
                if (line.startswith('ordered_tuning_values:')) or (line.startswith('unordered_tuning_values:')):
                    values_line = line
                    i = 2
                else:
                    raise SyntaxError('Second line of a hyperparameter definition must start with "ordered_tuning_values:" or "unordered_tuning_values:"\n=====> {}'.format(line))
            elif i == 2:
                if line.startswith('tuned_value:'):
                    hp = Hparam(name_line, values_line, line)
                    hparams.append(hp)
                    i = 0
                else:
                    raise SyntaxError('Third line of a hyperparameter definition must start with "tuned_value:"\n=====> {}'.format(line))
            else:
                raise SyntaxError('Unexpected line in the hyperparameters section of the runspec:{}'.format(line))
        return hparams

    def log_chosen_values(self, logger):
        """ Logs the chosen HP values to the console for reference, and (optionally) to XT. """
        if len(self.hparams) > 0:
            hparam_dict = {}
            logger.write_line("Chosen hyperparameter values:")
            for hp in self.hparams:
                hp.log_chosen_value(logger)
                hparam_dict[hp.name] = hp.chosen_value
            logger.write_line('')
            if self.xt_run:
                self.xt_run.log_hparams(hparam_dict)

    def write_hp_config_file(self):
        """ Generates the file that XT needs to support HP tuning. """
        assert len(self.hparams) > 0, 'Hyperparameters must be specified.'
        # Warn the user if randint is missing from a hyperparameter search.
        if not self.randint_in_spec:
            response = None
            while (response != 'y') and (response != 'n'):
                print("WARNING: Hyperparameter tuning typically requires randomization,")
                print("which is usually achieved by setting the environment or agent seed to randint,")
                print("but randint is missing from this runspec. Are you sure you want to proceed? [y/n]")
                response = input()
            if response == 'n':
                exit(0)
        # Write the hp config file for the job launcher.
        with open(self.uploaded_hp_config_filename, 'w') as hp_config_file:
            hp_config_file.write('hyperparameter-distributions:\n')
            for hp in self.hparams:
                value_list = [hp.yaml_value_from_python(value) for value in hp.values]
                values_str = ', '.join(value_list)
                hp_config_file.write('  {}: [{}]\n'.format(hp.name, values_str))

    def read_hp_config_file(self):
        """ Reads the file containing the HP values chosen by XT. """
        assert len(self.hparams) > 0, 'Hyperparameters must be specified.'
        print('Reading chosen hp values from downloaded_hp_config.yaml')
        with open(self.downloaded_hp_config_filename, 'r') as infile:
            chosen_hp_value_dict = yaml.load(infile, Loader=yaml.Loader)
        hp_runset = chosen_hp_value_dict['hyperparameter-runset']
        # for hp_name in hp_runset:
        #     print('{}  {}'.format(hp_name, hp_runset[hp_name]))
        assert len(hp_runset) == len(self.hparams)
        for hp in self.hparams:
            hp.chosen_value = hp_runset[hp.name]
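
A usage sketch for split_spec above, showing how a runspec is partitioned around its "hyperparameters:" block (the runspec keys shown here are hypothetical, and XT_RUN_NAME is assumed to be unset):

lines = [
    'environment: maze\n',
    'hyperparameters:\n',
    '  - name: &rscale\n',
    '      ordered_tuning_values: [2, 4, 8, 16, 32]\n',
    '      tuned_value: 32\n',
    'agent: a3c\n',
]
handler = HyperparameterHandler()
pre, hp, post = handler.split_spec(lines)
# pre  -> ['environment: maze\n']
# hp   -> ['hyperparameters:\n', '  - name: &rscale\n', ...]
# post -> ['agent: a3c\n']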
Example #11
                    help='number of units in the MLP layer of the model')

# OPTIMIZER
parser.add_argument('--optimizer',
                    type=str,
                    default="sgd",
                    help='sets the optimizer for the model')
parser.add_argument('--weight-decay',
                    type=float,
                    default=0,
                    help='sets rate of weight decay for weights')

args = parser.parse_args()

# create an instance of XTRunLog to log info for current run
run = Run()

# log hyperparameters to xt
hp_dict = {
    "seed": args.seed,
    "batch-size": args.batch_size,
    "epochs": args.epochs,
    "lr": args.lr,
    "momentum": args.momentum,
    "channels1": args.channels1,
    "channels2": args.channels2,
    "kernel_size": args.kernel_size,
    "mlp-units": args.mlp_units,
    "weight-decay": args.weight_decay,
    "optimizer": args.optimizer,
    "mid-conv": args.mid_conv