import json
import os

from hyperopt import fmin, hp, tpe
from hyperopt.mongoexp import MongoTrials
from pymongo.errors import ServerSelectionTimeoutError

from clusterone import get_logs_path


def main():
    mongo_db_host = os.environ["MONGO_DB_HOST"]
    mongo_db_port = os.environ["MONGO_DB_PORT"]
    experiment_name = os.environ["EXPERIMENT_NAME"]

    mongo_connect_str = "mongo://{0}:{1}/foo_db/jobs".format(
        mongo_db_host, mongo_db_port)

    while True:
        try:
            trials = MongoTrials(mongo_connect_str, exp_key=experiment_name)
        except ServerSelectionTimeoutError:
            pass
        else:
            space = {'x': hp.uniform('x', -2, 2)}
            best = fmin(obj,
                        space=space,
                        trials=trials,
                        algo=tpe.suggest,
                        max_evals=100)

            if os.environ["JOB_NAME"] == "ps":
                save_path = os.path.join(get_logs_path("./logs"),
                                         "results.json")
                with open(save_path, "w") as f:
                    json.dump(best, f)

            return
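Note: this example relies on an objective function `obj` that is not shown above. A minimal sketch of the shape hyperopt expects, given the dict search space used here (the name and body below are illustrative assumptions, not the original code):

from hyperopt import STATUS_OK

def obj(params):
    # hyperopt passes the sampled point as a dict matching `space`
    x = params['x']
    return {'loss': x ** 2, 'status': STATUS_OK}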
Example #2
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--fashion', type=str2bool, default=False,
                        help='Use Fashion MNIST data')

    # Model params
    parser.add_argument('--cnn', type=str2bool, default=False,
                        help='If true, use CNN. Otherwise, use MLP. Default: False')
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Ignored if cnn is False')
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[256, 256])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=512)

    # Training params
    parser.add_argument('--eval_secs', type=int, default=120,
                        help='throttle_secs for EvalSpec')

    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    return opts
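Several of these examples pass `type=str2bool` to argparse without defining it. A common minimal sketch of such a helper (an assumption, not the original implementation):

def str2bool(value):
    # Interpret typical truthy command-line strings as booleans.
    return str(value).lower() in ('true', 't', '1', 'yes', 'y')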
Example #3
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[256, 256])
    parser.add_argument('--activation',
                        type=str,
                        default='relu',
                        help='Activation function. See Keras activation functions. Default: relu')
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=128)
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    return opts
Example #4
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network with MNIST dataset.
                            For distributed mode, you must run this with mpirun. See README.md''')

    # Experiment related parameters
    parser.add_argument('--local_data_root', type=str, default=os.path.join(FILE_DIR, 'data'),
                        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument('--local_log_root', type=str, default=os.path.join(FILE_DIR, 'logs'),
                        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    parser.add_argument('--data_subpath', type=str, default='',
                        help='Sub-directory under local_data_root (locally) ' +
                             'or /data/ (on Clusterone) where the data sits.')

    # CNN model params
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Size of the CNN kernels to use.')
    parser.add_argument('--hidden_units', type=str, default='32,64',
                        help='Comma-separated list of integers. Number of hidden units to use in CNN model.')
    parser.add_argument('--learning_rate', type=float, default=0.01,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.0001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate used after each convolutional layer.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')

    # Training params
    parser.add_argument('--verbosity', type=str, default='INFO', choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
                        help='TF logging level. To see intermediate results printed, set this to INFO or DEBUG.')
    parser.add_argument('--fashion', action='store_true',
                        help='Download and use fashion MNIST data instead of the default handwritten digit MNIST.')
    parser.add_argument('--parallel_batches', type=int, default=2,
                        help='Number of parallel batches to prepare in data pipeline.')
    parser.add_argument('--max_ckpts', type=int, default=2,
                        help='Maximum number of checkpoints to keep.')
    parser.add_argument('--ckpt_steps', type=int, default=100,
                        help='How frequently to save a model checkpoint.')
    parser.add_argument('--save_summary_steps', type=int, default=10,
                        help='How frequently to save TensorBoard summaries.')
    parser.add_argument('--log_step_count_steps', type=int, default=10,
                        help='How frequently to log loss & global steps/s.')
    parser.add_argument('--eval_steps', type=int, default=100,
                        help='How frequently to run evaluation step.')
    parser.add_argument('--max_steps', type=int, default=1000000,
                        help='Maximum number of steps to run.')

    # Parse args
    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_root,
                                  local_repo='',
                                  path=opts.data_subpath)
    opts.log_dir = get_logs_path(root=opts.local_log_root)

    opts.hidden_units = [int(n) for n in opts.hidden_units.split(',')]

    return opts
Example #5
def main():
    test_dict = {'dog': 'bernese'}
    file_path = os.path.join(get_logs_path('./logs'), 'test_json.json')
    with open(file_path, 'w') as f:
        json.dump(test_dict, f)

    return
Example #6
def get_logs_path(path):
    """
    Log dir specification, see: get_logs_path,
    https://clusterone.com/documentation/api/#get_logs_path
    :param str path: the path for the logs dir
    :return str: the real path for the logs
    """
    if path.startswith('gs://'):
        return path
    return clusterone.get_logs_path(path)
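A short usage sketch of the wrapper above (the paths are made up for illustration): a gs:// URI passes through unchanged, while any other path is resolved by clusterone.get_logs_path.

print(get_logs_path('gs://my-bucket/logs'))  # returned as-is
print(get_logs_path('./logs'))               # resolved via clusterone.get_logs_path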
Example #7
def get_args():
    """Return parsed args"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--dist', type=str2bool, default='False')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=9999999)
    parser.add_argument(
        '--cuda',
        type=str2bool,
        default=None,
        help='Use CUDA. If left empty, CUDA will be used if available.')
    parser.add_argument('--ckpt_epochs', type=int, default=1)
    # Logging
    parser.add_argument('--log_freq',
                        type=int,
                        default=100,
                        help='Number of steps before saving loss, etc.')
    parser.add_argument('--log_level',
                        type=str,
                        default='info',
                        choices=['info', 'debug'])
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    if opts.cuda is None:
        opts.cuda = torch.cuda.is_available()
    opts.device = torch.device('cuda' if opts.cuda else 'cpu')

    opts.distributed = n_workers > 1 or opts.dist

    return opts
Example #8
    def get_env(self):
        # Configure  distributed task
        try:
            job_name = os.environ['JOB_NAME']
            task_index = os.environ['TASK_INDEX']
            ps_hosts = os.environ['PS_HOSTS']
            worker_hosts = os.environ['WORKER_HOSTS']
        except KeyError:
            job_name = None
            task_index = 0
            ps_hosts = None
            worker_hosts = None

        flags = self.flags
        # Flags for configuring the distributed task
        flags.DEFINE_string("job_name", job_name,
                            "job name: worker or ps")
        flags.DEFINE_integer("task_index", task_index,
                             "Worker task index, should be >= 0. task_index=0 is "
                             "the chief worker task that performs the variable "
                             "initialization and checkpoint handling")
        flags.DEFINE_string("ps_hosts", ps_hosts,
                            "Comma-separated list of hostname:port pairs")
        flags.DEFINE_string("worker_hosts", worker_hosts,
                            "Comma-separated list of hostname:port pairs")

        # Training related flags
        flags.DEFINE_string("data_dir",
                            get_data_path(
                                dataset_name = self.cloud_user_repo, #all mounted repo
                                local_root = self.data_path,
                                local_repo = self.local_repo,
                                path = self.cloud_data_path
                                ),
                            "Path to dataset. It is recommended to use get_data_path()"
                            "to define your data directory.so that you can switch "
                            "from local to clusterone without changing your code."
                            "If you set the data directory manually make sure to use"
                            "/data/ as root path when running on ClusterOne cloud.")

        flags.DEFINE_string("log_dir",
                             get_logs_path(root=self.logs_path),
                            "Path to store logs and checkpoints. It is recommended"
                            "to use get_logs_path() to define your logs directory."
                            "so that you can switch from local to clusterone without"
                            "changing your code."
                            "If you set your logs directory manually make sure"
                            "to use /logs/ when running on ClusterOne cloud.")

        self.flags = flags
Example #9
def get_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network with MNIST dataset.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables are
                            available by default on distributed TensorFlow jobs on the Clusterone platform.
                            If running this locally, you will need to set these environment variables
                            or pass them in as arguments (i.e. python mnist.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.''')

    # Configuration for distributed task
    parser.add_argument('--job_name', type=str, default=os.environ.get('JOB_NAME', None), choices=['worker', 'ps'],
                        help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')
    parser.add_argument('--task_index', type=int, default=os.environ.get('TASK_INDEX', 0),
                        help='Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts', type=str, default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts', type=str, default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')

    # Experiment related parameters
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')

    # Training params
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')
    opts = parser.parse_args()

    # Clusterone snippet: Grabs the correct paths, depending on if the job is running local or on Clusterone
    opts.data_dir = get_data_path(dataset_name='',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    return opts
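The ps_hosts/worker_hosts strings parsed above are typically split into a tf.train.ClusterSpec, as the distributed examples further down do. A minimal sketch under that assumption (`opts` comes from get_args above):

import tensorflow as tf

def make_cluster_spec(opts):
    # Single-instance mode when no job name is provided.
    if not opts.job_name:
        return None
    return tf.train.ClusterSpec({
        'ps': opts.ps_hosts.split(','),
        'worker': opts.worker_hosts.split(','),
    })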
Example #10
def main(argv):
    args = parser.parse_args(argv[1:])

    log_path = get_logs_path(root=os.path.abspath(
        os.path.expanduser('~/Documents/tf_logs/logs/titanic')))

    train, test = load_data(args.train_path, args.test_path)
    (train_x,
     train_y), new_feature_classes = preprocess_data(train,
                                                     ['sex', 'embarked'])
    (test_x,
     test_y), new_feature_classes = preprocess_data(test, ['sex', 'embarked'])

    passenger_features = []
    passenger_features.append(tf.feature_column.numeric_column(key='pclass'))
    passenger_features.append(tf.feature_column.numeric_column(key='age'))
    passenger_features.append(tf.feature_column.numeric_column(key='sibsp'))
    passenger_features.append(tf.feature_column.numeric_column(key='parch'))
    passenger_features.append(tf.feature_column.numeric_column(key='sex_male'))
    passenger_features.append(
        tf.feature_column.numeric_column(key='sex_female'))
    passenger_features.append(
        tf.feature_column.numeric_column(key='embarked_C'))
    passenger_features.append(
        tf.feature_column.numeric_column(key='embarked_Q'))
    passenger_features.append(
        tf.feature_column.numeric_column(key='embarked_S'))

    classifier = tf.estimator.DNNClassifier(hidden_units=network,
                                            feature_columns=passenger_features,
                                            model_dir=log_path,
                                            n_classes=2)

    classifier.train(input_fn=lambda: train_input_fn(train_x, train_y),
                     steps=1000)

    eval_result = classifier.evaluate(
        input_fn=lambda: eval_input_fn(test_x, test_y))

    print('\nNetwork layout: %s' % network)
    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
Example #11
def main(argv):
    args = parser.parse_args(argv[1:])

    log_path = get_logs_path(root=os.path.abspath(os.path.expanduser('~/Documents/tf_logs/logs/titanic_basic')))

    (train_x, train_y), (test_x, test_y) = load_data(args.train_path, args.test_path)

    passenger_features = []
    passenger_features.append(tf.feature_column.numeric_column(key='pclass'))
    passenger_features.append(tf.feature_column.numeric_column(key='age'))

    classifier = tf.estimator.DNNClassifier(hidden_units=[20, 20, 20],
                                            feature_columns=passenger_features,
                                            model_dir=log_path,
                                            n_classes=2)

    classifier.train(input_fn=lambda:train_input_fn(train_x, train_y), steps=1000)

    eval_result = classifier.evaluate(input_fn=lambda:eval_input_fn(test_x, test_y))

    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
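Examples #10 and #11 call train_input_fn and eval_input_fn without showing them. A minimal tf.data sketch of the shape such Estimator input functions usually take (an assumption, not the originals; tensorflow is assumed imported as tf, as above):

def train_input_fn(features, labels, batch_size=32):
    # Shuffled, repeating dataset of (feature dict, label) pairs for training.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(1000).repeat().batch(batch_size)

def eval_input_fn(features, labels, batch_size=32):
    # Evaluation: no shuffling or repeating, just batching.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.batch(batch_size)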
Example #12
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    # Runtime params
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--num_steps', type=int, default=9999999)
    parser.add_argument('--input_threads', type=int, default=None)
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    if opts.input_threads is None:
        import multiprocessing
        opts.input_threads = multiprocessing.cpu_count()

    return opts
Example #13
flags.DEFINE_string(
    "val_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='val/for_keras'),
    "Path to the validation dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to clusterone without changing your code. "
    "If you set the data directory manually, make sure to use "
    "/data/ as root path when running on ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir",
    get_logs_path(
        os.path.expanduser('~/Documents/Scratch/tiny_imagenet/logs/')),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to clusterone without "
    "changing your code. "
    "If you set your logs directory manually, make sure "
    "to use /logs/ when running on ClusterOne cloud.")
FLAGS = flags.FLAGS


def device_and_target():
    # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
    # Don't set a device.
    if FLAGS.job_name is None:
        print("Running single-machine training")
        return (None, "")
Example #14
    print('[*] Finished')


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='toynet template')
    parser.add_argument('--epoch', type=int, default=20, help='epoch size')
    parser.add_argument('--batch_size', type=int, default=100, help='mini-batch size')
    parser.add_argument('--lr', type=float, default=2e-4, help='learning rate')
    parser.add_argument('--y_dim', type=int, default=10, help='the number of classes')
    parser.add_argument('--target', type=int, default=-1, help='target class for targeted generation')
    parser.add_argument('--eps', type=float, default=1e-9, help='epsilon')
    parser.add_argument('--env_name', type=str, default='train', help='experiment name')
    parser.add_argument('--dataset', type=str, default='FMNIST', help='dataset type')
    parser.add_argument('--dset_dir', type=str, default='datasets', help='dataset directory path')
    parser.add_argument('--summary_dir', type=str, default=get_logs_path('summary'), help='summary directory path')
    parser.add_argument('--output_dir', type=str, default=get_logs_path('output'),help='output directory path')
    parser.add_argument('--ckpt_dir', type=str, default=get_logs_path('checkpoints'), help='checkpoint directory path')
    parser.add_argument('--load_ckpt', type=str, default='', help='')
    parser.add_argument('--cuda', type=str2bool, default=True, help='enable cuda')
    parser.add_argument('--silent', type=str2bool, default=False, help='')
    parser.add_argument('--mode', type=str, default='train', help='train / test / generate / universal')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--iteration', type=int, default=1, help='the number of iteration for FGSM')
    parser.add_argument('--epsilon', type=float, default=0.03, help='epsilon for FGSM and i-FGSM')
    parser.add_argument('--alpha', type=float, default=2/255, help='alpha for i-FGSM')
    parser.add_argument('--tensorboard', type=str2bool, default=True, help='enable tensorboard')
    parser.add_argument('--visdom', type=str2bool, default=False, help='enable visdom')
    parser.add_argument('--visdom_port', type=str, default=55558, help='visdom port')
    args = parser.parse_args()
Example #15
# Training related flags
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="malo/mnist",  # all mounted repos
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="mnist",
        path=''),
    "Path to dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to clusterone without changing your code. "
    "If you set the data directory manually, make sure to use "
    "/data/ as root path when running on ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to clusterone without "
    "changing your code. "
    "If you set your logs directory manually, make sure "
    "to use /logs/ when running on ClusterOne cloud.")

FLAGS = flags.FLAGS


def device_and_target():
    # If FLAGS.job_name is not set, we're running single-node TensorFlow.
    # Don't set a device.
    if FLAGS.job_name is None:
        print("Running single-machine training")
        return (None, "")
Example #16
def main():
    """ Main wrapper"""

    # clusterone snippet 1 - get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    if job_name is None:  # if running locally
        if LOCAL_LOG_LOCATION == "...":
            raise ValueError("LOCAL_LOG_LOCATION needs to be defined")
        if LOCAL_DATASET_LOCATION == "...":
            raise ValueError("LOCAL_DATASET_LOCATION needs to be defined")
        if LOCAL_DATASET_NAME == "...":
            raise ValueError("LOCAL_DATASET_NAME needs to be defined")

    # Path to your data locally. This makes it possible to run the model both
    # locally and on ClusterOne without code changes.
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
    # end of clusterone snippet 1

    #Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # clusterone snippet 2: flags.

    # Define the path from the root data directory to your data.
    # We use glob to match any .h5 datasets in Documents/comma locally, or in data/ on ClusterOne.
    flags.DEFINE_string(
        "train_data_dir",
        get_data_path(
            dataset_name="tensorbot/*",
            local_root=ROOT_PATH_TO_LOCAL_DATA,
            local_repo=LOCAL_DATASET_NAME,  # all repos (we use glob downstream, see read_data.py)
            path='camera/training/*.h5'  # all .h5 files
        ),
        """Path to training dataset. It is recommended to use get_data_path()
        to define your data directory. If you set your dataset directory manually, make sure to use /data/
        as the root path when running on TensorPort cloud.
        On TensorPort, the data will be mounted in /data/user/clusterone_dataset_name,
        so you can access `path` at /data/user/clusterone_dataset_name/path.
        """)
    flags.DEFINE_string(
        "logs_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
        "Path to store logs and checkpoints. It is recommended "
        "to use get_logs_path() to define your logs directory. "
        "If you set your logs directory manually, make sure "
        "to use /logs/ when running on TensorPort cloud.")

    # Define worker-specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer(
        "task_index", task_index,
        "Worker task index, should be >= 0. task_index=0 is "
        "the chief worker task that performs the variable "
        "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # end of clusterone snippet 2

    # Training flags - feel free to play with these!
    flags.DEFINE_integer("batch", 64, "Batch size")
    flags.DEFINE_integer("time", 1, "Number of frames per sample")
    flags.DEFINE_integer("steps_per_epoch", 10000,
                         "Number of training steps per epoch")
    flags.DEFINE_integer("nb_epochs", 200, "Number of epochs")

    # Model flags - feel free to play with these!
    flags.DEFINE_float("dropout_rate1", .2,
                       "Dropout rate on first dropout layer")
    flags.DEFINE_float("dropout_rate2", .5,
                       "Dropout rate on second dropout layer")
    flags.DEFINE_float("starter_lr", 1e-6,
                       "Starter learning rate. Exponential decay is applied")
    flags.DEFINE_integer("fc_dim", 512, "Size of the dense layer")
    flags.DEFINE_boolean("nogood", False, "Ignore `goods` filters.")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps":
            FLAGS.ps_hosts.split(","),
            "worker":
            FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()
    # end of clusterone snippet 3

    print(FLAGS.logs_dir)
    print(FLAGS.train_data_dir)

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")
    if FLAGS.train_data_dir is None or FLAGS.train_data_dir == "":
        raise ValueError("Must specify an explicit `train_data_dir`")
    # if FLAGS.val_data_dir is None or FLAGS.val_data_dir == "":
    #     raise ValueError("Must specify an explicit `val_data_dir`")

    TIME_LEN = 1  # 1 video frame. Other values are not supported.

    # Define graph
    with tf.device(device):
        # X = tf.placeholder(tf.float32, [FLAGS.batch, 3, 160, 320], name="X")
        # Y = tf.placeholder(tf.float32,[FLAGS.batch,1], name="Y") # angle only
        # S = tf.placeholder(tf.float32,[FLAGS.batch,1], name="S") #speed

        if FLAGS.task_index == 0:
            print("Looking for data in %s" % FLAGS.train_data_dir)
        reader = DataReader(FLAGS.train_data_dir)
        x, y, s = reader.read_row_tf()
        x.set_shape((3, 160, 320))
        y.set_shape((1))
        s.set_shape((1))

        X, Y, S = tf.train.batch([x, y, s], batch_size=FLAGS.batch)
        predictions = get_model(X, FLAGS)
        steering_summary = tf.summary.image(
            "green-is-predicted", render_steering_tf(X, Y, S, predictions)
        )  # Adding numpy operation to graph. Adding image to summary
        loss = get_loss(predictions, Y)
        training_summary = tf.summary.scalar('Training_Loss',
                                             loss)  #add to tboard

        #Batch generators
        global_step = tf.contrib.framework.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(FLAGS.starter_lr,
                                                   global_step,
                                                   1000,
                                                   0.96,
                                                   staircase=True)

        train_step = (tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=global_step))

    def run_train_epoch(target, FLAGS, epoch_index):
        """Restores the last checkpoint and runs a training epoch
        Inputs:
            - target: device setter for distributed work
            - FLAGS:
                - requires FLAGS.logs_dir from which the model will be restored.
                Note that whatever most recent checkpoint from that directory will be used.
                - requires FLAGS.steps_per_epoch
            - epoch_index: index of current epoch
        """

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch *
                                    epoch_index)
        ]  # Increment number of required training steps
        i = 1

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.logs_dir,
                hooks=hooks) as sess:

            while not sess.should_stop():
                variables = [loss, learning_rate, train_step]
                current_loss, lr, _ = sess.run(variables)

                print(
                    "Iteration %s - Batch loss: %s" %
                    ((epoch_index) * FLAGS.steps_per_epoch + i, current_loss))
                i += 1

    for e in range(FLAGS.nb_epochs):
        run_train_epoch(target, FLAGS, e)
示例#17
0
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--noplot',
                        dest='plot',
                        action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    args.out = get_logs_path(root=args.out)

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(MLP(args.unit, 10))
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch',
                                  file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch',
                file_name='accuracy.png'))

    # TensorBoard
    trainer.extend(TensorBoardReport(args.out))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    writer.close()
Example #18
import tensorflow as tf
from keras import backend as K
from keras.datasets import mnist
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, Flatten
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from clusterone import get_data_path, get_logs_path

log_dir = get_logs_path(
    '/Users/artem/Documents/Scratch/mnist_keras_distributed/logs/')


def train():
    #
    # Data
    #

    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    y_train = to_categorical(y_train, num_classes=10)
    y_test = to_categorical(y_test, num_classes=10)
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.

    #
    # Model
    #

    img_inp = Input(shape=(28, 28))
Example #19
    local_repo = 'tiny-imagenet-200',
    path = 'train'
)
EVAL_DATA_DIR = get_data_path(
    dataset_name = 'mohsen/clusterone-tiny-imagenet-example',
    local_root = os.path.expanduser('~/'),
    local_repo = 'tiny-imagenet-200',
    path = 'val/for_keras'
)
UNIQUE_LABELS_PATH = get_data_path(
    dataset_name = 'mohsen/clusterone-tiny-imagenet-example',
    local_root = os.path.expanduser('~/'),
    local_repo = 'tiny-imagenet-200',
    path = 'wnids.txt'
)
LOGS_PATH = get_logs_path('./logs')

configure(LOGS_PATH)

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=64, type=int)
parser.add_argument('--num_workers', default=4, type=int)
parser.add_argument('--num_epochs', default=1, type=int)
parser.add_argument('--save_summary_steps', default=50, type=int)
args = parser.parse_args()
args.cuda = torch.cuda.is_available()
if args.cuda:
    device = torch.device('cuda:0')
    print('Using GPU')
else:
    device = torch.device('cpu')
Example #20
    # separating data into training, validation, and testing
    x_train = x_data[:train]
    x_val = x_data[train:train + val]
    x_test = x_data[train + val:]
    y_train = y_data[:train]
    y_val = y_data[train:train + val]
    y_test = y_data[train + val:]

    return x_train, x_val, x_test, y_train, y_val, y_test


if __name__ == "__main__":
    # turn off TensorFlow warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    log_path = get_logs_path(
        r"C:\Users\Ryan Meredith\Documents\github\housing_prices\logs\\")

    # root mean squared metric
    def rmse(y_true, y_pred):
        return backend.sqrt(
            backend.mean(backend.square(y_pred - y_true), axis=-1))

    print("Begin collecting data")
    x_train, x_val, x_test, y_train, y_val, y_test = get_data()
    print("Finished collecting data")

    input_nodes = x_train.shape[1]

    # layers: (input: 326), 1000, 500, 200, 100, (output: 1)
    # dropout of 10% between each layer
    model = Sequential()
Example #21
    if (task_type in ('chief', 'master')) or (task_type == 'worker'
                                              and task_index == 0):
        TF_CONFIG['cluster']['worker'][task_index] = local_ip
        TF_CONFIG['task']['type'] = 'chief'

    os.environ['TF_CONFIG'] = json.dumps(TF_CONFIG)
except KeyError as ex:
    print(ex)
    job_name = None
    task_index = 0
    ps_hosts = None
    worker_hosts = None

flags.DEFINE_string(
    "log_dir",
    get_logs_path(
        os.path.expanduser('~/Documents/Scratch/cluster1_experiments/logs')),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to clusterone without "
    "changing your code. "
    "If you set your logs directory manually, make sure "
    "to use /logs/ when running on ClusterOne cloud.")
tf.flags.DEFINE_integer('n_gpus', 1, 'number of gpus to utilize')

FLAGS = flags.FLAGS


def make_model():
    model_inp = tf.keras.layers.Input(shape=(
        28,
        28,
    ), name='input')
Example #22
def main():
    
    #Training Data
    xtrain = 'Xtrain.txt'
    ytrain = 'Ytrain.txt'
    
    #Validation Data
    xtest = 'Xtest.txt'
    ytest = 'Ytest.txt'
    
    # Training Parameters
    batch_size = 500  # Batch size
    num_epochs = 5  # Number of epochs
    train_holdout = 0.2  # Portion of training features used for validation
    learning_rate = 0.005  # Starting learning rate
    steps_per_epoch = 50  # Number of training steps per epoch
    
#----- Begin Main Code    
    
    # Get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None
        
    # Get local file paths
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
   
    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # Flags for environment variables
    flags.DEFINE_string("job_name", job_name,
                        "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    
    # Training file flags
    flags.DEFINE_string("xtrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtrain
                            ),
                        "Path to training dataset.")
    flags.DEFINE_string("ytrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytrain
                            ),
                        "Path to training dataset.")
    
    flags.DEFINE_string("log_dir",
                         get_logs_path(root=PATH_TO_LOCAL_LOGS),
                         "Path to store logs and checkpoints.")
    
    # Validation file flags
    flags.DEFINE_string("xtest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtest
                            ),
                        "Path to testing dataset.")
    flags.DEFINE_string("ytest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytest
                            ),
                        "Path to testing dataset.")

    # Training parameter flags
    flags.DEFINE_integer("batch_size", batch_size,
                        "Batch size [100].")
    flags.DEFINE_integer("num_epochs", num_epochs,
                        "Number epochs [50].")
    flags.DEFINE_float("train_holdout", train_holdout,
                        "Portion of training features withheld from traing and used for validation [0.2].")
    flags.DEFINE_float("learning_rate", learning_rate,
                        "Starting learning rate [0.0005].")
    flags.DEFINE_integer("steps_per_epoch", steps_per_epoch, 
                         "Number of training steps per epoch")

    # Configure Distributed Environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
                "ps": FLAGS.ps_hosts.split(","),
                "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(
                cluster_spec, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
                tf.train.replica_device_setter(
                        worker_device=worker_device,
                        cluster=cluster_spec),
                server.target,
        )

    device, target = device_and_target()

# ----- Read Data  -----   
    # Check Flags
    if FLAGS.log_dir is None or FLAGS.log_dir == "":
        raise ValueError("Must specify an explicit `log_dir`")
    if FLAGS.xtrain is None or FLAGS.xtrain == "":
        raise ValueError("Must specify an explicit `xtrain`")
    if FLAGS.ytrain is None or FLAGS.ytrain == "":
        raise ValueError("Must specify an explicit `ytrain`")
    if FLAGS.xtest is None or FLAGS.xtest == "":
        raise ValueError("Must specify an explicit `xtest`")
    if FLAGS.ytest is None or FLAGS.ytest == "":
        raise ValueError("Must specify an explicit `ytest`")
        
    print('Training dataset file: ', FLAGS.xtrain)
    print('Training target file: ', FLAGS.ytrain)

    print('Testing dataset file: ', FLAGS.xtest)
    print('Testing target file: ', FLAGS.ytest)
    
    print('Log Files Saved To: ', FLAGS.log_dir)

    # Read in data
    Xtrain, Ytrain = read_flat_file(FLAGS.xtrain, FLAGS.ytrain)           
    Xtest, Ytest = read_flat_file(FLAGS.xtest, FLAGS.ytest)  
    
    num_train = int(np.round(Xtrain.shape[0] * (1-FLAGS.train_holdout)))
    num_held = int(Xtrain.shape[0]-num_train)
    print('Training on {:d} features'.format(num_train))
    print('Validating on {:d} features (once per epoch)'.format(num_held)) 
    Xval = Xtrain[num_train:]
    Yval = Ytrain[num_train:]
    Xtrain = Xtrain[:num_train]
    Ytrain = Ytrain[:num_train]
    
    num_batches = int(np.floor(Ytrain.shape[0] / FLAGS.batch_size))
    if num_batches == 0:  # if the batch size is larger than the dataset, use a single batch
        num_batches = 1
        FLAGS.batch_size = Ytrain.shape[0]

# ----- Define Graph -----

    tf.reset_default_graph()
    with tf.device(device):            
#        X_in = tf.placeholder(tf.float32, [None, 15, 4, 3])
#        Y_out = tf.placeholder(tf.float32, [None, 8])
        global_step = tf.train.get_or_create_global_step()

        # Create Datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((Xtrain, Ytrain))
#        train_dataset = train_dataset.shuffle(buffer_size=10000)
        train_dataset = train_dataset.batch(FLAGS.batch_size)
#        train_dataset = train_dataset.repeat(FLAGS.num_epochs)
        
        val_dataset = tf.data.Dataset.from_tensor_slices((Xval, Yval))
        val_dataset = val_dataset.batch(Yval.shape[0])
#        val_dataset = val_dataset.repeat(FLAGS.num_epochs)

        test_dataset = tf.data.Dataset.from_tensor_slices((Xtest, Ytest))
        test_dataset = test_dataset.batch(FLAGS.batch_size)

        # Create Iterator
        data_iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                                        train_dataset.output_shapes)
        features, labels = data_iterator.get_next()

        # Create initialisation operations
        train_init_op = data_iterator.make_initializer(train_dataset)
        val_init_op = data_iterator.make_initializer(val_dataset)
        test_init_op = data_iterator.make_initializer(test_dataset)
        
        # Apply model
        with tf.name_scope('predictions'):
            predictions = get_model(features, FLAGS)
        with tf.name_scope('loss'):
            loss = get_loss(predictions, labels)
        tf.summary.scalar('loss', loss)  # add to TensorBoard
         
        with tf.name_scope('train'):
            train_step = (
                tf.train.AdamOptimizer(FLAGS.learning_rate)
                .minimize(loss, global_step=global_step)
                )
            
        summ = tf.summary.merge_all()
        writer = tf.summary.FileWriter(FLAGS.log_dir)
        
#%% Train Model with periodic validation
    def run_train_epoch(target, FLAGS, epoch_index):
        print('Epoch {:d} Training...'.format(epoch_index))
        i = 1
        # Increment the number of required training steps
        hooks = [tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch * epoch_index)]
        scaffold = tf.train.Scaffold(
                local_init_op=[train_init_op, val_init_op],
                saver=tf.train.Saver(max_to_keep=5)
                )
    
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.log_dir,
                hooks = hooks,
                scaffold=scaffold
                ) as sess:
            
            writer.add_graph(sess.graph)
            sess.run(train_init_op) # switch to train dataset
            
            while not sess.should_stop():
                
                [current_loss,_,s] = sess.run([loss, train_step, summ])
                iteration = (epoch_index)*FLAGS.steps_per_epoch + i
                print("Iteration {}  Training Loss: {:.4f}".format(iteration,current_loss))
                i += 1
                #writer.add_summary(s, i)
                if i == FLAGS.steps_per_epoch:  # validate on the last step of the epoch
                    sess.run(val_init_op) # switch to val dataset
                    while True:
                        try: # run and save validation parameters
                            v_loss = sess.run(loss)
                            print("Epoch {}  Validation Loss: {:.4f}".format(epoch_index, v_loss))
                        except tf.errors.OutOfRangeError:
                            break
                        
    for e in range(1,FLAGS.num_epochs+1):
        run_train_epoch(target, FLAGS,e)
    
    # ----- Test Model on Different Dataset -----                  
    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=(FLAGS.task_index == 0)
            ) as sess:  
        sess.run(test_init_op)  # initialize to the test dataset
        test_loss = sess.run(loss)

    print("Test Set Loss (independent dataset): {:.4f}".format(test_loss))
Example #23
def main():
    # This script is almost identical to train_mnist.py. The only difference is
    # that this script uses data-parallel computation on two GPUs.
    # See train_mnist.py for more details.
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out',
                        '-o',
                        default='result_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    args = parser.parse_args()

    args.out = get_logs_path(root=args.out)

    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    chainer.backends.cuda.get_device_from_id(0).use()

    model = L.Classifier(MLP(args.unit, 10))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple GPUs. It accepts "devices" argument that specifies which GPU to
    # use.
    try:
        config = os.environ['TF_CONFIG']
        config = json.loads(config)
        n_gpus = len(config['cluster']['worker'])
        devices = {str(i): i for i in range(1, n_gpus)}
        devices['main'] = 0
    except (KeyError, ValueError):
        devices = {'main': 0}

    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        # The device of the name 'main' is used as a "master", while others are
        # used as slaves. Names other than 'main' are arbitrary.
        devices=devices,
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=0))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(TensorBoardReport(args.out))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
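
# A typical entry-point guard (assumed; not shown in the snippet above):
if __name__ == '__main__':
    main()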
Example #24
flags.DEFINE_integer("batch_size", 2097152, "Batch size")

FLAGS = tf.flags.FLAGS

USERNAME = "******"

DATASET_NAME = "openslr_small"
PROBLEM = 'librispeech_clean'

DATA_PATH = get_data_path(
    dataset_name="%s/%s" % (USERNAME, DATASET_NAME),  #on clusterone
    local_root=os.path.expanduser("~/Data"),
    local_repo="openSLR",
    path='')

CHECKPOINTS_PATH = get_logs_path(root=os.path.expanduser("~/logs"))

if not os.path.exists(CHECKPOINTS_PATH):
    os.makedirs(CHECKPOINTS_PATH)

try:
    job_name = os.environ['JOB_NAME']
    task_index = int(os.environ['TASK_INDEX'])
    ps_hosts = os.environ['PS_HOSTS'].split(',')
    worker_hosts = os.environ['WORKER_HOSTS'].split(',')
    if job_name == 'ps':
        ps_hosts[task_index] = 'localhost:%s' % (
            ps_hosts[task_index].split(':')[-1])
    elif job_name == 'worker':
        worker_hosts[task_index] = 'localhost:%s' % (
            worker_hosts[task_index].split(':')[-1])
except KeyError:
    # Assumed fallback, mirroring the single-machine defaults used in the other
    # distributed examples in this file.
    job_name = None
    task_index = 0
    ps_hosts = None
    worker_hosts = None
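
# A minimal sketch (not part of the original snippet; assumes `import tensorflow as tf`
# from the truncated header above) of how the host lists are typically consumed,
# mirroring the distributed setup used in the later examples: build a ClusterSpec,
# start this task's server, and park parameter servers.
if job_name is not None:
    cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    server = tf.train.Server(cluster_spec,
                             job_name=job_name,
                             task_index=task_index)
    if job_name == 'ps':
        server.join()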
Example #25
flags.DEFINE_integer("output_height", 64,
                     "The size of the output images to produce [64]")
flags.DEFINE_integer(
    "output_width", None,
    "The size of the output images to produce. If None, same value as output_height [None]"
)
flags.DEFINE_string("dataset", "celebA",
                    "The name of dataset [celebA, mnist, lsun]")
flags.DEFINE_string(
    "data_path",
    get_data_path(dataset_name="%s/*" % CLUSTERONE_USERNAME,
                  local_root=ROOT_PATH_TO_LOCAL_DATA,
                  local_repo=LOCAL_REPO,
                  path=""), "data path for zip file")
flags.DEFINE_string("checkpoint_dir", get_logs_path(LOCAL_PATH_TO_LOGS),
                    "Directory name to save the checkpoints [checkpoint]")
flags.DEFINE_string(
    "sample_dir", get_logs_path("samples"),
    "Directory name to save the image samples [samples]"
)  #TODO: replace with os.path.join(logs/samples) when folders are supported
flags.DEFINE_boolean("train", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("crop", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("visualize", False,
                     "True for visualizing, False for nothing [False]")
FLAGS = flags.FLAGS


def main(_):
Example #26
def parse_args():
    """Parse args"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a logistic regressor using XGBoost.
                            For distributed mode, use dmlc-core submit.
                            ''')

    # Experiment related parameters
    parser.add_argument('--data_dir', type=str, default=os.path.join(FILE_DIR, 'data'),
                        help='Directory where your data files are.')
    parser.add_argument('--local_log_root', type=str, default=os.path.join(FILE_DIR, 'logs'),
                        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    parser.add_argument('--train_file_pattern', type=str, default='*.train',
                        help='Use * as wildcard. Describe sub-directory/filename pattern for train data.')
    parser.add_argument('--test_file_pattern', type=str, default='*.test',
                        help='Use * as wildcard. Describe sub-directory/filename pattern for test data.')
    parser.add_argument('--model_name', type=str, default='saved.model',
                        help='Filename to use for saved model.')

    # General params
    parser.add_argument('--silent', type=int, default=0, choices=[0, 1],
                        help='0 means printing running messages, 1 means silent mode')

    # Booster params
    parser.add_argument('--eta', type=float, default=0.3,
                        help='Step size shrinkage used in updates to prevent overfitting. '
                             'After each boosting step, we can directly get the weights of new features, '
                             'and eta shrinks the feature weights to make the boosting process more conservative. '
                             'Range: [0,1]')
    parser.add_argument('--gamma', type=float, default=0.0,
                        help='Minimum loss reduction required to make a further partition on a leaf node of the tree. '
                             'The larger gamma is, the more conservative the algorithm will be. Range: [0,inf]')
    parser.add_argument('--max_depth', type=int, default=6,
                        help='Maximum depth of a tree. Increasing this value will make the model more complex and more '
                             'likely to overfit. 0 indicates no limit. Note that limit is required when grow_policy is '
                             'set to depthwise.')
    parser.add_argument('--min_child_weight', type=float, default=1.0,
                        help='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step '
                             'results in a leaf node with the sum of instance weight less than min_child_weight, then '
                             'the building process will give up further partitioning. In linear regression task, this '
                             'simply corresponds to minimum number of instances needed to be in each node. The larger '
                             'min_child_weight is, the more conservative the algorithm will be.')
    parser.add_argument('--subsample', type=float, default=1.0,
                        help='Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would '
                             'randomly sample half of the training data prior to growing trees, and this will prevent '
                             'overfitting. Subsampling will occur once in every boosting iteration.')
    parser.add_argument('--colsample_bytree', type=float, default=1.0,
                        help='Subsample ratio of columns when constructing each tree. Subsampling will occur once in '
                             'every boosting iteration.')
    parser.add_argument('--l2', type=float, default=1.0,
                        help='L2 regularization term on weights. Increasing this value will make model more '
                             'conservative.')
    parser.add_argument('--l1', type=float, default=0.0,
                        help='L1 regularization term on weights. Increasing this value will make model more '
                             'conservative.')
    parser.add_argument('--tree_method', type=str, default='auto',
                        choices=['auto', 'exact', 'approx', 'hist', 'gpu_exact', 'gpu_hist'],
                        help='The tree construction algorithm used in XGBoost. '
                             'Distributed and external memory version only support tree_method=approx.')
    parser.add_argument('--scale_pos_weight', type=float, default=1.,
                        help='Control the balance of positive and negative weights, useful for unbalanced classes. '
                             'A typical value to consider: sum(negative instances) / sum(positive instances).')

    # Learning task parameters
    parser.add_argument('--objective', type=str, default='binary:logistic',
                        choices=['binary:logistic', 'binary:logitraw'],
                        help='See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters')
    parser.add_argument('--eval_metric', type=str, nargs='*', default=['error'],
                        choices=['logloss', 'auc', 'error'],
                        help='Evaluation metrics for validation data. '
                             'See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random number seed')

    # Command line parameters
    parser.add_argument('--num_round', type=int, default=10,
                        help='The number of rounds for boosting')

    # Train params
    parser.add_argument('--cache_data', action='store_true',
                        help='Use external memory version')

    # Testing/Debugging
    parser.add_argument('--set_verbosity', type=str, default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Logging verbosity level')
    parser.add_argument('--benchmark', action='store_true',
                        help='Runs with benchmark settings. Ignores all XGBoost parameter inputs.')

    # Parse args
    opts = parser.parse_args()

    opts.train_file_pattern = os.path.join(opts.data_dir, opts.train_file_pattern)
    opts.test_file_pattern = os.path.join(opts.data_dir, opts.test_file_pattern)
    train_files = glob.glob(opts.train_file_pattern)
    test_files = glob.glob(opts.test_file_pattern)

    if train_files:
        opts.train_data = train_files[0]
        if len(train_files) > 1:
            logging.warning('Detected multiple files. Using {}.'.format(opts.train_data))
    else:
        raise IOError('Did not detect any files with train_file_pattern "{}"'.format(opts.train_file_pattern))

    if not opts.benchmark and test_files:
        opts.test_data = test_files[0]
        if len(test_files) > 1:
            logging.warning('Detected multiple files. Using {}.'.format(opts.test_data))
    elif not opts.benchmark:
        raise IOError('Did not detect any files with test_file_pattern "{}"'.format(opts.test_file_pattern))
    else:
        opts.test_data = ''

    opts.log_dir = get_logs_path(root=opts.local_log_root)

    return opts
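
# A minimal sketch (the training entry point is not shown in this snippet) of how
# the parsed options above could be turned into an XGBoost parameter dict and a
# training call. The helper name `train_and_save` is illustrative, and the data
# files are assumed to be in a format xgb.DMatrix can load directly (e.g. LibSVM).
import os
import xgboost as xgb


def train_and_save(opts):
    params = {
        'eta': opts.eta,
        'gamma': opts.gamma,
        'max_depth': opts.max_depth,
        'min_child_weight': opts.min_child_weight,
        'subsample': opts.subsample,
        'colsample_bytree': opts.colsample_bytree,
        'lambda': opts.l2,  # L2 regularization term
        'alpha': opts.l1,   # L1 regularization term
        'tree_method': opts.tree_method,
        'scale_pos_weight': opts.scale_pos_weight,
        'objective': opts.objective,
        'eval_metric': opts.eval_metric,  # a list is accepted for multiple metrics
        'seed': opts.seed,
        'silent': opts.silent,
    }
    dtrain = xgb.DMatrix(opts.train_data)
    evals = [(dtrain, 'train')]
    if opts.test_data:
        evals.append((xgb.DMatrix(opts.test_data), 'test'))
    booster = xgb.train(params, dtrain, num_boost_round=opts.num_round, evals=evals)
    booster.save_model(os.path.join(opts.log_dir, opts.model_name))
    return booster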
Example #27
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
from clusterone import get_data_path, get_logs_path

LOCAL_DATA_PATH = os.path.abspath(os.path.expanduser('../../data/'))
LOCAL_LOGS_PATH = os.path.abspath(os.path.expanduser('logs/'))

# Storage directory for the MNIST dataset.
# Returns LOCAL_DATA_PATH when running locally, '/data/malo/mnist' when running on Clusterone.
data_dir = get_data_path(dataset_name="malo/mnist",
                         local_root=LOCAL_DATA_PATH,
                         local_repo="mnist",
                         path='')

# Storage directory for the log files produced by this script.
logs_dir = get_logs_path(LOCAL_LOGS_PATH)

# The MNIST dataset has 10 classes, representing the digits 0 through 9
NUM_CLASSES = 10

# The MNIST images are always 28x28 pixels
IMAGE_SIZE = 28
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Each hidden layer gets 128 neurons
hidden1_units = 128
hidden2_units = 128

# Further hyperparameters
learning_rate = 0.5
batch_size = 100
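
# A minimal sketch (the model-building code is not shown in this snippet) of how
# the constants above could drive a two-hidden-layer MNIST classifier with the
# TF 1.x graph API; the step count of 1000 is illustrative.
import tensorflow as tf

mnist = read_data_sets(data_dir, one_hot=True)

images = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])
labels = tf.placeholder(tf.float32, [None, NUM_CLASSES])

hidden1 = tf.layers.dense(images, hidden1_units, activation=tf.nn.relu)
hidden2 = tf.layers.dense(hidden1, hidden2_units, activation=tf.nn.relu)
logits = tf.layers.dense(hidden2, NUM_CLASSES)

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1000):
        batch_images, batch_labels = mnist.train.next_batch(batch_size)
        sess.run(train_op, feed_dict={images: batch_images, labels: batch_labels})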
Example #28
def main():
    # clusterone snippet 1 - get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = int(os.environ['TASK_INDEX'])
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    #end of clusterone snippet 1

    #Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)

    # clusterone snippet 2: flags.
    flags.DEFINE_string("logs_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
                        "Path to store logs and checkpoints")

    # Define worker specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer(
        "task_index", task_index,
        "Worker task index, should be >= 0. task_index=0 is "
        "the chief worker task the performs the variable "
        "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # end of clusterone snippet 2

    flags.DEFINE_integer("nb_epochs", 20, "Number of epochs")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        # Represents a cluster as a set of "tasks", organized into "jobs".
        cluster_spec = tf.train.ClusterSpec({
            "ps":
            FLAGS.ps_hosts.split(","),  # job1
            "worker":
            FLAGS.worker_hosts.split(","),  # job2
        })

        # Server instance encapsulates a set of devices and a tf.Session
        # target that can participate in distributed training. A server belongs
        # to a cluster (specified by a tf.train.ClusterSpec), and corresponds to
        # a particular task in a named job.
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)

        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)

        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (tf.train.replica_device_setter(worker_device=worker_device,
                                               cluster=cluster_spec),
                server.target)

    device, target = device_and_target()  # place tensors, session
    # end of clusterone snippet 3
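    # For a local test of the distributed code path, each role would be started in
    # its own process with environment variables along these lines (the ports are
    # hypothetical), matching the flags defined in clusterone snippet 2:
    #   JOB_NAME=ps     TASK_INDEX=0 PS_HOSTS=localhost:2224 WORKER_HOSTS=localhost:2222 python <this script>
    #   JOB_NAME=worker TASK_INDEX=0 PS_HOSTS=localhost:2224 WORKER_HOSTS=localhost:2222 python <this script>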

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")

    with tf.device(device):
        with tf.name_scope("input"):
            (x_train,
             y_train), (x_test,
                        y_test) = keras.datasets.fashion_mnist.load_data()

            print(x_train.shape, x_test.shape)  # 60k, 10k

            x_train = x_train.astype('float32') / 255.
            x_test = x_test.astype('float32') / 255.

            x_train, x_valid = x_train[5000:], x_train[:5000]
            y_train, y_valid = y_train[5000:], y_train[:5000]

            # Reshape input data from (28, 28) to (28, 28, 1)
            w, h = 28, 28
            x_train = x_train.reshape(x_train.shape[0], w, h, 1)  # NHWC
            x_valid = x_valid.reshape(x_valid.shape[0], w, h, 1)
            x_test = x_test.reshape(x_test.shape[0], w, h, 1)

            # One-hot encode the labels
            y_train = tf.keras.utils.to_categorical(y_train, 10)
            y_valid = tf.keras.utils.to_categorical(y_valid, 10)
            y_test = tf.keras.utils.to_categorical(y_test, 10)

        with tf.name_scope("model"):
            model = model_fn(input_shape, number_of_classes)
            x = model["x"]
            y = model["y"]
            train_mode = model["train_mode"]

    def shuffle(x, y):
        idxs = np.random.permutation(x.shape[0])  #shuffled ordering
        return x[idxs], y[idxs]

    def run_train_epoch(target, FLAGS, epoch_index, train_writer, test_writer):
        epoch_loss, epoch_accuracy = 0, 0
        x_train_r, y_train_r = shuffle(x_train, y_train)

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.logs_dir) as sess:
            total_size = x_train.shape[0]
            number_of_batches = int(total_size / batch_size)

            for i in range(number_of_batches):
                step = epoch_index * number_of_batches + i

                mini_x = x_train_r[i * batch_size:(i + 1) *
                                   batch_size, :, :, :]
                mini_y = y_train_r[i * batch_size:(i + 1) * batch_size, :]
                _, loss = sess.run([model["train_op"], model["loss"]],
                                   feed_dict={
                                       x: mini_x,
                                       y: mini_y,
                                       train_mode: True
                                   })

                epoch_loss += loss

                train_accuracy, summary = sess.run(
                    [model["accuracy"], model["summary"]],
                    feed_dict={
                        x: mini_x,
                        y: mini_y,
                        train_mode: False
                    })

                epoch_accuracy += train_accuracy

                train_writer.add_summary(summary, step)

                if step % 200 == 0:  # Record summaries and test-set accuracy
                    test_accuracy, summary = sess.run(
                        [model["accuracy"], model["summary"]],
                        feed_dict={
                            x: x_test,
                            y: y_test,
                            train_mode: False
                        })
                    test_writer.add_summary(summary, step)
                    print('test accuracy at step %s: %s' %
                          (step, test_accuracy))

        epoch_loss /= number_of_batches
        epoch_accuracy /= number_of_batches

        print("Epoch: {} loss: {} train accuracy: {}".format(
            epoch_index + 1, np.squeeze(epoch_loss), epoch_accuracy))

    train_wr = tf.summary.FileWriter(FLAGS.logs_dir + '/train',
                                     graph=tf.get_default_graph())
    test_wr = tf.summary.FileWriter(FLAGS.logs_dir + '/test')

    for e in range(FLAGS.nb_epochs):
        run_train_epoch(target, FLAGS, e, train_wr, test_wr)
val_data_dir = get_data_path(
    dataset_name = 'artem/artem-tiny-imagenet',
    local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo = 'tiny-imagenet-200',
    path = 'val/for_keras'
)

models_dir = get_data_path(
    dataset_name = 'artem/artem-tiny-imagenet',
    local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo = '',
    path = 'models'
)

log_dir = get_logs_path('/Users/artem/Documents/Scratch/tiny_imagenet/logs/')

def train():

    #
    # Data Preparation
    #

    train_datagen = ImageDataGenerator(
        rescale = 1. / 255,
        shear_range = 0.2,
        zoom_range = 0.2,
        horizontal_flip = True
    )
    val_datagen = ImageDataGenerator(
        rescale = 1. / 255.
Example #30
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=
        '''Trains a self-steering car model in single-instance or distributed mode.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables will be
                            available by default on distributed TensorFlow jobs on the Clusterone platform.
                            If running this locally, you will need to set these environment variables
                            or pass them in as arguments (e.g. python main.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.'''
    )

    # Configuration for distributed task
    parser.add_argument(
        '--job_name',
        type=str,
        default=os.environ.get('JOB_NAME', None),
        choices=['worker', 'ps'],
        help=
        'Task type for the node in the distributed cluster. Worker-0 will be set as master.'
    )
    parser.add_argument(
        '--task_index',
        type=int,
        default=os.environ.get('TASK_INDEX', 0),
        help=
        'Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts',
                        type=str,
                        default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts',
                        type=str,
                        default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')

    # Experiment related parameters
    parser.add_argument(
        '--local_data_root',
        type=str,
        default=os.path.abspath('./data/'),
        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument(
        '--local_log_root',
        type=str,
        default=os.path.abspath('./logs/'),
        help=
        'Path to store logs and checkpoints. This path will be /logs on Clusterone.'
    )
    parser.add_argument(
        '--data_subpath',
        type=str,
        default='',
        help=
        'Which sub-directory the data will sit inside local_data_root (locally) '
        + 'or /data/ (on Clusterone)')
    parser.add_argument(
        '--absolute_data_path',
        type=str,
        default=None,
        help='Using this will ignore other data path arguments.')

    # Model params
    parser.add_argument('--dropout_rate1',
                        type=float,
                        default=0.2,
                        help='Dropout rate after the convolutional layers.')
    parser.add_argument('--dropout_rate2',
                        type=float,
                        default=0.5,
                        help='Dropout rate after the dense layer.')
    parser.add_argument('--fc_dim',
                        type=int,
                        default=512,
                        help='Number of dimensions in the dense layer.')
    parser.add_argument('--nogood',
                        action='store_true',
                        help='Ignore "goods" filters')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.0001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument(
        '--learning_decay',
        type=float,
        default=0.0001,
        help='Exponential decay rate of the learning rate per step.')

    # Training params
    parser.add_argument(
        '--batch_size',
        type=int,
        default=64,
        help='Batch size to use during training and evaluation.')
    parser.add_argument('--max_steps',
                        type=int,
                        default=10000,
                        help='Max number of steps to train for.')
    parser.add_argument(
        '--verbosity',
        type=str,
        default='INFO',
        choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
        help=
        'TF logging level. To log intermediate results, set this to INFO or DEBUG.'
    )
    parser.add_argument('--num_threads',
                        type=int,
                        default=1,
                        help='Number of threads to use to prepare data')
    parser.add_argument('--max_ckpts',
                        type=int,
                        default=2,
                        help='Maximum number of checkpoints to keep')
    parser.add_argument('--ckpt_steps',
                        type=int,
                        default=100,
                        help='How frequently to save a model checkpoint')
    parser.add_argument('--save_summary_steps',
                        type=int,
                        default=10,
                        help='How frequently to save TensorBoard summaries')
    parser.add_argument('--log_step_count_steps',
                        type=int,
                        default=10,
                        help='How frequently to log loss & global steps/s')
    parser.add_argument(
        '--eval_secs',
        type=int,
        default=60,
        help='How frequently to run evaluation step. ' +
        'By default, there is no evaluation dataset, thus effectively no evaluation.'
    )

    # Parse args
    opts = parser.parse_args()

    if opts.absolute_data_path is None:
        opts.train_data = get_data_path(dataset_name='*/*',
                                        local_root=opts.local_data_root,
                                        local_repo=opts.data_subpath,
                                        path='camera/training/*.h5')
    else:
        opts.train_data = os.path.join(opts.absolute_data_path,
                                       'camera/training/*.h5')

    opts.log_dir = get_logs_path(root=opts.local_log_root)
    opts.ps_hosts = opts.ps_hosts.split(',') if opts.ps_hosts else []
    opts.worker_hosts = opts.worker_hosts.split(
        ',') if opts.worker_hosts else []

    return opts
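
# A minimal sketch (the training entry point is not shown in this snippet) of how
# these options could configure a tf.estimator-based run. The helper names
# `make_run_config`/`make_specs` and the `train_input_fn`/`eval_input_fn`
# arguments are illustrative placeholders for code defined elsewhere.
import tensorflow as tf


def make_run_config(opts):
    return tf.estimator.RunConfig(
        model_dir=opts.log_dir,
        save_summary_steps=opts.save_summary_steps,
        save_checkpoints_steps=opts.ckpt_steps,
        keep_checkpoint_max=opts.max_ckpts,
        log_step_count_steps=opts.log_step_count_steps)


def make_specs(opts, train_input_fn, eval_input_fn):
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=opts.max_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      throttle_secs=opts.eval_secs)
    return train_spec, eval_spec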