Example #1
def load_data(train_path, test_path):
    tr_path, tr_filename = os.path.split(train_path)
    train_dir = get_data_path(
                            dataset_name = "svenchmie/titanic_data/titanic_train.csv",
                            local_root = tr_path,
                            local_repo = tr_filename,
                            path = ''
                            )

    train = pd.read_csv(train_dir[:-1], engine="python")
    train = train[FEATURE_CLASSES].dropna(axis=0, how='any')
    train_x, train_y = train, train.pop('survived')

    te_path, te_filename = os.path.split(test_path)
    test_dir = get_data_path(
                            dataset_name = "svenchmie/titanic_data/titanic_test.csv",
                            local_root = te_path,
                            local_repo = te_filename,
                            path = ''
                            )

    test = pd.read_csv(test_dir[:-1], engine="python")
    test = test[FEATURE_CLASSES].dropna(axis=0, how='any')
    test_x, test_y = test, test.pop('survived')
    return (train_x, train_y), (test_x, test_y)
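# Note: FEATURE_CLASSES is not defined in this snippet. It is presumably a list of
# Titanic column names that includes the 'survived' label popped above, e.g. (hypothetical):
# FEATURE_CLASSES = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']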

Example #2
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network on the MNIST dataset.
                            For distributed mode, you must run this with mpirun. See README.md''')

    # Experiment related parameters
    parser.add_argument('--local_data_root', type=str, default=os.path.join(FILE_DIR, 'data'),
                        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument('--local_log_root', type=str, default=os.path.join(FILE_DIR, 'logs'),
                        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    parser.add_argument('--data_subpath', type=str, default='',
                        help='Sub-directory of local_data_root (locally) ' +
                             'or /data/ (on Clusterone) in which the data sits.')

    # CNN model params
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Size of the CNN kernels to use.')
    parser.add_argument('--hidden_units', type=str, default='32,64',
                        help='Comma-separated list of integers. Number of hidden units to use in CNN model.')
    parser.add_argument('--learning_rate', type=float, default=0.01,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.0001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate used after each convolutional layer.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')

    # Training params
    parser.add_argument('--verbosity', type=str, default='INFO', choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
                        help='TF logging level. To see intermediate results printed, set this to INFO or DEBUG.')
    parser.add_argument('--fashion', action='store_true',
                        help='Download and use fashion MNIST data instead of the default handwritten digit MNIST.')
    parser.add_argument('--parallel_batches', type=int, default=2,
                        help='Number of parallel batches to prepare in data pipeline.')
    parser.add_argument('--max_ckpts', type=int, default=2,
                        help='Maximum number of checkpoints to keep.')
    parser.add_argument('--ckpt_steps', type=int, default=100,
                        help='How frequently to save a model checkpoint.')
    parser.add_argument('--save_summary_steps', type=int, default=10,
                        help='How frequently to save TensorBoard summaries.')
    parser.add_argument('--log_step_count_steps', type=int, default=10,
                        help='How frequently to log loss & global steps/s.')
    parser.add_argument('--eval_steps', type=int, default=100,
                        help='How frequently to run evaluation step.')
    parser.add_argument('--max_steps', type=int, default=1000000,
                        help='Maximum number of steps to run.')

    # Parse args
    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_root,
                                  local_repo='',
                                  path=opts.data_subpath)
    opts.log_dir = get_logs_path(root=opts.local_log_root)

    opts.hidden_units = [int(n) for n in opts.hidden_units.split(',')]

    return opts
Example #3
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--fashion', type=str2bool, default=False,
                        help='Use Fashion MNIST data')

    # Model params
    parser.add_argument('--cnn', type=str2bool, default=False,
                        help='If true, use CNN. Otherwise, use MLP. Default: False')
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Ignored if cnn is False')
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[256, 256])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=512)

    # Training params
    parser.add_argument('--eval_secs', type=int, default=120,
                        help='throttle_secs for EvalSpec')

    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name = 'adrianyi/mnist-data',
                                 local_root = opts.local_data_dir,
                                 local_repo = '',
                                 path = '')
    opts.log_dir = get_logs_path(root = opts.local_log_dir)

    return opts
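# The parsers above pass a str2bool callable as the argparse type, but the helper is not
# shown in these snippets. A minimal sketch of a typical implementation (an assumption,
# not the original code):
from argparse import ArgumentTypeError

def str2bool(v):
    """Parse common textual booleans ('yes'/'no', 'true'/'false', '1'/'0')."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise ArgumentTypeError('Boolean value expected, got %r' % v)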
Example #4
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[256, 256])
    parser.add_argument(
        '--activation',
        type=str,
        default='relu',
        help=
        'Activation function. See Keras activation functions. Default: relu')
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=128)
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    return opts
Example #5
def get_args():
    """Return parsed args"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--dist', type=str2bool, default='False')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=9999999)
    parser.add_argument(
        '--cuda',
        type=str2bool,
        default=None,
        help='Use CUDA. If left empty, CUDA will be used if available.')
    parser.add_argument('--ckpt_epochs', type=int, default=1)
    # Logging
    parser.add_argument('--log_freq',
                        type=int,
                        default=100,
                        help='Number of steps before saving loss, etc.')
    parser.add_argument('--log_level',
                        type=str,
                        default='info',
                        choices=['info', 'debug'])
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    opts.cuda = opts.cuda if opts.cuda is not None else torch.cuda.is_available()
    opts.device = torch.device('cuda' if opts.cuda else 'cpu')

    opts.distributed = n_workers > 1 or opts.dist

    return opts
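# The snippet above uses an n_workers value that is not defined here. Presumably it is
# derived from the cluster environment; one hedged way to compute it (an assumption):
import os
worker_hosts = os.environ.get('WORKER_HOSTS', '')
n_workers = len(worker_hosts.split(',')) if worker_hosts else 1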
Example #6
    def get_env(self):
        # Configure  distributed task
        try:
            job_name = os.environ['JOB_NAME']
            task_index = os.environ['TASK_INDEX']
            ps_hosts = os.environ['PS_HOSTS']
            worker_hosts = os.environ['WORKER_HOSTS']
        except:
            job_name = None
            task_index = 0
            ps_hosts = None
            worker_hosts = None

        flags = self.flags
        # Flags for configuring the distributed task
        flags.DEFINE_string("job_name", job_name,
                            "job name: worker or ps")
        flags.DEFINE_integer("task_index", task_index,
                             "Worker task index, should be >= 0. task_index=0 is "
                             "the chief worker task that performs the variable "
                             "initialization and checkpoint handling")
        flags.DEFINE_string("ps_hosts", ps_hosts,
                            "Comma-separated list of hostname:port pairs")
        flags.DEFINE_string("worker_hosts", worker_hosts,
                            "Comma-separated list of hostname:port pairs")

        # Training related flags
        flags.DEFINE_string("data_dir",
                            get_data_path(
                                dataset_name = self.cloud_user_repo, #all mounted repo
                                local_root = self.data_path,
                                local_repo = self.local_repo,
                                path = self.cloud_data_path
                                ),
                            "Path to dataset. It is recommended to use get_data_path()"
                            "to define your data directory.so that you can switch "
                            "from local to clusterone without changing your code."
                            "If you set the data directory manually make sure to use"
                            "/data/ as root path when running on ClusterOne cloud.")

        flags.DEFINE_string("log_dir",
                             get_logs_path(root=self.logs_path),
                            "Path to store logs and checkpoints. It is recommended"
                            "to use get_logs_path() to define your logs directory."
                            "so that you can switch from local to clusterone without"
                            "changing your code."
                            "If you set your logs directory manually make sure"
                            "to use /logs/ when running on ClusterOne cloud.")

        self.flags = flags
Example #7
def get_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network on the MNIST dataset.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables will be
                            available on distributed Tensorflow jobs on Clusterone platform by default.
                            If running this locally, you will need to set these environment variables
                            or pass them in as arguments (i.e. python mnist.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.''')

    # Configuration for distributed task
    parser.add_argument('--job_name', type=str, default=os.environ.get('JOB_NAME', None), choices=['worker', 'ps'],
                        help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')
    parser.add_argument('--task_index', type=int, default=os.environ.get('TASK_INDEX', 0),
                        help='Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts', type=str, default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts', type=str, default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')

    # Experiment related parameters
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')

    # Training params
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')
    opts = parser.parse_args()

    # Clusterone snippet: Grabs the correct paths, depending on if the job is running local or on Clusterone
    opts.data_dir = get_data_path(dataset_name='',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    return opts
Example #8
def get_data_path(dataset_name, local_root, local_repo='', path=''):
    """
    Dataset specification, see: get_data_path,
    https://clusterone.com/documentation/api/#get_data_path

    If local_root starts with gs://, we assume a Google Cloud Storage bucket and return
    local_root / local_repo / path
    :param str dataset_name: TensorPort dataset repository name,
        e.g. user_name/repo_name
    :param str local_root: specifies the root directory for dataset.
          e.g. /home/username/datasets, gs://my-project/my_dir
    :param str local_repo: specifies the repo name inside the root data path.
          e.g. my_repo_data/
    :param str path: specifies the path inside the repository, (optional)
          e.g. train
    :return str: the real path of the dataset
    """
    if local_root.startswith('gs://'):
        return os.path.join(local_root, local_repo, path)
    return clusterone.get_data_path(dataset_name=dataset_name,
                                    local_root=local_root,
                                    local_repo=local_repo,
                                    path=path)
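# Illustrative call of the wrapper above, reusing the docstring's placeholder values.
# With a gs:// root the path is simply joined locally; any other root is forwarded to
# clusterone.get_data_path (which requires the clusterone package to be installed).
gcs_path = get_data_path(dataset_name='user_name/repo_name',
                         local_root='gs://my-project/my_dir',
                         local_repo='my_repo_data',
                         path='train')
# gcs_path == 'gs://my-project/my_dir/my_repo_data/train'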
Example #9
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir',
                        type=str,
                        default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir',
                        type=str,
                        default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units',
                        type=int,
                        nargs='*',
                        default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    # Runtime params
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--num_steps', type=int, default=9999999)
    parser.add_argument('--input_threads', type=int, default=None)
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    if opts.input_threads is None:
        import multiprocessing
        opts.input_threads = multiprocessing.cpu_count()

    return opts
Example #10
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=
        '''Trains a self-steering car model in single-instance or distributed mode.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables will be
                            available on distributed Tensorflow jobs on Clusterone platform by default.
                            If running this locally, you will need to set these environment variables
                            or pass them in as arguments (i.e. python main.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.'''
    )

    # Configuration for distributed task
    parser.add_argument(
        '--job_name',
        type=str,
        default=os.environ.get('JOB_NAME', None),
        choices=['worker', 'ps'],
        help=
        'Task type for the node in the distributed cluster. Worker-0 will be set as master.'
    )
    parser.add_argument(
        '--task_index',
        type=int,
        default=os.environ.get('TASK_INDEX', 0),
        help=
        'Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts',
                        type=str,
                        default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts',
                        type=str,
                        default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')

    # Experiment related parameters
    parser.add_argument(
        '--local_data_root',
        type=str,
        default=os.path.abspath('./data/'),
        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument(
        '--local_log_root',
        type=str,
        default=os.path.abspath('./logs/'),
        help=
        'Path to store logs and checkpoints. This path will be /logs on Clusterone.'
    )
    parser.add_argument(
        '--data_subpath',
        type=str,
        default='',
        help=
        'Sub-directory of local_data_root (locally) '
        + 'or /data/ (on Clusterone) in which the data sits.')
    parser.add_argument(
        '--absolute_data_path',
        type=str,
        default=None,
        help='Using this will ignore other data path arguments.')

    # Model params
    parser.add_argument('--dropout_rate1',
                        type=float,
                        default=0.2,
                        help='Dropout rate after the convolutional layers.')
    parser.add_argument('--dropout_rate2',
                        type=float,
                        default=0.5,
                        help='Dropout rate after the dense layer.')
    parser.add_argument('--fc_dim',
                        type=int,
                        default=512,
                        help='Number of dimensions in the dense layer.')
    parser.add_argument('--nogood',
                        action='store_true',
                        help='Ignore "goods" filters')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.0001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument(
        '--learning_decay',
        type=float,
        default=0.0001,
        help='Exponential decay rate of the learning rate per step.')

    # Training params
    parser.add_argument(
        '--batch_size',
        type=int,
        default=64,
        help='Batch size to use during training and evaluation.')
    parser.add_argument('--max_steps',
                        type=int,
                        default=10000,
                        help='Max number of steps to train for.')
    parser.add_argument(
        '--verbosity',
        type=str,
        default='INFO',
        choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
        help=
        'TF logging level. To log intermediate results, set this to INFO or DEBUG.'
    )
    parser.add_argument('--num_threads',
                        type=int,
                        default=1,
                        help='Number of threads to use to prepare data')
    parser.add_argument('--max_ckpts',
                        type=int,
                        default=2,
                        help='Maximum number of checkpoints to keep')
    parser.add_argument('--ckpt_steps',
                        type=int,
                        default=100,
                        help='How frequently to save a model checkpoint')
    parser.add_argument('--save_summary_steps',
                        type=int,
                        default=10,
                        help='How frequently to save TensorBoard summaries')
    parser.add_argument('--log_step_count_steps',
                        type=int,
                        default=10,
                        help='How frequently to log loss & global steps/s')
    parser.add_argument(
        '--eval_secs',
        type=int,
        default=60,
        help='How frequently to run evaluation step. ' +
        'By default, there is no evaluation dataset, thus effectively no evaluation.'
    )

    # Parse args
    opts = parser.parse_args()

    if opts.absolute_data_path is None:
        opts.train_data = get_data_path(dataset_name='*/*',
                                        local_root=opts.local_data_root,
                                        local_repo=opts.data_subpath,
                                        path='camera/training/*.h5')
    else:
        opts.train_data = os.path.join(opts.absolute_data_path,
                                       'camera/training/*.h5')

    opts.log_dir = get_logs_path(root=opts.local_log_root)
    opts.ps_hosts = opts.ps_hosts.split(',') if opts.ps_hosts else []
    opts.worker_hosts = opts.worker_hosts.split(
        ',') if opts.worker_hosts else []

    return opts
Example #11
def r_square(X, Y):
    # squared Pearson correlation coefficient (R^2); avx/avy are the means of X and Y
    avx, avy = sum(X) / float(len(X)), sum(Y) / float(len(Y))
    sum1, sumx, sumy = 0, 0, 0
    for i in range(len(X)):
        sum1 += (X[i] - avx) * (Y[i] - avy)
        sumx += (X[i] - avx) * (X[i] - avx)
        sumy += (Y[i] - avy) * (Y[i] - avy)
    return sum1 * sum1 / (sumx * sumy)


sys.stdout.write("reading data ... ")
sys.stdout.flush()
start = datetime.datetime.now()

# file path when running on clusterone: /data/my_username/dataset_name/
data_path = get_data_path(
    dataset_name='zhaoxiaq/qianqianmerckdata',  # on ClusterOne
    local_root='~/',  # path to local dataset
    local_repo='TrainingSet',  # local data folder name
    path='ACT1_competition_training.csv'  # folder within the data folder
)

train_1 = pd.read_csv(data_path,
                      engine='python',
                      dtype={
                          "MOLECULE": object,
                          "Act": float
                      })
# file path when running on my own pc or cloud
# train_1 = pd.read_csv('TrainingSet/ACT1_competition_training.csv',dtype={"MOLECULE": object, "Act": float})

stop = datetime.datetime.now()
sys.stdout.write("done\n")
sys.stdout.write("took {} seconds\n".format((stop - start).total_seconds()))
Example #12
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import MobileNetV2
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.optimizers import Adam
from clusterone import get_data_path, get_logs_path

N_CLASSES = 200
BATCH_SIZE = 32

train_data_dir = get_data_path(
    dataset_name = 'artem/artem-tiny-imagenet',
    local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo = 'tiny-imagenet-200',
    path = 'train'
)

val_data_dir = get_data_path(
    dataset_name = 'artem/artem-tiny-imagenet',
    local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo = 'tiny-imagenet-200',
    path = 'val/for_keras'
)

models_dir = get_data_path(
    dataset_name = 'artem/artem-tiny-imagenet',
    local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo = '',
    path = 'models'
)

log_dir = get_logs_path('/Users/artem/Documents/Scratch/tiny_imagenet/logs/')
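# A hedged sketch of how the imports and paths above might be wired together; image
# size, epochs and callback settings are assumptions, not taken from the original script.
import os

train_gen = ImageDataGenerator(rescale=1. / 255).flow_from_directory(
    train_data_dir, target_size=(64, 64), batch_size=BATCH_SIZE, class_mode='categorical')
val_gen = ImageDataGenerator(rescale=1. / 255).flow_from_directory(
    val_data_dir, target_size=(64, 64), batch_size=BATCH_SIZE, class_mode='categorical')

model = MobileNetV2(input_shape=(64, 64, 3), weights=None, classes=N_CLASSES)
model.compile(optimizer=Adam(lr=1e-3), loss='categorical_crossentropy', metrics=['accuracy'])

callbacks = [
    ModelCheckpoint(os.path.join(models_dir, 'mobilenetv2.h5'), save_best_only=True),
    TensorBoard(log_dir=log_dir),
    EarlyStopping(patience=5),
]
model.fit_generator(train_gen, epochs=10, validation_data=val_gen, callbacks=callbacks)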
Example #13
flags.DEFINE_string("job_name", job_name,
                    "job name: worker or ps")
flags.DEFINE_integer("task_index", task_index,
                     "Worker task index, should be >= 0. task_index=0 is "
                     "the chief worker task that performs the variable "
                     "initialization and checkpoint handling")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string("data_dir",
                    get_data_path(
                        dataset_name = "cyi/cyi-datasets", #all mounted repo
                        local_root = ROOT_PATH_TO_LOCAL_DATA,
                        local_repo = "mnist",
                        path = 'data'
                        ),
                    "Path to store logs and checkpoints. It is recommended"
                    "to use get_logs_path() to define your logs directory."
                    "so that you can switch from local to clusterone without"
                    "changing your code."
                    "If you set your logs directory manually make sure"
                    "to use /logs/ when running on ClusterOne cloud.")
flags.DEFINE_string("log_dir",
                     get_logs_path(root=PATH_TO_LOCAL_LOGS),
                    "Path to dataset. It is recommended to use get_data_path()"
                    "to define your data directory.so that you can switch "
                    "from local to clusterone without changing your code."
                    "If you set the data directory manually makue sure to use"
                    "/data/ as root path when running on ClusterOne cloud.")
Example #14
# coding: utf-8

# In[ ]:

import numpy as np
#from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
from clusterone import get_data_path, get_logs_path

# In[ ]:

train_dir = get_data_path(
    dataset_name='/data/bhavikaj/bhavikaj-asl/data.h5',  # on ClusterOne
    local_root='data.h5',  # path to local dataset
    local_repo='',  # local data folder name
    path=''  # folder within the data folder
)

# In[ ]:

import h5py
# Load hdf5 dataset
h5f = h5py.File(train_dir, 'r')  # path resolved by get_data_path above
X_train = h5f['X_train']
y_trainHot = h5f['y_trainHot']
X_test = h5f['X_test']
y_testHot = h5f['y_testHot']

##Defining the Model
#Training using a simple model
Example #15
def main(args):
    print('args =', args)

    sys.stdout.write("reading data ... ")
    sys.stdout.flush()
    start = datetime.datetime.now()
    # file path when running on clusterone: /data/my_username/dataset_name/
    data_path = get_data_path(
            dataset_name = 'zhaoxiaq/qianqianmerckdata',  # on ClusterOne
            local_root = '~/',  # path to local dataset
            local_repo = 'TrainingSet',  # local data folder name
            path = 'ACT1_competition_training.csv'  # folder within the data folder
            )

    train_1 = pd.read_csv(data_path, engine = 'python', dtype={"MOLECULE": object, "Act": float})
    stop = datetime.datetime.now()
    sys.stdout.write("done\n")
    sys.stdout.write("took {} seconds\n".format((stop - start).total_seconds()))
    sys.stdout.flush()

    y = train_1['Act'].values
    y = np.reshape(y, (-1, 1))
    train_1 = train_1.drop(['Act', 'MOLECULE'], axis = 1)
    train_1 = train_1.apply(lambda x: np.log(x+1))
    x = train_1

    seed = args.seed if args.seed else random.randint(0, pow(2,32) - 1)
    print("SEED =", seed )

    X_train, X_dev, Y_train, Y_dev = train_test_split(x, y, train_size = 0.80, random_state=seed)
    X_val, X_test, Y_val, Y_test = train_test_split(X_dev, Y_dev, train_size = 0.50, random_state=seed)

    X_placeholder = tf.placeholder(tf.float64, (None, X_train.shape[1]))
    Y_placeholder = tf.placeholder(tf.float64, (None, Y_train.shape[1]))

    # define parameters
    features = np.shape(X_train)[1] # switch to X_train
    target_size = np.shape(X_train)[0]

    learning_rate = 0.001  # switch to 0.05

    start_epoch = args.restart_epoch if args.restart_epoch else 0
    epochs = args.epochs
    checkpoint_freq = args.checkpoint
    batch_size = 300

    batch_size_placeholder = tf.placeholder(tf.int64)

    # network parameters
    n_hidden_1 = 100
    n_hidden_2 = 50
    n_hidden_3 = 25
    n_hidden_4 = 25

    ds_train = tf.data.Dataset.from_tensor_slices((X_placeholder, Y_placeholder)).shuffle(buffer_size=round(len(X_train) * 0.3)).batch(batch_size_placeholder)

    ds_test = tf.data.Dataset.from_tensor_slices((X_placeholder, Y_placeholder)).batch(batch_size_placeholder)

    ds_iter = tf.data.Iterator.from_structure(ds_train.output_types, ds_train.output_shapes)

    next_x, next_y = ds_iter.get_next()

    train_init_op = ds_iter.make_initializer(ds_train)
    test_init_op = ds_iter.make_initializer(ds_test)

    # define placeholder for input vector X and target vector y
    keep_prob = tf.placeholder(tf.float64)

    # initialize weights and bias
    weights = {'w1': tf.Variable(tf.truncated_normal([features, n_hidden_1], 0, 1, dtype=tf.float64)),
               'w2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], 0, 1, dtype=tf.float64)),
               'w3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3], 0, 1, dtype=tf.float64)),
               'w4': tf.Variable(tf.truncated_normal([n_hidden_3, n_hidden_4], 0, 1, dtype=tf.float64)),
              'out': tf.Variable(tf.truncated_normal([n_hidden_4, 1], 0, 1, dtype=tf.float64))}

    biases = {'b1': tf.Variable(tf.truncated_normal([n_hidden_1], 0, 1, dtype=tf.float64)),
              'b2': tf.Variable(tf.truncated_normal([n_hidden_2], 0, 1, dtype=tf.float64)),
              'b3': tf.Variable(tf.truncated_normal([n_hidden_3], 0, 1, dtype=tf.float64)),
              'b4': tf.Variable(tf.truncated_normal([n_hidden_4], 0, 1, dtype=tf.float64)),
             'out': tf.Variable(tf.truncated_normal([1], 0, 1, dtype=tf.float64))}

    # construct model
    y_pred = multilayer_perceptron(next_x, weights, biases, keep_prob)

    # define cost function (mean squared error) and optimizer (Adam)
    cost =  tf.losses.mean_squared_error(next_y, y_pred)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # initialize variables
    init_op = tf.global_variables_initializer()

    saver = tf.train.Saver(max_to_keep=8, keep_checkpoint_every_n_hours=1)

    with tf.Session() as sess:
        if args.restart:
            saver.restore(sess, args.restart)
        else:
            sess.run(init_op)

        for epoch in range(epochs):

            sess.run(train_init_op, feed_dict={X_placeholder: X_train, Y_placeholder: Y_train, batch_size_placeholder: batch_size})
            count = 0

            while True:
                try:
                    count += 1
                    _, c = sess.run((optimizer, cost), feed_dict={keep_prob: 0.75})
                    print('Epoch:', (epoch + 1), 'Batch:', count, 'cost =', c)
                except tf.errors.OutOfRangeError:
                    break

            # Calculate R^2 each Epoch
            sess.run(test_init_op, feed_dict={X_placeholder: X_test, Y_placeholder: Y_test, batch_size_placeholder: len(X_test)})
            results, test_cost = sess.run((y_pred, cost), feed_dict={keep_prob: 1.0})
            print(epoch, test_cost, r_square(np.reshape(results, (len(results),)), Y_test))

            # Save model every 'checkpoint_freq' epochs.
            if epoch % checkpoint_freq == 0:
                saver.save(sess, './snap', global_step=epoch)
Example #16
import argparse
import os
import math
import shutil
import cv2
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
from clusterone import get_data_path, get_logs_path

LOCAL_DATA_PATH = os.path.abspath(os.path.expanduser('../../data/'))
LOCAL_LOGS_PATH = os.path.abspath(os.path.expanduser('logs/'))

# Storage directory for the MNIST dataset.
# Returns LOCAL_DATA_PATH when running locally, '/data/malo/mnist' when running on Clusterone.
data_dir = get_data_path(dataset_name="malo/mnist",
                         local_root=LOCAL_DATA_PATH,
                         local_repo="mnist",
                         path='')

# Storage directory for the log files produced by this script.
logs_dir = get_logs_path(LOCAL_LOGS_PATH)

# The MNIST dataset has 10 classes, representing the digits 0 through 9
NUM_CLASSES = 10

# The MNIST images are always 28x28 pixels
IMAGE_SIZE = 28
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Each hidden layer gets 128 neurons
hidden1_units = 128
hidden2_units = 128
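# A hedged sketch of the two-hidden-layer network the constants above describe
# (tf.layers is an assumption; the original model code is not shown in this snippet):
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])
hidden1 = tf.layers.dense(images, hidden1_units, activation=tf.nn.relu)
hidden2 = tf.layers.dense(hidden1, hidden2_units, activation=tf.nn.relu)
logits = tf.layers.dense(hidden2, NUM_CLASSES)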
Example #17
flags.DEFINE_integer(
    "task_index", task_index,
    "Worker task index, should be >= 0. task_index=0 is "
    "the chief worker task the performs the variable "
    "initialization")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="malo/mnist",  #all mounted repo
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="mnist",
        path=''),
    "Path to store logs and checkpoints. It is recommended"
    "to use get_logs_path() to define your logs directory."
    "so that you can switch from local to clusterone without"
    "changing your code."
    "If you set your logs directory manually make sure"
    "to use /logs/ when running on ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to dataset. It is recommended to use get_data_path()"
    "to define your data directory.so that you can switch "
    "from local to clusterone without changing your code."
    "If you set the data directory manually makue sure to use"
    "/data/ as root path when running on ClusterOne cloud.")
Example #18
import os
import numpy as np
import torch.nn as nn
import torch
import argparse
import time
from torch.autograd import Variable
from tensorboard_logger import configure, log_value
from clusterone import get_logs_path, get_data_path
from .model.model import fetch_metrics, TinyImageNetModel
from .model.data_loader import fetch_label_map, fetch_dataloader

TRAIN_DATA_DIR = get_data_path(
    dataset_name = 'mohsen/clusterone-tiny-imagenet-example',
    local_root = os.path.expanduser('~/'),
    local_repo = 'tiny-imagenet-200',
    path = 'train'
)
EVAL_DATA_DIR = get_data_path(
    dataset_name = 'mohsen/clusterone-tiny-imagenet-example',
    local_root = os.path.expanduser('~/'),
    local_repo = 'tiny-imagenet-200',
    path = 'val/for_keras'
)
UNIQUE_LABELS_PATH = get_data_path(
    dataset_name = 'mohsen/clusterone-tiny-imagenet-example',
    local_root = os.path.expanduser('~/'),
    local_repo = 'tiny-imagenet-200',
    path = 'wnids.txt'
)
LOGS_PATH = get_logs_path('./logs')
Example #19
flags.DEFINE_integer(
    "task_index", task_index,
    "Worker task index, should be >= 0. task_index=0 is "
    "the chief worker task that performs the variable "
    "initialization and checkpoint handling")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string(
    "train_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='train'), "Path to store logs and checkpoints. It is recommended"
    "to use get_logs_path() to define your logs directory."
    "so that you can switch from local to clusterone without"
    "changing your code."
    "If you set your logs directory manually make sure"
    "to use /logs/ when running on ClusterOne cloud.")
flags.DEFINE_string(
    "val_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='val/for_keras'),
    "Path to store logs and checkpoints. It is recommended"
Example #20
flags.DEFINE_string("job_name", job_name,
                    "job name: worker or ps")
flags.DEFINE_integer("task_index", task_index,
                     "Worker task index, should be >= 0. task_index=0 is "
                     "the chief worker task the performs the variable "
                     "initialization")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string("data_dir",
                    get_data_path(
                        dataset_name = "sjay87/", #all mounted repo
                        local_root = ROOT_PATH_TO_LOCAL_DATA,
                        local_repo = "",
                        path = ""
                    ),
                    "Path to dataset. It is recommended to use get_data_path()"
                    "to define your data directory.so that you can switch "
                    "from local to ClusterOne without changing your code."
                    "If you set the data directory manually makue sure to use"
                    "/data/ as root path when running on ClusterOne cloud.")
flags.DEFINE_string("log_dir",
                    get_logs_path(root = PATH_TO_LOCAL_LOGS),
                    "Path to store logs and checkpoints. It is recommended"
                    "to use get_logs_path() to define your logs directory."
                    "so that you can switch from local to clusterone without"
                    "changing your code."
                    "If you set your logs directory manually make sure"
                    "to use /logs/ when running on ClusterOne cloud.")
Example #21
flags.DEFINE_integer(
    "input_width", None,
    "The size of image to use (will be center cropped). If None, same value as input_height [None]"
)
flags.DEFINE_integer("output_height", 64,
                     "The size of the output images to produce [64]")
flags.DEFINE_integer(
    "output_width", None,
    "The size of the output images to produce. If None, same value as output_height [None]"
)
flags.DEFINE_string("dataset", "celebA",
                    "The name of dataset [celebA, mnist, lsun]")
flags.DEFINE_string(
    "data_path",
    get_data_path(dataset_name="%s/*" % CLUSTERONE_USERNAME,
                  local_root=ROOT_PATH_TO_LOCAL_DATA,
                  local_repo=LOCAL_REPO,
                  path=""), "data path for zip file")
flags.DEFINE_string("checkpoint_dir", get_logs_path(LOCAL_PATH_TO_LOGS),
                    "Directory name to save the checkpoints [checkpoint]")
flags.DEFINE_string(
    "sample_dir", get_logs_path("samples"),
    "Directory name to save the image samples [samples]"
)  #TODO: replace with os.path.join(logs/samples) when folders are supported
flags.DEFINE_boolean("train", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("crop", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("visualize", False,
                     "True for visualizing, False for nothing [False]")
FLAGS = flags.FLAGS
Example #22
flags = tf.app.flags

flags.DEFINE_integer("number_worker_gpu", 0, "Number of worker GPUs")
flags.DEFINE_integer("number_ps_gpu", 0, "Number of PS GPUs")
flags.DEFINE_integer("batch_size", 2097152, "Batch size")

FLAGS = tf.flags.FLAGS

USERNAME = "******"

DATASET_NAME = "openslr_small"
PROBLEM = 'librispeech_clean'

DATA_PATH = get_data_path(
    dataset_name="%s/%s" % (USERNAME, DATASET_NAME),  #on clusterone
    local_root=os.path.expanduser("~/Data"),
    local_repo="openSLR",
    path='')

CHECKPOINTS_PATH = get_logs_path(root=os.path.expanduser("~/logs"))

if not os.path.exists(CHECKPOINTS_PATH):
    os.makedirs(CHECKPOINTS_PATH)

try:
    job_name = os.environ['JOB_NAME']
    task_index = int(os.environ['TASK_INDEX'])
    ps_hosts = os.environ['PS_HOSTS'].split(',')
    worker_hosts = os.environ['WORKER_HOSTS'].split(',')
    if job_name == 'ps':
        ps_hosts[task_index] = 'localhost:%s' % (
Example #23
def main():
    """ Main wrapper"""

    # clusterone snippet 1 - get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    if job_name is None:  # if running locally
        if LOCAL_LOG_LOCATION == "...":
            raise ValueError("LOCAL_LOG_LOCATION needs to be defined")
        if LOCAL_DATASET_LOCATION == "...":
            raise ValueError("LOCAL_DATASET_LOCATION needs to be defined")
        if LOCAL_DATASET_NAME == "...":
            raise ValueError("LOCAL_DATASET_NAME needs to be defined")

    #Path to your data locally. This will enable to run the model both locally and on
    # ClusterOne without changes
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
    #end of clusterone snippet 1

    #Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # clusterone snippet 2: flags.

    #Define the path from the root data directory to your data.
    #We use glob to match any .h5 datasets in Documents/comma locally, or in data/ on ClusterOne
    flags.DEFINE_string(
        "train_data_dir",
        get_data_path(
            dataset_name="tensorbot/*",
            local_root=ROOT_PATH_TO_LOCAL_DATA,
            local_repo=
            LOCAL_DATASET_NAME,  #all repos (we use glob downstream, see read_data.py)
            path='camera/training/*.h5'  #all .h5 files
        ),
        """Path to training dataset. It is recommended to use get_data_path()
        to define your data directory. If you set your dataset directory manually make sure to use /data/
        as root path when running on TensorPort cloud.
        On tensrport, the data will be mounted in /data/user/clusterone_dataset_name,
        so you can acces `path` with  /data/user/clusterone_dataset_name/path
        """)
    flags.DEFINE_string(
        "logs_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
        "Path to store logs and checkpoints. It is recommended"
        "to use get_logs_path() to define your logs directory."
        "If you set your logs directory manually make sure"
        "to use /logs/ when running on TensorPort cloud.")

    # Define worker specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer(
        "task_index", task_index,
        "Worker task index, should be >= 0. task_index=0 is "
        "the chief worker task the performs the variable "
        "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # end of clusterone snippet 2

    # Training flags - feel free to play with that!
    flags.DEFINE_integer("batch", 64, "Batch size")
    flags.DEFINE_integer("time", 1, "Number of frames per sample")
    flags.DEFINE_integer("steps_per_epoch", 10000,
                         "Number of training steps per epoch")
    flags.DEFINE_integer("nb_epochs", 200, "Number of epochs")

    # Model flags - feel free to play with that!
    flags.DEFINE_float("dropout_rate1", .2,
                       "Dropout rate on first dropout layer")
    flags.DEFINE_float("dropout_rate2", .5,
                       "Dropout rate on second dropout layer")
    flags.DEFINE_float("starter_lr", 1e-6,
                       "Starter learning rate. Exponential decay is applied")
    flags.DEFINE_integer("fc_dim", 512, "Size of the dense layer")
    flags.DEFINE_boolean("nogood", False, "Ignore `goods` filters.")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps":
            FLAGS.ps_hosts.split(","),
            "worker":
            FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()
    # end of clusterone snippet 3

    print(FLAGS.logs_dir)
    print(FLAGS.train_data_dir)

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")
    if FLAGS.train_data_dir is None or FLAGS.train_data_dir == "":
        raise ValueError("Must specify an explicit `train_data_dir`")
    # if FLAGS.val_data_dir is None or FLAGS.val_data_dir == "":
    #     raise ValueError("Must specify an explicit `val_data_dir`")

    TIME_LEN = 1  # 1 video frame. Other values are not supported.

    # Define graph
    with tf.device(device):
        # X = tf.placeholder(tf.float32, [FLAGS.batch, 3, 160, 320], name="X")
        # Y = tf.placeholder(tf.float32,[FLAGS.batch,1], name="Y") # angle only
        # S = tf.placeholder(tf.float32,[FLAGS.batch,1], name="S") #speed

        if FLAGS.task_index == 0:
            print("Looking for data in %s" % FLAGS.train_data_dir)
        reader = DataReader(FLAGS.train_data_dir)
        x, y, s = reader.read_row_tf()
        x.set_shape((3, 160, 320))
        y.set_shape((1))
        s.set_shape((1))

        X, Y, S = tf.train.batch([x, y, s], batch_size=FLAGS.batch)
        predictions = get_model(X, FLAGS)
        steering_summary = tf.summary.image(
            "green-is-predicted", render_steering_tf(X, Y, S, predictions)
        )  # Adding numpy operation to graph. Adding image to summary
        loss = get_loss(predictions, Y)
        training_summary = tf.summary.scalar('Training_Loss',
                                             loss)  #add to tboard

        #Batch generators
        global_step = tf.contrib.framework.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(FLAGS.starter_lr,
                                                   global_step,
                                                   1000,
                                                   0.96,
                                                   staircase=True)

        train_step = (tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=global_step))

    def run_train_epoch(target, FLAGS, epoch_index):
        """Restores the last checkpoint and runs a training epoch
        Inputs:
            - target: device setter for distributed work
            - FLAGS:
                - requires FLAGS.logs_dir from which the model will be restored.
                Note that whatever most recent checkpoint from that directory will be used.
                - requires FLAGS.steps_per_epoch
            - epoch_index: index of current epoch
        """

        hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch *
                                    epoch_index)
        ]  # Increment number of required training steps
        i = 1

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.logs_dir,
                hooks=hooks) as sess:

            while not sess.should_stop():
                variables = [loss, learning_rate, train_step]
                current_loss, lr, _ = sess.run(variables)

                print(
                    "Iteration %s - Batch loss: %s" %
                    ((epoch_index) * FLAGS.steps_per_epoch + i, current_loss))
                i += 1

    for e in range(FLAGS.nb_epochs):
        run_train_epoch(target, FLAGS, e)
Example #24
def main():
    
    #Training Data
    xtrain = 'Xtrain.txt'
    ytrain = 'Ytrain.txt'
    
    #Validation Data
    xtest = 'Xtest.txt'
    ytest = 'Ytest.txt'
    
    # Training Parameters
    batch_size = 500  # Batch size
    num_epochs = 5  # Number epochs
    train_holdout = 0.2  # Portion of training features used for validation
    learning_rate = 0.005  # Starting learning rate
    steps_per_epoch = 50 # Number of training steps per epoch
    
#----- Begin Main Code    
    
    # Get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None
        
    # Get local file paths
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
   
    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # Flags for environment variables
    flags.DEFINE_string("job_name", job_name,
                        "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    
    # Training file flags
    flags.DEFINE_string("xtrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtrain
                            ),
                        "Path to training dataset.")
    flags.DEFINE_string("ytrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytrain
                            ),
                        "Path to training dataset.")
    
    flags.DEFINE_string("log_dir",
                         get_logs_path(root=PATH_TO_LOCAL_LOGS),
                         "Path to store logs and checkpoints.")
    
    # Validation file flags
    flags.DEFINE_string("xtest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtest
                            ),
                        "Path to testing dataset.")
    flags.DEFINE_string("ytest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytest
                            ),
                        "Path to testing dataset.")

    # Training parameter flags
    flags.DEFINE_integer("batch_size", batch_size,
                        "Batch size [100].")
    flags.DEFINE_integer("num_epochs", num_epochs,
                        "Number epochs [50].")
    flags.DEFINE_float("train_holdout", train_holdout,
                        "Portion of training features withheld from traing and used for validation [0.2].")
    flags.DEFINE_float("learning_rate", learning_rate,
                        "Starting learning rate [0.0005].")
    flags.DEFINE_integer("steps_per_epoch", steps_per_epoch, 
                         "Number of training steps per epoch")

    # Configure Distributed Environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
                "ps": FLAGS.ps_hosts.split(","),
                "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(
                cluster_spec, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
                tf.train.replica_device_setter(
                        worker_device=worker_device,
                        cluster=cluster_spec),
                server.target,
        )

    device, target = device_and_target()

# ----- Read Data  -----   
    # Check Flags
    if FLAGS.log_dir is None or FLAGS.log_dir == "":
        raise ValueError("Must specify an explicit `log_dir`")
    if FLAGS.xtrain is None or FLAGS.xtrain == "":
        raise ValueError("Must specify an explicit `xtrain`")
    if FLAGS.ytrain is None or FLAGS.ytrain == "":
        raise ValueError("Must specify an explicit `ytrain`")
    if FLAGS.xtest is None or FLAGS.xtest == "":
        raise ValueError("Must specify an explicit `xtest`")
    if FLAGS.ytest is None or FLAGS.ytest == "":
        raise ValueError("Must specify an explicit `ytest`")
        
    print('Training dataset file: ', FLAGS.xtrain)
    print('Training target file: ', FLAGS.ytrain)

    print('Testing dataset file: ', FLAGS.xtest)
    print('Testing target file: ', FLAGS.ytest)
    
    print('Log Files Saved To: ', FLAGS.log_dir)

    # Read in data
    Xtrain, Ytrain = read_flat_file(FLAGS.xtrain, FLAGS.ytrain)           
    Xtest, Ytest = read_flat_file(FLAGS.xtest, FLAGS.ytest)  
    
    num_train = int(np.round(Xtrain.shape[0] * (1-FLAGS.train_holdout)))
    num_held = int(Xtrain.shape[0]-num_train)
    print('Training on {:d} features'.format(num_train))
    print('Validating on {:d} features (once per epoch)'.format(num_held)) 
    Xval = Xtrain[num_train:]
    Yval = Ytrain[num_train:]
    Xtrain = Xtrain[:num_train]
    Ytrain = Ytrain[:num_train]
    
    num_batches = int(np.floor(Ytrain.shape[0]/FLAGS.batch_size))
    if num_batches == 0:  # if the requested batch size exceeds the dataset size, use a single batch
        num_batches = 1
        FLAGS.batch_size = Ytrain.shape[0]

# ----- Define Graph -----

    tf.reset_default_graph()
    with tf.device(device):            
#        X_in = tf.placeholder(tf.float32, [None, 15, 4, 3])
#        Y_out = tf.placeholder(tf.float32, [None, 8])
        global_step = tf.train.get_or_create_global_step()

        # Create Datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((Xtrain, Ytrain))
#        train_dataset = train_dataset.shuffle(buffer_size=10000)
        train_dataset = train_dataset.batch(FLAGS.batch_size)
#        train_dataset = train_dataset.repeat(FLAGS.num_epochs)
        
        val_dataset = tf.data.Dataset.from_tensor_slices((Xval, Yval))
        val_dataset = val_dataset.batch(Yval.shape[0])
#        val_dataset = val_dataset.repeat(FLAGS.num_epochs)

        test_dataset = tf.data.Dataset.from_tensor_slices((Xtest, Ytest))
        test_dataset = test_dataset.batch(FLAGS.batch_size)

        # Create an iterator shared by the train/val/test datasets
        iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                                   train_dataset.output_shapes)
        features, labels = iterator.get_next()

        # Create initialisation operations
        train_init_op = iterator.make_initializer(train_dataset)
        val_init_op = iterator.make_initializer(val_dataset)
        test_init_op = iterator.make_initializer(test_dataset)
        
        # Apply model
        with tf.name_scope('predictions'):
            predictions = get_model(features, FLAGS)
        with tf.name_scope('loss'):
            loss = get_loss(predictions, labels)
        tf.summary.scalar('loss', loss)  # log the loss to TensorBoard
         
        with tf.name_scope('train'):
            train_step = (
                tf.train.AdamOptimizer(FLAGS.learning_rate)
                .minimize(loss, global_step=global_step)
                )
            
        summ = tf.summary.merge_all()
        writer = tf.summary.FileWriter(FLAGS.log_dir)
        
#%% Train Model with periodic validation
    def run_train_epoch(target, FLAGS, epoch_index):
        print('Epoch {:d} Training...'.format(epoch_index))
        i=1
        hooks = [tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch*epoch_index)]  # stop once this epoch's cumulative step count is reached
        scaffold = tf.train.Scaffold(
                local_init_op=[train_init_op, val_init_op],
                saver=tf.train.Saver(max_to_keep=5)
                )
    
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.log_dir,
                hooks = hooks,
                scaffold=scaffold
                ) as sess:
            
            writer.add_graph(sess.graph)
            sess.run(train_init_op) # switch to train dataset
            
            while not sess.should_stop():
                
                [current_loss,_,s] = sess.run([loss, train_step, summ])
                iteration = (epoch_index)*FLAGS.steps_per_epoch + i
                print("Iteration {}  Training Loss: {:.4f}".format(iteration,current_loss))
                i += 1
                #writer.add_summary(s, i)
                if i == FLAGS.steps_per_epoch:  # validate on the last step of the epoch
                    sess.run(val_init_op) # switch to val dataset
                    while True:
                        try: # run and save validation parameters
                            v_loss = sess.run(loss)
                            print("Epoch {}  Validation Loss: {:.4f}".format(epoch_index, v_loss))
                        except tf.errors.OutOfRangeError:
                            break
                        
    for e in range(1, FLAGS.num_epochs + 1):
        run_train_epoch(target, FLAGS, e)
    
    # ----- Test Model on Different Dataset -----                  
    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=(FLAGS.task_index == 0)
            ) as sess:  
        sess.run(test_init_op)  # initialize the iterator to the test dataset
        test_loss = sess.run(loss)

    print("Test Set Loss (independent dataset): {:.4f}".format(test_loss))
Example #25
try:
    # The opening lines of this try block were cut from the excerpt; the JOB_NAME /
    # TASK_INDEX reads are assumed from the fallback assignments below.
    job_name = os.environ['JOB_NAME']
    task_index = int(os.environ['TASK_INDEX'])
    ps_hosts = os.environ['PS_HOSTS']
    worker_hosts = os.environ['WORKER_HOSTS']
except:
    job_name = None
    task_index = 0
    ps_hosts = None
    worker_hosts = None

flags = tf.app.flags

# Training related flags
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="kelvinchngphysicist/2d10",  # the whole mounted repo
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="2d10",
        path=''),
    "Path to the dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch from local "
    "to ClusterOne without changing your code. If you set the data "
    "directory manually, make sure to use /data/ as the root path "
    "when running on the ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to store logs and checkpoints. It is recommended to use "
    "get_logs_path() to define your logs directory so that you can "
    "switch from local to ClusterOne without changing your code. If "
    "you set your logs directory manually, make sure to use /logs/ "
    "when running on the ClusterOne cloud.")
Example #26
def log_sum_exp(input):
    # Header and row-wise max reconstructed from the truncated excerpt (assumed).
    m, _ = torch.max(input, dim=1, keepdim=True)
    input0 = input - m
    m = m.squeeze(dim=1)  # assign the squeeze (the excerpt's bare m.squeeze() was a no-op)
    return m + torch.log(torch.sum(torch.exp(input0), dim=1))
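
If the reconstructed header above is right, the helper should agree with
torch.logsumexp row-wise; a quick, purely illustrative check:

x = torch.randn(4, 10)
assert torch.allclose(log_sum_exp(x), torch.logsumexp(x, dim=1), atol=1e-5)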


def get_log_odds(raw_marginals):
    marginals = torch.clamp(raw_marginals.mean(dim=0), 1e-7, 1 - 1e-7)
    return torch.log(marginals / (1 - marginals))
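
A small self-contained usage sketch for get_log_odds (the random tensor is only a
stand-in for a flattened batch of images in [0, 1]):

batch = torch.rand(128, 3 * 32 * 32)  # stand-in for 128 flattened CIFAR-10 images
bias_init = get_log_odds(batch)       # one log-odds value per pixel/channel
print(bias_init.shape)                # torch.Size([3072])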




train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10(
        root=get_data_path(
            dataset_name="%s/cifars3" % CLUSTERONE_USERNAME,
            local_root=opt.dataroot,
            local_repo="",
            path=""
        ),
        train=True, download=True,
        transform=transforms.Compose([transforms.ToTensor()])
    ),
    batch_size=batch_size, shuffle=True)
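
A quick sanity-check loop over the loader, assuming the standard CIFAR-10 layout
(batch_size x 3 x 32 x 32 image tensors plus a label vector):

for images, targets in train_loader:
    print(images.shape, targets.shape)  # torch.Size([batch_size, 3, 32, 32]), torch.Size([batch_size])
    break  # inspect only the first batch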

save_image_dir = get_logs_path(opt.save_image_dir)
save_model_dir = get_logs_path(opt.save_model_dir)

netE = tocuda(Encoder(latent_size, True))
netG = tocuda(Generator(latent_size))
netD = tocuda(Discriminator(latent_size, 0.2, 1))