Example no. 1
def train_rnn_noncluster(file_name,
                         hidden_size,
                         n_layers=1,
                         bidirectional=False,
                         classifier=False,
                         idx_label=None,
                         n_epochs_max=2000,
                         train_ratio=0.8,
                         batch_size=128,
                         n_workers=4,
                         root_dir=ROOT_DIR,
                         lr=0.001,
                         betas=(0.9, 0.999)):
    '''
    NOTE: to be deprecated. classifier works but regressor has not been integrated yet
    trains the recurrent neural network given a file that contains data.
    this data can be either scat transformed or pure simulated data

    inputs
    ------
    file_name: string type name of file
    hidden_size: list type, sizes of hidden states
    n_layers: number of recurrent layers
    bidirectional: if True, becomes a bidirectional LSTM
    classifier: boolean indicating whether it's a classifier or regressor.
    idx_label: int indicating index of parameter to infer. should be given when classifier is False
    n_epochs_max: maximum number of epochs to run. 
        can terminate with ctrl + c to move on to next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading.
        0 means that the data will be loaded in the main process.
    root_dir: string type root directory name
    lr: float type learning rate
    betas: tuple of floats indicating the betas arguments of the Adam optimizer

    outputs
    -------
    None: saves weights and meta data into file
    '''
    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    transformed = 'scat' in file_name
    samples = torch.load(file_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nums = cu.match_filename(r'{}_meta_rnn_([0-9]+).pt'.format(file_name),
                             root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_meta = '{}_meta_rnn_{}.pt'.format(file_name, idx)
    # data shape: (n_param_1, n_param_2,..., n_param_N, n_samples_total, n_channels, (n_nodes), data_len)
    data, labels, label_names = samples['data'], samples['labels'], samples[
        'label_names']
    # the number of dimensions that do not correspond to the batch dimension is 4 if scat transformed.
    # Otherwise, it's 3
    n_none_param_dims = 4 if transformed else 3
    n_samples_total = data.shape[-n_none_param_dims]
    n_data_total = np.prod(data.shape[:-(n_none_param_dims - 1)])
    n_labels = len(label_names)  # number of labels to predict
    if classifier:
        assert (
            idx_label is None
        ), "Invalid idx_label input: should not be given for training classifier"
        assert (isinstance(
            hidden_size,
            int)), "Invalid format of hidden_size given. Should be type int"
    else:
        # NOTE: the assertions and code below this raise are unreachable until regressor support is added
        raise NotImplementedError(
            "Training regressor for non-cluster version has not been implemented yet"
        )
        assert (
            isinstance(idx_label, int)
        ), "Invalid idx_label input: int type idx_label required for training regressor"
        if n_labels == 1 and isinstance(hidden_size, int):
            hidden_size = [hidden_size]
        assert (len(hidden_size) == n_labels
                ), "Invalid format of hidden state sizes given.\
            Should have length n_labels"
        assert(all([isinstance(hidden_size_label, int) for hidden_size_label in hidden_size])),\
            "Invalid format of hidden_size given. Should be list with int type elements"

    index = _train_test_split(n_data_total, train_ratio)
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes), data_len).
    # (n_scat_nodes) means 1 if data not transformed
    data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
    input_size = data.shape[-2]

    # initialize meta data and save it to a file
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        'input_size': input_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'bidirectional': bidirectional,
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'device': device,
        'labels': samples['labels'],
        'label_names': samples['label_names']
    }

    labels = np.array(list(product(*labels)),
                      dtype='float32')  # shaped (n_conditions, n_labels)
    if classifier:
        label_to_idx = {
            tuple(condition): idx_condition
            for idx_condition, condition in enumerate(labels)
        }
        n_conditions = len(label_to_idx)

        meta.update({
            'epoch': [],
            'weights': None,
            'elapsed': [],
            'loss': {
                'train': [],
                'val': []
            },
            'criterion': 'cross_entropy_mean',
            'label_to_idx': label_to_idx
        })
        _init_meta(**meta)

        labels = np.arange(n_conditions)  # shaped (n_conditions,)
        labels = np.repeat(
            labels,
            n_samples_total)  # shaped (n_conditions * n_samples_total,)
        # which, for example, looks like [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4]
        # for n_samples_total being 3 and n_conditions being 5

        dataset = TimeSeriesDataset(data, labels, transform=ToTensor())
        # train the neural network for classification
        print("Beginning training of {}:".format(', '.join(
            samples['label_names'])))
        _train_rnn_noncluster(dataset,
                              index,
                              hidden_size=hidden_size,
                              n_layers=n_layers,
                              bidirectional=bidirectional,
                              classifier=classifier,
                              n_epochs_max=n_epochs_max,
                              batch_size=batch_size,
                              n_workers=n_workers,
                              device=device,
                              file_name=file_name_meta,
                              root_dir=root_dir,
                              lr=lr,
                              betas=betas)
    else:
        meta.update({
            'epoch': [[] for _ in range(n_labels)],
            'weights': [None for _ in range(n_labels)],
            'elapsed': [[] for _ in range(n_labels)],
            'loss': [{
                'train': [],
                'val': []
            } for _ in range(n_labels)],
            'criterion': 'rmse'
        })
        _init_meta(**meta)

        # following is shaped (n_labels, n_conditions)
        labels = labels.swapaxes(0, 1)
        # following is shaped (n_labels, n_data_total)
        labels = np.tile(labels[:, :, np.newaxis],
                         [1, 1, n_samples_total]).reshape(
                             [n_labels, n_data_total])
        for idx_label in range(n_labels):
            dataset = TimeSeriesDataset(data,
                                        labels[idx_label],
                                        transform=ToTensor())
            # train the rnn for the given idx_label
            print("Beginning training of {}:".format(
                samples['label_names'][idx_label]))
            _train_rnn_noncluster(dataset,
                                  index,
                                  hidden_size=hidden_size[idx_label],
                                  n_layers=n_layers,
                                  bidirectional=bidirectional,
                                  classifier=classifier,
                                  n_epochs_max=n_epochs_max,
                                  batch_size=batch_size,
                                  n_workers=n_workers,
                                  device=device,
                                  idx_label=idx_label,
                                  file_name=file_name_meta,
                                  root_dir=root_dir,
                                  lr=lr,
                                  betas=betas)
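A minimal invocation sketch of the classifier path above (not from the source): the file name is a placeholder and is assumed to point at a dataset saved under ROOT_DIR in the grid-shaped format the docstring describes.

# hypothetical usage sketch for the classifier path of train_rnn_noncluster
train_rnn_noncluster('tbd_0.pt',          # placeholder file name under ROOT_DIR
                     hidden_size=64,      # int for the classifier path
                     n_layers=2,
                     bidirectional=True,
                     classifier=True,     # the regressor path raises NotImplementedError
                     n_epochs_max=500,
                     batch_size=128)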
Example no. 2
def train_rnn(file_name,
              hidden_size,
              n_layers=1,
              bidirectional=False,
              classifier=False,
              idx_label=None,
              n_epochs_max=2000,
              train_ratio=0.8,
              batch_size=128,
              n_workers=4,
              root_dir=ROOT_DIR,
              lr=0.001,
              betas=(0.9, 0.999),
              opt_level="O0",
              seed=42,
              log_interval=10):
    '''
    trains the recurrent neural network given a file that contains data.
    this data can be either scat transformed or pure simulated data

    inputs
    ------
    file_name: string type name of file
    hidden_size: list type, sizes of hidden states
    n_layers: number of recurrent layers
    bidirectional: if True, becomes a bidirectional LSTM
    classifier: boolean indicating whether it's a classifier or regressor.
    idx_label: int representing which neural network to train. should be given only when classifier is False
    n_epochs_max: maximum number of epochs to run. 
        can terminate with ctrl + c to move on to next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading.
        0 means that the data will be loaded in the main process.
    root_dir: string type root directory name
    lr: float type learning rate
    betas: tuple of floats indicating the betas arguments of the Adam optimizer
    opt_level: optimization level
    seed: random seed
    log_interval: how many batches to wait before logging training status

    outputs
    -------
    None: saves weights and meta data into file
    '''
    # NOTE: regression trains and tests on data whose parameters are sampled continuously,
    # whereas classification trains and tests on data whose parameters lie on a grid.
    # TODO: pass the dataset to _train_rnn() as a dictionary with keys 'train' and 'val'
    # instead of passing the dataset and index separately

    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    #device = hvd.local_rank()
    root_process = hvd.local_rank() == 0

    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    transformed = 'scat' in file_name
    samples = torch.load(file_path)
    # shape of data: (n_data_total, n_channels, (n_nodes), data_len)
    data, labels, label_names = samples['data'], samples['labels'], samples[
        'label_names']
    n_data_total = len(data)
    if root_process:
        assert (isinstance(
            hidden_size,
            int)), "Invalid format of hidden_size given. Should be type int"

    if classifier:
        idx = 0
        if root_process:
            nums = cu.match_filename(
                r'{}_meta_rnn_([0-9]+).pt'.format(file_name),
                root_dir=root_dir)  # FIXME
            nums = [int(num) for num in nums]
            idx = max(nums) + 1 if nums else 0
        idx = hvd.broadcast(torch.tensor(idx),
                            root_rank=0,
                            name='idx_file_meta').item()
        file_name_meta = '{}_meta_rnn_{}.pt'.format(file_name, idx)
    else:
        label = labels[idx_label]
        label_name = label_names[idx_label]
        idx = 0
        if root_process:
            nums = cu.match_filename(r'{}_meta_rnn_([0-9]+)_{}.pt'.format(
                file_name, label_name),
                                     root_dir=root_dir)
            nums = [int(num) for num in nums]
            idx = max(nums) + 1 if nums else 0
        idx = hvd.broadcast(torch.tensor(idx),
                            root_rank=0,
                            name='idx_file_meta').item()
        file_name_meta = '{}_meta_rnn_{}_{}.pt'.format(file_name, idx,
                                                       label_name)

    index = _train_test_split(n_data_total, train_ratio)
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes), data_len).
    # (n_scat_nodes) means 1 if data not transformed
    if isinstance(data, np.ndarray):
        data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
    elif isinstance(data, list):
        data = [
            np.reshape(data_slice, (-1, data_slice.shape[-1]))
            for data_slice in data
        ]
    else:
        raise ValueError("Invalid type of data given")
    input_size = data[0].shape[0]

    # initialize meta data and save it to a file
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        'input_size': input_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'bidirectional': bidirectional,
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'epoch': [],
        'weights': None,
        'elapsed': [],
        'loss': {
            'train': [],
            'val': []
        },
        'criterion': 'cross_entropy_mean' if classifier else 'rmse',
        'labels': labels if classifier else label,
        'label_names': label_names if classifier else label_name
    }
    if classifier:
        if 'labels_lut' in samples.keys():
            meta.update({'labels_lut': samples['labels_lut']})
        _init_meta(
            **meta
        )  # done for all processes to ensure data gets loaded after initializing file

        dataset = TimeSeriesDataset(data, labels, transform=ToTensor())
        # train the neural network for classification
        if root_process:
            print("Training classifier for {}:".format(', '.join(
                samples['label_names'])))
        _train_rnn(dataset,
                   index,
                   hidden_size=hidden_size,
                   n_layers=n_layers,
                   bidirectional=bidirectional,
                   classifier=classifier,
                   n_epochs_max=n_epochs_max,
                   batch_size=batch_size,
                   n_workers=n_workers,
                   file_name=file_name_meta,
                   root_dir=root_dir,
                   lr=lr,
                   betas=betas,
                   opt_level=opt_level,
                   seed=seed,
                   log_interval=log_interval)
    else:
        _init_meta(
            **meta
        )  # done for all processes to ensure data gets loaded after initializing file

        dataset = TimeSeriesDataset(data, label, transform=ToTensor())
        # train the rnn for the given idx_label
        if root_process: print("Training regressor for {}:".format(label_name))
        _train_rnn(dataset,
                   index,
                   hidden_size=hidden_size,
                   n_layers=n_layers,
                   bidirectional=bidirectional,
                   classifier=classifier,
                   n_epochs_max=n_epochs_max,
                   batch_size=batch_size,
                   n_workers=n_workers,
                   file_name=file_name_meta,
                   root_dir=root_dir,
                   lr=lr,
                   betas=betas,
                   opt_level=opt_level,
                   seed=seed,
                   log_interval=log_interval)
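Since train_rnn calls hvd.init() and pins each process to its local GPU, it is intended to be launched with one Horovod process per GPU, every worker calling it with identical arguments. A hedged sketch of a regressor run follows; the data file name is a placeholder and the launch command in the comment assumes a wrapper script that is not part of this listing.

# hypothetical usage sketch: executed by every Horovod worker, e.g. via
#   horovodrun -np 4 python train.py   (script name is a placeholder)
train_rnn('tbd_0.pt',          # placeholder file name under ROOT_DIR
          hidden_size=64,
          classifier=False,
          idx_label=1,         # regress the second label, e.g. 'k_ratios'
          n_epochs_max=1000,
          lr=0.001,
          opt_level="O0",
          seed=42)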
Example no. 3
    '(tbd_0_meta_rnn_[0-9]+_diff_coef_ratios.pt)'
]

idx_file_start = 0  # None or 0 to start from beginning
idx_file_end = 5  # None for going to end

epoch_len = 200  # only consider files that trained for the full epoch_len epochs
#plt.style.use('dark_background')
fontsize_label = 14
fontsize_title = 18
fig_w = 12
fig_h = 8

file_names = []
for file_name_regex in file_name_regexs:
    file_names += cu.match_filename(file_name_regex, root_dir)
file_paths = [os.path.join(root_dir, file_name) for file_name in file_names]

file_paths_tmp = []
plt.close('all')
for file_path in file_paths:
    meta = torch.load(file_path)
    if len(meta['epoch']) == epoch_len:
        file_paths_tmp.append(file_path)

file_paths = file_paths_tmp[idx_file_start:idx_file_end]
n_files = len(file_paths)

figs = []
axs = []
for file_path in file_paths:
Example no. 4
def train_nn(file_name,
             n_nodes_hidden,
             classifier=False,
             n_epochs_max=2000,
             train_ratio=0.8,
             batch_size=128,
             n_workers=4,
             root_dir=ROOT_DIR,
             lr=0.001,
             betas=(0.9, 0.999)):
    '''
    trains the neural network given a file that contains data.
    this data can be either scat transformed or pure simulated data

    NOTE: requires refactoring to run on cluster

    inputs
    ------
    file_name: string type name of file
    n_nodes_hidden: list type. For classification, a list of node counts for the hidden layers.
        For regression, a list of such lists, one per label to predict
    classifier: boolean indicating whether it's a classifier or regressor.
    n_epochs_max: int, maximum number of epochs to run. 
        can terminate with ctrl + c to move on to next neural network training.
    train_ratio: float indicating ratio for training data. should be between 0 and 1
    batch_size: size of batch for computing gradient
    n_workers: how many subprocesses to use for data loading.
        0 means that the data will be loaded in the main process.
    root_dir: string type root directory name
    lr: float type learning rate
    betas: tuple of floats indicating the betas arguments of the Adam optimizer

    outputs
    -------
    None: saves weights and meta data into file
    '''

    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    transformed = 'scat' in file_name
    samples = torch.load(file_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nums = cu.match_filename(r'{}_meta_nn_([0-9]+).pt'.format(file_name),
                             root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_meta = '{}_meta_nn_{}.pt'.format(file_name, idx)

    # data shape: (n_param_1, n_param_2,..., n_param_N, n_samples_total, n_channels, (n_nodes), data_len)
    data, labels, label_names = samples['data'], samples['labels'], samples[
        'label_names']
    # the number of dimensions that do not correspond to the batch dimension is 4 if scat transformed.
    # Otherwise, it's 3
    n_none_param_dims = 4 if transformed else 3
    n_samples_total = data.shape[-n_none_param_dims]
    n_data_total = np.prod(data.shape[:-(n_none_param_dims - 1)])
    n_labels = len(label_names)  # number of labels to predict
    assert (isinstance(
        n_nodes_hidden,
        list)), "Invalid format of nodes given. Should be type list"
    if not classifier:
        if n_labels == 1 and not isinstance(n_nodes_hidden[0], list):
            n_nodes_hidden = [n_nodes_hidden]
        assert (
            len(n_nodes_hidden) == n_labels), "Invalid format of nodes given.\
                Should be n_labels number of lists"
        assert(all([isinstance(n_nodes_hidden_label, list) for n_nodes_hidden_label in n_nodes_hidden])),\
            "Invalid format of nodes given. Should provide list of {} lists".format(n_labels)
    index = _train_test_split(n_data_total, train_ratio)
    index['val'] = index.pop('test')

    # reshape data. output is shaped (n_data_total, n_channels * (n_scat_nodes) * data_len).
    # (n_scat_nodes) means 1 if data not transformed
    data = np.reshape(data, (n_data_total, -1))

    # initialize meta data and save it to a file
    meta = {
        'file_name': file_name_meta,
        'root_dir': root_dir,
        # 'n_nodes' (full layer structure) is added below once input/output sizes are known
        'classifier': classifier,
        'n_epochs_max': n_epochs_max,
        'train_ratio': train_ratio,
        'batch_size': batch_size,
        'n_workers': n_workers,
        'index': index,
        'device': device,
        'labels': samples['labels'],
        'label_names': samples['label_names']
    }

    labels = np.array(list(product(*labels)),
                      dtype='float32')  # shaped (n_conditions, n_labels)
    if classifier:
        label_to_idx = {
            tuple(condition): idx_condition
            for idx_condition, condition in enumerate(labels)
        }
        n_conditions = len(label_to_idx)
        n_nodes = [data.shape[-1]] + n_nodes_hidden + [n_conditions]
        meta.update({
            'epoch': [],
            'weights': None,
            'elapsed': [],
            'loss': {
                'train': [],
                'val': []
            },
            'criterion': 'cross_entropy_mean',
            'label_to_idx': label_to_idx,
            'n_nodes': n_nodes
        })
        _init_meta(**meta)
        labels = np.arange(n_conditions)  # shaped (n_conditions,)
        labels = np.repeat(
            labels,
            n_samples_total)  # shaped (n_conditions * n_samples_total,)
        # which, for example, looks like [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4]
        # for n_samples_total being 3 and n_conditions being 5

        dataset = TimeSeriesDataset(data, labels, transform=ToTensor())
        # train the neural network for classification
        print("Beginning training of {}:".format(', '.join(
            samples['label_names'])))
        _train_nn(dataset,
                  index,
                  n_nodes_hidden=n_nodes_hidden,
                  classifier=classifier,
                  n_epochs_max=n_epochs_max,
                  batch_size=batch_size,
                  device=device,
                  n_workers=n_workers,
                  file_name=file_name_meta,
                  root_dir=root_dir)
    else:
        n_nodes = [[data.shape[-1]] + n_nodes_hidden_label + [1]
                   for n_nodes_hidden_label in n_nodes_hidden]
        meta.update({
            'epoch': [[] for _ in range(n_labels)],
            'weights': [None for _ in range(n_labels)],
            'elapsed': [[] for _ in range(n_labels)],
            'loss': [{
                'train': [],
                'val': []
            } for _ in range(n_labels)],
            'criterion': 'rmse',
            'n_nodes': n_nodes
        })
        _init_meta(**meta)
        # following is shaped (n_labels, n_conditions)
        labels = labels.swapaxes(0, 1)
        # following is shaped (n_labels, n_data_total)
        labels = np.tile(labels[:, :, np.newaxis],
                         [1, 1, n_samples_total]).reshape(
                             [n_labels, n_data_total])
        for idx_label in range(n_labels):
            dataset = TimeSeriesDataset(data,
                                        labels[idx_label],
                                        transform=ToTensor())
            # train the neural network for the given idx_label
            print("Beginning training of {}:".format(
                samples['label_names'][idx_label]))
            _train_nn(dataset,
                      index,
                      n_nodes_hidden=n_nodes_hidden[idx_label],
                      classifier=classifier,
                      n_epochs_max=n_epochs_max,
                      batch_size=batch_size,
                      device=device,
                      n_workers=n_workers,
                      idx_label=idx_label,
                      file_name=file_name_meta,
                      root_dir=root_dir)
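To make the two n_nodes_hidden formats described in the docstring concrete, a hedged sketch (file name and layer widths are placeholders): classification takes a single list of hidden-layer widths, while regression takes one such list per label.

# hypothetical usage sketch for the two n_nodes_hidden formats of train_nn
train_nn('tbd_0.pt', n_nodes_hidden=[256, 128, 64], classifier=True)    # single hidden-layer spec
train_nn('tbd_0.pt',
         n_nodes_hidden=[[256, 128], [256, 128], [128, 64]],            # one spec per label
         classifier=False)                                              # e.g. 3 labels to regress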
Example no. 5
def sim_two_beads(data_len,
                  gammas,
                  k_ratios,
                  diff_coef_ratios,
                  dt,
                  n_data=1,
                  n_steps_initial=10000,
                  save_file=False,
                  root_dir=ROOT_DIR,
                  dtype='float32'):
    '''
    returns ensemble of two bead simulation trajectories.

    inputs:
    -------
    - data_len: int, length of each process
    - gammas: numeric or list-like, drag coefficient values
    - k_ratios: numeric or list-like, ratios of spring constants
    - diff_coef_ratios: numeric or list-like, ratios of diffusion coefficients
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 2, data_len) which is an
        ensemble of two beads trajectories. the 2nd dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are indices for the label values
        'label_names': list whose elements are string type, ['gammas', 'k_ratios', 'diff_coef_ratios']
        'labels_lut': list where the values are the label values given the index values in 'labels'
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data. returned if save_file is True

    FIXME: check the code to see if the dimensions are not mixed up, check if actual simulation part is not mixed up with initial condition simulation
    '''
    if isinstance(gammas, (int, float)):
        gammas = [gammas]
    if isinstance(k_ratios, (int, float)):
        k_ratios = [k_ratios]
    if isinstance(diff_coef_ratios, (int, float)):
        diff_coef_ratios = [diff_coef_ratios]

    gammas = np.array(gammas, dtype=dtype)
    k_ratios = np.array(k_ratios, dtype=dtype)
    diff_coef_ratios = np.array(diff_coef_ratios, dtype=dtype)
    n_gammas = len(gammas)
    n_k_ratios = len(k_ratios)
    n_diff_coef_ratios = len(diff_coef_ratios)

    file_size_est = data_len * n_gammas * n_diff_coef_ratios * n_k_ratios * n_data * 2 * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    processes = np.empty((n_gammas, n_k_ratios, n_diff_coef_ratios, n_data, 2,
                          data_len)).astype(dtype)

    for idx0, gamma in enumerate(gammas):
        for idx1, k in enumerate(k_ratios):
            for idx2, diff_coef in enumerate(diff_coef_ratios):
                force_matrix = np.array([[-(1 + k), k], [k, -(1 + k)]])
                diffusion_matrix = np.array([[diff_coef, 0], [0, 1]])
                prefactor1 = force_matrix * dt
                prefactor2 = np.sqrt(2 * diffusion_matrix * dt)
                rand_nums = np.random.normal(0, 1,
                                             [n_steps_initial, 2, n_data])
                x0 = np.zeros((2, n_data))
                for idx in range(n_steps_initial):
                    x0 = x0 + np.matmul(prefactor1, x0) + np.matmul(
                        prefactor2, rand_nums[idx])
                processes[idx0, idx1, idx2, :, :, 0] = x0.T

        for idx1, k in enumerate(k_ratios):
            for idx2, diff_coef in enumerate(diff_coef_ratios):
                x = processes[idx0, idx1, idx2, :, :, 0].T  # resume from the initial condition simulated above for this (k, diff_coef) pair
                force_matrix = np.array([[-(1 + k), k], [k, -(1 + k)]])
                diffusion_matrix = np.array([[diff_coef, 0], [0, 1]])
                prefactor1 = force_matrix * dt
                prefactor2 = np.sqrt(2 * diffusion_matrix * dt)
                rand_nums = np.random.normal(0, 1, [data_len - 1, 2, n_data])
                for idx in range(data_len - 1):
                    x = x + np.matmul(prefactor1, x) + np.matmul(
                        prefactor2, rand_nums[idx])
                    processes[idx0, idx1, idx2, :, :, idx + 1] = x.T

        processes[idx0] = processes[idx0] / gamma

    # reshape data
    n_data_total = n_gammas * n_k_ratios * n_diff_coef_ratios * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 2, data_len))  # shaped (n_data_total, 2, data_len)

    # reshape labels
    labels = [gammas, k_ratios, diff_coef_ratios]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)

    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels,
                       n_data)  # shaped (n_conditions * n_samples_total,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['gammas', 'k_ratios', 'diff_coefs'],
        'dt': dt,
        'n_steps_initial': n_steps_initial,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'tbd_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
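For reference, the inner loops above implement an Euler-Maruyama step of the coupled overdamped Langevin equations; written out (notation mine, not from the source), each step computes

\[
x_{t+\Delta t} = x_t + F\,x_t\,\Delta t + \sqrt{2 D \Delta t}\;\xi_t,
\qquad
F = \begin{pmatrix} -(1+k) & k \\ k & -(1+k) \end{pmatrix},
\quad
D = \begin{pmatrix} D_r & 0 \\ 0 & 1 \end{pmatrix},
\quad
\xi_t \sim \mathcal{N}(0, I_2),
\]

where k is the spring constant ratio, D_r the diffusion coefficient ratio, and the finished trajectories are divided by gamma.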
Example no. 6
def sim_two_beads_sample(data_len,
                         gammas,
                         k_ratios,
                         diff_coef_ratios,
                         dt,
                         n_data=1,
                         n_steps_initial=10000,
                         save_file=False,
                         root_dir=ROOT_DIR,
                         dtype='float32'):
    '''
    returns ensemble of two bead simulation trajectories for a given range of spring constant and diffusion coefficient values.

    inputs:
    -------
    - data_len: int, length of each process
    - gammas: numeric or length 2 list-like representing low, high values of the drag coefficient values
    - k_ratios: numeric or length 2 list-like representing low, high values of the ratios of spring constants
    - diff_coef_ratios: numeric or length 2 list-like representing low, high values of the ratios of diffusion coefficients
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 2, data_len) which is an
        ensemble of two beads trajectories. the 2nd dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are the ndarrays. 
            each ndarray is shaped (n_data,) whose values are the gammas, k_ratios, diff_coef_ratios values
        'label_names': list whose elements are string type, ['gammas', 'k_ratios', 'diff_coef_ratios']
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data. returned if save_file is True

    FIXME: check the code to see if actual simulation part is not mixed up with initial condition simulation
    '''
    if isinstance(gammas, (int, float)):
        gammas = np.array([gammas, gammas], dtype=dtype)
    if isinstance(k_ratios, (int, float)):
        k_ratios = np.array([k_ratios, k_ratios], dtype=dtype)
    if isinstance(diff_coef_ratios, (int, float)):
        diff_coef_ratios = np.array([diff_coef_ratios, diff_coef_ratios],
                                    dtype=dtype)
    assert (
        len(gammas) == 2
    ), "Invalid gammas given: should be numeric or length 2 list-like format"
    assert (
        len(k_ratios) == 2
    ), "Invalid k_ratios given: should be numeric or length 2 list-like format"
    assert (
        len(diff_coef_ratios) == 2
    ), "Invalid diff_coef_ratios given: should be numeric or length 2 list-like format"
    gamma_low, gamma_high = gammas
    k_ratio_low, k_ratio_high = k_ratios
    diff_coef_ratio_low, diff_coef_ratio_high = diff_coef_ratios
    gamma_samples = (gamma_high - gamma_low) * np.random.random(
        n_data, ) + gamma_low
    k_ratio_samples = (k_ratio_high - k_ratio_low) * np.random.random(
        n_data, ) + k_ratio_low
    diff_coef_ratio_samples = (diff_coef_ratio_high -
                               diff_coef_ratio_low) * np.random.random(
                                   n_data, ) + diff_coef_ratio_low
    param_samples = np.stack(
        [gamma_samples, k_ratio_samples, diff_coef_ratio_samples], axis=1)

    concat_list = []
    for gamma_sample, k_ratio_sample, diff_coef_ratio_sample in param_samples:
        process = sim_two_beads(data_len,
                                gammas=gamma_sample,
                                k_ratios=k_ratio_sample,
                                diff_coef_ratios=diff_coef_ratio_sample,
                                dt=dt,
                                n_data=1,
                                n_steps_initial=n_steps_initial,
                                save_file=False,
                                dtype=dtype)
        process = process['data']
        concat_list.append(process)
    processes = np.concatenate(concat_list,
                               axis=0)  # shaped (n_data, 2, data_len)

    samples = {
        'data': processes,
        'labels': [gamma_samples, k_ratio_samples, diff_coef_ratio_samples],
        'label_names': ['gammas', 'k_ratios', 'diff_coef_ratios'],
        'dt': dt,
        'n_steps_initial': n_steps_initial
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'tbd_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
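A hedged usage sketch of the sampled variant (values below are placeholders): parameter ranges are given as (low, high) pairs and one trajectory is simulated per sampled (gamma, k_ratio, diff_coef_ratio) triple.

# hypothetical usage sketch: 100 trajectories with parameters drawn uniformly from the given ranges
samples = sim_two_beads_sample(data_len=2048,
                               gammas=[1., 1.],             # fixed gamma
                               k_ratios=[1., 8.],           # uniform in [1, 8]
                               diff_coef_ratios=[1., 10.],  # uniform in [1, 10]
                               dt=0.01,
                               n_data=100,
                               save_file=False)
print(samples['data'].shape)  # expected: (100, 2, 2048)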
Example no. 7
def sim_poisson_sample(data_len,
                       lams,
                       dt,
                       n_data=1,
                       save_file=False,
                       root_dir=ROOT_DIR,
                       dtype='float32'):
    '''
    returns ensemble of poisson processes for a given range of lambda values

    inputs:
    -------
    - data_len: int, length of each process
    - lams: numeric or length 2 list-like representing low, high values of expectation per interval value
    - dt: time step between data points
    - n_data: int, number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 1, data_len) which is an
        ensemble of poisson trajectories. the singleton dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are the lams values
        'label_names': list whose elements are string type, ['lams']
        'dt': float type dt
    - (file_name): string type file name of the simulated data. returned if save_file is True

    REVIEW: confirm this method of using a fixed time step generates statistics identical to those of the Gillespie algorithm
    '''
    if isinstance(lams, (int, float)):
        lams = np.array([lams, lams], dtype=dtype)
    assert (
        len(lams) == 2
    ), "Invalid lams given: should be numeric or length 2 list-like format"
    lam_low, lam_high = lams
    lam_samples = (lam_high - lam_low) * np.random.random(n_data, ) + lam_low

    concat_list = []
    for lam_sample in lam_samples:
        process = sim_poisson(data_len,
                              lams=lam_sample,
                              dt=dt,
                              n_data=1,
                              save_file=False,
                              dtype=dtype)
        process = process['data']
        concat_list.append(process)
    processes = np.concatenate(concat_list,
                               axis=0)  # shaped (n_data, 1, data_len)

    samples = {
        'data': processes,
        'labels': [lam_samples],
        'label_names': ['lams'],
        'dt': dt
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'pos_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'pos_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
Example no. 8
def sim_one_bead(data_len,
                 ks,
                 diff_coefs,
                 dt,
                 n_data=1,
                 n_steps_initial=10000,
                 save_file=False,
                 root_dir=ROOT_DIR,
                 dtype='float32'):
    '''
    returns ensemble of one bead simulation trajectories. as there is only one heat bath, this is a passive trajectory

    inputs:
    -------
    - data_len: int, length of each process
    - ks: numeric or list or ndarray, spring constant values
    - diff_coefs: numeric or list or ndarray, diffusion coefficient values
    - dt: time step between data points
    - n_data: number of processes in ensemble
    - n_steps_initial: number of steps to take in Langevin equation for simulating initial positions
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 1, data_len) which is an
        ensemble of one bead trajectories. the singleton dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are indices for the label values
        'label_names': list whose elements are string type, ['ks', 'diff_coefs']
        'labels_lut': list where the values are the label values given the index values in 'labels'
        'dt': float type dt
        'n_steps_initial': int type n_steps_initial
    - (file_name): string type file name of the simulated data. returned if save_file is True

    FIXME: check the code to see if actual simulation part is not mixed up with initial condition simulation
    '''
    if isinstance(ks, (int, float)):
        ks = [ks]
    if isinstance(diff_coefs, (int, float)):
        diff_coefs = [diff_coefs]

    ks = np.array(ks, dtype=dtype)
    diff_coefs = np.array(diff_coefs, dtype=dtype)
    n_ks = len(ks)
    n_diff_coefs = len(diff_coefs)

    file_size_est = data_len * n_diff_coefs * n_ks * n_data * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    processes = np.empty((n_ks, n_diff_coefs, n_data, data_len)).astype(dtype)

    for idx0, k in enumerate(ks):
        for idx1, diff_coef in enumerate(diff_coefs):
            prefactor1 = k * dt
            prefactor2 = np.sqrt(2 * diff_coef * dt)
            rand_nums = np.random.normal(0, 1, [n_steps_initial, n_data])
            x0 = np.zeros(n_data)
            for idx in range(n_steps_initial):
                x0 = x0 - prefactor1 * x0 + prefactor2 * rand_nums[idx]
            processes[idx0, idx1, :, 0] = x0

    for idx0, k in enumerate(ks):
        for idx1, diff_coef in enumerate(diff_coefs):
            x = processes[idx0, idx1, :, 0]
            prefactor1 = k * dt
            prefactor2 = np.sqrt(2 * diff_coef * dt)
            rand_nums = np.random.normal(0, 1, [data_len - 1, n_data])
            for idx in range(data_len - 1):
                x = x - prefactor1 * x + prefactor2 * rand_nums[idx]
                processes[idx0, idx1, :, idx + 1] = x

    processes = np.expand_dims(processes, axis=-2)

    # reshape data
    n_data_total = n_ks * n_diff_coefs * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 1, data_len))  # shaped (n_data_total, 1, data_len)

    # reshape labels
    labels = [ks, diff_coefs]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)

    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels,
                       n_data)  # shaped (n_conditions * n_samples_total,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['ks', 'diff_coefs'],
        'dt': dt,
        'n_steps_initial': n_steps_initial,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'obd_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'obd_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
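The one bead loop is an Euler-Maruyama scheme for an Ornstein-Uhlenbeck process, whose stationary variance is diff_coef / k, so the late-time positions give a quick sanity check. A hedged sketch (not from the source; parameter values are placeholders):

# hypothetical sanity check: for dx = -k x dt + sqrt(2 D) dW the stationary variance is D / k
out = sim_one_bead(data_len=5000, ks=2.0, diff_coefs=1.0, dt=0.01,
                   n_data=200, save_file=False)
x_final = out['data'][:, 0, -1]           # final positions across the ensemble
print(x_final.var(), 1.0 / 2.0)           # empirical vs theoretical D / k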
Example no. 9
def sim_brownian(data_len,
                 diff_coefs,
                 dt,
                 n_data=1,
                 save_file=False,
                 root_dir=ROOT_DIR,
                 dtype='float32'):
    '''
    returns ensemble of brownian trajectories

    inputs:
    -------
    - data_len: int, length of each process
    - diff_coefs: numeric or list or ndarray, diffusion coefficient values
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 1, data_len) which is an
        ensemble of brownian trajectories. the singleton dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are indices for the label values
        'label_names': list whose elements are string type, ['diff_coefs']
        'labels_lut': list where the values are the label values given the index values in 'labels'
        'dt': float type dt
    - (file_name): string type file name of the simulated data. returned if save_file is True
    '''
    concat_list = []
    if isinstance(diff_coefs, (int, float)):
        diff_coefs = [diff_coefs]
    diff_coefs = np.array(diff_coefs, dtype=dtype)
    n_diff_coefs = len(diff_coefs)
    file_size_est = data_len * len(diff_coefs) * n_data * np.dtype(
        dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    for diff_coef in diff_coefs:
        increments = np.sqrt(2 * diff_coef * dt) * np.random.normal(
            0, 1, [n_data, data_len - 1])
        x0 = np.random.normal(0, 1, [n_data, 1])
        increments = np.concatenate([x0, increments], axis=1)
        processes = increments.cumsum(axis=1)
        concat_list.append(processes.astype(dtype))

    processes = np.stack(concat_list, axis=0)
    processes = np.expand_dims(processes, axis=-2)

    # reshape data
    n_data_total = n_diff_coefs * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 1, data_len))  # shaped (n_data_total, 1, data_len)

    # reshape labels
    labels = [diff_coefs]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)

    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels,
                       n_data)  # shaped (n_conditions * n_samples_total,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['diff_coefs'],
        'dt': dt,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'brw_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'brw_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
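Since each trajectory is a cumulative sum of independent Gaussian increments with variance 2 * diff_coef * dt, the increment statistics give a quick check. A hedged sketch (not from the source; values are placeholders):

import numpy as np

# hypothetical check: increments of a Brownian trajectory should have variance close to 2 * D * dt
out = sim_brownian(data_len=10000, diff_coefs=3.0, dt=0.01, n_data=50, save_file=False)
increments = np.diff(out['data'][:, 0, :], axis=-1)
print(increments.var(), 2 * 3.0 * 0.01)   # empirical vs theoretical 2 * D * dt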
Example no. 10
def sim_poisson(data_len,
                lams,
                dt,
                n_data=1,
                save_file=False,
                root_dir=ROOT_DIR,
                dtype='float32'):
    '''
    returns ensemble of poisson processes

    inputs:
    -------
    - data_len: int, length of each process
    - lams: numeric or list or ndarray, expectation per interval
    - dt: time step between data points
    - n_data: number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 1, data_len) which is an
        ensemble of poisson trajectories. the singleton dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are indices for the label values
        'label_names': list whose elements are string type, ['lams']
        'labels_lut': list where the values are the label values given the index values in 'labels'
        'dt': float type dt
    - (file_name): string type file name of the simulated data. returned if save_file is True

    REVIEW: confirm this method of using a fixed time step generates statistics identical to those of the Gillespie algorithm
    '''
    if isinstance(lams, (int, float)):
        lams = [lams]
    lams = np.array(lams, dtype=dtype)
    n_lams = len(lams)

    file_size_est = data_len * n_lams * n_data * np.dtype(dtype).itemsize
    file_size_est_gb = file_size_est / 1.e9
    if file_size_est_gb > 2.:
        warnings.warn("Generating file with size roughly {:.2f} GB".format(
            file_size_est_gb),
                      category=BytesWarning)

    concat_list = []
    for lam in lams:
        increments = np.random.poisson(lam * dt, size=[n_data, data_len])
        processes = increments.cumsum(axis=1)
        concat_list.append(processes.astype(dtype))
    processes = np.stack(concat_list, axis=0)
    processes = np.expand_dims(processes, axis=-2)

    # reshape data
    n_data_total = n_lams * n_data
    processes = np.reshape(
        processes,
        (n_data_total, 1, data_len))  # shaped (n_data_total, 1, data_len)

    # reshape labels
    labels = [lams]
    labels = np.array(list(product(*labels)),
                      dtype=dtype)  # shaped (n_conditions, n_labels)
    labels_lut = [tuple(condition) for condition in labels]
    n_conditions = len(labels_lut)

    labels = np.arange(n_conditions)  # shaped (n_conditions,)
    labels = np.repeat(labels,
                       n_data)  # shaped (n_conditions * n_samples_total,)

    samples = {
        'data': processes,
        'labels': labels,
        'label_names': ['lams'],
        'dt': dt,
        'labels_lut': labels_lut
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'pos_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'pos_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
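The counts are accumulated from Poisson increments with mean lam * dt, so the final count should be close to lam * dt * data_len on average. A hedged sketch (not from the source; values are placeholders):

# hypothetical check: a Poisson counting process ends near lam * dt * data_len on average
out = sim_poisson(data_len=1000, lams=5.0, dt=0.1, n_data=200, save_file=False)
final_counts = out['data'][:, 0, -1]
print(final_counts.mean(), 5.0 * 0.1 * 1000)   # empirical vs expected count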
Example no. 11
def sim_brownian_sample(data_len,
                        diff_coefs,
                        dt,
                        n_data=1,
                        save_file=False,
                        root_dir=ROOT_DIR,
                        dtype='float32'):
    '''
    returns ensemble of brownian trajectories for a given range of diffusion coefficients

    inputs:
    -------
    - data_len: int, length of each process
    - diff_coefs: numeric or length 2 list-like representing low, high values of diffusion coefficients
    - dt: float, time step between data points
    - n_data: int, number of processes in ensemble
    - save_file: boolean, whether or not to save the file. If True, file name is returned.
        Otherwise, data is returned
    - root_dir: string, root directory to save file if save_file is True
    - dtype: 'float32' or 'float64', precision of output data

    outputs:
    --------
    - (processes): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, 1, data_len) which is an
        ensemble of brownian trajectories. the singleton dimension is for the number of channels.
        returned if save_file is False
        'labels': list whose values are the diff_coefs values
        'label_names': list whose elements are string type, ['diff_coefs']
        'dt': float type dt
    - (file_name): string type file name of the simulated data. returned if save_file is True
    '''
    if isinstance(diff_coefs, (int, float)):
        diff_coefs = np.array([diff_coefs, diff_coefs], dtype=dtype)
    assert (
        len(diff_coefs) == 2
    ), "Invalid diff_coefs given: should be numeric or length 2 list-like format"
    diff_coef_low, diff_coef_high = diff_coefs
    diff_coef_samples = (diff_coef_high - diff_coef_low) * np.random.random(
        n_data, ) + diff_coef_low

    concat_list = []
    for diff_coef_sample in diff_coef_samples:
        process = sim_brownian(data_len,
                               diff_coefs=diff_coef_sample,
                               dt=dt,
                               n_data=1,
                               save_file=False,
                               dtype=dtype)
        process = process['data']
        concat_list.append(process)
    processes = np.concatenate(concat_list,
                               axis=0)  # shaped (n_data, 1, data_len)

    samples = {
        'data': processes,
        'labels': [diff_coef_samples],
        'label_names': ['diff_coefs'],
        'dt': dt
    }
    if not save_file:
        return samples

    nums = cu.match_filename(r'brw_([0-9]+).pt', root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0

    file_name = 'brw_{}.pt'.format(idx)
    file_path = os.path.join(root_dir, file_name)
    torch.save(samples, file_path)
    return file_name
    ]
for file_name_meta in file_names_meta:
    assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case"
file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta]

file_names_meta = [
    cu.match_filename('(tbd_4_meta_rnn_[0-9]+_k_ratios.pt)', root_dir),
    cu.match_filename('(tbd_4_scat_0_meta_rnn_[0-9]+_k_ratios.pt)', root_dir),
    ]
for file_name_meta in file_names_meta:
    assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case"
file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta]

"""
file_names_meta = [
    cu.match_filename('(tbd_4_meta_rnn_[0-9]+_diff_coef_ratios.pt)', root_dir),
    cu.match_filename('(tbd_4_scat_0_meta_rnn_[0-9]+_diff_coef_ratios.pt)', root_dir),
    ]
for file_name_meta in file_names_meta:
    assert(len(file_name_meta) == 1), "Invalid number of files. Should be only 1 trained model for each case"
file_names_meta = [file_name_meta[0] for file_name_meta in file_names_meta]




#file_names_meta = ['data_meta_rnn_1.pt', 'data_scat_0_meta_rnn_1.pt']

# IRFP
#file_names_meta = ['data_meta_rnn_11.pt', 'data_scat_0_meta_rnn_11.pt']
# OR, provide file names and paths using regular expression
#file_paths_meta = glob.glob(os.path.join(root_dir, 'tbd_0_scat_meta_rnn_*.pt'))
Example no. 13
            except:
                print("exception occurred during scat transformation for n_data:{} with parameters avg_len:{}, n_filter_octave:{}".format(n_data, avg_len, n_filter_octave))

# simulate data for testing performance
print("simulating data for evaluation for randomly sampled labels")
k_ratios_test = (k_ratios_test_high - k_ratios_test_low) * np.random.random(n_data_test,) + k_ratios_test_low 
diff_coef_ratios_test = (diff_coef_ratios_test_high - diff_coef_ratios_test_low) * np.random.random(n_data_test,) + diff_coef_ratios_test_low
k_ratios_diff_coef_ratios_test = np.stack([k_ratios_test, diff_coef_ratios_test], axis=1)

data_tests = []
for k_ratio_test, diff_coef_ratio_test in k_ratios_diff_coef_ratios_test:
    data_test = siu.sim_two_beads(data_len, k_ratios=k_ratio_test, diff_coef_ratios=diff_coef_ratio_test, dt=dt, n_data=1, n_steps_initial=10000, save_file=False)
    data_tests.append(data_test)
processes = np.concatenate(data_tests, axis=2) # shaped (1, 1, n_data_test, n_channels, data_len)
samples = {'data':processes, 'labels':k_ratios_diff_coef_ratios_test, 'label_names':'k_ratios_diff_coef_ratios', 'dt':dt, 'n_steps_initial':10000}
nums = cu.match_filename(r'tbd_([0-9]+).pt', root_dir=root_dir)
nums = [int(num) for num in nums]
idx = max(nums) + 1 if nums else 0

file_name_test = 'tbd_{}.pt'.format(idx)
file_path_test = os.path.join(root_dir, file_name_test)
torch.save(samples, file_path_test)

# scat transforming test data
file_names_scat_test = []
for avg_len in avg_lens:
    for n_filter_octave in n_filter_octaves:
        try:
            print("scat transforming n_data_test:{} with parameters avg_len:{}, n_filter_octave:{}".format(n_data_test, avg_len, n_filter_octave))
            file_name_scat_test = scu.scat_transform(file_name_test, avg_len, log_transform=False, n_filter_octave=n_filter_octave, save_file=True, root_dir=root_dir)
            file_names_scat_test.append(file_name_scat_test)
Example no. 14
def scat_transform(file_name, avg_len, log_transform=False, n_filter_octave=[1, 1], 
    filter_format='fourier_truncated', save_file=True, root_dir=ROOT_DIR):
    '''
    performs invariant scattering transform for a given file

    inputs:
    -------
    - file_name: str type file name
    - avg_len: window length of scaling function in scat transform
    - log_transform: boolean whether to apply logarithm on scat transform results
    - n_filter_octave: number of filters when halving the frequency. 1 indicates dyadic filter bank
    - filter_format: 'fourier_multires', 'fourier_truncated', 'fourier'
    - save_file: boolean whether to save the results into a file or return as a dictionary.
        If True, file name is returned. Otherwise, data is returned
    - root_dir: str type directory name

    outputs:
    --------
    - (samples_out): dict whose key-value pairs are the following:
        'data': ndarray shaped (n_data, n_channels, n_nodes, data_len) or
            list of ndarrays where each array is shaped (n_channels, n_nodes, data_len),
        plus whatever key-value pairs were in the given file,
        plus the hyperparameters used for performing the scat transform.
        Returned if save_file is False
    - (file_name): string type file name of the scat transformed data. returned if save_file is True
    '''

    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    samples = torch.load(file_path)

    nums = cu.match_filename(r'{}_scat_([0-9]+).pt'.format(file_name), root_dir=root_dir)
    nums = [int(num) for num in nums]
    idx = max(nums) + 1 if nums else 0
    file_name_scat = '{}_scat_{}.pt'.format(file_name, idx)
    file_path_scat = os.path.join(root_dir, file_name_scat)

    data = samples['data']
    if isinstance(data, np.ndarray):
        assert(len(data.shape) == 3),\
            "Invalid data shape given. If type is ndarray, should be rank 3"
        n_data, n_channels, data_len = data.shape
        # perform scattering transform
        scat = ScatNet(data_len, avg_len, n_filter_octave=n_filter_octave, filter_format=filter_format)
        S = scat.transform(data)
        if log_transform: S = log_scat(S)
        data_scat = stack_scat(S) # shaped (n_data, n_channels, n_scat_nodes, data_len)
    elif isinstance(data, list):
        assert(len(data[0].shape) == 2),\
            "Invalid data shape given. If type is list, elements should be rank 2 ndarrays"
        data_scat = []
        for track in data:
            track_len = track.shape[1]
            scat = ScatNet(track_len, avg_len, n_filter_octave=n_filter_octave, filter_format=filter_format)
            S = scat.transform(track[np.newaxis, :, :])
            S = stack_scat(S)[0] # shaped (2, n_nodes, track_scat_len) where n_nodes is fixed but track_scat_len varies
            data_scat.append(S)
    else:
        raise ValueError("Invalid data given. Type should be either ndarray or list")

    samples_out = copy.deepcopy(samples)
    del samples_out['data']
    samples_out.update({'data':data_scat, 'avg_len':avg_len, 'log_transform':log_transform,
        'n_filter_octave':n_filter_octave, 'filter_format':filter_format, 'file_name':file_name})

    if not save_file:
        return samples_out
    torch.save(samples_out, file_path_scat)
    return file_name_scat
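A hedged end-to-end sketch tying the simulators and scat_transform together, mirroring the usage seen in the later fragments; the module aliases siu and scu and all parameter values are assumptions, not part of this listing.

# hypothetical usage sketch, assuming the simulation and scattering utilities are importable as siu and scu
file_name = siu.sim_two_beads(data_len=2048, gammas=1., k_ratios=[1., 2., 4.],
                              diff_coef_ratios=[1., 4.], dt=0.01, n_data=50,
                              save_file=True)
file_name_scat = scu.scat_transform(file_name, avg_len=256, log_transform=False,
                                    n_filter_octave=[1, 1], save_file=True)
print(file_name_scat)  # e.g. 'tbd_<idx>_scat_0.pt'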
Example no. 15
    elif isinstance(samples['data'], list):
        assert(len(samples['data'][0].shape) == 2),\
            "Invalid data shape given. If type is list, elements should be rank 2 ndarrays"
        data = []
        for track in samples['data']:
            data.append(np.diff(track, n=1, axis=-1))
        samples_out['data'] = data
    else:
        raise ValueError(
            "Invalid data given. Type should be either ndarray or list")
    file_name_no_ext, _ = os.path.splitext(file_name)
    file_name_out = file_name_no_ext + '_disp.pt'
    file_path_out = os.path.join(root_dir, file_name_out)
    torch.save(samples_out, file_path_out)

    file_names_scat = cu.match_filename(
        r'({}_scat_[0-9]+.pt)'.format(file_name_no_ext), root_dir=root_dir)
    for file_name_scat in file_names_scat:
        file_path_scat = os.path.join(root_dir, file_name_scat)
        samples_scat = torch.load(file_path_scat)
        avg_len = samples_scat['avg_len']
        n_filter_octave = samples_scat['n_filter_octave']

        # perform scat transform and append to list
        print(
            "scat transforming {} with parameters avg_len:{}, n_filter_octave:{}"
            .format(file_name_out, avg_len, n_filter_octave))
        file_name_scat_out = scu.scat_transform(
            file_name_out,
            avg_len,
            log_transform=False,
            n_filter_octave=n_filter_octave,