Example #1
def main(argv):

    # copy data
    dst = os.path.join(FLAGS.tmp_dir, 'Off_parasol.mat')

    if not gfile.Exists(dst):
        print('Started Copy')
        src = os.path.join(FLAGS.src_dir, 'Off_parasol.mat')
        if not gfile.IsDirectory(FLAGS.tmp_dir):
            gfile.MkDir(FLAGS.tmp_dir)

        gfile.Copy(src, dst)
        print('File copied to destination')

    else:
        print('File exists')

    # load stimulus
    file = h5py.File(dst, 'r')

    # Load Masked movie
    data = file.get('maskedMovdd')
    stimulus = np.array(data)
    # load cell response
    cells = file.get('cells')

    ttf_log = file.get('ttf_log')
    ttf_avg = file.get('ttf_avg')

    # Load spike Response of cells
    data = file.get('Y')
    responses = np.array(data)

    # get mask
    total_mask_log = file.get('totalMaskAccept_log')

    print('Got data')

    # get cell and mask
    nsub_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    if FLAGS.taskid < 107 * len(nsub_list):
        cell_idx = [np.int(np.floor(FLAGS.taskid / len(nsub_list)))]
        cellid = cells[np.int(np.floor(FLAGS.taskid / len(nsub_list)))]
        Nsub = nsub_list[FLAGS.taskid % len(nsub_list)]
        partition_list = np.arange(10)

    elif FLAGS.taskid < 107 * len(nsub_list) + 37 * 10:
        cell_idx = [39, 42, 44, 45]  #[np.int(FLAGS.taskid)]
        cellid = cells[cell_idx]
        cellid = np.squeeze(cellid)
        task_id_effective = FLAGS.taskid - 107 * len(nsub_list)
        partition_list = [task_id_effective % 10]
        nsub_list_pop = np.arange(4, 41)
        Nsub = nsub_list_pop[np.int(np.floor(task_id_effective / 10))]

    elif FLAGS.taskid < 107 * len(nsub_list) + 37 * 10 + 19 * 10:
        cell_idx = [39, 42]  #[np.int(FLAGS.taskid)]
        cellid = cells[cell_idx]
        cellid = np.squeeze(cellid)
        task_id_effective = FLAGS.taskid - 107 * len(nsub_list) - 37 * 10
        partition_list = [task_id_effective % 10]
        nsub_list_pop = np.arange(2, 21)
        Nsub = nsub_list_pop[np.int(np.floor(task_id_effective / 10))]

    elif FLAGS.taskid < 107 * len(nsub_list) + 37 * 10 + 19 * 10 + 19 * 10:
        cell_idx = [44, 45]  #[np.int(FLAGS.taskid)]
        cellid = cells[cell_idx]
        cellid = np.squeeze(cellid)
        task_id_effective = FLAGS.taskid - 107 * len(
            nsub_list) - 37 * 10 - 19 * 10
        partition_list = [task_id_effective % 10]
        nsub_list_pop = np.arange(2, 21)
        Nsub = nsub_list_pop[np.int(np.floor(task_id_effective / 10))]

    print(cell_idx)
    print(Nsub)

    mask = (total_mask_log[cell_idx, :].sum(0) != 0)
    mask_matrix = np.reshape(mask != 0, [40, 80])

    # make mask bigger - add one row one left/right
    r, c = np.where(mask_matrix)
    mask_matrix[r.min() - 1:r.max() + 1, c.min() - 1:c.max() + 1] = True
    mask = np.ndarray.flatten(mask_matrix)

    stim_use = stimulus[:, mask]
    resp_use = responses[:, cell_idx]

    print('Prepared data')

    # get last 10% as test data
    np.random.seed(23)

    frac_test = 0.1
    tms_test = np.arange(np.floor(stim_use.shape[0] * (1 - frac_test)),
                         1 * np.floor(stim_use.shape[0])).astype(np.int)

    # Random partitions
    n_partitions = 10
    tms_train_validate = np.arange(
        0, np.floor(stim_use.shape[0] * (1 - frac_test))).astype(np.int)

    frac_validate = 0.1

    partitions = []
    for ipartition in range(n_partitions):
        perm = np.random.permutation(tms_train_validate)
        n_train = int(np.floor((1 - frac_validate) * perm.shape[0]))
        tms_train = perm[:n_train]
        tms_validate = perm[n_train:]

        partitions += [{
            'tms_train': tms_train,
            'tms_validate': tms_validate,
            'tms_test': tms_test
        }]

    print('Made partitions')

    # Do fitting
    # tms_train = np.arange(0, np.floor(stim_use.shape[0] * 0.8)).astype(np.int)
    # tms_test = np.arange(np.floor(stim_use.shape[0] * 0.8),
    #                       1 * np.floor(stim_use.shape[0] * 0.9)).astype(np.int)

    for ipartition in partition_list:
        print(cell_idx, cellid, Nsub)

        ss = '_'.join([str(ic) for ic in cellid])

        save_filename = os.path.join(
            FLAGS.save_path,
            'Cell_%s_nsub_%d_part_%d_jnt.pkl' % (ss, Nsub, ipartition))
        if not gfile.Exists(save_filename):
            print('Fitting started')
            op = jnt_model.Flat_clustering_jnt(
                stim_use,
                resp_use,
                Nsub,
                partitions[ipartition]['tms_train'],
                partitions[ipartition]['tms_validate'],
                steps_max=10000,
                eps=1e-9)

            # op = jnt_model.Flat_clustering_jnt(stim_use, resp_use, Nsub,
            #                                   tms_train,
            #                                   tms_test,
            #                                   steps_max=10000, eps=1e-9)

            K, b, alpha, lam_log, lam_log_test, fitting_phase, fit_params = op

            print('Fitting done')
            save_dict = {
                'K': K,
                'b': b,
                'lam_log': lam_log,
                'lam_log_test': lam_log_test,
                'fitting_phase': fitting_phase,
                'fit_params': fit_params
            }
            pickle.dump(save_dict, gfile.Open(save_filename, 'w'))
            print('Saved results')
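The copy-to-local-tmp pattern at the top of this example is reusable; a minimal sketch of it as a standalone helper (the name ensure_local_copy and its arguments are illustrative, not from the original source):

def ensure_local_copy(src_dir, tmp_dir, fname):
    """Copy fname from src_dir into tmp_dir unless it is already there."""
    dst = os.path.join(tmp_dir, fname)
    if not gfile.Exists(dst):
        if not gfile.IsDirectory(tmp_dir):
            gfile.MkDir(tmp_dir)
        gfile.Copy(os.path.join(src_dir, fname), dst)
    return dst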
Example #2
File: train.py Project: zhaocz765/shifu
def load_data(context):
    train_data = []
    train_target = []
    valid_data = []
    valid_target = []

    training_data_sample_weight = []
    valid_data_sample_weight = []

    count = 0
    train_pos_cnt = 0
    train_neg_cnt = 0
    valid_pos_cnt = 0
    valid_neg_cnt = 0

    feature_column_nums = context["feature_column_nums"]
    sample_weight_column_num = context["sample_weight_column_num"]
    allFileNames = gfile.ListDirectory(root)
    normFileNames = filter(lambda x: not x.startswith(".") and not x.startswith("_"), allFileNames)
    tprint("Total input file count is " + str(len(normFileNames)) + ".")

    file_count = 1
    line_count = 0

    for normFileName in normFileNames:
        print("Now loading " + normFileName + " Progress: " + str(file_count) + "/" + str(len(normFileNames)) + ".")
        sys.stdout.flush()
        file_count += 1

        with gfile.Open(os.path.join(root, normFileName), 'rb') as f:
            gf = gzip.GzipFile(fileobj=StringIO(f.read()))
            while True:
                line = gf.readline()
                if len(line) == 0:
                    break
                
                line_count += 1
                if line_count % 10000 == 0: 
                    tprint("Total loading lines: " + str(line_count))
                
                columns = line.split(delimiter)

                if feature_column_nums is None:
                    feature_column_nums = range(0, len(columns))
                    feature_column_nums.remove(target_index)

                if random.random() >= valid_data_percentage:
                    # Append training data
                    train_target.append([float(columns[target_index])])
                    if columns[target_index] == "1":
                        train_pos_cnt += 1
                    else:
                        train_neg_cnt += 1
                    single_train_data = []
                    for feature_column_num in feature_column_nums:
                        single_train_data.append(float(columns[feature_column_num].strip('\n')))
                    train_data.append(single_train_data)
                    
                    if sample_weight_column_num >= 0 and sample_weight_column_num < len(columns):
                        weight = float(columns[sample_weight_column_num].strip('\n'))
                        if weight < 0.0:
                            print("Warning: weight is below 0. example:" + line)
                            weight = 1.0
                        training_data_sample_weight.append([weight])
                    else:
                        training_data_sample_weight.append([1.0])
                else:
                    # Append validation data
                    valid_target.append([float(columns[target_index])])
                    if columns[target_index] == "1":
                        valid_pos_cnt += 1
                    else:
                        valid_neg_cnt += 1
                    single_valid_data = []
                    for feature_column_num in feature_column_nums:
                        single_valid_data.append(float(columns[feature_column_num].strip('\n')))
                    valid_data.append(single_valid_data)
                    
                    if sample_weight_column_num >= 0 and sample_weight_column_num < len(columns):
                        weight = float(columns[sample_weight_column_num].strip('\n'))
                        if weight < 0.0:
                            print("Warning: weight is below 0. example:" + line)
                            weight = 1.0
                        valid_data_sample_weight.append([weight])
                    else:
                        valid_data_sample_weight.append([1.0])

    tprint("Total data count: " + str(line_count) + ".")
    tprint("Train pos count: " + str(train_pos_cnt) + ", neg count: " + str(train_neg_cnt) + ".")
    tprint("Valid pos count: " + str(valid_pos_cnt) + ", neg count: " + str(valid_neg_cnt) + ".")

    context['feature_count'] = len(feature_column_nums)

    return train_data, train_target, valid_data, valid_target, training_data_sample_weight, valid_data_sample_weight
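tprint is not defined in this snippet; a minimal sketch of a timestamped print helper consistent with how it is used above (an assumption about the project's utility function, not its actual implementation):

import datetime
import sys

def tprint(msg):
    # Hypothetical helper: prefix the message with a timestamp and flush stdout.
    print(str(datetime.datetime.now()) + " " + str(msg))
    sys.stdout.flush()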
Example #3
  def testProfileBasic(self):
    ops.reset_default_graph()
    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
    opts['account_type_regexes'] = ['.*']
    opts['select'] = ['params', 'float_ops', 'micros', 'bytes',
                      'device', 'op_types', 'occurrence']
    outfile = os.path.join(test.get_temp_dir(), 'dump')
    opts['output'] = 'file:outfile=' + outfile

    # Test the output without run_meta.
    sess = session.Session()
    r = lib.BuildFullModel()
    sess.run(variables.global_variables_initializer())

    profiler = model_analyzer.Profiler(sess.graph)
    profiler.profile_name_scope(opts)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='scope', tfprof_options=opts)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertEqual(pma_str, profiler_str)

    # Test the output with run_meta.
    run_meta = config_pb2.RunMetadata()
    _ = sess.run(r,
                 options=config_pb2.RunOptions(
                     trace_level=config_pb2.RunOptions.FULL_TRACE),
                 run_metadata=run_meta)

    profiler.add_step(1, run_meta)
    profiler.profile_graph(opts)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='graph', run_meta=run_meta, tfprof_options=opts)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertEqual(pma_str, profiler_str)

    profiler.profile_python_codes(opts)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='code', run_meta=run_meta, tfprof_options=opts)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertEqual(pma_str, profiler_str)

    profiler.profile_operations(opts)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='op', run_meta=run_meta, tfprof_options=opts)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertEqual(pma_str, profiler_str)

    # Test the output difference between multi-step profile and 1-step profile.
    _ = sess.run(r,
                 options=config_pb2.RunOptions(
                     trace_level=config_pb2.RunOptions.FULL_TRACE),
                 run_metadata=run_meta)

    profiler.add_step(2, run_meta)
    profiler.profile_name_scope(opts)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='scope', run_meta=run_meta, tfprof_options=opts)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertNotEqual(pma_str, profiler_str)

    opts2 = opts.copy()
    opts2['select'] = ['params', 'float_ops']
    profiler.profile_name_scope(opts2)
    with gfile.Open(outfile, 'r') as f:
      profiler_str = f.read()

    model_analyzer.print_model_analysis(
        sess.graph, tfprof_cmd='scope', run_meta=run_meta, tfprof_options=opts2)
    with gfile.Open(outfile, 'r') as f:
      pma_str = f.read()
    self.assertEqual(pma_str, profiler_str)
Example #4
import numpy as np
import tensorflow as tf

from tensorflow.contrib import lookup
from tensorflow.python.platform import gfile

# figure out document lengths
length = []
import nltk as nl
for f in df['Review Text']:
    length.append(len(nl.word_tokenize(f)))
MAX_DOCUMENT_LENGTH = max(length)
PADWORD = 'ZYXW'

vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    MAX_DOCUMENT_LENGTH)
vocab_processor.fit(lines)
with gfile.Open('vocab.tsv', 'wb') as f:
    f.write("{}\n".format(PADWORD))
    for word, index in vocab_processor.vocabulary_._mapping.items():
        f.write("{}\n".format(word))

x = np.array(list(vocab_processor.fit_transform(lines)))

vocabulary = vocab_processor.vocabulary_
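# The `lookup` import at the top suggests vocab.tsv is meant to feed a
# word-to-id lookup table. A hedged sketch of that step, not part of the
# original snippet (tf.contrib.lookup, TF 1.x; exact arguments may differ):
table = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                     num_oov_buckets=1)
word_ids = table.lookup(tf.constant([PADWORD, 'dress', 'love']))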


def load_embedding_vectors_word2vec(vocabulary, filename, binary):
    # load embedding_vectors from the word2vec
    encoding = 'utf-8'
    with open(filename, "rb") as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
Example #5
def _write_with_backup(filename, content):
    if gfile.Exists(filename):
        gfile.Rename(filename, filename + '.old', overwrite=True)
    with gfile.Open(filename, 'w') as f:
        f.write(content)
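A small usage sketch for the helper above (path and contents are placeholders): writing the same file twice keeps the previous version under an '.old' suffix.

_write_with_backup('/tmp/model.cfg', 'learning_rate=0.1\n')
_write_with_backup('/tmp/model.cfg', 'learning_rate=0.01\n')
with gfile.Open('/tmp/model.cfg.old') as f:
    print(f.read())  # prints the first version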
Example #6
    @classmethod
    def restore(cls, filename):
        with gfile.Open(filename, 'rb') as f:
            return pickle.loads(f.read())
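A matching save method is not shown in this example; a minimal sketch of a plausible counterpart that mirrors the pickle and gfile calls used by restore (an assumption, not from the source project):

    def save(self, filename):
        # Hypothetical counterpart to restore(): serialize this object with pickle.
        with gfile.Open(filename, 'wb') as f:
            f.write(pickle.dumps(self))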
Example #7
def main(argv):

    # copy WN data
    dst = os.path.join(FLAGS.tmp_dir, 'Off_parasol.mat')

    if not gfile.Exists(dst):
        print('Started Copy')
        src = os.path.join(FLAGS.src_dir, 'Off_parasol.mat')
        if not gfile.IsDirectory(FLAGS.tmp_dir):
            gfile.MkDir(FLAGS.tmp_dir)

        gfile.Copy(src, dst)
        print('File copied to destination')

    else:
        print('File exists')

    # load stimulus
    file = h5py.File(dst, 'r')

    # Load Masked movie
    data = file.get('maskedMovdd')
    stimulus = np.array(data)

    # load cell response
    cells = file.get('cells')
    cells = np.array(cells)
    cells = np.squeeze(cells)

    ttf_log = file.get('ttf_log')
    ttf_avg = file.get('ttf_avg')

    # Load spike Response of cells
    data = file.get('Y')
    responses = np.array(data)

    # get mask
    total_mask_log = np.array(file.get('totalMaskAccept_log'))

    print('Got WN data')

    # Get NSEM data
    dat_nsem_mov = sio.loadmat(
        gfile.Open(
            '/home/bhaishahster/nsem_data/'
            'pc2015_10_29_2/NSinterval_30_025.mat', 'r'))
    stimulus_nsem = dat_nsem_mov['mov']

    stimulus_nsem = np.transpose(stimulus_nsem, [2, 1, 0])
    stimulus_nsem = np.reshape(stimulus_nsem, [stimulus_nsem.shape[0], -1])

    dat_nsem_resp = sio.loadmat(
        gfile.Open(
            '/home/bhaishahster/nsem_data/'
            'pc2015_10_29_2/OFF_parasol_trial_resp'
            '_data_NSEM_data039.mat', 'r'))
    responses_nsem = dat_nsem_resp['resp_cell_log']
    print('Got NSEM data')

    # read line corresponding to task
    with gfile.Open(FLAGS.task_params_file, 'r') as f:
        for itask in range(FLAGS.taskid + 1):
            line = f.readline()
    line = line[:-1]  # Remove \n from end.
    print(line)

    # get task parameters by parsing the lines
    line_split = line.split(';')
    cell_idx = line_split[0]
    cell_idx = cell_idx[1:-1].split(',')
    cell_idx = [int(i) for i in cell_idx]

    Nsub = int(line_split[1])
    projection_type = line_split[2]
    lam_proj = float(line_split[3])
    ipartition = int(line_split[4])

    cell_idx_mask = cell_idx

    ##

    print(cell_idx)
    print(Nsub)
    print(cell_idx_mask)

    mask = (total_mask_log[cell_idx_mask, :].sum(0) != 0)
    mask_matrix = np.reshape(mask != 0, [40, 80])

    # make mask bigger - add one row one left/right
    r, c = np.where(mask_matrix)
    mask_matrix[r.min() - 1:r.max() + 1, c.min() - 1:c.max() + 1] = True
    neighbor_mat = su_model.get_neighbormat(mask_matrix, nbd=1)
    mask = np.ndarray.flatten(mask_matrix)

    ## WN preprocess
    stim_use_wn = stimulus[:, mask]
    resp_use_wn = responses[:, cell_idx]

    # get last 10% as test data
    np.random.seed(23)

    frac_test = 0.1
    tms_test = np.arange(np.floor(stim_use_wn.shape[0] * (1 - frac_test)),
                         1 * np.floor(stim_use_wn.shape[0])).astype(np.int)

    # Random partitions
    n_partitions = 10
    tms_train_validate = np.arange(
        0, np.floor(stim_use_wn.shape[0] * (1 - frac_test))).astype(np.int)

    frac_validate = 0.1

    partitions_wn = []
    for _ in range(n_partitions):
        perm = np.random.permutation(tms_train_validate)
        n_train = int(np.floor((1 - frac_validate) * perm.shape[0]))
        tms_train = perm[:n_train]
        tms_validate = perm[n_train:]

        partitions_wn += [{
            'tms_train': tms_train,
            'tms_validate': tms_validate,
            'tms_test': tms_test
        }]

    print('Made partitions')
    print('WN data preprocessed')

    ## NSEM preprocess
    stim_use_nsem = stimulus_nsem[:, mask]
    ttf_use = np.array(ttf_log[cell_idx, :]).astype(np.float32).squeeze()
    stim_use_nsem = filterMov_time(stim_use_nsem, ttf_use)
    resp_use_nsem = np.array(responses_nsem[cell_idx][0,
                                                      0]).astype(np.float32).T

    # Remove first 30 frames due to convolution artifact.
    stim_use_nsem = stim_use_nsem[30:, :]
    resp_use_nsem = resp_use_nsem[30:, :]

    n_trials = resp_use_nsem.shape[1]
    t_nsem = resp_use_nsem.shape[0]
    tms_train_1tr_nsem = np.arange(np.floor(t_nsem / 2))
    tms_test_1tr_nsem = np.arange(np.ceil(t_nsem / 2), t_nsem)

    # repeat in time dimension, divide into training and testing.
    stim_use_nsem = np.tile(stim_use_nsem.T, n_trials).T
    resp_use_nsem = np.ndarray.flatten(resp_use_nsem.T)
    resp_use_nsem = np.expand_dims(resp_use_nsem, 1)

    tms_train_nsem = np.array([])
    tms_test_nsem = np.array([])
    for itrial in range(n_trials):
        tms_train_nsem = np.append(tms_train_nsem,
                                   tms_train_1tr_nsem + itrial * t_nsem)
        tms_test_nsem = np.append(tms_test_nsem,
                                  tms_test_1tr_nsem + itrial * t_nsem)
    tms_train_nsem = tms_train_nsem.astype(np.int)
    tms_test_nsem = tms_test_nsem.astype(np.int)

    print('NSEM data preprocessed')

    ss = '_'.join([str(cells[ic]) for ic in cell_idx])

    save_filename = os.path.join(
        FLAGS.save_path, 'Cell_%s_nsub_%d_%s_%.3f_part_%d_jnt.pkl' %
        (ss, Nsub, projection_type, lam_proj, ipartition))

    save_filename_partial = os.path.join(
        FLAGS.save_path_partial, 'Cell_%s_nsub_%d_%s_%.3f_part_%d_jnt.pkl' %
        (ss, Nsub, projection_type, lam_proj, ipartition))

    ## Do fitting
    # Fit SU on WN
    print('Fitting started on WN')
    op = su_model.Flat_clustering_jnt(
        stim_use_wn,
        resp_use_wn,
        Nsub,
        partitions_wn[ipartition]['tms_train'],
        partitions_wn[ipartition]['tms_validate'],
        steps_max=10000,
        eps=1e-9,
        projection_type=projection_type,
        neighbor_mat=neighbor_mat,
        lam_proj=lam_proj,
        eps_proj=0.01,
        save_filename_partial=save_filename_partial,
        fitting_phases=[1])

    _, _, alpha, lam_log_wn, lam_log_test_wn, fitting_phase, fit_params_wn = op
    print('Fitting done on WN')

    # Fit on NSEM
    op = su_model.fit_scales(stim_use_nsem[tms_train_nsem, :],
                             resp_use_nsem[tms_train_nsem, :],
                             stim_use_nsem[tms_test_nsem, :],
                             resp_use_nsem[tms_test_nsem, :],
                             Ns=Nsub,
                             K=fit_params_wn[0][0],
                             b=fit_params_wn[0][1],
                             params=fit_params_wn[0][2],
                             lr=0.1,
                             eps=1e-9)

    K_nsem, b_nsem, nl_params_nsem, lam_log_nsem, lam_log_test_nsem = op

    # Collect results and save
    fit_params = fit_params_wn + [[K_nsem, b_nsem, nl_params_nsem]]
    lam_log = [lam_log_wn, np.array(lam_log_nsem)]
    lam_log_test = [lam_log_test_wn, np.array(lam_log_test_nsem)]

    save_dict = {
        'lam_log': lam_log,
        'lam_log_test': lam_log_test,
        'fit_params': fit_params
    }
    pickle.dump(save_dict, gfile.Open(save_filename, 'w'))
    print('Saved results')
Example #8
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
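A usage sketch for read_data_sets (the directory is a placeholder; DataSet, extract_images and the other helpers come from the surrounding MNIST input module and are assumed to be available):

mnist = read_data_sets('/tmp/mnist_data', one_hot=True, validation_size=5000)
batch_images, batch_labels = mnist.train.next_batch(100)
print(mnist.train.num_examples, mnist.validation.num_examples,
      mnist.test.num_examples)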
Example #9
def load_data(data_file):
    data_file_list = data_file.split(",")
    global feature_column_nums

    logging.info("input data %s" % data_file_list)
    logging.info("SELECTED_COLUMN_NUMS" + str(feature_column_nums))

    train_data = []
    train_target = []
    valid_data = []
    valid_target = []

    training_data_sample_weight = []
    valid_data_sample_weight = []

    train_pos_cnt = 0
    train_neg_cnt = 0
    valid_pos_cnt = 0
    valid_neg_cnt = 0

    file_count = 1
    line_count = 0

    for currentFile in data_file_list:
        logging.info("Now loading " + currentFile + " Progress: " +
                     str(file_count) + "/" + str(len(data_file_list)) + ".")
        file_count += 1

        with gfile.Open(currentFile, 'rb') as f:
            gf = gzip.GzipFile(fileobj=StringIO(f.read()))
            while True:
                line = gf.readline()
                if len(line) == 0:
                    break

                line_count += 1
                if line_count % 10000 == 0:
                    logging.info("Total loading lines: " + str(line_count))

                columns = line.split(DELIMITER)

                if feature_column_nums is None:
                    feature_column_nums = range(0, len(columns))

                    feature_column_nums.remove(target_column_num)
                    if sample_weight_column_num >= 0:
                        feature_column_nums.remove(sample_weight_column_num)

                if random.random() >= VALID_TRAINING_DATA_RATIO:
                    # Append training data
                    train_target.append([float(columns[target_column_num])])
                    if columns[target_column_num] == "1":
                        train_pos_cnt += 1
                    else:
                        train_neg_cnt += 1
                    single_train_data = []
                    for feature_column_num in feature_column_nums:
                        try:
                            single_train_data.append(
                                float(columns[feature_column_num].strip('\n')))
                        except:
                            logging.info(
                                "Could not convert " +
                                str(columns[feature_column_num].strip('\n') +
                                    " to float"))
                            logging.info("feature_column_num: " +
                                         str(feature_column_num))
                    train_data.append(single_train_data)

                    if sample_weight_column_num >= 0 and sample_weight_column_num < len(
                            columns):
                        weight = float(
                            columns[sample_weight_column_num].strip('\n'))
                        if weight < 0.0:
                            logging.info(
                                "Warning: weight is below 0. example:" + line)
                            weight = 1.0
                        training_data_sample_weight.append([weight])
                    else:
                        training_data_sample_weight.append([1.0])
                else:
                    # Append validation data
                    valid_target.append([float(columns[target_column_num])])
                    if columns[target_column_num] == "1":
                        valid_pos_cnt += 1
                    else:
                        valid_neg_cnt += 1
                    single_valid_data = []
                    for feature_column_num in feature_column_nums:
                        try:
                            single_valid_data.append(
                                float(columns[feature_column_num].strip('\n')))
                        except:
                            logging.info(
                                "Could not convert " +
                                str(columns[feature_column_num].strip('\n') +
                                    " to float"))
                            logging.info("feature_column_num: " +
                                         str(feature_column_num))

                    valid_data.append(single_valid_data)

                    if sample_weight_column_num >= 0 and sample_weight_column_num < len(
                            columns):
                        weight = float(
                            columns[sample_weight_column_num].strip('\n'))
                        if weight < 0.0:
                            logging.info(
                                "Warning: weight is below 0. example:" + line)
                            weight = 1.0
                        valid_data_sample_weight.append([weight])
                    else:
                        valid_data_sample_weight.append([1.0])

    logging.info("Total data count: " + str(line_count) + ".")
    logging.info("Train pos count: " + str(train_pos_cnt) + ", neg count: " +
                 str(train_neg_cnt) + ".")
    logging.info("Valid pos count: " + str(valid_pos_cnt) + ", neg count: " +
                 str(valid_neg_cnt) + ".")

    return {
        "train_data": train_data,
        "train_target": train_target,
        "valid_data": valid_data,
        "valid_target": valid_target,
        "train_data_sample_weight": training_data_sample_weight,
        "valid_data_sample_weight": valid_data_sample_weight,
        "feature_count": len(feature_column_nums)
    }
Example #10
def txt_line_iterator(path):
    """Iterate through lines of file."""
    with gfile.Open(path) as f:
        for line in f:
            yield line.strip()
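A small usage sketch (the path is a placeholder): because the generator yields stripped lines lazily, large files can be scanned without loading them into memory.

n_nonempty = sum(1 for line in txt_line_iterator('/tmp/corpus.txt') if line)
print('non-empty lines:', n_nonempty)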
Example #11
def _save_vocab_file(vocab_file, subtoken_list):
    """Save subtokens to file."""
    with gfile.Open(vocab_file, mode="w") as f:
        for subtoken in subtoken_list:
            f.write("'%s'\n" % subtoken)
Example #12
    def testSelectEverthingDetail(self):
        ops.reset_default_graph()
        dev = '/gpu:0' if test.is_gpu_available() else '/cpu:0'
        outfile = os.path.join(test.get_temp_dir(), 'dump')
        opts = (builder(
            builder.trainable_variables_parameter()).with_file_output(
                outfile).with_accounted_types(['.*']).select([
                    'micros', 'bytes', 'params', 'float_ops', 'occurrence',
                    'device', 'op_types', 'input_shapes'
                ]).build())

        config = config_pb2.ConfigProto()
        with session.Session(config=config) as sess, ops.device(dev):
            x = lib.BuildSmallModel()

            sess.run(variables.global_variables_initializer())
            run_meta = config_pb2.RunMetadata()
            _ = sess.run(x,
                         options=config_pb2.RunOptions(
                             trace_level=config_pb2.RunOptions.FULL_TRACE),
                         run_metadata=run_meta)

            model_analyzer.profile(sess.graph, run_meta, options=opts)

            with gfile.Open(outfile, 'r') as f:
                # pylint: disable=line-too-long
                outputs = f.read().split('\n')

                self.assertEqual(
                    outputs[0],
                    'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes'
                )
                for o in outputs[1:]:
                    if o.find('Conv2D ') > 0:
                        metrics = o[o.find('(') + 1:o.find(')')].split(',')
                        # Make sure time is profiled.
                        gap = 1 if test.is_gpu_available() else 2
                        for i in range(3, 6, gap):
                            mat = re.search('(.*)[um]s/(.*)[um]s', metrics[i])
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                        # Make sure device is profiled.
                        if test.is_gpu_available():
                            self.assertTrue(metrics[6].find('gpu') > 0)
                            self.assertFalse(metrics[6].find('cpu') > 0)
                        else:
                            self.assertFalse(metrics[6].find('gpu') > 0)
                            self.assertTrue(metrics[6].find('cpu') > 0)
                        # Make sure float_ops is profiled.
                        mat = re.search('(.*)k/(.*)k flops',
                                        metrics[1].strip())
                        self.assertGreater(float(mat.group(1)), 0.0)
                        self.assertGreater(float(mat.group(2)), 0.0)
                        # Make sure op_count is profiled.
                        self.assertEqual(metrics[8].strip(), '1/1|1/1')
                        # Make sure input_shapes is profiled.
                        self.assertEqual(metrics[9].strip(),
                                         '0:2x6x6x3|1:3x3x3x6')

                    if o.find('DW (3x3x3x6') > 0:
                        metrics = o[o.find('(') + 1:o.find(')')].split(',')
                        mat = re.search('(.*)/(.*) params', metrics[1].strip())
                        self.assertGreater(float(mat.group(1)), 0.0)
                        self.assertGreater(float(mat.group(2)), 0.0)
Example #13
    def testComplexCodeView(self):
        ops.reset_default_graph()
        outfile = os.path.join(test.get_temp_dir(), 'dump')
        opts = (builder(
            builder.trainable_variables_parameter()).with_file_output(
                outfile).with_accounted_types(['.*']).with_node_names(
                    show_name_regexes=['.*model_analyzer_testlib.py.*']).
                account_displayed_op_only(False).select(
                    ['params', 'float_ops']).build())

        with session.Session() as sess:
            x = lib.BuildFullModel()

            sess.run(variables.global_variables_initializer())
            run_meta = config_pb2.RunMetadata()
            _ = sess.run(x,
                         options=config_pb2.RunOptions(
                             trace_level=config_pb2.RunOptions.FULL_TRACE),
                         run_metadata=run_meta)

            tfprof_node = model_analyzer.profile(sess.graph,
                                                 run_meta,
                                                 cmd='code',
                                                 options=opts)

            # pylint: disable=line-too-long
            with gfile.Open(outfile, 'r') as f:
                lines = f.read().split('\n')
                result = '\n'.join([l[:min(len(l), 80)] for l in lines])
                self.assertEqual(
                    'node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n    model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n    model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n    model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n    model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient) (0\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient) (0\n  model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient) (0\n  model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n',
                    result)

            self.assertLess(0, tfprof_node.total_exec_micros)
            self.assertEqual(2844, tfprof_node.total_parameters)
            self.assertEqual(91040, tfprof_node.total_float_ops)
            self.assertEqual(8, len(tfprof_node.children))
            self.assertEqual('_TFProfRoot', tfprof_node.name)
            self.assertEqual(
                'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_...',
                tfprof_node.children[0].name)
            self.assertEqual(
                'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient)',
                tfprof_node.children[1].name)
            self.assertEqual(
                'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...',
                tfprof_node.children[2].name)
            self.assertEqual(
                'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient)',
                tfprof_node.children[3].name)
            self.assertEqual(
                'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...',
                tfprof_node.children[4].name)
            self.assertEqual(
                'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...',
                tfprof_node.children[5].name)
            self.assertEqual(
                'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient)',
                tfprof_node.children[6].name)
            self.assertEqual(
                'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...',
                tfprof_node.children[7].name)
Example #14
    def _restore(self, path):
        """Restores this estimator from given path.

        Note: will rebuild the graph and initialize all parameters,
        and will ignore provided model.

        Args:
            path: Path to checkpoints and other information.
        """
        # Currently Saver requires absolute path to work correctly.
        path = os.path.abspath(path)

        self._graph = ops.Graph()
        with self._graph.as_default():
            endpoints_filename = os.path.join(path, 'endpoints')
            if not os.path.exists(endpoints_filename):
                raise ValueError("Restore folder doesn't contain endpoints.")
            with gfile.Open(endpoints_filename) as foutputs:
                endpoints = foutputs.read().split('\n')
            graph_filename = os.path.join(path, 'graph.pbtxt')
            if not os.path.exists(graph_filename):
                raise ValueError(
                    "Restore folder doesn't contain graph definition.")
            with gfile.Open(graph_filename) as fgraph:
                graph_def = graph_pb2.GraphDef()
                text_format.Merge(fgraph.read(), graph_def)
                (self._inp, self._out, self._model_predictions,
                 self._model_loss) = importer.import_graph_def(
                     graph_def, name='', return_elements=endpoints)
            saver_filename = os.path.join(path, 'saver.pbtxt')
            if not os.path.exists(saver_filename):
                raise ValueError(
                    "Restore folder doesn't contain saver definition.")
            with gfile.Open(saver_filename) as fsaver:
                saver_def = train.SaverDef()
                text_format.Merge(fsaver.read(), saver_def)
                self._saver = train.Saver(saver_def=saver_def)

            # Restore trainer
            self._global_step = self._graph.get_tensor_by_name('global_step:0')
            self._train = self._graph.get_operation_by_name('train')

            # Restore summaries.
            self._summaries = self._graph.get_operation_by_name(
                'MergeSummary/MergeSummary')

            # Restore session.
            if not isinstance(self._config, RunConfig):
                self._config = RunConfig(verbose=self.verbose)
            self._session = session.Session(self._config.tf_master,
                                            config=self._config.tf_config)
            checkpoint_path = train.latest_checkpoint(path)
            if checkpoint_path is None:
                raise ValueError(
                    "Missing checkpoint files in %s. Please make sure you "
                    "have a checkpoint file that describes the latest "
                    "checkpoints and that the appropriate checkpoints are "
                    "there. If you have moved the folder, you need to "
                    "manually update the paths in the checkpoint file."
                    % path)
            self._saver.restore(self._session, checkpoint_path)
        # Set to be initialized.
        self._initialized = True
Example #15
def spike_triggered_clustering(X, Y, Ns, tms_tr, tms_tst, K=None, b=None,
                               steps_max=10000, eps=1e-6,
                               projection_type=None, neighbor_mat=None,
                               lam_proj=0, eps_proj=0.01,
                               save_filename_partial=None,
                               fitting_phases=[1, 2, 3]):
  """Subunit estimation using spike triggered clustering.
  
  The fitting proceeds in three phases - 
  First phase: Ignoring the output nonlinearity and soft-clustering of spike triggered stimuli to estimate K and b.
  Second phase: Fix K, optimize b and output nonlinearity by gradient descent.
  Third phase : Optimize K, b and the nonlinearity by gradient descent.
  
  Args: 
    X : Stimulus (dims: # samples x # pixels).
    Y: Responses (dims: # samples x # cells).
    Ns: Number of subunits (scalar).
    tms_tr: Sample indices used for training (dims: # training samples).
    tms_tst: Samples indices for validation (dims: # validation samples).
    K: Initial subunit filter (dims: # pixels x Ns).
    b: Initial weights for different subunits (dims: Ns x # cells).
    steps_max: Maximum number of steps for first phase.
    eps: Threshold change of loss, for convergence.
    projection_type: Regularization type ('lnl1' or 'l1').
    neighbor_mat: Adjacency matrix for pixels (dims: # pixels x # pixels).
    lam_proj: Regularization strength.
    eps_proj: Hyperparameter for 'lnl1' regularization.
    save_filename_partial: Checkpoint filename.
    fitting_phases: Phases of fitting to be applied (list with elements from {1, 2, 3}).
    
  Returns:
    K : Final subunit filters, (dims: # n_pix x #SU).
    b : Final subunit weights for different cells (dims: # SU x # cells).
    alpha: Softmax weights for each stimulus and different subunits (dims: # samples x Ns).
    lam_log: Training loss curve.
    lam_log_test: Validation loss curve.
    fitting_phase: The phase (1/2/3) corresponding to each iteration.
    fit_params: Outputs (K, b, nonlinearity parameters) after each fitting phase. 
  """
  # projection_op='lnl1'

  # X is Txmask
  X_tr = X[tms_tr, :]
  Y_tr = Y[tms_tr, :]
  X_test = X[tms_tst, :]
  Y_test = Y[tms_tst, :]

  Tlen = Y_tr.shape[0]
  times = np.arange(Tlen)
  N1 = X_tr.shape[1]
  n_cells = Y.shape[1]
  Sigma = numpy.dot(X_tr.transpose(),X_tr)/float(X_tr.shape[0])
  
  if projection_type == 'lnl1':
    if neighbor_mat is None:
      neighbor_mat = np.eye(N1)


  # load previously saved data
  if gfile.Exists(save_filename_partial):
    try:
      data = pickle.load(gfile.Open(save_filename_partial, 'r'))
      K = data['K']
      b = data['b']
      lam_log = data['lam_log']
      lam_log_test = data['lam_log_test']
      irepeat_start = data['irepeat']

      lam = lam_log[-1]
      lam_test = lam_log_test[-1]
      lam_min = data['lam_min']
      K_min = data['K_min']
      b_min = data['b_min']
      #print('Partially fit model parameters loaded')
    except:
      pass
      #print('Error in loading file')

      if K is None:
        K = 2*rng.rand(N1,Ns)-0.5
      K_min = np.copy(K)

      if b is None:
        b = 2*rng.rand(Ns, n_cells)-0.5
      b_min = np.copy(b)

      lam_log = np.zeros((0, n_cells))
      lam_log_test = np.zeros((0, n_cells))
      lam = np.inf
      lam_test = np.inf
      lam_min = np.inf
      irepeat_start = 0

  else:
    #print('No partially fit model')
    # initialize filters
    if K is None:
      K = 2*rng.rand(N1,Ns)-0.5
    K_min = np.copy(K)

    if b is None:
      b = 2*rng.rand(Ns, n_cells)-0.5
    b_min = np.copy(b)

    lam_log = np.zeros((0, n_cells))
    lam_log_test = np.zeros((0, n_cells))
    lam = np.inf
    lam_test = np.inf
    lam_min = np.inf
    irepeat_start = 0
    #print('Variables initialized')

  fitting_phase = np.array([])
  fit_params = []

  # Find subunits - no output NL
  if 1 in fitting_phases:
    for irepeat in range(irepeat_start, np.int(steps_max)):

      if irepeat % 100 == 99:
        save_dict = {'K': K, 'b': b, 'lam_log': lam_log,
                     'lam_log_test': lam_log_test, 'irepeat': irepeat,
                     'K_min': K_min, 'b_min': b_min, 'lam_min': lam_min}
        if save_filename_partial is not None:
          pickle.dump(save_dict, gfile.Open(save_filename_partial, 'w' ))

      # compute reweighted L1 weights
      if projection_type == 'lnl1':
        wts = 1 / (neighbor_mat.dot(np.abs(K)) + eps_proj)

      # test data
      _, lam_test = compute_fr_loss(K, b, X_test, Y_test)
      lam_log_test = np.append(lam_log_test, np.expand_dims(lam_test, 0), 0)

      # train data
      lam_prev = np.copy(lam)
      _, lam = compute_fr_loss(K, b, X_tr, Y_tr)
      lam_log = np.append(lam_log, np.expand_dims(lam, 0), 0)

      if np.sum(lam) <= np.sum(lam_min) :
        K_min = np.copy(K)
        b_min = np.copy(b)
        lam_min = np.copy(lam)
        lam_test_at_lam_min = np.copy(lam_test)

      #print(itime)
      K_new_list_nr = []
      K_new_list_dr = []
      mean_ass_f_list = []
      for icell in range(n_cells):
        tms = np.int64(np.arange(Tlen))
        t_sp = tms[Y_tr[:, icell] != 0]
        Y_tsp = Y_tr[t_sp, icell]

        f = np.exp(numpy.dot(X_tr, K) + b[:, icell])
        alpha = (f.transpose()/f.sum(1)).transpose()
        xx = (Y_tsp.transpose()*alpha[t_sp, :].T).T
        sta_f = X_tr[t_sp,:].transpose().dot(xx)
        mean_ass_f = xx.sum(0)

        K_new_list_nr += [numpy.linalg.solve(Sigma,sta_f)]
        K_new_list_dr += [mean_ass_f]
        mean_ass_f_list += [mean_ass_f]

      K_new_list_nr = np.array(K_new_list_nr)
      K_new_list_dr = np.array(K_new_list_dr)
      mean_ass_f_list = np.array(mean_ass_f_list).T # recompute ??

      K = np.mean(K_new_list_nr, 0) / np.mean(K_new_list_dr, 0)

      # Soft thresholding for K
      if projection_type == 'lnl1':
        K = np.maximum(K - (wts * lam_proj), 0) - np.maximum(- K - (wts * lam_proj), 0)

      if projection_type == 'l1':
        K = np.maximum(K - lam_proj, 0) - np.maximum(- K - lam_proj, 0)

      b = np.log((1/Tlen)*mean_ass_f_list)- np.expand_dims(np.diag(0.5*K.transpose().dot(Sigma.dot(K))), 1)

      #print(irepeat, lam, lam_prev)

      if np.sum(np.abs(lam_prev - lam)) < eps:
        print('Subunits fitted, Train loss: %.7f, '
              'Test loss: %.7f after %d iterations' % (lam, lam_test, irepeat))
        break

    fitting_phase = np.append(fitting_phase, np.ones(lam_log.shape[0]))
    nl_params = np.repeat(np.expand_dims(np.array([1.0, 0.0]), 1), n_cells, 1)
    fit_params += [[np.copy(K_min), np.copy(b_min), nl_params]]

  # fit NL + b + Kscale
  if 2 in fitting_phases:
    K, b, nl_params, loss_log, loss_log_test = fit_scales(X_tr, Y_tr,
                                                          X_test, Y_test,
                                                          Ns=Ns, K=K, b=b,
                                                          params=nl_params,
                                                          lr=0.001, eps=eps)

    if 'lam_log' in vars():
      lam_log = np.append(lam_log, np.array(loss_log), 0)
    else:
      lam_log = np.array(loss_log)

    if 'lam_log_test' in vars():
      lam_log_test = np.append(lam_log_test, np.array(loss_log_test), 0)
    else:
      lam_log_test = np.array(loss_log_test)

    fitting_phase = np.append(fitting_phase, 2 * np.ones(np.array(loss_log).shape[0]))
    fit_params += [[np.copy(K), np.copy(b), nl_params]]

  # Fit all params
  if 3 in fitting_phases:
    K, b, nl_params, loss_log, loss_log_test  = fit_all(X_tr, Y_tr, X_test, Y_test,
                                                     Ns=Ns, K=K, b=b, train_phase=3,
                                                     params=nl_params,
                                                     lr=0.001, eps=eps)

    if 'lam_log' in vars():
      lam_log = np.append(lam_log, np.array(loss_log), 0)
    else:
      lam_log = np.array(loss_log)

    if 'lam_log_test' in vars():
      lam_log_test = np.append(lam_log_test, np.array(loss_log_test), 0)
    else:
      lam_log_test = np.array(loss_log_test)

    fitting_phase = np.append(fitting_phase, 3 * np.ones(np.array(loss_log).shape[0]))
    fit_params += [[np.copy(K), np.copy(b), nl_params]]

  return K, b, alpha, lam_log, lam_log_test, fitting_phase, fit_params
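A hedged usage sketch for the function above with synthetic data; it assumes the module also provides the helpers compute_fr_loss, fit_scales, fit_all and rng that the function body refers to, and all names and sizes are illustrative:

import numpy as np

T, n_pix, n_cells = 5000, 50, 1
X = np.random.randn(T, n_pix)                   # stimulus: samples x pixels
Y = np.random.poisson(0.05, size=(T, n_cells))  # responses: samples x cells
tms = np.random.permutation(T)

op = spike_triggered_clustering(
    X, Y, Ns=3,
    tms_tr=tms[:4500], tms_tst=tms[4500:],
    steps_max=500, eps=1e-6,
    save_filename_partial='/tmp/partial_fit.pkl',  # hypothetical checkpoint path
    fitting_phases=[1])
K, b, alpha, lam_log, lam_log_test, fitting_phase, fit_params = op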
Example #16
def format_tensor(tensor,
                  tensor_name,
                  np_printoptions,
                  print_all=False,
                  tensor_slicing=None,
                  highlight_options=None,
                  include_numeric_summary=False,
                  write_path=None):
    """Generate formatted str to represent a tensor or its slices.

  Args:
    tensor: (numpy ndarray) The tensor value.
    tensor_name: (str) Name of the tensor, e.g., the tensor's debug watch key.
    np_printoptions: (dict) Numpy tensor formatting options.
    print_all: (bool) Whether the tensor is to be displayed in its entirety,
      instead of printing ellipses, even if its number of elements exceeds
      the default numpy display threshold.
      (Note: Even if this is set to true, the screen output can still be cut
       off by the UI frontend if it consists of more lines than the frontend
       can handle.)
    tensor_slicing: (str or None) Slicing of the tensor, e.g., "[:, 1]". If
      None, no slicing will be performed on the tensor.
    highlight_options: (tensor_format.HighlightOptions) options to highlight
      elements of the tensor. See the doc of tensor_format.format_tensor()
      for more details.
    include_numeric_summary: Whether a text summary of the numeric values (if
      applicable) will be included.
    write_path: A path to save the tensor value (after any slicing) to
      (optional). `numpy.save()` is used to save the value.

  Returns:
    An instance of `debugger_cli_common.RichTextLines` representing the
    (potentially sliced) tensor.
  """

    if tensor_slicing:
        # Validate the indexing.
        value = command_parser.evaluate_tensor_slice(tensor, tensor_slicing)
        sliced_name = tensor_name + tensor_slicing
    else:
        value = tensor
        sliced_name = tensor_name

    auxiliary_message = None
    if write_path:
        with gfile.Open(write_path, "wb") as output_file:
            np.save(output_file, value)
        line = debugger_cli_common.RichLine("Saved value to: ")
        line += debugger_cli_common.RichLine(write_path, font_attr="bold")
        line += " (%sB)" % bytes_to_readable_str(gfile.Stat(write_path).length)
        auxiliary_message = debugger_cli_common.rich_text_lines_from_rich_line_list(
            [line, debugger_cli_common.RichLine("")])

    if print_all:
        np_printoptions["threshold"] = value.size
    else:
        np_printoptions["threshold"] = DEFAULT_NDARRAY_DISPLAY_THRESHOLD

    return tensor_format.format_tensor(
        value,
        sliced_name,
        include_metadata=True,
        include_numeric_summary=include_numeric_summary,
        auxiliary_message=auxiliary_message,
        np_printoptions=np_printoptions,
        highlight_options=highlight_options)
Example #17
def main(argv):

    # parse task params
    # read line corresponding to task
    with gfile.Open(FLAGS.task_params_file, 'r') as f:
        for _ in range(FLAGS.taskid + 1):
            line = f.readline()

    print(line)

    # get task parameters by parsing the line.
    line_split = line.split(';')
    cells = gfile.ListDirectory(FLAGS.src_dir)

    cell_idx = line_split[0]
    cell_idx = cell_idx[1:-1].split(',')

    nsub = int(line_split[1])
    projection_type = line_split[2]
    lam_proj = float(line_split[3])
    ipartition = int(line_split[4][:-1])

    # Copy data for all the data
    cell_str_final = ''
    dst_log = []
    for icell in cell_idx:
        icell = int(icell)
        cell_string = cells[icell]
        cell_str_final += cell_string

        # copy data for the corresponding task
        dst = os.path.join(FLAGS.tmp_dir, cell_string)

        dst_log += [dst]
        if not gfile.Exists(dst):
            print('Started Copy')
            src = os.path.join(FLAGS.src_dir, cell_string)
            if not gfile.IsDirectory(FLAGS.tmp_dir):
                gfile.MkDir(FLAGS.tmp_dir)

            gfile.Copy(src, dst)
            print('File %s copied to destination' % cell_string)

        else:
            print('File %s exists' % cell_string)

    # Load data for different cells
    stim_log = []
    resp_log = []
    mask_matrix_log = []
    for dst in dst_log:
        print('Loading %s' % dst)
        data = h5py.File(dst)
        stimulus = np.array(data.get('stimulus'))
        stimulus = stimulus[:-1, :]  # drop the last frame so that it's
        # the same size as the binned spike train

        response = np.squeeze(np.array(data.get('response')))
        response = np.expand_dims(response, 1)
        mask_matrix = np.array(data.get('mask'))

        stim_log += [stimulus]
        resp_log += [response]
        mask_matrix_log += [mask_matrix]

    # Prepare for fitting across multiple cells
    # Get total mask
    mask_matrix_pop = np.array(mask_matrix_log).sum(0) > 0

    # Get total response.
    resp_len = np.min([resp_log[icell].shape[0] for icell in range(4)])
    response_pop = np.zeros((resp_len, len(resp_log)))
    for icell in range(len(resp_log)):
        response_pop[:, icell] = resp_log[icell][:resp_len, 0]

    # Get total stimulus.
    stimulus_pop = np.zeros((resp_len, mask_matrix_pop.sum()))
    # Find non-zero locations for each mask element
    nnz_log = [np.where(imask > 0) for imask in mask_matrix_log]
    nnz_pop = np.where(mask_matrix_pop > 0)

    for ipix in range(mask_matrix_pop.sum()):
        print(ipix)
        r = nnz_pop[0][ipix]
        c = nnz_pop[1][ipix]

        stim_pix = np.zeros(resp_len)
        nc = 0
        for icell in range(len(nnz_log)):
            pix_cell_bool = np.logical_and(nnz_log[icell][0] == r,
                                           nnz_log[icell][1] == c)
            if pix_cell_bool.sum() > 0:
                pix_cell = np.where(pix_cell_bool > 0)[0][0]
                stim_pix += stim_log[icell][:resp_len, pix_cell]
                nc += 1

        if nc == 0:
            print('Error')

        stim_pix = stim_pix / nc
        stimulus_pop[:, ipix] = stim_pix

    # Fit with a given number of subunits
    print('Starting fitting')
    get_su_nsub(stimulus_pop, response_pop, mask_matrix_pop, cell_str_final,
                nsub, projection_type, lam_proj, ipartition)
Example #18
def convert(saved_model_dir,
            output_tflite=None,
            output_arrays=None,
            tag_set=None,
            signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
            batch_size=1):
  """Convert a savedmodel to tflite flatbuffer.

  Args:
    saved_model_dir: Saved model directory to convert.
    output_tflite: File path to write result flatbuffer.
    output_arrays: List of output tensor names, the default value is None, which
      means conversion keeps all output tensors. This is also used to filter
      tensors that come from ops currently not supported in tflite (e.g., Argmax).
    tag_set: This is the set of tags to get meta_graph_def in saved_model.
    signature_key: This is the signature key to extract inputs, outputs.
    batch_size: If input tensor shape has None at first dimension,
      e.g. (None,224,224,3), replace None with batch_size.

  Returns:
    The converted data. For example if tflite was the destination, then
    this will be a tflite flatbuffer in a bytes array.

  Raises:
    ValueError: If tag_set does not indicate any meta_graph_def in saved_model,
      or signature_key is not in relevant meta_graph_def,
      or input shape has None beyond 1st dimension, e.g., (1,None, None, 3),
      or given output_arrays are not valid causing empty outputs.
  """
  if tag_set is None:
    tag_set = set([tag_constants.SERVING])

  meta_graph = get_meta_graph_def(saved_model_dir, tag_set)
  signature_def = get_signature_def(meta_graph, signature_key)
  inputs, outputs = get_inputs_outputs(signature_def)

  graph = ops.Graph()
  with session.Session(graph=graph) as sess:

    loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)

    in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs]

    # Users can use output_arrays to filter output tensors for conversion.
    # If output_arrays is None, we keep all output tensors. In future, we may
    # use tflite supported Op list and check whether op is custom Op to
    # automatically filter output arrays.
    # TODO(zhixianyan): Use tflite supported Op list to filter outputs.
    if output_arrays is not None:
      output_arrays = output_arrays.split(",")
      out_tensors = [
          graph.get_tensor_by_name(output)
          for output in outputs
          if output.split(":")[0] in output_arrays
      ]
    else:
      out_tensors = [graph.get_tensor_by_name(output) for output in outputs]

    output_names = [node.split(":")[0] for node in outputs]

    if not out_tensors:
      raise ValueError(
          "No valid output tensors for '{}', possible values are '{}'".format(
              output_arrays, output_names))

    frozen_graph_def = tf_graph_util.convert_variables_to_constants(
        sess, graph.as_graph_def(), output_names)

    # Toco requires fully defined tensor shapes. For an input tensor with None
    # in its shape, e.g., (None, 224, 224, 3), we replace the first None with
    # the given batch size. Shapes with additional None dimensions, e.g.,
    # (None, None, None, 3), might still be replaceable and convertible, but
    # that requires further investigation.
    # TODO(zhixianyan): Add support for input tensors with more Nones in shape.
    for i in range(len(in_tensors)):
      shape = in_tensors[i].get_shape().as_list()
      if shape[0] is None:
        shape[0] = batch_size
      if None in shape[1:]:
        raise ValueError(
            "None is only supported in the 1st dimension (as batch_size), but "
            "tensor '{}' has shape '{}' with None in another dimension.".format(
                inputs[i], shape))
      in_tensors[i].set_shape(shape)

    result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors)

    if output_tflite is not None:
      with gfile.Open(output_tflite, "wb") as f:
        f.write(result)
      logging.info("Successfully converted to: %s", output_tflite)

    return result
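A minimal usage sketch for convert(); the SavedModel directory and output path are illustrative, not taken from the source:

# Hypothetical paths; adjust to the SavedModel actually being converted.
tflite_bytes = convert(
    saved_model_dir='./saved_model',
    output_tflite='./model.tflite',   # also written to disk via gfile
    output_arrays=None,               # keep every output in the signature
    tag_set=None,                     # defaults to {tag_constants.SERVING}
    batch_size=1)                     # replaces a leading None in input shapes
print('Flatbuffer size: %d bytes' % len(tflite_bytes))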
Example #19
def get_su_nsub(stimulus, response, mask_matrix, cell_string, nsub,
                projection_type, lam_proj, ipartition):
    """Get 'nsub' subunits."""

    np.random.seed(95)  # 23 for _jnt.pkl, 46 for _jnt_2.pkl, 93 for _nov, _jan

    # Get a few (5) training, testing, validation partitions

    # continuous partitions
    # ifrac = 0.8
    # tms_train = np.arange(0, np.floor(stimulus.shape[0]*ifrac)).astype(np.int)

    # get last 10% as test data
    frac_test = 0.1
    tms_test = np.arange(np.floor(stimulus.shape[0] * (1 - frac_test)),
                         1 * np.floor(stimulus.shape[0])).astype(np.int)

    # Random partitions
    n_partitions = 10
    tms_train_validate = np.arange(
        0, np.floor(stimulus.shape[0] * (1 - frac_test))).astype(np.int)

    frac_validate = 0.1

    partitions = []
    for _ in range(n_partitions):
        perm = np.random.permutation(tms_train_validate)
        n_train = np.int(np.floor((1 - frac_validate) * perm.shape[0]))
        tms_train = perm[:n_train]
        tms_validate = perm[n_train:]

        partitions += [{
            'tms_train': tms_train,
            'tms_validate': tms_validate,
            'tms_test': tms_test
        }]

    print('Made partitions')

    # do fitting for different lambdas
    # from IPython import embed; embed()
    neighbor_mat = su_model.get_neighbormat(mask_matrix, nbd=1)
    save_name = os.path.join(
        FLAGS.save_path, 'Cell_%s_nsub_%d_%s_%.6f_part_%d_%s.pkl' %
        (cell_string, nsub, projection_type, lam_proj, ipartition,
         FLAGS.save_suffix))

    save_filename_partial = os.path.join(
        FLAGS.save_path_partial, 'Cell_%s_nsub_%d_%s_%.6f_part'
        '_%d_%s.pkl' % (cell_string, nsub, projection_type, lam_proj,
                        ipartition, FLAGS.save_suffix))

    if not gfile.Exists(save_name):
        print(cell_string, nsub, projection_type, lam_proj, ipartition)
        op = su_model.Flat_clustering_jnt(
            stimulus,
            response,
            nsub,
            partitions[ipartition]['tms_train'],
            partitions[ipartition]['tms_validate'],
            steps_max=10000,
            eps=1e-9,
            projection_type=projection_type,
            neighbor_mat=neighbor_mat,
            lam_proj=lam_proj,
            eps_proj=0.01,
            save_filename_partial=save_filename_partial)

        k_f, b_f, _, loss_log_f, loss_log_test_f, fitting_phase_f, fit_params_f = op

        print('Fitting done')
        save_dict = {
            'K': k_f,
            'b': b_f,
            'loss_log': loss_log_f,
            'loss_log_test': loss_log_test_f,
            'fitting_phase': fitting_phase_f,
            'fit_params': fit_params_f
        }

        with gfile.Open(save_name, 'wb') as f:
            pickle.dump(save_dict, f)
        print('Saved results')
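The fit is stored as a plain pickle, so it can be inspected afterwards along these lines (the result path and the gfile import path are assumptions; the keys come from save_dict above):

import pickle
from tensorflow.python.platform import gfile  # assumption: the gfile used above

# Hypothetical path following the save_name pattern built in get_su_nsub().
result_path = '/tmp/su_fits/Cell_39_nsub_5_convex_0.010000_part_0_v2.pkl'
with gfile.Open(result_path, 'rb') as f:
    fit = pickle.load(f)
print(sorted(fit.keys()))   # K, b, fit_params, fitting_phase, loss_log, loss_log_test
print(fit['loss_log'][-1], fit['loss_log_test'][-1])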
Example #20
def save(self, filename):
    with gfile.Open(filename, 'wb') as f:
        f.write(pickle.dumps(self))
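The snippet only shows serialization; a matching deserializer could look like the sketch below (an assumption, not part of the source):

@classmethod
def load(cls, filename):
    # Read the pickled bytes back with gfile and reconstruct the instance
    # written by save() above.
    with gfile.Open(filename, 'rb') as f:
        return pickle.loads(f.read())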
Example #21
def _load_graph_def_from_event_file(event_file_path):
  event = event_pb2.Event()
  with gfile.Open(event_file_path, "rb") as f:
    event.ParseFromString(f.read())

  return graph_pb2.GraphDef.FromString(event.graph_def)
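A usage sketch; the event-file path is illustrative, and it assumes the event actually carries a serialized GraphDef (as written when a graph is logged to a summary writer):

# Hypothetical event file produced by a summary writer that logged a graph.
graph_def = _load_graph_def_from_event_file(
    '/tmp/logdir/events.out.tfevents.1546300800.myhost')
print('%d nodes in graph' % len(graph_def.node))
for node in graph_def.node[:5]:
    print(node.name, node.op)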
Example #22
def extract_cifar10(local_url, data_dir):
  """
  Extracts the CIFAR-10 dataset and returns numpy arrays with the different sets
  :param local_url: where the tar.gz archive is located locally
  :param data_dir: where to extract the archive's files
  :return: a tuple (train data, train labels, test data, test labels)
  """
  # These numpy dumps can be reloaded to avoid performing the pre-processing
  # if they exist in the working directory.
  # Changing the order of this list will ruin the indices below.
  preprocessed_files = ['/cifar10_train.npy',
                        '/cifar10_train_labels.npy',
                        '/cifar10_test.npy',
                        '/cifar10_test_labels.npy']

  all_preprocessed = True
  for file in preprocessed_files:
    if not gfile.Exists(data_dir + file):
      all_preprocessed = False
      break

  if all_preprocessed:
    # Reload pre-processed training data from numpy dumps
    with gfile.Open(data_dir + preprocessed_files[0], mode='rb') as file_obj:
      train_data = np.load(file_obj)
    with gfile.Open(data_dir + preprocessed_files[1], mode='rb') as file_obj:
      train_labels = np.load(file_obj)

    # Reload pre-processed testing data from numpy dumps
    with gfile.Open(data_dir + preprocessed_files[2], mode='rb') as file_obj:
      test_data = np.load(file_obj)
    with gfile.Open(data_dir + preprocessed_files[3], mode='rb') as file_obj:
      test_labels = np.load(file_obj)

  else:
    # Do everything from scratch
    # Define lists of all files we should extract
    train_files = ["data_batch_" + str(i) for i in xrange(1,6)]
    test_file = ["test_batch"]
    cifar10_files = train_files + test_file

    # Check if all files have already been extracted
    need_to_unpack = False
    for file in cifar10_files:
      if not gfile.Exists(file):
        need_to_unpack = True
        break

    # We have to unpack the archive
    if need_to_unpack:
      tarfile.open(local_url, 'r:gz').extractall(data_dir)

    # Load training images and labels
    images = []
    labels = []
    for file in train_files:
      # Construct filename
      filename = data_dir + "/cifar-10-batches-py/" + file

      # Unpickle dictionary and extract images and labels
      images_tmp, labels_tmp = unpickle_cifar_dic(filename)

      # Append to lists
      images.append(images_tmp)
      labels.append(labels_tmp)

    # Convert to numpy arrays and reshape in the expected format
    train_data = np.asarray(images, dtype=np.float32).reshape((50000,3,32,32))
    train_data = np.swapaxes(train_data, 1, 3)
    train_labels = np.asarray(labels, dtype=np.int32).reshape(50000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[0], train_data)
    np.save(data_dir + preprocessed_files[1], train_labels)

    # Construct filename for test file
    filename = data_dir + "/cifar-10-batches-py/" + test_file[0]

    # Load test images and labels
    test_images, test_labels = unpickle_cifar_dic(filename)

    # Convert to numpy arrays and reshape in the expected format
    test_data = np.asarray(test_images,
                           dtype=np.float32).reshape((10000, 3, 32, 32))
    test_data = np.swapaxes(test_data, 1, 3)
    test_labels = np.asarray(test_labels, dtype=np.int32).reshape(10000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[2], test_data)
    np.save(data_dir + preprocessed_files[3], test_labels)

  return train_data, train_labels, test_data, test_labels
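A usage sketch, assuming the standard cifar-10-python.tar.gz archive has already been downloaded; both paths are illustrative:

train_data, train_labels, test_data, test_labels = extract_cifar10(
    local_url='/tmp/cifar-10-python.tar.gz', data_dir='/tmp/cifar10')
print(train_data.shape, train_labels.shape)   # (50000, 32, 32, 3) (50000,)
print(test_data.shape, test_labels.shape)     # (10000, 32, 32, 3) (10000,)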
Example #23
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    dataset = train_dir.split("/")[-1]
    if dataset == "cifar":
        (train_images, train_labels), (test_images, test_labels) = \
            tf.keras.datasets.cifar10.load_data()
        train_labels = tf.keras.utils.to_categorical(train_labels, 10)
        test_labels = tf.keras.utils.to_categorical(test_labels, 10)
        train_images = train_images / 255.0
        test_images = test_images / 255.0

    if dataset == "circles":
        from sklearn import datasets
        X, y = datasets.make_circles(n_samples=70000, factor=.5, noise=.05)
        X = (X - X.min()) / (X.max() - X.min())
        train_images = X
        train_labels = tf.keras.utils.to_categorical(y, 2)
        test_images = train_images[:10000]
        test_labels = train_labels[:10000]
        train_images = train_images[10000:]
        train_labels = train_labels[10000:]

    if dataset == "s-curve":
        # 3-d data
        from sklearn import datasets
        X, y = datasets.make_s_curve(n_samples=70000,
                                     noise=0.5,
                                     random_state=2019)

    if dataset == "swiss-roll":
        # 3-d data
        datasets.make_swiss_roll(n_samples=70000, noise=0.5, random_state=2019)

    if dataset == "mnist" or dataset == "fashion":
        TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
        TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
        TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
        TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

        local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                         source_url + TRAIN_IMAGES)
        with gfile.Open(local_file, 'rb') as f:
            train_images = extract_images(f)

        local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                         source_url + TRAIN_LABELS)
        with gfile.Open(local_file, 'rb') as f:
            train_labels = extract_labels(f, one_hot=one_hot)

        local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                         source_url + TEST_IMAGES)
        with gfile.Open(local_file, 'rb') as f:
            test_images = extract_images(f)

        local_file = base.maybe_download(TEST_LABELS, train_dir,
                                         source_url + TEST_LABELS)
        with gfile.Open(local_file, 'rb') as f:
            test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
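A usage sketch for the MNIST branch; the directory is illustrative, and next_batch / num_examples are assumed to follow the standard MNIST DataSet interface:

# The last path component selects the dataset branch ('mnist' here).
datasets = read_data_sets('/tmp/data/mnist', one_hot=True, validation_size=5000)
print(datasets.train.num_examples, datasets.validation.num_examples,
      datasets.test.num_examples)
images, labels = datasets.train.next_batch(64)   # assumption: standard DataSet API
print(images.shape, labels.shape)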