def normalize(self, groupname1, groupname2):
    # ## normalize y ## #
    with h5py.File(self.OUTPATH, mode='r+') as f:
        for atom in self.MAINCHAIN:
            # load
            train_y = da.from_array(
                f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            val_y = da.from_array(
                f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            total_y = da.concatenate([train_y, val_y], axis=0)

            y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
            y_std = da.std(total_y.reshape(-1), axis=0).compute()

            # normalize
            train_y = da.divide(da.subtract(train_y, y_mean), y_std)
            val_y = da.divide(da.subtract(val_y, y_mean), y_std)

            # save
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname1}/{self.RESPONSE_NAME}', train_y)
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)
            f.create_dataset(name=f'/{atom}/normalization',
                             data=np.array([y_mean, y_std]))

            print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
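
# A minimal, self-contained sketch of the pooled z-scoring pattern used in
# normalize() above, without the HDF5 plumbing: the statistics are computed
# over the concatenation of both splits and then applied to each split
# separately. Array sizes and chunking are arbitrary illustration values.
import dask.array as da

train = da.random.random((1000, 3), chunks=(250, 3))
val = da.random.random((200, 3), chunks=(200, 3))

total = da.concatenate([train, val], axis=0)
y_mean = da.mean(total.reshape(-1), axis=0).compute()
y_std = da.std(total.reshape(-1), axis=0).compute()

train_n = da.divide(da.subtract(train, y_mean), y_std)
val_n = da.divide(da.subtract(val, y_mean), y_std)
print(float(train_n.mean().compute()))  # approximately 0 after normalization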
def _potential_dask(x, y, z, m, eps):
    """Compute the specific gravitational potential energy of particles.

    Parameters
    ----------
    x, y, z : `np.ndarray`
        Positions of particles. Shape: (n,).
    m : `np.ndarray`
        Masses of particles. Shape: (n,).
    eps : `float`
        Softening parameter.

    Returns
    -------
    `np.ndarray`
        Specific potential energy of each particle. Shape: (n,).
    """
    # pairwise softened distances between all particles
    dist = np.sqrt(
        np.square(x - x.reshape(-1, 1))
        + np.square(y - y.reshape(-1, 1))
        + np.square(z - z.reshape(-1, 1))
        + np.square(eps))

    # zero out the self-distances and exclude them from the division;
    # `out=` initializes the excluded entries to zero (without it, entries
    # skipped by `where=` would be left uninitialized)
    np.fill_diagonal(dist, 0.0)
    flt = dist != 0
    mdist = da.divide(m, dist.astype(np.float32),
                      out=np.zeros(dist.shape, dtype=np.float32), where=flt)

    return mdist.sum(axis=1) * G
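
# A minimal usage sketch for _potential_dask() above. G is assumed to be a
# module-level gravitational constant (set to 1.0 here so the snippet is
# self-contained), and the coordinate and mass arrays follow the flat (n,)
# convention that the pairwise broadcasting in the body requires.
import numpy as np

G = 1.0  # assumption: the real module defines its own constant

rng = np.random.default_rng(0)
x, y, z = (rng.random(100) for _ in range(3))
m = rng.random(100)

pot = _potential_dask(x, y, z, m, eps=0.01)
print(np.asarray(pot).shape)  # (100,) -- one potential value per particle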
def _ttest_ind_from_stats(mean1, mean2, denom, df):
    d = mean1 - mean2
    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)

    return (t, prob)
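
# _ttest_finish is called by the t-test helpers in this listing but is not
# defined here. The sketch below is an assumption modeled on the equivalent
# private helper in scipy.stats / dask.array.stats: it converts a t statistic
# and its degrees of freedom into a two-sided p-value using the survival
# function of Student's t distribution.
import numpy as np
import dask.array as da
from scipy.stats import distributions


def _ttest_finish(df, t):
    """Two-sided p-value from a t statistic (assumed helper)."""
    # 2 * P(T > |t|), evaluated lazily block by block
    prob = t.map_blocks(
        lambda block: distributions.t.sf(np.abs(block), df) * 2)
    return t, prob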
def test_notifications_error_with_threading(make_napari_viewer):
    """Test notifications of `threading` threads, using a dask example."""
    random_image = da.random.random(size=(50, 50))
    with notification_manager:
        viewer = make_napari_viewer()
        viewer.add_image(random_image)
        # dividing by an all-zero array raises warnings/errors when the
        # result is computed for display, which should surface as
        # notifications (note the shape must be passed as a tuple)
        result = da.divide(random_image, da.zeros((50, 50)))
        viewer.add_image(result)

        assert len(notification_manager.records) >= 1
        notification_manager.records = []
def update_velocities(position, velocity, mass, G, epsilon):
    """Calculate the interactions between all particles and update the
    velocities.

    Args:
        position (dask array): dask array of all particle positions in
            cartesian coordinates.
        velocity (dask array): dask array of all particle velocities in
            cartesian coordinates.
        mass (dask array): dask array of all particle masses.
        G (float): gravitational constant.
        epsilon (float): softening parameter.

    Returns:
        velocity: updated particle velocities in cartesian coordinates.
    """
    # pairwise separations along each axis
    dx = da.subtract.outer(position[:, 0], position[:, 0])
    dy = da.subtract.outer(position[:, 1], position[:, 1])
    dz = da.subtract.outer(position[:, 2], position[:, 2])
    # softened squared distances
    r2 = da.square(dx) + da.square(dy) + da.square(dz) + da.square(epsilon)

    coef = -G * mass[:]
    ax = coef * dx
    ay = coef * dy
    az = coef * dz

    ax_scaled = da.divide(ax, r2)
    ay_scaled = da.divide(ay, r2)
    az_scaled = da.divide(az, r2)

    # accumulate the contribution of every other particle
    total_ax = da.nansum(ax_scaled, axis=1)
    total_ay = da.nansum(ay_scaled, axis=1)
    total_az = da.nansum(az_scaled, axis=1)

    # add the accelerations to the current velocities; the diagonal of the
    # outer sum pairs each particle with its own accumulated acceleration
    velocity_x = da.diag(da.add.outer(da.transpose(velocity)[0], total_ax))
    velocity_y = da.diag(da.add.outer(da.transpose(velocity)[1], total_ay))
    velocity_z = da.diag(da.add.outer(da.transpose(velocity)[2], total_az))

    velocity = np.column_stack((velocity_x.compute(),
                                velocity_y.compute(),
                                velocity_z.compute()))
    return velocity
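
# A minimal usage sketch for update_velocities() above. The particle count,
# chunking, unit G and 0.1 softening are arbitrary illustration values.
import numpy as np
import dask.array as da

n = 8
position = da.from_array(np.random.rand(n, 3), chunks=(4, 3))
velocity = da.from_array(np.zeros((n, 3)), chunks=(4, 3))
mass = da.from_array(np.ones(n), chunks=4)

velocity = update_velocities(position, velocity, mass, G=1.0, epsilon=0.1)
print(velocity.shape)  # (8, 3) NumPy array of updated velocities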
def ttest_1samp(a, popmean, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' has not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)

    return delayed(Ttest_1sampResult, nout=2)(t, prob)
def ttest_rel(a, b, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' has not been implemented.")
    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)

    return delayed(Ttest_relResult, nout=2)(t, prob)
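
# The t-test functions above also assume `delayed` and scipy-style result
# namedtuples at module level. A plausible set of definitions, mirroring how
# dask.array.stats declares them:
from collections import namedtuple

from dask import delayed

Ttest_1sampResult = namedtuple("Ttest_1sampResult", ("statistic", "pvalue"))
Ttest_relResult = namedtuple("Ttest_relResult", ("statistic", "pvalue"))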
def bw_corrcoef(image1, image2, block_shape, keep_shape=False):
    """Blockwise Pearson correlation coefficient."""
    # blockwise zero-mean
    image1_zm = image1 - bw_mean(image1, block_shape, keep_shape=True)
    image2_zm = image2 - bw_mean(image2, block_shape, keep_shape=True)

    # follow the Pearson correlation coefficient formula:
    # cov(X, Y) / (std(X) * std(Y))
    numerator = bw_mean(da.multiply(image1_zm, image2_zm), block_shape)
    image1_std = bw_std(image1, block_shape)
    image2_std = bw_std(image2, block_shape)
    denominator = da.multiply(image1_std, image2_std)
    bwcc = da.divide(numerator, denominator)

    if keep_shape:
        bwcc = repeat_block(bwcc, block_shape)
    return bwcc
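
# A usage sketch for bw_corrcoef() above. bw_mean, bw_std and repeat_block
# are helpers from the same module that are not shown in this listing; the
# minimal versions below are assumptions for illustration only, built on
# dask's coarsen (block reduction) so the snippet is self-contained.
import numpy as np
import dask.array as da


def bw_mean(image, block_shape, keep_shape=False):
    # blockwise mean via coarsen; repeat back up if keep_shape is requested
    out = da.coarsen(np.mean, image,
                     {i: s for i, s in enumerate(block_shape)})
    if keep_shape:
        out = repeat_block(out, block_shape)
    return out


def bw_std(image, block_shape):
    # blockwise standard deviation via coarsen
    return da.coarsen(np.std, image,
                      {i: s for i, s in enumerate(block_shape)})


def repeat_block(image, block_shape):
    # upsample each blockwise value back to the original resolution
    for axis, s in enumerate(block_shape):
        image = da.repeat(image, s, axis=axis)
    return image


image1 = da.random.random((64, 64), chunks=(16, 16))
image2 = image1 + 0.1 * da.random.random((64, 64), chunks=(16, 16))
bwcc = bw_corrcoef(image1, image2, block_shape=(16, 16))
print(bwcc.compute().shape)  # one coefficient per 16x16 block -> (4, 4)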
def activations(images, labels, batch_size, model, layer_regex,
                nodaug_params, daug_params, include_input=False,
                class_invariance=False, n_daug_rep=0, norms=['fro']):
    """
    Computes metrics from the activations, such as the norm of the feature
    maps, data augmentation invariance, class invariance, etc.

    Parameters
    ----------
    images : h5py Dataset
        The set of images
    labels : h5py Dataset
        The ground truth labels
    batch_size : int
        Batch size
    model : Keras Model
        The model
    nodaug_params : dict
        Dictionary of data augmentation parameters for the baseline
    daug_params : dict
        Dictionary of data augmentation parameters
    include_input : bool
        If True, the input layer is considered for the analysis
    class_invariance : bool
        If True, the class invariance score is computed
    n_daug_rep : int
        If larger than 0, the data augmentation invariance score is computed,
        performing n_daug_rep repetitions of random augmentations
    norms : list
        List of keywords to specify the types of norms to compute on the
        activations

    Returns
    -------
    results_dict : dict
        Dictionary containing some performance metrics
    """
    def _update_stats(mean_norm, std_norm, norm):
        mean_norm_batch = np.mean(norm, axis=0)
        std_norm_batch = np.std(norm, axis=0)
        mean_norm = init / float(end) * mean_norm + \
            batch_size / float(end) * mean_norm_batch
        std_norm = init / float(end) * std_norm ** 2 + \
            batch_size / float(end) * std_norm_batch ** 2 + \
            (init * batch_size) / float(end ** 2) * \
            (mean_norm - mean_norm_batch) ** 2
        std_norm = np.sqrt(std_norm)
        return mean_norm, std_norm

    def _frobenius_norm(activations):
        norm = np.linalg.norm(
            activations, ord='fro',
            axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    def _inf_norm(activations):
        norm = np.max(np.abs(activations),
                      axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    model = del_extra_nodes(model)

    n_images = images.shape[0]
    n_batches_per_epoch = int(np.ceil(float(n_images) / batch_size))

    # Get relevant layers
    if include_input:
        layer_regex = '({}|.*input.*)'.format(layer_regex)

    layers = [layer.name for layer in model.layers
              if re.compile(layer_regex).match(layer.name)]

    # Initialize HDF5 to store the activations
    # filename = 'hdf5_aux_{}'.format(time.time())
    # activations_hdf5_aux = h5py.File(filename, 'w')
    # hdf5_aux = [filename]
    #
    # grp_activations = activations_hdf5_aux.create_group('activations')

    if class_invariance:
        # grp_labels = activations_hdf5_aux.create_group('labels')
        labels_true_da = []
        labels_pred_da = []
        predictions_da = []
        # labels_true = grp_labels.create_dataset(
        #     'labels_true', shape=(n_images, ), dtype=np.uint8)
        # labels_pred = grp_labels.create_dataset(
        #     'labels_pred', shape=(n_images, ), dtype=np.uint8)
        # predictions = grp_labels.create_dataset(
        #     'predictions', shape=labels.shape, dtype=K.floatx())
        idx_softmax = model.output_names.index('softmax')
        store_labels = True
    else:
        store_labels = False

    # Initialize results dictionary
    results_dict = {'activations_norm': {}, 'summary': {},
                    'class_invariance': {}, 'daug_invariance': {}}

    # Iterate over the layers
    for layer_name in layers:

        # Create batch generator
        image_gen = get_generator(images, **nodaug_params)
        batch_gen = generate_batches(image_gen, images, labels, batch_size,
                                     aug_per_im=1, shuffle=False)

        layer = model.get_layer(layer_name)
        layer_shape = layer.output_shape[1:]
        n_channels = layer_shape[-1]

        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        print('\nLayer {}\n'.format(layer_name))

        # Create a Dataset for the activations of the layer
        # activations_layer = grp_activations.create_dataset(
        #     layer_name, shape=(n_images, ) + layer_shape,
        #     dtype=K.floatx())

        # Create dask array for the activations of the layer
        activations_layer_da = []

        # Initialize placeholders in the results dict for the layer
        results_dict['activations_norm'].update(
            {layer_name: {n: {'mean': np.zeros(n_channels),
                              'std': np.zeros(n_channels)} for n in norms}})
        layer_dict = results_dict['activations_norm'][layer_name]

        activation_function = K.function([model.input, K.learning_phase()],
                                         [layer.output])

        # Iterate over the data set in batches
        init = 0
        for batch_images, batch_labels in tqdm(
                batch_gen, total=n_batches_per_epoch):

            batch_size = batch_images.shape[0]
            end = init + batch_size

            # Store labels
            if store_labels:
                preds = model.predict_on_batch(batch_images)
                if isinstance(preds, list):
                    preds = preds[idx_softmax]
                labels_pred_da.append(da.from_array(
                    np.argmax(preds, axis=1)))
                labels_true_da.append(da.from_array(
                    np.argmax(batch_labels, axis=1)))
                predictions_da.append(da.from_array(preds))
                # labels_pred[init:end] = np.argmax(preds, axis=1)
                # labels_true[init:end] = np.argmax(batch_labels, axis=1)
                # predictions[init:end, :] = preds

            # Get and store activations
            activations = activation_function([batch_images, 0])[0]
            activations_layer_da.append(da.from_array(
                activations, chunks=activations.shape))
            # activations_layer[init:end] = activations

            # Compute norms
            for norm_key in norms:
                mean_norm = layer_dict[norm_key]['mean']
                std_norm = layer_dict[norm_key]['std']
                if norm_key == 'fro':
                    norm = _frobenius_norm(activations)
                elif norm_key == 'inf':
                    norm = _inf_norm(activations)
                else:
                    raise NotImplementedError('Implemented norms are fro '
                                              'and inf')
                mean_norm, std_norm = _update_stats(mean_norm, std_norm,
                                                    norm)
                layer_dict[norm_key]['mean'] = mean_norm
                layer_dict[norm_key]['std'] = std_norm

            init = end
            if init == n_images:
                store_labels = False
                break

        # Concatenate dask arrays
        activations_layer_da = da.concatenate(activations_layer_da, axis=0)
        activations_layer_da = activations_layer_da.reshape((n_images, -1))
        d_activations = activations_layer_da.shape[-1]

        if class_invariance:
            print('\nComputing class invariance\n')
            labels_pred_da = da.concatenate(labels_pred_da)
            labels_true_da = da.concatenate(labels_true_da)
            predictions_da = da.concatenate(predictions_da)
            n_classes = len(np.unique(labels_true_da))

        # Compute MSE matrix of the activations
        r = da.reshape(da.sum(da.square(activations_layer_da), axis=1),
                       (-1, 1))
        mse_matrix_da = (r - 2 * da.dot(activations_layer_da,
                                        da.transpose(activations_layer_da))
                         + da.transpose(r)) / d_activations
        mse_matrix_da = mse_matrix_da.rechunk((mse_matrix_da.chunksize[0],
                                               mse_matrix_da.shape[-1]))

        # Compute class invariance
        time0 = time()
        results_dict['class_invariance'].update({layer_name: {}})
        class_invariance_scores_da = []
        if class_invariance:
            # mse_matrix_mean = da.mean(mse_matrix_da).compute()
            for cl in tqdm(range(n_classes)):
                labels_cl = labels_pred_da == cl
                labels_cl = labels_cl.compute()
                mse_class = mse_matrix_da[labels_cl, :][:, labels_cl]
                mse_class = mse_class.rechunk((-1, -1))
                # mse_class_mean = da.mean(mse_class).compute()
                # class_invariance_score = 1. - np.divide(
                #     mse_class_mean, mse_matrix_mean)
                # results_dict['class_invariance'][layer_name].update(
                #     {cl: class_invariance_score})
                class_invariance_scores_da.append(
                    1. - da.divide(da.mean(mse_class),
                                   da.mean(mse_matrix_da)))

        # Compute data augmentation invariance
        print('\nComputing data augmentation invariance\n')
        mse_daug_da = []
        results_dict['daug_invariance'].update({layer_name: {}})
        for r in range(n_daug_rep):
            print('Repetition {}'.format(r))

            image_gen_daug = get_generator(images, **daug_params)
            batch_gen_daug = generate_batches(image_gen_daug, images, labels,
                                              batch_size, aug_per_im=1,
                                              shuffle=False)

            activations_layer_daug_da = []

            # Iterate over the data set in batches to compute activations
            # (note: this must consume batch_gen_daug; the original code
            # looped over batch_gen here, re-using the un-augmented batches)
            init = 0
            for batch_images, batch_labels in tqdm(
                    batch_gen_daug, total=n_batches_per_epoch):

                batch_size = batch_images.shape[0]
                end = init + batch_size

                # Get and store activations
                activations = activation_function([batch_images, 0])[0]
                activations_layer_daug_da.append(da.from_array(
                    activations, chunks=activations.shape))

                init = end
                if init == n_images:
                    break

            activations_layer_daug_da = da.concatenate(
                activations_layer_daug_da, axis=0)
            activations_layer_daug_da = activations_layer_daug_da.reshape(
                (n_images, -1))
            activations_layer_daug_da = activations_layer_daug_da.rechunk(
                (activations_layer_daug_da.chunksize[0],
                 activations_layer_daug_da.shape[-1]))

            # Compute MSE daug
            mse_daug_da.append(da.mean(da.square(
                activations_layer_da - activations_layer_daug_da), axis=1))

        mse_daug_da = da.stack(mse_daug_da, axis=1)

        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix_da, axis=1),
                                       (n_images, 1)), n_daug_rep, axis=1)

        daug_invariance_score_da = 1 - n_images * da.divide(mse_daug_da,
                                                            mse_sum)

        time1 = time()

        # Compute dask results and update results dict
        results_dask = da.compute(class_invariance_scores_da,
                                  daug_invariance_score_da)

        time2 = time()

        results_dict['class_invariance'][layer_name].update(
            {cl: cl_inv_score for cl, cl_inv_score
             in enumerate(results_dask[0])})
        results_dict['daug_invariance'].update(
            {layer_name: {r: daug_inv_score for r, daug_inv_score
                          in enumerate(results_dask[1].T)}})

    # Compute summary statistics of the norms across the channels
    for layer, layer_dict in results_dict['activations_norm'].items():
        results_dict['summary'].update({layer: {}})
        for norm_key, norm_dict in layer_dict.items():
            results_dict['summary'][layer].update({norm_key: {
                'mean': np.mean(norm_dict['mean']),
                'std': np.mean(norm_dict['std'])}})

    return results_dict
def _center_of_mass_array(dask_array, threshold_value=None, mask_array=None):
    """Find the center of mass of the last two dimensions for a dask array.

    The center of mass can be calculated using a mask and threshold.

    Parameters
    ----------
    dask_array : Dask array
        Must have either 2, 3 or 4 dimensions.
    threshold_value : scalar, optional
    mask_array : NumPy array, optional
        Array with bool values. The True values will be masked
        (i.e. ignored). Must have the same shape as the two last
        dimensions in dask_array.

    Returns
    -------
    center_of_mass_dask_array : Dask array

    Examples
    --------
    >>> import dask.array as da
    >>> import pyxem.utils.dask_tools as dt
    >>> data = da.random.random(
    ...     size=(64, 64, 128, 128), chunks=(16, 16, 128, 128))
    >>> output_dask = dt._center_of_mass_array(data)
    >>> output = output_dask.compute()

    Masking everything except the center of the image

    >>> mask_array = np.ones(shape=(128, 128), dtype=bool)
    >>> mask_array[64-10:64+10, 64-10:64+10] = False
    >>> output_dask = dt._center_of_mass_array(data, mask_array=mask_array)
    >>> output = output_dask.compute()

    Masking and thresholding

    >>> output_dask = dt._center_of_mass_array(
    ...     data, mask_array=mask_array, threshold_value=3)
    >>> output = output_dask.compute()

    """
    det_shape = dask_array.shape[-2:]
    y_grad, x_grad = np.mgrid[0:det_shape[0], 0:det_shape[1]]
    y_grad, x_grad = y_grad.astype(np.float64), x_grad.astype(np.float64)
    sum_array = np.ones_like(x_grad)

    if mask_array is not None:
        if not mask_array.shape == det_shape:
            raise ValueError(
                "mask_array ({0}) must have same shape as last two "
                "dimensions of the dask_array ({1})".format(
                    mask_array.shape, det_shape))
        x_grad = x_grad * np.invert(mask_array)
        y_grad = y_grad * np.invert(mask_array)
        sum_array = sum_array * np.invert(mask_array)
    if threshold_value is not None:
        dask_array = _threshold_array(dask_array,
                                      threshold_value=threshold_value,
                                      mask_array=mask_array)

    x_shift = da.multiply(dask_array, x_grad, dtype=np.float64)
    y_shift = da.multiply(dask_array, y_grad, dtype=np.float64)
    sum_array = da.multiply(dask_array, sum_array, dtype=np.float64)

    x_shift = np.sum(x_shift, axis=(-2, -1), dtype=np.float64)
    y_shift = np.sum(y_shift, axis=(-2, -1), dtype=np.float64)
    sum_array = np.sum(sum_array, axis=(-2, -1), dtype=np.float64)

    beam_shifts = da.stack((x_shift, y_shift))
    beam_shifts = da.divide(beam_shifts[:], sum_array, dtype=np.float64)
    return beam_shifts
def main(argv=None):

    # cluster = LocalCluster(dashboard_address=None)
    # client = Client(cluster, memory_limit='{}GB'.format(FLAGS.memory_limit),
    #                 processes=False)

    K.set_floatx('float32')

    chunk_size = FLAGS.chunk_size

    # Read data set
    hdf5_file = h5py.File(FLAGS.data_file, 'r')
    images, labels, _ = hdf52dask(hdf5_file, FLAGS.group, chunk_size,
                                  shuffle=FLAGS.shuffle, seed=FLAGS.seed,
                                  pct=FLAGS.pct)
    n_images = images.shape[0]
    n_batches = int(np.ceil(n_images / float(FLAGS.batch_size)))

    # Data augmentation parameters
    daug_params_file = get_daug_scheme_path(FLAGS.daug_params,
                                            FLAGS.data_file)
    daug_params = yaml.load(open(daug_params_file, 'r'),
                            Loader=yaml.FullLoader)
    nodaug_params_file = get_daug_scheme_path('nodaug.yml', FLAGS.data_file)
    nodaug_params = yaml.load(open(nodaug_params_file, 'r'),
                              Loader=yaml.FullLoader)

    # Initialize the network model
    model_filename = FLAGS.model
    model = load_model(model_filename)

    # Print the model summary
    model.summary()

    # Get relevant layers
    if FLAGS.store_input:
        layer_regex = '({}|.*input.*)'.format(FLAGS.layer_regex)
    else:
        layer_regex = FLAGS.layer_regex

    layers = [layer.name for layer in model.layers
              if re.compile(layer_regex).match(layer.name)]

    # Create batch generators
    n_daug_rep = FLAGS.n_daug_rep
    n_diff_per_batch = int(FLAGS.batch_size / n_daug_rep)
    image_gen_daug = get_generator(images, **daug_params)
    batch_gen_daug = batch_generator(image_gen_daug, images, labels,
                                     batch_size=n_diff_per_batch,
                                     aug_per_im=n_daug_rep, shuffle=False)
    image_gen_nodaug = get_generator(images, **nodaug_params)
    batch_gen_nodaug = batch_generator(image_gen_nodaug, images, labels,
                                       FLAGS.batch_size, aug_per_im=1,
                                       shuffle=False)

    # Outputs
    if FLAGS.output_dir == '-1':
        FLAGS.output_dir = os.path.dirname(FLAGS.model)

    output_hdf5 = h5py.File(
        os.path.join(FLAGS.output_dir, FLAGS.output_mse_matrix_hdf5), 'w')
    output_pickle = os.path.join(FLAGS.output_dir, FLAGS.output_pickle)
    df_init_idx = 0
    df = pd.DataFrame()

    # Iterate over the layers
    for layer_idx, layer_name in enumerate(layers):

        # Reload the model
        if layer_idx > 0:
            K.clear_session()
            model = load_model(model_filename)

        layer = model.get_layer(layer_name)

        # Rename input layer
        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        hdf5_layer = output_hdf5.create_group(layer_name)

        activation_function = K.function([model.input, K.learning_phase()],
                                         [layer.output])

        print('\nComputing pairwise similarity at layer {}'.format(
            layer_name))

        # Compute activations of original data (without augmentation)
        a_nodaug_da = get_activations(activation_function, batch_gen_nodaug)
        a_nodaug_da = da.squeeze(a_nodaug_da)
        a_nodaug_da = da.rechunk(a_nodaug_da,
                                 (chunk_size, ) + (a_nodaug_da.shape[1:]))
        dim_activations = a_nodaug_da.shape[1]

        # Compute matrix of similarities
        r = da.reshape(da.sum(da.square(a_nodaug_da), axis=1), (-1, 1))
        mse_matrix = (r - 2 * da.dot(a_nodaug_da,
                                     da.transpose(a_nodaug_da))
                      + da.transpose(r)) / dim_activations

        # Compute activations with augmentation
        a_daug_da = get_activations(activation_function, batch_gen_daug)
        a_daug_da = da.rechunk(a_daug_da, (chunk_size, dim_activations, 1))

        # Compute similarity of augmentations with respect to the
        # activations of the original data
        a_nodaug_da = da.repeat(da.reshape(a_nodaug_da,
                                           a_nodaug_da.shape + (1, )),
                                repeats=n_daug_rep, axis=2)
        a_nodaug_da = da.rechunk(a_nodaug_da,
                                 (chunk_size, dim_activations, 1))
        mse_daug = da.mean(da.square(a_nodaug_da - a_daug_da), axis=1)

        # Compute invariance score
        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix, axis=1),
                                       (n_images, 1)),
                            repeats=n_daug_rep, axis=1)
        mse_sum = da.rechunk(mse_sum, (chunk_size, 1))
        invariance = 1 - n_images * da.divide(mse_daug, mse_sum)

        print('Dimensionality activations: {}x{}x{}'.format(
            n_images, dim_activations, n_daug_rep))

        # Store HDF5 file
        if FLAGS.output_mse_matrix_hdf5:
            mse_matrix_ds = hdf5_layer.create_dataset(
                'mse_matrix', shape=mse_matrix.shape,
                chunks=mse_matrix.chunksize, dtype=K.floatx())
            mse_daug_ds = hdf5_layer.create_dataset(
                'mse_daug', shape=mse_daug.shape,
                chunks=mse_daug.chunksize, dtype=K.floatx())
            invariance_ds = hdf5_layer.create_dataset(
                'invariance', shape=invariance.shape,
                chunks=invariance.chunksize, dtype=K.floatx())

            time_init = time()
            with ProgressBar(dt=1):
                da.store([mse_matrix, mse_daug, invariance],
                         [mse_matrix_ds, mse_daug_ds, invariance_ds])
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

            invariance = np.ravel(
                np.asarray(output_hdf5[layer_name]['invariance']))
        else:
            time_init = time()
            invariance = da.ravel(invariance).compute()
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

        # Update pandas data frame for plotting
        df_end_idx = df_init_idx + n_images * n_daug_rep
        d = pd.DataFrame(
            {'Layer': layer_name,
             'sample': np.repeat(np.arange(n_images), n_daug_rep),
             'n_daug': np.tile(np.arange(n_daug_rep), n_images),
             'invariance': invariance},
            index=np.arange(df_init_idx, df_end_idx).tolist())
        # DataFrame.append is deprecated in recent pandas; concatenate instead
        df = pd.concat([df, d])
        # advance the index to the end of the rows just added (the original
        # `df_init_idx += df_end_idx` double-counted the offset)
        df_init_idx = df_end_idx

    pickle.dump(df, open(output_pickle, 'wb'))

    output_hdf5.close()