def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        data_file = path.join("train.jsonl.gz")
        self.vocab_source = self._build_vocab(
            dataset=[datapoint["graph"]["node_features"]
                     for datapoint in data_file.read_by_file_suffix()],
            vocab_size=self.max_vocab_size,
        )
        self.vocab_target = self._build_vocab(
            dataset=[datapoint["Target"] for datapoint in data_file.read_by_file_suffix()],
            vocab_size=self.max_vocab_size,
        )
        self._loaded_data[DataFold.TRAIN] = self.__load_data(data_file)
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
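# The loader above relies on a `_build_vocab(dataset=..., vocab_size=...)` helper that is
# not shown here. A minimal sketch of what such a helper might look like, using dpu_utils'
# Vocabulary class; the flattening logic and the `count_threshold` value are assumptions
# for illustration, not the original implementation:
from typing import Iterable, List

from dpu_utils.mlutils import Vocabulary


def _build_vocab(self, dataset: Iterable[List[str]], vocab_size: int) -> Vocabulary:
    # Flatten the per-sample token lists into one token stream and build a
    # frequency-capped vocabulary of at most `vocab_size` entries.
    all_tokens = (token for sample in dataset for token in sample)
    return Vocabulary.create_vocabulary(all_tokens, max_size=vocab_size, count_threshold=1)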
def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")

    if self.metadata == {}:
        metadata_path = path.join("metadata.pkl.gz")
        if metadata_path.exists():
            logger.info(f"Loading metadata from {metadata_path}")
            self._metadata = metadata_path.read_by_file_suffix()
    else:
        logger.warning("Using metadata passed to constructor, not metadata stored with data.")

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
def load_data(self, path: RichPath) -> None:
    # Note that as __load_data produces a generator, we explicitly force loading
    # (and caching) here:
    self._loaded_data[DataFold.TRAIN] = \
        list(self.__load_data(path.join("graphs-train"), DataFold.TRAIN))
    self._loaded_data[DataFold.VALIDATION] = \
        list(self.__load_data(path.join("graphs-valid"), DataFold.VALIDATION))
def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")
    self.load_metadata(path)

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
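# A minimal usage sketch for the loader above. The dataset class name (`JsonLGraphDataset`),
# its `params` argument and the data directory are assumptions for illustration; RichPath
# and DataFold are the types already used throughout these snippets.
from dpu_utils.utils import RichPath

data_path = RichPath.create("data/my_dataset")  # hypothetical dir containing {train,valid,test}.jsonl.gz
dataset = JsonLGraphDataset(params)             # hypothetical dataset class and params dict
dataset.load_data(data_path, folds_to_load={DataFold.TRAIN, DataFold.VALIDATION})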
def __load_data(self, data_dir: RichPath,
                data_fold: DataFold) -> Tuple[List[GraphSample], Dict[str, Any]]:
    if data_fold is None:
        data_fold = DataFold.TRAIN

    if data_fold == DataFold.TRAIN:
        data_name = "train"
    elif data_fold == DataFold.VALIDATION:
        data_name = "valid"
    elif data_fold == DataFold.TEST:
        data_name = "test"
    else:
        raise ValueError("Unknown data fold '%s'" % str(data_fold))
    print(" Loading DEOBFUSCATION %s data from %s." % (data_name, data_dir))

    # If a preprocessed version exists, load it instead of re-tensorising:
    if data_dir.join("%s-saved.pkl.gz" % data_name).is_file():
        read_data = data_dir.join("%s-saved.pkl.gz" % data_name).read_by_file_suffix()
        return read_data["all_graphs"], read_data["properties"]

    all_untensorised = data_dir.join("%s.pkl.gz" % data_name).read_by_file_suffix()

    graphs = all_untensorised["graphs"]
    properties = dict()
    properties["all_user_nodes"] = all_untensorised["name_to_id_mapping"]
    properties["user_defined_nodes_number"] = all_untensorised["total_user_defined_nodes"]
    properties["edge_mapping"] = all_untensorised["edge_name_to_id"]
    properties["__num_labels"] = len(properties["all_user_nodes"])
    properties["__num_edge_types"] = len(properties["edge_mapping"])
    properties["__num_types"] = len(all_untensorised["type_to_id"])

    all_graphs = []
    for i in tqdm(range(len(graphs))):
        old_graph = graphs[i]
        # Only keep graphs that actually contain user-defined nodes:
        if old_graph["user_defined_nodes_number"] > 0:
            all_graphs.append(
                self.create_graph_sample(old_graph,
                                         properties["__num_edge_types"],
                                         len(properties["all_user_nodes"])))

    # Debug output for one sample graph:
    print_graph_number = 2500
    print([all_untensorised["ids_to_names"][x]
           for x in all_graphs[print_graph_number].labels],
          all_graphs[print_graph_number].nodes_mask)
    print(all_graphs[print_graph_number])

    # Cache the tensorised data so subsequent loads can skip this work:
    to_save = dict()
    to_save["all_graphs"] = all_graphs
    to_save["properties"] = properties
    data_dir.join("%s-saved.pkl.gz" % data_name).save_as_compressed_file(to_save)
    print("Saved modified data to %s-saved.pkl.gz" % data_name)

    return all_graphs, properties
def df_to_jsonl(df: pd.DataFrame, RichPath_obj: RichPath, i: int,
                basefilename='codedata') -> str:
    dest_filename = f'{basefilename}_{str(i).zfill(5)}.jsonl.gz'
    RichPath_obj.join(dest_filename).save_as_compressed_file(df.to_dict(orient='records'))
    return str(RichPath_obj.join(dest_filename))
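# A small usage sketch for df_to_jsonl: write one DataFrame chunk as a compressed
# JSON-lines file. The toy records and the output directory are illustrative only.
import pandas as pd
from dpu_utils.utils import RichPath

df = pd.DataFrame([{"code": "def f(): pass", "language": "python"}])  # toy records
out_dir = RichPath.create("output/jsonl")                             # hypothetical output directory
out_dir.make_as_dir()
written_path = df_to_jsonl(df, out_dir, i=0)
print(written_path)  # e.g. output/jsonl/codedata_00000.jsonl.gz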
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str] = None,
              quiet: bool = False) -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters,
                        run_name=run_name,
                        model_save_dir=save_folder,
                        log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path),
                                             is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'],
                           model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'],
                           model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = False

    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths,
                             quiet=quiet, resume=resume)
    return model_path
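# A hedged invocation sketch for run_train. The model class and most hyperparameter keys
# below are assumptions for illustration; only 'run_id' is actually required by the
# logging calls inside the function.
from dpu_utils.utils import RichPath

hypers = {"run_id": "example-run-001", "batch_size": 200}  # hypothetical hypers
model_path = run_train(
    model_class=MyModel,                                   # hypothetical Model subclass
    train_data_path=RichPath.create("tensorised/train"),
    valid_data_path=RichPath.create("tensorised/valid"),
    save_folder="trained_models",
    hyperparameters=hypers,
)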
def __load_data(self, data_dir: RichPath, data_fold: DataFold):
    if data_fold == DataFold.TRAIN:
        data_name = "train"
    elif data_fold == DataFold.VALIDATION:
        data_name = "valid"
    elif data_fold == DataFold.TEST:
        data_name = "test"
    else:
        raise ValueError("Unknown data fold '%s'" % str(data_fold))

    print(" Loading observer %s data from %s." % (data_name, data_dir))
    graph_pickle_data = data_dir.join("%s_data_collection.pickle" % data_name).read_by_file_suffix()

    graph_id_to_edges: Dict[int, List[Tuple[int, int]]] = {}
    graph_id_to_features: Dict[int, List[np.ndarray]] = {}
    graph_id_to_targets: Dict[int, List[np.ndarray]] = {}

    # graph_pickle_data is a list of dictionaries; graph IDs start at 1.
    for graph_id, graph in enumerate(graph_pickle_data, 1):
        graph_id_to_features[graph_id] = []
        graph_id_to_targets[graph_id] = []
        graph_id_to_edges[graph_id] = []
        for node_dict in graph['graph']['nodes']:
            graph_id_to_features[graph_id].append(np.array(list(node_dict.values())))
        for target_dict in graph['actions'].values():
            graph_id_to_targets[graph_id].append(np.array(list(target_dict.values())))
        # Convert source/target nodes from IDs to positions in the node list:
        for edge_dict in graph['graph']['links']:
            src_node, tgt_node = edge_dict['source'], edge_dict['target']
            graph_id_to_edges[graph_id].append((src_node, tgt_node))

    final_graphs = []
    for graph_id in graph_id_to_edges.keys():
        num_nodes = len(graph_id_to_features[graph_id])
        adjacency_lists, type_to_node_to_num_inedges = process_adjacency_lists(
            adjacency_lists=[graph_id_to_edges[graph_id]],
            num_nodes=num_nodes,
            add_self_loop_edges=self.params["add_self_loop_edges"],
            tied_fwd_bkwd_edge_types=self._tied_fwd_bkwd_edge_types,
        )
        final_graphs.append(
            TFAgentsGraphSample(
                adjacency_lists=adjacency_lists,
                type_to_node_to_num_inedges=type_to_node_to_num_inedges,
                node_features=np.array(graph_id_to_features[graph_id]),
                node_targets=np.array(graph_id_to_targets[graph_id]),
            ))
    return final_graphs
def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    # If no folds were specified, load the training, validation and test sets together:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train_token.json"))
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("validate_token.json"))
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test_token.json"))
def load_metadata(self, path: RichPath) -> None:
    """Load the metadata for a dataset (such as vocabularies, names of properties, ...)
    from a path on disk.

    Note: Implementors needing to act on metadata information before loading any actual
    data should override this method.
    """
    if self.metadata == {}:
        metadata_path = path.join("metadata.pkl.gz")
        if metadata_path.exists():
            logger.info(f"Loading metadata from {metadata_path}")
            self._metadata = metadata_path.read_by_file_suffix()
    else:
        logger.warning("Using metadata passed to constructor, not metadata stored with data.")
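# load_metadata reads self.metadata but assigns self._metadata, which assumes the usual
# read-only property pattern on the dataset class. A minimal sketch of that pattern, with
# a constructor signature that is an assumption for illustration:
from typing import Any, Dict, Optional


class GraphDataset:
    def __init__(self, metadata: Optional[Dict[str, Any]] = None):
        # Metadata passed to the constructor wins over metadata stored next to the data.
        self._metadata = metadata if metadata is not None else {}

    @property
    def metadata(self) -> Dict[str, Any]:
        return self._metadata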
def split_many_files(input_dir: RichPath,
                     output_dir: RichPath,
                     train_ratio: float,
                     valid_ratio: float,
                     test_ratio: float,
                     test_only_projects: Set[str]) -> None:
    output_paths = {}  # type: Dict[str, RichPath]
    for split_name in ['train', 'valid', 'test', 'test-only']:
        graph_dir_name_for_split_type = input_dir.basename() + '-' + split_name
        graph_dir_for_split_type = output_dir.join(graph_dir_name_for_split_type)
        output_paths[split_name] = graph_dir_for_split_type
        graph_dir_for_split_type.make_as_dir()

    pool = Pool()
    pool.starmap(split_file,
                 [(f, output_paths, train_ratio, valid_ratio, test_ratio, test_only_projects)
                  for f in input_dir.get_filtered_files_in_dir('*')])
    return None
def __load_data(self, data_dir: RichPath, data_fold: DataFold) -> List[GraphSample]:
    if data_fold == DataFold.TRAIN:
        data_name = "train"
    elif data_fold == DataFold.VALIDATION:
        data_name = "valid"
    elif data_fold == DataFold.TEST:
        data_name = "test"
    else:
        raise ValueError("Unknown data fold '%s'" % str(data_fold))

    print(" Loading PPI %s data from %s." % (data_name, data_dir))
    graph_json_data = data_dir.join("%s_graph.json" % data_name).read_by_file_suffix()
    node_to_features = data_dir.join("%s_feats.npy" % data_name).read_by_file_suffix()
    node_to_labels = data_dir.join("%s_labels.npy" % data_name).read_by_file_suffix()
    node_to_graph_id = data_dir.join("%s_graph_id.npy" % data_name).read_by_file_suffix()
    self.__initial_node_feature_size = node_to_features.shape[-1]
    self.__num_labels = node_to_labels.shape[-1]

    # We read in all the data in two steps:
    #   (1) Read features, labels and insert self-loop edges (edge type 0).
    #       Implicitly, this gives us the number of nodes per graph.
    #   (2) Read all edges, and shift them so that each graph starts with node 0.
    fwd_edge_type = 0
    self.__num_edge_types = 1
    if self.params['add_self_loop_edges']:
        self_loop_edge_type = self.__num_edge_types
        self.__num_edge_types += 1
    if not self.params['tie_fwd_bkwd_edges']:
        bkwd_edge_type = self.__num_edge_types
        self.__num_edge_types += 1

    graph_id_to_graph_data = {}  # type: Dict[int, GraphSample]
    graph_id_to_node_offset = {}
    num_total_nodes = node_to_features.shape[0]
    for node_id in range(num_total_nodes):
        graph_id = node_to_graph_id[node_id]
        # In case we are entering a new graph, note its ID, so that we can normalise
        # everything to start at 0:
        if graph_id not in graph_id_to_graph_data:
            graph_id_to_graph_data[graph_id] = \
                GraphSample(adjacency_lists=[[] for _ in range(self.__num_edge_types)],
                            type_to_node_to_num_incoming_edges=[[] for _ in range(self.__num_edge_types)],
                            node_features=[],
                            node_labels=[])
            graph_id_to_node_offset[graph_id] = node_id
        cur_graph_data = graph_id_to_graph_data[graph_id]
        cur_graph_data.node_features.append(node_to_features[node_id])
        cur_graph_data.node_labels.append(node_to_labels[node_id])
        shifted_node_id = node_id - graph_id_to_node_offset[graph_id]
        if self.params['add_self_loop_edges']:
            cur_graph_data.adjacency_lists[self_loop_edge_type].append(
                (shifted_node_id, shifted_node_id))
            cur_graph_data.type_to_node_to_num_incoming_edges[self_loop_edge_type].append(1)

    # Prepare reading of the edges by setting counters to 0:
    for graph_data in graph_id_to_graph_data.values():
        num_graph_nodes = len(graph_data.node_features)
        graph_data.type_to_node_to_num_incoming_edges[fwd_edge_type] = \
            np.zeros([num_graph_nodes], np.int32)
        if not self.params['tie_fwd_bkwd_edges']:
            graph_data.type_to_node_to_num_incoming_edges[bkwd_edge_type] = \
                np.zeros([num_graph_nodes], np.int32)

    for edge_info in graph_json_data['links']:
        src_node, tgt_node = edge_info['source'], edge_info['target']
        # First, shift node IDs so that each graph starts at node 0:
        graph_id = node_to_graph_id[src_node]
        graph_node_offset = graph_id_to_node_offset[graph_id]
        src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset
        cur_graph_data = graph_id_to_graph_data[graph_id]
        cur_graph_data.adjacency_lists[fwd_edge_type].append((src_node, tgt_node))
        cur_graph_data.type_to_node_to_num_incoming_edges[fwd_edge_type][tgt_node] += 1
        if not self.params['tie_fwd_bkwd_edges']:
            cur_graph_data.adjacency_lists[bkwd_edge_type].append((tgt_node, src_node))
            cur_graph_data.type_to_node_to_num_incoming_edges[bkwd_edge_type][src_node] += 1

    final_graphs = []
    for graph_data in graph_id_to_graph_data.values():
        # numpy-ize:
        adj_lists = []
        for edge_type_idx in range(self.__num_edge_types):
            adj_lists.append(np.array(graph_data.adjacency_lists[edge_type_idx]))
        final_graphs.append(
            GraphSample(adjacency_lists=adj_lists,
                        type_to_node_to_num_incoming_edges=np.array(
                            graph_data.type_to_node_to_num_incoming_edges),
                        node_features=np.array(graph_data.node_features),
                        node_labels=np.array(graph_data.node_labels)))

    return final_graphs
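# The PPI loader above assumes a GraphSample container with the four fields it fills in
# (lists while accumulating, numpy arrays in the final samples). A minimal sketch of a
# compatible definition; the real class may carry additional fields:
from typing import Any, List, NamedTuple


class GraphSample(NamedTuple):
    adjacency_lists: List[Any]
    type_to_node_to_num_incoming_edges: Any
    node_features: Any
    node_labels: Any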
def load_eval_data_from_path(self, path: RichPath) -> Iterable[Any]:
    if path.path == self.default_data_path():
        path = path.join("graphs-test")
    return iter(self.__load_data(path, DataFold.TEST))
def tensorise_data_in_dir(self,
                          input_data_dir: RichPath,
                          output_dir: RichPath,
                          for_test: bool,
                          max_num_files: Optional[int] = None,
                          add_raw_data: bool = False,
                          return_num_original_samples: bool = False) \
        -> Union[List[RichPath], Tuple[List[RichPath], int]]:
    """
    Tensorises data in a directory sample-by-sample, generating "chunk" files of lists of
    tensorised samples that are then consumed in the split_data_into_minibatches pipeline
    to construct minibatches.

    Args:
        input_data_dir: Where to load the raw data from (should come from the extraction
            pipeline).
        output_dir: Where to store the data to.
        for_test: Flag indicating if the data is to be used for testing (which requires
            additional tensorisation steps).
        max_num_files: Maximal number of files to load data from.
        add_raw_data: Flag indicating that the original data should be added to the
            tensorised data.
        return_num_original_samples: Flag indicating that the return value should contain
            the number of samples we tried to load, including those that we discarded
            (e.g., because they were too big).

    Return:
        List of paths to the generated chunk files, or tuple of that list and the number
        of samples loaded (iff return_num_original_samples was set).
    """
    data_files = get_data_files_from_directory(input_data_dir, max_num_files)
    tensorisation_argument_tuples = []
    chunk_paths = []
    for (partition_idx, raw_graph_file_partition) in enumerate(
            partition_files_by_size(data_files, 40 * 1024 * 1024)):
        target_file = output_dir.join("chunk_%04i.pkl.gz" % (partition_idx,))
        tensorisation_argument_tuples.append((raw_graph_file_partition, target_file))
        chunk_paths.append(target_file)

    parsing_result_data = {"num_all_samples": 0, "num_used_samples": 0}
    data_file_parser_fn = make_data_file_parser(type(self),
                                                self.hyperparameters,
                                                self.metadata,
                                                for_test=for_test,
                                                add_raw_data=add_raw_data)

    def received_result_callback(result):
        (num_all_samples, num_used_samples) = result
        parsing_result_data['num_all_samples'] += num_all_samples
        parsing_result_data['num_used_samples'] += num_used_samples

    def finished_callback():
        pass

    run_jobs_in_parallel(tensorisation_argument_tuples,
                         data_file_parser_fn,
                         received_result_callback,
                         finished_callback)

    # Store the metadata we used for this as well, so that we can re-use the results:
    metadata_path = output_dir.join("metadata.pkl.gz")
    metadata_path.save_as_compressed_file({
        "hyperparameters": self.hyperparameters,
        "metadata": self.__metadata,
        "num_used_samples": parsing_result_data['num_used_samples'],
        "num_all_samples": parsing_result_data['num_all_samples'],
    })

    self.train_log("Tensorised %i (%i before filtering) samples from '%s' into '%s'."
                   % (parsing_result_data['num_used_samples'],
                      parsing_result_data['num_all_samples'],
                      input_data_dir,
                      output_dir))

    if return_num_original_samples:
        return chunk_paths, parsing_result_data['num_all_samples']
    return chunk_paths
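# Chunk files and the metadata file are written with save_as_compressed_file, so they can
# be read back with read_by_file_suffix. A small inspection sketch; the paths are
# illustrative and the chunk contents are assumed to be a list of tensorised samples:
from dpu_utils.utils import RichPath

metadata = RichPath.create("tensorised/train/metadata.pkl.gz").read_by_file_suffix()
print(metadata["num_used_samples"], "of", metadata["num_all_samples"], "samples kept")
chunk = RichPath.create("tensorised/train/chunk_0000.pkl.gz").read_by_file_suffix()
print(type(chunk), len(chunk))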
def load_eval_data_from_path(self, path: RichPath) -> Iterable[Any]:
    if path.path == self.default_data_path():
        path = path.join("test.jsonl.gz")
    return self.__load_data(path)
def load_data(self, path: RichPath) -> None:
    self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
    self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
def __load_data(self, data_dir: RichPath, data_fold: DataFold) -> List[PPIGraphSample]:
    """
    The PPI dataset consists of four kinds of files:
        XXX_feats.npy, XXX_graph.json, XXX_graph_id.npy, XXX_labels.npy
    read_by_file_suffix() reads .npy, .json, .pkl, .jsonl and similar file types.
    """
    if data_fold == DataFold.TRAIN:
        data_name = "train"
    elif data_fold == DataFold.VALIDATION:
        data_name = "valid"
    elif data_fold == DataFold.TEST:
        data_name = "test"
    else:
        raise ValueError("Unknown data fold '%s'" % str(data_fold))

    print(" Loading PPI %s data from %s." % (data_name, data_dir))
    graph_json_data = data_dir.join("%s_graph.json" % data_name).read_by_file_suffix()
    node_to_features = data_dir.join("%s_feats.npy" % data_name).read_by_file_suffix()
    node_to_labels = data_dir.join("%s_labels.npy" % data_name).read_by_file_suffix()
    node_to_graph_id = data_dir.join("%s_graph_id.npy" % data_name).read_by_file_suffix()

    # We read in all the data in two steps:
    #   (1) Read features and labels. Implicitly, this gives us the number of nodes per
    #       graph. (Each node has a 50-dimensional feature vector and 121 labels.)
    #   (2) Read all edges, and shift them so that each graph starts with node 0.

    # Collect the edges, features and labels of every node, grouped per graph:
    graph_id_to_edges: Dict[int, List[Tuple[int, int]]] = {}
    graph_id_to_features: Dict[int, List[np.ndarray]] = {}
    graph_id_to_labels: Dict[int, List[np.ndarray]] = {}
    graph_id_to_node_offset: Dict[int, int] = {}
    num_total_nodes = node_to_features.shape[0]
    for node_id in range(num_total_nodes):
        graph_id = node_to_graph_id[node_id]
        # In case we are entering a new graph, note its ID, so that we can normalise
        # everything to start at 0:
        if graph_id not in graph_id_to_edges:
            graph_id_to_edges[graph_id] = []
            graph_id_to_features[graph_id] = []
            graph_id_to_labels[graph_id] = []
            graph_id_to_node_offset[graph_id] = node_id
        graph_id_to_features[graph_id].append(node_to_features[node_id])
        graph_id_to_labels[graph_id].append(node_to_labels[node_id])

    for edge_info in graph_json_data["links"]:
        src_node, tgt_node = edge_info["source"], edge_info["target"]
        # First, shift node IDs so that each graph starts at node 0:
        graph_id = node_to_graph_id[src_node]
        graph_node_offset = graph_id_to_node_offset[graph_id]
        src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset
        graph_id_to_edges[graph_id].append((src_node, tgt_node))

    final_graphs = []
    for graph_id in graph_id_to_edges.keys():
        num_nodes = len(graph_id_to_features[graph_id])
        adjacency_lists, type_to_node_to_num_inedges = process_adjacency_lists(
            adjacency_lists=[graph_id_to_edges[graph_id]],
            num_nodes=num_nodes,
            add_self_loop_edges=self.params["add_self_loop_edges"],
            tied_fwd_bkwd_edge_types=self._tied_fwd_bkwd_edge_types,
        )
        final_graphs.append(
            PPIGraphSample(
                adjacency_lists=adjacency_lists,
                type_to_node_to_num_inedges=type_to_node_to_num_inedges,
                node_features=np.array(graph_id_to_features[graph_id]),
                node_labels=np.array(graph_id_to_labels[graph_id]),
            ))
    return final_graphs
def load_data(cls, data_dir: RichPath, data_fold: str) -> List[PPIGraphSample]:
    print(" Loading PPI %s data from %s." % (data_fold, data_dir))
    graph_json_data = data_dir.join("%s_graph.json" % data_fold).read_by_file_suffix()
    node_to_features = data_dir.join("%s_feats.npy" % data_fold).read_by_file_suffix()
    node_to_labels = data_dir.join("%s_labels.npy" % data_fold).read_by_file_suffix()
    node_to_graph_id = data_dir.join("%s_graph_id.npy" % data_fold).read_by_file_suffix()

    # We read in all the data in two steps:
    #   (1) Read features, labels. Implicitly, this gives us the number of nodes per graph.
    #   (2) Read all edges, and shift them so that each graph starts with node 0.
    fwd_edge_type = 0
    graph_id_to_graph_data: Dict[int, PPIGraphSample] = {}
    graph_id_to_node_offset: Dict[int, int] = {}
    num_total_nodes = node_to_features.shape[0]
    for node_id in range(num_total_nodes):
        graph_id = node_to_graph_id[node_id]
        # In case we are entering a new graph, note its ID, so that we can normalise
        # everything to start at 0:
        if graph_id not in graph_id_to_graph_data:
            graph_id_to_graph_data[graph_id] = PPIGraphSample(
                adjacency_lists=[[]],
                node_features=[],
                node_labels=[],
            )
            graph_id_to_node_offset[graph_id] = node_id
        cur_graph_data = graph_id_to_graph_data[graph_id]
        cur_graph_data.node_features.append(node_to_features[node_id])
        cur_graph_data.node_labels.append(node_to_labels[node_id])

    for edge_info in graph_json_data["links"]:
        src_node, tgt_node = edge_info["source"], edge_info["target"]
        # First, shift node IDs so that each graph starts at node 0:
        graph_id = node_to_graph_id[src_node]
        graph_node_offset = graph_id_to_node_offset[graph_id]
        src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset
        cur_graph_data = graph_id_to_graph_data[graph_id]
        cur_graph_data.adjacency_lists[fwd_edge_type].append((src_node, tgt_node))

    final_graphs = []
    for graph_data in graph_id_to_graph_data.values():
        # numpy-ize:
        adj_lists = [np.array(graph_data.adjacency_lists[fwd_edge_type], dtype=np.int32)]
        final_graphs.append(
            PPIGraphSample(
                adjacency_lists=adj_lists,
                node_features=np.array(graph_data.node_features, dtype=np.float32),
                # np.bool was removed in recent NumPy versions; use the builtin bool dtype:
                node_labels=np.array(graph_data.node_labels, dtype=bool),
            ))
    return final_graphs
def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]):
    np.random.shuffle(data_window)
    for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)):
        out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx))
        out_file.save_as_compressed_file(data_chunk)
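# A small usage sketch for _write_data: shuffle a window of samples and write it out as
# fixed-size compressed chunks. `chunked` is assumed to be more_itertools.chunked (the
# import is not shown in the snippet); the toy samples and output directory are
# illustrative only.
import numpy as np
from dpu_utils.utils import RichPath
from more_itertools import chunked

samples = [{"id": i, "tokens": ["a", "b"]} for i in range(10)]  # toy samples
out_dir = RichPath.create("output/chunks")                      # hypothetical directory
out_dir.make_as_dir()
_write_data(out_dir, window_idx=0, chunk_size=4, data_window=samples)
# -> output/chunks/chunk_0-0.jsonl.gz, chunk_0-1.jsonl.gz, chunk_0-2.jsonl.gz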