def saveRunData(path, config=None, feas_dict=None, rdd=None, batchSize=10):
    if path[-1] == '/':
        path = path[0:-1]

    # needed when not an s3 path, as the with open context assumes the folder already exists
    if not das_utils.isS3Path(path):
        das_utils.makePath(path)

    if config is not None:
        config_path = path + "/config.ini"
        logging.debug("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, config)

    if rdd is not None:
        logging.debug("Pickle Batch Size: {}".format(batchSize))
        data_path = path + "/data"
        logging.debug("Saving data to directory: {}".format(data_path))
        das_utils.savePickledRDD(data_path, rdd, batchSize=batchSize)

    if feas_dict is not None:
        for key in feas_dict.keys():
            feas_dict[key] = feas_dict[key].value  # this seems redundant, but is actually needed for the accumulator
        logging.info("Feasibility dictionary: {}".format(feas_dict))
        feas_path = path + "/feas_dict.json"
        logging.debug("Saving feas_dict to directory: {}".format(feas_path))
        das_utils.saveJSONFile(feas_path, feas_dict)

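# --- Illustrative usage sketch (not part of the original module) ---
# A minimal, hypothetical driver for the standalone saveRunData above. It assumes pyspark is
# available and that saveRunData and das_utils are importable from the DAS codebase; the paths,
# config section, and RDD contents below are placeholders, not values from the real pipeline.
from configparser import ConfigParser
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

demo_config = ConfigParser()
demo_config.add_section("writer")
demo_config.set("writer", "output_path", "/tmp/das_demo")      # placeholder value

demo_rdd = sc.parallelize([("geocode01", {"count": 5})])       # stand-in for the real node RDD
feas = {"feasible": sc.accumulator(0)}                         # accumulators expose .value, as the loop above expects

saveRunData("/tmp/das_demo/", config=demo_config, feas_dict=feas, rdd=demo_rdd, batchSize=10)
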
def saveRunData(self, path, feas_dict=None, rdd=None):
    self.annotate("saveRunData", verbose=True)
    if path[-1] == '/':
        path = path[0:-1]

    # RDD must be saved first, because it needs an empty prefix.
    if rdd is not None:
        output_datafile_name = os.path.join(path, self.output_datafname)
        if self.overwrite_flag:
            das_utils.clearPath(output_datafile_name)

        # needed when not an s3 path, as the with open context assumes the folder already exists
        if not das_utils.isS3Path(output_datafile_name):
            das_utils.makePath(output_datafile_name)

        output_metadata_file_name = output_datafile_name + "/0_metadata"  # sorts before 'p'
        output_header_file_name = output_datafile_name + "/1_header"      # sorts before 'p' but after '0'
        self.annotate(f"writing RDD to {output_datafile_name}")
        self.saveRDD(output_datafile_name, rdd)

        if self.write_metadata:
            now = datetime.datetime.now().isoformat()
            self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count())
            self.saveHeader(path=output_header_file_name)

        if self.s3cat:
            self.annotate(f"combining {output_datafile_name} with s3cat")
            s3cat.s3cat(output_datafile_name,
                        demand_success=True,
                        suffix=self.s3cat_suffix,
                        verbose=self.s3cat_verbose)
            self.add_output_path(output_datafile_name + self.s3cat_suffix)
        else:
            self.add_output_path(output_datafile_name)

    config_path = os.path.join(path, C.CONFIG_INI)
    self.annotate("Saving config to directory: {}".format(config_path))
    das_utils.saveConfigFile(config_path, self.config)

    if feas_dict is not None:
        for key in feas_dict.keys():
            if hasattr(feas_dict[key], 'value'):
                feas_dict[key] = feas_dict[key].value  # this seems redundant, but is actually needed for the accumulator
        self.log_and_print(f"Feasibility dictionary: {feas_dict}")
        feas_path = os.path.join(path, C.FEAS_DICT_JSON)
        self.annotate(f"Saving feas_dict to directory: {feas_path}")
        das_utils.saveJSONFile(feas_path, feas_dict)

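# --- Why the "0_metadata" / "1_header" names matter (illustrative sketch, not original code) ---
# The comments above note that these names sort before 'p', presumably so that when the files
# under the output prefix are combined in lexicographic name order (e.g., by s3cat), the metadata
# and header precede Spark's part-* files. A self-contained check of that ordering:
parts = ["part-00002", "part-00000", "1_header", "0_metadata", "part-00001"]
assert sorted(parts) == ["0_metadata", "1_header", "part-00000", "part-00001", "part-00002"]
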
def saveNoisyAnswers(self, nodes: __NodeDict__, postfix: str = "") -> None:
    """
    Save RDDs with geonodes as pickle files, by geolevel
    :param nodes: RDD or by-geolevel dictionary of noisy nodes RDDs
    :param postfix: postfix to add to default filename (e.g. "_ms" for minimal_schema run)
    :return:
    """
    for level, nodes_rdd in nodes.items():
        path = self.noisyPath(self.app_id, level, postfix)
        das_utils.savePickledRDD(path, nodes_rdd)

    das_utils.saveConfigFile(os.path.join(self.saveloc, f"{self.app_id}-bylevel_pickled_rdds.config"), self.config)

def saveNoisyAnswers(self, nodes: __NodeDict__, repart_by_parent=True, postfix: str = "") -> None:
    """
    Save RDDs with geonodes as pickle files, by geolevel
    :param repart_by_parent: repartition each RDD by parent geocode before saving
    :param nodes: RDD or by-geolevel dictionary of noisy nodes RDDs
    :param postfix: postfix to add to default filename (e.g. "_ms" for minimal_schema run)
    :return:
    """
    if self.setup.dvs_enabled:
        from programs.python_dvs.dvs import DVS_Singleton
        dvs_singleton = DVS_Singleton()
    else:
        dvs_singleton = None

    noisy_partitions_dict = self.setup.noisy_partitions_dict
    for level, nodes_rdd in nodes.items():
        self.annotate(f"Saving {level}{postfix} noisy measurements")
        path = self.noisyPath(self.app_id, level, postfix)
        num_noisy_parts = noisy_partitions_dict[level]
        rdd2save = nodes_rdd
        if repart_by_parent:
            self.annotate(f"Repartitioning by parent geocode")
            rdd2save = das_utils.partitionByParentGeocode(nodes_rdd, nodes_rdd.getNumPartitions())
        elif num_noisy_parts > 0:
            self.annotate(f"Coalescing noisy measurements to {num_noisy_parts} parts")
            self.annotate(f"NOTE: NOT actually Coalescing noisy measurements to {num_noisy_parts} parts")
            # rdd2save = nodes_rdd.coalesce(num_noisy_parts)
        rdd2save = rdd2save.map(lambda node: node.zipNoisy())
        das_utils.savePickledRDD(path, rdd2save, dvs_singleton=dvs_singleton)

    das_utils.saveConfigFile(os.path.join(self.saveloc, f"{self.app_id}-bylevel_pickled_rdds.config"), self.config)

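# --- Repartition-by-parent-geocode, conceptually (illustrative sketch, not the das_utils helper) ---
# das_utils.partitionByParentGeocode belongs to the DAS codebase and is not reproduced here; the
# generic Spark pattern it presumably follows is to key each node by its parent geocode and use
# partitionBy so sibling geounits land in the same partition. The dict nodes and the
# parent_geocode helper below are hypothetical stand-ins, not the real GeounitNode class.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
demo_nodes = sc.parallelize([{"geocode": "4400701"}, {"geocode": "4400702"}, {"geocode": "4400900"}])

def parent_geocode(node, strip=2):
    # Hypothetical convention: the parent is the geocode with its last `strip` digits dropped.
    return node["geocode"][:-strip]

repartitioned = (demo_nodes
                 .keyBy(parent_geocode)                       # (parent_geocode, node) pairs
                 .partitionBy(demo_nodes.getNumPartitions())  # hash-partition on the parent key
                 .values())                                   # drop the key again before saving
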
def saveRunData(self, path, feas_dict=None, rdd=None):
    self.annotate("saveRunData", verbose=True)
    if path[-1] == '/':
        path = path[0:-1]

    # RDD must be saved first, because it needs an empty prefix.
    if rdd is not None:
        output_datafile_name = os.path.join(path, self.output_datafname)
        if self.overwrite_flag:
            das_utils.clearPath(output_datafile_name)

        # needed when not an s3 path, as the with open context assumes the folder already exists
        if not das_utils.isS3Path(output_datafile_name):
            das_utils.makePath(output_datafile_name)

        output_metadata_file_name = output_datafile_name + "/0_metadata"  # sorts before 'p'
        output_header_file_name = output_datafile_name + "/1_header"      # sorts before 'p' but after '0'
        self.annotate(f"writing RDD to {output_datafile_name}")
        self.saveRDD(output_datafile_name, rdd)

        if self.write_metadata:
            now = datetime.datetime.now().isoformat()
            self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count())
            self.saveHeader(path=output_header_file_name)

        if self.s3cat:
            # If we combine the data with s3cat,
            # note the combined filename in the annotated output, the DFXML file, the DVS object, and do it.
            self.annotate(f"combining {output_datafile_name} with s3cat")

            # Record this with DFXML
            ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_S3CAT,
                          {'output_datafile_name': output_datafile_name,
                           'demand_success': 'True',
                           'suffix': self.s3cat_suffix,
                           'verbose': str(self.s3cat_verbose)})

            self.add_output_path(output_datafile_name + self.s3cat_suffix)
            s3cat.s3_cat(output_datafile_name)
        else:
            # Otherwise just note the prefix in DVS and DFXML
            ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_OUTPUT).text = output_datafile_name + "/"
            self.add_output_path(output_datafile_name + "/")

    config_path = os.path.join(path, f"{self.output_datafname}_{CC.CONFIG_INI}")
    self.annotate("Saving the flattened config to directory: {}".format(config_path))
    das_utils.saveConfigFile(config_path, self.config)

    f = io.StringIO()
    self.config.write(f)
    ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_CONFIG).text = f.getvalue()

    if feas_dict is not None:
        for key in feas_dict.keys():
            if hasattr(feas_dict[key], 'value'):
                feas_dict[key] = feas_dict[key].value  # this seems redundant, but is actually needed for the accumulator
        self.log_and_print(f"Feasibility dictionary: {feas_dict}")
        feas_path = os.path.join(path, f"{self.output_datafname}_{CC.FEAS_DICT_JSON}")
        self.annotate(f"Saving feas_dict to directory: {feas_path}")
        das_utils.saveJSONFile(feas_path, feas_dict)

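# --- The DFXML recording pattern above, in isolation (illustrative sketch) ---
# self.das.dfxml_writer.doc is assumed to be an xml.etree.ElementTree Element; this snippet
# reproduces the same stdlib calls against a throwaway root. The tag names and the S3 path are
# placeholders standing in for the CC.DAS_* constants and the real output prefix.
import xml.etree.ElementTree as ET

doc = ET.Element("dfxml")
ET.SubElement(doc, "das_s3cat", {"output_datafile_name": "s3://bucket/prefix/data",
                                 "demand_success": "True"})
ET.SubElement(doc, "das_config").text = "[section]\nkey = value\n"
print(ET.tostring(doc, encoding="unicode"))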