def saveMetadata(self, *, path, now, count):
    """Saves metadata to the requested S3 location. This metadata header will
    then be combined with the contents by the s3cat command."""
    logging.info(f"writing metadata to {path} {now} count={count}")
    with s3open(path, "w", fsync=True) as f:
        f.write("# {}\n".format(
            self.getconfig(C.CLASSIFICATION_LEVEL, section=C.WRITER,
                           default=C.DEFAULT_CLASSIFICATION_LEVEL)))
        f.write("# Created: {}\n".format(now))
        f.write("# Records: {}\n".format(count))
        f.write("# Command line: {}\n".format(sys.executable + " " + " ".join(sys.argv)))
        f.write("# uid: {}\n".format(os.getuid()))
        f.write("# username: {}\n".format(pwd.getpwuid(os.getuid())[0]))
        f.write("# Boot Time: {}\n".format(
            datetime.datetime.fromtimestamp(psutil.boot_time()).isoformat()))
        f.write("# Start Time: {}\n".format(
            datetime.datetime.fromtimestamp(self.das.t0).isoformat()))
        uname = os.uname()
        uname_fields = ['os_sysname', 'host', 'os_release', 'os_version', 'arch']
        # os.uname() returns its fields in this order; label each one
        for field, value in zip(uname_fields, uname):
            f.write("# {}: {}\n".format(field, value))
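# Illustrative only: the block above produces a commented metadata header along
# these lines, which s3cat later prepends to the data parts. All values below
# are hypothetical, not taken from an actual run.
#
#   # C_U_I//CENS
#   # Created: 2020-09-12T10:15:00
#   # Records: 1048576
#   # Command line: /usr/bin/python3 run_das.py --config config.ini
#   # uid: 1001
#   # username: das_user
#   # Boot Time: 2020-09-12T08:00:00
#   # Start Time: 2020-09-12T10:00:00
#   # os_sysname: Linux
#   # host: ip-10-0-0-1
#   # os_release: 4.14.0
#   # os_version: #1 SMP
#   # arch: x86_64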
def saveJSONFile(path, data, indent=None):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        json.dump(data, savefile, indent=indent)
        savefile.close()
    else:
        with open(path, 'w') as f:
            json.dump(data, f, indent=indent)
def savePickleFile(path, data):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode='wb')
        pickle.dump(data, savefile)
        savefile.close()
    else:
        with open(path, 'wb') as f:
            pickle.dump(data, f)
def saveConfigFile(path, config):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        config.write(savefile)
        savefile.close()
    else:
        with open(path, 'w') as f:
            config.write(f)
def saveConfigFile(path, config):
    # Only save the config file to S3; it is in the DFXML file anyway.
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        config.write(savefile)
        savefile.close()
    else:
        logging.warning(f"saveConfigFile: not saving config file to {path}")
def saveListAsTextFile(path, thelist, mode="w"): if isS3Path(path): savefile = s3.s3open(path=path, mode=mode) for item in thelist: savefile.write("{}\n".format(item)) savefile.close() else: with open(path, mode) as f: for item in thelist: f.write("{}\n".format(item))
def loadConfigFile(path):
    config = ConfigParser()
    if isS3Path(path):
        config_file = s3.s3open(path=path, mode="r")
        config.readfp(config_file)  # NOTE: readfp() is deprecated; see the read_file() version below
        config_file.close()
    else:
        with open(path, 'r') as config_file:
            config.readfp(config_file)
    return config
def savePickleFile(path, data):
    """Saves data from the DRIVER node to a local file or to S3 as a pickle.

    :param path: where to save the data. Can be a regular path or an s3:// path.
    :param data: the data to save as a pickle.
    """
    path = expandPathRemoveHdfs(path)
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode='wb')
        pickle.dump(data, savefile)
        savefile.close()
    else:
        with open(path, 'wb') as f:
            pickle.dump(data, f)
def loadConfigFile(path):
    config = ConfigParser()
    # ConfigParser lowercases item names by default; setting optionxform to str
    # preserves the original case.
    config.optionxform = str
    if isS3Path(path):
        config_file = s3.s3open(path=path, mode="r")
        config.read_file(config_file)  # config.readfp() is deprecated
        config_file.close()
    else:
        with open(path, 'r') as config_file:
            config.read_file(config_file)  # config.readfp() is deprecated
    return config
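# A minimal round-trip sketch tying saveConfigFile and loadConfigFile together,
# assuming both are importable in the same scope. The bucket path, section name,
# and key are hypothetical. optionxform=str matters here: without it, a key such
# as "OUTPUT_PATH" would come back as "output_path".
def _demo_config_roundtrip():
    cfg = ConfigParser()
    cfg.optionxform = str
    cfg['writer'] = {'OUTPUT_PATH': 's3://my-bucket/out'}  # hypothetical values
    saveConfigFile('s3://my-bucket/run_0000/config.ini', cfg)
    cfg2 = loadConfigFile('s3://my-bucket/run_0000/config.ini')
    assert cfg2.get('writer', 'OUTPUT_PATH') == 's3://my-bucket/out'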
def stats_for_s3url(self, s3url):
    """Given an S3 URL in the form s3://bucket/key, get the data, decompress it,
    parse it, and return a list of objects. This is a slow process, and we use
    multithreading to make it run quickly."""
    if self.debug:
        print("Download {}".format(s3url))
    download_data = s3.s3open(s3url, 'rb', cache=self.cache).read()
    if s3url.endswith(".gz"):
        data = gzip.decompress(download_data).decode('utf-8')
    else:
        data = download_data.decode('utf-8')
    if data.startswith("<dfxml>"):
        return extract_dfxml_state(data, debug=self.debug)
    try:
        return self.extract_instance_state(s3url, data)
    except SnipError as e:
        self.vprint("Snip error processing {}:\n{}\n".format(s3url, e))
        return []
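# The docstring above mentions multithreading. A minimal sketch of how a caller
# could fan stats_for_s3url out over many URLs with a thread pool; the
# gather_stats name, the `collector` object, and max_workers=20 are assumptions,
# not the source's actual driver code.
from concurrent.futures import ThreadPoolExecutor

def gather_stats(collector, s3urls, max_workers=20):
    """Download and parse many S3 objects concurrently. The work is I/O-bound,
    so threads give a real speedup despite the GIL."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        results = list(pool.map(collector.stats_for_s3url, s3urls))
    # Each call returns a list of objects; flatten them into one list.
    return [obj for sublist in results for obj in sublist]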
filename="convert_hh.log", format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s" ) for experiment in EXPERIMENTS: invar_loaded = True print( f'Converting {experiment.type} experiment at: {experiment.folder}') for sub_folder in experiment.sub_folders: for run_number in range(experiment.runs): full_path = f'{experiment.folder}/{sub_folder}/run_000{str(run_number)}' config_path = f'{experiment.folder}/{sub_folder}/run_000{str(run_number)}/config.ini' print(f'Converting experiment at (full path) {full_path}') print(f'Config file located at: {config_path}') config_file = s3open(config_path).read() print(f'type(config file): {type(config_file)}') print(f'config file: {config_file}') config = ConfigParser() config.read_string(config_file) print( f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}' ) output_datafile_name = config.get(CC.WRITER_SECTION, CC.OUTPUT_DATAFILE_NAME) print( f'section:writer, output_datafile_name: {output_datafile_name}' )
def loadPickleS3(path):
    loadfile = s3.s3open(path=path, mode="rb")
    contents = pickle.load(loadfile.file_obj)
    return contents
def loadJSONS3(path):
    jsonfile = s3.s3open(path=path, mode='r')
    contents = json.load(jsonfile.file_obj)
    return contents
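# A minimal round-trip sketch tying the save and load helpers together,
# assuming saveJSONFile and loadJSONS3 are importable in the same scope.
# The bucket path and payload are hypothetical.
def _demo_json_roundtrip():
    payload = {"records": 3, "ok": True}
    saveJSONFile("s3://my-bucket/tmp/payload.json", payload, indent=2)
    assert loadJSONS3("s3://my-bucket/tmp/payload.json") == payload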
def saveHeader(path, var_list):
    """Saves header to the requested S3 location. This header will then be
    combined with the contents by the s3cat command."""
    with s3open(path, "w", fsync=True) as f:
        f.write("|".join(var_list))
        f.write("\n")
def saveHeader(self, *, path):
    """Saves header to the requested S3 location. This header will then be
    combined with the contents by the s3cat command."""
    self.annotate(f"writing header to {path}")
    with s3open(path, "w", fsync=True) as f:
        f.write("|".join(self.var_list))
        f.write("\n")
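# For context: the writer emits a pipe-delimited header object plus many data
# parts, and the s3cat step concatenates them into one deliverable. A schematic
# of the layout this implies; the key names are illustrative, not the source's
# actual output paths.
#
#   s3://bucket/run/out/header        <- written by saveHeader
#   s3://bucket/run/out/part-00000    <- written by the Spark writer
#   s3://bucket/run/out/part-00001
#   ...
#   s3://bucket/run/out.csv           <- produced by s3cat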
def old_main(s3path, config_path):
    print('Beginning of pickle picker')
    logging.info("Beginning of pickle picker")
    spark = SparkSession.builder.getOrCreate()
    files_shipped = False
    logging.basicConfig(
        filename="convert.log",
        format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s"
    )
    invar_loaded = False
    print(f'Source data: {s3path}')
    print(f'Config file located at: {config_path}')
    config = ConfigParser()
    config.read_string(s3open(config_path).read())
    """
    print(f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    output_datafile_name = config.get(CC.WRITER_SECTION, CC.OUTPUT_DATAFILE_NAME)
    print(f'section:writer, output_datafile_name: {output_datafile_name}')
    output_path = f'{experiment.folder}_unpickled/{sub_folder}/run_000{str(run_number)}'
    config.set(CC.WRITER_SECTION, CC.OUTPUT_PATH, output_path)
    config.set(CC.WRITER_SECTION, CC.S3CAT, '1')
    config.set(CC.WRITER_SECTION, CC.S3CAT_SUFFIX, '.csv')
    config.set(CC.WRITER_SECTION, CC.OVERWRITE_FLAG, '0')
    config.set(CC.WRITER_SECTION, CC.WRITE_METADATA, '1')
    config.set(CC.WRITER_SECTION, CC.CLASSIFICATION_LEVEL, 'C_U_I//CENS')
    config.set(CC.WRITER_SECTION, CC.NUM_PARTS, '5000')
    print(f'modified writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    print(f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}')
    # print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')
    print(f"Reading pickled data: {s3path}")
    """
    # Ship the files to spark and get the setup object
    das_stub = DASStub()
    das_stub.t0 = time.time()
    das_stub.output_paths = []
    setup = ds.DASDecennialSetup(config=config, name='setup', das=das_stub)
    setup_data = setup.setup_func()
    nodes_dict_rdd = spark.sparkContext.pickleFile(s3path)
    """
    a_node_dict = nodes_dict_rdd.take(1)[0]
    if not (experiment.type is PERSON):
        if INVAR not in a_node_dict and '_invar' not in a_node_dict:
            if not invar_loaded:
                invar_rdd = spark \
                    .sparkContext \
                    .pickleFile('s3://uscb-decennial-ite-das/users/sexto015/experiments/full_household/Sept12_TestMUD_VA_PLB_Experiment/td001/run_0000/data') \
                    .map(lambda nd: (nd[GEOCODE], nd['_invar']))
                invar_loaded = True
            nodes_dict_rdd = nodes_dict_rdd \
                .map(lambda nd: (nd[GEOCODE], nd[SYN])) \
                .join(invar_rdd) \
                .map(lambda g_sk: {GEOCODE: g_sk[0], SYN: g_sk[1][0], INVAR: g_sk[1][1]})
    # print(nodes_dict_rdd.count())
    # from rdd_like_list import RDDLikeList
    # nodes_dict_rdd = RDDLikeList(nodes_dict_rdd.take(10))
    if experiment.type is PERSON:
        print('Using Person Writer')
        w = NonConvertingMDF2020PersonWriter(config=config, setup=setup_data, name='writer', das=das_stub)
    else:
        print('Using Household Writer')
        w = NonConvertingMDF2020HouseholdWriter(config=config, setup=setup_data, name='writer', das=das_stub)
    print('Writing')
    """
    # calls programs.writer.write() which takes an engine_tuple
    # engine_tuple is (blocknoderdd, feas_dict)
    # w.write((nodes_dict_rdd, None))
    # For testing, just take the first record and print it
    record = nodes_dict_rdd.take(1)
    print("record:", record)