Example #1
def saveMetadata(self, *, path, now, count):
    """Saves metadata to the requested S3 location. This metadata will then be combined with the contents by the s3cat command"""
    logging.info(f"writing metadata to {path} {now} count={count}")
    with s3open(path, "w", fsync=True) as f:
        f.write("# {}\n".format(
            self.getconfig(C.CLASSIFICATION_LEVEL,
                           section=C.WRITER,
                           default=C.DEFAULT_CLASSIFICATION_LEVEL)))
        f.write("# Created: {}\n".format(now))
        f.write("# Records: {}\n".format(count))
        f.write("# Command line: {}\n".format(sys.executable + " " +
                                              " ".join(sys.argv)))
        f.write("# uid: {}\n".format(os.getuid()))
        f.write("# username: {}\n".format(pwd.getpwuid(os.getuid())[0]))
        f.write("# Boot Time: {}\n".format(
            datetime.datetime.fromtimestamp(
                psutil.boot_time()).isoformat()))
        f.write("# Start Time: {}\n".format(
            datetime.datetime.fromtimestamp(self.das.t0).isoformat()))
        uname = os.uname()
        uname_fields = [
            'os_sysname', 'host', 'os_release', 'os_version', 'arch'
        ]
        # Write one "# name: value" line per uname field.
        for field, value in zip(uname_fields, uname):
            f.write("# {}: {}\n".format(field, value))
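For reference, the metadata block this method writes might look roughly like the following; every value below is made up for illustration (the classification line comes from the writer section of the config), and s3cat later prepends the block to the data parts:

# C_U_I//CENS
# Created: 2020-09-12T14:03:11
# Records: 1048576
# Command line: /usr/bin/python3 driver.py config.ini
# uid: 1001
# username: analyst
# Boot Time: 2020-09-12T08:15:02
# Start Time: 2020-09-12T13:59:40
# os_sysname: Linux
# host: ip-10-0-0-1
# os_release: 4.14.0
# os_version: #1 SMP
# arch: x86_64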
Example #2
def saveJSONFile(path, data, indent=None):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        json.dump(data, savefile, indent=indent)
        savefile.close()
    else:
        with open(path, 'w') as f:
            json.dump(data, f, indent=indent)
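A minimal usage sketch (bucket and payload are hypothetical; isS3Path and the s3 helpers are assumed to be importable from the same module):

# Writes the dict as JSON either to S3 or to the local filesystem.
saveJSONFile("s3://my-bucket/experiments/results.json", {"runs": 3, "status": "ok"}, indent=4)
saveJSONFile("/tmp/results.json", {"runs": 3, "status": "ok"}, indent=4)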
Example #3
def savePickleFile(path, data):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode='wb')
        pickle.dump(data, savefile)
        savefile.close()
    else:
        with open(path, 'wb') as f:
            pickle.dump(data, f)
Example #4
def saveConfigFile(path, config):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        config.write(savefile)
        savefile.close()
    else:
        with open(path, 'w') as f:
            config.write(f)
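A hedged usage sketch (the section, options, and path are hypothetical):

from configparser import ConfigParser

config = ConfigParser()
config["writer"] = {"output_path": "s3://my-bucket/run_0000/out", "s3cat": "1"}
# ConfigParser.write() expects a file-like object, which both branches provide.
saveConfigFile("s3://my-bucket/run_0000/config.ini", config)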
Example #5
def saveConfigFile(path, config):
    # Only save the config file in S3.
    # It's in the DFXML file anyway.
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode="w")
        config.write(savefile)
        savefile.close()
    else:
        logging.warning(f"saveConfigFile: not saving config file to {path}")
Example #6
def saveListAsTextFile(path, thelist, mode="w"):
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode=mode)
        for item in thelist:
            savefile.write("{}\n".format(item))
        savefile.close()
    else:
        with open(path, mode) as f:
            for item in thelist:
                f.write("{}\n".format(item))
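A short usage sketch (paths and items are hypothetical); each item is written on its own line:

saveListAsTextFile("s3://my-bucket/logs/output_paths.txt", ["part-00000.csv", "part-00001.csv"])
# mode="a" appends to an existing local file; the default mode is "w".
saveListAsTextFile("output_paths.txt", ["part-00002.csv"], mode="a")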
Example #7
def loadConfigFile(path):
    config = ConfigParser()
    if isS3Path(path):
        config_file = s3.s3open(path=path, mode="r")
        # read_file() replaces the deprecated readfp()
        config.read_file(config_file)
        config_file.close()
    else:
        with open(path, 'r') as config_file:
            config.read_file(config_file)

    return config
Example #8
def savePickleFile(path, data):
    """Saves data from the DRIVER node to a local file or to S3 as a pickle.
    :param path: where to save the data. Can be a regular path or an s3:// path.
    :param data: the data to save as a pickle.
    """
    path = expandPathRemoveHdfs(path)
    if isS3Path(path):
        savefile = s3.s3open(path=path, mode='wb')
        pickle.dump(data, savefile)
        savefile.close()
    else:
        with open(path, 'wb') as f:
            pickle.dump(data, f)
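A brief usage sketch (the path and object are hypothetical):

nodes = {"geocode": "44007", "syn": [1, 2, 3]}
# Works for both s3:// locations and ordinary local paths.
savePickleFile("s3://my-bucket/run_0000/nodes.pickle", nodes)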
Example #9
def loadConfigFile(path):
    config = ConfigParser()
    # ConfigParser lowercases option names by default;
    # override optionxform so option names keep their original case
    config.optionxform = str
    if isS3Path(path):
        config_file = s3.s3open(path=path, mode="r")
        # config.readfp(config_file)  This is deprecated
        config.read_file(config_file)
        config_file.close()
    else:
        with open(path, 'r') as config_file:
            # config.readfp(config_file)  This is deprecated
            config.read_file(config_file)

    return config
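Usage might look like this (path, section, and option names are hypothetical); because optionxform is overridden, option names keep their original case:

config = loadConfigFile("s3://my-bucket/run_0000/config.ini")
print(config.get("writer", "output_path"))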
Example #10
    def stats_for_s3url(self, s3url):
        """Given an S3 URL in the form s3://bucket/key, get the data, decompress it, parse, 
        and return a list of objects. This is a slow process,
        and we use multithreading to make it run quickly."""

        if self.debug:
            print("Download {}".format(s3url))

        download_data = s3.s3open(s3url, 'rb', cache=self.cache).read()
        if s3url.endswith(".gz"):
            data = gzip.decompress(download_data).decode('utf-8')
        else:
            data = download_data.decode('utf-8')

        if data.startswith("<dfxml>"):
            return extract_dfxml_state(data, debug=self.debug)
        try:
            return self.extract_instance_state(s3url, data)
        except SnipError as e:
            self.vprint("Snip error processing {}:\n{}\n".format(s3url, e))
            return []
Example #11
        filename="convert_hh.log",
        format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s"
    )
    for experiment in EXPERIMENTS:
        invar_loaded = True
        print(
            f'Converting {experiment.type} experiment at: {experiment.folder}')
        for sub_folder in experiment.sub_folders:
            for run_number in range(experiment.runs):
                full_path = f'{experiment.folder}/{sub_folder}/run_000{str(run_number)}'
                config_path = f'{experiment.folder}/{sub_folder}/run_000{str(run_number)}/config.ini'

                print(f'Converting experiment at (full path) {full_path}')
                print(f'Config file located at: {config_path}')

                config_file = s3open(config_path).read()

                print(f'type(config file): {type(config_file)}')
                print(f'config file: {config_file}')

                config = ConfigParser()
                config.read_string(config_file)

                print(
                    f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}'
                )
                output_datafile_name = config.get(CC.WRITER_SECTION,
                                                  CC.OUTPUT_DATAFILE_NAME)
                print(
                    f'section:writer, output_datafile_name: {output_datafile_name}'
                )
Example #12
def loadPickleS3(path):
    loadfile = s3.s3open(path=path, mode="rb")
    contents = pickle.load(loadfile.file_obj)
    return contents
Example #13
def loadJSONS3(path):
    jsonfile = s3.s3open(path=path, mode='r')
    contents = json.load(jsonfile.file_obj)
    return contents
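A hedged round-trip sketch combining the save helpers above with these loaders (paths are hypothetical):

saveJSONFile("s3://my-bucket/meta.json", {"records": 100})
meta = loadJSONS3("s3://my-bucket/meta.json")

savePickleFile("s3://my-bucket/nodes.pickle", [1, 2, 3])
nodes = loadPickleS3("s3://my-bucket/nodes.pickle")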
Example #14
def saveHeader(path, var_list):
    """Saves header to the requested S3 location. This header will then be combined with the contents by the s3cat command"""
    with s3open(path, "w", fsync=True) as f:
        f.write("|".join(var_list))
        f.write("\n")
Example #15
def saveHeader(self, *, path):
    """Saves header to the requested S3 location. This header will then be combined with the contents by the s3cat command"""
    self.annotate(f"writing header to {path}")
    with s3open(path, "w", fsync=True) as f:
        f.write("|".join(self.var_list))
        f.write("\n")
Example #16
def old_main(s3path, config_path):
    print('Beginning of pickle picker')
    logging.info("Beginning of pickle picker")

    spark = SparkSession.builder.getOrCreate()
    files_shipped = False

    logging.basicConfig(
        filename="convert.log",
        format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s"
    )
    invar_loaded = False
    print(f'Source data: {s3path}')
    print(f'Config file located at: {config_path}')

    config = ConfigParser()
    config.read_string(s3open(config_path).read())
    """
    print(f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    output_datafile_name = config.get(CC.WRITER_SECTION, CC.OUTPUT_DATAFILE_NAME)
    print(f'section:writer, output_datafile_name: {output_datafile_name}')
    output_path = f'{experiment.folder}_unpickled/{sub_folder}/run_000{str(run_number)}'
    config.set(CC.WRITER_SECTION, CC.OUTPUT_PATH, output_path)
    config.set(CC.WRITER_SECTION, CC.S3CAT, '1')
    config.set(CC.WRITER_SECTION, CC.S3CAT_SUFFIX, '.csv')
    config.set(CC.WRITER_SECTION, CC.OVERWRITE_FLAG, '0')
    config.set(CC.WRITER_SECTION, CC.WRITE_METADATA, '1')
    config.set(CC.WRITER_SECTION, CC.CLASSIFICATION_LEVEL, 'C_U_I//CENS')
    config.set(CC.WRITER_SECTION, CC.NUM_PARTS, '5000')
    print(f'modified writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    print(f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}')

    # print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')



    print(f"Reading pickled data: {s3path}")
    """

    # Ship the files to spark and get the setup object
    das_stub = DASStub()
    das_stub.t0 = time.time()
    das_stub.output_paths = []
    setup = ds.DASDecennialSetup(config=config, name='setup', das=das_stub)
    setup_data = setup.setup_func()
    nodes_dict_rdd = spark.sparkContext.pickleFile(s3path)
    """
    a_node_dict = nodes_dict_rdd.take(1)[0]
    if not (experiment.type is PERSON):
        if INVAR not in a_node_dict and '_invar' not in a_node_dict:
            if not invar_loaded:
                invar_rdd = spark\
                    .sparkContext\
                    .pickleFile('s3://uscb-decennial-ite-das/users/sexto015/experiments/full_household/Sept12_TestMUD_VA_PLB_Experiment/td001/run_0000/data') \
                    .map(lambda nd: (nd[GEOCODE], nd['_invar']))
                invar_loaded = True
            nodes_dict_rdd = nodes_dict_rdd\
                .map(lambda nd: (nd[GEOCODE], nd[SYN]))\
                .join(invar_rdd)\
                .map(lambda g_sk: {GEOCODE: g_sk[0], SYN: g_sk[1][0], INVAR: g_sk[1][1]})

    # print(nodes_dict_rdd.count())
    # from rdd_like_list import RDDLikeList
    # nodes_dict_rdd = RDDLikeList(nodes_dict_rdd.take(10))

    if experiment.type is PERSON:
        print('Using Person Writer')
        w = NonConvertingMDF2020PersonWriter(config=config, setup=setup_data, name='writer', das=das_stub)
    else:
        print('Using Household Writer')
        w = NonConvertingMDF2020HouseholdWriter(config=config, setup=setup_data, name='writer', das=das_stub)

    print('Writing')
    """

    # calls programs.writer.write() which takes an engine_tuple
    # engine_tuple is (blocknoderdd, feas_dict)
    # w.write((nodes_dict_rdd, None))
    # For testing, just take the first record and print it
    record = nodes_dict_rdd.take(1)
    print("record:", record)