def check_output_uri(uri):
    """
    Make sure the directory that will hold the output ``uri`` exists.

    The parent directory of ``uri`` is created when it is missing.

    Parameters
    ----------
    uri : str
        Path of the output file or directory that is about to be written.

    Raises
    ------
    ValueError
        If the parent directory is still missing after trying to create it.
    """
    dirname = os.path.dirname(uri)
    if not dirname:
        # A bare name such as "out.csv" has no directory component: the
        # output goes to the current directory, so there is nothing to
        # check or create (and an empty path must not be handed to fileio).
        return
    if not fileio.exists(dirname):
        fileio.make_dir(dirname)
        # Re-check: the result of make_dir is not inspected here, so confirm
        # that the directory is really there before the caller writes to it.
        if not fileio.exists(dirname):
            raise ValueError(
                'Output directory does not exist: {}'.format(dirname))
def check_input_uri(uri):
    """
    Verify that every input path named by ``uri`` exists.

    ``uri`` is either a single path or several paths joined by commas.

    Raises
    ------
    ValueError
        For the first path that ``fileio.exists`` reports as missing.
    """
    candidates = uri.split(",") if "," in uri else [uri]
    for candidate in candidates:
        if fileio.exists(candidate):
            continue
        raise ValueError("Input file does not exist: {}".format(candidate))
def check_input_uri(uri):
    """
    Check that the input location(s) given by ``uri`` exist.

    A ``uri`` containing commas is treated as a comma-separated list of
    paths; each one is checked in order.

    Raises
    ------
    ValueError
        Naming the first path for which ``fileio.exists`` is false.
    """
    locations = uri.split(',') if ',' in uri else [uri]
    # Lazily search for the first missing location; stops at the first hit.
    first_missing = next(
        (location for location in locations if not fileio.exists(location)),
        None)
    if first_missing is not None:
        raise ValueError('Input file does not exist: {}'.format(first_missing))
def load_autodetect(cls, path, dtype): """ Load from the given path. This can be anything that spark will read from: local file or HDFS file. It can also be a directory, and spark will read and concatenate them all. """ # Read the file as string # Examine the first 100 lines, and cast if necessary to int, float, or datetime cls._entry(path=path, dtype=dtype) # If the path is a directory, then look for sarray-data file in the directory. # If the path is a file, look for that file # Use type inference to determine the element type. # Passed-in dtype is always str and is ignored. lineage = Lineage.init_array_lineage(path) sc = CommonSparkContext.spark_context() if os.path.isdir(path): res = XRdd(sc.pickleFile(path)) metadata_path = os.path.join(path, '_metadata') with fileio.open_file(metadata_path) as f: dtype = pickle.load(f) lineage_path = os.path.join(path, '_lineage') if fileio.exists(lineage_path): lineage = Lineage.load(lineage_path) else: res = XRdd(sc.textFile(path, use_unicode=False)) dtype = infer_type(res) if dtype != str: if dtype in (list, dict): res = res.map(lambda x: ast.literal_eval(x)) elif dtype is datetime.datetime: res = res.map(lambda x: date_parser.parse(x)) else: res = res.map(lambda x: dtype(x)) return cls(res, dtype, lineage)
def check_output_uri(uri):
    """
    Ensure that the parent directory of the output ``uri`` exists.

    A missing parent directory is created.

    Parameters
    ----------
    uri : str
        Path of the output that is about to be written.

    Raises
    ------
    ValueError
        If the parent directory does not exist even after the attempt to
        create it.
    """
    dirname = os.path.dirname(uri)
    if not dirname:
        # os.path.dirname yields "" for a bare name such as "out.csv".  That
        # means "current directory", which needs no creation, and an empty
        # path must not be passed on to fileio.
        return
    if not fileio.exists(dirname):
        fileio.make_dir(dirname)
        # The outcome of make_dir is not inspected, so verify it explicitly.
        if not fileio.exists(dirname):
            raise ValueError("Output directory does not exist: {}".format(dirname))