def __init__(self, filename):
    """
    Construct a GLC unpickler.

    Parameters
    ----------
    filename : Name of the file to read from. The file can be a GLC pickle
               file, a cloud pickle file, or a python pickle file.

    Returns
    ----------
    GLC unpickler.
    """
    self.gl_object_memo = {}
    self.pickle_filename = None
    self.tmp_file = None
    self.file = None
    self.gl_temp_storage_path = _get_tmp_file_location()

    # GLC 1.3 used Zipfiles for storing the objects.
    self.directory_mode = True

    # Remote archives (S3 / HDFS) are first downloaded to a local temp file.
    if _file_util.is_s3_path(filename):
        self.tmp_file = _get_temp_filename()
        # GLC 1.3 uses zipfiles
        if _file_util._is_valid_s3_key(filename):
            _file_util.download_from_s3(
                filename, self.tmp_file,
                aws_credentials=_get_aws_credentials(),
                is_dir=False, silent=True)
        # GLC 1.4 uses directories
        else:
            _file_util.download_from_s3(
                filename, self.tmp_file,
                aws_credentials=_get_aws_credentials(),
                is_dir=True, silent=True)
        filename = self.tmp_file
    elif _file_util.is_hdfs_path(filename):
        self.tmp_file = _get_temp_filename()
        _file_util.download_from_hdfs(filename, self.tmp_file)
        filename = self.tmp_file
    else:
        if not _os.path.exists(filename):
            raise IOError('%s is not a valid file name.' % filename)

    # GLC 1.3 Pickle file
    if _zipfile.is_zipfile(filename):
        self.directory_mode = False
        pickle_filename = None

        # Get the pickle file name.
        zf = _zipfile.ZipFile(filename, allowZip64=True)
        # BUG FIX: the archive handle was previously never closed (a leak,
        # including on the IOError raise path below); close it in finally.
        try:
            for info in zf.infolist():
                if info.filename == 'pickle_file':
                    pickle_filename = zf.read(info.filename)
            if pickle_filename is None:
                raise IOError(("Cannot pickle file of the given format. File"
                        " must be one of (a) GLPickler archive, "
                        "(b) Cloudpickle archive, or (c) python pickle archive."))

            # Extract the zip file. Extraction errors are reported but not
            # fatal here; opening the pickle file below will surface any
            # genuinely missing data.
            try:
                outpath = self.gl_temp_storage_path
                zf.extractall(outpath)
            except IOError as err:
                print("Graphlab pickle extraction error: %s " % err)
        finally:
            zf.close()

        self.pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                             pickle_filename)
    # GLC Pickle directory mode.
    elif _os.path.isdir(filename):
        self.directory_mode = True
        pickle_filename = _os.path.join(filename, "pickle_archive")
        if not _os.path.exists(pickle_filename):
            raise IOError("Corrupted archive: Missing pickle file %s."
                          % pickle_filename)
        if not _os.path.exists(_os.path.join(filename, "version")):
            raise IOError("Corrupted archive: Missing version file.")
        self.pickle_filename = pickle_filename
        self.gl_temp_storage_path = _os.path.abspath(filename)
    # Pure pickle file.
    else:
        self.directory_mode = False
        self.pickle_filename = filename

    self.file = open(self.pickle_filename, 'rb')
    _pickle.Unpickler.__init__(self, self.file)
def __init__(self, filename):
    """
    Construct a GLC unpickler.

    Parameters
    ----------
    filename : Name of the file to read from. The file can be a GLC pickle
               file, a cloud pickle file, or a python pickle file.

    Returns
    ----------
    GLC unpickler.
    """
    self.gl_object_memo = {}
    self.pickle_filename = None
    self.tmp_file = None
    self.file = None
    self.gl_temp_storage_path = _get_tmp_file_location()

    # GLC 1.3 used Zipfiles for storing the objects; assume directory
    # layout until a zip archive is detected below.
    self.directory_mode = True

    # Stage remote archives into a local temporary file first.
    if _file_util.is_s3_path(filename):
        self.tmp_file = _get_temp_filename()
        # GLC 1.3 wrote plain zipfile keys; GLC 1.4 writes directories.
        single_file = _file_util._is_valid_s3_key(filename)
        _file_util.download_from_s3(
            filename, self.tmp_file,
            aws_credentials=_get_aws_credentials(),
            is_dir=not single_file,
            silent=True)
        filename = self.tmp_file
    elif _file_util.is_hdfs_path(filename):
        self.tmp_file = _get_temp_filename()
        _file_util.download_from_hdfs(filename, self.tmp_file)
        filename = self.tmp_file
    elif not _os.path.exists(filename):
        raise IOError('%s is not a valid file name.' % filename)

    if _zipfile.is_zipfile(filename):
        # GLC 1.3 Pickle file
        self.directory_mode = False

        archive = _zipfile.ZipFile(filename, allowZip64=True)
        # The archive names its pickle member in a 'pickle_file' entry.
        inner_name = None
        for entry in archive.infolist():
            if entry.filename == 'pickle_file':
                inner_name = archive.read(entry.filename)
        if inner_name is None:
            raise IOError(("Cannot pickle file of the given format. File"
                    " must be one of (a) GLPickler archive, "
                    "(b) Cloudpickle archive, or (c) python pickle archive."))

        # Extract everything into the temporary storage area.
        try:
            archive.extractall(self.gl_temp_storage_path)
        except IOError as err:
            print("Graphlab pickle extraction error: %s " % err)

        self.pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                             inner_name)
    elif _os.path.isdir(filename):
        # GLC Pickle directory mode.
        self.directory_mode = True
        archive_pickle = _os.path.join(filename, "pickle_archive")
        if not _os.path.exists(archive_pickle):
            raise IOError("Corrupted archive: Missing pickle file %s."
                          % archive_pickle)
        if not _os.path.exists(_os.path.join(filename, "version")):
            raise IOError("Corrupted archive: Missing version file.")
        self.pickle_filename = archive_pickle
        self.gl_temp_storage_path = _os.path.abspath(filename)
    else:
        # Pure pickle file.
        self.directory_mode = False
        self.pickle_filename = filename

    self.file = open(self.pickle_filename, 'rb')
    _pickle.Unpickler.__init__(self, self.file)
def __init__(self, filename, protocol=-1, min_bytes_to_save=0):
    """
    Construct a GLC pickler.

    Parameters
    ----------
    filename : Name of the file to write to. This file is all you need to
               pickle all objects (including GLC objects).

    protocol : Pickle protocol (see pickle docs). Note that all pickle
               protocols may not be compatable with GLC objects.

    min_bytes_to_save : Cloud pickle option (see cloud pickle docs).

    Returns
    ----------
    GLC pickler.
    """
    # Archive format history:
    #
    # Zipfile
    # --------
    # Version 1: GLC 1.2.1
    #
    # Directory:
    # ----------
    # Version 1: GLC 1.4: 1
    self.archive_filename = None
    self.gl_temp_storage_path = _get_tmp_file_location()
    self.gl_object_memo = set()
    self.mark_for_delete = set()

    if _file_util.is_s3_path(filename):
        self.s3_path = filename
        self.hdfs_path = None
        # BUG FIX: hadoop_conf_dir was previously initialized only on the
        # HDFS and local branches, leaving S3 picklers without the
        # attribute.
        self.hadoop_conf_dir = None
    elif _file_util.is_hdfs_path(filename):
        self.s3_path = None
        self.hdfs_path = filename
        self.hadoop_conf_dir = None
    else:
        # Local target: the archive is a directory, so make sure one
        # exists at `filename`.
        filename = _os.path.abspath(filename)
        if not _os.path.exists(filename):
            _os.makedirs(filename)
        elif _os.path.isdir(filename):
            # Reuse the existing directory, but schedule its stale
            # contents (everything except the archive files we rewrite)
            # for deletion.
            self.mark_for_delete = self._to_abs_path_set(
                _glob.glob(_os.path.join(filename, "*")))
            self.mark_for_delete -= self._to_abs_path_set(
                [_os.path.join(filename, 'pickle_archive'),
                 _os.path.join(filename, 'version')])
        elif _os.path.isfile(filename):
            # A plain file is in the way; replace it with a directory.
            _os.remove(filename)
            _os.makedirs(filename)

        self.gl_temp_storage_path = filename
        self.s3_path = None
        self.hdfs_path = None
        self.hadoop_conf_dir = None

    # The pickle file where all the Python objects are saved.
    relative_pickle_filename = "pickle_archive"
    pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                    relative_pickle_filename)

    try:
        # Initialize the pickle file with cloud _pickle. Note, cloud pickle
        # takes a file handle for initialization.
        self.file = open(pickle_filename, 'wb')
        _cloudpickle.CloudPickler.__init__(self, self.file, protocol)
    except IOError as err:
        print("GraphLab create pickling error: %s" % err)
        # BUG FIX: previously the error was swallowed and construction
        # continued with self.file unset, producing a confusing
        # AttributeError later. Re-raise so callers see the real failure.
        raise

    # Write the version number.
    with open(_os.path.join(self.gl_temp_storage_path, 'version'), 'w') as f:
        f.write("1.0")
def __init__(self, filename, protocol=-1, min_bytes_to_save=0):
    """
    Construct a GLC pickler.

    Parameters
    ----------
    filename : Name of the file to write to. This file is all you need to
               pickle all objects (including GLC objects).

    protocol : Pickle protocol (see pickle docs). Note that all pickle
               protocols may not be compatable with GLC objects.

    min_bytes_to_save : Cloud pickle option (see cloud pickle docs).

    Returns
    ----------
    GLC pickler.
    """
    # Archive version history:
    #   Zipfile   -- Version 1: GLC 1.2.1
    #   Directory -- Version 1: GLC 1.4: 1
    self.archive_filename = None
    self.gl_temp_storage_path = _get_tmp_file_location()
    self.gl_object_memo = set()
    self.mark_for_delete = set()

    if _file_util.is_s3_path(filename):
        self.s3_path = filename
        self.hdfs_path = None
    elif _file_util.is_hdfs_path(filename):
        self.s3_path = None
        self.hdfs_path = filename
        self.hadoop_conf_dir = None
    else:
        # Local archives are stored as a directory; create or reuse it.
        target = _os.path.abspath(filename)
        if not _os.path.exists(target):
            _os.makedirs(target)
        elif _os.path.isdir(target):
            # Anything already in the directory other than the archive
            # files we rewrite is slated for deletion.
            keep = self._to_abs_path_set(
                [_os.path.join(target, 'pickle_archive'),
                 _os.path.join(target, 'version')])
            existing = self._to_abs_path_set(
                _glob.glob(_os.path.join(target, "*")))
            self.mark_for_delete = existing - keep
        elif _os.path.isfile(target):
            # A regular file occupies the target path; swap it for a
            # fresh directory.
            _os.remove(target)
            _os.makedirs(target)

        self.gl_temp_storage_path = target
        self.s3_path = None
        self.hdfs_path = None
        self.hadoop_conf_dir = None

    # All plain Python objects are pickled into this member file.
    pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                    "pickle_archive")

    try:
        # Cloud pickle is initialized from an open file handle.
        self.file = open(pickle_filename, 'wb')
        _cloudpickle.CloudPickler.__init__(self, self.file, protocol)
    except IOError as err:
        print("GraphLab create pickling error: %s" % err)

    # Stamp the archive with its format version.
    version_path = _os.path.join(self.gl_temp_storage_path, 'version')
    with open(version_path, 'w') as version_file:
        version_file.write("1.0")