Exemplo n.º 1
0
def write(idir, odir, remove, check, verbose):
    """Write Avro files from given input area into HDFS.

    Args:
        idir: local source directory
        odir: destination directory on HDFS (must already exist)
        remove: if True, delete the local file after a successful copy
        check: if True, compare local and HDFS sizes after the copy
        verbose: if True, print progress messages

    Exits the process with status 1 on a missing directory or size mismatch.
    """
    if  not os.path.isdir(idir):
        print("Source area %s does not exists" % idir)
        sys.exit(1)
    if  not hdfs.path.isdir(odir):
        print("Destination area on HDFS %s does not exists" % odir)
        print("Create it first with the following command")
        print("hadoop fs -mkdir %s" % odir)
        sys.exit(1)
    # str.endswith accepts a tuple of suffixes: one call instead of an or-chain
    suffixes = ('.avro', '.avro.gz', '.avro.bz2')
    for name in os.listdir(idir):
        fname = os.path.join(idir, name)
        if  not name.endswith(suffixes):
            if  verbose:
                print("Skip %s" % fname)
            continue
        oname = hdfs_file(odir, name)
        if  hdfs.path.isfile(oname):
            continue  # already migrated
        if  verbose:
            print("Migrate %s to %s" % (fname, oname))
        hdfs.put(fname, oname)
        if  check:
            fsize = os.stat(fname).st_size
            osize = hdfs.stat(oname).st_size
            if  fsize != osize:
                print("Size %s (%s) != %s (%s)" % (fname, fsize, oname, osize))
                sys.exit(1)
        if  remove:
            os.remove(fname)
Exemplo n.º 2
0
def _create_directories(app_id, run_id, param_string, type, sub_type=None):
    """
    Creates directories for an experiment, if Experiments folder exists it will create directories
    below it, otherwise it will create them in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g differential_evolution
        :sub_type: type of sub directory to parent, e.g generation

    Returns:
        The new directories for the yarn-application and for the execution (hdfs_exec_logdir, hdfs_appid_logdir)

    Raises:
        IOError: if neither the Experiments nor the Logs dataset exists in the project
    """

    pyhdfs_handle = get()

    if pyhdfs_handle.exists(project_path() + "Experiments"):
        hdfs_events_parent_dir = project_path() + "Experiments"
    elif pyhdfs_handle.exists(project_path() + "Logs"):
        hdfs_events_parent_dir = project_path() + "Logs/TensorFlow"
        try:
            st = hdfs.stat(hdfs_events_parent_dir)
            # make the shared log dir group writable so other project members can log
            if not bool(st.st_mode & local_stat.S_IWGRP):
                hdfs.chmod(hdfs_events_parent_dir, "g+w")
        except IOError:
            # If this happens then the permission is set correct already since the creator of the /Logs/TensorFlow already set group writable
            pass
    else:
        # BUG FIX: hdfs_events_parent_dir used to be left unbound on this path,
        # producing a confusing NameError below; fail fast with a clear error.
        raise IOError("Neither an Experiments nor a Logs dataset exists in the project")

    hdfs_appid_logdir = hdfs_events_parent_dir + "/" + app_id

    hdfs_run_id_logdir = hdfs_appid_logdir + "/" + type + "/run." + str(run_id)

    # determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_run_id_logdir + "/" + str(
            sub_type) + '/' + str(param_string)
    elif not param_string and not sub_type:
        hdfs_exec_logdir = hdfs_run_id_logdir + '/'
    else:
        hdfs_exec_logdir = hdfs_run_id_logdir + '/' + str(param_string)

    # Need to remove directory if it exists (might be a task retry)
    if pyhdfs_handle.exists(hdfs_exec_logdir):
        pyhdfs_handle.delete(hdfs_exec_logdir, recursive=True)

    # create the new directory
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    # expose the logfile path to the executing task via the environment
    logfile = hdfs_exec_logdir + '/' + 'logfile'
    os.environ['EXEC_LOGFILE'] = logfile

    return hdfs_exec_logdir, hdfs_appid_logdir
Exemplo n.º 3
0
def stat(hdfs_path):
    """
    Performs the equivalent of os.stat() on hdfs_path, returning a StatResult object.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).

    Returns:
        A StatResult object for hdfs_path
    """
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.stat(hdfs_path)
Exemplo n.º 4
0
def stat(hdfs_path, project=None):
    """
    Performs the equivalent of os.stat() on path, returning a StatResult object.

    Args:
        :hdfs_path: If this value is not specified, it will get the path to your project. You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :project: If this value is not specified, it will get the path to your project. If you need to path to another project, you can specify the name of the project as a string.

    Returns:
        StatResult object
    """
    # PEP 8: compare against the None singleton with `is`, not `==`
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.stat(hdfs_path)
Exemplo n.º 5
0
 def chown(self, path, uid, gid):
     '''Change ownership of path, mapping numeric ids to HDFS names.

     A uid or gid of -1 means "leave unchanged" (the FUSE convention),
     so only the supplied ids are translated and forwarded to hdfs.chown.
     '''
     # BUG FIX: dropped the unused `st = hdfs.stat(path)` local -- it only
     # cost an extra round trip and its result was never read.
     kwargs = {}
     if uid != -1:
         kwargs['user'] = self._U2h_u(uid)
     if gid != -1:
         kwargs['group'] = self._G2h_g(gid)
     hdfs.chown(path, **kwargs)
Exemplo n.º 6
0
 def getattr(self, path, fh=None):
     '''Return a stat-like dict for path; raise ENOENT if it does not exist.'''
     if not hdfs.path.exists(path):
         raise FuseOSError(errno.ENOENT)
     st = hdfs.stat(path)
     # HDFS reports the kind separately; fold it into the POSIX mode bits
     mode = st.st_mode
     kind = st.kind.lower()
     if kind == 'directory':
         mode = mode + 16384  # presumably stat.S_IFDIR -- TODO confirm
     if kind == 'file':
         mode = mode + 32768  # presumably stat.S_IFREG -- TODO confirm
     return {
         'st_atime': st.st_atime,
         'st_ctime': st.st_ctime,
         'st_gid': self._h_g2G(st.st_gid),
         'st_mode': mode,
         'st_mtime': st.st_mtime,
         'st_nlink': st.st_nlink,
         'st_size': st.st_size,
         'st_uid': self._h_u2U(st.st_uid),
     }
def upsert_a_file(src_dir, hdfs_tgt_dir, filename, debug):
    """Copy filename from a local directory to HDFS, replacing the HDFS
    copy only when the local file is newer.

    Args:
        src_dir: local source directory
        hdfs_tgt_dir: target directory on HDFS
        filename: name of the file inside src_dir
        debug: 'N' to actually transfer; any other value only prints the
            action that would be taken
    """
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # local file creation time (None if the file is missing/unreadable)
    try:
        src_ctime_int = int(os.path.getctime(src_fname))
    except OSError:
        src_ctime_int = None
    print("src_ctime_int=", src_ctime_int)
    # HDFS target modification time (None if the target does not exist)
    try:
        tgt_stat = hdfs.stat(tgt_fname)
        tgt_mtime = tgt_stat.st_mtime
    except IOError:
        tgt_mtime = None
    print("tgt_mtime=", tgt_mtime)

    # put or rm/put
    try:
        if tgt_mtime is None:
            # no HDFS copy yet: plain put
            if debug == 'N':
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print("DEBUG: put ", src_fname, "to", hdfs_tgt_dir)
        elif src_ctime_int > tgt_mtime:
            # local file is newer: replace the HDFS copy
            if debug == 'N':
                hdfs.rmr(tgt_fname)
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print("DEBUG: replace ", tgt_fname, "by", src_fname)
        else:
            print(tgt_fname, "has a newer mdate:", tgt_mtime, "than", src_fname, ":", src_ctime_int)
    except (IOError, OSError) as e:
        # BUG FIX: the original bound sys.exc_info()[0] -- the exception
        # *class* -- and then read .errno/.strerror from it, which raises
        # AttributeError; bind the exception instance instead.
        print("Error({0}): {1}".format(e.errno, e.strerror))