Example #1
 def parse(self, line):
     d = {}
     try:
         so = ScanfParser.parse(self, " ".join(line.split()), True)
         for attr, value in zip(self._attrs, so.ungrouped()):
             d[attr] = self._handlers[attr](attr, value) if attr in self._handlers else value
     except ScanfParser.ParseError as e:
         return {}
     return d
Example #2
 def parse(self, line):
   d = {}
   try:
     so = ScanfParser.parse(self, ' '.join(line.split()), True)
     for attr, value in zip(self._attrs, so.ungrouped()):
       d[attr] = self._handlers[attr](attr, value) if attr in self._handlers else value
   except ScanfParser.ParseError as e:
     return {}
   return d
Example #3
 def parse(self, line):
   d = {}
   try:
     so = ScanfParser.parse(self, ' '.join(line.split()), True)
     for attr, value in zip(self._attrs, so.ungrouped()):
       d[attr] = self._handlers[attr](attr, value) if attr in self._handlers else value
   except ScanfParser.ParseError as e:
     if log: log.error('ProcessHandleParser failed: %s' % e)
     return {}
   return d
Example #4
 def parse(self, line):
     d = {}
     try:
         so = ScanfParser.parse(self, ' '.join(line.split()), True)
         for attr, value in zip(self._attrs, so.ungrouped()):
             d[attr] = self._handlers[attr](
                 attr, value) if attr in self._handlers else value
     except ScanfParser.ParseError as e:
         if log: log.error('ProcessHandleParser failed: %s' % e)
         return {}
     return d
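Taken together, the four variants above do the same thing: collapse runs of whitespace, scanf-parse the line, and zip the resulting values against the attribute names, routing each value through an optional per-attribute handler. A minimal, self-contained sketch of just that dict-building step (attribute names and values invented for illustration):

 attrs = ['pid', 'ppid']
 handlers = {'pid': lambda attr, value: int(value)}  # optional post-processing
 values = ['42', '1']                                # stand-in for so.ungrouped()
 d = {attr: handlers[attr](attr, v) if attr in handlers else v
      for attr, v in zip(attrs, values)}
 print(d)  # => {'pid': 42, 'ppid': '1'}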
Example #5
 def __init__(self, attrs, type_map, handlers=None):
   # Avoid the mutable-default-argument pitfall: default to None and
   # substitute an empty handler map here instead.
   self._attrs = attrs
   self._handlers = handlers if handlers is not None else {}
   attr_list = map(type_map.get, attrs)
   ScanfParser.__init__(self, ' '.join(attr_list))
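The constructor builds the scanf format string from a per-attribute type map, which is what lets parse (Examples #1-4) zip each ungrouped value back to its attribute. A hypothetical end-to-end usage, assuming ScanfParser comes from twitter.common.string, that ProcessHandleParser combines this constructor with the parse method above, and that the %d/%f conversions yield int/float values (the attribute names and formats are invented):

 ATTRS = ['pid', 'comm', 'pcpu']
 TYPE_MAP = {'pid': '%d', 'comm': '%s', 'pcpu': '%f'}    # scanf-style formats
 HANDLERS = {'pcpu': lambda attr, value: value / 100.0}  # percent -> fraction
 parser = ProcessHandleParser(ATTRS, TYPE_MAP, HANDLERS)
 parser.parse('123  bash  4.5')  # => {'pid': 123, 'comm': 'bash', 'pcpu': 0.045}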
Example #6
class HDFSHelper(object):
    """
  This Class provides a set of functions for hdfs operations.
  NOTE: This class assumes a local hdfs or hadoop client on the path.
  """
    class InternalError(Exception):
        pass

    PARSER = ScanfParser(
        '%(mode)s %(dirents)s %(user)s %(group)s %(filesize)d '
        '%(year)d-%(month)d-%(day)d %(hour)d:%(minute)d')

    def __init__(self,
                 config,
                 command_class=CommandUtil,
                 heap_limit=Amount(256, Data.MB),
                 use_hadoop_v1=False):
        """
    heap_limit is the maximum heap that should be allocated to the command process,
    defined using twitter.common.quantity.Data.

    use_hadoop_v1 sets the command to hadoop instead of hdfs.
    """
        if not os.path.isdir(config):
            raise ValueError('Command requires root of a config tree')
        self._config = config
        self._cmd_class = command_class
        if heap_limit is None:
            raise ValueError(
                'The hdfs heap_limit must not be specified as "None".')
        self._heap_limit = heap_limit
        self.cli_command = 'hdfs'
        if use_hadoop_v1:
            self.cli_command = 'hadoop'
        if self._cmd_class.execute_suppress_stdout_stderr(
                self.cli_command) != 0:
            raise OSError(
                'The "{0}" utility is not available on the system PATH'.format(
                    self.cli_command))

    @property
    def config(self):
        return self._config

    def _call(self, cmd, *args, **kwargs):
        """Runs fs command with the given command and args.
    Checks the result of the call by default but this can be disabled with check=False.
    """
        cmd = [self.cli_command, '--config', self._config, 'dfs', cmd
               ] + list(args)
        heapsize = str(int(self._heap_limit.as_(Data.MB)))
        with environment_as(HADOOP_HEAPSIZE=heapsize):
            if kwargs.get('check'):
                return self._cmd_class.check_call(cmd)
            elif kwargs.get('return_output'):
                return self._cmd_class.execute_and_get_output(cmd)
            elif kwargs.get('suppress_output'):
                return self._cmd_class.execute_suppress_stdout(cmd)
            else:
                return self._cmd_class.execute(cmd)

    def get(self, src, dst):
        """
    Copy file(s) in HDFS to local path (via proxy if necessary).
    NOTE: If src matches multiple files, make sure dst is a directory!
    """
        if isinstance(src, list):
            hdfs_src = " ".join(src)
        else:
            hdfs_src = src
        return self._call('-get', hdfs_src, dst)

    def put(self, src, dst):
        """
     Copy the local file src to an HDFS path dst.
    """
        abs_src = os.path.expanduser(src)
        assert os.path.exists(
            abs_src), 'File does not exist, cannot copy: %s' % abs_src
        return self._do_put(abs_src, dst)

    def _do_put(self, source, dst):
        """
     Put the local file into HDFS.
    """
        if isinstance(dst, list):
            hdfs_dst = " ".join(dst)
        else:
            hdfs_dst = dst
        if not self._call('-test', '-e', hdfs_dst, check=False):
            self._call('-rm', '-skipTrash', hdfs_dst)
        return self._call('-put', source, hdfs_dst)

    def exists(self, path, flag='-e'):
        """
     Checks if the path exists in HDFS.
     Returns True if it exists, False otherwise.
    """
        try:
            return self._call("-test", flag, path) == 0
        except subprocess.CalledProcessError:
            return False

    def cat(self, remote_file_pattern, local_file=sys.stdout):
        """
     Cat an HDFS file to a local file (stdout by default).
    """
        return self._call("-cat",
                          remote_file_pattern,
                          also_output_to_file=local_file)

    def _ls(self, path, is_dir=False, is_recursive=False):
        """
     Return a list of [hdfs_full_path, filesize] entries.
     Raises an exception when the HDFS ls command returns an error.
    """
        hdfs_cmd = '-lsr' if is_recursive else '-ls'
        (exit_code, ls_result) = self._call(hdfs_cmd, path, return_output=True)
        if exit_code != 0:
            raise self.InternalError(
                "Error occurred. %s.Check logs for details" % ls_result)
        file_list = []
        if ls_result is None:
            return file_list
        lines = ls_result.splitlines()
        for line in lines:
            if line == "" or line.startswith("Found"):
                continue

            seg = line.split(None, 7)
            if len(seg) < 8:
                raise self.InternalError("Invalid hdfs -ls output. [%s]" %
                                         line)

            filename = seg[-1]
            try:
                metadata = self.PARSER.parse(' '.join(seg[0:7]))
            except ScanfParser.ParseError as e:
                raise self.InternalError('Unable to parse hdfs output: %s' % e)
            # seg[0] example: drwxrwx---
            if metadata.mode.startswith('d') != is_dir:
                continue

            file_list.append([filename, metadata.filesize])
        return file_list

    def ls(self, path, is_dir=False):
        """
    Returns list of [hdfs_full_path, filesize]
    If is_dir is true returns only the toplevel directories.
    """
        return self._ls(path, is_dir, False)

    def lsr(self, path, is_dir=False):
        """
    Returns list of [hdfs_full_path, filesize] in recursive manner
    If is_dir is true returns only the directories.
    """
        return self._ls(path, is_dir, True)

    def read(self, filename):
        """
     Return the contents of filename, or None if an error occurred.
    """
        with temporary_file() as fp:
            os.unlink(fp.name)
            if self._call("-copyToLocal", filename, fp.name) == 0:
                with open(fp.name) as f:
                    return f.read()
            else:
                return None

    def write(self, filename, text):
        """
     Writes text to the given filename.
     The file is overwritten if it already exists.
    """
        self._call("-rm", filename)
        with temporary_file() as fp:
            fp.write(text)
            fp.flush()
            return self._call('-copyFromLocal', fp.name, filename)

    def mkdir(self, path):
        """
     Creates a directory. Returns an error if it is already present.
    """
        return self._call("-mkdir", path)

    def mkdir_suppress_err(self, path):
        """
     Creates a directory if it does not exist.
    """
        if not self.exists(path):
            return self.mkdir(path)

    def rm(self, filename):
        """
    Removes a file.
    """
        return self._call("-rm", filename, suppress_output=True)

    def cp(self, src, dest):
        """
    Copies a src file to dest
    """
        return self._call("-cp", src, dest, suppress_output=True)

    def mv(self, src, dest):
        """
     Moves a src file to dest
    """
        return self._call("-mv", src, dest, suppress_output=True)

    def copy_from_local(self, local, remote):
        """
    Copies the file from local to remote
    """
        return self._call("-copyFromLocal",
                          local,
                          remote,
                          suppress_output=True)

    def copy_to_local(self, remote, local):
        """
    Copies the file from remote to local
    """
        return self._call("-copyToLocal", remote, local, suppress_output=True)
Example #7
class ExecutorDetector(object):
    class Error(Exception):
        pass

    class CannotFindRoot(Error):
        pass

    LOG_PATH = 'executor_logs'
    RESOURCE_PATH = 'resource_usage.recordio'
    VARS_PATH = 'executor_vars.json'
    PATTERN = [
        '%(root)s', 'slaves', '%(slave_id)s', 'frameworks', '%(framework_id)s',
        'executors', '%(executor_id)s', 'runs', '%(run)s'
    ]
    EXTRACTOR = ScanfParser(os.path.join(*PATTERN))

    @classmethod
    def find_root(cls, path):
        """Does this path appear to match the executor directory pattern?"""
        def root_from_path(path):
            path = os.path.normpath(path)
            path_vector = path.split(os.path.sep)
            pattern_vector = cls.PATTERN
            if len(path_vector) < len(pattern_vector):
                return None
            for pattern, path_component in zip(reversed(pattern_vector),
                                               reversed(path_vector)):
                if pattern.startswith('%'):
                    continue
                if path_component != pattern:
                    return None
            matched_path = os.path.join(*path_vector[-len(pattern_vector) +
                                                     1:])
            return os.path.normpath(path[:-len(matched_path)])

        while path != os.path.dirname(path):
            root = root_from_path(path)
            if root:
                return root
            path = os.path.dirname(path)

    @classmethod
    def match(cls, path):
        try:
            return cls.EXTRACTOR.parse(path)
        except ScanfParser.ParseError:
            return None

    @classmethod
    def path(cls, result):
        return os.path.join(*cls.PATTERN) % result.groups()

    @classmethod
    def find(cls,
             root,
             slave_id='*',
             framework_id='*',
             executor_id='*',
             run='*'):
        mixins = dict(root=root,
                      slave_id=slave_id,
                      framework_id=framework_id,
                      executor_id=executor_id,
                      run=run)
        return filter(
            None, map(cls.match, glob(os.path.join(*cls.PATTERN) % mixins)))

    def __init__(self, root=None):
        self.root = root or self.find_root(os.getcwd())
        if self.root is None:
            raise self.CannotFindRoot('Not a valid executor root!')

    def __iter__(self):
        for extraction in self.find(root=self.root):
            yield extraction
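A hypothetical walkthrough of the classmethods above (the sandbox path is invented; given PATTERN, find_root walks upward until the slaves/.../runs/... suffix matches and returns what precedes it):

 path = '/var/lib/mesos/slaves/S1/frameworks/F1/executors/E1/runs/latest'
 ExecutorDetector.find_root(path)  # => '/var/lib/mesos'
 result = ExecutorDetector.match(path)
 # result should carry slave_id='S1', framework_id='F1', executor_id='E1',
 # run='latest'; ExecutorDetector.path(result) then reconstructs the path.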
Example #8
File: hdfs.py Project: xianxu/pants
class HDFSHelper(object):
    """
  This class provides a set of functions for Hadoop operations.
  """
    class InternalError(Exception):
        pass

    PARSER = ScanfParser(
        '%(mode)s %(dirents)s %(user)s %(group)s %(filesize)d '
        '%(year)d-%(month)d-%(day)d %(hour)d:%(minute)d')

    def __init__(self, config, command_class=CommandUtil):
        # Point to test hadoop cluster if no config given
        self._config = config
        self._cmd_class = command_class

    @property
    def config(self):
        return self._config

    def _call(self, cmd, *args, **kwargs):
        """Runs hadoop fs command  with the given command and args.
    Checks the result of the call by default but this can be disabled with check=False.
    """
        cmd = ['hadoop', '--config', self._config, 'dfs', cmd] + list(args)
        if kwargs.get('check'):
            return self._cmd_class.check_call(cmd)
        elif kwargs.get('return_output'):
            return self._cmd_class.execute_and_get_output(cmd)
        elif kwargs.get('suppress_output'):
            return self._cmd_class.execute_suppress_stdout(cmd)
        else:
            return self._cmd_class.execute(cmd)

    def get(self, src, dst):
        """
    Copy file(s) in hdfs to local path (via proxy if necessary).
    NOTE: If src matches multiple files, make sure dst is a directory!
    """
        if isinstance(src, list):
            hdfs_src = " ".join(src)
        else:
            hdfs_src = src
        return self._call('-get', hdfs_src, dst)

    def put(self, src, dst):
        """
    Copy the local file src to a hadoop path dst.
    """
        abs_src = os.path.expanduser(src)
        assert os.path.exists(
            abs_src), 'File does not exist, cannot copy: %s' % abs_src
        return self._do_put(abs_src, dst)

    def _do_put(self, source, dst):
        """
     Put the local file into HDFS.
    """
        if isinstance(dst, list):
            hdfs_dst = " ".join(dst)
        else:
            hdfs_dst = dst
        if not self._call('-test', '-e', hdfs_dst, check=False):
            self._call('-rm', '-skipTrash', hdfs_dst)
        return self._call('-put', source, hdfs_dst)

    def exists(self, path, flag='-e'):
        """
     Checks if the path exists in HDFS.
     Returns True if it exists, False otherwise.
    """
        try:
            return self._call("-test", flag, path) == 0
        except subprocess.CalledProcessError:
            return False

    def cat(self, remote_file_pattern, local_file=sys.stdout):
        """
     Cat an HDFS file to a local file (stdout by default).
    """
        return self._call("-cat",
                          remote_file_pattern,
                          also_output_to_file=local_file)

    def _ls(self, path, is_dir=False, is_recursive=False):
        """
     Return a list of [hdfs_full_path, filesize] entries.
     Raises an exception when the hadoop ls command returns an error.
    """
        hdfs_cmd = '-lsr' if is_recursive else '-ls'
        (exit_code, ls_result) = self._call(hdfs_cmd, path, return_output=True)
        if exit_code != 0:
            raise self.InternalError(
                "Error occurred. %s.Check logs for details" % ls_result)
        file_list = []
        if ls_result is None:
            return file_list
        lines = ls_result.splitlines()
        for line in lines:
            if line == "" or line.startswith("Found"):
                continue

            seg = line.split(None, 7)
            if len(seg) < 8:
                raise self.InternalError("Invalid hdfs -ls output. [%s]" %
                                         line)

            filename = seg[-1]
            try:
                metadata = self.PARSER.parse(' '.join(seg[0:7]))
            except ScanfParser.ParseError as e:
                raise self.InternalError('Unable to parse hdfs output: %s' % e)
            # seg[0] example: drwxrwx---
            if metadata.mode.startswith('d') != is_dir:
                continue

            file_list.append([filename, metadata.filesize])
        return file_list

    def ls(self, path, is_dir=False):
        """
    Returns list of [hdfs_full_path, filesize]
    If is_dir is true returns only the toplevel directories.
    """
        return self._ls(path, is_dir, False)

    def lsr(self, path, is_dir=False):
        """
    Returns list of [hdfs_full_path, filesize] in recursive manner
    If is_dir is true returns only the directories.
    """
        return self._ls(path, is_dir, True)

    def read(self, filename):
        """
      Returns the contents of the file, or None if an error occurred.
    """
        tmp_file = tempfile.mktemp()
        if self._call("-copyToLocal", filename, tmp_file) == 0:
            with open(tmp_file, "r") as f:
                text = f.read()
        else:
            text = None
        return text

    def write(self, filename, text):
        """
     Writes text to the given filename.
     The file is overwritten if it already exists.
    """
        self._call("-rm", filename)
        with temporary_file() as fp:
            fp.write(text)
            fp.flush()
            return self._call('-copyFromLocal', fp.name, filename)

    def mkdir(self, path):
        """
     Creates a directory. Returns an error if it is already present.
    """
        return self._call("-mkdir", path)

    def mkdir_suppress_err(self, path):
        """
     Creates a directory if it does not exist.
    """
        if not self.exists(path):
            return self.mkdir(path)

    def rm(self, filename):
        """
    Removes a file.
    """
        return self._call("-rm", filename, suppress_output=True)

    def cp(self, src, dest):
        """
    Copies a src file to dest
    """
        return self._call("-cp", src, dest, suppress_output=True)

    def copy_from_local(self, local, remote):
        """
    Copies the file from local to remote
    """
        return self._call("-copyFromLocal",
                          local,
                          remote,
                          suppress_output=True)

    def copy_to_local(self, remote, local):
        """
    Copies the file from remote to local
    """
        return self._call("-copyToLocal", remote, local, suppress_output=True)