Example #1
File: init.py Project: brianmartin/Pydra
    def test_iterslicer(self):
        l = [chr(i) for i in range(255)]
        u = u"\u03c0 \u042f \u97f3 \u00e6 \u221e"
        s = "Aye aye, Cap'n."
        t = (True, False, None)
        x = xrange(10)

        for i in l, u, s, t, x:
            ds = DataSource(IterSlicer, i)
            for expected, unpacked in itertools.izip_longest(i, ds.unpack()):
                self.assertEqual(expected, unpacked)
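
For orientation, the pass-through behavior this test pins down can be
sketched as a plain generator. This is an inference from the assertions
above, not Pydra's actual IterSlicer implementation:

# A minimal sketch, assuming IterSlicer simply yields each element of the
# wrapped iterable unchanged (inferred from the izip_longest comparison).
def iter_slice(iterable):
    for item in iterable:
        yield item

# e.g. list(iter_slice('abc')) == ['a', 'b', 'c']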
Example #2
class ParallelTask(Task):
    """
    ParallelTask is a task that can be broken into discrete work units.
    """
    _data_in_progress = {}      # data for workunits currently in progress
    _workunit_count = 0         # count of workunits handed out.  This is used to identify transactions
    _workunit_total = 0
    _workunit_completed = 0     # count of workunits completed
    subtask_key = None          # cached key from subtask

    datasource = None
    """The datasource description."""

    def __init__(self, msg=None):
        Task.__init__(self, msg)
        self._lock = RLock()             # general lock
        self._subtask = None              # subtask that is parallelized
        self._subtask_class = None      # class of subtask
        self._subtask_args = None       # args for initializing subtask
        self._subtask_kwargs = None     # kwargs for initializing subtask

        self.datasource = DataSource(self.datasource)

        self.logger = logging.getLogger('root')

    @property
    def subtask(self):
        """
        Lazily provide the subtask on-demand.

        Note that this will, in all likelihood, return None if requested
        immediately after instantiation. Fill out the subtask fields first.

        This property currently does not check to see whether the arguments to
        the subtask have changed since the last request.
        """

        if not self._subtask and self._subtask_class:
            args = self._subtask_args
            if not args:
                args = tuple()
            kwargs = self._subtask_kwargs
            if not kwargs:
                kwargs = dict()

            subtask = self._subtask_class(*args, **kwargs)
            subtask.parent = self
            self._subtask = subtask
        return self._subtask

    @subtask.setter
    def subtask(self, value):
        """
        Standard setter.

        This setter adds this task as a parent to the provided subtask.
        """

        if value:
            self._subtask = value
            value.parent = self

    def _get_subtask(self, task_path, clean=False):
        """
        Returns the subtask specified by the path.  Overridden to search the
        subtask of this class.  This function lazily loads the subtask if it
        is not already instantiated.

        @param task_path - list of strings that correspond to a task's location
                           within a task hierarchy
        @param clean - whether a new (clean) instance of the task is requested.

        @returns a tuple containing the consumed portion of the task path and
                 the task that matches the request.
        """
        if len(task_path) == 1:
            if task_path[0] == self.__class__.__name__:
                return task_path, self
            else:
                raise TaskNotFoundException("Task not found: %s" % task_path)
        
        # discard old version
        if clean:
            self._subtask = None
        
        #recurse down into the child
        consumed, subtask = self.subtask._get_subtask(task_path[1:])
        return task_path[:2], subtask

    def request_workers(self):
        """
        Create work requests for all planned subtasks.
        
        This function eagerly creates all planned work requests in one shot,
        using `get_work_units()` to create all work units.
        
        More complex `Task` subclasses, like `MapReduceTask`, may employ a
        more sophisticated algorithm that permits cross-worker dependencies.
        """
        
        for data, index in self.get_work_units():
            self.logger.debug('ParallelTask - assigning remote work: key=%s, args=%s'
                % ('--', index))
            self.parent.request_worker(self.subtask.get_key(), {'data': data},
                index)

    def get_work_units(self):
        """
        Yield a series of work units.
        
        This function returns *all* work units, one by one. For each work
        unit, the data of the unit is stored in `_data_in_progress`.
        
        Warning: This method takes the instance lock as needed, but the lock
        is never held across a yield.
        
        :return: tuple(data, index)
        """
        # XXX needs to have a delayable path as well
        slicer = self.datasource.unpack()
        
        while True:
            data = next(slicer)

            # record the workunit as in-progress *before* handing it out, so
            # a fast worker cannot complete it before it has been tracked
            with self._lock:
                index = self._workunit_count
                self._workunit_count += 1
                self._data_in_progress[index] = data

            yield data, index

    def _stop(self):
        """
        Overridden to call stop on all children
        """
        Task._stop(self)
        self.subtask._stop()

    def _work(self, **kwargs):
        """
        Work function overridden to delegate workunits to other Workers.
        """
        # request initial workers
        self.request_workers()
        self.logger.debug('ParallelTask - initial work assigned!')

    def _batch_complete(self, results):
        for workunit, result, failed in results:
            if not failed:
                # pass this workunit's result, not the whole batch
                self._work_unit_complete(result, workunit)

    def _work_unit_complete(self, results, index):
        """
        A work unit completed.  Handles the common bookkeeping of removing the
        data from _data_in_progress, and calls the task-specific
        work_unit_complete(...).

        This method *MUST* hold the lock while it is altering the data lists.
        """
        self.logger.debug('ParallelTask - Work unit completed')
        with self._lock:
            # run the task specific post process
            
            self.work_unit_complete(self._data_in_progress[index], results)
            
            # remove the workunit from _in_progress
            del self._data_in_progress[index]
            
            #check stop flag
            if self.STOP_FLAG:
                self.task_complete(None)
            
            # no data left in progress, release 1 worker.  when there is work in
            # the queue the waiting worker will be selected automatically by
            # the scheduler.  Releasing it must be explicit though.
            if not self._data_in_progress:
                self.logger.debug('ParallelTask - releasing a worker')
                self.get_worker().request_worker_release()
            
            self._workunit_completed += 1
            
            #check for more work
            if not self._data_in_progress:
                #all work is done, call the task specific function to combine the results 
                self.logger.debug('ParallelTask - all workunits complete, calling task post process')
                results = self.work_complete()
                self._complete(results)
                return

    def _worker_failed(self, index):
        """
        A worker failed while working.  Remove the workunit's data from the
        in-progress list.
        """
        self.logger.warning('ParallelTask - Worker failure during workunit')
        with self._lock:
            #remove data from in progress
            del self._data_in_progress[index]

    @staticmethod
    def from_subtask(cls, *args, **kwargs):
        """
        Creates a new ParallelTask with the specified class, args, and
        kwargs as its subtask.
        """
        pt = ParallelTask()
        pt.set_subtask(cls, *args, **kwargs)
        return pt

    def progress(self):
        """
        progress - returns the progress as a number 0-100.

        A parallel task's progress is derived from its workunits:
           COMPLETED_WORKUNITS / TOTAL_WORKUNITS
        """
        total = self._workunit_completed + len(self._data_in_progress)
        
        if total == 0:
            return 0
        
        return 100 * self._workunit_completed / total

    def set_subtask(self, class_, *args, **kwargs):
        """
        Sets the subtask for this ParallelTask.  The class, args, and kwargs
        are stored so that the subtask may be lazily instantiated when needed.
        """
        self._subtask_class = class_
        self._subtask_args = args
        self._subtask_kwargs = kwargs

    def start_subtask(self, task, subtask_key, workunit, kwargs, callback,
                      callback_args):
        """
        Launch a specified subtask.
        
        Only called from the subtask's `Worker`, as the final step before
        actually letting the subtask do its work.
        
        :Parameters:
            task : `Task`
                The subtask instance to be run.
            workunit : dict
                The keyword arguments to be passed to the task.
        """
        # Delayable?
        if self.datasource.delayable:
            workunit["data"] = self.datasource.unpack()
        
        task._start(workunit, callback, callback_args)

    def work_complete(self):
        """
        Method stub called to post-process completion of the task.  This must
        be overridden by users for their task-specific work.
        """
        pass

    def work_unit_complete(self, workunit, results):
        """
        Method stub called to post-process results.  Implement this to add
        automatic post-processing of workunit results.
        
        @param workunit - key and other args sent when assigning the workunit
        @param results - results sent by the completed subtask
        """
        pass
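
A minimal usage sketch of the lazy-subtask API above. `DoubleTask` is a
hypothetical Task subclass invented for illustration, and the Task and
DataSource constructors are assumed to behave as in this file:

# Hypothetical subtask, for illustration only.
class DoubleTask(Task):
    def work(self, data):
        return data * 2

pt = ParallelTask.from_subtask(DoubleTask)  # stores the class; nothing built yet
assert pt._subtask is None                  # lazy: not instantiated on creation
sub = pt.subtask                            # first access instantiates DoubleTask()
assert sub.parent is pt                     # the property wires up the parent link
assert pt.subtask is sub                    # later accesses reuse the same instance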
Example #3
File: init.py Project: brianmartin/Pydra
    def test_recursive(self):
        ds = DataSource(IterSlicer, (IterSlicer, [range(10) for i in range(10)]))
        expected = range(10) * 10
        for i, unpacked in enumerate(ds.unpack()):
            self.assertEqual(expected[i], unpacked)
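
The flattening this test relies on can be sketched as a recursive generator.
This is an inference from the expected output, not Pydra's actual DataSource
code; it assumes a slicer is simply a callable that yields elements of its
input:

# A rough sketch: when the input is itself a (slicer, input) description,
# unpack it first, then apply the outer slicer to each element it yields.
def unpack(source):
    slicer, input_ = source
    if isinstance(input_, tuple) and callable(input_[0]):
        for element in unpack(input_):
            for item in slicer(element):
                yield item
    else:
        for item in slicer(input_):
            yield item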
Example #4
    def test_backend(self):
        ds = DataSource(_TestSlicer, DataSource(SQLBackend, "sqlite3", ":memory:"))
        unpacked = list(ds.unpack())
        self.assertTrue(unpacked[0].connected)