Example #1
    def doPageRank(self, ranks):
        self.totalResult = None
        self.single_ResultList = []
        self.partitionFile(self.current_WorkingOn_FileName)

        self.setPartitionsToWorker(self.partitions)

        output = StringIO.StringIO()
        pickler = cloudpickle.CloudPickler(output)
        pickler.dump(ranks)
        rankSent = output.getvalue()

        for worker in self.workers:
            try:
                # effectively no timeout: a page-rank round can run long
                c = zerorpc.Client(timeout=9999999)
                c.connect('tcp://' + worker['ip'] + ':' + worker['port'])
                print colored('[Master:]', 'white'), colored(
                    'call worker: %s start working', 'red') % worker['port']
                c.doPageRank(rankSent, async=True)

            except TimeoutExpired:
                continue
            except LostRemote:
                continue
        # busy-wait until a worker pushes the aggregated result back
        while not self.totalResult:
            gevent.sleep(0.5)
        return self.totalResult
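
On the receiving side, the worker presumably reverses the CloudPickler step before computing. A minimal sketch of that handler, assuming a zerorpc-exposed doPageRank on the worker; compute_ranks and self.pName are hypothetical stand-ins, and Example #10 shows the send-back path:

    def doPageRank(self, rankSent):
        # rebuild the ranks object serialized by the master above
        ranks = pickle.Unpickler(StringIO.StringIO(rankSent)).load()
        result = self.compute_ranks(ranks)  # hypothetical ranking logic
        self.sendResultToDriver(result, self.pName)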
Example #2
    def setJob_async(self, pickle_object):

        # unpickle the intermediate RDD shipped by the driver
        buf = StringIO.StringIO(pickle_object)
        unpickler = pickle.Unpickler(buf)
        intermediateRDD = unpickler.load()

        input_filename = intermediateRDD.get_ancester().get_input_filename()

        workerlist = self.create_RDD_workerlist()

        print workerlist
        intermediateRDD.set_params_recv(workerlist)

        output = StringIO.StringIO()
        pickler = cloudpickle.CloudPickler(output)
        pickler.dump(intermediateRDD)
        pickle_object = output.getvalue()

        # save pickle_object for failure handling
        self.current_intermediateRDD = intermediateRDD

        for w in self.workers:
            self.assign_rdd_to_worker(w, pickle_object)

        print "in setjob_async"
        pass
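
assign_rdd_to_worker itself is not shown in this snippet. A plausible sketch under the zerorpc conventions used in these examples; the remote method name set_rdd and the 'ip:port' format of w are assumptions:

    def assign_rdd_to_worker(self, w, pickle_object):
        # fire-and-forget: ship the re-pickled RDD to a single worker
        c = zerorpc.Client(timeout=params.GENERAL_TIMEOUT)
        c.connect("tcp://" + w)
        c.set_rdd(pickle_object, async=True)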
Example #3
def dumps(o, enable_trace=True, use_zlib=False):
    # type: (...) -> bytes
    """For internal use only; no backwards-compatibility guarantees."""
    with _pickle_lock:
        with io.BytesIO() as file:
            pickler = cloudpickle.CloudPickler(file)
            try:
                pickler.dispatch_table[type(flags.FLAGS)] = _pickle_absl_flags
            except NameError:
                pass
            pickler.dump(o)
            s = file.getvalue()

    # Compress as compactly as possible (compresslevel=9) to decrease peak memory
    # usage (of multiple in-memory copies) and to avoid hitting protocol buffer
    # limits.
    # WARNING: Be cautious about compressor change since it can lead to pipeline
    # representation change, and can break streaming job update compatibility on
    # runners such as Dataflow.
    if use_zlib:
        c = zlib.compress(s, 9)
    else:
        c = bz2.compress(s, compresslevel=9)
    del s  # Free up some possibly large and no-longer-needed memory.

    return base64.b64encode(c)
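
dumps only encodes; a decoder runs the same pipeline in reverse. This sketch is the inverse of the framing above (base64, then bz2 or zlib, then pickle), not the library's actual loads:

import base64
import bz2
import pickle
import zlib


def loads_sketch(encoded, use_zlib=False):
    # type: (...) -> object
    """Inverse of dumps above: base64-decode, decompress, unpickle."""
    c = base64.b64decode(encoded)
    s = zlib.decompress(c) if use_zlib else bz2.decompress(c)
    return pickle.loads(s)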
Example #4
    def test_save_unsupported(self):
        sio = StringIO()
        pickler = cloudpickle.CloudPickler(sio, 2)

        with pytest.raises(pickle.PicklingError) as excinfo:
            pickler.save_unsupported("test")

        assert "Cannot pickle objects of type" in str(excinfo.value)
Example #5
def test_cloudpickle_to_file(EN):
    f = tempfile.NamedTemporaryFile(delete=False)
    p = cloudpickle.CloudPickler(f)
    p.dump(EN)
    f.close()
    # reopen in binary mode and make sure the pickle file gets closed
    with open(f.name, 'rb') as pickled:
        loaded_en = cloudpickle.load(pickled)
    os.unlink(f.name)
    doc = loaded_en(unicode('test parse'))
    assert len(doc) == 2
Example #6
    def __execute(self, stage, conn):
        '''
        Send the stage task to the worker and execute this stage on the worker.
        '''
        output = StringIO.StringIO()
        pickler = cloudpickle.CloudPickler(output)
        pickler.dump(stage)
        objstr = output.getvalue()
        conn.run(objstr)
Example #7
def log_query(filename, keyword):
    logfile = TextFile(filename)
    fil = Filter(logfile, lambda x: (keyword in x))
    fil.rdd_collect()

    log_output = StringIO.StringIO()
    pickler = cloudpickle.CloudPickler(log_output)
    pickler.dump(fil)
    return log_output.getvalue()
Example #8
    def dump_plugins(self):
        """Save the plugins into a pickle object."""

        message("Dumping plugins", force=True)

        # use a context manager so the file is closed even on error
        with open(CmdManager.dumped_plugin_path, "wb") as f_handler:
            pick = cloudpickle.CloudPickler(f_handler)
            pick.dump((self.cmd_obj_list, self.parser))
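
Reading the dump back requires only the standard unpickler, since cloudpickle emits ordinary pickle data. A sketch of the hypothetical inverse (load_plugins is an assumption that mirrors dump_plugins above):

    def load_plugins(self):
        """Restore the plugins saved by dump_plugins (hypothetical)."""
        # cloudpickle output loads with the stock pickle module
        with open(CmdManager.dumped_plugin_path, "rb") as f_handler:
            self.cmd_obj_list, self.parser = pickle.load(f_handler)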
Example #9
def word_count(filename):
    textfile = TextFile(filename)
    flat = FlatMap(textfile, lambda x: x.split())
    mapped = Map(flat, lambda x: (x, 1))
    red = ReduceByKey(mapped, lambda x, y: x + y)
    # enable collect so all the data is gathered back at the driver
    red.rdd_collect()

    wc_output = StringIO.StringIO()
    pickler = cloudpickle.CloudPickler(wc_output)
    pickler.dump(red)
    return wc_output.getvalue()
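
word_count returns the pickled lineage as raw bytes, so a consumer can rebuild the RDD graph with the standard unpickler. A minimal usage sketch (file name and variable names are illustrative):

import StringIO
import pickle

payload = word_count('myfile')  # pickled ReduceByKey lineage
rdd = pickle.Unpickler(StringIO.StringIO(payload)).load()
# rdd is a ReduceByKey node whose lineage reaches back to TextFile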
Example #10
    def sendResultToDriver(self, result, pName):
        # plain ints can be sent as-is; anything else is cloudpickled first
        if isinstance(result, int):
            objstr = result
        else:
            output = StringIO.StringIO()
            pickler = cloudpickle.CloudPickler(output)
            pickler.dump(result)
            objstr = output.getvalue()

        c = zerorpc.Client()
        c.connect('tcp://0.0.0.0:4040')
        c.getSingleResult_FromWorker(objstr, self.port, pName, async=True)
Example #11
    def collect(self):
        # 1. send the serialized RDD to each worker
        # 2. workers do the calculation
        # 3. collect all results
        output = StringIO.StringIO()
        pickler = cloudpickle.CloudPickler(output)
        pickler.dump(self)  # pickle the RDD lineage rooted at self
        pickle_object = output.getvalue()

        master_addr = self.master_address

        c = zerorpc.Client(timeout=params.GENERAL_TIMEOUT)
        c.connect("tcp://" + master_addr)
        c.set_job(pickle_object)

        worker_ips = c.result_is_ready()

        final_results = []

        if worker_ips is None:
            print "#############################################"
            print "Insufficient worker(s) to finish partitions "
            print "#############################################"
            return None
        else:
            for w in worker_ips:
                c = zerorpc.Client(timeout=params.GENERAL_TIMEOUT)
                c.connect("tcp://" + w)

                result = c.getResults()
                if isinstance(result, int):
                    # partial results are assumed homogeneous: switch the
                    # accumulator from list to int on the first numeric one
                    if isinstance(final_results, list):
                        final_results = 0
                    final_results += int(result)
                else:
                    final_results += result
            return final_results
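
A short usage sketch for collect, built from the RDD types seen in Examples #9 and #16; the file name and the way master_address gets set are assumptions:

from src.rdd.rdd import *

logfile = TextFile('myfile')
errors = Filter(logfile, lambda line: 'ERROR' in line)
errors.master_address = '127.0.0.1:4242'  # assumed set by the driver/context
results = errors.collect()  # ships the lineage, then gathers worker results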
Example #12
    def setUp(self):
        self.file_obj = StringIO()
        self.cloudpickler = cloudpickle.CloudPickler(self.file_obj, 2)
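
The second argument to CloudPickler is the pickle protocol (2 here). A round-trip test built on this fixture might look like the following sketch; the test method is hypothetical, and loading relies on the stock pickle module being able to read cloudpickle output:

    def test_roundtrip_lambda(self):
        # cloudpickle handles a lambda that the plain pickler rejects
        self.cloudpickler.dump(lambda x: x + 1)
        fn = pickle.loads(self.file_obj.getvalue())
        self.assertEqual(fn(1), 2)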
Example #13
    def ping_worker(self, w, worker_type):
        while True:
            if self.workerState[w] != "LOSS":
                try:
                    if worker_type == WORKER_NORMAL:
                        self.workers[w].ping()
                    else:
                        self.workers_standby[w].ping()
                except Exception:
                    if worker_type == WORKER_NORMAL:
                        self.workerState[w] = "LOSS"
                        print "lost connection"
                        print "old workers:" + str(self.workers)
                        # remove lost worker
                        self.workers.pop(w, None)
                        # select new worker from standby list
                        selected_worker = self.select_worker_from_standby()
                        # pop from standby list
                        c = self.workers_standby.pop(selected_worker, None)
                        if c is not None:
                            # add select_worker to worker list
                            self.workers[selected_worker] = c
                            self.workerState[selected_worker] = "READY"
                            print "new workers: " + str(self.workers)
                            # update workerlist_for_RDD
                            print "old workerlist:" + str(
                                self.workerlist_for_RDD)
                            if self.workerlist_for_RDD is None or len(
                                    self.workerlist_for_RDD) == 0:
                                continue
                            # 1.pop loss worker with index value
                            index = self.workerlist_for_RDD.pop(w[0] + ":" +
                                                                w[1])
                            # 2. assign select_worker with this index
                            self.workerlist_for_RDD[selected_worker[0] + ":" +
                                                    selected_worker[1]] = index
                            print "new workerlist:" + str(
                                self.workerlist_for_RDD)

                            self.current_intermediateRDD.set_params_recv(
                                self.workerlist_for_RDD)
                            output = StringIO.StringIO()
                            pickler = cloudpickle.CloudPickler(output)
                            pickler.dump(self.current_intermediateRDD)
                            pickle_object = output.getvalue()

                            print "####" + str(
                                self.current_intermediateRDD.workerlist)

                            self.set_job_for_single_worker(
                                selected_worker, pickle_object)
                            gevent.spawn(
                                self.ping_worker,
                                (selected_worker[0], selected_worker[1]),
                                WORKER_NORMAL)

                            self.update_RDD_workerlists()

                        else:
                            # There are not enough standby workers; the master
                            # may need to cancel the job and notify the driver.
                            print "########################## should shut down"
                            self.broadcast_threatening()
                        print "new workers:" + str(self.workers)
                        print "standby: " + str(self.workers_standby)
                    else:
                        # The loss of a standby worker can be caused either by
                        # the worker going down or by its removal from the list.
                        pass
                    break
            else:
                break
            gevent.sleep(1)
Example #14
    def controller(self):
        # give workers time to register before the first health check
        gevent.sleep(10)
        while True:
            if self.method != 'repl':
                printList = []
                # iterate over a copy: dead workers are removed from
                # self.workers inside the loop body
                for worker in list(self.workers):

                    try:
                        c = zerorpc.Client(timeout=1)
                        c.connect('tcp://' + worker['ip'] + ':' +
                                  worker['port'])
                        c.ping()
                        printList.append(worker['port'])
                    except TimeoutExpired:
                        # the worker missed its ping: drop it and hand its
                        # unfinished partitions to the surviving workers
                        self.workers.remove(worker)
                        if worker['current_task']:
                            temp = []
                            for p in worker['current_task']:
                                if not p['status']:
                                    temp.append(p)

                            if not self.workers:
                                # no surviving workers are left to take over
                                continue
                            if len(temp) % len(self.workers) == 0:
                                pop_num = len(temp) / len(self.workers)
                            else:
                                pop_num = len(temp) / len(self.workers) + 1

                            for w in self.workers:
                                if w['status'] == StatusEnum.Status_Down:
                                    continue
                                else:
                                    i = 0
                                    partition_List_toSent_inRecovery = []
                                    partitonName_toPrint = []

                                    if len(temp) == 0:
                                        break
                                    else:
                                        while i < pop_num:

                                            temp_p = temp.pop()
                                            partition_List_toSent_inRecovery.append(
                                                temp_p)
                                            partitonName_toPrint.append(
                                                temp_p['partition_name'])

                                            i += 1
                                            if len(temp) == 0:
                                                break
                                        try:

                                            c = zerorpc.Client()
                                            c.connect('tcp://' + w['ip'] +
                                                      ':' + w['port'])

                                            # print w['current_task']
                                            w['status'] = StatusEnum.Status_Working

                                            output = StringIO.StringIO()
                                            pickler = cloudpickle.CloudPickler(
                                                output)
                                            pickler.dump(
                                                partition_List_toSent_inRecovery
                                            )
                                            sent = output.getvalue()

                                            c.getPartitionList(sent)
                                            print colored(
                                                '[Master:]', 'white'
                                            ), colored(
                                                'Sent partitions:%s to worker: %s',
                                                'red') % (partitonName_toPrint,
                                                          w['port'])

                                            worker['current_task'].extend(
                                                partition_List_toSent_inRecovery
                                            )
                                        except LostRemote:
                                            continue
                                        except TimeoutExpired:
                                            continue
                        print colored(
                            '[Master:]',
                            'white'), colored('(%s:%s) down', 'red') % (
                                worker['ip'], worker['port'])
                    except LostRemote:
                        print 'LostRemote event ignored'
                        continue
                # print printList

            gevent.sleep(1)
Example #15
    def setPartitionsToWorker(self, partitions):
        partitions_usedTosetTask = copy.deepcopy(partitions)
        worker_num = len(self.workers)
        partition_num = len(partitions_usedTosetTask)

        # ceiling division: number of partitions each worker should take
        if partition_num % worker_num == 0:
            pop_num = partition_num / worker_num
        else:
            pop_num = partition_num / worker_num + 1

        for worker in self.workers:
            if len(partitions_usedTosetTask) == 0:
                break
            # hand this worker its share of the remaining partitions
            i = 0
            while i < pop_num:
                p = partitions_usedTosetTask.pop()
                worker['current_task'].append(p)
                i += 1
                if len(partitions_usedTosetTask) == 0:
                    break
            worker['status'] = StatusEnum.Status_Working
            partition_List_toSent = []
            p_name_toPrint = []
            for ct in worker['current_task']:
                # only unfinished partitions are (re)sent
                if not ct['status']:
                    partition_List_toSent.append(ct)
                    p_name_toPrint.append(ct['partition_name'])
            try:

                c = zerorpc.Client()
                c.connect('tcp://' + worker['ip'] + ':' + worker['port'])

                output = StringIO.StringIO()
                pickler = cloudpickle.CloudPickler(output)
                pickler.dump(partition_List_toSent)
                sent = output.getvalue()

                print colored(
                    '[Master:]',
                    'white'), colored('Sent partitions:%s to worker: %s',
                                      'red') % (p_name_toPrint, worker['port'])

                c.getPartitionList(sent, async=True)

                output = StringIO.StringIO()
                pickler = cloudpickle.CloudPickler(output)
                pickler.dump(self.rddLineageList)
                objstr = output.getvalue()

                print colored('[Master:]', 'white'), colored(
                    'sent rdd lineage to worker: %s', 'red') % worker['port']

                c.getRddLineage(objstr, async=True)
            except TimeoutExpired:
                print colored(
                    '[Exception:] timeout while sending partitions and RDD lineage to worker!',
                    'red')
                continue
            except LostRemote:
                continue
Example #16
import StringIO

import zerorpc

import cloudpickle
from src.rdd.rdd import *

r = TextFile('myfile')
m = Map(r, lambda s: s.split())
f = Filter(m, lambda a: int(a[1]) > 2)

# serialize the Filter -> Map -> TextFile lineage for shipment over zerorpc
output = StringIO.StringIO()
pickler = cloudpickle.CloudPickler(output)
pickler.dump(f)
objstr = output.getvalue()

c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")

print c.hello(objstr)
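
The script assumes a zerorpc server on port 4242 exposing hello. A minimal sketch of what such a server might look like; calling collect on the unpickled RDD is an assumption (see Example #11):

import StringIO
import pickle

import zerorpc


class RDDServer(object):
    def hello(self, objstr):
        # rebuild the Filter -> Map -> TextFile lineage sent by the client
        rdd = pickle.Unpickler(StringIO.StringIO(objstr)).load()
        return str(rdd.collect())  # 'collect' assumed; see Example #11


s = zerorpc.Server(RDDServer())
s.bind("tcp://0.0.0.0:4242")
s.run()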