def load_data(self):
    self.original_labels = []

    if (self.original_labels_filename is None or
            not os.path.exists(self.original_labels_filename)):
        raise OSError(
            "original_labels_filename %s does not exist or is None."
            " Please specify path to file with labels. If you don't have "
            "pickle with labels, generate it with preparation_imagenet.py"
            % self.original_labels_filename)
    if (self.count_samples_filename is None or
            not os.path.exists(self.count_samples_filename)):
        raise OSError(
            "count_samples_filename %s does not exist or is None. Please "
            "specify path to file with count of samples. If you don't "
            "have json file with count of samples, generate it with "
            "preparation_imagenet.py" % self.count_samples_filename)
    if (self.samples_filename is None or
            not os.path.exists(self.samples_filename)):
        raise OSError(
            "samples_filename %s does not exist or is None. Please "
            "specify path to file with samples. If you don't "
            "have dat file with samples, generate it with "
            "preparation_imagenet.py" % self.samples_filename)

    with open(self.original_labels_filename, "rb") as fin:
        for lbl in pickle.load(fin):
            self.original_labels.append(lbl)
            self.labels_mapping[int(lbl)] = int(lbl)
    self.info("Labels (min max count): %d %d %d",
              numpy.min(self.original_labels),
              numpy.max(self.original_labels),
              len(self.original_labels))

    with open(self.count_samples_filename, "r") as fin:
        for key, value in json.load(fin).items():
            set_type = {"test": 0, "val": 1, "train": 2}
            self.class_lengths[set_type[key]] = value
    self.info("Class Lengths: %s", str(self.class_lengths))

    if self.total_samples != len(self.original_labels):
        raise error.Bug(
            "Number of labels mismatches sum of class lengths")

    with open(self.matrixes_filename, "rb") as fin:
        matrixes = pickle.load(fin)
    self.mean.mem = matrixes[0]
    self.rdisp.mem = matrixes[1].astype(
        opencl_types.dtypes[root.common.engine.precision_type])
    if numpy.count_nonzero(numpy.isnan(self.rdisp.mem)):
        raise ValueError("rdisp matrix has NaNs")
    if numpy.count_nonzero(numpy.isinf(self.rdisp.mem)):
        raise ValueError("rdisp matrix has Infs")
    if self.mean.shape != self.rdisp.shape:
        raise ValueError("mean.shape != rdisp.shape")
    if self.mean.shape[0] != self.sy or self.mean.shape[1] != self.sx:
        raise ValueError("mean.shape != (%d, %d)" % (self.sy, self.sx))

    self.file_samples = open(self.samples_filename, "rb")
    if (self.file_samples.seek(0, 2) //
            (self.sx * self.sy * self.channels) !=
            len(self.original_labels)):
        raise error.Bug("Wrong data file size")
def load_data(self):
    if (self.original_labels_filename is None or
            not os.path.exists(self.original_labels_filename)):
        raise OSError(
            "original_labels_filename %s does not exist or is None."
            " Please specify path to file with labels. If you don't have "
            "pickle with labels, generate it with preparation_imagenet.py"
            % self.original_labels_filename)
    if (self.count_samples_filename is None or
            not os.path.exists(self.count_samples_filename)):
        raise OSError(
            "count_samples_filename %s does not exist or is None. Please "
            "specify path to file with count of samples. If you don't "
            "have json file with count of samples, generate it with "
            "preparation_imagenet.py" % self.count_samples_filename)
    if (self.samples_filename is None or
            not os.path.exists(self.samples_filename)):
        raise OSError(
            "samples_filename %s does not exist or is None. Please "
            "specify path to file with samples. If you don't "
            "have dat file with samples, generate it with "
            "preparation_imagenet.py" % self.samples_filename)

    with open(self.original_labels_filename, "rb") as fin:
        for lbls in pickle.load(fin):
            txt_lbl, int_lbl = lbls
            self._original_labels_.append(txt_lbl)
            self.labels_mapping[txt_lbl] = int(int_lbl)

    for _ in range(len(self.labels_mapping)):
        self.reversed_labels_mapping.append(None)
    for key, val in self.labels_mapping.items():
        self.reversed_labels_mapping[val] = key

    with open(self.count_samples_filename, "r") as fin:
        for key, value in json.load(fin).items():
            set_type = {"test": 0, "val": 1, "train": 2}
            self.class_lengths[set_type[key]] = value
    self.info("Class Lengths: %s", str(self.class_lengths))

    for lbl in self._original_labels_[self.class_lengths[0] +
                                      self.class_lengths[1]:]:
        self._train_different_labels_[lbl] += 1

    if self.total_samples != len(self._original_labels_):
        raise error.Bug(
            "Number of labels mismatches sum of class lengths")

    self._file_samples_ = open(self.samples_filename, "rb")
    number_of_samples = (self._file_samples_.seek(0, 2) //
                         (self.sx * self.sy * self.channels))
    if number_of_samples != len(self._original_labels_):
        raise error.Bug(
            "Wrong data file size: %s (original data) != %s (original "
            "labels)" % (number_of_samples, len(self._original_labels_)))
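# Illustration only (not part of the loader): the two metadata files read
# above are assumed to have roughly this layout -- a JSON dict keyed by set
# name and a pickled sequence of (text_label, int_label) pairs. File names
# and label values here are hypothetical; the real files are produced by
# preparation_imagenet.py.
def _write_example_metadata():
    import json
    import pickle
    with open("count_samples.json", "w") as fout:
        json.dump({"test": 0, "val": 50, "train": 950}, fout)
    with open("original_labels.pickle", "wb") as fout:
        pickle.dump([("n01440764", 0), ("n01443537", 1)], fout)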
def class_index_by_sample_index(self, index):
    for class_index, class_offset in enumerate(
            self.effective_class_end_offsets):
        if index < class_offset:
            return class_index, class_offset - index
    raise error.Bug("Could not convert sample index to class index, "
                    "probably due to incorrect class_end_offsets.")
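# Sketch of the mapping above with hypothetical offsets: treating
# effective_class_end_offsets as the cumulative end of each class
# (test, validation, train), a flat sample index resolves to the first class
# whose end offset it has not reached, plus the samples remaining in it.
def class_index_by_sample_index_sketch(index, end_offsets=(100, 600, 5600)):
    for class_index, class_offset in enumerate(end_offsets):
        if index < class_offset:
            return class_index, class_offset - index
    raise ValueError("index %d is past the last offset" % index)

# class_index_by_sample_index_sketch(0)    -> (0, 100)  (test set)
# class_index_by_sample_index_sketch(250)  -> (1, 350)  (validation set)
# class_index_by_sample_index_sketch(4000) -> (2, 1600) (train set)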
def class_ended(self):
    for offset in self.effective_class_end_offsets:
        if self.global_offset == offset:
            return True
        if self.global_offset < offset:
            return False
    raise error.Bug("global_offset %d is out of bounds %s" %
                    (self.global_offset, self.effective_class_end_offsets))
def _gpu_fill(self, nbytes):
    bytes_per_round = self.num_states * 16 * 8
    nbytes = roundup(nbytes, bytes_per_round)
    if nbytes > self.output.nbytes:
        raise error.Bug("nbytes > self.output.nbytes")
    self.unmap_vectors(self.states, self.output)
    self.cl_const[0] = nbytes // bytes_per_round
    self.set_arg(1, self.cl_const)
    self.execute_kernel(self._global_size, self._local_size)
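# Assumption about the helper used above: roundup(n, m) is taken to round n up
# to the nearest multiple of m, so the requested byte count always covers a
# whole number of kernel rounds (each round writes 16 uint64 values, i.e.
# 16 * 8 bytes, per generator state). A minimal equivalent:
def roundup_sketch(n, m):
    remainder = n % m
    return n if remainder == 0 else n + (m - remainder)

# With num_states == 4, bytes_per_round == 4 * 16 * 8 == 512, and
# roundup_sketch(1000, 512) == 1024, i.e. two full rounds.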
def apply_data_from_slave(self, data, slave):
    if slave is None:
        # Partial update
        return
    try:
        self.minibatch_offset, self.minibatch_size = \
            self.pending_minibatches_[slave.id].pop()
    except KeyError:
        raise error.Bug("pending_minibatches_ does not contain %s" %
                        slave.id)
    self._on_successful_serve()
    if not self.has_data_for_slave:
        self.has_data_for_slave = self.last_minibatch
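# Assumed bookkeeping behind the pop() above (the names come from the method,
# the concrete container type is an assumption): pending_minibatches_ maps a
# slave id to the (offset, size) pairs that were handed to that slave and not
# yet acknowledged, so applying a slave's data simply pops one of them.
from collections import defaultdict

pending_minibatches_ = defaultdict(list)
pending_minibatches_["slave-0"].append((0, 128))  # served, not yet applied
minibatch_offset, minibatch_size = pending_minibatches_["slave-0"].pop()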
def numpy_fill(self, nbytes):
    bytes_per_round = self.num_states * 16 * 8
    nbytes = roundup(nbytes, bytes_per_round)
    if nbytes > self.output.nbytes:
        raise error.Bug("nbytes > self.output.nbytes")
    self.states.map_write()
    self.output.map_invalidate()
    n_rounds = nbytes // bytes_per_round

    u64 = numpy.array([1181783497276652981], dtype=numpy.uint64)
    s0 = numpy.zeros(1, dtype=numpy.uint64)
    s1 = numpy.zeros(1, dtype=numpy.uint64)

    states = self.states.mem.view(dtype=numpy.uint64)
    states = states.reshape(states.size // 16, 16)
    output = self.output.mem.view(dtype=numpy.uint64)
    for i in range(self.num_states):
        offs = i
        s = states[i]
        self.p = 0
        for _round in range(n_rounds):
            for _iter in range(16):
                output[offs] = self._next_rand(s, s0, s1, u64)
                offs += self.num_states
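# Assumption: judging from the 16-word uint64 state, the rotating index self.p
# and the 1181783497276652981 multiplier, _next_rand appears to implement one
# step of Vigna's xorshift1024* generator. A standalone sketch of that step
# (plain Python integers, so 64-bit overflow is emulated with a mask):
MASK64 = (1 << 64) - 1

def xorshift1024star_next(s, p):
    """Advance a 16-element state list s at position p; return (value, p)."""
    s0 = s[p]
    p = (p + 1) & 15
    s1 = s[p]
    s1 = (s1 ^ (s1 << 31)) & MASK64
    s[p] = s1 ^ s0 ^ (s1 >> 11) ^ (s0 >> 30)
    return (s[p] * 1181783497276652981) & MASK64, p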
def fill_minibatch(self):
    # The minibatch was already filled in fill_indices, so fill_minibatch
    # should never be called.
    raise error.Bug("Control should not go here")
class VelesProtocol(StringLineReceiver, IDLogger):
    """A communication controller from client to server.

    Attributes:
        FSM_DESCRIPTION: The definition of the Finite State Machine of the
                         protocol.
    """

    def onFSMStateChanged(self, e):
        """Logs the current state transition."""
        self.debug("state: %s, %s -> %s", e.event, e.src, e.dst)

    FSM_DESCRIPTION = {
        'initial': 'INIT',
        'events': [
            {'name': 'disconnect', 'src': '*', 'dst': 'ERROR'},
            {'name': 'close', 'src': '*', 'dst': 'END'},
            {'name': 'reconnect', 'src': '*', 'dst': 'INIT'},
            {'name': 'request_id', 'src': ['INIT', 'WAIT'], 'dst': 'WAIT'},
            {'name': 'send_id', 'src': 'INIT', 'dst': 'WAIT'},
            {'name': 'request_job', 'src': ['WAIT', 'POSTPONED'],
             'dst': 'GETTING_JOB'},
            {'name': 'obtain_job', 'src': 'GETTING_JOB', 'dst': 'BUSY'},
            {'name': 'refuse_job', 'src': 'GETTING_JOB', 'dst': 'END'},
            {'name': 'postpone_job', 'src': 'GETTING_JOB',
             'dst': 'POSTPONED'},
            {'name': 'complete_job', 'src': 'BUSY', 'dst': 'WAIT'},
        ],
        'callbacks': {
            'onchangestate': onFSMStateChanged
        }
    }

    def __init__(self, addr, host):
        """Initializes the protocol.

        Parameters:
            addr: The address of the server (reported by Twisted).
            host: The client host which created this protocol.
        """
        super(VelesProtocol, self).__init__(logger=host.logger)
        self.addr = addr
        self.host = host
        self._last_update = None
        self.state = host.state
        self._current_deferred = None
        self._power_upload_time = 0
        self._power_upload_threshold = 60
        self.rand = get_rg()

    def connectionMade(self):
        self.info("Connected in %s state", self.state.current)
        self.disconnect_time = None
        if self.id is None:
            self.request_id()
            return
        self.send_id()
        self.state.send_id()

    def connectionLost(self, reason):
        self.debug("Connection was lost")
        if self._current_deferred is not None:
            self._current_deferred.cancel()

    def lineReceived(self, line):
        self.debug("lineReceived: %s", line)
        msg = json.loads(line.decode("utf-8"))
        if not isinstance(msg, dict):
            self.error("Could not parse the received line, dropping it")
            return
        err = msg.get("error")
        if err:
            self.disconnect("Server returned error: '%s'", err)
            return
        if self.state.current == "WAIT":
            if msg.get("reconnect") == "ok":
                if self.id is None:
                    self.error("Server returned a successful reconnection, "
                               "but my ID is None")
                    self.request_id()
                    return
                self.request_job()
                return
            cid = msg.get("id")
            if cid is None:
                self.error("No ID was received in WAIT state")
                self.request_id()
                return
            self.id = cid
            self.debug("Received ID")
            log_id = msg.get("log_id")
            if log_id is None:
                self.error("No log ID was received in WAIT state")
                self.request_id()
                return
            self.host.on_id_received(self.id, log_id)
            endpoint = msg.get("endpoint")
            if endpoint is None:
                self.error("No endpoint was received")
                self.request_id()
                return
            self.host.zmq_connection = self.zmq_connection = ZmqDealer(
                cid, self, ZmqEndpoint("connect", endpoint))
            self.info("Connected to ZeroMQ endpoint %s", endpoint)
            data = msg.get('data')
            if data is not None:
                self._set_deferred(
                    self.host.workflow.apply_initial_data_from_master, data)
            self.request_job()
            return
        self.disconnect("disconnect: invalid state %s", self.state.current)

    def job_received(self, job):
        if not job:
            # False, None or an empty string mean job refusal
            self.info("Job was refused")
            self.state.refuse_job()
        elif job == b"NEED_UPDATE":
            self.debug("Master returned NEED_UPDATE, will repeat the job "
                       "request in update_result_received()")
            self.state.postpone_job()
        else:
            try:
                self.state.obtain_job()
            except fysom.FysomError as e:
                self.warning("Job was received too late or too early: %s", e)
                return
        update = self._last_update
        if self.host.async and update is not None:
            self.request_update()
        if job == b"NEED_UPDATE":
            return
        if not job:
            # No jobs are available => terminate itself
            self.host.launcher.stop()
            return
        try:
            if self.host.death_probability > 0 and \
                    self.rand.random() < self.host.death_probability:
                raise error.Bug("This slave has randomly crashed (death "
                                "probability was %f)" %
                                self.host.death_probability)
            now = time.time()
            if now - self._power_upload_time > self._power_upload_threshold:
                self._power_upload_time = now
                self.sendLine({
                    'cmd': 'change_power',
                    'power': self.host.workflow.computing_power
                })
            # workflow.do_job may hang, so launch it in the thread pool
            self._set_deferred(self.host.workflow.do_job,
                               job, update, self.job_finished)
        except:
            errback(Failure())
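# Illustration of how a fysom FSM built from FSM_DESCRIPTION behaves. The dict
# below is a trimmed, hypothetical copy of the one above, with a plain function
# instead of the protocol method as the onchangestate callback.
import fysom

def on_change(e):
    print("state: %s, %s -> %s" % (e.event, e.src, e.dst))

fsm = fysom.Fysom({
    'initial': 'INIT',
    'events': [
        {'name': 'request_id', 'src': ['INIT', 'WAIT'], 'dst': 'WAIT'},
        {'name': 'request_job', 'src': ['WAIT', 'POSTPONED'],
         'dst': 'GETTING_JOB'},
        {'name': 'obtain_job', 'src': 'GETTING_JOB', 'dst': 'BUSY'},
        {'name': 'complete_job', 'src': 'BUSY', 'dst': 'WAIT'},
    ],
    'callbacks': {'onchangestate': on_change},
})

fsm.request_id()    # INIT -> WAIT
fsm.request_job()   # WAIT -> GETTING_JOB
fsm.obtain_job()    # GETTING_JOB -> BUSY, fsm.current is now "BUSY"
# fsm.request_job() # invalid from BUSY: would raise fysom.FysomError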