def wait_available_gpu(free_memory=4000):
    all_gpus = gpu_info()
    if not len(all_gpus):
        log.warning("No GPU available")
        raise Exception("No GPU available")
    # GLOBAL_GPU_LOCK.acquire()
    while True:
        all_gpus = gpu_info()
        gpu_list = sorted(
            [(gpu['index'], gpu['mem_total'] - gpu['mem_used'])
             for gpu in all_gpus
             if (gpu['mem_total'] - gpu['mem_used']) >= free_memory],
            key=lambda x: x[1])
        if len(gpu_list) == 0:
            log.info("No GPU with enough free memory, waiting 10 seconds")
            time.sleep(10)
            continue
        # Pick the GPU with the most free memory
        best_gpu = int(gpu_list[-1][0])
        with nvml_context():
            gpu_data = device_status(best_gpu)
        for gpu in all_gpus:
            if int(gpu["index"]) == best_gpu:
                gpu_data.update(gpu)
                break
        # GLOBAL_GPU_LOCK.release()
        log.info(f"Best GPU: {gpu_data}")
        return gpu_data
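# `gpu_info()`, `nvml_context()` and `device_status()` are helpers defined elsewhere
# in this project. As an illustration only, a minimal `gpu_info()`-style listing can
# be built on the `pynvml` bindings; the name `_example_gpu_info` and the
# memory-in-MiB convention are assumptions, not the project's actual API:
def _example_gpu_info():
    """Hypothetical sketch: list GPUs with total/used memory in MiB via NVML."""
    import pynvml  # assumed extra dependency, not necessarily used by this project
    pynvml.nvmlInit()
    try:
        gpus = []
        for index in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)  # values in bytes
            gpus.append({
                'index': index,
                'mem_total': mem.total // (1024 * 1024),
                'mem_used': mem.used // (1024 * 1024),
            })
        return gpus
    finally:
        pynvml.nvmlShutdown()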
def on_exchange_declareok(self, unused_frame):
    log.info('Exchange declared')
    log.info('Declaring queue')
    self._channel.queue_declare(queue='',
                                callback=self.on_queue_declareok,
                                exclusive=True,
                                auto_delete=True)
def import_dataset(self, filename=None):
    """ Import dataset from folder with base and diffs """
    if not filename:
        if self.id:
            filename = self.id + ".dset"
        elif self.dataset_name:
            filename = self.dataset_name + ".dset"
        else:
            raise Exception('Must define import filename')
    export_data, self.parms, self.examples, self.images = Dataset.import_from_hdf5(
        filename, self.id)
    self.dataset_name = str(self.parms["name"])
    self.id = str(self.parms["_id"])
    if len(self.examples) != export_data["num_examples"]:
        raise Exception(
            f"Invalid number of examples. Imported: {len(self.examples)} - Exported: {export_data['num_examples']}"
        )
    if len(self.images) != len(export_data["images_list"]):
        raise Exception(
            f"Invalid number of images. Imported: {len(self.images)} - Exported: {len(export_data['images_list'])}"
        )
    log.info("Dataset imported from file: %d examples" % len(self.examples))
def export_dataset(self, filename=None):
    """ Export dataset to folder, adding only differences if a base export exists """
    if not filename:
        filename = self.id + ".dset"
    self.load_all_images()
    export_data = {
        "export_version": "2",
        "export_date": pytz.utc.localize(datetime.datetime.now()),
        "dataset": self.dataset_name,
        "dataset_id": self.id,
        "num_examples": len(self.examples),
        "images_list": list(self.images.keys())
    }
    # Dataset.export_to_file(filename)
    Dataset.export_to_hdf5(export_data, self.parms, self.examples, self.images,
                           filename, self.id)
    # if self._app_token is not None:
    log.info(f"Dataset exported to file: {len(self.examples)} examples")
def get_dataset(app_token, dataset_id):
    try:
        log.info(f"Get dataset {dataset_id}")
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        response = requests.get(f"{endpoint}/dataset/{dataset_id}",
                                headers=msg_headers)
        if response.status_code != 200:
            log.error(f"Failed to get dataset: {response.json()}")
            return None
        dataset = response.json()
        if dataset["dataset_parms"]:
            return dataset
        else:
            log.warning(f"Failed to get dataset: {response.json()}")
            return None
    except requests.ConnectionError as error:
        log.error(
            f'Failed to get dataset_id: {dataset_id}. Connection error: {error}')
        return None
    except requests.Timeout as error:
        log.error(f'Failed to get dataset_id: {dataset_id}. Timeout: {error}')
        return None
    except Exception as excp:
        log.error(f'Failed to get dataset_id: {dataset_id} - {excp}')
        return None
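# Usage sketch: these edge-client helpers decode the service endpoint from the JWT
# payload and return None on any failure, so callers must check the result. The
# token and dataset id below are placeholder assumptions:
#
#   dataset = get_dataset(app_token, "5f1e9c2ab4d3c2a1f0e9d8c7")
#   if dataset is None:
#       raise RuntimeError("dataset not available")
#   parms = dataset["dataset_parms"]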
def on_message(self, unused_channel, basic_deliver, properties, body):
    log.info('Received message # %s from %s: %s', basic_deliver.delivery_tag,
             properties.app_id, body)
    parms = json.loads(body.decode())
    self._message_call(parms)
    # Acknowledge the message delivery from RabbitMQ by sending a
    # Basic.Ack RPC method for the delivery tag.
    self._channel.basic_ack(basic_deliver.delivery_tag)
def stop(self):
    log.info('Stopping')
    self._closing = True
    for channel in self._channels:
        channel.stop_consuming()
    self._connection.ioloop.stop()
    log.info('Stopped')
def on_queue_declareok(self, method_frame):
    log.info('Queue declare ok: consumer_count={}, message_count={}, queue={}'.format(
        method_frame.method.consumer_count, method_frame.method.message_count,
        method_frame.method.queue))
    self._queue_name = method_frame.method.queue
    log.info('Binding %s to %s with %s' %
             (self._topic, self._queue_name, self._routing_key))
    self._channel.queue_bind(exchange=self._topic,
                             routing_key=self._routing_key,
                             queue=self._queue_name,
                             callback=self.on_bindok)
def get_flow(app_token, flow_id):
    try:
        log.info(f"Get flow {flow_id}")
        if not Path(CONFIG["flow_folder"]).is_dir():
            Path(CONFIG["flow_folder"]).mkdir(parents=True, exist_ok=True)
        local_cache = os.path.join(CONFIG["flow_folder"], flow_id + '.json')
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        response = requests.get(f"{endpoint}/flow/{flow_id}", headers=msg_headers)
        if response.status_code != 200:
            log.error(f"Failed to get flow from edge: {response.json()}")
            # Fall back to the local cache if the edge is unreachable
            if os.path.isfile(local_cache):
                with open(local_cache) as fp:
                    flow = json.load(fp)
                return flow
            return None
        flow = response.json()["flow"]
        if "_id" in flow:
            # Refresh the local cache with the latest version
            if os.path.isfile(local_cache):
                os.remove(local_cache)
            with open(local_cache, 'w') as fp:
                json.dump(flow, fp, default=str)
            return flow
        else:
            log.warning(f"Failed to get flow: {response.json()}")
            return None
    except requests.ConnectionError as error:
        log.error(f'Failed to get flow_id: {flow_id}. Connection error: {error}')
        return None
    except requests.Timeout as error:
        log.error(f'Failed to get flow_id: {flow_id}. Timeout: {error}')
        return None
    except Exception as excp:
        log.error(f'Failed to get flow_id: {flow_id} - {excp}')
        return None
def load_data(self):
    """ Load dataset data from ws """
    if not self._app_token:
        raise Exception('AppToken not set')
    dataset = edge_client.get_dataset(self._app_token, self.id)
    if not dataset:
        raise Exception(f'Failed loading dataset_id {self.id}')
    self.dataset_name = dataset["dataset_parms"].get("name")
    self.parms = dataset["dataset_parms"]
    self.examples = dataset["annotations"]
    self.update_default_parms()
    log.info(f"Loaded dataset from database: {len(self.examples)} examples")
def on_channel_open(self, channel):
    log.info('Channel opened')
    self._channel = channel
    log.info('Adding channel close callback')
    self._channel.add_on_close_callback(self.on_channel_closed)
    log.info('Declaring queue %s' % self._queue)
    self._channel.queue_declare(queue=self._queue,
                                callback=self.on_queue_declareok,
                                durable=True)
    log.info('Adding process consumer cancellation callback')
    self._channel.add_on_cancel_callback(self.on_consumer_cancelled)
def get_train(app_token, dataset_id, train_id, train_folder):
    try:
        log.info(f"Get train {dataset_id}-{train_id}")
        folder_path = Path(train_folder) / dataset_id / train_id
        if not folder_path.is_dir():
            folder_path.mkdir(parents=True, exist_ok=True)
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        url = f"{endpoint}/model-hist/{dataset_id}/{train_id}"
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        payload = {"download_url": True}
        response = requests.get(url, headers=msg_headers, params=payload)
        if response.status_code != 200:
            log.error(f"Failed to get train: {response.json()}")
            return None
        train_doc = response.json()
        dest_filename = os.path.join(str(folder_path), train_id + ".tar.gz")
        download_file(train_doc["download_url"], dest_filename)
        # Expand the downloaded archive and remove it
        call(["tar", "-xf", dest_filename, "--directory", str(folder_path)])
        os.remove(dest_filename)
        return train_id
    except requests.ConnectionError as error:
        log.error(f'Failed to get train_id: {train_id}. Connection error: {error}')
        return None
    except requests.Timeout as error:
        log.error(f'Failed to get train_id: {train_id}. Timeout: {error}')
        return None
    except Exception as excp:
        log.error(f'Failed to get train_id: {train_id} - {excp}')
        return None
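# `download_file` is a helper defined elsewhere in this project. A minimal sketch of
# a streaming download with `requests` (already imported by this module); the name
# `_example_download_file` and the chunk size are assumptions, not the project's
# actual implementation:
def _example_download_file(url, dest_filename, chunk_size=1024 * 1024):
    """Hypothetical sketch: stream a large file to disk without loading it in RAM."""
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest_filename, 'wb') as fp:
            for chunk in response.iter_content(chunk_size=chunk_size):
                fp.write(chunk)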
def get_edge_data(app_token):
    try:
        log.info("Get edge_data")
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        response = requests.get(f"{endpoint}", headers=msg_headers)
        if response.status_code != 200:
            log.error(f"Failed to get edge_data: {response.json()}")
            return None
        return response.json()["edge_data"]
    except requests.ConnectionError as error:
        log.error(f'Failed to get edge_data. Connection error: {error}')
        return None
    except requests.Timeout as error:
        log.error(f'Failed to get edge_data. Timeout: {error}')
        return None
    except Exception as excp:
        log.error(f'Failed to get edge_data - {excp}')
        return None
def on_connection_open(self, unused_connection):
    log.info('Connection opened')
    log.info('Adding connection close callback')
    self._connection.add_on_close_callback(self.on_connection_closed)
    if self._queues is not None:
        for queue in self._queues:
            log.info('Creating queue channel: ' + queue[0])
            new_channel = QueueChannel(queue[0], queue[1], self)
            self._channels.append(new_channel)
            self._connection.channel(on_open_callback=new_channel.on_channel_open)
    elif self._topic is not None:
        log.info('Creating topic channel: ' + self._topic[0])
        new_channel = TopicChannel(self._topic[0], self._topic[1],
                                   self._topic[2], self)
        self._channels.append(new_channel)
        self._connection.channel(on_open_callback=new_channel.on_channel_open)
def purge_files(self, max_files=800, max_days=None):
    """ Purge local and cloud files down to max_files, from oldest to newest """
    local_files = self._get_list_files_info(self._local_folder)
    if len(local_files) > max_files:
        date_list = [(l_file["filename"], l_file["modified_date"])
                     for l_file in local_files]
        exclude_list = sorted(date_list,
                              key=lambda x: x[1])[:len(local_files) - max_files]
        log.info(f"Purge local files: {len(exclude_list)}")
        for filename, _ in exclude_list:
            try:
                os.remove(os.path.join(self._local_folder, filename))
            except Exception:
                pass
    if self._cloud_obj is None or self._cloud_folder is None:
        return
    cloud_files = self._cloud_obj.list_files_info(folder=self._cloud_folder,
                                                  resource_id=self.resource_id)
    if len(cloud_files) > max_files:
        date_list = [(l_file["filename"], l_file["modified_date"])
                     for l_file in cloud_files]
        exclude_list = sorted(date_list,
                              key=lambda x: x[1])[:len(cloud_files) - max_files]
        log.info(f"Purge cloud files: {len(exclude_list)}")
        for filename, _ in exclude_list:
            try:
                self._cloud_obj.delete_file(
                    folder=self._cloud_folder,
                    resource_id=self.resource_id,
                    filename=filename,
                )
            except Exception:
                pass
def on_channel_open(self, channel):
    log.info('Channel opened')
    self._channel = channel
    log.info('Adding channel close callback')
    self._channel.add_on_close_callback(self.on_channel_closed)
    log.info('Declaring exchange %s' % self._topic)
    self._channel.exchange_declare(exchange=self._topic,
                                   callback=self.on_exchange_declareok,
                                   durable=True,
                                   exchange_type='topic')
def close_connection(self):
    log.info('Closing connection')
    self._connection.close()
def on_bindok(self, unused_frame):
    log.info('Queue bound')
    log.info('Adding process consumer cancellation callback')
    self._channel.add_on_cancel_callback(self.on_consumer_cancelled)
    self._consumer_tag = self._channel.basic_consume(
        queue=self._queue_name, on_message_callback=self.on_message)
def upload_extract(app_token,
                   dataset_id,
                   extract_folder,
                   max_files=MAX_EXTRACT_FILES,
                   thumb_size=THUMB_SIZE):
    try:
        log.info(f"Upload extract {dataset_id}")
        folder_path = os.path.join(extract_folder, dataset_id)
        if not os.path.isdir(folder_path):
            raise Exception(f"Extract folder doesn't exist: {folder_path}")
        clear_log(folder_path, max_files)
        generate_extract_thumbs(folder_path, thumb_size)
        # Collect the metadata of every example that has both image and thumbnail
        files_data = []
        file_list = [f for f in os.listdir(folder_path)]
        for filename in file_list:
            exp_id = filename[:24]
            if filename.endswith('_data.json') and (
                    exp_id + ".jpg") in file_list and (
                        exp_id + "_thumb.jpg") in file_list:
                try:
                    filepath = os.path.join(folder_path, filename)
                    with open(filepath, 'r') as json_file:
                        data = json.load(json_file)
                    if "date" in data:
                        data["date"] = {"$date": data["date"]}
                    if "_id" in data:
                        data["_id"] = {"$oid": data["_id"]}
                    files_data.append(data)
                except Exception:
                    pass
        if not files_data:
            log.warning(f'Cannot upload extract: {dataset_id}. No files.')
            return dataset_id
        extract_files = {"files_data": files_data}
        with open(os.path.join(folder_path, 'extract_files.json'),
                  'w',
                  newline='',
                  encoding='utf8') as file_p:
            json.dump(extract_files, file_p, ensure_ascii=False, default=str)
        # Pack images and metadata into a tar.gz archive
        dest_filename = os.path.join(extract_folder, dataset_id + ".tar.gz")
        if os.path.isfile(dest_filename):
            os.remove(dest_filename)
        wd = os.getcwd()
        os.chdir(folder_path)
        with tarfile.open(dest_filename, "w:gz") as tar:
            for filename in os.listdir(folder_path):
                if filename.endswith('.jpg') or filename == 'extract_files.json':
                    tar.add(filename)
        os.chdir(wd)
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        url = f"{endpoint}/dataset/{dataset_id}/extract"
        values = {'dataset_id': dataset_id, 'extract_files': extract_files}
        with open(dest_filename, 'rb') as extract_fp:
            files = {'extract': extract_fp}
            response = requests.post(url,
                                     files=files,
                                     data=values,
                                     headers=msg_headers)
        if response.status_code != 201:
            raise Exception(f"Failed to upload extract files: {response.json()}")
        os.remove(dest_filename)
        return dataset_id
    except requests.ConnectionError as error:
        log.error(
            f'Failed to post upload_extract: {dataset_id}. Connection error: {error}'
        )
        return None
    except requests.Timeout as error:
        log.error(f'Failed to post upload_extract: {dataset_id}. Timeout: {error}')
        return None
    except Exception as excp:
        log.error(f'Failed to post upload_extract: {dataset_id} - {excp}')
        return None
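# `clear_log` and `generate_extract_thumbs` are helpers defined elsewhere in this
# project. As an illustration only, a thumbnail generator matching the
# `<id>_thumb.jpg` naming that `upload_extract` checks for could look like this;
# the name `_example_generate_thumbs` and the use of Pillow are assumptions:
def _example_generate_thumbs(folder_path, thumb_size):
    """Hypothetical sketch: create `<id>_thumb.jpg` next to every `<id>.jpg`."""
    from PIL import Image  # assumed dependency, not necessarily the project's choice
    for filename in os.listdir(folder_path):
        if not filename.endswith('.jpg') or filename.endswith('_thumb.jpg'):
            continue
        thumb_path = os.path.join(folder_path,
                                  filename[:-len('.jpg')] + '_thumb.jpg')
        if os.path.isfile(thumb_path):
            continue  # thumbnail already generated
        with Image.open(os.path.join(folder_path, filename)) as img:
            img.thumbnail((thumb_size, thumb_size))  # preserves aspect ratio
            img.save(thumb_path, 'JPEG')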
def on_consumer_cancelled(self, method_frame):
    log.info('Consumer was cancelled remotely, shutting down: %r', method_frame)
    if self._channel:
        self._channel.close()
def stop_consuming(self):
    if self._channel:
        log.info('Sending a Basic.Cancel RPC command to RabbitMQ')
        # pika 1.x signature: basic_cancel(consumer_tag, callback=None)
        self._channel.basic_cancel(self._consumer_tag, callback=self.on_cancelok)
def get_model_component(app_token, model_component_id, model_component_folder):
    local_doc = None
    try:
        log.info(f"Get model_component {model_component_id}")
        folder_path = Path(model_component_folder) / model_component_id
        if not folder_path.is_dir():
            folder_path.mkdir(parents=True, exist_ok=True)
        local_cache = os.path.join(model_component_folder,
                                   model_component_id + '.json')
        if os.path.isfile(local_cache):
            with open(local_cache) as fp:
                local_doc = json.load(fp)
        endpoint = jwt.decode(app_token,
                              options={"verify_signature": False})['endpoint']
        url = f"{endpoint}/model-component/{model_component_id}"
        msg_headers = {'Authorization': f'Bearer {app_token}'}
        # First request the document only, to compare versions with the local cache
        payload = {"download_url": False}
        response = requests.get(url, headers=msg_headers, params=payload)
        if response.status_code != 200:
            if local_doc:
                return local_doc
            log.error(f"Failed to get model_component: {response.json()}")
            return None
        model_component_doc = response.json()
        if local_doc and model_component_doc["version"] == local_doc["version"]:
            # Cached version is up to date, no need to download again
            return local_doc
        payload = {"download_url": True}
        response = requests.get(url, headers=msg_headers, params=payload)
        if response.status_code != 200:
            if local_doc:
                return local_doc
            log.error(f"Failed to get model: {response.json()}")
            return None
        model_component_doc = response.json()
        dest_filename = os.path.join(folder_path,
                                     model_component_id + ".tar.gz")
        download_file(model_component_doc["download_url"], dest_filename)
        # Expand the downloaded archive and refresh the local cache
        call(["tar", "-xf", dest_filename, "--directory", str(folder_path)])
        os.remove(dest_filename)
        if os.path.isfile(local_cache):
            os.remove(local_cache)
        with open(local_cache, 'w') as fp:
            json.dump(model_component_doc, fp, default=str)
        return model_component_id
    except requests.ConnectionError as error:
        if local_doc:
            return local_doc
        log.error(
            f'Failed to get model_component: {model_component_id}. Connection error: {error}'
        )
        return None
    except requests.Timeout as error:
        if local_doc:
            return local_doc
        log.error(
            f'Failed to get model_component: {model_component_id}. Timeout: {error}')
        return None
    except Exception as excp:
        if local_doc:
            return local_doc
        log.error(f'Failed to get model_component: {model_component_id} - {excp}')
        return None
def on_cancelok(self, unused_frame):
    log.info('RabbitMQ acknowledged the cancellation of the consumer')
    log.info('Closing the channel')
    self._channel.close()
def sync_files(self, origin, file_list=None):
    """ Synchronize files between the local and cloud folders.
    If file_list is given, synchronize only the files in that list.
    """
    if self._cloud_obj is None:
        raise Exception("Cloud provider not defined")
    if self._cloud_folder is None:
        raise Exception("Cloud folder not defined")
    if origin not in ["cloud", "local", "both"]:
        raise Exception(
            f"'origin'={origin} must be one of: ('cloud', 'local', 'both')")
    try:
        local_files = self._get_list_files_info(self._local_folder)
        cloud_files = self._cloud_obj.list_files_info(
            folder=self._cloud_folder, resource_id=self.resource_id)
        if origin in ["cloud", "both"]:
            # Download every cloud file with no up-to-date local copy
            download_files = []
            for c_file in cloud_files:
                if file_list is not None and c_file["filename"] not in file_list:
                    continue
                for l_file in local_files:
                    if l_file["filename"] == c_file["filename"] and \
                            l_file["file_size"] == c_file["file_size"] and \
                            l_file["modified_date"] >= c_file["modified_date"]:
                        break
                else:
                    download_files.append(c_file)
            if download_files:
                log.info(
                    f"Storage: {self.storage} - Resource-ID: {self.resource_id} - Download {len(download_files)} files"
                )
                for c_file in download_files:
                    file_data = self._cloud_obj.download_file(
                        folder=self._cloud_folder,
                        resource_id=self.resource_id,
                        filename=c_file["filename"])
                    file_path = os.path.join(self._local_folder,
                                             c_file["filename"])
                    dirname = os.path.dirname(file_path)
                    if not os.path.isdir(dirname):
                        os.makedirs(dirname, exist_ok=True)
                    with open(file_path, "wb") as fp:
                        fp.write(file_data)
        if origin in ["local", "both"]:
            # Upload every local file with no up-to-date cloud copy
            upload_files = []
            for l_file in local_files:
                if file_list is not None and l_file["filename"] not in file_list:
                    continue
                for c_file in cloud_files:
                    if l_file["filename"] == c_file["filename"] and \
                            l_file["file_size"] == c_file["file_size"] and \
                            l_file["modified_date"] <= c_file["modified_date"]:
                        break
                else:
                    upload_files.append(l_file)
            if upload_files:
                log.info(
                    f"Storage: {self.storage} - Resource-ID: {self.resource_id} - Upload {len(upload_files)} files"
                )
                for l_file in upload_files:
                    file_path = os.path.join(self._local_folder,
                                             l_file["filename"])
                    with open(file_path, "rb") as fp:
                        file_data = fp.read()
                    if self._cloud_obj.is_file(
                            folder=self._cloud_folder,
                            resource_id=self.resource_id,
                            filename=l_file["filename"],
                    ):
                        self._cloud_obj.delete_file(
                            folder=self._cloud_folder,
                            resource_id=self.resource_id,
                            filename=l_file["filename"],
                        )
                    file_data = self._cloud_obj.upload_file(
                        folder=self._cloud_folder,
                        resource_id=self.resource_id,
                        filename=l_file["filename"],
                        data=file_data)
                    # Set the local file's modified time to now() to prevent
                    # re-downloading the same file on the next sync
                    date = datetime.datetime.now()
                    mod_time = time.mktime(date.timetuple())
                    try:
                        os.utime(file_path, (mod_time, mod_time))
                    except Exception:
                        pass
    except Exception as excp:
        log.error(traceback.format_exc())
        log.error(f"sync_files error: {excp}")
        raise
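# Usage sketch for `sync_files` (the class name `FileStorage` and its constructor
# arguments are assumptions for illustration; only the `origin` / `file_list`
# semantics come from the method itself):
#
#   storage = FileStorage(local_folder="/data/cache", cloud_folder="datasets",
#                         resource_id="edge-01")
#   storage.sync_files(origin="both")                        # full two-way sync
#   storage.sync_files(origin="cloud", file_list=["a.jpg"])  # download one file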
def connect(self):
    log.info('Connecting to %s', self._url)
    self._connection = pika.SelectConnection(
        pika.URLParameters(self._url),
        on_open_callback=self.on_connection_open,
        on_open_error_callback=self.on_connection_error)