def consumer(self, que):
    """Serve start/stop commands arriving on the queues in ``que``.

    Blocks in ``select.select`` until at least one queue in ``que`` is
    readable, then takes one item from every ready queue.  The item
    ``"start"`` spawns the heartbeat thread and starts a ``ClusterServer``
    on ``master_port``; any other item stops the most recently started
    server/thread pair, if one exists.  The loop never returns.

    :param que: sequence of queue objects that ``select.select`` accepts
        and that expose ``get()``.
    """
    heartbeat_threads = []
    cluster_servers = []
    while True:
        readable, _, _ = select.select(que, [], [])
        for ready_queue in readable:
            print(threading.current_thread().getName(), "============")
            command = ready_queue.get()
            print(command)

            if command != "start":
                # Everything other than "start" is treated as a stop request.
                print("关闭服务")
                logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")
                if cluster_servers or heartbeat_threads:
                    cluster_servers.pop().stop()
                    stop_thread(heartbeat_threads.pop())
                continue

            print("启动服务")
            schedule_addr = ":".join(CONFIG["sch_grpc_server"])
            logger.info(
                "启动程序 HeartSchedule >>> {} ClusterServer >>>{}".format(
                    schedule_addr, CONFIG.get("master_port")))
            heartbeat = spawn(target=login_and_update, name="heartbeat")
            cluster_server = ClusterServer(
                addr="0.0.0.0:{}".format(CONFIG.get("master_port")))
            cluster_server.start()
            # Both lists grow together so a later stop can pop a matching pair.
            heartbeat_threads.append(heartbeat)
            cluster_servers.append(cluster_server)
def start_heart_cluster(self):
    """Poll ``up_cluster_event`` and start or stop the cluster services.

    Every 0.5 s the event is inspected.  When it is set and the services
    have not yet been started for this "set" period, the heartbeat thread
    and a ``ClusterServer`` on ``master_port`` are started.  When it is
    cleared, the most recently started pair (if any) is stopped and a new
    start is allowed again.  The loop never returns.
    """
    # The polling loop needs a marker so that one "set" period of the event
    # triggers only a single start.
    may_start = True
    heartbeat_threads = []
    cluster_servers = []
    while True:
        time.sleep(.5)

        if up_cluster_event.is_set() and may_start:
            schedule_addr = ":".join(CONFIG["sch_grpc_server"])
            logger.info(
                "启动程序 HeartSchedule >>> {} ClusterServer >>>{}".format(
                    schedule_addr, CONFIG.get("master_port")))
            heartbeat = spawn(target=login_and_update, name="heartbeat")
            cluster_server = ClusterServer(
                addr="0.0.0.0:{}".format(CONFIG.get("master_port")))
            cluster_server.start()
            heartbeat_threads.append(heartbeat)
            cluster_servers.append(cluster_server)
            may_start = False
            continue

        if up_cluster_event.is_set():
            # Event still set and services already running: nothing to do.
            continue

        # Event cleared: re-arm the start marker and tear down what is running.
        may_start = True
        if heartbeat_threads and cluster_servers:
            logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")
            cluster_servers.pop().stop()
            stop_thread(heartbeat_threads.pop())
def run_server():
    """Run the Raft gRPC server and the HTTP app, then shut the process down.

    Spawns the ``find_vip`` thread running ``vip_load``, starts a gRPC
    server exposing ``RaftService`` on ``raft_grpc_port`` and then blocks
    in ``app.run`` on ``raft_http_port``.  When ``app.run`` returns or an
    exception is raised, the VIP thread is stopped, the VIP is set to
    ``"down"``, the gRPC server is stopped and the process exits.

    :raises SystemExit: always, once cleanup has run.
    """
    # Local import keeps the block self-contained; ``sys.exit`` replaces the
    # ``exit`` helper, which only exists when the ``site`` module is loaded.
    import sys

    vip_thread = spawn(target=vip_load, name="find_vip")
    server = None
    try:
        server = grpc.server(ThreadPoolExecutor(40))
        # Register the Raft task handlers with the RPC server.
        raft_grpc_pb2_grpc.add_RaftServiceServicer_to_server(
            raft_grpc_server.RaftService(), server)
        # An insecure port is used here; gRPC also supports TLS/SSL secured
        # connections and various authentication mechanisms.
        server.add_insecure_port("0.0.0.0:{}".format(
            CONFIG.get("raft_grpc_port")))
        server.start()
        # Start the HTTP service.
        # TODO: enabling process-based election raises an error.
        app.run(
            host='0.0.0.0',
            # Port 8586 per the original note; only started on the master.
            port=CONFIG.get("raft_http_port"),
        )
    except Exception:
        # Keep the traceback: this is the failure that takes the server down.
        logger.exception("run_server terminated by an unexpected error")
    finally:
        stop_thread(vip_thread)
        dc_vip.vip.set_vip("down")
        if server is not None:
            # Release the gRPC listener immediately instead of leaking it.
            server.stop(None)
        sys.exit("退出")
def heart_beat():
    """Build the heartbeat payload for this cluster.

    The payload contains the callback address, the configured cluster id and
    one ``{"model", "count"}`` entry per item returned by
    ``Mongo.use_gpu_by_num()`` on the ``cluster.machines`` table.

    :return: dict with the keys ``callbackAddress``, ``clusterID`` and
        ``gpus``.
    """
    from cluster_master.utils.client_mongo import cli, Mongo

    machines = Mongo(host=CONFIG.get("mongo_center"),
                     db="cluster",
                     table="machines")
    host_ip = local_ip()

    public_port = CONFIG.get("grpc_public_port")
    callback_ip = CONFIG["sch_callback_ip"]
    if public_port:
        callback_port = CONFIG["grpc_public_port"]
    else:
        # No public port configured: derive one from the configured prefix
        # plus the last octet of the local IP address.
        last_octet = host_ip.split(".")[-1]
        callback_port = int(CONFIG["sch_callback_port_prefix"]) + int(last_octet)

    callback_address = "{sch_ip}:{port}".format(sch_ip=callback_ip,
                                                port=callback_port)
    return {
        "callbackAddress": callback_address,
        "clusterID": CONFIG.get("cluster_id"),
        "gpus": [
            {"model": model, "count": count}
            for model, count in machines.use_gpu_by_num().items()
        ],
    }
def vip_load():
    """Follow the local Raft status and move the VIP with leadership.

    Opens an insecure channel to the local Raft gRPC port and, every three
    seconds, requests the status tagged with the current timestamp.  A
    reply that does not echo that timestamp is ignored.

    * ``vip_event`` set, this node is the leader and ``state == 2``:
      bring the VIP up, clear ``vip_event``, set ``up_cluster_event`` and
      enqueue ``"start"`` on ``q1``.
    * ``vip_event`` not set and this node is not the leader: bring the VIP
      down, set ``vip_event``, clear ``up_cluster_event`` and enqueue
      ``"stop"`` on ``q1``.

    Any exception in one polling round is logged and the loop carries on;
    the function never returns normally.
    """
    logger.info(vip_event.is_set())
    raft_address = "0.0.0.0:{}".format(CONFIG.get("raft_grpc_port"))
    with grpc.insecure_channel(raft_address) as chan:
        stub = raft_grpc_pb2_grpc.RaftServiceStub(channel=chan)
        from cluster_master.cluster import q1

        while True:
            time.sleep(3)
            stamp = str(int(time.time()))
            try:
                pending = stub.GetStatus.future(
                    raft_grpc_pb2.GetStatusReq(ts=stamp), timeout=3)
                reply = pending.result()
                if reply.ts != stamp:
                    # Reply does not belong to this request; skip the round.
                    continue

                raft_status = json.loads(reply.status)
                logger.info(
                    "vip_event {},leader {}, self_node {},isReady {}".format(
                        vip_event.is_set(), raft_status['leader'],
                        raft_status['self'], raft_status["isReady"]))
                is_leader = raft_status['leader'] == raft_status['self']

                # state == 2 presumably marks the leader state in the raft
                # library -- TODO confirm.
                if vip_event.is_set() and is_leader and raft_status["state"] == 2:
                    dc_vip.vip.set_vip("up")
                    vip_event.clear()
                    up_cluster_event.set()
                    q1.put("start")

                if not vip_event.is_set() and not is_leader:
                    dc_vip.vip.set_vip("down")
                    vip_event.set()
                    up_cluster_event.clear()
                    q1.put("stop")
            except Exception as e:
                logger.info(e)
def test_rpc_task():
    """Send one sample task message to the ``TaskStatus`` RPC and print the reply.

    Connects to ``192.168.137.200`` on the configured ``master_port``,
    builds a task description with a random task id and a randomly chosen
    GPU requirement, and prints the JSON-decoded response body.  Any
    exception is caught and printed.
    """
    try:
        target = "192.168.137.200:{}".format(CONFIG.get("master_port"))
        with grpc.insecure_channel(target) as channel:
            stub = sch_pb2_grpc.SkylarkStub(channel=channel)
            # NOTE(review): this value is drawn but never used; the payload
            # below always sends "status": "stop".
            status = random.choice(["start", "stop"])
            task_id = hashlib.md5(bytes(random.randint(0, 1000))).hexdigest()
            print(task_id)

            gpu_options = [
                {"model": "GeForce GTX 1080 Ti", "count": 1},
                {"model": "GeForce GTX 1070 Ti", "count": 1},
            ]
            payload = {
                "taskID": task_id,
                "taskType": 3,
                "taskName": "task_name",
                "projectHash": "QmPhoTxquhjH14hb5S82jnDtu8FcLnGzNZEvgN1jCtN15P",
                "gpus": random.choices(gpu_options),
                "engine": 2,
                "mainRelativePath": "pytorch_demo/demo_s_d/torch_mnist_demo.py",
                "runParam": "",
                "projectName": "pytorch_demo.zip",
                "outputPath": "pytorch_demo/demo_s_d/out",
                "status": "stop",
            }
            request = sch_pb2.Proto(
                version=1,
                seq=1,
                timestamp=int(time.time()),
                body=json.dumps(payload).encode(),
            )
            reply = stub.TaskStatus(request, timeout=5)
            print(json.loads(reply.body))
    except Exception as e:
        print("error", e)
def test_rpc_heart():
    """Send one sample heartbeat to the ``HeartBeat`` RPC and print the reply.

    Connects to ``192.168.137.200`` on the configured ``master_port``,
    sends a fixed machine description with a random machine id suffix and
    prints the JSON-decoded response body.  Any exception is caught and
    printed.
    """
    try:
        target = "192.168.137.200:{}".format(CONFIG.get("master_port"))
        with grpc.insecure_channel(target) as channel:
            stub = sch_pb2_grpc.SkylarkStub(channel=channel)
            machine_id = "_001|_{:03}".format(random.randint(0, 1000))
            payload = {
                "machineID": machine_id,
                "clusterID": "_003",
                "ctrlAvailable": "N",
                "taskType": "start",
                "gpus": [
                    {"model": "GeForce GTX 1070 Ti", "id": 1, "status": None},
                    {"model": "GeForce GTX 1080 Ti", "id": 0, "status": None},
                    {"model": "GeForce GTX 1080 Ti", "id": 1, "status": None},
                ],
                "callbackAddress": "61.142.213.210:22004",
                "intranetAddress": "192.168.137.4:28801",
                "geoCode": "0757-1",
                "status": 2,
            }
            request = sch_pb2.Proto(
                version=1,
                seq=1,
                timestamp=int(time.time()),
                body=json.dumps(payload).encode(),
            )
            reply = stub.HeartBeat(request, timeout=5)
            print(json.loads(reply.body))
    except Exception as e:
        print("error", e)
from datetime import datetime
import re
import sqlalchemy
from flask_sqlalchemy import SQLAlchemy

from app import app
from utils import HTTPRequestError
from conf import CONFIG

# Point Flask-SQLAlchemy at the database URL supplied by the project config
# and switch off modification tracking.
app.config['SQLALCHEMY_DATABASE_URI'] = CONFIG.get_db_url()
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# Shared SQLAlchemy handle bound to the Flask application.
db = SQLAlchemy(app)


class DeviceTemplate(db.Model):
    """ORM model for a device template, stored in the ``templates`` table."""

    __tablename__ = 'templates'

    # Integer primary key associated with the 'template_id' sequence.
    id = db.Column(db.Integer, db.Sequence('template_id'), primary_key=True)
    # Required label, at most 128 characters.
    label = db.Column(db.String(128), nullable=False)
    # Set to the current local time when the row is inserted.
    created = db.Column(db.DateTime, default=datetime.now)
    # Refreshed on every update; no insert default is declared for it.
    updated = db.Column(db.DateTime, onupdate=datetime.now)

    # Attributes of this template: loaded with a join alongside the template
    # and deleted together with it (cascade="delete").
    attrs = db.relationship("DeviceAttr",
                            back_populates="template",
                            lazy='joined',
                            cascade="delete")
    # Many-to-many link to devices through the 'device_template' table.
    devices = db.relationship("Device",
                              secondary='device_template',
                              back_populates="templates")

    def __repr__(self):
        """Return a short debug representation showing the label."""
        return "<Template(label='%s')>" % self.label
#!/usr/bin/env python3 # from conf import CONFIG import psycopg2 from bs4 import BeautifulSoup DBCONF = CONFIG().CONF_DB DBCONN = { 'host': DBCONF['host'], 'database': DBCONF['database_example'], 'user': DBCONF['user'], 'password': DBCONF['password'] } COUNT = 3857 def meta_parser(entry, tag): tag_soup = BeautifulSoup(str(entry), 'lxml') return [ts.string for ts in tag_soup.find_all(tag)] def tag_parser(td_list, tag): t_list = list() for td in td_list[1:]: tag_soup = BeautifulSoup(str(td), 'lxml').find_all(tag) tag_list = [ td.attrs['title'] if 'title' in td.attrs.keys() else td.string for td in tag_soup ]
"N",
    # Tail of a dict literal whose opening lies outside this view.
    "gpus": [{
        "model": "GeForce GTX 1070 Ti",
        "id": 2,
        "status": None
    }, {
        "model": "GeForce GTX 1080 Ti",
        "id": 0,
        "status": None
    }, {
        "model": "GeForce GTX 1080 Ti",
        "id": 3,
        "status": None
    }],
}
# Module-level client built from the configured "mongo_center" host.
mongo_host = CONFIG.get("mongo_center")
cli = Mongo(host=mongo_host)
# cli.add_data(a)
task_id = None
# GPU model names used by this module.
GPU_1 = "GeForce GTX 1070 Ti"
GPU_2 = "GeForce GTX 1080 Ti"


def use_gpu_by_model():
    """Print the documents matching a fixed rule, then call ``update_one``.

    The rule selects documents with ``ctrlAvailable == 'N'`` and a
    ``gpus.status`` of ``"adf"``.
    """
    rule = {'ctrlAvailable': 'N', "gpus.status": "adf"}
    print(list(cli.find_data(rule)))
    # NOTE(review): ``{""}`` is a set literal, not a dict.  If ``cli.table``
    # is a pymongo collection (presumably -- confirm), ``update_one`` expects
    # a mapping as filter and an update document using operators such as
    # ``$set``, so this call looks like unfinished scratch code.
    cli.table.update_one({""}, rule)