import os
import pickle
import random
from datetime import datetime

import numpy as np
import torch

# ValueNet, data, and the path constants (predicatesEncodeDictPath,
# shortToLongPath, queryEncodedDictPath) are assumed to be provided by other
# modules of this project.


class supervised:
    def __init__(self, args):
        # Read the predicatesEncoded dict
        f = open(predicatesEncodeDictPath, 'r')
        a = f.read()
        self.predicatesEncodeDict = eval(a)
        f.close()

        # Read all table names and build the table-name -> number mapping
        tables = []
        f = open(shortToLongPath, 'r')
        a = f.read()
        short_to_long = eval(a)
        f.close()
        for i in short_to_long.keys():
            tables.append(i)
        tables.sort()
        self.table_to_int = {}
        for i in range(len(tables)):
            self.table_to_int[tables[i]] = i

        # The dimension of the network input vector
        self.num_inputs = len(tables) * len(tables) + len(self.predicatesEncodeDict["1a"])
        # The dimension of the vector output by the network
        self.num_output = 5
        self.args = args
        self.right = 0

        # Build up the network
        self.value_net = ValueNet(self.num_inputs, self.num_output)

        # Check that the save directory exists
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        self.dataList = []
        self.testList = []

    # Parse a query plan (join hint) into an encoding matrix
    def hint2matrix(self, hint):
        tablesInQuery = hint.split(" ")
        matrix = np.mat(np.zeros((len(self.table_to_int), len(self.table_to_int))))
        stack = []
        difference = 0
        for i in tablesInQuery:
            if i == ')':
                tempb = stack.pop()
                tempa = stack.pop()
                _ = stack.pop()  # pop the matching '('
                b = tempb.split('+')
                a = tempa.split('+')
                b.sort()
                a.sort()
                indexb = self.table_to_int[b[0]]
                indexa = self.table_to_int[a[0]]
                matrix[indexa, indexb] = (len(tablesInQuery) + 2) / 3 - difference
                difference += 1
                stack.append(tempa + '+' + tempb)
                # print(stack)
            else:
                stack.append(i)
        return matrix

    # Divide the data into a training set and a test set
    def pretreatment(self, path):
        # Load all data, then randomly sample the test split
        file_test = open(path)
        line = file_test.readline()
        while line:
            queryName = line.split(",")[0].encode('utf-8').decode('utf-8-sig').strip()
            hint = line.split(",")[1]
            matrix = self.hint2matrix(hint)
            predicatesEncode = self.predicatesEncodeDict[queryName]
            state = matrix.flatten().tolist()[0]
            state = state + predicatesEncode
            runtime = line.split(",")[2].strip()
            if runtime == 'timeout':
                # Penalty value for timed-out queries; depends on your settings
                # (the later variant uses 300000 ms, i.e. 5 minutes)
                runtime = ...
            else:
                runtime = int(float(runtime))
            temp = data(state, runtime)
            self.dataList.append(temp)
            line = file_test.readline()
        # Sort by runtime and assign each query to one of num_output buckets
        self.dataList.sort(key=lambda x: x.time, reverse=False)
        for i in range(self.dataList.__len__()):
            self.dataList[i].label = int(i / (self.dataList.__len__() / self.num_output + 1))
            # print(self.dataList[i].label)
        # Move a random 30% of the data into the test set
        for i in range(int(self.dataList.__len__() * 0.3)):
            index = random.randint(0, len(self.dataList) - 1)
            temp = self.dataList.pop(index)
            self.testList.append(temp)
        print("size of test set:", len(self.testList), "\tsize of train set:", len(self.dataList))
        testpath = "./data/testdata.sql"
        file_test = open(testpath, 'wb')
        pickle.dump(len(self.testList), file_test)
        for value in self.testList:
            pickle.dump(value, file_test)
        file_test.close()
        trainpath = "./data/traindata.sql"
        file_train = open(trainpath, 'wb')
        pickle.dump(len(self.dataList), file_train)
        for value in self.dataList:
            pickle.dump(value, file_train)
        file_train.close()

    # Train the network
    def supervised(self):
        self.load_data()
        optim = torch.optim.SGD(self.value_net.parameters(), lr=0.01)
        # loss_func = torch.nn.MSELoss()
        # loss_func = torch.nn.CrossEntropyLoss()
        loss_func = torch.nn.NLLLoss()
        loss1000 = 0
        count = 0
        for step in range(1, 16000001):
            index = random.randint(0, len(self.dataList) - 1)
            state = self.dataList[index].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            # NLLLoss expects log-probabilities, so take the log of the network output
            predictionRuntime = torch.log(self.value_net(state_tensor) + 1e-10)
            predictionRuntime = predictionRuntime.view(1, -1)
            label = []
            label.append(self.dataList[index].label)
            label_tensor = torch.tensor(label)
            loss = loss_func(predictionRuntime, label_tensor)
            optim.zero_grad()
            loss.backward()
            optim.step()
            loss1000 += loss.item()
            if step % 1000 == 0:
                print('[{}] Epoch: {}, Loss: {:.5f}'.format(datetime.now(), step, loss1000))
                loss1000 = 0
                self.test_network()
                print('[{}] Epoch: {}, Loss: {:.5f}'.format(datetime.now(), step, loss1000))
            if step % 200000 == 0:
                torch.save(self.value_net.state_dict(), self.args.save_dir + 'supervised.pt')
                self.test_network()

    # Test the network
    def test_network(self):
        self.load_data()
        model_path = self.args.save_dir + 'supervised.pt'
        self.value_net.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
        self.value_net.eval()
        # Accuracy on the test set
        correct = 0
        for step in range(self.testList.__len__()):
            state = self.testList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.value_net(state_tensor)
            prediction = predictionRuntime.detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            label = self.testList[step].label
            # print(maxindex, "\t", label)
            if maxindex == label:
                correct += 1
        print(correct, self.testList.__len__(), correct / self.testList.__len__(), end=' ')
        # Accuracy on the training set
        correct1 = 0
        for step in range(self.dataList.__len__()):
            state = self.dataList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.value_net(state_tensor)
            # prediction = predictionRuntime.detach().cpu().numpy()[0]
            prediction = predictionRuntime.detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            label = self.dataList[step].label
            # print(maxindex, "\t", label)
            if maxindex == label:
                correct1 += 1
        print(correct1, self.dataList.__len__(), correct1 / self.dataList.__len__())
        self.right = correct / self.testList.__len__()

    def load_data(self):
        if self.dataList.__len__() != 0:
            return
        testpath = "./data/testdata.sql"
        file_test = open(testpath, 'rb')
        l = pickle.load(file_test)
        for _ in range(l):
            self.testList.append(pickle.load(file_test))
        file_test.close()
        trainpath = "./data/traindata.sql"
        file_train = open(trainpath, 'rb')
        l = pickle.load(file_train)
        for _ in range(l):
            self.dataList.append(pickle.load(file_train))
        file_train.close()
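
# Note: the `data` record type used above is defined elsewhere in this project
# and its real definition is not shown here. A minimal sketch that is
# consistent with how this first variant uses it (constructed as
# data(state, runtime), sorted by .time, and given a .label bucket during
# pretreatment) might look like the following; it is an illustration, not the
# project's actual class.
class data:
    def __init__(self, state, time):
        self.state = state  # flattened plan matrix + predicate encoding
        self.time = time    # measured runtime in ms (or the timeout penalty)
        self.label = 0      # runtime bucket in [0, num_output), assigned later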
class supervised:
    def __init__(self, args):
        self.num_inputs = 856  # Dimension of the prediction network's input vector
        self.num_output = 5    # Dimension of the vector output by the network
        self.args = args
        self.right = 0
        self.dataList = []
        self.testList = []

        # Build up the network
        self.actor_net = ValueNet(self.num_inputs, self.num_output)
        if self.args.cuda:
            self.actor_net.cuda()

        # Check that the save directory exists
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # Read the predicatesEncoded dict
        f = open(predicatesEncodeDictPath, 'r')
        a = f.read()
        self.predicatesEncodeDict = eval(a)
        f.close()

        # Read all table names and build the table-name -> number mapping
        tables = []
        f = open(shortToLongPath, 'r')
        a = f.read()
        short_to_long = eval(a)
        f.close()
        for i in short_to_long.keys():
            tables.append(i)
        tables.sort()
        self.table_to_int = {}
        for i in range(len(tables)):
            self.table_to_int[tables[i]] = i

    def load_data(self):
        if self.dataList.__len__() != 0:
            return
        testpath = "./data/testdata.sql"
        file_test = open(testpath, 'rb')
        l = pickle.load(file_test)
        for _ in range(l):
            self.testList.append(pickle.load(file_test))
        file_test.close()
        trainpath = "./data/traindata.sql"
        file_train = open(trainpath, 'rb')
        l = pickle.load(file_train)
        for _ in range(l):
            self.dataList.append(pickle.load(file_train))
        file_train.close()

    def print_data(self):
        for i in range(len(self.dataList)):
            print(self.dataList[i].state)

    # Parse the query plan
    def hint2matrix(self, hint):
        tablesInQuery = hint.split(" ")
        # Encode the query plan
        matrix = np.mat(np.zeros((28, 28)))
        stack = []
        step = 0
        # In the hint format, every join result is surrounded by a pair of
        # parentheses, while the initial operands (single tables) are not,
        # so the number of tables can be computed with the formula below.
        num_table = (len(tablesInQuery) + 2) / 3
        for i in tablesInQuery:
            if i == ')':
                tempb = stack.pop()
                tempa = stack.pop()
                # Pop the matching '('
                _ = stack.pop()
                # Split to recover the tables that have already been joined
                b = tempb.split('+')
                a = tempa.split('+')
                # After sorting, use the table with the smallest number as the representative
                b.sort()
                a.sort()
                indexb = self.table_to_int[b[0]]
                indexa = self.table_to_int[a[0]]
                # Record the order of this join
                matrix[indexa, indexb] = num_table - step
                step += 1
                # Use '+' to mark tables that have already been joined
                stack.append(tempa + '+' + tempb)
            else:
                stack.append(i)
        return matrix

    def pretreatment(self, path):
        # Load all data, then randomly sample the training/test split
        file_test = open(path)
        line = file_test.readline()
        while line:
            # Parse one training record
            queryName = line.split(",")[0].encode('utf-8').decode(
                'utf-8-sig').strip()
            hint = line.split(",")[1]
            matrix = self.hint2matrix(hint)
            predicatesEncode = self.predicatesEncodeDict[queryName]
            state = matrix.flatten().tolist()[0]
            state = predicatesEncode + state
            runtime = line.split(",")[2].strip()
            if runtime == 'timeout':
                # 5 min = 300 s = 300,000 ms
                runtime = 300000
            else:
                runtime = int(float(runtime))
            temp = data(state, runtime)
            self.dataList.append(temp)
            line = file_test.readline()
        self.dataList.sort(key=lambda x: x.time, reverse=False)
        for i in range(self.dataList.__len__()):
            self.dataList[i].label = int(
                i / (self.dataList.__len__() / self.num_output + 1))
            print(self.dataList[i].label)
        for i in range(int(self.dataList.__len__() * 0.3)):
            index = random.randint(0, len(self.dataList) - 1)
            temp = self.dataList.pop(index)
            self.testList.append(temp)
        print("size of test set:", len(self.testList),
              "\tsize of train set:", len(self.dataList))
        testpath = "./data/testdata.sql"
        file_test = open(testpath, 'wb')
        pickle.dump(len(self.testList), file_test)
        for value in self.testList:
            pickle.dump(value, file_test)
        file_test.close()
        trainpath = "./data/traindata.sql"
        file_train = open(trainpath, 'wb')
        pickle.dump(len(self.dataList), file_train)
        for value in self.dataList:
            pickle.dump(value, file_train)
        file_train.close()

    def supervised(self):
        self.load_data()
        optim = torch.optim.SGD(self.actor_net.parameters(),
                                lr=self.args.critic_lr)
        # loss_func = torch.nn.CrossEntropyLoss()
        # loss_func = torch.nn.BCEWithLogitsLoss()
        # loss_func = torch.nn.MSELoss()
        loss_func = torch.nn.NLLLoss()
        loss1000 = 0
        count = 0
        for step in range(1, 1600001):
            index = random.randint(0, len(self.dataList) - 1)
            state = self.dataList[index].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.actor_net(state_tensor)  # network prediction
            # (NLLLoss expects log-probabilities; the first variant applies
            # torch.log to the network output before the loss.)
            # temp = [0 for i in range(self.num_output)]
            # temp[self.dataList[index].label] = 1
            # label_tensor = torch.tensor(temp, dtype=torch.float32)
            # Target label
            temp = [self.dataList[index].label]
            label_tensor = torch.tensor(temp, dtype=torch.long)
            loss = loss_func(predictionRuntime.view(1, 5), label_tensor)
            optim.zero_grad()  # clear gradients
            loss.backward()    # compute gradients
            optim.step()       # apply gradients and update the parameters
            loss1000 += loss.item()
            if step % 100000 == 0:
                # Save the current model every 100,000 training steps
                torch.save(
                    self.actor_net.state_dict(),
                    self.args.save_dir +
                    'supervised{:d}-{:.5f}.pt'.format(count, loss1000))
                count = count + 1
                # self.test_network()
            if step % 1000 == 0:
                # Every 1,000 steps, print the total loss over the last 1,000 steps
                print('[{}] Epoch: {}, Loss: {:.5f}'.format(
                    datetime.now(), step, loss1000))
                loss1000 = 0

    # Test the network
    def test_network(self):
        self.load_data()
        model_path = self.args.save_dir + 'supervised15-569.90684.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # Accuracy on the test set
        correct = 0
        for step in range(self.testList.__len__()):
            state = self.testList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.actor_net(state_tensor)
            prediction = predictionRuntime.detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            label = self.testList[step].label
            if maxindex == label:
                correct += 1
        print(correct, self.testList.__len__(),
              correct / self.testList.__len__(), end=' ')
        # Accuracy on the training set
        correct1 = 0
        for step in range(self.dataList.__len__()):
            state = self.dataList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.actor_net(state_tensor)
            prediction = predictionRuntime.detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            label = self.dataList[step].label
            if maxindex == label:
                correct1 += 1
        print(correct1, self.dataList.__len__(),
              correct1 / self.dataList.__len__())
        self.right = correct / self.testList.__len__()

    # Predict the runtime bucket of a single (query, hint) pair
    def test_hintcost(self, queryName, hint):
        model_path = self.args.save_dir + 'supervised.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        matrix = self.hint2matrix(hint)
        predicatesEncode = self.predicatesEncodeDict[queryName]
        state = matrix.flatten().tolist()[0]
        state = predicatesEncode + state
        state_tensor = torch.tensor(state, dtype=torch.float32)
        predictionRuntime = self.actor_net(state_tensor)
        prediction = predictionRuntime.detach().cpu().numpy()
        maxindex = np.argmax(prediction)
        print(maxindex)
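
# Note: ValueNet is defined elsewhere in this project and its real
# architecture is not shown here. The first variant takes torch.log of its
# output before NLLLoss and the third variant trains it with MSELoss against a
# one-hot target, which suggests a final softmax over num_output classes. A
# minimal sketch under that assumption (the layer sizes are illustrative only):
class ValueNet(torch.nn.Module):
    def __init__(self, num_inputs, num_output):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(num_inputs, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, num_output),
            torch.nn.Softmax(dim=-1),  # output probabilities; callers take log / argmax
        )

    def forward(self, x):
        return self.net(x)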
class supervised:
    def __init__(self, args):
        self.num_inputs = 856  # Dimension of the prediction network's input vector
        self.num_output = 2    # Dimension of the vector output by the network
        self.args = args

        # Build up the network
        self.actor_net = ValueNet(self.num_inputs, self.num_output)
        self.actor_net.apply(self.init_weights)
        if self.args.cuda:
            self.actor_net.cuda()

        # Check that the save directory exists
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # Read the dict of encoded queries
        f = open(queryEncodedDictPath, 'r')
        a = f.read()
        self.queryEncodedDict = eval(a)
        f.close()

        # Read all table names and build the table-name -> number mapping
        tables = []
        f = open(shortToLongPath, 'r')
        a = f.read()
        short_to_long = eval(a)
        f.close()
        for i in short_to_long.keys():
            tables.append(i)
        tables.sort()
        self.table_to_int = {}
        for i in range(len(tables)):
            self.table_to_int[tables[i]] = i

        self.datasetnumber = 10
        self.trainList = []
        self.testList = []

    def pretreatment(self, path):
        print("Pretreatment running...")
        # Load all data, then split it into datasetnumber folds at random
        file_test = open(path)
        line = file_test.readline()
        dataList = []
        while line:
            # Parse one record
            queryName = line.split(",")[0].encode('utf-8').decode(
                'utf-8-sig').strip()
            state = self.queryEncodedDict[queryName]
            origintime = int(float(line.split(",")[1].strip()))
            neotime = int(float(line.split(",")[2].strip()))
            qpopttime = int(float(line.split(",")[3].strip()))
            label = int(line.split(",")[4].strip())
            temp = data(queryName, state, origintime, neotime, qpopttime, label)
            dataList.append(temp)
            line = file_test.readline()
        random.shuffle(dataList)
        listtemp = []
        for i in range(self.datasetnumber):
            temptemp = []
            listtemp.append(temptemp)
        for i in range(dataList.__len__()):
            listtemp[i % listtemp.__len__()].append(dataList[i])
        for i in range(listtemp.__len__()):
            filepath = "./data/data" + str(i) + ".sql"
            file = open(filepath, 'wb')
            pickle.dump(len(listtemp[i]), file)
            for value in listtemp[i]:
                pickle.dump(value, file)
            file.close()
        print("Pretreatment data done.")

    def printdata(self):
        for i in self.trainList:
            print(i)

    def supervised(self):
        # model_path = self.args.save_dir + 'supervised.pt'
        # self.actor_net.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
        # self.actor_net.eval()
        self.load_data()
        optim = torch.optim.SGD(self.actor_net.parameters(), lr=0.0005)
        loss_func = torch.nn.MSELoss()
        loss1000 = 0
        count = 0
        # starttime = datetime.now()
        for step in range(1, 300001):
            index = random.randint(0, len(self.trainList) - 1)
            state = self.trainList[index].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.actor_net(state_tensor)
            # One-hot target
            label = [0 for _ in range(self.num_output)]
            label[self.trainList[index].label] = 1
            label_tensor = torch.tensor(label, dtype=torch.float32)
            loss = loss_func(predictionRuntime, label_tensor)
            optim.zero_grad()  # clear gradients
            loss.backward()    # compute gradients
            optim.step()       # apply gradients and update the parameters
            loss1000 += loss.item()
            if step % 1000 == 0:
                print('[{}] Epoch: {}, Loss: {:.5f}'.format(
                    datetime.now(), step, loss1000))
                loss1000 = 0
            # if step % 2000000 == 0:
            #     torch.save(self.actor_net.state_dict(), self.args.save_dir + 'supervised.pt')
            #     self.test_network()
        torch.save(self.actor_net.state_dict(),
                   self.args.save_dir + 'supervised.pt')

    # Test the network
    def test_network(self):
        self.load_data()
        model_path = self.args.save_dir + 'supervised.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # Test set
        correct = 0
        for step in range(self.testList.__len__()):
            state = self.testList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            prediction = self.actor_net(state_tensor).detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            print(self.testList[step].queryname, ",",
                  self.testList[step].origintime, ",",
                  self.testList[step].neotime, ",",
                  self.testList[step].qpopttime, ",",
                  self.testList[step].label, ",", maxindex)
            if maxindex == self.testList[step].label:
                correct += 1
        print("test set:", correct, "\t", self.testList.__len__())
        # Training set
        correct1 = 0
        for step in range(self.trainList.__len__()):
            state = self.trainList[step].state
            state_tensor = torch.tensor(state, dtype=torch.float32)
            predictionRuntime = self.actor_net(state_tensor)
            prediction = predictionRuntime.detach().cpu().numpy()
            maxindex = np.argmax(prediction)
            label = self.trainList[step].label
            # print(self.trainList[step].queryname.strip(), "\t", label, "\t", maxindex)
            if maxindex == label:
                correct1 += 1
        print("train set:", correct1, "\t", self.trainList.__len__())

    def load_data(self, testnum=0):
        if self.trainList.__len__() != 0:
            return
        # Fold `testnum` becomes the test set, the remaining folds the training set
        testpath = "./data/data" + str(testnum) + ".sql"
        file_test = open(testpath, 'rb')
        l = pickle.load(file_test)
        for _ in range(l):
            self.testList.append(pickle.load(file_test))
        file_test.close()
        for i in range(self.datasetnumber):
            if i == testnum:
                continue
            trainpath = "./data/data" + str(i) + ".sql"
            file_train = open(trainpath, 'rb')
            l = pickle.load(file_train)
            for _ in range(l):
                self.trainList.append(pickle.load(file_train))
            file_train.close()
        print("load data\ttrainSet:", self.trainList.__len__(),
              " testSet:", self.testList.__len__())

    # Xavier-initialize all linear layers
    def init_weights(self, m):
        if type(m) == torch.nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
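
# A minimal driver sketch for this third variant. The argument object below is
# hypothetical (only save_dir and cuda are read by this class); the real
# project presumably builds it with argparse. The CSV consumed by pretreatment
# is assumed to have the format "queryName,origintime,neotime,qpopttime,label",
# matching how the lines are split above, and './data/runtimes.csv' is a
# placeholder path.
from types import SimpleNamespace

if __name__ == '__main__':
    args = SimpleNamespace(save_dir='./model/', cuda=False)  # hypothetical args
    trainer = supervised(args)
    trainer.pretreatment('./data/runtimes.csv')  # writes ./data/data0.sql .. data9.sql
    trainer.load_data(testnum=0)  # fold 0 as the test set, folds 1-9 for training
    trainer.supervised()          # train and save save_dir + 'supervised.pt'
    trainer.test_network()        # report accuracy on the test and training sets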