def merge(ecs_logs,flavors_config,flavors_unique,training_start_time,training_end_time,predict_start_time,predict_end_time):
    """Train one ridge model per flavor and predict next-window request counts.

    Builds features via features_building(...), holds out the last training
    row of every flavor as a validation sample for early stopping of the
    alpha search, prints train/validation scores, and returns
    (predict, virtual_machine_sum): a {flavor: count} dict of non-negative
    rounded predictions plus their total.
    """
    predict = {}.fromkeys(flavors_unique)
    for f in flavors_unique:
        predict[f] = 0
    virtual_machine_sum = 0
    mapping_index = get_flavors_unique_mapping(flavors_unique)
    R = []
    X_trainS_raw,Y_trainS_raw,X_testS = features_building(ecs_logs,flavors_config,flavors_unique,training_start_time,training_end_time,predict_start_time,predict_end_time)
    # Split off the last row of each flavor's data as a validation sample.
    X_trainS = fancy(X_trainS_raw,None,(0,-1),None)
    Y_trainS = fancy(Y_trainS_raw,None,(0,-1))
    X_valS = fancy(X_trainS_raw,None,(-1,),None)
    Y_valS = fancy(Y_trainS_raw,None,(-1,))
    # adjustable #5: ridge regression alpha
    clf = Ridge(alpha=1)  # NOTE(review): overwritten inside the loop below
    # NOTE(review): imported but only referenced by disabled experiments.
    from model_selection import grid_search_cv_early_stoping
    test = []
    train = []
    val = []
    for i in range(len(flavors_unique)):
        X = X_trainS[i]
        y = Y_trainS[i]
        # Pick alpha by early stopping on the validation sample, scanning
        # the candidate alphas from largest to smallest.
        clf = early_stoping(Ridge,{"alpha":sorted([0.01,0.02,0.1,0.4,0.7,1,1.5,2])[::-1]},X,y,X_valS[i],Y_valS[i],verbose=False)
        train.append(clf.predict(X))
        val.append(clf.predict(X_valS[i]))
        test.append(clf.predict(X_testS[i]))
    # Transpose from per-flavor rows to per-sample rows before scoring.
    train = matrix_transpose(train)
    Y_trainS = matrix_transpose(Y_trainS)
    R.extend(test)
    print("training_score-->",official_score(train,Y_trainS))
    val = matrix_transpose(val)
    Y_valS = matrix_transpose(Y_valS)
    print("validation_score-->",official_score(val,Y_valS))
    result = flatten(R)
    # Clip negative predictions to zero before rounding.
    result = [0 if r<0 else r for r in result]
    for f in flavors_unique:
        p = result[mapping_index[f]]
        predict[f] = int(round(p))
        virtual_machine_sum += int(round(p))
    return predict,virtual_machine_sum
def train_cv(self, X, y, shuffle=False, cv='full'):
    """Fit one estimator per output column of y, keeping the best of
    self.max_iter random column-subset trials ranked by cross-validation.

    For every target column i: repeatedly draw a random column mask via
    self._rand_X, fit a fresh self.estimator on the reduced X, score it with
    cross_val_score, and append the winning (mask, estimator) pair to
    self.keeps / self.clfs.

    Parameters: X, y are 2-D (project matrices); shuffle/cv are forwarded to
    cross_val_score; cv is an int fold count or 'full'.
    """
    assert (type(cv) == int or cv == 'full')
    assert (dim(X) == 2 and dim(y) == 2)
    self.shape_Y = shape(y)
    for i in range(shape(y)[1]):
        max_score = None
        best_clf = None
        best_keep = None
        y_ = fancy(y, -1, i)
        for _ in range(self.max_iter):
            clf = self.estimator(**(self.parameter))
            X_, keep = self._rand_X(X)
            clf.fit(X_, y_)
            # NOTE(review): the full X is passed here although clf was fitted
            # on the column-reduced X_ — presumably cross_val_score refits
            # internally; confirm.
            score = cross_val_score(clf, X, y, return_mean=True, cv=cv, shuffle=shuffle)
            # BUG FIX: `if not max_score` re-selected whenever the best score
            # so far was exactly 0, discarding a legitimate winner.
            if max_score is None or max_score < score:
                max_score = score
                best_clf = clf
                best_keep = keep
        self.keeps.append(best_keep)
        self.clfs.append(best_clf)
def get_feature_grid(sample,i,fill_na='mean',max_na_rate=1,col_count=None,with_test=True):
    """Build a staircase of lagged values for column i of `sample`.

    Row j holds the first j historic values right-aligned and padded with
    None on the left; the padding is then filled with the column mean (or
    zero). `with_test` keeps the final row as the test sample; `col_count`
    keeps only the last col_count feature columns.
    """
    assert(fill_na=='mean' or fill_na=='zero')
    col = fancy(sample,None,i)
    n = len(col)
    # Row j: (n - j) leading Nones followed by the first j observations.
    grid = [[None] * (n - j) + col[:j] for j in range(n)]

    def _column_mean(values):
        # Mean over the *full* column length; None entries contribute 0.
        if not values:
            return 0
        total = 0
        for v in values:
            if v is not None:
                total += v
        return total / float(len(values))

    means = [_column_mean(fancy(grid, None, j)) for j in range(shape(grid)[1])]
    # NOTE(review): means are computed before trimming but indexed after it;
    # with max_na_rate < 1 the fill values would come from the wrong columns.
    # Harmless at the default max_na_rate=1 (width == 0).
    width = int((1 - max_na_rate) * shape(grid)[1])
    grid = fancy(grid, None, (width,))
    for row in range(shape(grid)[0]):
        for j in range(shape(grid)[1]):
            if grid[row][j] is None:
                grid[row][j] = means[j] if fill_na == 'mean' else 0
    if col_count is not None:
        if with_test:
            return fancy(grid, None, (-col_count,))
        return fancy(grid, (0, -1), (-col_count,))
    return grid if with_test else grid[:-1]
def _rand_X(self, X):
    """Randomly keep a fraction (1 - self.drop_out) of the columns of X.

    Returns (X_, keep): the column-reduced matrix and the boolean mask that
    produced it (True = column kept).
    """
    n_cols = shape(X)[1]
    keep_length = int(math.ceil((1 - self.drop_out) * n_cols))
    # Draw all kept indices in one call instead of rejection-sampling one
    # index at a time in a while-loop (same uniform subset distribution).
    keep_set = set(random.sample(range(n_cols), keep_length))
    keep = [i in keep_set for i in range(n_cols)]
    X_ = fancy(X, -1, keep)
    return X_, keep
def minmax_scaling(X, axis=1):
    """Scale every column of X to (x - mean) / (max - min); constant columns
    are returned unchanged.

    NOTE(review): despite the name this is mean-normalisation — the
    numerator subtracts the column mean, not the column minimum.
    Only axis == 1 (column-wise) is supported.
    """
    assert (axis == 1)
    scaled_cols = []
    for j in range(shape(X)[1]):
        col = fancy(X, None, j)
        center = mean(col)
        spread = max(col) - min(col)
        if spread == 0:
            # Zero range: scaling would divide by zero, keep as-is.
            scaled_cols.append(col)
        else:
            scaled_cols.append([(v - center) / spread for v in col])
    return matrix_transpose(scaled_cols)
def train(self, X, y, X_val, Y_val):
    """Fit one estimator per output column of y, keeping the best of
    self.max_iter random column-subset trials ranked on (X_val, Y_val).

    For each target column i: draw a random column mask via self._rand_X,
    fit a fresh self.estimator on the reduced X, score it on the equally
    reduced validation data, and append the winning (mask, estimator) pair
    to self.keeps / self.clfs.
    """
    assert (dim(X) == 2 and dim(y) == 2)
    self.shape_Y = shape(y)
    for i in range(shape(y)[1]):
        max_score = None
        best_clf = None
        best_keep = None
        y_ = fancy(y, -1, i)
        for _ in range(self.max_iter):
            clf = self.estimator(**(self.parameter))
            X_, keep = self._rand_X(X)
            clf.fit(X_, y_)
            score = clf.score(self._get_keep_X(X_val, keep), fancy(Y_val, -1, i))
            # BUG FIX: `if not max_score` treated a best score of exactly 0
            # as "no score yet" and overwrote a legitimate winner.
            if max_score is None or max_score < score:
                max_score = score
                best_clf = clf
                best_keep = keep
        self.keeps.append(best_keep)
        self.clfs.append(best_clf)
def maxabs_scaling(X, y=None, axis=1):
    """Scale every column of X to (x - mean) / max(|x|); when y is given the
    result is additionally multiplied by max(y).

    Columns whose absolute maximum is 0 are returned unchanged. Only
    axis == 1 (column-wise) is supported.
    """
    assert (axis == 1)
    cols = []
    for j in range(shape(X)[1]):
        col = fancy(X, None, j)
        center = mean(col)
        peak = max(abs(col))  # `abs` here is the project's elementwise abs
        if peak == 0:
            # All-zero column: nothing to scale.
            cols.append(col)
        elif not y:
            cols.append([(v - center) / peak for v in col])
        else:
            cols.append([(v - center) * max(y) / peak for v in col])
    return matrix_transpose(cols)
def standard_scaling(X, y=None, axis=1):
    """Standardise every column of X to (x - mean) / std.

    When y is given, each column is additionally multiplied by std(y)
    (the original behaviour). Columns with zero std are returned unchanged.
    axis == 0 transposes, scales column-wise, and transposes back.
    """
    if axis == 0:
        # BUG FIX: y was silently dropped in the recursive call.
        return matrix_transpose(standard_scaling(matrix_transpose(X), y=y, axis=1))
    # BUG FIX: std_y was only assigned when y was given, so calling with
    # y=None raised NameError on any non-constant column. A neutral factor
    # of 1.0 gives plain (x - mean) / std scaling in that case. Also hoisted
    # out of the loop: std(y) is loop-invariant.
    std_y = 1.0 if y is None else sqrt(mean(square(minus(y, mean(y)))))
    R = []
    for j in range(shape(X)[1]):
        col = fancy(X, None, j)
        mean_ = mean(col)
        std = sqrt(mean(square(minus(col, mean_))))
        if std == 0:
            # Constant column: scaling would divide by zero.
            R.append(col)
        else:
            R.append([(x - mean_) * std_y / std for x in col])
    return matrix_transpose(R)
def retrain(self, X, y):
    """Refit every stored classifier on (X, y), re-applying the column mask
    recorded for it during the original training pass.

    Requires that train()/train_cv() has already populated self.keeps and
    self.clfs (asserted below).
    """
    assert (len(self.keeps) != 0)
    n_outputs = self.shape_Y[1]
    for idx in range(n_outputs):
        reduced = self._get_keep_X(X, self.keeps[idx])
        target = fancy(y, -1, idx)
        self.clfs[idx].fit(reduced, target)
def _get_keep_X(self, X, keep):
    """Select the columns of X flagged True in the boolean mask `keep`."""
    masked = fancy(X, -1, keep)
    return masked
def features_building(ecs_logs,flavors_config,flavors_unique,training_start_time,training_end_time,predict_start_time,predict_end_time):
    """Build per-flavor training matrices, targets and a single test row.

    Pipeline as implemented below:
      1. resample the raw logs into a (window x flavor) count matrix,
      2. clamp >3-sigma outliers to the column mean,
      3. find correlated flavors so each flavor can borrow training rows
         from its correlation neighbours,
      4. per flavor: build a staircase lag-feature grid, append a log1p copy
         of the features, augment with exponentially smoothed copies,
      5. l1-normalize and split off the last row as the test sample.

    Returns (X_trainS, Y_trainS, X_test_S), three lists with one entry per
    flavor in flavors_unique order.
    """
    mapping_index = get_flavors_unique_mapping(flavors_unique)
    predict_days = (predict_end_time-predict_start_time).days
    # One sample row per prediction-sized window, sliding by one step.
    sample = resampling(ecs_logs,flavors_unique,training_start_time,predict_start_time,frequency=predict_days,strike=1,skip=0)

    def outlier_handling(sample,method='mean',max_sigma=3):
        # Clamp entries more than max_sigma column-stddevs above the column
        # mean; operates on a copy, the caller's matrix is untouched.
        assert(method=='mean' or method=='zero' or method=='dynamic')
        sample = matrix_copy(sample)
        std_ = stdev(sample)
        mean_ = mean(sample,axis=1)
        for i in range(shape(sample)[0]):
            for j in range(shape(sample)[1]):
                if sample[i][j]-mean_[j] >max_sigma*std_[j]:
                    if method=='mean':
                        sample[i][j] = mean_[j]
                    elif method=='zero':
                        sample[i][j] = 0
                    elif method=='dynamic':
                        sample[i][j] = (sample[i][j] + mean_[j])/2.0
        return sample

    sample = outlier_handling(sample,method='mean',max_sigma=3)
    # Targets are the *next* window's counts, hence the one-row shift.
    Ys = sample[1:]

    def flavor_clustering(sample,k=3,variance_threshold=None):
        # For every flavor column, list the other flavors whose correlation
        # exceeds variance_threshold (or simply the top-k when no threshold
        # is given).
        corrcoef_sample = corrcoef(sample)
        clustering_paths = []
        for i in range(shape(sample)[1]):
            col = corrcoef_sample[i]
            col_index_sorted = argsort(col)[::-1]
            if variance_threshold!=None:
                col_index_sorted = col_index_sorted[1:]  # drop self (corr == 1)
                index = [i for i in col_index_sorted if col[i]>variance_threshold]
            else:
                index = col_index_sorted[1:k+1]
            clustering_paths.append(index)
        return clustering_paths,corrcoef_sample

    # adjustable #1: correlation threshold for borrowing data (scored 76.234)
    variance_threshold = 0.6
    clustering_paths,coef_sample = flavor_clustering(sample,variance_threshold=variance_threshold)

    def get_feature_grid(sample,i,fill_na='mean',max_na_rate=1,col_count=None,with_test=True):
        # Staircase of lagged values for flavor column i: row j holds the
        # first j historic values right-aligned, padded with None, which is
        # then filled with the column mean (or zero).
        assert(fill_na=='mean' or fill_na=='zero')
        col = fancy(sample,None,i)
        R = []
        for j in range(len(col)):
            left = [None for _ in range(len(col)-j)]
            right = col[:j]
            r = []
            r.extend(left)
            r.extend(right)
            R.append(r)
        def _mean_with_none(A):
            # Mean over the full column length; None entries contribute 0.
            if len(A)==0:
                return 0
            else:
                count = 0
                for i in range(len(A)):
                    if A[i]!=None:
                        count+=A[i]
                return count/float(len(A))
        means = []
        for j in range(shape(R)[1]):
            means.append(_mean_with_none(fancy(R,None,j)))
        # NOTE(review): means are computed before trimming but indexed after
        # it; with max_na_rate < 1 fills would come from the wrong columns.
        # Harmless at the default max_na_rate=1 (width == 0, no trim).
        width = int((1-max_na_rate) * shape(R)[1])
        R = fancy(R,None,(width,))
        for _ in range(shape(R)[0]):
            for j in range(shape(R)[1]):
                if R[_][j]==None:
                    if fill_na=='mean':
                        R[_][j] = means[j]
                    elif fill_na=='zero':
                        R[_][j]=0
        # with_test keeps the final row (the test sample); col_count keeps
        # only the last col_count feature columns.
        if with_test:
            if col_count!=None:
                return fancy(R,None,(-col_count,))
            else:
                return R
        else:
            if col_count!=None:
                return fancy(R,(0,-1),(-col_count,))
            else:
                return R[:-1]

    X_trainS,Y_trainS,X_test_S = [],[],[]
    # adjustable #2: number of lag features per sample
    col_count = 5
    for f in flavors_unique:
        X = get_feature_grid(sample,mapping_index[f],col_count=col_count,fill_na='mean',max_na_rate=1,with_test=True)
        X_test = X[-1:]
        X = X[:-1]
        y = fancy(Ys,None,(mapping_index[f],mapping_index[f]+1))
        clustering = True
        # 1. data clustering: borrow rows from correlated flavors.
        if clustering:
            print(clustering_paths[mapping_index[f]])
            # Duplicate this flavor's own rows to up-weight them against
            # the borrowed data.
            X.extend(X)
            y.extend(y)
            for cluster_index in clustering_paths[mapping_index[f]]:
                # NOTE(review): the borrowed features are built from
                # mapping_index[f] (this flavor), not cluster_index, while
                # the targets come from the correlated flavor — confirm
                # this pairing is intentional.
                X_cluster = get_feature_grid(sample,mapping_index[f],col_count=col_count,fill_na='mean',max_na_rate=1,with_test=False)
                y_cluster = fancy(Ys,None,(cluster_index,cluster_index+1))
                w = coef_sample[mapping_index[f]][cluster_index]
                # important: weight borrowed rows by their correlation.
                X_cluster = apply(X_cluster,lambda x:x*w)
                y_cluster = apply(y_cluster,lambda x:x*w)
                X.extend(X_cluster)
                y.extend(y_cluster)
        # do not delete: re-append the test row so it goes through the same
        # feature transforms as the training rows.
        X.extend(X_test)
        # ---------------------------------------------------------
        add_list= [X]
        add_list.extend([apply(X,lambda x:math.log1p(x))])  # important
        X = hstack(add_list)
        # ---------------------------------------------------------
        def multi_exponential_smoothing(A,list_of_alpha):
            # Apply exponential_smoothing once per alpha, each pass fed
            # with the previous pass's output.
            R = A
            for a in list_of_alpha:
                R = exponential_smoothing(R,alpha=a)
            return R
        # adjustable #3/#4: smoothing depth and per-pass weights.
        base = [0.6,0.7,0.8]
        depth = 3
        alphas = [[ base[i] for _ in range(depth)]for i in range(len(base))]
        # Augment with smoothed copies; X[:-1] excludes the test row so only
        # the un-smoothed copy contributes a test sample.
        X_data_list = [multi_exponential_smoothing(X[:-1],a) for a in alphas]
        Y_data_list = [multi_exponential_smoothing(y,a) for a in alphas]
        X_data_list.extend([X])
        Y_data_list.extend([y])
        X = vstack(X_data_list)
        y = vstack(Y_data_list)
        # -----------------------------------------------------------#
        y = flatten(y)
        X = normalize(X,y=y,norm='l1')
        # X must have exactly one extra row: the test sample at the end.
        assert(shape(X)[0]==shape(y)[0]+1)
        X_trainS.append(X[:-1])
        X_test_S.append(X[-1:])
        Y_trainS.append(y)
    return X_trainS,Y_trainS,X_test_S
def predict_flavors(ecs_logs, flavors_config, flavors_unique, training_start, training_end, predict_start, predict_end): predict_days = (predict_end - predict_start).days #check hours = ((predict_end - predict_start).seconds / float(3600)) if hours >= 12: predict_days += 1 skip_days = (predict_start - training_end).days # print(skip_days) #checked # print(predict_days) #checked # sample = resampling(ecs_logs,flavors_unique,training_start,training_end,frequency=predict_days,strike=predict_days,skip=0) sample = resampling(ecs_logs, flavors_unique, training_start, training_end, frequency=1, strike=1, skip=0) def outlier_handling(sample, method='mean', max_sigma=3): assert (method == 'mean' or method == 'dynamic') std_ = stdev(sample) mean_ = mean(sample, axis=0) for i in range(shape(sample)[0]): for j in range(shape(sample)[1]): if sample[i][j] - mean_[j] > max_sigma * std_[j]: if method == 'mean': sample[i][j] = mean_[j] elif method == 'dynamic': if i < len(sample) / 2.0: sample[i][j] = (mean_[j] + sample[i][j]) / 2.0 return sample # sample = outlier_handling(sample,method='dynamic',max_sigma=3) # sample = outlier_handling(sample,method='mean',max_sigma=3.5) # from preprocessing import exponential_smoothing # sample = exponential_smoothing(exponential_smoothing(sample,alpha=0.2),alpha=0.2) skip_days -= 1 prediction = [] for i in range(shape(sample)[1]): clf = Ridge(alpha=1, fit_intercept=True) X = reshape(list(range(len(sample))), (-1, 1)) y = fancy(sample, None, (i, i + 1)) X_test = reshape( list(range(len(sample), len(sample) + skip_days + predict_days)), (-1, 1)) X_list = [X] X = hstack(X_list) X_test_list = [X_test] X_test = hstack(X_test_list) clf.fit(X, y) p = clf.predict(X_test) prediction.append(sum(flatten(p))) prediction = [int(round(p)) if p > 0 else 0 for p in prediction] return prediction