def waveletScaleZero(order, nr_poly, x):
    from getfilter import getfilter
    from scaling import scaling
    Sum = 0
    G0 = getfilter("G", 0, nr_poly)
    G1 = getfilter("G", 1, nr_poly)
    for i in range(nr_poly):
        # Check if this should be a plus or a minus
        wavelet = (G0[order, i] * scaling(i, 1, 0, x)) + \
                  (G1[order, i] * scaling(i, 1, 1, x))
        Sum += wavelet
    return Sum
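# Hedged usage sketch for waveletScaleZero, assuming the local getfilter and
# scaling modules are importable and that getfilter returns an
# (nr_poly x nr_poly) coefficient matrix; order=0 and nr_poly=4 are
# illustrative values, not taken from the original code:
import numpy as np
import matplotlib.pyplot as plt

xs = np.linspace(0.0, 1.0, 200)
ys = [waveletScaleZero(order=0, nr_poly=4, x=x) for x in xs]
plt.plot(xs, ys)
plt.xlabel('x')
plt.ylabel('psi(x)')
plt.show()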
def shell(self, r=100, dr=1, dust2gas=0.01, verbose=0):
    ''' Evaluate the rate of particles in a shell of radius r and
        half-width dr (r == 0 selects all particles) '''
    n = 0
    rate = 0.0
    t0 = time()
    if r == 0:
        r2min = 0.0
        r2max = 1e30
    else:
        r2min = (r - dr)**2
        r2max = (r + dr)**2
    for id in self.particles.keys():
        part = self.particles[id]
        p = part['p']
        p2 = sum(p**2)
        if p2 > r2min and p2 < r2max:
            v = part['v']
            w = part['w']
            # weighted radial velocity component; inward motion adds to the rate
            rate -= sum(p * v) / sqrt(p2) * w
            n = n + 1
    if not hasattr(self, 'gas_mass'):
        self.get_gas_mass(verbose=verbose)
    rate = rate / (2. * dr) * self.gas_mass
    self.rate = rate
    if verbose:
        units = scaling()
        rate1 = dust2gas * rate * cgs.yr / units.t * units.m / cgs.m_earth
        s = 'rate:{:9.2e} ={:9.2e} M_E/yr, based on {} particles.'.format(
            rate, rate1, n)
        s = s + ' Time used: {:.1f} sec.'.format(time() - t0)
        print(s)
def umax_rt(p, courant, cdtd):
    ''' Return the umax value.  Input is a patch, the Courant number,
        and cdtd, e.g.:
        p.umax_rt = du.umax_rt(p, courant, cdtd)
    '''
    import numpy as np
    from scaling import scaling, cgs
    sc = scaling(cgs)
    u_max = sc.stefan * np.pi * 16. / 3. * p.fmax_rt.max() * courant / cdtd
    return u_max
def fmax_rt(p, kappa):
    ''' Return a data block containing the f_max(RT) values.
        Input is a patch and kappa in cgs units, e.g.:
        p.fmax_rt = du.fmax_rt(p, kappa)
    '''
    from scaling import scaling, cgs
    sc = scaling(cgs)
    # convert kappa from cgs to code units
    k = kappa * sc.m / sc.l**2
    fmax = (p.pg / p.d)**4 / p.pg * (k * p.d * p.dx.min())
    fmax = fmax / (1. + (k * p.d * p.dx.min())**2)
    return fmax
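# Hedged usage sketch: chain the two helpers above to get the
# radiative-transfer speed limit for one patch.  `patch` stands for a
# framework patch object, and kappa_cgs, courant, and cdtd are illustrative
# values, so the calls are shown commented out:
# kappa_cgs = 0.1                               # opacity in cm^2/g
# patch.fmax_rt = fmax_rt(patch, kappa_cgs)
# u_max = umax_rt(patch, courant=0.3, cdtd=1.0)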
test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
train_drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

x_train = train.copy()
x_test = test.copy()
y_train = targets_scored.copy()

cp_features = ['cp_type', 'cp_time', 'cp_dose']
genes_features = [column for column in train.columns if 'g-' in column]
cells_features = [column for column in train.columns if 'c-' in column]

# scale the data, like RankGauss
x_train, x_test = scaling(x_train, x_test, scale=cfg_fe.scale,
                          n_quantiles=cfg_fe.scale_n_quantiles,
                          seed=cfg_fe.seed)

# fe_stats
x_train, x_test = fe_stats(x_train, x_test, genes_features, cells_features)
x_train.head()

# group the drugs using kmeans
if runty == 'traineval':
    x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                 cells_features,
                                 n_cluster_g=cfg_fe.n_clusters_g,
                                 n_cluster_c=cfg_fe.n_clusters_c,
                                 seed=cfg_fe.seed, runty=runty,
                                 path=save_path)
import pandas as pd

from regression import regression
from test_train import test_train
from scaling import scaling
from data_preprocess import data_preprocess
from predict_plot import predict_plot

df = pd.read_csv("a.us.txt")
df.set_index('Date', inplace=True)
df = data_preprocess(df)
print(df.head())

features, labels = scaling(df)
features_train, labels_train, features_test, labels_test = test_train(features, labels)
reg = regression(features_train, features_test, labels_train, labels_test)
predict_plot(reg, features, df)
geocoder_params = {
    "apikey": "40d1649f-0493-4b70-98ba-98533de7710b",
    "geocode": toponym_to_find,
    "format": "json"}
response = requests.get(geocoder_api_server, params=geocoder_params)
if not response:
    pass  # TODO: handle a failed geocoder request

json_response = response.json()
toponym = json_response["response"]["GeoObjectCollection"][
    "featureMember"][0]["GeoObject"]
toponym_coordinates = toponym["Point"]["pos"]
toponym_longitude, toponym_latitude = toponym_coordinates.split(" ")
l_c = toponym["boundedBy"]["Envelope"]["lowerCorner"]
u_c = toponym["boundedBy"]["Envelope"]["upperCorner"]

map_params = {
    "ll": ",".join([toponym_longitude, toponym_latitude]),
    "spn": scaling(u_c, l_c),
    "l": "map",
    "pt": ",".join([toponym_longitude, toponym_latitude]) + ",pm2rdm"
}
map_api_server = "http://static-maps.yandex.ru/1.x/"
response = requests.get(map_api_server, params=map_params)
Image.open(BytesIO(response.content)).show()
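# The scaling() helper used for "spn" above is not shown in this snippet.
# A minimal sketch of what it could look like, assuming the envelope
# corners are space-separated "lon lat" strings as returned by the Yandex
# geocoder, and that spn is the extent of the bounding box:
def scaling(upper_corner, lower_corner):
    lon_u, lat_u = map(float, upper_corner.split(" "))
    lon_l, lat_l = map(float, lower_corner.split(" "))
    # span as "delta_lon,delta_lat", the format the static-maps spn parameter expects
    return "{},{}".format(abs(lon_u - lon_l), abs(lat_u - lat_l))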
import pandas as pd

from scaling import scaling

data = pd.read_csv('Seals.csv')
data = scaling(data, a=(0, 8), b=(10, 12), columnIndices=(0, 1))
data.to_csv('SealsScaled.csv')
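# The scaling module is not shown here.  One plausible reading, stated as an
# assumption only: linearly map values of the selected columns from the
# source interval a to the target interval b.  A minimal sketch:
def scaling(data, a, b, columnIndices):
    lo_a, hi_a = a
    lo_b, hi_b = b
    for idx in columnIndices:
        col = data.columns[idx]
        # affine map [lo_a, hi_a] -> [lo_b, hi_b]
        data[col] = lo_b + (data[col] - lo_a) * (hi_b - lo_b) / (hi_a - lo_a)
    return data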
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--alpha", type=float, default=params.ALPHA,
                        help="Set alpha")
    parser.add_argument("-i", "--nbItr", type=int,
                        default=params.NB_ITERATIONS,
                        help="Set number of iterations")
    parser.add_argument("-f", "--feature", type=str, default=params.FEATURES,
                        help="Set feature name")
    parser.add_argument("-l", "--label", type=str, default=params.LABELS,
                        help="Set label name")
    parser.add_argument("-c", "--cost", action='store_true',
                        help="Cost visualization")
    parser.add_argument("-s", "--accuracyScore", action='store_true',
                        help="Accuracy score")
    parser.add_argument("-p", "--dataPath", type=str,
                        default=(params.DIR_PATH + params.DATA_PATH
                                 + params.DATA_NAME),
                        help="Data absolute path")
    args = parser.parse_args()
    alpha = args.alpha
    nbItr = args.nbItr
    featureName = args.feature
    labelName = args.label
    dataPath = args.dataPath
    accuracyScore = args.accuracyScore

    # get data
    try:
        data = pd.read_csv(dataPath)
    except FileNotFoundError:
        print(colors.FAIL + 'Data not found.' + colors.ENDC)
        exit(1)

    # get features and labels
    try:
        x = np.array(data[featureName])
        y = np.array(data[labelName])
    except KeyError:
        print(colors.FAIL
              + 'Wrong features or labels\nPlease use [-f/--feature] FEATURE'
              + ' and/or [-l/--label] LABEL' + colors.ENDC)
        exit(1)

    print('Parameters :')
    print('    alpha :', alpha)
    print('    iterations :', nbItr)
    print('    accuracyScore :', accuracyScore, '\n')

    # create thetas
    theta = np.array([[0], [0]], float)

    # scale input
    scaledX = scaling(x)

    # train
    theta, cost_history = fit_with_cost(scaledX, y, theta, alpha, nbItr)

    if args.cost:
        fig = plt.figure()
        ax = plt.axes()
        ax.plot(np.arange(len(cost_history)), cost_history)
        ax.set(xlabel='number of iterations', ylabel='cost', title='Cost')
        plt.show()

    # undo the feature scaling on the slope
    theta[1] = theta[1] / (np.amax(x) - np.amin(x))
    print('Thetas :', theta, '\n')

    try:
        with open(params.DIR_PATH + params.JSON_PATH + params.JSON_NAME,
                  'w') as f:
            json.dump(theta.tolist(), f)
    except FileNotFoundError:
        print(colors.FAIL + "'" + params.DIR_PATH + params.JSON_PATH + "'"
              + ' directory not found.' + colors.ENDC)
        exit(1)

    if accuracyScore:
        accuracy(data, featureName, labelName, theta)

    visualizeRegression(theta, x, y, featureName, labelName)
    print(colors.OKGREEN + 'Thetas written in:',
          params.DIR_PATH + params.JSON_PATH + params.JSON_NAME + colors.ENDC)
def ORRT_D1_lambdas(train, test, S=1000, gamma=512):
    # Scale the training subset
    (train, Mtree, mtree) = scaling(train)
    n_train = len(train)
    p = int(len(train[0]) - 1)

    # Rescale the test subset
    test = rescaling(test, Mtree, mtree)
    n_test = len(test)

    # Set the grid of values of lambda^L and lambda^G
    lambdasL = np.append(0, np.exp2(list(range(-6, 6)))) / (3 * p)
    nlambdasL = len(lambdasL)
    lambdasG = np.append(0, np.exp2(list(range(-6, 6)))) / p
    nlambdasG = len(lambdasG)

    # Definition of the objective function
    def f(x, train, gamma, p, lambdaL, lambdaG):
        calc = -(np.dot(train[:, :p],
                        x[0:p] - x[(4 * p + 3):(5 * p + 3)]) / p
                 - x[p]) * gamma
        P2 = np.zeros(len(calc))
        P2[calc < 600] = 1 / (1 + np.exp(calc[calc < 600]))
        P3 = 1 - P2
        Q2 = np.dot(train[:, :p],
                    x[(p + 1):(2 * p + 1)]
                    - x[(5 * p + 3):(6 * p + 3)]) - x[(3 * p + 1)]
        Q3 = np.dot(train[:, :p],
                    x[(2 * p + 1):(3 * p + 1)]
                    - x[(6 * p + 3):(7 * p + 3)]) - x[(3 * p + 2)]
        errorp = np.square(P2 * Q2 + P3 * Q3 - train[:, p])
        meanerrorp = np.mean(errorp)
        meanerrorplasso = (meanerrorp
                           + lambdaG * np.sum(x[(3 * p + 3):(4 * p + 3)])
                           + lambdaL * (np.sum(x[0:p])
                                        + np.sum(x[(p + 1):(3 * p + 1)])
                                        + np.sum(x[(4 * p + 3):(7 * p + 3)])))
        if np.isnan(meanerrorplasso):
            exit()  # abort if the objective degenerates
        return meanerrorplasso

    # Definition of bounds
    def lulb(p):
        lu = np.concatenate(
            (np.zeros(p), -np.ones(1), np.repeat(0, 2 * p),
             np.repeat(-50, 2), np.repeat(0, p), np.repeat(0, 3 * p)))
        lb = np.concatenate(
            (np.ones(p + 1), np.repeat(50, 2 * p + 2),
             np.repeat(np.inf, p), np.repeat(1, p), np.repeat(50, 2 * p)))
        return (lu, lb)

    d = lulb(p)
    bounds = Bounds(d[0], d[1])

    # Definition of the gradient
    def gradient(x, train, gamma, p, lambdaL, lambdaG):
        calc = -(np.dot(train[:, :p],
                        x[0:p] - x[(4 * p + 3):(5 * p + 3)]) / p
                 - x[p]) * gamma
        p1 = np.zeros(len(calc))
        p1[calc < 600] = 1 / (1 + np.exp(calc[calc < 600]))
        exponencial1 = np.exp(600) * np.ones(len(calc))
        exponencial1[calc < 600] = np.exp(calc[calc < 600])
        P2 = p1
        P3 = 1 - p1
        Q2 = np.dot(train[:, :p],
                    x[(p + 1):(2 * p + 1)]
                    - x[(5 * p + 3):(6 * p + 3)]) - x[(3 * p + 1)]
        Q3 = np.dot(train[:, :p],
                    x[(2 * p + 1):(3 * p + 1)]
                    - x[(6 * p + 3):(7 * p + 3)]) - x[(3 * p + 2)]
        g = P2 * Q2 + P3 * Q3 - train[:, p]
        der = np.zeros_like(x)
        m1 = 2 * g * exponencial1 * np.square(p1) * (Q2 - Q3)
        der[0:p] = (gamma / p * np.mean(np.transpose(train[:, :p]) * m1,
                                        axis=1) + np.repeat(lambdaL, p))
        der[p] = -gamma * np.mean(m1)
        der[(p + 1):(2 * p + 1)] = (
            np.mean(2 * g * np.transpose(train[:, :p]) * P2, axis=1)
            + np.repeat(lambdaL, p))
        der[(2 * p + 1):(3 * p + 1)] = (
            np.mean(2 * g * np.transpose(train[:, :p]) * P3, axis=1)
            + np.repeat(lambdaL, p))
        der[(3 * p + 1)] = -np.mean(2 * g * P2)
        der[(3 * p + 2)] = -np.mean(2 * g * P3)
        der[(3 * p + 3):(4 * p + 3)] = np.repeat(lambdaG, p)
        der[(4 * p + 3):(5 * p + 3)] = (
            -gamma / p * np.mean(np.transpose(train[:, :p]) * m1, axis=1)
            + np.repeat(lambdaL, p))
        der[(5 * p + 3):(6 * p + 3)] = (
            -np.mean(2 * g * np.transpose(train[:, :p]) * P2, axis=1)
            + np.repeat(lambdaL, p))
        der[(6 * p + 3):(7 * p + 3)] = (
            -np.mean(2 * g * np.transpose(train[:, :p]) * P3, axis=1)
            + np.repeat(lambdaL, p))
        return der

    # Definition of the constraints and their jacobian
    jacons = np.zeros((3 * p, 7 * p + 3))
    jacons[0:p, 0:p] = -np.eye(p)
    jacons[0:p, (4 * p + 3):(5 * p + 3)] = -np.eye(p)
    jacons[0:p, (3 * p + 3):(4 * p + 3)] = np.eye(p)
    jacons[p:2 * p, (p + 1):(2 * p + 1)] = -np.eye(p)
    jacons[p:2 * p, (5 * p + 3):(6 * p + 3)] = -np.eye(p)
    jacons[p:2 * p, (3 * p + 3):(4 * p + 3)] = np.eye(p)
    jacons[2 * p:3 * p, (2 * p + 1):(3 * p + 1)] = -np.eye(p)
    jacons[2 * p:3 * p, (6 * p + 3):(7 * p + 3)] = -np.eye(p)
    jacons[2 * p:3 * p, (3 * p + 3):(4 * p + 3)] = np.eye(p)

    lambdaL = 0
    lambdaG = 0
    # the constraint functions only take x, so no extra args entry is needed
    ineq_cons = {
        'type': 'ineq',
        'fun': lambda x: np.concatenate(
            (x[(3 * p + 3):(4 * p + 3)] - x[0:p]
             - x[(4 * p + 3):(5 * p + 3)],
             x[(3 * p + 3):(4 * p + 3)] - x[(p + 1):(2 * p + 1)]
             - x[(5 * p + 3):(6 * p + 3)],
             x[(3 * p + 3):(4 * p + 3)] - x[(2 * p + 1):(3 * p + 1)]
             - x[(6 * p + 3):(7 * p + 3)])),
        'jac': lambda x: jacons
    }

    # Set the grid of S random initial solutions
    np.random.seed(1)
    x0 = np.zeros((S, 7 * p + 3))
    x0[:, p] = 2 * np.random.random(S) - 1
    a1iaux = -700 / gamma + x0[:, p]
    a1iaux2 = np.maximum(a1iaux, -1)
    a1iaux3 = np.transpose(np.tile(a1iaux2, (p, 1)))
    a1iaux4 = (1 - a1iaux3) * np.random.random((S, p)) + a1iaux3
    x0[:, 0:p] = np.maximum(a1iaux4, 0)
    x0[:, (p + 1):(2 * p + 1)] = np.random.random((S, p))
    x0[:, (2 * p + 1):(3 * p + 1)] = np.random.random((S, p))
    x0[:, (3 * p + 1)] = np.random.random(S)
    x0[:, (3 * p + 2)] = np.random.random(S)
    x0[:, (3 * p + 3):(4 * p + 3)] = np.random.random((S, p))
    x0[:, (4 * p + 3):(5 * p + 3)] = np.maximum(-a1iaux4, 0)
    x0[:, (5 * p + 3):(6 * p + 3)] = np.random.random((S, p))
    x0[:, (6 * p + 3):(7 * p + 3)] = np.random.random((S, p))

    # Define the function to be parallelized
    def funcion(valores):
        # unpack in the same order as `values` is built below
        [f, x0nn, nn, train, gamma, p, lambdasL, lambdasG, gradient,
         bounds, ineq_cons] = valores
        nlambdasG = len(lambdasG)
        nlambdasL = len(lambdasL)
        objetivo = 1000000 * np.ones((nlambdasL, nlambdasG))
        sol = np.zeros((nlambdasL, nlambdasG, 7 * p + 3))
        for ll in range(nlambdasL):
            for gg in range(nlambdasG):
                try:
                    print(nn, ll, gg)  # start index, lambda^L index, lambda^G index
                    res = minimize(f, x0nn,
                                   args=(train, gamma, p,
                                         lambdasL[ll], lambdasG[gg]),
                                   method='SLSQP', jac=gradient,
                                   options={'ftol': 1e-5, 'disp': False,
                                            'maxiter': 300},
                                   bounds=bounds, constraints=ineq_cons)
                    objetivo[ll, gg] = res.fun
                    sol[ll, gg, :] = res.x
                    # warm-start the next (lambda^L, lambda^G) subproblem
                    x0nn = res.x.copy()
                except Exception:
                    pass  # keep the default (large) objective on failure
        return (objetivo, sol)

    values = [[f, x0[nn], nn, train, gamma, p, lambdasL, lambdasG,
               gradient, bounds, ineq_cons] for nn in range(S)]

    # Solve Problem (1) for a grid of lambda^L and lambda^G
    results = Parallel(n_jobs=8)(delayed(funcion)(value) for value in values)
    objetivos = [results[i][0] for i in range(S)]
    xs = [results[i][1] for i in range(S)]

    # Obtain the parameters of the SORRT with depth D = 1 for the grid of
    # values of lambda^L and lambda^G, as well as the performance over
    # the training and test subsets.
    objetivopt = np.zeros((nlambdasL, nlambdasG))
    indexopt = np.zeros((nlambdasL, nlambdasG), dtype=int)
    xopt = np.zeros((nlambdasL, nlambdasG, 7 * p + 3))
    a1opt = np.zeros((p, nlambdasL, nlambdasG))
    a2opt = np.zeros((p, nlambdasL, nlambdasG))
    a3opt = np.zeros((p, nlambdasL, nlambdasG))
    betaopt = np.zeros((p, nlambdasL, nlambdasG))
    mu1opt = np.zeros((nlambdasL, nlambdasG))
    mu2opt = np.zeros((nlambdasL, nlambdasG))
    mu3opt = np.zeros((nlambdasL, nlambdasG))
    predtrain = np.zeros((n_train, nlambdasL, nlambdasG))
    errortrain = np.zeros((n_train, nlambdasL, nlambdasG))
    msetrain = np.zeros((nlambdasL, nlambdasG))
    R2train = np.zeros((nlambdasL, nlambdasG))
    predtest = np.zeros((n_test, nlambdasL, nlambdasG))
    errortest = np.zeros((n_test, nlambdasL, nlambdasG))
    msetest = np.zeros((nlambdasL, nlambdasG))
    R2test = np.zeros((nlambdasL, nlambdasG))
    coefsnonulos = np.zeros((nlambdasL, nlambdasG))
    numberofeatures = np.zeros((nlambdasL, nlambdasG))
    localsparsity = np.zeros((nlambdasL, nlambdasG))
    globalsparsity = np.zeros((nlambdasL, nlambdasG))

    for ll in range(nlambdasL):
        for gg in range(nlambdasG):
            obj = [objetivos[i][ll, gg] for i in range(S)]
            objetivopt[ll, gg] = np.min(obj)
            indexopt[ll, gg] = np.nanargmin(obj)
            xopt[ll, gg, :] = xs[indexopt[ll, gg]][ll, gg]
            a1opt[:, ll, gg] = (xopt[ll, gg, 0:p]
                                - xopt[ll, gg, (4 * p + 3):(5 * p + 3)])
            a2opt[:, ll, gg] = (xopt[ll, gg, (p + 1):(2 * p + 1)]
                                - xopt[ll, gg, (5 * p + 3):(6 * p + 3)])
            a3opt[:, ll, gg] = (xopt[ll, gg, (2 * p + 1):(3 * p + 1)]
                                - xopt[ll, gg, (6 * p + 3):(7 * p + 3)])
            betaopt[:, ll, gg] = xopt[ll, gg, (3 * p + 3):(4 * p + 3)]
            mu1opt[ll, gg] = xopt[ll, gg, p]
            mu2opt[ll, gg] = xopt[ll, gg, (3 * p + 1)]
            mu3opt[ll, gg] = xopt[ll, gg, (3 * p + 2)]
            (predtrain[:, ll, gg], errortrain[:, ll, gg], msetrain[ll, gg],
             R2train[ll, gg]) = predict(train, a1opt[:, ll, gg],
                                        mu1opt[ll, gg], a2opt[:, ll, gg],
                                        mu2opt[ll, gg], a3opt[:, ll, gg],
                                        mu3opt[ll, gg], gamma)
            (predtest[:, ll, gg], errortest[:, ll, gg], msetest[ll, gg],
             R2test[ll, gg]) = predict(test, a1opt[:, ll, gg],
                                       mu1opt[ll, gg], a2opt[:, ll, gg],
                                       mu2opt[ll, gg], a3opt[:, ll, gg],
                                       mu3opt[ll, gg], gamma)
            # count (near-)nonzero coefficients and used features
            coefsnonulos[ll, gg] = (
                np.sum(np.absolute(np.around(a1opt[:, ll, gg],
                                             decimals=3)) >= 0.001, axis=0)
                + np.sum(np.absolute(np.around(a2opt[:, ll, gg],
                                               decimals=3)) >= 0.001, axis=0)
                + np.sum(np.absolute(np.around(a3opt[:, ll, gg],
                                               decimals=3)) >= 0.001, axis=0))
            numberofeatures[ll, gg] = np.sum(np.logical_or(
                np.absolute(np.around(a1opt[:, ll, gg],
                                      decimals=3)) >= 0.001,
                np.logical_or(
                    np.absolute(np.around(a2opt[:, ll, gg],
                                          decimals=3)) >= 0.001,
                    np.absolute(np.around(a3opt[:, ll, gg],
                                          decimals=3)) >= 0.001)), axis=0)
            localsparsity[ll, gg] = (100 * (3 * p - coefsnonulos[ll, gg])
                                     / (3 * p))
            globalsparsity[ll, gg] = (100 * (p - numberofeatures[ll, gg])
                                      / p)

    return (a1opt, mu1opt, a2opt, mu2opt, a3opt, mu3opt, betaopt, gamma,
            predtrain, errortrain, msetrain, R2train, predtest, errortest,
            msetest, R2test, localsparsity, globalsparsity)
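# Hedged usage sketch for ORRT_D1_lambdas, commented out because the
# project-local helpers (scaling, rescaling, predict) are not part of this
# excerpt.  It assumes train/test are numpy arrays whose last column is the
# response; the file name is illustrative:
# data = np.loadtxt('data.csv', delimiter=',')
# train, test = data[:800], data[800:]
# out = ORRT_D1_lambdas(train, test, S=100, gamma=512)
# msetest = out[14]   # test MSE over the (lambda^L, lambda^G) grid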
def read_shell(self, r=100, dr=1, dust2gas=0.01, save=True, verbose=0):
    ''' Read particles in a shell, by default saving them '''
    particles = {}
    n = 0
    npatch = 0
    rate = 0.0
    start = time()
    t0 = start
    if r == 0:
        r2min = 0.0
        r2max = 1e30
    else:
        r2min = (r - dr)**2
        r2max = (r + dr)**2

    # open each patch file and check if it is relevant
    for file in self.files:
        p = Patch(file)
        rc = p.corner_radii()
        # If relevant, open the .peb file and read the data
        if (rc.min() < (r + dr) and rc.max() > (r - dr)) or r == 0:
            npatch += 1
            idx, dd = self.read_file(file)
            if r > 0:
                # If in the shell, sum up the rate contribution and add
                # the particle to the dictionary
                for i in range(size(idx)):
                    id = idx[i]
                    d = dd[id]
                    p = d['p']
                    p2 = sum(p**2)
                    if p2 > r2min and p2 < r2max:
                        v = d['v']
                        w = d['w']
                        rate -= sum(p * v) / sqrt(p2) * w
                        n = n + 1
                        if save:
                            particles[id] = d
            else:
                for i in range(size(idx)):
                    id = idx[i]
                    particles[id] = dd[id]
            if verbose > 1:
                now = time()
                print('{:.3f} sec'.format(now - start))
                start = now

    if r == 0:
        self.particles = particles
        print('{:.3f} sec'.format(time() - start))
        return

    if not hasattr(self, 'gas_mass'):
        self.get_gas_mass(verbose=verbose)
    rate = rate / (2.0 * dr) * self.gas_mass
    self.rate = rate
    if save:
        self.particles = particles
        if verbose > 1:
            print('{} self.particles saved'.format(size(particles.keys())))
    if verbose:
        units = scaling()
        rate1 = dust2gas * rate * cgs.yr / units.t * units.m / cgs.m_earth
        s = 'rate:{:9.2e} ={:9.2e} M_E/yr'.format(rate, rate1)
        s = s + ', based on {} particles from {} patches'.format(n, npatch)
        print(s + ' ({:.1f} sec)'.format(time() - t0))
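# Hedged usage sketch, assuming a container class that owns self.files and
# the two methods above; the class name and snapshot path are illustrative
# placeholders, so the calls are shown commented out:
# sim = ParticleData('data/snapshot_0001/')
# sim.read_shell(r=100, dr=1, verbose=1)   # read files and compute the rate
# sim.shell(r=100, dr=1, verbose=1)        # re-evaluate on the saved particles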
import numpy as np
import matplotlib.pyplot as pl

import EOS
from scaling import scaling, cgs

#%% Void object
class void():
    pass

evol = void()

#%% Soft gravity
sc = scaling()
m_planet = 5.0
a_planet = 1.0

def force(r, rsm):
    # Newtonian gravity outside the smoothing radius rsm, softened inside
    if r > rsm:
        f = cgs.grav * cgs.m_earth * m_planet / r**2
    else:
        f = cgs.grav * cgs.m_earth * m_planet / rsm**2 \
            * (4. * (r / rsm) - 3. * (r / rsm)**2)
    return f

#%% plot force
pl.figure(4)
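# A minimal sketch of how the force profile could be plotted, continuing the
# cell above; rsm = 0.1 is an illustrative smoothing radius, and the softened
# branch joins the Newtonian one continuously at r = rsm:
rsm = 0.1
rr = np.linspace(1e-3, 1.0, 500)
pl.plot(rr, [force(r, rsm) for r in rr])
pl.xlabel('r')
pl.ylabel('force')
pl.show()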
def ORRT_D1(train, test, S=1000, gamma=512):
    # Scale the training subset
    (train, Mtree, mtree) = scaling(train)
    N = len(train)
    p = int(len(train[0]) - 1)

    # Rescale the test subset
    test = rescaling(test, Mtree, mtree)

    # Definition of the objective function
    def f(x, train, gamma, p):
        calc = -(np.dot(train[:, :p], x[0:p]) / p - x[p]) * gamma
        P2 = np.zeros(len(calc))
        P2[calc < 600] = 1 / (1 + np.exp(calc[calc < 600]))
        P3 = 1 - P2
        Q2 = np.dot(train[:, :p], x[(p + 1):(2 * p + 1)]) - x[(3 * p + 1)]
        Q3 = np.dot(train[:, :p], x[(2 * p + 1):(3 * p + 1)]) - x[(3 * p + 2)]
        errorp = np.square(P2 * Q2 + P3 * Q3 - train[:, p])
        meanerrorp = np.mean(errorp)
        if np.isnan(meanerrorp):
            exit()  # abort if the objective degenerates
        return meanerrorp

    # Definition of bounds
    def lulb(p):
        lu = np.append(-np.ones(p + 1), np.repeat(-50, 2 * p + 2))
        lb = np.append(np.ones(p + 1), np.repeat(50, 2 * p + 2))
        return (lu, lb)

    d = lulb(p)
    bounds = Bounds(d[0], d[1])

    # Definition of the gradient
    def gradient(x, train, gamma, p):
        calc = -(np.dot(train[:, :p], x[0:p]) / p - x[p]) * gamma
        p1 = np.zeros(len(calc))
        p1[calc < 600] = 1 / (1 + np.exp(calc[calc < 600]))
        exponencial1 = np.exp(600) * np.ones(len(calc))
        exponencial1[calc < 600] = np.exp(calc[calc < 600])
        P2 = p1
        P3 = 1 - p1
        Q2 = np.dot(train[:, :p], x[(p + 1):(2 * p + 1)]) - x[(3 * p + 1)]
        Q3 = np.dot(train[:, :p], x[(2 * p + 1):(3 * p + 1)]) - x[(3 * p + 2)]
        g = P2 * Q2 + P3 * Q3 - train[:, p]
        der = np.zeros_like(x)
        m1 = 2 * g * exponencial1 * np.square(p1) * (Q2 - Q3)
        der[0:p] = gamma / p * np.mean(np.transpose(train[:, :p]) * m1,
                                       axis=1)
        der[p] = -gamma * np.mean(m1)
        der[(p + 1):(2 * p + 1)] = np.mean(2 * g * np.transpose(train[:, :p])
                                           * P2, axis=1)
        der[(2 * p + 1):(3 * p + 1)] = np.mean(2 * g
                                               * np.transpose(train[:, :p])
                                               * P3, axis=1)
        der[(3 * p + 1)] = -np.mean(2 * g * P2)
        der[(3 * p + 2)] = -np.mean(2 * g * P3)
        return der

    # Set the grid of S random initial solutions; reject starting points
    # that would overflow the exponential
    np.random.seed(1)
    a1i = np.zeros((S, p))
    mu1i = np.zeros(S)
    nn = 0
    while nn < S:
        a1i[nn, :] = 2 * np.random.random(p) - 1
        mu1i[nn] = 2 * np.random.random(1) - 1
        vale = True
        ii = 0
        while vale and ii < N:
            if np.isinf(np.exp(-(np.sum(a1i[nn, :] * train[ii, :p]) / p
                                 - mu1i[nn]) * gamma)):
                vale = False
            else:
                ii = ii + 1
        if vale:
            nn = nn + 1

    x0 = np.zeros((S, 3 * p + 3))
    x0[:, 0:p] = a1i
    x0[:, p] = mu1i
    x0[:, (p + 1):(2 * p + 1)] = np.random.random((S, p))
    x0[:, (3 * p + 1)] = np.random.random(S)
    x0[:, (2 * p + 1):(3 * p + 1)] = np.random.random((S, p))
    x0[:, (3 * p + 2)] = np.random.random(S)

    # Define the function to be parallelized
    def funcion(valores):
        [f, x0nn, train, gamma, p, gradient, bounds] = valores
        try:
            res = minimize(f, x0nn, args=(train, gamma, p),
                           method='SLSQP', jac=gradient,
                           options={'ftol': 1e-5, 'disp': False,
                                    'maxiter': 300},
                           bounds=bounds)
            objetivo = res.fun
            sol = res.x
        except Exception:
            objetivo = 1e+300
            sol = np.zeros(3 * p + 3)  # fallback of the full decision-vector length
        return (objetivo, sol)

    values = [[f, x0[nn], train, gamma, p, gradient, bounds]
              for nn in range(S)]

    # Solve Problem (1) for the S initial solutions
    results = Parallel(n_jobs=8)(delayed(funcion)(value) for value in values)

    # Obtain the best solution
    objetivo = [results[i][0] for i in range(S)]
    indexopt = np.nanargmin(objetivo)
    xopt = results[indexopt][1]

    # Obtain the parameters of the SORRT with depth D = 1
    a1opt = xopt[0:p]
    mu1opt = xopt[p]
    a2opt = xopt[(p + 1):(2 * p + 1)]
    a3opt = xopt[(2 * p + 1):(3 * p + 1)]
    mu2opt = xopt[(3 * p + 1)]
    mu3opt = xopt[(3 * p + 2)]

    # Performance over the training and test subsets
    (predtrain, errortrain, msetrain,
     R2train) = predict(train, a1opt, mu1opt, a2opt, mu2opt, a3opt,
                        mu3opt, gamma)
    (predtest, errortest, msetest,
     R2test) = predict(test, a1opt, mu1opt, a2opt, mu2opt, a3opt,
                       mu3opt, gamma)

    return (a1opt, mu1opt, a2opt, mu2opt, a3opt, mu3opt, gamma, predtrain,
            errortrain, msetrain, R2train, predtest, errortest, msetest,
            R2test)
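# The two ORRT functions above rely on module-level imports that are not
# shown in these excerpts; a plausible header, stated as an assumption:
import numpy as np
from scipy.optimize import minimize, Bounds
from joblib import Parallel, delayed
# scaling, rescaling, and predict are project-local helpers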
def main():
    cfg_fe = Config_FeatureEngineer()
    seed_everything(seed_value=cfg_fe.seed)

    data_dir = '/kaggle/input/lish-moa/'
    save_path = './'
    load_path = '/kaggle/input/moatabnetmultimodekfold/'
    runty = 'eval'

    train = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    targets_scored = pd.read_csv(
        os.path.join(data_dir, 'train_targets_scored.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    train_drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    x_train = train.copy()
    x_test = test.copy()
    y_train = targets_scored.copy()

    genes_features = [column for column in x_train.columns if 'g-' in column]
    cells_features = [column for column in x_train.columns if 'c-' in column]

    # scale the data, like RankGauss
    x_train, x_test = scaling(x_train, x_test, scale=cfg_fe.scale,
                              n_quantiles=cfg_fe.scale_n_quantiles,
                              seed=cfg_fe.seed)

    # decompose the data, like PCA
    if runty == 'traineval':
        x_train, x_test = decompo_process(
            x_train, x_test, decompo=cfg_fe.decompo,
            genes_variance=cfg_fe.genes_variance,
            cells_variance=cfg_fe.cells_variance,
            seed=cfg_fe.seed, pca_drop_orig=cfg_fe.pca_drop_orig,
            runty=runty, path=save_path)
    elif runty == 'eval':
        x_train, x_test = decompo_process(
            x_train, x_test, decompo=cfg_fe.decompo,
            genes_variance=cfg_fe.genes_variance,
            cells_variance=cfg_fe.cells_variance,
            seed=cfg_fe.seed, pca_drop_orig=cfg_fe.pca_drop_orig,
            runty=runty, path=load_path)

    # select features with a VarianceThreshold
    x_train, x_test = feature_selection(
        x_train, x_test, feature_select=cfg_fe.feature_select,
        variancethreshold_for_FS=cfg_fe.variancethreshold_for_FS)

    # fe_stats
    x_train, x_test = fe_stats(x_train, x_test, genes_features,
                               cells_features)

    # group the drugs using kmeans
    if runty == 'traineval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=save_path)
    elif runty == 'eval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=load_path)

    # one-hot encoding
    x_train = onehot_encoding(x_train)
    x_test = onehot_encoding(x_test)

    feature_cols = [
        c for c in x_train.columns
        if (str(c)[0:5] != 'kfold'
            and c not in ['sig_id', 'drug_id', 'cp_type', 'cp_time',
                          'cp_dose'])
    ]
    target_cols = [x for x in y_train.columns if x != 'sig_id']

    # label smoothing
    if cfg_fe.regularization_ls:
        y_train = ls_manual(y_train, ls_rate=cfg_fe.ls_rate)

    # merge drug_id and labels
    x_train = x_train.merge(y_train, on='sig_id')
    x_train = x_train.merge(train_drug, on='sig_id')

    # remove sig_id
    # x_train, x_test, y_train = remove_ctl(x_train, x_test, y_train)

    # make CVs
    target_cols = [x for x in targets_scored.columns if x != 'sig_id']
    x_train = make_cv_folds(x_train, cfg_fe.seeds, cfg_fe.nfolds,
                            cfg_fe.drug_thresh, target_cols)

    begin_time = datetime.datetime.now()

    if runty == 'traineval':
        test_preds_all = train_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols,
                                      cfg_fe.seeds, cfg_fe.nfolds, save_path)
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols,
                                      cfg_fe.seeds, cfg_fe.nfolds,
                                      load_path='./', stacking=False)
    elif runty == 'eval':
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols,
                                      cfg_fe.seeds, cfg_fe.nfolds,
                                      load_path, stacking=False)

    time_diff = datetime.datetime.now() - begin_time
    print(f'Total time is {time_diff}')

    # make submission
    all_feat = [col for col in submission.columns if col not in ["sig_id"]]
    # To obtain the same length for test_pred_final and submission
    # sig_id = test[test["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop=True)
    sig_id = test.sig_id
    tmp = pd.DataFrame(test_pred_final, columns=all_feat)
    tmp["sig_id"] = sig_id
    submission = pd.merge(test[["sig_id"]], tmp, on="sig_id", how="left")
    submission.fillna(0, inplace=True)
    submission[test["cp_type"] == "ctl_vehicle"] = 0.
    submission.to_csv("submission_tabnet.csv", index=False)
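# Standard entry-point guard, assuming this script is run directly:
if __name__ == '__main__':
    main()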