def examples_with_aggregated_gauss(subset):
    """Build per-planet pickles in which the ten ``gaus`` files of each spot are averaged.

    For every planet returned by ``get_planets(subset)`` the 10 x 10 files
    ``../database/noisy_<subset>/<planet>_<spot>_<gaus>.txt`` are parsed.  The
    ten ``gaus`` matrices of one spot are averaged and stored as ``1 - mean``,
    which yields a ``(10, 55, 300)`` float32 array that replaces
    ``data['matrix']``.  If ``../database/params_train/<planet>_01_01.txt``
    exists, its parsed content is merged into the record.  The record is
    written to ``gauss_aggregated/<planet>.pickle``; planets whose pickle
    already exists are skipped.

    :param subset: subset name, passed to ``get_planets`` and used in the
        ``noisy_<subset>`` input folder name.
    :return: None
    """
    planets = get_planets(subset)
    os.makedirs('gauss_aggregated', exist_ok=True)
    for planet in tqdm(planets):
        pickle_path = 'gauss_aggregated/{}.pickle'.format(planet)
        if os.path.exists(pickle_path):
            # Already produced by an earlier run.
            continue

        data = None
        planet_matrix = np.zeros((10, 55, 300), dtype=np.float32)
        for spot in range(1, 11):
            spot_matrix = np.zeros((55, 300), dtype=np.float32)
            for gaus in range(1, 11):
                data = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                spot_matrix += data['matrix']
            spot_matrix /= 10
            planet_matrix[spot - 1] = 1 - spot_matrix

        # ``data`` is the record parsed from the last file (spot 10, gaus 10);
        # only its matrix is replaced, every other key is kept as parsed.
        data['matrix'] = planet_matrix
        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))
        data['planet'] = planet

        # Context manager so the file is flushed and closed deterministically.
        with open(pickle_path, 'wb') as f:
            pickle.dump(data, f)
def create_meta_dataset(subset):
    """Write ``meta_dataset_<subset>.csv`` with one descriptive row per planet.

    Each row holds the planet id, a noise estimate, a spread estimate, the
    values of ``data['misc_inputs']`` and one target per threshold in ``eps``:

    * ``noise``: ``1 - corr`` averaged over the 55 channels, where ``corr`` is
      the lag-1 autocorrelation of the channel, clipped below at 0.
    * ``std``: standard deviation over channels of the mean of the 10 largest
      values of the smoothed (3-point moving average) channel.
    * targets (``train`` only): ``diff = baggingError - mean_wmae``;
      ``"any"`` if ``abs(diff) < eps``, ``"fcnn"`` if ``diff > 0``, otherwise
      ``"bagging"``.  For any other subset the targets are ``"?"``.

    Reads ``per_planet_wmae.csv``, ``bagging_errors.csv`` and
    ``completely_aggregated/<planet>.pickle``.

    :param subset: subset name passed to ``get_planets``; ``"train"`` enables
        target computation.
    :return: None
    """
    planets = get_planets(subset)
    fcnn_errors = pnd.read_csv('per_planet_wmae.csv', index_col=1)
    bagging_errors = pnd.read_csv('bagging_errors.csv', index_col=0)
    eps = [0.01, 0.005, 0.001, 0.0005]
    models = ",".join("model{}".format(e) for e in eps)
    # The header used to contain ``star_logg`` twice, giving two columns the
    # same name.  The second one is now ``star_rad``.
    # NOTE(review): ``star_rad`` is chosen by elimination against the star
    # parameter names used in create_fully_aggregated_csv -- confirm against
    # the actual order of ``misc_inputs``.
    # NOTE(review): ``star_k_magg`` is kept as-is because downstream readers
    # may rely on that spelling.
    rows = [
        'planet,noise,std,star_temp,star_logg,star_rad,star_mass,star_k_magg,period,{}'
        .format(models)
    ]
    window_size = 1
    offset = 1
    for planet in tqdm(planets):
        with open('completely_aggregated/{}.pickle'.format(planet), 'rb') as f:
            data = pickle.load(f)

        # Centered moving average (truncated at the borders) of every channel.
        denoised = np.zeros((55, 300))
        for j in range(300):
            denoised[:, j] = np.mean(
                data['matrix'][:, max(0, j - window_size):j + window_size + 1],
                axis=1)
        # Mean of the 10 largest smoothed values per channel.
        maxes = np.mean(np.partition(denoised, -10, axis=1)[:, -10:], axis=1)
        std = np.std(maxes)

        noises = []
        for c in range(55):
            r = data['matrix'][c]
            matrix = np.corrcoef(r[offset:], r[:-offset])
            assert abs(matrix[0, 1] - matrix[1, 0]) < 10**-10
            corr = max(matrix[0, 1], 0)  # 0 or negative --> 0
            noises.append(1 - corr)

        if subset == "train":
            diff = (bagging_errors.loc[int(planet), 'baggingError']
                    - fcnn_errors.loc[int(planet), 'mean_wmae'])
            targets = []
            for e in eps:
                if abs(diff) < e:
                    targets.append("any")
                elif diff > 0:
                    targets.append("fcnn")
                else:
                    targets.append("bagging")
        else:
            targets = ["?"] * len(eps)

        row = [planet, str(np.mean(noises)), str(std)]
        row += [str(x) for x in data['misc_inputs']]
        row += targets
        rows.append(','.join(row))

    with open('meta_dataset_{}.csv'.format(subset), 'w') as f:
        for row in rows:
            print(row, file=f)
def aggregated_examples_with_tsfresh():
    """Attach tsfresh feature vectors to the aggregated per-planet pickles.

    Loads ``tsfresh_all.pki``, removes every column that contains a missing
    value and every column that takes a single value on the train planets,
    then, for each train and test planet, loads
    ``completely_aggregated/<planet>.pickle``, stores the planet's feature row
    under ``data['tsfresh']`` and writes the record to
    ``aggregated_tsfresh/<planet>.pickle``.

    :return: None
    """
    tsfresh_df = pd.read_pickle('tsfresh_all.pki')
    train_planets = get_planets('train')
    test_planets = get_planets('test')

    # Constant columns are determined on the train rows only.
    train_df = tsfresh_df[tsfresh_df.index.isin(train_planets)]
    singleton_columns = [
        c for c in train_df.columns if len(set(train_df[c])) == 1
    ]

    tsfresh_df.dropna(axis=1, inplace=True)
    # ``dropna`` may already have removed some of the constant columns (the
    # list was computed before it ran); ``errors='ignore'`` keeps ``drop``
    # from raising KeyError on those labels.
    tsfresh_df.drop(singleton_columns, axis=1, inplace=True, errors='ignore')

    os.makedirs('aggregated_tsfresh', exist_ok=True)
    for planet in tqdm(train_planets | test_planets):
        with open('completely_aggregated/{}.pickle'.format(planet), 'rb') as f:
            data = pickle.load(f)
        data['tsfresh'] = tsfresh_df.loc[[planet]].values[0]
        with open('aggregated_tsfresh/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
def calculate_tsfresh_features():
    """Extract tsfresh features for all train and test planets.

    Builds one long table with the columns ``planet_id``, ``timestep`` and
    ``channel_1`` ... ``channel_55`` (300 rows per planet, taken from
    ``completely_aggregated/<planet>.pickle``), runs
    ``tsfresh.extract_features`` on it and pickles the resulting frame to
    ``tsfresh_all.pki``.

    :return: None
    """
    planets = get_planets('train') | get_planets('test')
    folder = 'completely_aggregated'
    channel_names = ['channel_{}'.format(c + 1) for c in range(55)]

    values = {'planet_id': [], 'timestep': []}
    values.update((name, []) for name in channel_names)

    for planet in tqdm(planets):
        with open('{}/{}.pickle'.format(folder, planet), 'rb') as f:
            data = pickle.load(f)
        matrix = data['matrix']
        # One block of 300 consecutive rows per planet, timestep-ordered.
        values['planet_id'].extend([planet] * 300)
        values['timestep'].extend(range(300))
        for c, name in enumerate(channel_names):
            values[name].extend([matrix[c, t] for t in range(300)])

    df = pd.DataFrame(values)
    features = tsfresh.extract_features(df,
                                        column_id='planet_id',
                                        column_sort='timestep',
                                        n_jobs=4)
    features.to_pickle('tsfresh_all.pki')
def examples_with_aggregated_matrices(subset):
    """Build fully aggregated per-planet pickles with derived features.

    For every planet returned by ``get_planets(subset)``:

    * the ten ``gaus`` matrices of each spot are averaged, the median over the
      ten spots is taken and ``1 - median`` is stored as ``data['matrix']``
      (shape ``(55, 300)``);
    * ``data['maxes']`` is the per-channel maximum of a moving average of that
      matrix and ``data['relative_means']`` the per-channel mean of the same
      moving average divided by ``maxes``;
    * ``data['radiation']`` holds ``black_body_radiation`` evaluated for the
      frequencies selected by ``NOISE_ORDER`` and ``data['misc_inputs'][0]``;
    * the content of ``../database/params_train/<planet>_01_01.txt`` is merged
      in when that file exists.

    The record is written to
    ``../database/completely_aggregated/<planet>.pickle``.

    :param subset: subset name, passed to ``get_planets`` and used in the
        ``noisy_<subset>`` input folder name.
    :return: None
    """
    planets = get_planets(subset)
    os.makedirs('../database/completely_aggregated', exist_ok=True)
    for planet in tqdm(planets):
        data = None
        planet_matrix = np.zeros((10, 55, 300), dtype=np.float64)
        for spot in range(1, 11):
            spot_matrix = np.zeros((55, 300), dtype=np.float64)
            for gaus in range(1, 11):
                data = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                spot_matrix += data['matrix']
            spot_matrix /= 10
            planet_matrix[spot - 1] = spot_matrix
        planet_matrix = 1 - np.median(planet_matrix, axis=0)
        # ``data`` is the record parsed from the last file (spot 10, gaus 10).
        data['matrix'] = planet_matrix

        # Moving average over columns [i - 3, i + 3), truncated at the left
        # border.  NOTE(review): the window excludes column i + 3, i.e. it is
        # not symmetric -- kept as-is because stored features depend on it.
        window_size = 3
        temp = np.zeros((55, 300))
        for i in range(300):
            temp[:, i] = np.mean(
                planet_matrix[:, max(0, i - window_size):i + window_size],
                axis=1)
        data['maxes'] = np.max(temp, axis=1)
        data['relative_means'] = np.mean(temp, axis=1) / data['maxes']

        frequencies = create_ariel_frequencies(equidistant_frequency=True)
        radiations = [
            black_body_radiation(frequencies[i], data['misc_inputs'][0])
            for i in NOISE_ORDER
        ]
        data['radiation'] = np.array(radiations)

        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))
        data['planet'] = planet

        # Context manager so the file is flushed and closed deterministically.
        pickle_path = '../database/completely_aggregated/{}.pickle'.format(
            planet)
        with open(pickle_path, 'wb') as f:
            pickle.dump(data, f)
def examples_with_maximums(subset, window_size=3):
    """Write per-planet pickles that carry smoothed channel maxima instead of the matrix.

    For every planet returned by ``get_planets(subset)`` the record in
    ``completely_aggregated/<planet>.pickle`` is loaded, each channel of
    ``data['matrix']`` is smoothed with a moving average, the per-channel
    maximum of the smoothed series is stored as ``data['maxes']``, the matrix
    itself is removed and the record is written to
    ``engineered/<planet>.pickle``.

    :param subset: subset name passed to ``get_planets``.
    :param window_size: half-width of the averaging window; column ``i`` is
        the mean of columns ``[i - window_size, i + window_size)`` (truncated
        at the left border).
    :return: None
    """
    planets = get_planets(subset)
    os.makedirs('engineered', exist_ok=True)
    for planet in tqdm(planets):
        with open('completely_aggregated/{}.pickle'.format(planet), 'rb') as f:
            data = pickle.load(f)
        matrix = data['matrix']

        # Smooth into a separate buffer.  Writing the averages back into
        # ``matrix`` while iterating made every window read columns that had
        # already been smoothed, which is not a moving average of the original
        # signal (the sibling functions use a separate buffer as well).
        smoothed = np.zeros(matrix.shape, dtype=matrix.dtype)
        for i in range(300):
            smoothed[:, i] = np.mean(
                matrix[:, max(0, i - window_size):i + window_size], axis=1)

        data['maxes'] = np.max(smoothed, axis=1)
        del data['matrix']
        with open('engineered/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
def stats_per_planet(subset):
    """Collect simple per-planet statistics from the aggregated pickles.

    :param subset: subset name passed to ``get_planets``.
    :return: dict mapping each planet to a tuple
        ``(std of peaks, mean of (peak - mean), radii)`` where ``peak`` is,
        per channel, the mean of the 10 largest values of the 3-point
        moving-average-smoothed series, ``mean`` is the per-channel mean of
        that smoothed series, and ``radii`` is ``data['radii']`` or ``'?'``
        when the record has no such key.
    """
    folder = 'completely_aggregated'
    half_window = 1
    result = {}
    for planet in tqdm(get_planets(subset)):
        with open('{}/{}.pickle'.format(folder, planet), 'rb') as handle:
            record = pickle.load(handle)

        smoothed = np.zeros((55, 300))
        for t in range(300):
            start = max(0, t - half_window)
            stop = t + half_window + 1
            smoothed[:, t] = np.mean(record['matrix'][:, start:stop], axis=1)

        # The last 10 entries after partitioning are the 10 largest values.
        largest = np.partition(smoothed, -10, axis=1)[:, -10:]
        peaks = np.mean(largest, axis=1)
        averages = np.mean(smoothed, axis=1)

        spread = np.std(peaks)
        depth = np.mean(peaks - averages)
        result[planet] = (spread, depth, record.get('radii', '?'))
    return result
def __init__(self, all_folds, included_folds, train_or_test='train'):
    """Load the aggregated records of the planets that belong to the given folds.

    Planets returned by ``get_planets(train_or_test)`` are sorted; the planet
    at sorted position ``i`` is loaded when ``i % all_folds`` is contained in
    ``included_folds``.  Every loaded record gets two extra entries computed
    from a moving average of ``data['matrix']``: ``maxes`` (per-channel
    maximum) and ``relative_means`` (per-channel mean divided by ``maxes``).

    Sets ``self.rows`` (list of records) and ``self.size`` (their count).

    :param all_folds: total number of folds (modulus for the fold index).
    :param included_folds: container of fold indices to load.
    :param train_or_test: subset name passed to ``get_planets``.
    """
    planets = sorted(get_planets(train_or_test))
    window_size = 5
    folder = 'completely_aggregated'
    # folder = 'aggregated_tsfresh'
    # folder = 'gauss_aggregated'
    # self.channel = 6
    self.rows = []
    # tqdm wraps the list (not the enumerate object) so it knows the total.
    for index, planet in enumerate(tqdm(planets)):
        if index % all_folds not in included_folds:
            continue
        with open('{}/{}.pickle'.format(folder, planet), 'rb') as f:
            data = pickle.load(f)

        # Moving average over columns [t - 5, t + 5), truncated at the left
        # border.  The column index has its own name so it no longer shadows
        # the planet index of the outer loop.
        temp = np.zeros((55, 300))
        for t in range(300):
            temp[:, t] = np.mean(
                data['matrix'][:, max(0, t - window_size):t + window_size],
                axis=1)
        data['maxes'] = np.max(temp, axis=1)
        data['relative_means'] = np.mean(temp, axis=1) / data['maxes']
        # for STR
        # if 'radii' in data:
        #     data['radii'] = data['radii'][self.channel]
        self.rows.append(data)
        # for per channel
        # for j in range(55):
        #     d = {x: data[x] for x in data}
        #     d['matrix'] = d['matrix'][:, j, :]
        #     d['channel'] = j
        #     if 'radii' in d:
        #         d['radii'] = d['radii'][j]
        #     self.rows.append(d)
    self.size = len(self.rows)
def histogram_arffs():
    """Write ``custom_<data_type>.arff`` with one data row per (planet, channel).

    Each row contains the planet id, the channel index, three ``?``
    placeholders (sma, incl, radius), the channel's ``radiation`` value, the
    maximum of the moving-average-smoothed channel, the smoothed mean divided
    by that maximum, and a 50-bin histogram of the raw channel values over the
    range ``(0, 0.1)``.

    NOTE: an alternative layout with one row per planet (all 55 channels side
    by side, including sma/incl/radii) used to be kept here as commented-out
    code; only the per-channel layout is active.

    :return: None
    """
    n_bins = 50
    half_window = 5
    # Manual switch between the two subsets; currently always "train".
    data_type = "test" if False else "train"

    header = [
        '@relation planets',
        '@attribute planet string',
        '@attribute channel string',
        '@attribute sma numeric',
        '@attribute incl numeric',
        '@attribute radius numeric',
        '@attribute radiation numeric',
        '@attribute max numeric',
        '@attribute avg numeric',
    ]
    header.extend(
        '@attribute histo_{} numeric'.format(j) for j in range(n_bins))
    header.append('\n@data')

    with open('custom_{}.arff'.format(data_type), 'w') as arff:
        for line in header:
            print(line, file=arff)

        for planet in tqdm(get_planets(data_type)):
            pickle_path = '../database/completely_aggregated/{}.pickle'.format(
                planet)
            with open(pickle_path, 'rb') as source:
                data = pickle.load(source)
            matrix = data['matrix']

            for channel in range(55):
                fields = [
                    data['planet'], channel, '?', '?', '?',
                    data['radiation'][channel]
                ]
                # Moving average over [t - 5, t + 5), truncated on the left.
                smoothed = np.array([
                    np.mean(matrix[channel,
                                   max(0, t - half_window):t + half_window])
                    for t in range(300)
                ], dtype=np.float64)
                peak = np.max(smoothed)
                fields.append(peak)
                fields.append(np.mean(smoothed) / peak)

                counts, _ = np.histogram(matrix[channel],
                                         bins=n_bins,
                                         range=(0, 0.1))
                fields.extend(counts)
                print(','.join(map(str, fields)), file=arff)
def create_fully_aggregated_csv(gauss_aggregation, spot_aggregation,
                                file_name):
    """
    Creates a csv file with the aggregated time series of all planets
    (train first, then test, each sorted), one row per planet and channel.

    The csv format is as follows:

        ID,star_temp,star_logg,star_rad,star_mass,star_k_mag,period,sma,incl,m1,...,m300,r
        0001_1,...
        ...
        0001_55,...
        ...

    Values that are not available for a planet (parameters that are not among
    the parsed names, and ``r`` for test planets) are stored as ``None`` and
    therefore come out as empty fields.

    :param gauss_aggregation: function handed to ``aggregate_matrices`` to
        combine the ten <planet>_<spot>_<gauss> matrices of one spot into a
        single <planet>_<spot> matrix.
    :param spot_aggregation: function handed to ``aggregate_matrices`` to
        combine the ten <planet>_<spot> matrices into a single <planet>
        matrix.
    :param file_name: the name of the output file (no path, just name, e.g.,
        'mean_mean.csv'); it is created inside ``../database/csv``.
    :return: None
    """
    out_dir = "../database/csv"
    os.makedirs(out_dir, exist_ok=True)

    id_column = "ID"
    parameter_columns = (
        "star_temp,star_logg,star_rad,star_mass,star_k_mag,period,sma,incl"
        .split(','))
    series_columns = ["m{}".format(i) for i in range(1, 301)]
    radius_column = "r"
    all_columns = ([id_column] + parameter_columns + series_columns +
                   [radius_column])
    table = {name: [] for name in all_columns}

    planets = []
    for planet_type in ["train", "test"]:
        for planet in sorted(get_planets(planet_type)):
            planets.append((planet_type, planet))
    # planets = planets[:5] + planets[-5:]

    for planet_type, planet in tqdm(planets):
        per_spot = []
        for spot in range(1, 11):
            per_gauss = []
            for gauss in range(1, 11):
                parsed = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        planet_type, planet, spot, gauss))
                per_gauss.append(parsed['matrix'])
            per_spot.append(
                aggregate_matrices(np.array(per_gauss), gauss_aggregation))

        planet_matrix = 1 - aggregate_matrices(
            per_spot, spot_aggregation)  # type: np.ndarray

        # Star/planet parameters come from the last parsed input file.
        extra = dict(zip(parsed['misc_inputs_names'], parsed['misc_inputs']))
        if planet_type == "train":
            targets = parse_output(
                '../database/params_{}/{}_01_01.txt'.format(
                    planet_type, planet),
                join_misc_params=True)
            extra.update(
                zip(targets['misc_outputs_names'], targets['misc_outputs']))
            extra[radius_column] = targets['radii']
        has_radii = radius_column in extra

        for channel in range(55):
            table[id_column].append("{}_{}".format(planet, channel + 1))
            for name in parameter_columns:
                table[name].append(extra.get(name))
            for name, value in zip(series_columns, planet_matrix[channel]):
                table[name].append(value)
            table[radius_column].append(
                extra[radius_column][channel] if has_radii else None)

    df = pd.DataFrame(table, columns=all_columns)
    df.to_csv(os.path.join(out_dir, file_name), index=False)
def examples_with_custom_features(subset):
    """Write per-planet pickles with ramp-curve parameters fitted to every channel.

    For every planet returned by ``get_planets(subset)`` all 10 x 10 input
    files are parsed and ``1 - matrix`` is collected in a
    ``(10, 10, 55, 300)`` array indexed ``[gaus, spot, channel, time]``.  For
    each of the 55 channels a 3-parameter ``ramp`` is fitted twice with
    ``scipy.optimize.curve_fit``:

    * ``agg_curves[channel]``: fit to the median over ``gaus`` followed by the
      median over ``spot``;
    * ``raw_curves[channel]``: fit to all 100 raw series at once.

    A failed fit (``RuntimeError`` from ``curve_fit``) is reported on stdout
    and stored as three zeros.  The record also gets ``radiation`` and, if
    ``../database/params_train/<planet>_01_01.txt`` exists, its parsed
    content; ``matrix`` is removed.  Output: ``custom/<planet>.pickle``.

    :param subset: subset name, passed to ``get_planets`` and used in the
        ``noisy_<subset>`` input folder name.
    :return: None
    """

    # @njit
    def sigmoid(x, a, b, c, d):
        # Alternative 4-parameter model; only used by the disabled fits below.
        return a / (1 + np.exp(-b * (x - c))) + d

    def ramp(x, peak, a, b):
        # 0 for x < a, linear rise to ``peak`` on (a, b), ``peak`` for x > b.
        return np.piecewise(
            x, [x < a, (a < x) & (x < b), b < x],
            [0, lambda x: peak * (x - a) / (b - a), peak])

    n_params = 3  # ramp has three free parameters: peak, a, b
    # Fallback stored when a fit fails.  It must have ``n_params`` entries:
    # the previous 4-tuple (left over from the sigmoid) could not be assigned
    # to a row of the (55, 3) result arrays and raised ValueError.
    failed_fit = (0,) * n_params

    # Time axis folded around the middle: 0 .. 149/300 and back down to 0.
    xdata = np.array([i / 300 for i in range(150)] +
                     [i / 300 for i in range(149, -1, -1)])
    # The same axis repeated once for each of the 100 raw series.
    xdata_raw = np.tile(xdata, 100)

    planets = get_planets(subset)
    os.makedirs('custom', exist_ok=True)
    for planet in tqdm(planets):
        planet_matrix = np.zeros((10, 10, 55, 300), dtype=np.float64)
        data = {'planet': planet}
        for spot in range(1, 11):
            for gaus in range(1, 11):
                parsed = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                data.update(parsed)
                planet_matrix[gaus - 1, spot - 1] = 1 - parsed['matrix']
        del data['matrix']

        frequencies = create_ariel_frequencies(equidistant_frequency=True)
        radiations = [
            black_body_radiation(frequencies[i], data['misc_inputs'][0])
            for i in NOISE_ORDER
        ]
        data['radiation'] = np.array(radiations)

        gaus_agg = np.median(planet_matrix, axis=0)
        spot_agg = np.median(gaus_agg, axis=0)
        raw_curves = np.zeros((55, n_params))
        agg_curves = np.zeros((55, n_params))
        for channel in range(55):
            ydata_agg = spot_agg[channel]
            try:
                # popt_agg, _ = opt.curve_fit(sigmoid, xdata, ydata_agg, p0=(0, 1, 0.3, 0), maxfev=2*10**3)
                popt_agg, _ = opt.curve_fit(ramp,
                                            xdata,
                                            ydata_agg,
                                            p0=(0, 0.2, 0.4),
                                            maxfev=2 * 10**3)
            except RuntimeError:
                print('Failed agg for planet {} channel {}'.format(
                    planet, channel))
                popt_agg = failed_fit
            agg_curves[channel] = popt_agg

            # All 100 series of this channel, flattened in (gaus, spot, time)
            # order so that they line up with ``xdata_raw``.
            ydata_raw = planet_matrix[:, :, channel, :].reshape(-1)
            try:
                # popt_raw, _ = opt.curve_fit(sigmoid, xdata*100, ydata_raw, p0=(0, 1, 0.3, 0), maxfev=2*10**3)
                popt_raw, _ = opt.curve_fit(ramp,
                                            xdata_raw,
                                            ydata_raw,
                                            p0=(0, 0.2, 0.4),
                                            maxfev=2 * 10**3)
            except RuntimeError:
                print('Failed raw for planet {} channel {}'.format(
                    planet, channel))
                popt_raw = failed_fit
            raw_curves[channel] = popt_raw

        data['raw_curves'] = raw_curves
        data['agg_curves'] = agg_curves
        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))

        # Context manager so the file is flushed and closed deterministically.
        with open('custom/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
# Alternative font setups, currently disabled:
# rc('font', **{'family': 'sans-serif', 'sans-serif': ['Arial']})
# Latex fonts, quick:
# matplotlib.rcParams['mathtext.fontset'] = 'stix'
# matplotlib.rcParams['font.family'] = 'STIXGeneral'

# Latex fonts, slow (but accurate):
rc('font', family='serif', serif=['Computer Modern'])
rc('text', usetex=True)
matplotlib.rcParams['font.size'] = 12
plt.rc('legend', fontsize=7)

# Ticks to the outside:
rcParams['axes.linewidth'] = 1.2
for axis_name in ('xtick', 'ytick'):
    rcParams['{}.direction'.format(axis_name)] = 'out'

M, Merr, R, Rerr, Teff, Rstar, sep = utils.get_planets()

# Erase datapoints with no data (mass or radius equal to -1):
has_mass_and_radius = (M != -1) & (R != -1)
idx = np.where(has_mass_and_radius)[0]
M, Merr, R, Rerr, Teff, sep, Rstar = [
    values[idx] for values in (M, Merr, R, Rerr, Teff, sep, Rstar)
]

# Erase datapoints with zero mass:
idx = np.where(M != 0.)[0]
M, Merr = M[idx], Merr[idx]