def ForwardPropagate(self):
    """
    This function will
    (1) fetch observation data from self.o_data (size T x dim_o)
    (2) perform forward propagation
    (3) save results in self.s_data (size T x M x dim_s); also calculate the error
        and save it in error (size T x (K+1))
    """
    error = np.zeros((self.T, self.K + 1))

    # deal with t = 0
    feed = {
        self.o_t: self.o_data[0],
        self.s_old: self.sess.run(self.s_0),
    }
    s_pre, prob, o_forecast = self.sess.run(
        [self.s_new, self.s_new_w, self.o_forecast], feed)
    for i in range(self.K + 1):
        error[0, i] = np.sum((self.o_data[i] - np.array(o_forecast[i])) ** 2)
    util.resample(self.s_data[0, :], s_pre, prob[:, 0])

    # deal with t > 0
    for t in range(1, self.T - self.K):
        feed = {
            self.o_t: self.o_data[t],
            self.s_old: self.s_data[t - 1, :, :],
        }
        s_pre, prob, o_forecast = self.sess.run(
            [self.s_new, self.s_new_w, self.o_forecast], feed)
        for i in range(self.K + 1):
            error[t, i] = np.sum((self.o_data[t + i] - np.array(o_forecast[i])) ** 2)
        util.resample(self.s_data[t, :], s_pre, prob[:, 0])
    return error
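# ForwardPropagate above assumes util.resample(dest, particles, weights) draws new
# particles according to `weights` and writes them into `dest` in place. That behaviour
# is an assumption about the helper, not its actual implementation; a minimal
# multinomial-resampling sketch under that assumption:
import numpy as np

def resample(dest, particles, weights):
    """Hypothetical in-place multinomial resampling: dest[i] <- particles[j] with P(j) ~ weights[j]."""
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()  # normalize the particle weights
    idx = np.random.choice(len(particles), size=dest.shape[0], p=w)
    dest[...] = np.asarray(particles)[idx]  # overwrite the slice passed in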
def create_input_files(tile_id):
    print "Getting extent of", tile_id
    xmin, ymin, xmax, ymax = uu.coords(tile_id)

    # Soil tiles are already processed, so there's no need to include them here.
    # Below is the old code for tile-izing the histosol soil raster.
    # Leaving this in case I ever add in soil processing again.
    # print "clip soil"
    # extra_param = ['-tr', '.00025', '.00025', '-dstnodata', '0']
    # clip_soil_tile = util.clip('hwsd_oc_final.tif', '{}_soil.tif'.format(tile_id), xmin, ymin, xmax, ymax, extra_param)
    #
    # print "removing no data flag from soil"
    # cmd = ['gdal_edit.py', '-unsetnodata', clip_soil_tile]
    # subprocess.check_call(cmd)
    #
    # print "uploading soil tile to s3"
    # util.upload(clip_soil_tile, cn.soil_C_processed_dir)

    print "Rasterizing ecozone"
    rasterized_eco_zone_tile = util.rasterize('fao_ecozones_bor_tem_tro.shp',
                                              "{}_fao_ecozones_bor_tem_tro.tif".format(tile_id),
                                              xmin, ymin, xmax, ymax, '.008', 'Byte', 'recode', '0')

    print "Resampling eco zone"
    resampled_ecozone = util.resample(rasterized_eco_zone_tile,
                                      "{0}_{1}.tif".format(tile_id, cn.pattern_fao_ecozone_processed))

    print "Uploading processed ecozone"
    util.upload(resampled_ecozone, cn.fao_ecozone_processed_dir)

    print "Clipping srtm"
    tile_srtm = util.clip('srtm.vrt', '{}_srtm.tif'.format(tile_id), xmin, ymin, xmax, ymax)

    print "Resampling srtm"
    tile_res_srtm = util.resample(tile_srtm, '{0}_{1}.tif'.format(tile_id, cn.pattern_srtm))

    print "Uploading processed srtm"
    util.upload(tile_res_srtm, cn.srtm_processed_dir)

    print "Clipping precipitation"
    clipped_precip_tile = util.clip('add_30s_precip.tif', '{}_clip_precip.tif'.format(tile_id),
                                    xmin, ymin, xmax, ymax)

    print "Resampling precipitation"
    resample_precip_tile = util.resample(clipped_precip_tile,
                                         '{0}_{1}.tif'.format(tile_id, cn.pattern_precip))

    print "Uploading processed precipitation"
    util.upload(resample_precip_tile, cn.precip_processed_dir)
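# create_input_files relies on util.clip/util.rasterize/util.resample wrappers that are
# not shown here. As a guess at what util.resample(src, dst) does (warp a tile onto the
# project's fine working grid), here is a minimal gdalwarp wrapper; the target resolution
# and resampling method are assumptions inferred from the commented-out soil code, not
# the repository's actual helper:
import subprocess

def resample(src_tif, dst_tif, xres='.00025', yres='.00025'):
    cmd = ['gdalwarp', '-overwrite', '-tr', xres, yres, '-r', 'near', src_tif, dst_tif]
    subprocess.check_call(cmd)  # raises CalledProcessError if gdalwarp fails
    return dst_tif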
def summary_turnover(self, by=None):
    """Returns a turnover-related metrics summary Dataframe."""
    index = ['turnover_t', 'turnover_h', 'turnover_d']
    tvr_t, tvr_h, tvr_d = self.get_turnover()
    res = {
        'turnover_t': util.resample(tvr_t, how='mean', by=by),
        'turnover_h': util.resample(tvr_h, how='mean', by=by),
        'turnover_d': util.resample(tvr_d, how='mean', by=by),
    }
    res = pd.Series(res) if by is None else pd.DataFrame(res).T
    res = res.reindex(index)
    return pd.DataFrame({'ALL': res}) if by is None else res
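# summary_turnover (and the other summary_* / get_ir methods further down) assume a
# util.resample(series, how=..., by=...) helper that aggregates a daily pandas Series
# either over the whole period (by=None) or per calendar bucket. A rough sketch under
# that assumption -- in particular, the 'ir' aggregation as mean/std is a guess:
import pandas as pd

def resample(series, how='mean', by=None):
    """Hypothetical aggregation helper for daily metric series."""
    aggs = {
        'mean': lambda s: s.mean(),
        'count': lambda s: s.count(),
        'ir': lambda s: s.mean() / s.std(),
    }
    if by is None:
        return aggs[how](series)
    # e.g. by='M' would group by month; assumes a DatetimeIndex
    return series.groupby(series.index.to_period(by)).apply(aggs[how])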
def fit(self, data, verbose=False):
    """
    Fits a consensus matrix for each number of clusters

    Args:
      * data -> (examples, attributes) format
      * verbose -> should print or not
    """
    N = data.shape[0]  # number of points
    Mk = np.zeros((self.K_ - self.L_, N, N))
    # counter for each pair of points: how often both appeared in the resampled data
    # for the current number of clusters
    Is = np.zeros((N, N))
    for k in range(self.L_, self.K_):  # for each number of clusters
        i_ = k - self.L_
        if verbose:
            print("At k = %d, aka. iteration = %d" % (k, i_))
        for h in range(self.H_):  # resample H times
            if verbose:
                print("\tAt resampling h = %d, (k = %d)" % (h, k))
            resampled_indices, resample_data = util.resample(data, self.resample_proportion_)
            Mh = self.cluster_(n_clusters=k).fit_predict(resample_data)
            # find indexes of elements from same clusters with bisection
            # on sorted array => this is more efficient than brute force search
            id_clusts = np.argsort(Mh)
            sorted_ = Mh[id_clusts]  # 0000000000111111111111222222
            for i in range(k):  # for each cluster
                ia = bisect.bisect_left(sorted_, i)
                ib = bisect.bisect_right(sorted_, i)
                cluster_indices = id_clusts[ia:ib]
                is_ = resampled_indices[cluster_indices]
                ids_ = np.array(list(combinations(is_, 2))).T  # get all pairs of i-th cluster
                # sometimes only one element is in a cluster (no combinations)
                if ids_.size != 0:
                    Mk[i_, ids_[0], ids_[1]] += 1  # increment counts
            ids_2 = np.array(list(combinations(resampled_indices, 2))).T
            Is[ids_2[0], ids_2[1]] += 1
        Is += Is.T
        Mk[i_] /= Is + 1e-8  # consensus matrix
        # Mk[i_] is upper triangular (with zeros on the diagonal); make it symmetric
        Mk[i_] += Mk[i_].T
        Mk[i_] += np.eye(N)  # Mk[i_, range(N), range(N)] = 1  # always with self, fill the diag
        Is.fill(0)  # reset counter
    self.Mk = Mk
    # fits areas under the CDFs
    self.Ak = np.zeros(self.K_ - self.L_)
    for i, m in enumerate(Mk):
        hist, bins = np.histogram(m.ravel(), density=True)
        self.Ak[i] = sum(h * (b - a) for b, a, h in zip(bins[1:], bins[:-1], np.cumsum(hist)))
    # fits differences between areas under CDFs
    self.deltaK = np.array([(Ab - Aa) / Aa if i > 2 else Aa
                            for Ab, Aa, i in zip(self.Ak[1:], self.Ak[:-1],
                                                 range(self.L_, self.K_ - 1))])
    self.bestK = np.argmax(self.deltaK) + self.L_ if self.deltaK.size > 0 else self.L_
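# The consensus-clustering loop above assumes util.resample(data, proportion) returns
# (resampled_indices, resampled_data): a random subsample of the rows together with
# their original row indices, so pairwise co-assignment counts can be accumulated in
# the full N x N matrices. A minimal sketch under that assumption, subsampling without
# replacement:
import numpy as np

def resample(data, proportion):
    """Hypothetical row subsampler: returns (indices, data[indices])."""
    n = data.shape[0]
    size = max(1, int(round(n * proportion)))
    idx = np.random.choice(n, size=size, replace=False)
    return idx, data[idx]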
def summary_turnover(self, by=None, freq='daily'):
    """Returns a turnover-related metrics summary Series/Dataframe.

    :param str freq: Which frequency of statistics is of interest?
       'daily' (default): only returns turnover, AC1, rAC1;
       'weekly': also returns AC5, rAC5;
       'monthly': also returns AC20, rAC20

    These metrics are:
       * turnover: average daily turnover
       * AC1: average daily 1-day auto-correlation
       * AC5: average daily 5-day auto-correlation
       * AC20: average daily 20-day auto-correlation
       * rAC1: average daily 1-day rank auto-correlation
       * rAC5: average daily 5-day rank auto-correlation
       * rAC20: average daily 20-day rank auto-correlation
    """
    index = ['turnover', 'AC1', 'rAC1']
    tmp = {
        'turnover': util.resample(self.get_turnover(), how='mean', by=by),
        'AC1': util.resample(self.get_ac(1), how='mean', by=by),
        'rAC1': util.resample(self.get_ac(1, rank=True), how='mean', by=by),
    }
    if freq == 'weekly':
        index.extend(['AC5', 'rAC5'])
        tmp.update({
            'AC5': util.resample(self.get_ac(5), how='mean', by=by),
            'rAC5': util.resample(self.get_ac(5, rank=True), how='mean', by=by),
        })
    elif freq == 'monthly':
        index.extend(['AC5', 'rAC5', 'AC20', 'rAC20'])
        tmp.update({
            'AC5': util.resample(self.get_ac(5), how='mean', by=by),
            'rAC5': util.resample(self.get_ac(5, rank=True), how='mean', by=by),
            'AC20': util.resample(self.get_ac(20), how='mean', by=by),
            'rAC20': util.resample(self.get_ac(20, rank=True), how='mean', by=by),
        })
    res = pd.Series(tmp) if by is None else pd.DataFrame(tmp).T
    res = res.reindex(index)
    return res
def estimate(trainX, trainY, resample_num):
    sample_pos_means = []
    sample_pos_covs = []
    sample_neg_means = []
    sample_neg_covs = []
    for i in xrange(resample_num):
        [sampledX, sampledY] = util.resample(trainX, trainY)
        [positiveX, negativeX] = util.split(sampledX, sampledY)
        sample_pos_means.append(np.mean(positiveX, 0))
        sample_neg_means.append(np.mean(negativeX, 0))
        sample_pos_covs.append(np.cov(np.array(positiveX).T))
        sample_neg_covs.append(np.cov(np.array(negativeX).T))

    nominal_pos_mean = np.mean(sample_pos_means, 0)
    nominal_neg_mean = np.mean(sample_neg_means, 0)
    nominal_pos_cov = np.mean(sample_pos_covs, 0)
    nominal_neg_cov = np.mean(sample_neg_covs, 0)

    sample_pos_means_cov = np.cov(np.array(sample_pos_means).T)
    sample_neg_means_cov = np.cov(np.array(sample_neg_means).T)
    # log(sample_pos_means_cov)
    # log(sample_neg_means_cov)

    # sanity checks: these raise LinAlgError if the regularized matrices are not positive definite
    np.linalg.cholesky(sample_pos_means_cov + np.eye(sample_pos_means_cov.shape[0]) * 1e-8)
    np.linalg.cholesky(sample_neg_means_cov + np.eye(sample_neg_means_cov.shape[0]) * 1e-8)
    P_pos = np.linalg.inv(sample_pos_means_cov + np.eye(sample_pos_means_cov.shape[0]) * 1e-8) / len(trainX)
    P_neg = np.linalg.inv(sample_neg_means_cov + np.eye(sample_pos_means_cov.shape[0]) * 1e-8) / len(trainX)
    np.linalg.cholesky(P_pos + np.eye(sample_neg_means_cov.shape[0]) * 1e-3)
    np.linalg.cholesky(P_neg + np.eye(sample_neg_means_cov.shape[0]) * 1e-3)

    rho_pos = 0
    rho_neg = 0
    for cov_matrix in sample_pos_covs:
        dis = util.F_norm(cov_matrix - nominal_pos_cov)
        rho_pos = max(dis, rho_pos)
    for cov_matrix in sample_neg_covs:
        dis = util.F_norm(cov_matrix - nominal_neg_cov)
        rho_neg = max(dis, rho_neg)

    return [nominal_pos_mean, P_pos, nominal_neg_mean, P_neg,
            nominal_pos_cov, rho_pos, nominal_neg_cov, rho_neg]
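# estimate() assumes util.resample(trainX, trainY) draws a bootstrap sample (with
# replacement) of the paired examples and labels, which util.split then separates by
# class. A minimal sketch of that resampling step under the bootstrap assumption:
import numpy as np

def resample(trainX, trainY):
    """Hypothetical paired bootstrap: sample len(trainX) examples with replacement."""
    X = np.asarray(trainX)
    y = np.asarray(trainY)
    idx = np.random.randint(0, len(X), size=len(X))
    return [X[idx], y[idx]]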
def summary_ir(self, by=None, freq='daily'):
    """Returns an IR-related metrics summary Series/Dataframe.

    :param str freq: Which frequency of statistics is of interest?
       'daily' (default): only returns IR1, rIR1;
       'weekly': also returns IR5, rIR5;
       'monthly': also returns IR20, rIR20

    These metrics are:
       * IR1: mean(IC(1)) / std(IC(1))
       * IR5: mean(IC(5)) / std(IC(5))
       * IR20: mean(IC(20)) / std(IC(20))
       * rIR1: mean(rank IC(1)) / std(rank IC(1))
       * rIR5: mean(rank IC(5)) / std(rank IC(5))
       * rIR20: mean(rank IC(20)) / std(rank IC(20))
    """
    index = ['days', 'IR1', 'rIR1']
    tmp = {
        'days': util.resample(self.get_ic(1), how='count', by=by),
        'IR1': util.resample(self.get_ic(1), how='ir', by=by),
        'rIR1': util.resample(self.get_ic(1, rank=True), how='ir', by=by),
    }
    if freq == 'weekly':
        index.extend(['IR5', 'rIR5'])
        tmp.update({
            'IR5': util.resample(self.get_ic(5), how='ir', by=by),
            'rIR5': util.resample(self.get_ic(5, rank=True), how='ir', by=by),
        })
    elif freq == 'monthly':
        index.extend(['IR5', 'rIR5', 'IR20', 'rIR20'])
        tmp.update({
            'IR5': util.resample(self.get_ic(5), how='ir', by=by),
            'rIR5': util.resample(self.get_ic(5, rank=True), how='ir', by=by),
            'IR20': util.resample(self.get_ic(20), how='ir', by=by),
            'rIR20': util.resample(self.get_ic(20, rank=True), how='ir', by=by),
        })
    res = pd.Series(tmp) if by is None else pd.DataFrame(tmp).T
    res = res.reindex(index)
    return res
def summary_ir(self, by=None, freq='daily'):
    """Returns an IR-related metrics summary Dataframe."""
    index = ['days', 'IR_t', 'rIR_t', 'IR_h', 'rIR_h', 'IR_d', 'rIR_d']
    ic_t, ic_h, ic_d = self.get_ic()
    ric_t, ric_h, ric_d = self.get_ic(rank=True)
    res = {
        'days': util.resample(ic_t, how='count', by=by),
        'IR_t': util.resample(ic_t, how='ir', by=by),
        'rIR_t': util.resample(ric_t, how='ir', by=by),
        'IR_h': util.resample(ic_h, how='ir', by=by),
        'rIR_h': util.resample(ric_h, how='ir', by=by),
        'IR_d': util.resample(ic_d, how='ir', by=by),
        'rIR_d': util.resample(ric_d, how='ir', by=by),
    }
    res = pd.Series(res) if by is None else pd.DataFrame(res).T
    res = res.reindex(index)
    return pd.DataFrame({'ALL': res}) if by is None else res
def predict(path: str, data_x: np.ndarray):
    # Pretreatment
    data_x = [data_x]
    data_x, length = util.resample(data_x, 600)
    data_x = util.reshape(data_x, length)
    for i in range(len(data_x)):
        data_x[i, :, 0] = util.regularize(data_x[i, :, 0])
        data_x[i, :, 1] = util.regularize(data_x[i, :, 1])
        data_x[i, :, 2] = util.regularize(data_x[i, :, 2])

    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(os.path.join(path, '.meta'))
        saver.restore(sess, path)
        graph = tf.get_default_graph()
        placehold_x = graph.get_tensor_by_name('input/data_x:0')
        predict_value = graph.get_tensor_by_name('accuracy/predict:0')
        keep_prob = graph.get_tensor_by_name('keep_prob:0')
        return sess.run(predict_value, feed_dict={placehold_x: data_x, keep_prob: 1})[0][0]
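# predict() (and train() further down) assume util.resample(list_of_sequences, 600)
# stretches or compresses each variable-length (T_i x 3) recording to a common length
# of 600 samples and returns (resampled_list, 600). That behaviour is an assumption;
# a minimal linear-interpolation sketch:
import numpy as np

def resample(sequences, length):
    """Hypothetical fixed-length resampler for a list of (T_i, C) arrays."""
    out = []
    for seq in sequences:
        seq = np.asarray(seq, dtype=float)
        t_old = np.linspace(0.0, 1.0, num=seq.shape[0])
        t_new = np.linspace(0.0, 1.0, num=length)
        # interpolate each channel independently onto the new time grid
        out.append(np.stack([np.interp(t_new, t_old, seq[:, c])
                             for c in range(seq.shape[1])], axis=1))
    return out, length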
def resampled_mv(X, alg, n_base_partitions=30, resample_proportion=0.8):
    """Majority voting with resampling"""
    N = X.shape[0]
    ca = np.zeros((N, N))
    Is = np.zeros((N, N))
    for h in range(n_base_partitions):
        resampled_indices, resampled_data = resample(X, resample_proportion)
        alg.fit(resampled_data)
        if hasattr(alg, 'predict'):
            Mh = alg.predict(resampled_data)
        else:
            Mh = alg.labels_
        id_clusts = np.argsort(Mh)
        sorted_ = Mh[id_clusts]
        k = len(np.unique(sorted_))
        for i in range(k):  # for each cluster
            ia = bisect.bisect_left(sorted_, i)
            ib = bisect.bisect_right(sorted_, i)
            cluster_indices = id_clusts[ia:ib]
            is_ = resampled_indices[cluster_indices]
            ids_ = np.array(list(combinations(is_, 2))).T
            if ids_.size != 0:
                ca[ids_[0], ids_[1]] += 1
        ids_2 = np.array(list(combinations(resampled_indices, 2))).T
        Is[ids_2[0], ids_2[1]] += 1
    Is += Is.T
    ca = ca / (Is + 1e-8)
    ca += ca.T
    ca += np.eye(N)
    labels = mv_consensus(ca)
    return labels
def fit_from_cfg(self, data):
    self.X = data
    N = data.shape[0]  # number of points
    Mk = np.zeros((N, N))
    # counter for each pair of points: how often both appeared in the resampled data
    Is = np.zeros((N, N))
    for h in range(self.H_):  # resample H times
        resampled_indices, resample_data = util.resample(data, self.resample_proportion_)
        self.cluster_.fit(resample_data)
        if hasattr(self.cluster_, 'predict'):
            Mh = self.cluster_.predict(resample_data)
        else:
            Mh = self.cluster_.labels_
        id_clusts = np.argsort(Mh)
        sorted_ = Mh[id_clusts]  # 0000000000111111111111222222
        k = len(np.unique(sorted_))
        for i in range(k):  # for each cluster
            ia = bisect.bisect_left(sorted_, i)
            ib = bisect.bisect_right(sorted_, i)
            cluster_indices = id_clusts[ia:ib]
            is_ = resampled_indices[cluster_indices]
            ids_ = np.array(list(combinations(is_, 2))).T  # get all pairs of i-th cluster
            # sometimes only one element is in a cluster (no combinations)
            if ids_.size != 0:
                Mk[ids_[0], ids_[1]] += 1  # increment counts
        ids_2 = np.array(list(combinations(resampled_indices, 2))).T
        Is[ids_2[0], ids_2[1]] += 1
    Is += Is.T
    Mk /= Is + 1e-8  # consensus matrix
    Mk += Mk.T  # Mk is upper triangular (with zeros on the diagonal); make it symmetric
    Mk += np.eye(N)
    Is.fill(0)  # reset counter
    self.Mk = Mk
def _resample(self, tmp_env, frame_rate):
    return resample(tmp_env, frame_rate, self.sr)
def create_hdf5(series_list, output_dir, resample=False, max_series=1e5):
    hdf5_fh = h5py.File(os.path.join(output_dir, 'data.hdf5'), 'a')
    for group_name in ('series', 'aneurysm_masks'):
        if group_name not in hdf5_fh:
            hdf5_fh.create_group('/{}'.format(group_name))

    assert len(series_list) < 1e5, 'Too many series for 5-digit IDs.'
    for i, s in enumerate(series_list):
        if i >= max_series:
            break
        dset_path = '/series/{:05d}'.format(i + 1)
        if dset_path in hdf5_fh:
            continue
        print('Processing series {} from study {}...'.format(s.series_number, s.study_name))
        pixel_arrays = []
        is_valid_series = True
        for slice_name in tqdm(s.slice_names, total=len(s), unit=' slices'):
            # Process and write slices
            dcm_path = os.path.join(s.dcm_dir, slice_name + '.dcm')
            dcm = util.read_dicom(dcm_path)
            try:
                pixel_arrays.append(util.dcm_to_raw(dcm))
            except NotImplementedError:
                print('Unsupported image format, not converting study: {}'.format(s.study_name))
                is_valid_series = False
                break
        if not is_valid_series:
            continue

        volume = np.stack(pixel_arrays)
        aneurysm_mask_path = os.path.join(s.dcm_dir, 'aneurysm_mask.npy')
        if os.path.exists(aneurysm_mask_path):
            s.aneurysm_mask_path = aneurysm_mask_path
            aneurysm_mask = np.transpose(np.load(s.aneurysm_mask_path), [2, 0, 1])
        else:
            s.aneurysm_mask_path = None
            aneurysm_mask = None
        assert aneurysm_mask is None or aneurysm_mask.shape == volume.shape, \
            'Mismatched aneurysm mask and volume shapes: {} and {}'.format(aneurysm_mask.shape, volume.shape)

        if len(s) > 0 and resample:
            util.print_err('Resampling volume... Shape before: {}'.format(volume.shape))
            tick = time.time()
            dcm = util.read_dicom(os.path.join(s.dcm_dir, s.slice_names[0] + '.dcm'))
            volume, real_scale = util.resample(volume, dcm.SliceThickness, dcm.PixelSpacing, (1.5, 1., 1.))
            util.print_err('Shape after: {}. Resample took {} s.'.format(volume.shape, time.time() - tick))
            if aneurysm_mask is not None:
                util.print_err('Resampling mask... Shape before: {}, count before: {}.'
                               .format(aneurysm_mask.shape, np.sum(aneurysm_mask > 0)))
                tick = time.time()
                aneurysm_mask, mask_scale = util.resample(aneurysm_mask, dcm.SliceThickness,
                                                          dcm.PixelSpacing, (1.5, 1., 1.))
                util.print_err('Mask shape after: {}, count after: {}. Resample took {} s.'
                               .format(aneurysm_mask.shape, np.sum(aneurysm_mask > 0), time.time() - tick))
                if not aneurysm_mask.any():
                    raise RuntimeError('Mask has zero volume after resampling.')
                if s.is_aneurysm:
                    # Recompute slice numbers where the aneurysm lives
                    s.aneurysm_bounds = get_aneurysm_range(aneurysm_mask)
                    s.aneurysm_ranges = [s.aneurysm_bounds]
                    s.absolute_range = [0, aneurysm_mask.shape[0]]

        # Create one dataset for the volume (int16), one for the mask (bool)
        s.dset_path = dset_path
        hdf5_fh.create_dataset(s.dset_path, data=volume, dtype='i2', chunks=True)
        if aneurysm_mask is not None:
            s.aneurysm_mask_path = '/aneurysm_masks/{:05d}'.format(i + 1)
            hdf5_fh.create_dataset(s.aneurysm_mask_path, data=aneurysm_mask, dtype='?', chunks=True)

    # Print summary
    util.print_err('Series: {}'.format(len(hdf5_fh['/series'])))
    util.print_err('Aneurysm Masks: {}'.format(len(hdf5_fh['/aneurysm_masks'])))

    # Dump pickle and JSON (updated dset_path and mask_path attributes)
    util.print_err('Dumping pickle file...')
    with open(os.path.join(output_dir, 'series_list.pkl'), 'wb') as pkl_fh:
        pickle.dump(series_list, pkl_fh)
    util.print_err('Dumping JSON file...')
    with open(os.path.join(output_dir, 'series_list.json'), 'w') as json_file:
        json.dump([dict(series) for series in series_list], json_file,
                  indent=4, sort_keys=True, default=util.json_encoder)

    # Clean up
    hdf5_fh.close()
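# create_hdf5 assumes util.resample(volume, slice_thickness, pixel_spacing, target_spacing)
# rescales a (slices, H, W) volume from its native voxel spacing to target_spacing
# (here 1.5 x 1.0 x 1.0 mm) and returns the resampled volume plus the scale factor that
# was actually applied. A rough sketch of that convention using scipy.ndimage.zoom --
# an assumption, not the project's implementation:
import numpy as np
import scipy.ndimage

def resample(volume, slice_thickness, pixel_spacing, target_spacing=(1.5, 1., 1.)):
    spacing = np.array([float(slice_thickness), float(pixel_spacing[0]), float(pixel_spacing[1])])
    zoom = spacing / np.asarray(target_spacing, dtype=float)
    # round the output shape, then recompute the zoom factor that is really used
    new_shape = np.round(np.array(volume.shape) * zoom)
    real_scale = new_shape / np.array(volume.shape)
    resampled = scipy.ndimage.zoom(volume, real_scale, order=1, mode='nearest')
    return resampled, real_scale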
def calc_stoi(clean_sig, bad_sig, fs_signal):
    if len(clean_sig) != len(bad_sig):
        raise ValueError('the lengths of the clean signal and the bad signal are not equal')
    x, y = np.array(clean_sig), np.array(bad_sig)
    fs = 10000
    N_frame = 256
    K = 512
    J = 15
    mn = 150
    H, _ = _thirdoct(fs, K, J, mn)
    N = 30
    Beta = -15
    dyn_range = 40

    if fs_signal != fs:
        x = util.resample(x, fs_signal, fs)
        y = util.resample(y, fs_signal, fs)

    x, y = _rm_silent_frame(x, y, dyn_range, N_frame, N_frame // 2)
    if len(x) <= 0:
        raise ValueError("Signal contains no speech fragments")

    x_hat = _stdft(x, N_frame, N_frame / 2, K)
    y_hat = _stdft(y, N_frame, N_frame / 2, K)
    x_hat = np.transpose(x_hat[:, 0:K // 2 + 1])
    y_hat = np.transpose(y_hat[:, 0:K // 2 + 1])

    X, Y = [], []
    for i in range(x_hat.shape[1]):
        X.append(np.sqrt(H.dot(np.abs(x_hat[:, i]) ** 2)))
        Y.append(np.sqrt(H.dot(np.abs(y_hat[:, i]) ** 2)))
    X = np.array(X)
    Y = np.array(Y)
    X = X.T
    Y = Y.T

    c = 10 ** (-Beta / 20.)
    score, count = 0., 0
    for m in range(N, X.shape[1] + 1):
        X_seg = X[:, m - N:m]
        Y_seg = Y[:, m - N:m]
        Y_square_sum = np.sum(np.square(Y_seg), axis=1)
        Y_square_sum[Y_square_sum <= 0] = np.finfo(np.float64).eps
        alpha = np.sqrt(np.sum(np.square(X_seg), axis=1) / Y_square_sum)
        alpha = np.reshape(alpha, [len(alpha), 1])
        aY_seg = Y_seg * np.tile(alpha, [1, N])
        for j in range(J):
            aX = X_seg[j, :] + X_seg[j, :].dot(c)
            Y_prime = [min(x, y) for x, y in zip(aY_seg[j, :], aX)]
            Y_prime = np.array(Y_prime)
            s = _correlation_coefficient(X_seg[j, :], Y_prime)
            score += s
            count += 1
    score /= max(count, 1)
    return score
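# calc_stoi assumes util.resample(x, fs_orig, fs_target) converts a 1-D signal from its
# original sampling rate to the 10 kHz rate used by STOI. A minimal polyphase sketch
# under that assumption (the real helper may differ):
from fractions import Fraction
import numpy as np
from scipy.signal import resample_poly

def resample(x, fs_orig, fs_target):
    ratio = Fraction(int(fs_target), int(fs_orig))
    # upsample by the numerator, downsample by the denominator
    return resample_poly(np.asarray(x, dtype=float), ratio.numerator, ratio.denominator)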
def get_ir(self, rank=False, by=None):
    ic_t, ic_h, ic_d = self.get_ic(rank=rank)
    return (util.resample(ic_t, how='ir', by=by),
            util.resample(ic_h, how='ir', by=by),
            util.resample(ic_d, how='ir', by=by))
def train():
    TIMESTAMP = "{0:%Y-%m-%d-%H-%M/}".format(datetime.now())
    log.log_info('program start')
    data, num_good, num_bad = util.load_train_data(num_data // 2)
    log.log_debug('Data loading completed')

    # resample
    data, length = util.resample(data, 600)
    data = util.reshape(data, length)
    good_data_origin = data[:num_good, :]
    bad_data_origin = data[num_good:, :]

    # extract bad data for test and train
    permutation = list(np.random.permutation(len(bad_data_origin)))
    shuffled_bad_data = bad_data_origin[permutation, :]
    test_bad_data = shuffled_bad_data[:int(num_bad * 0.3), :]
    train_bad_data_origin = shuffled_bad_data[int(num_bad * 0.3):, :]
    # extract corresponding good data for test and train
    permutation = list(np.random.permutation(len(good_data_origin)))
    shuffled_good_data = good_data_origin[permutation, :]
    test_good_data = shuffled_good_data[:len(test_bad_data), :]
    train_good_data = shuffled_good_data[len(test_bad_data):, :]

    assert len(test_bad_data) == len(test_good_data)

    # construct test data
    test_y = np.array([1.] * len(test_good_data) + [0.] * len(test_bad_data),
                      dtype=np.float).reshape((len(test_bad_data) + len(test_good_data), 1))
    test_x = np.vstack((test_good_data, test_bad_data))

    # expand the number of bad data for train
    train_x = np.vstack((train_good_data, train_bad_data_origin))
    train_y = np.array([1.] * len(train_good_data) + [0.] * len(train_bad_data_origin),
                       dtype=np.float).reshape((len(train_bad_data_origin) + len(train_good_data), 1))
    train_x, train_y, num_expand = util.expand(train_x, train_y)

    # regularize
    for i in range(len(train_x)):
        train_x[i, :, 0] = util.regularize(train_x[i, :, 0])
        train_x[i, :, 1] = util.regularize(train_x[i, :, 1])
        train_x[i, :, 2] = util.regularize(train_x[i, :, 2])
    for i in range(len(test_x)):
        test_x[i, :, 0] = util.regularize(test_x[i, :, 0])
        test_x[i, :, 1] = util.regularize(test_x[i, :, 1])
        test_x[i, :, 2] = util.regularize(test_x[i, :, 2])

    # shuffle
    train_x, train_y = util.shuffle_data(train_x, train_y)
    log.log_debug('prepare completed')

    log.log_info('convolution layers: ' + str(conv_layers))
    log.log_info('filters: ' + str(filters))
    log.log_info('full connected layers: ' + str(fc_layers))
    log.log_info('learning rate: %f' % learning_rate)
    log.log_info('keep prob: ' + str(keep_prob))
    log.log_info('the number of expanding bad data: ' + str(num_expand))
    log.log_info('mini batch size: ' + str(mini_batch_size))
    if mini_batch_size != 0:
        assert mini_batch_size <= len(train_x)

    cnn = Cnn(conv_layers, fc_layers, filters, learning_rate)
    (m, n_W0, n_C0) = train_x.shape
    n_y = train_y.shape[1]

    # construct the calculation graph
    cnn.initialize(n_W0, n_C0, n_y)
    cost = cnn.cost()
    optimizer = cnn.get_optimizer(cost)
    predict, accuracy = cnn.predict()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # log for tensorboard
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter("resource/tsb/train/" + TIMESTAMP, sess.graph)
        test_writer = tf.summary.FileWriter("resource/tsb/test/" + TIMESTAMP)
        if enable_debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.run(init)

        for i in range(1, num_epochs + 1):
            if mini_batch_size != 0:
                num_mini_batches = int(m / mini_batch_size)
                mini_batches = util.random_mini_batches(train_x, train_y, mini_batch_size)
                cost_value = 0
                for mini_batch in mini_batches:
                    (mini_batch_x, mini_batch_y) = mini_batch
                    _, temp_cost = sess.run([optimizer, cost],
                                            feed_dict={cnn.x: mini_batch_x, cnn.y: mini_batch_y,
                                                       cnn.keep_prob: keep_prob})
                    cost_value += temp_cost
                cost_value /= num_mini_batches
            else:
                _, cost_value = sess.run([optimizer, cost],
                                         feed_dict={cnn.x: train_x, cnn.y: train_y,
                                                    cnn.keep_prob: keep_prob})

            # disable dropout for evaluation
            summary_train, train_accuracy = sess.run([merged, accuracy],
                                                     feed_dict={cnn.x: train_x, cnn.y: train_y,
                                                                cnn.keep_prob: 1})
            summary_test, test_accuracy = sess.run([merged, accuracy],
                                                   feed_dict={cnn.x: test_x, cnn.y: test_y,
                                                              cnn.keep_prob: 1})
            train_writer.add_summary(summary_train, i - 1)
            test_writer.add_summary(summary_test, i - 1)

            if print_detail and (i % 10 == 0 or i == 1):
                info = '\nIteration %d\n' % i + \
                       'Cost: %f\n' % cost_value + \
                       'Train accuracy: %f\n' % train_accuracy + \
                       'Test accuracy: %f' % test_accuracy
                log.log_info(info)

            # stop early when test accuracy >= 0.95 and train accuracy >= 0.99
            if test_accuracy >= 0.95 and train_accuracy >= 0.99:
                info = '\nIteration %d\n' % i + \
                       'Cost: %f\n' % cost_value + \
                       'Train accuracy: %f\n' % train_accuracy + \
                       'Test accuracy: %f' % test_accuracy
                log.log_info(info)
                saver.save(sess, "resource/model/" + TIMESTAMP)
                break

        saver.save(sess, "resource/model/" + TIMESTAMP)
        train_writer.close()
        test_writer.close()
    log.log_info('program end')
from util import resample, get_list_of_files
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

file_list = get_list_of_files('./', 'log')

bids = []
asks = []
for file in tqdm(file_list):
    bid, ask = resample(file, '1Min')
    bids.append(bid)
    asks.append(ask)

main = pd.DataFrame(bids[0])
main1 = pd.DataFrame(asks[0])
for bid in bids[1:]:
    main = main.append(bid)
for ask in asks[1:]:
    main1 = main1.append(ask)

main.sort_index(inplace=True)
main1.sort_index(inplace=True)
main.to_csv(r'bid_ohlc_1min.csv')
main1.to_csv(r'ask_ohlc_1min.csv')

# # PART 2
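# The script above assumes resample(path, '1Min') parses one tick log into per-minute
# bid and ask OHLC frames. The log format is not shown, so the column names and
# timestamp parsing in this sketch are purely illustrative:
import pandas as pd

def resample(path, rule='1Min'):
    # hypothetical log layout: timestamp, bid, ask (comma separated)
    ticks = pd.read_csv(path, names=['timestamp', 'bid', 'ask'],
                        parse_dates=['timestamp'], index_col='timestamp')
    bid_ohlc = ticks['bid'].resample(rule).ohlc()
    ask_ohlc = ticks['ask'].resample(rule).ohlc()
    return bid_ohlc, ask_ohlc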
def get_ir(self, n=1, rank=False, by=None):
    return util.resample(self.get_ic(n=n, rank=rank), how='ir', by=by)