# Module-level imports inferred from usage in the excerpts below. The
# `plasma.*` paths and the `g` alias follow the FRNN source tree but are
# assumptions here, not confirmed by this excerpt:
import copy
import multiprocessing as mp
import os
import sys
import time
from functools import partial

import numpy as np

from plasma.primitives.shots import ShotList
import plasma.global_vars as g


def load_ith_file(self):
    results_files = os.listdir(self.results_dir)
    print(results_files)
    dat = np.load(self.results_dir + results_files[self.i],
                  allow_pickle=True)
    print("Loading results file {}".format(
        self.results_dir + results_files[self.i]))
    if self.verbose:
        print('configuration: {}'.format(dat['conf']))
    self.pred_train = dat['y_prime_train']
    self.truth_train = dat['y_gold_train']
    self.disruptive_train = dat['disruptive_train']
    self.pred_test = dat['y_prime_test']
    self.truth_test = dat['y_gold_test']
    self.disruptive_test = dat['disruptive_test']
    self.shot_list_test = ShotList(dat['shot_list_test'][()])
    self.shot_list_train = ShotList(dat['shot_list_train'][()])
    self.saved_conf = dat['conf'][()]
    # all files must agree on T_warning due to output of truth vs.
    # normalized shot ttd.
    self.conf['data']['T_warning'] = self.saved_conf['data']['T_warning']
    for mode in ['test', 'train']:
        print('{}: loaded {} shots ({} disruptive)'.format(
            mode, self.get_num_shots(mode),
            self.get_num_disruptive_shots(mode)))
    if self.verbose:
        self.print_conf()
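# Hedged sketch (not from the source): the .npz archive consumed above is
# expected to carry at least the keys read by load_ith_file(). A minimal
# compatible file could be produced like this, assuming the prediction
# arrays, ShotList objects, and `conf` dict already exist; the filename
# is illustrative:
#
# np.savez(os.path.join(results_dir, 'results_0.npz'),
#          y_prime_train=pred_train, y_gold_train=truth_train,
#          disruptive_train=disruptive_train,
#          y_prime_test=pred_test, y_gold_test=truth_test,
#          disruptive_test=disruptive_test,
#          shot_list_train=shot_list_train,
#          shot_list_test=shot_list_test,
#          conf=conf)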
def preprocess_from_files(self, shot_files, use_shots):
    # all shots, including invalid ones
    all_signals = self.conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    # empty
    used_shots = ShotList()
    use_cores = max(1, mp.cpu_count() - 2)
    pool = mp.Pool(use_cores)
    print('running in parallel on {} processes'.format(pool._processes))
    start_time = time.time()
    for (i, shot) in enumerate(
            pool.imap_unordered(self.preprocess_single_file,
                                shot_list_picked)):
        # for (i,shot) in
        # enumerate(map(self.preprocess_single_file,shot_list_picked)):
        sys.stdout.write('\r{}/{}'.format(i, len(shot_list_picked)))
        used_shots.append_if_valid(shot)
    pool.close()
    pool.join()
    # leading '\n' terminates the '\r' progress line above
    print('\nFinished Preprocessing {} files in {} seconds'.format(
        len(shot_list_picked), time.time() - start_time))
    print('Omitted {} shots of {} total.'.format(
        len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
    print('{}/{} disruptive shots'.format(
        used_shots.num_disruptive(), len(used_shots)))
    if len(used_shots) == 0:
        print("WARNING: All shots were omitted, please ensure raw data "
              "is complete and available at {}.".format(
                  self.conf['paths']['signal_prepath']))
    return used_shots
def load_shotlists(self):
    path = self.get_shot_list_path()
    data = np.load(path, encoding="latin1", allow_pickle=True)
    shot_list_train = data['shot_list_train'][()]
    shot_list_validate = data['shot_list_validate'][()]
    shot_list_test = data['shot_list_test'][()]
    if isinstance(shot_list_train, ShotList):
        return shot_list_train, shot_list_validate, shot_list_test
    else:
        return (ShotList(shot_list_train), ShotList(shot_list_validate),
                ShotList(shot_list_test))
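# Hedged round-trip sketch (key names taken from the loader above; the
# rest is illustrative): an archive that load_shotlists() can read back
# could be written with np.savez, relying on pickling of the ShotList
# objects into 0-d object arrays, which the '[()]' indexing unwraps:
#
# np.savez(self.get_shot_list_path(),
#          shot_list_train=shot_list_train,
#          shot_list_validate=shot_list_validate,
#          shot_list_test=shot_list_test)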
def create_shot_list_tmp(original_shot, time_points, sigs=None):
    shot_list_tmp = ShotList()
    T = len(original_shot.ttd)
    # np.int was removed in NumPy 1.24; the builtin int works as a dtype
    t_range = np.linspace(0, T - 1, time_points, dtype=int)
    for t in t_range:
        new_shot = copy.copy(original_shot)
        assert new_shot.augmentation_fn is None
        new_shot.augmentation_fn = partial(
            hide_signal_data, t=t, sigs_to_hide=sigs)
        # new_shot.number = original_shot.number
        shot_list_tmp.append(new_shot)
    return shot_list_tmp, t_range
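# Hedged usage sketch (not from the source): sweeping the masking time
# to measure a signal's influence on a prediction. Each copy's
# augmentation_fn freezes the selected signals from index t onward when
# the shot data is applied; the model-evaluation step is a hypothetical
# stand-in.
#
# shot_list_tmp, t_range = create_shot_list_tmp(shot, time_points=10,
#                                               sigs=[sig_to_test])
# for s, t in zip(shot_list_tmp, t_range):
#     # ... evaluate the trained model on s (signals frozen from t on)
#     # and compare against the prediction for the unmasked shot
#     pass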
def train_on_files(self, shot_files, use_shots, all_machines,
                   verbose=False):
    conf = self.conf
    all_signals = conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    previously_saved, machines_saved = self.previously_saved_stats()
    machines_to_compute = all_machines - machines_saved
    recompute = conf['data']['recompute_normalization']
    if recompute:
        machines_to_compute = all_machines
        previously_saved = False
    if not previously_saved or len(machines_to_compute) > 0:
        if previously_saved:
            self.load_stats(verbose=True)
        print('computing normalization for machines {}'.format(
            machines_to_compute))
        use_cores = max(1, mp.cpu_count() - 2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(
            pool._processes))
        start_time = time.time()
        for (i, stats) in enumerate(
                pool.imap_unordered(self.train_on_single_shot,
                                    shot_list_picked)):
            # for (i,stats) in
            # enumerate(map(self.train_on_single_shot,shot_list_picked)):
            if stats.machine in machines_to_compute:
                self.incorporate_stats(stats)
                self.machines.add(stats.machine)
            sys.stdout.write('\r' + '{}/{}'.format(
                i, len(shot_list_picked)))
        pool.close()
        pool.join()
        print('\nFinished Training Normalizer on '
              '{} files in {} seconds'.format(
                  len(shot_list_picked), time.time() - start_time))
        self.save_stats(verbose=True)
    else:
        self.load_stats(verbose=verbose)
    # print representation of trained Normalizer to stdout:
    # Machine, NormalizerName, per-signal normalization stats/params
    if verbose:
        g.print_unique(self)
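# Hedged usage sketch (not from the source): the set difference
# `all_machines - machines_saved` above implies both behave as sets.
# With hypothetical machine tags, a call could look like:
#
# normalizer.train_on_files(shot_files, use_shots,
#                           all_machines={'d3d', 'jet'}, verbose=True)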
def train_on_files(self, shot_files, use_shots, all_machines):
    conf = self.conf
    all_signals = conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    previously_saved, machines_saved = self.previously_saved_stats()
    machines_to_compute = all_machines - machines_saved
    recompute = conf['data']['recompute_normalization']
    if recompute:
        machines_to_compute = all_machines
        previously_saved = False
    if not previously_saved or len(machines_to_compute) > 0:
        if previously_saved:
            self.load_stats()
        print('computing normalization for machines {}'.format(
            machines_to_compute))
        use_cores = max(1, mp.cpu_count() - 2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(
            pool._processes))
        start_time = time.time()
        for (i, stats) in enumerate(
                pool.imap_unordered(self.train_on_single_shot,
                                    shot_list_picked)):
            # for (i,stats) in
            # enumerate(map(self.train_on_single_shot,shot_list_picked)):
            if stats.machine in machines_to_compute:
                self.incorporate_stats(stats)
                self.machines.add(stats.machine)
            sys.stdout.write('\r' + '{}/{}'.format(
                i, len(shot_list_picked)))
        pool.close()
        pool.join()
        # leading '\n' terminates the '\r' progress line above
        print('\nFinished Training Normalizer on {} files '
              'in {} seconds'.format(
                  len(shot_list_picked), time.time() - start_time))
        self.save_stats()
    else:
        self.load_stats()
    print(self)
#####################################################
#                   NORMALIZATION                   #
#####################################################
# TODO(KGF): identical in at least 3x files in examples/
# make sure preprocessing has been run, and is saved as a file
if task_index == 0:
    # TODO(KGF): check tuple unpack
    (shot_list_train, shot_list_validate,
     shot_list_test) = guarantee_preprocessed(conf)
comm.Barrier()
(shot_list_train, shot_list_validate,
 shot_list_test) = guarantee_preprocessed(conf)

shot_list = sum(
    [l.filter_by_number([shot_num])
     for l in [shot_list_train, shot_list_validate, shot_list_test]],
    ShotList())
assert len(shot_list) == 1
# for s in shot_list.shots:
#     s.restore()


def chunks(l, n):
    """Return successive n-sized chunks from l."""
    return [l[i:i + n] for i in range(0, len(l), n)]


def hide_signal_data(shot, t=0, sigs_to_hide=None):
    # hide all signals by default, or only those listed in sigs_to_hide
    for sig in shot.signals:
        if sigs_to_hide is None or sig in sigs_to_hide:
            shot.signals_dict[sig][t:, :] = shot.signals_dict[sig][t, :]
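# Hedged example: `chunks` returns fixed-size pieces of a list (the last
# piece may be shorter), e.g. for splitting shots across workers:
#
# >>> chunks(list(range(7)), 3)
# [[0, 1, 2], [3, 4, 5], [6]]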
def apply_bleed_in(conf, shot_list_train, shot_list_validate,
                   shot_list_test):
    np.random.seed(2)
    num = conf['data']['bleed_in']
    new_shots = []
    if num > 0:
        shot_list_bleed = ShotList()
        print('applying bleed in with {} disruptive shots\n'.format(num))
        num_total = len(shot_list_test)
        num_d = shot_list_test.num_disruptive()
        num_nd = num_total - num_d
        assert num_d >= num, (
            "Not enough disruptive shots {} to cover bleed in {}".format(
                num_d, num))
        num_sampled_d = 0
        num_sampled_nd = 0
        while num_sampled_d < num:
            s = shot_list_test.sample_shot()
            shot_list_bleed.append(s)
            if conf['data']['bleed_in_remove_from_test']:
                shot_list_test.remove(s)
            if s.is_disruptive:
                num_sampled_d += 1
            else:
                num_sampled_nd += 1
        print("Sampled {} shots, {} disruptive, {} nondisruptive".format(
            num_sampled_nd + num_sampled_d, num_sampled_d, num_sampled_nd))
        print("Before adding: training shots: {} validation shots: {}".format(
            len(shot_list_train), len(shot_list_validate)))
        assert num_sampled_d == num
        if conf['data']['bleed_in_equalize_sets']:
            # add bleed-in shots to training and validation set repeatedly
            print("Applying equalized bleed in")
            for shot_list_curr in [shot_list_train, shot_list_validate]:
                for i in range(len(shot_list_curr)):
                    s = shot_list_bleed.sample_shot()
                    shot_list_curr.append(s)
        elif conf['data']['bleed_in_repeat_fac'] > 1:
            repeat_fac = conf['data']['bleed_in_repeat_fac']
            print("Applying bleed in with repeat factor {}".format(
                repeat_fac))
            num_to_sample = int(round(repeat_fac * len(shot_list_bleed)))
            for i in range(num_to_sample):
                s = shot_list_bleed.sample_shot()
                shot_list_train.append(s)
                shot_list_validate.append(s)
        else:  # add each shot only once
            print("Applying bleed in without repetition")
            for s in shot_list_bleed:
                shot_list_train.append(s)
                shot_list_validate.append(s)
        print("After adding: training shots: {} validation shots: {}".format(
            len(shot_list_train), len(shot_list_validate)))
        print("Added bleed in shots to training and validation sets")
        # if num_d > 0:
        #     for i in range(num):
        #         s = shot_list_test.sample_single_class(True)
        #         shot_list_train.append(s)
        #         shot_list_validate.append(s)
        #         if conf['data']['bleed_in_remove_from_test']:
        #             shot_list_test.remove(s)
        # else:
        #     print('No disruptive shots in test set, omitting bleed in')
        # if num_nd > 0:
        #     for i in range(num):
        #         s = shot_list_test.sample_single_class(False)
        #         shot_list_train.append(s)
        #         shot_list_validate.append(s)
        #         if conf['data']['bleed_in_remove_from_test']:
        #             shot_list_test.remove(s)
        # else:
        #     print('No nondisruptive shots in test set, omitting bleed in')
    return shot_list_train, shot_list_validate, shot_list_test
def apply_bleed_in(conf, shot_list_train, shot_list_validate,
                   shot_list_test):
    np.random.seed(2)
    num = conf['data']['bleed_in']
    # new_shots = []
    if num > 0:
        shot_list_bleed = ShotList()
        print('applying bleed in with {} disruptive shots\n'.format(num))
        # num_total = len(shot_list_test)
        num_d = shot_list_test.num_disruptive()
        # num_nd = num_total - num_d
        assert num_d >= num, (
            "Not enough disruptive shots {} to cover bleed in {}".format(
                num_d, num))
        num_sampled_d = 0
        num_sampled_nd = 0
        while num_sampled_d < num:
            s = shot_list_test.sample_shot()
            shot_list_bleed.append(s)
            if conf['data']['bleed_in_remove_from_test']:
                shot_list_test.remove(s)
            if s.is_disruptive:
                num_sampled_d += 1
            else:
                num_sampled_nd += 1
        print("Sampled {} shots, {} disruptive, {} nondisruptive".format(
            num_sampled_nd + num_sampled_d, num_sampled_d, num_sampled_nd))
        print("Before adding: training shots: {} validation shots: {}".format(
            len(shot_list_train), len(shot_list_validate)))
        assert (num_sampled_d == num)
        # add bleed-in shots to training and validation set repeatedly
        if conf['data']['bleed_in_equalize_sets']:
            print("Applying equalized bleed in")
            for shot_list_curr in [shot_list_train, shot_list_validate]:
                for i in range(len(shot_list_curr)):
                    s = shot_list_bleed.sample_shot()
                    shot_list_curr.append(s)
        elif conf['data']['bleed_in_repeat_fac'] > 1:
            repeat_fac = conf['data']['bleed_in_repeat_fac']
            print("Applying bleed in with repeat factor {}".format(
                repeat_fac))
            num_to_sample = int(round(repeat_fac * len(shot_list_bleed)))
            for i in range(num_to_sample):
                s = shot_list_bleed.sample_shot()
                shot_list_train.append(s)
                shot_list_validate.append(s)
        else:  # add each shot only once
            print("Applying bleed in without repetition")
            for s in shot_list_bleed:
                shot_list_train.append(s)
                shot_list_validate.append(s)
        print("After adding: training shots: {} validation shots: {}".format(
            len(shot_list_train), len(shot_list_validate)))
        print("Added bleed in shots to training and validation sets")
        # if num_d > 0:
        #     for i in range(num):
        #         s = shot_list_test.sample_single_class(True)
        #         shot_list_train.append(s)
        #         shot_list_validate.append(s)
        #         if conf['data']['bleed_in_remove_from_test']:
        #             shot_list_test.remove(s)
        # else:
        #     print('No disruptive shots in test set, omitting bleed in')
        # if num_nd > 0:
        #     for i in range(num):
        #         s = shot_list_test.sample_single_class(False)
        #         shot_list_train.append(s)
        #         shot_list_validate.append(s)
        #         if conf['data']['bleed_in_remove_from_test']:
        #             shot_list_test.remove(s)
        # else:
        #     print('No nondisruptive shots in test set, omitting bleed in')
    return shot_list_train, shot_list_validate, shot_list_test
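# Hedged configuration sketch: the keys below are exactly the ones read
# by apply_bleed_in(); the values are illustrative, not defaults from
# the source.
#
# conf = {'data': {
#     'bleed_in': 5,                      # disruptive test shots to sample
#     'bleed_in_remove_from_test': True,  # drop sampled shots from test set
#     'bleed_in_equalize_sets': False,    # resample until train/val double
#     'bleed_in_repeat_fac': 1,           # >1: add ~fac * samples, repeated
# }}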
def preprocess_from_files(self, shot_files, use_shots):
    # all shots, including invalid ones
    all_signals = self.conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    # empty
    used_shots = ShotList()
    # TODO(KGF): generalize the following line to perform well on
    # architectures other than CPUs, e.g. KNLs
    # min( <desired-maximum-process-count>, max(1,mp.cpu_count()-2) )
    use_cores = max(1, mp.cpu_count() - 2)
    pool = mp.Pool(use_cores)
    print('Running in parallel on {} processes'.format(pool._processes))
    start_time = time.time()
    for (i, shot) in enumerate(
            pool.imap_unordered(self.preprocess_single_file,
                                shot_list_picked)):
        # for (i,shot) in
        # enumerate(map(self.preprocess_single_file,shot_list_picked)):
        sys.stdout.write('\r{}/{}'.format(i, len(shot_list_picked)))
        used_shots.append_if_valid(shot)
    pool.close()
    pool.join()
    print('\nFinished preprocessing {} files in {} seconds'.format(
        len(shot_list_picked), time.time() - start_time))
    print('Using {} shots ({} disruptive shots)'.format(
        len(used_shots), used_shots.num_disruptive()))
    print('Omitted {} shots of {} total shots'.format(
        len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
    print('Omitted {} disruptive shots of {} total disruptive shots'.format(
        shot_list_picked.num_disruptive() - used_shots.num_disruptive(),
        shot_list_picked.num_disruptive()))
    if len(used_shots) == 0:
        print("WARNING: All shots were omitted, please ensure raw data "
              "is complete and available at {}.".format(
                  self.conf['paths']['signal_prepath']))
    return used_shots
custom_path = None
if only_predict:
    custom_path = sys.argv[1]
    shot_num = int(sys.argv[2])
    print("predicting using path {} on shot {}".format(
        custom_path, shot_num))
assert only_predict

#####################################################
####################Normalization####################
#####################################################
if task_index == 0:
    # make sure preprocessing has been run, and is saved as a file
    (shot_list_train, shot_list_validate,
     shot_list_test) = guarantee_preprocessed(conf)
comm.Barrier()
(shot_list_train, shot_list_validate,
 shot_list_test) = guarantee_preprocessed(conf)

shot_list = sum(
    [l.filter_by_number([shot_num])
     for l in [shot_list_train, shot_list_validate, shot_list_test]],
    ShotList())
assert len(shot_list) == 1
# for s in shot_list.shots:
#     s.restore()


def chunks(l, n):
    """Return successive n-sized chunks from l."""
    return [l[i:i + n] for i in range(0, len(l), n)]


def hide_signal_data(shot, t=0, sigs_to_hide=None):
    # hide all signals by default, or only those listed in sigs_to_hide
    for sig in shot.signals:
        if sigs_to_hide is None or sig in sigs_to_hide:
            shot.signals_dict[sig][t:, :] = shot.signals_dict[sig][t, :]


def create_shot_list_tmp(original_shot, time_points, sigs=None):
    shot_list_tmp = ShotList()