import os

# project-local modules; assumed to live in the jetnet package, as
# suggested by the 'from jetnet import pynn' import further down
from jetnet import rds, pyprep, profile


def build_rds(working_dir, reduced_dir, reduced_dataset, input_files,
              jet_collection, check_file, do_test):
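    """
    Build the reduced dataset (skimmed ntuple) and a matching profile
    file, then touch check_file to mark the whole step as done. Any
    step whose output file already exists is skipped.
    """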
    if not os.path.isdir(working_dir): 
        os.mkdir(working_dir)

    # --- rds part
    if not os.path.isdir(reduced_dir): 
        os.mkdir(reduced_dir)

    if not os.path.isfile(reduced_dataset): 
        double_variables, int_variables = rds.get_allowed_rds_variables(
            input_files = input_files, 
            jet_collection = jet_collection)


        # observer_discriminators is assumed to be defined at module
        # scope in the original source; it is not passed to build_rds
        pyprep.prep_ntuple(input_files = input_files,
                           double_variables = double_variables,
                           int_variables = int_variables,
                           observer_discriminators = observer_discriminators,
                           jet_collection = jet_collection,
                           output_file = reduced_dataset,
                           debug = do_test)
        
    profile_file = os.path.join(reduced_dir, 'profiled.root')
    if not os.path.isfile(profile_file): 
        profile.make_profile_file(reduced_dataset, profile_file)
    
    # mark as done
    open(check_file,'w').close()
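
# Minimal usage sketch for build_rds; every literal below (the paths,
# the input file list, and the jet collection name) is an assumption
# for illustration, not a value from the original source:
#
#   build_rds(working_dir='work',
#             reduced_dir='work/reduced',
#             reduced_dataset='work/reduced/reduced.root',
#             input_files=['ntuple_1.root', 'ntuple_2.root'],
#             jet_collection='AntiKt4TopoEMJets',
#             check_file='work/reduced/.rds_done',
#             do_test=False)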


# Example 2: the run() method of a training-job class. The class name
# below is assumed; the original class definition is not shown in the
# source.
import re
import itertools
from warnings import warn
# project-local modules, assumed (like pynn below) to live in jetnet
from jetnet import profile, training, utils

class TrainingProcess(object):
    def run(self): 
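        """
        Run the full chain: locate or build a profile file, derive
        normalization constants, train the network, augment the testing
        dataset with the trained classifier, and profile the result.
        The output paths are pushed to self.out_queue.
        """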

        reduced_dataset = self._reduced_dataset
        working_dir = self._working_dir
        training_variables = self._training_variables
        do_test = self._do_test

        # --- profiling 
        profile_dir = os.path.join(working_dir,'profile')
        if not os.path.isdir(profile_dir): 
            os.mkdir(profile_dir)

        profile_file = os.path.join(profile_dir, 'profiled.root')
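        # fall back to a profile file that may already exist next to
        # the reduced dataset, to avoid rebuilding it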
        if not os.path.isfile(profile_file):
            rds_dir = os.path.split(reduced_dataset)[0]
            alt_profile_file = os.path.join(rds_dir, 'profiled.root')
            if os.path.isfile(alt_profile_file): 
                profile_file = alt_profile_file

        mean_rms_file = os.path.join(profile_dir, 'mean_rms.txt')
        if not do_test: 
            if not os.path.isfile(mean_rms_file): 
                if not os.path.isfile(profile_file): 
                    print '--- making profile file for normalization ---'
                    profile.make_profile_file(reduced_dataset, profile_file)
            
                profile.build_mean_rms_from_profile(
                    profile_file = profile_file, 
                    text_file_name = mean_rms_file)

        # --- training part 
        training_dir = os.path.join(working_dir,self._training_subdir)
        if not os.path.isdir(training_dir): 
            os.mkdir(training_dir)
    
        normalization_file = os.path.join(training_dir, 'normalization.txt')
        if not os.path.isfile(normalization_file) and not do_test: 
            if not os.path.isfile(profile_file): 
                profile.make_profile_file(reduced_dataset, profile_file)
    
            profile.make_normalization_file(
                profile_file, 
                normalization_file = normalization_file, 
                whitelist = training_variables)
                                            
        normalization_dict = {}
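        # each line of the normalization file is '<name> <offset> <scale>'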
        if os.path.isfile(normalization_file): 
            with open(normalization_file) as norm_file: 
                for line in norm_file: 
                    line = line.strip()
                    if not line: continue
                    name, offset_str, scale_str = line.split()
                    normalization_dict[name] = (float(offset_str),
                                                float(scale_str))
    
        # normalization_dict can be empty (e.g. in test mode); guard the
        # printout so max() isn't called on an empty sequence
        if normalization_dict:
            print 'normalization:'
            text_size = max(len(x) for x in normalization_dict) + 1
            for name, (offset, scale) in normalization_dict.iteritems():
                print '%-*s offset: % -10.4g scale: % -10.4g' % (
                    text_size, name, offset, scale)
    
        weights_path = os.path.join(training_dir, 'weightMinimum.root')
        if not os.path.isfile(weights_path): 
            print '--- running training ---'
            training.run_training(reduced_dataset = reduced_dataset, 
                                  output_directory = training_dir, 
                                  normalization = normalization_dict, 
                                  flavor_weights = self._flavor_weights, 
                                  nodes = self._nodes, 
                                  debug = do_test, 
                                  events = self._n_training_events, 
                                  other_opt_dict = self._other_opt_dict)
    
        # --- diagnostics part 
        testing_dir = os.path.join(working_dir, self._testing_subdir)
        if not os.path.isdir(testing_dir): 
            os.mkdir(testing_dir)
    

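        # default to testing on the training dataset when no separate
        # testing dataset was given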
        if not self._testing_ds: 
            self._testing_ds = reduced_dataset

        augmented_tree = os.path.join(testing_dir, 'perf_ntuple.root') 
        if not os.path.isfile(augmented_tree): 
            print '--- augmenting reduced dataset with nn classifiers ---'
            # read the branch names here, then close the file once we
            # have them (a context manager would be cleaner; see the
            # sketch below)
            
            from ROOT import TFile
            testing_ds_file = TFile(self._testing_ds)

            the_tree = testing_ds_file.Get('SVTree')
            all_vars_in_tree = utils.get_leaves_in_tree(the_tree)

            # --- filter out branches we don't care about: keep the
            # flavor branches and jet kinematics, plus anything
            # matching the regex below
            output_subset = ['bottom','light','charm','JetPt','JetEta']
            subset_regex = '|'.join([
                '^discriminator(?!Jet)',  # discriminator*, but not discriminatorJet*
                '^log[BCU][bcu]',         # logBc, logCu, logUb, etc.
            ])
            branch_filter = re.compile(subset_regex)
            for branch in itertools.chain(*all_vars_in_tree.values()):
                if branch_filter.findall(branch):
                    output_subset.append(branch)

            # the branch names are plain strings now, so the ROOT file
            # can be closed before the (path-based) augmentation below
            testing_ds_file.Close()
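
            # a sketch of the suggested wrapper (hypothetical helper,
            # not part of jetnet): a context manager guarantees Close()
            # even if the branch listing raises:
            #
            #   from contextlib import contextmanager
            #   from ROOT import TFile
            #
            #   @contextmanager
            #   def open_tfile(path):
            #       tfile = TFile(path)
            #       try:
            #           yield tfile
            #       finally:
            #           tfile.Close()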

            from jetnet import pynn
            pynn.augment_tree(
                in_file = self._testing_ds, 
                nn_file = weights_path, 
                out_file = augmented_tree, 
                ints = all_vars_in_tree['Int_t'], 
                doubles = all_vars_in_tree['Double_t'], 
                extension = self._augment_extension,
                subset = output_subset, 
                show_progress = True) 

        profiled_path = os.path.splitext(augmented_tree)[0] + '_profile.root'
        
        if not os.path.isfile(profiled_path): 
            print '--- profiling performance ntuple ---'
            profile.make_profile_file(reduced_dataset = augmented_tree, 
                                      profile_file = profiled_path)

        output_paths = {
            'profile': profiled_path, 
            'perf_ntuple': augmented_tree, 
            }


        if self._do_more_diagnostics is not None:
            # stacklevel 5 is needed to surface the warning through the
            # multiprocess call
            warn("do_more_diagnostics doesn't do anything now, please remove",
                 SyntaxWarning, stacklevel = 5)

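        # hand the result paths back to the parent process; out_queue is
        # assumed to be a multiprocessing.Queue supplied by the caller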
        self.out_queue.put(output_paths)

# Example 3: a command-line entry point that profiles a reduced dataset.
# Everything above the parse_args call is reconstructed around the
# surviving help-string fragment; the option and function names are
# assumptions.
import sys
from optparse import OptionParser

def profile_main():
    parser = OptionParser(usage='usage: %prog <reduced dataset>')
    parser.add_option(
        '-o', '--output-file', dest='output_file', default=None,
        help='output file name (defaults to '
        '<reduced dataset>_profile.root)')
    (options, args) = parser.parse_args(sys.argv[1:])
    
    if len(args) != 1: 
        sys.exit(parser.get_usage())

    reduced_dataset = args[0]

    if options.output_file is None: 
        profile_path = os.path.splitext(reduced_dataset)[0] + '_profile.root'
    else: 
        profile_path = options.output_file

    if not os.path.exists(profile_path): 
        profile.make_profile_file(reduced_dataset, 
                                  profile_file = profile_path)
    else: 
        sys.exit('{} exists, refusing to overwrite'.format(profile_path))

    profile_hists = profile.read_back_profile_file(profile_path)
    mean_rms_dict = profile.get_mean_rms_values(profile_hists)
    # -- we don't care about flavors here 
    flavor_tags = set(['light','charm','bottom'])
    # iterate over a snapshot of the keys, since we delete as we go
    for key in list(mean_rms_dict.keys()):
        keyparts_set = set(key.split('_'))
        overlap = keyparts_set & flavor_tags
        if overlap: 
            del mean_rms_dict[key]

    profile.write_mean_rms_textfile(mean_rms_dict)