def initialize(self, instances, parallel_init=False, max_processes=None,
               **kwargs):
    """Initialize instances with the feature set. This can be optionally
    used by instances to preprocess for efficiency.
    """
    num_instances = len(instances)
    if parallel_init:
        num_processes = multiprocessing.cpu_count() \
                if max_processes is None \
                else min(max_processes, multiprocessing.cpu_count())
        print "Using", num_processes, "processes for initialization"
        pool = multiprocessing.Pool(processes=num_processes)

        args = [(idx, kwargs) for idx in range(len(instances))]
        self.instances = instances
        for i, returned_args in enumerate(
                pool.imap_unordered(self.per_instance_init, args)):
            idx, instance = returned_args
            self.instances[idx] = instance
            sys.stdout.write("Initializing " + str(num_instances) +
                             " instances: " + str(i + 1) + '\r')
        print

        instances = self.instances
        delattr(self, 'instances')
    else:
        with timer.AvgTimer(num_instances):
            for i, instance in enumerate(instances):
                instance.initialize(self.features, **kwargs)
                sys.stdout.write("Initializing " + str(num_instances) +
                                 " instances: " + str(i + 1) + '\r')
        print

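# Illustrative sketch (not from the original code base): initialize() with
# parallel_init=True expects per_instance_init() to unpack a single
# (idx, kwargs) task and hand back (idx, instance), since
# Pool.imap_unordered() passes one argument per task and may yield results
# out of order; the returned index lets the parent store each result in
# place. The hypothetical function below demonstrates only that contract
# with stand-in data; the _example_ name and the dict "instance" are
# assumptions for illustration.
def _example_per_instance_init(args):
    """Unpack an (idx, kwargs) task and return (idx, initialized_item)."""
    idx, init_kwargs = args
    # Stand-in for instance.initialize(features, **init_kwargs)
    initialized_item = {'id': idx, 'opts': dict(init_kwargs)}
    return idx, initialized_item
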
def train_single(self, instances, num_epochs=20, num_relaxed_epochs=0,
                 **kwargs):
    """Trains a model with the perceptron-style algorithm following
    Collins (2002). Saves model state after every epoch.
    """
    # External models that are needed for instance initialization and/or
    # decoding may be too large to save. We assume that these models
    # will be supplied directly to train(); these are then added to the
    # saved parameters for initialization and decoding.
    kwargs.update(self.params)

    # Initialize instances with the feature set. This can be optionally
    # used by instances to preprocess for efficiency.
    self.initialize(instances, **kwargs)

    num_instances = len(instances)
    for n in range(self.current_epoch, num_epochs):
        random.shuffle(instances)
        with timer.AvgTimer(num_instances):
            for i, instance in enumerate(instances):
                weight_update = self.decode_instance(
                        instance,
                        self.current_weights,
                        i,
                        epoch=n,
                        num_instances=num_instances,
                        relax=n < num_relaxed_epochs,
                        **kwargs)
                self.update_weights(weight_update)
        print

        interrupt = self.finish_epoch(**kwargs)
        if interrupt:
            break

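# Illustrative sketch (not from the original code base): train_single()
# assumes that decode_instance() returns a perceptron-style weight update
# in the spirit of Collins (2002): decode under the current weights, then
# move the weights toward the gold feature vector and away from the
# predicted one. The hypothetical helper below shows that update rule in
# isolation; the argument names are assumptions for illustration.
def _example_perceptron_update(gold_features, predicted_features,
                               learning_rate=1.0):
    """Return the update learning_rate * (phi(gold) - phi(predicted)).

    Both feature vectors are assumed to be 1-D arrays of equal length;
    the update is the zero vector when the prediction matches the gold.
    """
    import numpy as np
    return learning_rate * (np.asarray(gold_features, dtype=float) -
                            np.asarray(predicted_features, dtype=float))
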
def evaluate(self, learner, partition='test', debug_idxs=None,
             skip_idxs=(), decoder='ilp', n_eval=(1, 2, 3, 4),
             streaming=True, overwritten_params=(), eval_path=None,
             output_path=None, lm_proxy=None, **kwargs):
    """Run the transduction model on designated test instances and
    report performance metrics.
    """
    # When evaluating multiple iterations of the same model over a fixed
    # partition, decoding should ensure that initialization isn't
    # unnecessarily repeated.
    if partition == 'test' and kwargs.get('subcorpus') == 'final':
        print("FINAL")
        eval_instances = self.get_instances(partition=partition,
                                            debug_idxs=debug_idxs,
                                            skip_idxs=skip_idxs)
        system_name = learner.name
    elif learner is not None:
        eval_instances = self.decode_instances(
                learner,
                partition=partition,
                debug_idxs=debug_idxs,
                skip_idxs=skip_idxs,
                decoder=decoder,
                streaming=streaming,
                overwritten_params=overwritten_params,
                **kwargs)
        system_name = learner.name
    else:
        eval_instances = self.get_instances(partition=partition,
                                            debug_idxs=debug_idxs,
                                            skip_idxs=skip_idxs)
        system_name = 'baseline'
    num_instances = len(eval_instances)

    # Record overwritten parameters in the filenames
    overwriting_str = None
    if len(overwritten_params) > 0:
        overwriting_str = '_OW-'
        i = 0
        for param_name, value in overwritten_params.iteritems():
            if isinstance(value, (list, tuple)):
                overwriting_str += '+'.join(str(v) for v in sorted(value))
            else:
                overwriting_str += str(value)
            i += 1
            if i < len(overwritten_params):
                overwriting_str += '-'

    if output_path is not None:
        output_filename = ''.join(
                (output_path, '/',
                 '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '',
                 '_', decoder, '.out'))
        outf = open(output_filename, 'wb')

    # Determine the evaluations to run by looking at a representative
    # instance
    i = 0
    while i < len(eval_instances) and \
            not hasattr(eval_instances[i], 'output_sent'):
        i += 1
    if i == len(eval_instances):
        print "WARNING: all instances failed; skipping evaluation"
        sys.exit()
    some_instance = eval_instances[i]
    has_labels = hasattr(some_instance, 'label_sentences')
    has_rasp = hasattr(some_instance.gold_sentences[0], 'relgraph')
    has_outtrees = hasattr(some_instance.output_sent, 'outtree')
    has_outframes = hasattr(some_instance.output_sent, 'outframes')

    # FIXME TEMPORARY! MUST MAKE "False" FOR TEST!
    skip_failed = False

    # Initialize the evaluations
    eval_obj = evaluation.Evaluation(title='TRANSDUCTION_EVAL')
    output_sents = []
    with timer.AvgTimer(num_instances):
        for i, instance in enumerate(eval_instances):
            sys.stdout.write("Evaluating " + str(num_instances) +
                             (" " + partition
                              if partition is not None else "") +
                             " instances: " + str(i + 1) + '\r')

            # Duration and failure status
            eval_obj.include(
                    system=system_name,
                    corpus='other',
                    decode_time=instance.decode_times[-1],
                    solution_time=instance.solution_times[-1]
                        if len(instance.solution_times) > 0 else 0,
                    inputs=len(instance.input_sents),
                    _failed=int(not hasattr(instance, 'output_sent')),
                    )

            if skip_failed and not hasattr(instance, 'output_sent'):
                print "WARNING: Skipping failed instance", instance.idx
                continue

            # POS tag recall
            for use_labels in set([False]) | set([has_labels]):
                # for prefix in ('NN', 'VB', 'JJ', 'RB'):
                #     p, r, f = instance.score_content_words(
                #             use_labels=use_labels, prefixes=(prefix,))
                #     eval_obj.add_metrics(
                #             precision=p,
                #             recall=r,
                #             system=system_name,
                #             corpus=('LBLs ' + prefix) if use_labels
                #                 else ('GOLD ' + prefix),
                #             )
                p, r, f = instance.score_content_words(
                        use_labels=use_labels, prefixes=('NN', 'VB'))
                eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus=('LBLs ' + 'NN+VB') if use_labels
                            else ('GOLD ' + 'NN+VB'),
                        )

            try:
                if lm_proxy is not None:
                    output_tokens = instance.output_sent.tokens \
                            if hasattr(instance, 'output_sent') else []
                    eval_obj.include(system=system_name,
                                     corpus='other',
                                     lm=lm_proxy.score_sent(output_tokens))
            except jsonrpc.RPCTransportError:
                print "ERROR: JSON-RPC hiccups; skipping LM scoring"

            if decoder.startswith('dp+'):
                # Record convergence of dual decomposition or
                # bisection. Will be 0 if neither are used.
                eval_obj.include(
                        system=system_name,
                        corpus='other',
                        convergence_=int(instance.converged),
                        iterations=instance.num_iterations,
                        )

            if len(instance.sentences) == 1:
                # Paraphrasing or compression-specific metrics
                eval_obj.include(
                        system=system_name,
                        corpus='STATS gold',
                        comp_=instance.get_gold_compression_rate(),
                        length=instance.avg_gold_len,
                        proj_=avg(
                            int(gold_sent.dparse.is_projective())
                            for gold_sent in instance.gold_sentences),
                        overlap_=avg(
                            instance.get_overlap(gold_sent)
                            for gold_sent in instance.gold_sentences),
                        )
                eval_obj.include(
                        system=system_name,
                        corpus='STATS input',
                        comp_=1.0,
                        length=instance.avg_len,
                        proj_=int(
                            instance.sentences[0].dparse.is_projective()),
                        overlap_=instance.get_overlap(
                            instance.sentences[0]))
                eval_obj.include(
                        system=system_name,
                        corpus='STATS output',
                        comp_=instance.get_compression_rate(),
                        length=len(instance.output_sent.tokens)
                            if hasattr(instance, 'output_sent') else 0,
                        )
                if hasattr(instance, 'output_sent') and has_outtrees:
                    eval_obj.include(
                            system=system_name,
                            corpus='STATS output',
                            proj_=int(instance.output_sent.outtree
                                      .is_projective())
                                if hasattr(instance.output_sent.outtree,
                                           'is_projective') else 0,
                            overlap_=instance.get_overlap(
                                instance.output_sent,
                                parse_type='outtree')
                            )

                    # print "INSTANCE ", instance.idx
                    # crossing_edges = \
                    #     instance.output_sent.outtree.get_crossing_edges()
                    # print "\n\nINPUT:",
                    # self.dump_parse(instance.sentences[0])
                    #
                    # for gs, gold_sent in enumerate(
                    #         instance.gold_sentences):
                    #     # get output indices for gold
                    #     gold_idxs = []
                    #     i = 0
                    #     for token in gold_sent.tokens:
                    #         while instance.sentences[0].tokens[i] != token:
                    #             i += 1
                    #         gold_idxs.append((0, i))
                    #
                    #     print "\nGOLD:", gs,
                    #     self.dump_parse(gold_sent,
                    #                     idx_mapper=gold_idxs)
                    #
                    # print "\n\nOUTPUT:",
                    # self.dump_parse(instance.output_sent,
                    #                 parse_type='outtree',
                    #                 crossing_edges=crossing_edges,
                    #                 idx_mapper=instance.output_idxs)

            # n-gram precision and recall
            for use_labels in set([False]) | set([has_labels]):
                for n in n_eval:
                    p, r, f = instance.score_ngrams(n=n,
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus='LBLs n=' + str(n) if use_labels
                                else 'GOLD n=' + str(n),
                            )

            if hasattr(instance, 'output_sent') and has_outframes:
                # Precision and recall for frames
                p, r, f = instance.score_frames(fes=False,
                                                frames_type='outframes',
                                                use_labels=use_labels)
                eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus="GOLD frames",
                        )

                # Precision and recall for frame elements
                p, r, f = instance.score_frames(fes=True,
                                                frames_type='outframes',
                                                use_labels=use_labels)
                eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus="GOLD fes",
                        )

            # Parse output sentences for syntactic evaluation. The
            # 100-token limit is intended for the Stanford parser.
            if hasattr(instance, 'output_sent') and \
                    len(instance.output_sent.tokens) <= 100:
                output_sents.append(instance.output_sent)

            # Write the output to a file
            if output_path is not None:
                outf.write(instance.get_display_string())
            # print

    if output_path is not None:
        outf.close()

    # Parse-based evaluations
    try:
        parse_types = ['dparse']
        if has_outtrees:
            parse_types.append('outtree')

        # Get annotations. Only run RASP if the inputs have RASP
        # annotations since it's slow
        annotations.annotate(output_sents, 'Stanford')
        if has_rasp:
            annotations.annotate(output_sents, 'Rasp')
            parse_types.append('relgraph')

        # Add dependency results to evaluations
        for i, instance in enumerate(eval_instances):
            if skip_failed and not hasattr(instance, 'output_sent'):
                print "WARNING: Skipping failed instance",
                print instance.idx, "again"
                continue
            for parse_type in parse_types:
                for use_labels in set([False]) | set([has_labels]):
                    name = ('LBLs ' if use_labels else 'GOLD ') + \
                            parse_type
                    p, r, f = instance.score_dependencies(
                            parse_type=parse_type,
                            use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus=name,
                            _failed=int(not instance.has_output_parses(
                                parse_type=parse_type)))
    except OSError:
        print "Skipping parser evaluations"

    print eval_obj.title
    print eval_obj.table(skip_single_keys=True)

    if eval_path is not None and debug_idxs is None:
        eval_filename = ''.join(
                (eval_path, '/',
                 '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '',
                 '_', decoder, '.eval'))
        eval_obj.save(eval_filename, append=False)

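# Illustrative sketch (not from the original code base): the
# score_ngrams() calls in evaluate() are assumed to return precision,
# recall and F-measure of the output sentence's n-grams against a gold
# sentence. The hypothetical helper below computes those three values for
# one output/reference pair; it illustrates the metric rather than the
# project's own implementation.
def _example_ngram_prf(output_tokens, gold_tokens, n=2):
    """Return (precision, recall, f1) over n-grams of two token lists."""
    from collections import Counter

    out_ngrams = Counter(tuple(output_tokens[i:i + n])
                         for i in range(len(output_tokens) - n + 1))
    gold_ngrams = Counter(tuple(gold_tokens[i:i + n])
                          for i in range(len(gold_tokens) - n + 1))
    matched = sum((out_ngrams & gold_ngrams).values())
    p = matched / float(sum(out_ngrams.values())) if out_ngrams else 0.0
    r = matched / float(sum(gold_ngrams.values())) if gold_ngrams else 0.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    return p, r, f
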
def train_master_minibatch(self, instances, machine_idx=0, num_epochs=20,
                           num_relaxed_epochs=0, master_oversees=False,
                           **kwargs):
    """Run the master process of a round-robin distributed learner.
    """
    # Sanity check
    assert machine_idx == 0

    # Runtime parameters that are not saved with the model
    kwargs.update(self.params)

    # Divide instances into minibatches
    num_instances = len(instances)
    minibatches = self.machine_init_minibatch(
            instances,
            machine_idx=machine_idx,
            master_oversees=master_oversees,
            **kwargs)

    # Initialize slave proxies
    all_machines = self.master_init_slaves(len(self.current_weights),
                                           master_oversees=master_oversees,
                                           **kwargs)
    pool = multiprocessing.Pool(processes=len(all_machines),
                                maxtasksperchild=1)

    # manager = multiprocessing.Manager()
    # instance_queue = manager.Queue() if not master_oversees else None
    # instance_queue = None

    for n in range(self.current_epoch, num_epochs):
        if n > 0:
            # for comparing and debugging
            random.shuffle(minibatches)

        if not master_oversees:
            print "WARNING: minibatching will duplicate the heap",
            print "for the master."
            print "Consider running with --master_oversees instead."
            self.temp_instances = instances
            self.temp_params = kwargs

        with timer.AvgTimer(num_instances):
            for b, minibatch in enumerate(minibatches):
                # Print current epoch and minibatch
                sys.stdout.write("[Epoch " + str(n) + "] Batch " +
                                 str(b + 1) + "/" +
                                 str(len(minibatches)) + "\r")

                # # Put instances assigned to the master in a queue
                # for master_idx in minibatch[0]:
                #     instance_queue.put(instances[master_idx])

                args = [(n, num_relaxed_epochs, all_machines[m],
                         instance_idxs)
                        # self.current_weights, self.learning_rate.value(),
                        # kwargs, instance_queue)
                        for m, instance_idxs in enumerate(minibatch)]

                weight_update = np.zeros(len(self.current_weights))
                num_minibatch_updates = 0
                for returned_args in pool.imap_unordered(
                        self.minibatch_epoch, args):
                    if returned_args is None:
                        print "Failed to decode minibatch", b
                        continue
                    sum_weight_update, num_machine_updates = returned_args
                    if sum_weight_update is not None:
                        weight_update += sum_weight_update
                        num_minibatch_updates += num_machine_updates

                if num_minibatch_updates > 0:
                    weight_update /= num_minibatch_updates
                else:
                    weight_update = None
                self.update_weights(weight_update)
        print

        if not master_oversees:
            del self.temp_instances
            del self.temp_params

        interrupt = self.finish_epoch(**kwargs)
        if interrupt:
            break

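# Illustrative sketch (not from the original code base): inside each
# minibatch above, the per-machine results (sum_weight_update,
# num_machine_updates) are pooled and the summed update is averaged over
# the total number of updates before being applied. The hypothetical
# helper below reproduces that aggregation step in isolation; the argument
# names are assumptions for illustration.
def _example_aggregate_minibatch_updates(machine_results, weight_dim):
    """Average the summed weight updates returned for one minibatch.

    `machine_results` is an iterable of (sum_weight_update, num_updates)
    pairs, where sum_weight_update is a 1-D array of length `weight_dim`
    or None. Returns the averaged update, or None if nothing was updated.
    """
    import numpy as np

    weight_update = np.zeros(weight_dim)
    num_minibatch_updates = 0
    for sum_weight_update, num_updates in machine_results:
        if sum_weight_update is not None:
            weight_update += sum_weight_update
            num_minibatch_updates += num_updates
    if num_minibatch_updates == 0:
        return None
    return weight_update / num_minibatch_updates
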
def train_master_serial(self, instances, machine_idx=0, num_epochs=20,
                        num_relaxed_epochs=0, master_oversees=False,
                        **kwargs):
    """Run the master process of a round-robin distributed learner,
    decoding one instance at a time in serial mode.
    """
    # Sanity check
    assert machine_idx == 0
    if master_oversees:
        print "WARNING: Master cannot oversee in serial mode"

    # Runtime parameters that are not saved with the model
    kwargs.update(self.params)

    # Map out splits for each slave and generate a mapper from instances
    # to slaves.
    spans_per_machine = self.machine_init_serial(instances,
                                                 machine_idx=machine_idx,
                                                 **kwargs)
    num_instances = len(instances)
    instance_mapper = [None] * num_instances
    for machine_idx, span in enumerate(spans_per_machine):
        for instance_idx in range(*span):
            instance_mapper[instance_idx] = machine_idx

    # Initialize slave proxies
    all_machines = self.master_init_slaves(len(self.current_weights),
                                           **kwargs)

    instance_idxs = range(num_instances)
    for n in range(self.current_epoch, num_epochs):
        random.shuffle(instance_idxs)
        with timer.AvgTimer(num_instances):
            prev_machine_idx = -1
            for i, instance_idx in enumerate(instance_idxs):
                machine_idx = instance_mapper[instance_idx]
                if machine_idx == 0:
                    instance = instances[instance_idx]
                    weight_update = self.decode_instance(
                            instance,
                            self.current_weights,
                            i,
                            epoch=n,
                            num_instances=num_instances,
                            relax=n < num_relaxed_epochs,
                            **kwargs)
                else:
                    # If the last instance was decoded by the same slave,
                    # we don't need to send the weights again since it
                    # already has them.
                    weight_update, num_updates = \
                            self.master_decode_instance(
                                all_machines[machine_idx],
                                [instance_idx],
                                self.current_weights,
                                self.learning_rate.value(),
                                relax=n < num_relaxed_epochs,
                                slave_has_weights=(
                                    machine_idx == prev_machine_idx),
                                slave_keeps_weights=True)
                prev_machine_idx = machine_idx
                self.update_weights(weight_update)
        print

        interrupt = self.finish_epoch(**kwargs)
        if interrupt:
            break

def train_multi(self, instances, num_epochs=20, num_relaxed_epochs=0,
                max_processes=None, **kwargs):
    """Trains a model with the perceptron-style algorithm from Collins
    (2002). Attempts to speed up computation by utilizing multiple cores
    with iterative parameter mixing as described in McDonald et al.
    (2009). Saves model state after every epoch.
    """
    # External models that are needed for instance initialization may
    # be too large to save. We assume that these models will be supplied
    # directly to train(); these are then added to the saved parameters
    # for initialization.
    kwargs.update(self.params)

    # TODO: These arguments are NOT passed to decoding when using
    # multiple processes to avoid a deep copy of the models for each
    # process. Shared memory across processes can be implemented using
    # Value, Array or, more generally, shared ctypes. This is currently
    # not implemented. If needed, use parallel_decoding=False.
    self.initialize(instances, **kwargs)

    # Determine number of parallel processes to use
    num_processes = multiprocessing.cpu_count() if max_processes is None \
            else min(max_processes, multiprocessing.cpu_count())

    # The size of each parallel shard is taken as the floor of the number
    # of instances per process in order to prevent biases in the
    # parameter mixing
    num_instances = len(instances)
    num_shards = num_processes
    shard_size = num_instances // num_shards  # integer size for slicing
    shard_spans = [[i * shard_size, (i + 1) * shard_size]
                   for i in range(num_shards)]
    shard_spans[-1][1] = num_instances

    for n in range(self.current_epoch, num_epochs):
        # Randomly shuffle training instances and break them into shards
        # using the indices created earlier
        random.shuffle(instances)
        shards = [instances[begin:end] for begin, end in shard_spans]

        # Pack each shard and the current weight vector together because
        # Pool doesn't support multiple function arguments for parallel
        # maps. Note that local keyword arguments that are not in
        # self.params will not be available for decoding.
        args = [(n, num_relaxed_epochs, shard, self.current_weights)
                for shard in shards]

        # Initialize a process pool to run a single epoch on each shard
        # asynchronously
        pool = multiprocessing.Pool(processes=num_processes)

        next_w = np.zeros(len(self.features))
        with timer.AvgTimer(num_instances):
            for returned_args in pool.imap_unordered(
                    self.per_shard_epoch, args):
                # Unpack returned values
                w, sum_w, num_shard_updates = returned_args

                # The final weight vector from the shard contributes to
                # the input weight vector for the next iteration in
                # proportion to the shard's share of the total updates
                normalizer = num_shard_updates / float(len(instances))
                next_w += (np.array(w) * normalizer)

                # Update the running sum of shard weights
                # TODO: move perceptron-specific update code to subclass
                self.sum_weights += sum_w
                self.num_updates_seen += num_shard_updates
        print

        # Terminate pool (processes should already be finished)
        pool.terminate()

        # Set weight vector for next parallel epoch
        self.current_weights = next_w[:]

        interrupt = self.finish_epoch(**kwargs)
        if interrupt:
            break

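# Illustrative sketch (not from the original code base): train_multi()
# mixes the per-shard weight vectors into the next epoch's weights, with
# each shard weighted by its share of the total perceptron updates. The
# hypothetical helper below isolates that mixing step; the argument names
# are assumptions for illustration.
def _example_mix_shard_weights(shard_results, num_instances, weight_dim):
    """Combine per-shard weights by weighted iterative parameter mixing.

    `shard_results` is an iterable of (final_weights, num_shard_updates)
    pairs. Each shard contributes its final weight vector scaled by
    num_shard_updates / num_instances, mirroring the normalization used
    in train_multi() above.
    """
    import numpy as np

    next_w = np.zeros(weight_dim)
    for w, num_shard_updates in shard_results:
        next_w += np.asarray(w, dtype=float) * \
                (num_shard_updates / float(num_instances))
    return next_w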