Example #1
    def initialize(self,
                   instances,
                   parallel_init=False,
                   max_processes=None,
                   **kwargs):
        """Initialize instances with the feature set. This can be optionally
        used by instances to preprocess for efficiency.
        """
        num_instances = len(instances)

        if parallel_init:
            num_processes = multiprocessing.cpu_count() \
                    if max_processes is None \
                    else min(max_processes, multiprocessing.cpu_count())
            print "Using", num_processes, "processes for initialization"

            pool = multiprocessing.Pool(processes=num_processes)
            args = [(idx, kwargs) for idx in range(len(instances))]
            self.instances = instances

            for i, returned_args in enumerate(
                    pool.imap_unordered(self.per_instance_init, args)):
                idx, instance = returned_args
                self.instances[idx] = instance
                sys.stdout.write("Initializing " + str(num_instances) +
                                 " instances: " + str(i + 1) + '\r')
            print

            instances = self.instances
            delattr(self, 'instances')
        else:
            with timer.AvgTimer(num_instances):
                for i, instance in enumerate(instances):
                    instance.initialize(self.features, **kwargs)
                    sys.stdout.write("Initializing " + str(num_instances) +
                                     " instances: " + str(i + 1) + '\r')
                print
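
The parallel branch above follows a standard pattern: tag each work item with its index, feed the tagged items to multiprocessing.Pool.imap_unordered, and slot results back into place as they arrive while printing an in-place progress counter. Below is a minimal, self-contained sketch of that pattern in the same Python 2 style as these snippets; square_item and parallel_map_by_index are illustrative names, not part of the project.

import multiprocessing
import sys


def square_item(args):
    """Toy stand-in for per-instance initialization work."""
    idx, value = args
    return idx, value * value


def parallel_map_by_index(items, max_processes=None):
    # Cap the worker count at the number of available cores
    num_processes = multiprocessing.cpu_count() if max_processes is None \
            else min(max_processes, multiprocessing.cpu_count())
    pool = multiprocessing.Pool(processes=num_processes)

    # Tag each item with its index so unordered results can be re-slotted
    args = [(idx, item) for idx, item in enumerate(items)]
    results = list(items)
    for i, (idx, value) in enumerate(pool.imap_unordered(square_item, args)):
        results[idx] = value
        sys.stdout.write("Initializing %d items: %d\r" % (len(items), i + 1))
    print

    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    print parallel_map_by_index(range(8))
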
Example #2
    def train_single(self,
                     instances,
                     num_epochs=20,
                     num_relaxed_epochs=0,
                     **kwargs):
        """Trains a model with the perceptron-style algorithm following
        Collins (2002). Saves model state after every epoch.
        """
        # External models that are needed for instance initialization and/or
        # decoding may be too large to save. We assume that these models
        # will be supplied directly to train(); these are then added to the
        # saved parameters for initialization and decoding.
        kwargs.update(self.params)

        # Initialize instances with the feature set. Instances can optionally
        # use this step to preprocess for efficiency.
        self.initialize(instances, **kwargs)

        num_instances = len(instances)
        for n in range(self.current_epoch, num_epochs):
            random.shuffle(instances)

            with timer.AvgTimer(num_instances):
                for i, instance in enumerate(instances):
                    weight_update = self.decode_instance(
                        instance,
                        self.current_weights,
                        i,
                        epoch=n,
                        num_instances=num_instances,
                        relax=n < num_relaxed_epochs,
                        **kwargs)
                    self.update_weights(weight_update)
                print

            interrupt = self.finish_epoch(**kwargs)
            if interrupt:
                break
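
Stripped of the project-specific hooks (decode_instance, update_weights, finish_epoch), the loop above is the structured perceptron of Collins (2002): shuffle, decode each instance under the current weights, and add the difference between the gold and predicted feature vectors. A compact sketch under that reading; the decode callback and the checkpoint file naming are hypothetical stand-ins, not the project's API.

import random

import numpy as np


def perceptron_train(instances, num_features, decode, num_epochs=20):
    """Collins-style perceptron loop. `decode` is a caller-supplied function
    returning (gold_feature_vector, predicted_feature_vector) as NumPy arrays
    for an instance under the current weights."""
    weights = np.zeros(num_features)
    for epoch in range(num_epochs):
        random.shuffle(instances)
        for instance in instances:
            gold_vec, pred_vec = decode(instance, weights)
            # Standard perceptron update; a real trainer would also keep a
            # running sum of weights for averaging.
            weights += gold_vec - pred_vec
        # Checkpoint after every epoch, mirroring finish_epoch() above
        np.save('weights_epoch_%d.npy' % epoch, weights)
    return weights
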
Example #3
    def evaluate(self,
                 learner,
                 partition='test',
                 debug_idxs=None,
                 skip_idxs=(),
                 decoder='ilp',
                 n_eval=(1, 2, 3, 4),
                 streaming=True,
                 overwritten_params=(),
                 eval_path=None,
                 output_path=None,
                 lm_proxy=None,
                 **kwargs):
        """Run the transduction model on designated test instances and report
        performance metrics.
        """
        # When evaluating multiple iterations of the same model over a fixed
        # partition, decoding should ensure that initialization isn't
        # unnecessarily repeated.

        if partition == 'test' and kwargs.get('subcorpus') == 'final':
            print("FINAL")
            eval_instances = self.get_instances(partition=partition,
                                                debug_idxs=debug_idxs,
                                                skip_idxs=skip_idxs)
            system_name = learner.name

        elif learner is not None:
            eval_instances = self.decode_instances(learner,
                                                   partition=partition,
                                                   debug_idxs=debug_idxs,
                                                   skip_idxs=skip_idxs,
                                                   decoder=decoder,
                                                   streaming=streaming,
                                                   overwritten_params=\
                                                           overwritten_params,
                                                   **kwargs)
            system_name = learner.name
        else:
            eval_instances = self.get_instances(partition=partition,
                                                debug_idxs=debug_idxs,
                                                skip_idxs=skip_idxs)
            system_name = 'baseline'

        num_instances = len(eval_instances)

        # Record overwritten parameters in the filenames
        overwriting_str = None
        if len(overwritten_params) > 0:
            overwriting_str = '_OW-'
            i = 0
            for param_name, value in overwritten_params.iteritems():
                if isinstance(value, list) or isinstance(value, tuple):
                    overwriting_str += '+'.join(str(v) for v in sorted(value))
                else:
                    overwriting_str += str(value)
                i += 1
                if i < len(overwritten_params):
                    overwriting_str += '-'

        if output_path is not None:
            output_filename = ''.join(
                (output_path, '/', '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '', '_',
                 decoder, '.out'))
            outf = open(output_filename, 'wb')

        # Determine the evaluations to run by looking at a representative
        # instance
        i = 0
        while i < len(eval_instances) and \
                not hasattr(eval_instances[i], 'output_sent'):
            i += 1
        if i == len(eval_instances):
            print "WARNING: all instances failed; skipping evaluation"
            sys.exit()
        some_instance = eval_instances[i]
        has_labels = hasattr(some_instance, 'label_sentences')
        has_rasp = hasattr(some_instance.gold_sentences[0], 'relgraph')
        has_outtrees = hasattr(some_instance.output_sent, 'outtree')
        has_outframes = hasattr(some_instance.output_sent, 'outframes')

        # FIXME TEMPORARY! MUST MAKE "False" FOR TEST!
        skip_failed = False

        # Initialize the evaluations
        eval_obj = evaluation.Evaluation(title='TRANSDUCTION_EVAL')
        output_sents = []
        with timer.AvgTimer(num_instances):
            for i, instance in enumerate(eval_instances):
                sys.stdout.write("Evaluating " + str(num_instances) +
                                 (" " +
                                  partition if partition is not None else "") +
                                 " instances: " + str(i + 1) + '\r')

                # Duration and failure status
                eval_obj.include(
                        system=system_name,
                        corpus='other',
                        decode_time=instance.decode_times[-1],
                        solution_time=instance.solution_times[-1] \
                                if len(instance.solution_times) > 0 else 0,
                        inputs=len(instance.input_sents),
                        _failed=int(not hasattr(instance, 'output_sent')),
                        )

                if skip_failed and not hasattr(instance, 'output_sent'):
                    print "WARNING: Skipping failed instance", instance.idx
                    continue

                # POS tag recall
                for use_labels in set([False]) | set([has_labels]):
                    #for prefix in ('NN', 'VB', 'JJ', 'RB'):
                    #    p, r, f = instance.score_content_words(
                    #            use_labels=use_labels, prefixes=(prefix,))
                    #    eval_obj.add_metrics(
                    #            precision=p,
                    #            recall=r,
                    #            system=system_name,
                    #            corpus=('LBLs ' + prefix) if use_labels \
                    #                    else ('GOLD ' + prefix),
                    #            )
                    p, r, f = instance.score_content_words(
                        use_labels=use_labels, prefixes=('NN', 'VB'))
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus='LBLs NN+VB' if use_labels
                                    else 'GOLD NN+VB',
                            )

                try:
                    if lm_proxy is not None:
                        output_tokens = instance.output_sent.tokens \
                                if hasattr(instance, 'output_sent') else []
                        eval_obj.include(system=system_name,
                                         corpus='other',
                                         lm=lm_proxy.score_sent(output_tokens))
                except jsonrpc.RPCTransportError:
                    print "ERROR: JSON-RPC hiccups; skipping LM scoring"

                if decoder.startswith('dp+'):
                    # Record convergence of dual decomposition or
                    # bisection. Will be 0 if neither are used.
                    eval_obj.include(
                        system=system_name,
                        corpus='other',
                        convergence_=int(instance.converged),
                        iterations=instance.num_iterations,
                    )

                if len(instance.sentences) == 1:
                    # Paraphrasing or compression-specific metrics
                    eval_obj.include(
                        system=system_name,
                        corpus='STATS gold',
                        comp_=instance.get_gold_compression_rate(),
                        length=instance.avg_gold_len,
                        proj_=avg(
                            int(gold_sent.dparse.is_projective())
                            for gold_sent in instance.gold_sentences),
                        overlap_=avg(
                            instance.get_overlap(gold_sent)
                            for gold_sent in instance.gold_sentences),
                    )
                    eval_obj.include(
                        system=system_name,
                        corpus='STATS input',
                        comp_=1.0,
                        length=instance.avg_len,
                        proj_=int(
                            instance.sentences[0].dparse.is_projective()),
                        overlap_=instance.get_overlap(instance.sentences[0]))
                    eval_obj.include(
                        system=system_name,
                        corpus='STATS output',
                        comp_=instance.get_compression_rate(),
                        length=len(instance.output_sent.tokens) if hasattr(
                            instance, 'output_sent') else 0,
                    )
                    if hasattr(instance, 'output_sent') and has_outtrees:
                        eval_obj.include(
                                system=system_name,
                                corpus='STATS output',
                                proj_=int(instance.output_sent.\
                                          outtree.is_projective())
                                      if hasattr(instance.output_sent.outtree,\
                                                 'is_projective')
                                      else 0,
                                overlap_=instance.get_overlap(
                                    instance.output_sent,
                                    parse_type='outtree')
                                )

#                    print "INSTANCE ", instance.idx
#                    crossing_edges = \
#                        instance.output_sent.outtree.get_crossing_edges()
#                    print "\n\nINPUT:",
#                    self.dump_parse(instance.sentences[0])
#
#                    for gs, gold_sent in enumerate(
#                            instance.gold_sentences):
#                        # get output indices for gold
#                        gold_idxs = []
#                        i = 0
#                        for token in gold_sent.tokens:
#                            while instance.sentences[0].tokens[i] != token:
#                                i += 1
#                            gold_idxs.append((0,i))
#
#                        print "\nGOLD:", gs,
#                        self.dump_parse(gold_sent,
#                            idx_mapper=gold_idxs)
#
#                    print "\n\nOUTPUT:",
#                    self.dump_parse(instance.output_sent,
#                            parse_type='outtree',
#                            crossing_edges=crossing_edges,
#                            idx_mapper=instance.output_idxs)

                # n-gram precision and recall
                for use_labels in set([False]) | set([has_labels]):
                    for n in n_eval:
                        p, r, f = instance.score_ngrams(n=n,
                                                        use_labels=use_labels)
                        eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus=('LBLs n=' + str(n)) if use_labels
                                    else ('GOLD n=' + str(n)),
                        )
                if hasattr(instance, 'output_sent') and has_outframes:
                    # Precision and recall for frames
                    p, r, f = instance.score_frames(fes=False,
                                                    frames_type='outframes',
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus="GOLD frames",
                    )

                    # Precision and recall for frame elements
                    p, r, f = instance.score_frames(fes=True,
                                                    frames_type='outframes',
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus="GOLD fes",
                    )

                # Parse output sentences for syntactic evaluation. The
                # 100 token limit is intended for the Stanford parser.
                if hasattr(instance, 'output_sent') and \
                        len(instance.output_sent.tokens) <= 100:
                    output_sents.append(instance.output_sent)

                # Write the output to a file
                if output_path is not None:
                    outf.write(instance.get_display_string())
#            print
            if output_path is not None:
                outf.close()

            # Parse-based evaluations
            try:
                parse_types = ['dparse']
                if has_outtrees:
                    parse_types.append('outtree')

                # Get annotations. Only run RASP if the inputs have RASP
                # annotations since it's slow
                annotations.annotate(output_sents, 'Stanford')
                if has_rasp:
                    annotations.annotate(output_sents, 'Rasp')
                    parse_types.append('relgraph')

                # Add dependency results to evaluations
                for i, instance in enumerate(eval_instances):
                    if skip_failed and not hasattr(instance, 'output_sent'):
                        print "WARNING: Skipping failed instance",
                        print instance.idx, "again"
                        continue

                    for parse_type in parse_types:
                        for use_labels in set([False]) | set([has_labels]):
                            name = ('LBLs ' if use_labels else 'GOLD ') + \
                                parse_type
                            p, r, f = instance.score_dependencies(
                                parse_type=parse_type, use_labels=use_labels)
                            eval_obj.add_metrics(
                                precision=p,
                                recall=r,
                                system=system_name,
                                corpus=name,
                                _failed=int(not instance.has_output_parses(
                                    parse_type=parse_type)))
            except OSError:
                print "Skipping parser evaluations"

        print eval_obj.title
        print eval_obj.table(skip_single_keys=True)
        if eval_path is not None and debug_idxs is None:
            eval_filename = ''.join(
                (eval_path, '/', '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '', '_',
                 decoder, '.eval'))
            eval_obj.save(eval_filename, append=False)
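
Most metrics above come from helpers on the instance (score_content_words, score_ngrams, score_frames, score_dependencies) whose internals are not shown. As a rough guide to what the n-gram rows report, here is the usual n-gram precision/recall/F1 computation over token lists; it illustrates the standard definition and is not the project's implementation.

from collections import Counter


def ngram_prf(output_tokens, gold_tokens, n=2):
    """Precision, recall and F1 of output n-grams against gold n-grams."""
    output_ngrams = Counter(zip(*[output_tokens[i:] for i in range(n)]))
    gold_ngrams = Counter(zip(*[gold_tokens[i:] for i in range(n)]))
    overlap = sum((output_ngrams & gold_ngrams).values())
    p = overlap / float(sum(output_ngrams.values())) if output_ngrams else 0.0
    r = overlap / float(sum(gold_ngrams.values())) if gold_ngrams else 0.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    return p, r, f


# e.g. ngram_prf("the cat sat".split(), "the cat sat down".split(), n=2)
# -> (1.0, 0.666..., 0.8)
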
Example #4
    def train_master_minibatch(self,
                               instances,
                               machine_idx=0,
                               num_epochs=20,
                               num_relaxed_epochs=0,
                               master_oversees=False,
                               **kwargs):
        """Run the master process of a round-robin distributed learner.
        """
        # Sanity check
        assert machine_idx == 0

        # Runtime parameters that are not saved with the model
        kwargs.update(self.params)

        # Divide instances into minibatches
        num_instances = len(instances)
        minibatches = self.machine_init_minibatch(
            instances,
            machine_idx=machine_idx,
            master_oversees=master_oversees,
            **kwargs)

        # Initialize slave proxies
        all_machines = self.master_init_slaves(len(self.current_weights),
                                               master_oversees=master_oversees,
                                               **kwargs)

        pool = multiprocessing.Pool(processes=len(all_machines),
                                    maxtasksperchild=1)
        #        manager = multiprocessing.Manager()
        #        instance_queue = manager.Queue() if not master_oversees else None
        #        instance_queue = None

        for n in range(self.current_epoch, num_epochs):
            if n > 0:  # for comparing and debugging
                random.shuffle(minibatches)

            if not master_oversees:
                print "WARNING: minibatching will duplicate the heap",
                print "for the master."
                print "Consider running with --master_oversees instead."
                self.temp_instances = instances
                self.temp_params = kwargs

            with timer.AvgTimer(num_instances):
                for b, minibatch in enumerate(minibatches):

                    # Print current epoch and minibatch
                    sys.stdout.write("[Epoch " + str(n) + "] Batch " +
                                     str(b + 1) + "/" + str(len(minibatches)) +
                                     "\r")
                    #                        # Put instances assigned to the master in a queue
                    #                        for master_idx in minibatch[0]:
                    #                            instance_queue.put(instances[master_idx])

                    args = [
                        (n, num_relaxed_epochs, all_machines[m], instance_idxs)
                        #                            self.current_weights, self.learning_rate.value(),
                        #                            kwargs, instance_queue)
                        for m, instance_idxs in enumerate(minibatch)
                    ]

                    weight_update = np.zeros(len(self.current_weights))
                    num_minibatch_updates = 0
                    for returned_args in pool.imap_unordered(
                            self.minibatch_epoch, args):
                        if returned_args is None:
                            print "Failed to decode minibatch", b
                            continue

                        sum_weight_update, num_machine_updates = returned_args
                        if sum_weight_update is not None:
                            weight_update += sum_weight_update
                            num_minibatch_updates += num_machine_updates

                    if num_minibatch_updates > 0:
                        weight_update /= num_minibatch_updates
                    else:
                        weight_update = None
                    self.update_weights(weight_update)
                print

            if not master_oversees:
                del self.temp_instances
                del self.temp_params

            interrupt = self.finish_epoch(**kwargs)
            if interrupt:
                break
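
The inner loop reduces each minibatch to a single averaged update: every machine returns the sum of its perceptron updates together with how many it made, and the master divides the pooled sum by the pooled count (or skips the update entirely if nothing came back). The same aggregation step isolated as a standalone helper; the name aggregate_minibatch_updates is illustrative, not the project's API.

import numpy as np


def aggregate_minibatch_updates(machine_results, num_weights):
    """machine_results: iterable of (sum_weight_update, num_updates) pairs,
    or None for machines whose minibatch failed to decode."""
    pooled_update = np.zeros(num_weights)
    pooled_count = 0
    for result in machine_results:
        if result is None:
            continue
        sum_weight_update, num_updates = result
        if sum_weight_update is not None:
            pooled_update += sum_weight_update
            pooled_count += num_updates
    # Return the averaged update, or None when no machine produced one
    return pooled_update / pooled_count if pooled_count > 0 else None
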
Example #5
    def train_master_serial(self,
                            instances,
                            machine_idx=0,
                            num_epochs=20,
                            num_relaxed_epochs=0,
                            master_oversees=False,
                            **kwargs):
        """Run the master process of a round-robin distributed learner.
        """
        # Sanity check
        assert machine_idx == 0
        if master_oversees:
            print "WARNING: Master cannot oversee in serial mode"

        # Runtime parameters that are not saved with the model
        kwargs.update(self.params)

        # Map out splits for each slave and generate a mapper from instances
        # to slaves.
        spans_per_machine = self.machine_init_serial(instances,
                                                     machine_idx=machine_idx,
                                                     **kwargs)
        num_instances = len(instances)
        instance_mapper = [None] * num_instances
        for machine_idx, span in enumerate(spans_per_machine):
            for instance_idx in range(*span):
                instance_mapper[instance_idx] = machine_idx

        # Initialize slave proxies
        all_machines = self.master_init_slaves(len(self.current_weights),
                                               **kwargs)

        instance_idxs = range(num_instances)
        for n in range(self.current_epoch, num_epochs):
            random.shuffle(instance_idxs)

            with timer.AvgTimer(num_instances):
                prev_machine_idx = -1
                for i, instance_idx in enumerate(instance_idxs):
                    machine_idx = instance_mapper[instance_idx]

                    if machine_idx == 0:
                        instance = instances[instance_idx]
                        weight_update = self.decode_instance(
                            instance,
                            self.current_weights,
                            i,
                            epoch=n,
                            num_instances=num_instances,
                            relax=n < num_relaxed_epochs,
                            **kwargs)
                    else:
                        # If the last instance was decoded by the same slave
                        # we don't need to send the weights again since it
                        # already has them.
                        weight_update, num_updates = \
                                self.master_decode_instance(
                                all_machines[machine_idx],
                                [instance_idx],
                                self.current_weights,
                                self.learning_rate.value(),
                                relax=n < num_relaxed_epochs,
                                slave_has_weights=(
                                    machine_idx==prev_machine_idx),
                                slave_keeps_weights=True)

                    prev_machine_idx = machine_idx
                    self.update_weights(weight_update)
                print
            interrupt = self.finish_epoch(**kwargs)
            if interrupt:
                break
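
machine_init_serial is not shown here, but the code that follows only needs two things from it: a contiguous span of instance indices per machine and the instance-to-machine lookup built from those spans. A small sketch of how such spans and the mapper could be constructed under the assumption of a simple even split; the helper name is hypothetical.

def build_spans_and_mapper(num_instances, num_machines):
    """Split indices into contiguous, roughly even spans and build the
    instance-index -> machine-index lookup used by the serial master."""
    base = num_instances // num_machines
    spans = []
    for m in range(num_machines):
        begin = m * base
        end = num_instances if m == num_machines - 1 else (m + 1) * base
        spans.append((begin, end))

    mapper = [None] * num_instances
    for machine_idx, (begin, end) in enumerate(spans):
        for instance_idx in range(begin, end):
            mapper[instance_idx] = machine_idx
    return spans, mapper


# e.g. build_spans_and_mapper(10, 3)
# -> ([(0, 3), (3, 6), (6, 10)], [0, 0, 0, 1, 1, 1, 2, 2, 2, 2])
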
Example #6
    def train_multi(self,
                    instances,
                    num_epochs=20,
                    num_relaxed_epochs=0,
                    max_processes=None,
                    **kwargs):
        """Trains a model with the perceptron-style algorithm from
        Collins (2002). Attempts to speed up computation by utilizing
        multiple cores with iterative parameter mixing as described in
        McDonald et al. (2009). Saves model state after every epoch.
        """
        # External models that are needed for instance initialization may
        # be too large to save. We assume that these models will be supplied
        # directly to train(); these are then added to the saved parameters
        # for initialization.
        kwargs.update(self.params)
        # TODO: These arguments are NOT passed to decoding when using
        # multiple processes to avoid a deep copy of the models for each
        # process. Shared memory across processes can be implemented using
        # Value, Array or, more generally, shared ctypes. This is currently
        # not implemented. If needed, use parallel_decoding=False.
        self.initialize(instances, **kwargs)

        # Determine number of parallel processes to use
        num_processes = multiprocessing.cpu_count() if max_processes is None \
                else min(max_processes, multiprocessing.cpu_count())

        # The size of each parallel shard is taken as the floor of the number
        # of instances per process in order to prevent biases in the
        # parameter mixing
        num_instances = len(instances)
        num_shards = num_processes
        shard_size = num_instances // num_shards
        shard_spans = [[i * shard_size, (i + 1) * shard_size]
                       for i in range(num_shards)]
        shard_spans[-1][1] = num_instances

        for n in range(self.current_epoch, num_epochs):
            # Randomly shuffle training instances and break them into shards
            # using the indices created earlier
            random.shuffle(instances)
            shards = [instances[begin:end] for begin, end in shard_spans]

            # Pack shard indices and current weight vector together because
            # Pool doesn't support multiple function arguments for parallel
            # maps. Note that local keyword arguments that are not in
            # self.params will not be available for decoding.
            args = [(n, num_relaxed_epochs, shard, self.current_weights)
                    for shard in shards]

            # Initialize a process pool to run a single epoch on
            # each shard asychronously
            pool = multiprocessing.Pool(processes=num_processes)
            next_w = np.zeros(len(self.features))
            with timer.AvgTimer(num_instances):
                for returned_args in pool.imap_unordered(
                        self.per_shard_epoch, args):
                    # Unpack returned values
                    w, sum_w, num_shard_updates = returned_args

                    # The final weight vector from the shard, weighted by the
                    # shard's share of updates over all instances, contributes
                    # to the input weight vector for the next iteration
                    normalizer = float(num_shard_updates) / len(instances)
                    next_w += (np.array(w) * normalizer)

                    # Update the running sum of shard weights
                    # TODO: move perceptron-specific update code to subclass
                    self.sum_weights += sum_w
                    self.num_updates_seen += num_shard_updates
                print

            # Terminate pool (processes should already be finished)
            pool.terminate()

            # Set weight vector for next parallel epoch
            self.current_weights = next_w[:]
            interrupt = self.finish_epoch(**kwargs)
            if interrupt:
                break
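
The mixing step inside the epoch loop builds the starting weights for the next epoch by weighting each shard's final vector by its share of updates over all instances; this is the parameter-mixing idea the docstring refers to. The same step pulled out as a standalone helper for clarity; the name and signature are illustrative.

import numpy as np


def mix_shard_weights(shard_results, total_instances, num_weights):
    """shard_results: iterable of (final_weights, num_shard_updates) pairs
    returned by the per-shard epochs."""
    mixed = np.zeros(num_weights)
    for shard_weights, num_shard_updates in shard_results:
        # Weight each shard's vector by its share of updates over all
        # instances; the shares across shards sum to at most one.
        mixed += np.array(shard_weights) * \
                (float(num_shard_updates) / total_instances)
    return mixed
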