示例#1
0
def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher, quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    The PTB derivations read from _ptb_file_ are matched against the CCGbank derivations
    read from _ccg_file_. For every quote span found in a PTB tree, _quoter_ attaches
    quotes to the corresponding CCGbank derivation, and the dependencies read from
    _deps_file_ are adjusted for the inserted quotes. For each (PTB, CCGbank, deps)
    triple, the CCGbank bundle is written to _ccg_auto_out_ and the dependencies to
    _ccg_parg_out_.

    _higher_ and _quotes_ are handed unchanged to quoter.attach_quotes.
    '''
    # Use open() rather than the file() constructor: open() is the documented way to
    # open a file, and file() does not exist outside Python 2.
    # Note that both output files are created before any input is read.
    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            penn_trees = list(PTBReader(ptb_file))
            ccg_trees  = list(CCGbankReader(ccg_file))
            deps       = list(CCGbankDepsReader(deps_file))

            matched_penn_trees = match_trees(penn_trees, ccg_trees)

            # zip stops at the shortest of its arguments, so surplus entries in any of
            # the three sequences are silently dropped
            for (ptb_bundle, ccg_bundle, dep) in zip(matched_penn_trees, ccg_trees, deps):
                ptb_tree, ccg_tree = ptb_bundle.derivation, ccg_bundle.derivation

                quote_spans = spans(ptb_tree)
                # The remaining spans are replaced after every insertion (see below), so
                # they are consumed one at a time rather than iterated over directly
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # Skip spans that have neither a start nor an end index
                    if span_start is None and span_end is None: continue

                    info("Reinstating quotes to %s (%s, %s)", ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(ccg_tree, span_start, span_end, quote_type, higher, quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out,  ccg_bundle
示例#2
0
 def __iter__(self):
     '''Yields one Derivation for each aligned pair of bundles from the left and right readers.

     The reader over self.leftdir and the reader over self.rightdir are advanced in
     lockstep; iteration ends as soon as either of them is exhausted.
     '''
     lefts = self.reader(self.leftdir)
     rights = self.reader(self.rightdir)

     for left_bundle, right_bundle in izip(lefts, rights):
         info("Processing %s/%s", left_bundle.label(), right_bundle.label())
         paired = Derivation(left_bundle, right_bundle)
         yield paired

         # Drop our references to this pair before the next one is fetched
         del paired, left_bundle, right_bundle
示例#3
0
    def run_filters(self, filters, files):
        # Runs every filter in _filters_ over each derivation bundle read from _files_.
        #
        # For each bundle, every filter has its context set to the bundle, is offered the
        # leaves of the derivation (accept_leaf, and accept_comb_and_slash_index where
        # defined), and is finally given the whole bundle through accept_derivation.
        # Exceptions raised while filtering a bundle are collected in self.last_exceptions
        # and, if self._break_on_exception is set, re-raised wrapped in FilterException.
        #
        # NOTE(review): the handlers belonging to the outer `try` below are not visible in
        # this excerpt, so how errors from the reader itself are treated cannot be stated here.

        # If all given filters were not found or had wrong argument count, do nothing
        if not filters: return
        
        # Extra keyword arguments for the meta reader; only populated when a reader
        # class has been named explicitly
        reader_args = {}
        if self.reader_class_name:
            try:
                # The reader class is looked up by name among this module's globals
                reader_class = globals()[self.reader_class_name]
                info("Using reader class %s.", self.reader_class_name)
                
                reader_args['reader_class'] = reader_class
            except KeyError:
                raise RuntimeError("Reader class %s not found." % self.reader_class_name)
        
        # NOTE(review): the loop variables `file` and `filter` shadow the builtins of the
        # same name for the remainder of this method
        for file in self.transform(files):
            if self.is_pair_spec(file):
                meta_reader = PairedReader
            else:
                meta_reader = DirFileGuessReader
                
            try:
                # Reset for every file spec, so after the loop this list only holds the
                # exceptions raised while handling the most recent one
                self.last_exceptions = []
                
                for derivation_bundle in meta_reader(file, verbose=self.verbose, **reader_args):
                    if self.verbose: info("Processing %s...", derivation_bundle.label())
                    try:
                        for filter in filters:
                            filter.context = derivation_bundle

                        # NOTE(review): `filter` here is whatever the loop above left bound,
                        # i.e. the last element of _filters_, so only that filter's accept_leaf
                        # is inspected although every filter's accept_leaf is called below --
                        # confirm whether a per-filter check was intended
                        if filter.accept_leaf is not None:
                            for leaf in leaves(derivation_bundle.derivation):
                                for filter in filters:
                                    filter.accept_leaf(leaf)

                                    if filter.accept_comb_and_slash_index is not None:
                                        try:
                                            for slash_index, comb in enumerate(applications_per_slash(leaf)):
                                                filter.accept_comb_and_slash_index(leaf, comb, slash_index)
                                        except AttributeError: # TODO: hacky and inefficient, need this to work for PTB too
                                            pass

                        # Hand over the complete bundle, then detach the filter from it
                        for filter in filters:
                            filter.accept_derivation(derivation_bundle)
                            filter.context = None
                            
                    except IOError, e:
                        # If output is going to a pager, and the user requests an interrupt (^C)
                        # the filter fails with IOError: Broken pipe
                        # In that case, running filters on further derivations will continue to
                        # lead to 'Broken pipe', so just bail out
                        # NOTE(review): an IOError with any other errno falls out of this handler
                        # without being recorded in last_exceptions -- confirm this is intended
                        if e.errno == errno.EPIPE: return
                            
                    except Exception, e:
                        # Remember which bundle failed, together with the full exception info
                        self.last_exceptions.append( (derivation_bundle, sys.exc_info()) )
                        
                        if self._break_on_exception:
                            raise FilterException(e, None)
示例#4
0
def main(argv):
    '''Command-line entry point: reinstates quotes for every requested PTB document.

    Parses _argv_, builds the quoter selected by the quote_method option (given the
    punctuation handler selected by the punct_method option, or None), and then calls
    process() for each Penn Treebank file matching the requested section/document
    specifiers, pairing it with the CCGbank AUTO and PARG files of the same section and
    document. Output goes under the AUTO and PARG subdirectories of the output directory.

    Prints the help text and exits with status 1 if required options are missing.
    '''
    option_parser = OptionParser()
    register_builtin_switches(option_parser)
    opts, args = option_parser.parse_args(argv)

    if not all_required_args_present(opts):
        option_parser.print_help()
        sys.exit(1)

    quoter_class = {'span': SpanQuoter, 'lca': LCAQuoter}[opts.quote_method]
    punct_class = {'swap': SwapComma, 'shift': ShiftComma}.get(opts.punct_method)
    quoter = quoter_class(punct_class)

    # If no sec/doc specifiers are given, assume 'all sections all documents'
    specifiers = args[1:] or [':']

    for sec_glob, doc_glob in parse_requested_derivs(specifiers):
        ptb_pattern = os.path.join(opts.penn_in, sec_glob, "wsj_%s%s.mrg" % (sec_glob, doc_glob))

        for ptb_file in glob(ptb_pattern):
            info("Processing %s", ptb_file)

            found = PTBFileRegex.search(ptb_file)
            if not found or len(found.groups()) != 2:
                warn("Could not find, so ignoring %s", ptb_file)
                continue

            sec, doc = found.groups()

            # Inputs: the CCGbank derivation and dependency files for the same document
            ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec, "wsj_%s%s.auto" % (sec, doc))
            deps_file = os.path.join(opts.ccg_in, 'PARG', sec, "wsj_%s%s.parg" % (sec, doc))

            # Missing inputs are only reported; process() is still called afterwards
            if not opts.quiet:
                if not os.path.exists(ccg_file):
                    warn("No corresponding CCGbank file %s for Penn file %s", ccg_file, ptb_file)
                if not os.path.exists(deps_file):
                    warn("No corresponding CCGbank dependency file %s for CCG file %s", deps_file, ccg_file)

            # Outputs: mirror the AUTO/PARG section layout under the output directory
            out_dirs = [os.path.join(opts.outdir, part, sec) for part in ('AUTO', 'PARG')]
            for out_dir in out_dirs:
                if not os.path.exists(out_dir):
                    os.makedirs(out_dir)

            ccg_auto_dir, ccg_parg_dir = out_dirs
            ccg_auto_out = os.path.join(ccg_auto_dir, 'wsj_%s%s.auto' % (sec, doc))
            ccg_parg_out = os.path.join(ccg_parg_dir, 'wsj_%s%s.parg' % (sec, doc))

            process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out,
                    opts.higher, opts.quotes, quoter)
示例#5
0
    def __iter__(self):
        '''Pairs up the bundles of the left and right readers and yields a Derivation for each pair.

        Both readers are consumed in step with one another, so the shorter of the two
        determines how many pairs are produced.
        '''
        bundle_pairs = izip(self.reader(self.leftdir), self.reader(self.rightdir))

        for lhs, rhs in bundle_pairs:
            info("Processing %s/%s", lhs.label(), rhs.label())
            combined = Derivation(lhs, rhs)
            yield combined

            # Release the pair just handed out before moving on to the next one
            del combined
            del lhs, rhs
示例#6
0
 def __iter__(self):
     '''Yields every derivation bundle read from each entry of self.sections.

     An entry which is a directory is expanded to the paths directly inside it, each of
     which is read in turn; any other entry is passed to the reader as it is.
     '''
     for section_path in self.sections:
         expanded = os.path.isdir(section_path)
         if expanded:
             doc_paths = glob(os.path.join(section_path, '*'))
         else:
             doc_paths = [section_path]

         for doc_path in doc_paths:
             # Progress is only reported for documents found by expanding a directory
             if expanded and self.verbose: info("Processing %s...", doc_path)

             for deriv_bundle in self.reader(doc_path):
                 yield deriv_bundle
示例#7
0
def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher,
            quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    Derivations are read from _ptb_file_ and _ccg_file_ and dependencies from
    _deps_file_. The PTB trees are matched to the CCGbank trees; then, for every
    quote span found in a PTB tree, _quoter_ attaches quotes to the matching
    CCGbank derivation and the dependencies are adjusted for the inserted quotes.
    Each resulting CCGbank bundle is written to _ccg_auto_out_ and each set of
    dependencies to _ccg_parg_out_.

    _higher_ and _quotes_ are passed straight through to quoter.attach_quotes.
    '''
    # open() replaces the file() constructor here: open() is the documented way
    # to open a file, and file() is not available outside Python 2.
    # Both output files are created before any of the inputs is read.
    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            penn_trees = list(PTBReader(ptb_file))
            ccg_trees = list(CCGbankReader(ccg_file))
            deps = list(CCGbankDepsReader(deps_file))

            matched_penn_trees = match_trees(penn_trees, ccg_trees)

            # zip ends with the shortest sequence, so any surplus trees or
            # dependencies are silently left out
            for (ptb_bundle, ccg_bundle, dep) in zip(matched_penn_trees,
                                                     ccg_trees, deps):
                ptb_tree, ccg_tree = ptb_bundle.derivation, ccg_bundle.derivation

                quote_spans = spans(ptb_tree)
                # The list of remaining spans is replaced after each insertion,
                # hence the pop-from-the-front loop instead of a plain for loop
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # Nothing to do for a span lacking both start and end index
                    if span_start is None and span_end is None: continue

                    info("Reinstating quotes to %s (%s, %s)",
                         ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(
                        ccg_tree, span_start, span_end, quote_type, higher,
                        quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out, ccg_bundle
示例#8
0
def main(argv):
    '''Parses the command line and reinstates quotes in each requested document.

    The quote_method option selects the quoter class and the punct_method option
    selects the punctuation handler it is constructed with (None if the option
    names no known handler). Each Penn Treebank file matching the section and
    document specifiers is then handed to process() together with the CCGbank
    AUTO and PARG files for the same section and document, and the paths of the
    output files to create under the output directory.

    If required options are missing, the help text is printed and the program
    exits with status 1.
    '''
    cli = OptionParser()
    register_builtin_switches(cli)
    opts, args = cli.parse_args(argv)

    if not all_required_args_present(opts):
        cli.print_help()
        sys.exit(1)

    available_quoters = {
        'span': SpanQuoter,
        'lca': LCAQuoter,
    }
    quoter_class = available_quoters[opts.quote_method]

    available_punct_handlers = {
        'swap': SwapComma,
        'shift': ShiftComma,
    }
    punct_class = available_punct_handlers.get(opts.punct_method)

    quoter = quoter_class(punct_class)

    # If no sec/doc specifiers are given, assume 'all sections all documents'
    requested = args[1:] or [':']

    for sec_glob, doc_glob in parse_requested_derivs(requested):
        mrg_name = "wsj_%s%s.mrg" % (sec_glob, doc_glob)

        for ptb_file in glob(os.path.join(opts.penn_in, sec_glob, mrg_name)):
            info("Processing %s", ptb_file)

            hit = PTBFileRegex.search(ptb_file)
            if not (hit and len(hit.groups()) == 2):
                warn("Could not find, so ignoring %s", ptb_file)
                continue

            sec, doc = hit.groups()

            # CCGbank inputs belonging to this Penn Treebank document
            auto_name = "wsj_%s%s.auto" % (sec, doc)
            ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec, auto_name)
            parg_name = "wsj_%s%s.parg" % (sec, doc)
            deps_file = os.path.join(opts.ccg_in, 'PARG', sec, parg_name)

            # A missing input only produces a warning; processing goes ahead
            if not opts.quiet:
                if not os.path.exists(ccg_file):
                    warn("No corresponding CCGbank file %s for Penn file %s",
                         ccg_file, ptb_file)
                if not os.path.exists(deps_file):
                    warn("No corresponding CCGbank dependency file %s for CCG file %s",
                         deps_file, ccg_file)

            # Make sure the per-section output directories exist
            target_dirs = []
            for part in ('AUTO', 'PARG'):
                target_dirs.append(os.path.join(opts.outdir, part, sec))
            for target_dir in target_dirs:
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)

            ccg_auto_dir, ccg_parg_dir = target_dirs
            ccg_auto_out = os.path.join(ccg_auto_dir,
                                        'wsj_%s%s.auto' % (sec, doc))
            ccg_parg_out = os.path.join(ccg_parg_dir,
                                        'wsj_%s%s.parg' % (sec, doc))

            process(ptb_file, ccg_file, deps_file, ccg_auto_out,
                    ccg_parg_out, opts.higher, opts.quotes, quoter)
示例#9
0
    def run_filters(self, filters, files):
        # Applies each filter in _filters_ to every derivation bundle read from _files_.
        #
        # Per bundle: each filter's context is set to the bundle, the derivation's leaves
        # are passed to accept_leaf (and, where defined, accept_comb_and_slash_index), and
        # the bundle itself is passed to accept_derivation, after which the context is
        # cleared. Exceptions raised along the way are stored in self.last_exceptions and,
        # when self._break_on_exception is set, re-raised wrapped in FilterException.
        #
        # NOTE(review): the handlers of the outer `try` below lie outside this excerpt, so
        # the treatment of errors raised by the reader itself is not documented here.

        # If all given filters were not found or had wrong argument count, do nothing
        if not filters: return

        # Keyword arguments forwarded to the meta reader; stays empty unless a reader
        # class name has been configured
        reader_args = {}
        if self.reader_class_name:
            try:
                # Resolve the configured name against this module's globals
                reader_class = globals()[self.reader_class_name]
                info("Using reader class %s.", self.reader_class_name)

                reader_args['reader_class'] = reader_class
            except KeyError:
                raise RuntimeError("Reader class %s not found." %
                                   self.reader_class_name)

        # NOTE(review): `file` and `filter` are used as loop variables below and so hide
        # the builtins of the same name within this method
        for file in self.transform(files):
            if self.is_pair_spec(file):
                meta_reader = PairedReader
            else:
                meta_reader = DirFileGuessReader

            try:
                # Cleared once per file spec: only exceptions from the most recently
                # handled spec remain available afterwards
                self.last_exceptions = []

                for derivation_bundle in meta_reader(file,
                                                     verbose=self.verbose,
                                                     **reader_args):
                    if self.verbose:
                        info("Processing %s...", derivation_bundle.label())
                    try:
                        for filter in filters:
                            filter.context = derivation_bundle

                        # NOTE(review): at this point `filter` is the last element of
                        # _filters_ (left over from the loop above), so the test below looks
                        # at that one filter only, while accept_leaf is then called on every
                        # filter -- confirm whether each filter should be tested individually
                        if filter.accept_leaf is not None:
                            for leaf in leaves(derivation_bundle.derivation):
                                for filter in filters:
                                    filter.accept_leaf(leaf)

                                    if filter.accept_comb_and_slash_index is not None:
                                        try:
                                            for slash_index, comb in enumerate(
                                                    applications_per_slash(
                                                        leaf)):
                                                filter.accept_comb_and_slash_index(
                                                    leaf, comb, slash_index)
                                        except AttributeError:  # TODO: hacky and inefficient, need this to work for PTB too
                                            pass

                        # Give each filter the whole bundle, then detach it from the bundle
                        for filter in filters:
                            filter.accept_derivation(derivation_bundle)
                            filter.context = None

                    except IOError, e:
                        # If output is going to a pager, and the user requests an interrupt (^C)
                        # the filter fails with IOError: Broken pipe
                        # In that case, running filters on further derivations will continue to
                        # lead to 'Broken pipe', so just bail out
                        # NOTE(review): IOErrors with a different errno leave this handler
                        # without being added to last_exceptions -- confirm this is intended
                        if e.errno == errno.EPIPE: return

                    except Exception, e:
                        # Keep the failing bundle alongside the full exception info
                        self.last_exceptions.append(
                            (derivation_bundle, sys.exc_info()))

                        if self._break_on_exception:
                            raise FilterException(e, None)